1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
/* Template detector.  Scans the bytes from CODING->source up to
   CODING->source + CODING->src_bytes.  When a byte is invalid for XXX,
   CATEGORY_MASK_XXX is ORed into DETECT_INFO->rejected; when the source
   is exhausted, the accumulated FOUND mask is ORed into
   DETECT_INFO->found.  */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
/* NOTE(review): the negation on the "strongly suggests" test looks
   inverted for the purpose of setting FOUND -- confirm.  */
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
/* Template decoder.  Reading starts at CODING->source + CODING->consumed
   and decoded characters are stored in CODING->charbuf, which holds at
   most CODING->charbuf_size elements.  On exit the consumed byte and
   character counts and the number of produced characters are recorded
   in CODING.  */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
/* Leave the loop once CHARBUF is full.  The test must be `>=': with
   `<' the loop would stop while room is still available.  */
222 if (charbuf
>= charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce encoded text until it
258 reaches the head of the not-yet-encoded source text.
260 Below is a template of these functions. */
/* Template encoder.  Consumes the CODING->charbuf_used characters held
   in CODING->charbuf and writes bytes starting at
   CODING->destination + CODING->produced.  The loop stops once DST
   reaches DST_END - _MAX_BYTES_PRODUCED_IN_LOOP_.  On exit the produced
   character and byte counts are recorded in CODING.  */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
/* CHARBUF is a plain `int *', so the end pointer is derived from
   CODING->charbuf; `charbuf->charbuf' is not valid C.  */
268 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
315 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
316 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
317 Lisp_Object Qstart_process
, Qopen_network_stream
;
318 Lisp_Object Qtarget_idx
;
320 Lisp_Object Vselect_safe_coding_system_function
;
322 /* Mnemonic string for each format of end-of-line. */
323 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
324 /* Mnemonic string to indicate format of end-of-line is not yet
326 Lisp_Object eol_mnemonic_undecided
;
330 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
332 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
334 /* Coding systems emacs-mule and raw-text are for converting only
335 end-of-line format. */
336 Lisp_Object Qemacs_mule
, Qraw_text
;
338 /* Coding-systems are handed between Emacs Lisp programs and C internal
339 routines by the following three variables. */
340 /* Coding-system for reading files and receiving data from process. */
341 Lisp_Object Vcoding_system_for_read
;
342 /* Coding-system for writing files and sending data to process. */
343 Lisp_Object Vcoding_system_for_write
;
344 /* Coding-system actually used in the latest I/O. */
345 Lisp_Object Vlast_coding_system_used
;
347 /* A vector of length 256 which contains information about special
348 Latin codes (especially for dealing with Microsoft codes). */
349 Lisp_Object Vlatin_extra_code_table
;
351 /* Flag to inhibit code conversion of end-of-line format. */
352 int inhibit_eol_conversion
;
354 /* Flag to inhibit ISO2022 escape sequence detection. */
355 int inhibit_iso_escape_detection
;
357 /* Flag to make buffer-file-coding-system inherit from process-coding. */
358 int inherit_process_coding_system
;
360 /* Coding system to be used to encode text for terminal display. */
361 struct coding_system terminal_coding
;
363 /* Coding system to be used to encode text for terminal display when
364 terminal coding system is nil. */
365 struct coding_system safe_terminal_coding
;
367 /* Coding system of what is sent from terminal keyboard. */
368 struct coding_system keyboard_coding
;
370 Lisp_Object Vfile_coding_system_alist
;
371 Lisp_Object Vprocess_coding_system_alist
;
372 Lisp_Object Vnetwork_coding_system_alist
;
374 Lisp_Object Vlocale_coding_system
;
378 /* Flag to tell if we look up translation table on character code
380 Lisp_Object Venable_character_translation
;
381 /* Standard translation table to look up on decoding (reading). */
382 Lisp_Object Vstandard_translation_table_for_decode
;
383 /* Standard translation table to look up on encoding (writing). */
384 Lisp_Object Vstandard_translation_table_for_encode
;
386 Lisp_Object Qtranslation_table
;
387 Lisp_Object Qtranslation_table_id
;
388 Lisp_Object Qtranslation_table_for_decode
;
389 Lisp_Object Qtranslation_table_for_encode
;
391 /* Alist of charsets vs revision number. */
392 static Lisp_Object Vcharset_revision_table
;
394 /* Default coding systems used for process I/O. */
395 Lisp_Object Vdefault_process_coding_system
;
397 /* Global flag to tell that we can't call post-read-conversion and
398 pre-write-conversion functions. Usually the value is zero, but it
399 is set to 1 temporarily while such functions are running. This is
400 to avoid infinite recursive call. */
401 static int inhibit_pre_post_conversion
;
403 /* Two special coding systems. */
404 Lisp_Object Vsjis_coding_system
;
405 Lisp_Object Vbig5_coding_system
;
408 static int detect_coding_utf_8
P_ ((struct coding_system
*,
409 struct coding_detection_info
*info
));
410 static void decode_coding_utf_8
P_ ((struct coding_system
*));
411 static int encode_coding_utf_8
P_ ((struct coding_system
*));
413 static int detect_coding_utf_16
P_ ((struct coding_system
*,
414 struct coding_detection_info
*info
));
415 static void decode_coding_utf_16
P_ ((struct coding_system
*));
416 static int encode_coding_utf_16
P_ ((struct coding_system
*));
418 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
419 struct coding_detection_info
*info
));
420 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
421 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
423 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
424 struct coding_detection_info
*info
));
425 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
426 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
428 static int detect_coding_sjis
P_ ((struct coding_system
*,
429 struct coding_detection_info
*info
));
430 static void decode_coding_sjis
P_ ((struct coding_system
*));
431 static int encode_coding_sjis
P_ ((struct coding_system
*));
433 static int detect_coding_big5
P_ ((struct coding_system
*,
434 struct coding_detection_info
*info
));
435 static void decode_coding_big5
P_ ((struct coding_system
*));
436 static int encode_coding_big5
P_ ((struct coding_system
*));
438 static int detect_coding_ccl
P_ ((struct coding_system
*,
439 struct coding_detection_info
*info
));
440 static void decode_coding_ccl
P_ ((struct coding_system
*));
441 static int encode_coding_ccl
P_ ((struct coding_system
*));
443 static void decode_coding_raw_text
P_ ((struct coding_system
*));
444 static int encode_coding_raw_text
P_ ((struct coding_system
*));
447 /* ISO2022 section */
449 #define CODING_ISO_INITIAL(coding, reg) \
450 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
451 coding_attr_iso_initial), \
455 #define CODING_ISO_REQUEST(coding, charset_id) \
456 ((charset_id <= (coding)->max_charset_id \
457 ? (coding)->safe_charsets[charset_id] \
461 #define CODING_ISO_FLAGS(coding) \
462 ((coding)->spec.iso_2022.flags)
463 #define CODING_ISO_DESIGNATION(coding, reg) \
464 ((coding)->spec.iso_2022.current_designation[reg])
465 #define CODING_ISO_INVOCATION(coding, plane) \
466 ((coding)->spec.iso_2022.current_invocation[plane])
467 #define CODING_ISO_SINGLE_SHIFTING(coding) \
468 ((coding)->spec.iso_2022.single_shifting)
469 #define CODING_ISO_BOL(coding) \
470 ((coding)->spec.iso_2022.bol)
471 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
472 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474 /* Control characters of ISO2022. */
475 /* code */ /* function */
476 #define ISO_CODE_LF 0x0A /* line-feed */
477 #define ISO_CODE_CR 0x0D /* carriage-return */
478 #define ISO_CODE_SO 0x0E /* shift-out */
479 #define ISO_CODE_SI 0x0F /* shift-in */
480 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
481 #define ISO_CODE_ESC 0x1B /* escape */
482 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
483 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
484 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486 /* All code (1-byte) of ISO2022 is classified into one of the
488 enum iso_code_class_type
490 ISO_control_0
, /* Control codes in the range
491 0x00..0x1F and 0x7F, except for the
492 following 5 codes. */
493 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
494 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
498 ISO_control_1
, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
510 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517 /* If set, reset graphic planes and registers at end-of-line to the
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
544 /* If set, assume designation states are reset at beginning of line on
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548 /* If set, designation sequence should be placed at beginning of line
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552 /* If set, do not encode unsafe characters on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
555 /* If set, extra latin codes (128..159) are accepted as a valid code
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
590 #define CODING_CCL_VALIDS(coding) \
591 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
594 /* Index for each coding category in `coding_categories' */
598 coding_category_iso_7
,
599 coding_category_iso_7_tight
,
600 coding_category_iso_8_1
,
601 coding_category_iso_8_2
,
602 coding_category_iso_7_else
,
603 coding_category_iso_8_else
,
604 coding_category_utf_8
,
605 coding_category_utf_16_auto
,
606 coding_category_utf_16_be
,
607 coding_category_utf_16_le
,
608 coding_category_utf_16_be_nosig
,
609 coding_category_utf_16_le_nosig
,
610 coding_category_charset
,
611 coding_category_sjis
,
612 coding_category_big5
,
614 coding_category_emacs_mule
,
615 /* All above are targets of code detection. */
616 coding_category_raw_text
,
617 coding_category_undecided
,
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
641 /* This value is returned if detect_coding_mask () finds nothing other
642 than ASCII characters. */
643 #define CATEGORY_MASK_ANY \
644 (CATEGORY_MASK_ISO_7 \
645 | CATEGORY_MASK_ISO_7_TIGHT \
646 | CATEGORY_MASK_ISO_8_1 \
647 | CATEGORY_MASK_ISO_8_2 \
648 | CATEGORY_MASK_ISO_7_ELSE \
649 | CATEGORY_MASK_ISO_8_ELSE \
650 | CATEGORY_MASK_UTF_8 \
651 | CATEGORY_MASK_UTF_16_BE \
652 | CATEGORY_MASK_UTF_16_LE \
653 | CATEGORY_MASK_UTF_16_BE_NOSIG \
654 | CATEGORY_MASK_UTF_16_LE_NOSIG \
655 | CATEGORY_MASK_CHARSET \
656 | CATEGORY_MASK_SJIS \
657 | CATEGORY_MASK_BIG5 \
658 | CATEGORY_MASK_CCL \
659 | CATEGORY_MASK_EMACS_MULE)
662 #define CATEGORY_MASK_ISO_7BIT \
663 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
665 #define CATEGORY_MASK_ISO_8BIT \
666 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
668 #define CATEGORY_MASK_ISO_ELSE \
669 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
671 #define CATEGORY_MASK_ISO_ESCAPE \
672 (CATEGORY_MASK_ISO_7 \
673 | CATEGORY_MASK_ISO_7_TIGHT \
674 | CATEGORY_MASK_ISO_7_ELSE \
675 | CATEGORY_MASK_ISO_8_ELSE)
677 #define CATEGORY_MASK_ISO \
678 ( CATEGORY_MASK_ISO_7BIT \
679 | CATEGORY_MASK_ISO_8BIT \
680 | CATEGORY_MASK_ISO_ELSE)
682 #define CATEGORY_MASK_UTF_16 \
683 (CATEGORY_MASK_UTF_16_BE \
684 | CATEGORY_MASK_UTF_16_LE \
685 | CATEGORY_MASK_UTF_16_BE_NOSIG \
686 | CATEGORY_MASK_UTF_16_LE_NOSIG)
689 /* List of symbols `coding-category-xxx' ordered by priority. This
690 variable is exposed to Emacs Lisp. */
691 static Lisp_Object Vcoding_category_list
;
693 /* Table of coding categories (Lisp symbols). This variable is for
695 static Lisp_Object Vcoding_category_table
;
697 /* Table of coding-categories ordered by priority. */
698 static enum coding_category coding_priorities
[coding_category_max
];
700 /* Nth element is a coding context for the coding system bound to the
701 Nth coding category. */
702 static struct coding_system coding_categories
[coding_category_max
];
704 /*** Commonly used macros and functions ***/
707 #define min(a, b) ((a) < (b) ? (a) : (b))
710 #define max(a, b) ((a) > (b) ? (a) : (b))
713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
715 attrs = CODING_ID_ATTRS (coding->id); \
716 eol_type = CODING_ID_EOL_TYPE (coding->id); \
717 if (VECTORP (eol_type)) \
719 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
723 /* Safely get one byte from the source text pointed by SRC which ends
724 at SRC_END, and set C to that byte. If there are not enough bytes
725 in the source, it jumps to `no_more_source'. The caller
726 should declare and set these variables appropriately in advance:
727 src, src_end, multibytep
730 #define ONE_MORE_BYTE(c) \
732 if (src == src_end) \
734 if (src_base < src) \
735 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
736 goto no_more_source; \
739 if (multibytep && (c & 0x80)) \
741 if ((c & 0xFE) != 0xC0) \
742 error ("Undecodable char found"); \
743 c = ((c & 1) << 6) | *src++; \
749 #define ONE_MORE_BYTE_NO_CHECK(c) \
752 if (multibytep && (c & 0x80)) \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
762 /* Store a byte C in the place pointed by DST and increment DST to the
763 next free point, and increment PRODUCED_CHARS. The caller should
764 assure that C is 0..127, and declare and set the variable `dst'
765 appropriately in advance.
769 #define EMIT_ONE_ASCII_BYTE(c) \
776 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
778 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 produced_chars += 2; \
781 *dst++ = (c1), *dst++ = (c2); \
785 /* Store a byte C in the place pointed by DST and increment DST to the
786 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
787 nonzero, store in an appropriate multibyte form. The caller should
788 declare and set the variables `dst' and `multibytep' appropriately
791 #define EMIT_ONE_BYTE(c) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
808 #define EMIT_TWO_BYTES(c1, c2) \
810 produced_chars += 2; \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
832 #define EMIT_THREE_BYTES(c1, c2, c3) \
834 EMIT_ONE_BYTE (c1); \
835 EMIT_TWO_BYTES (c2, c3); \
839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
841 EMIT_TWO_BYTES (c1, c2); \
842 EMIT_TWO_BYTES (c3, c4); \
846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
848 charset_map_loaded = 0; \
849 c = DECODE_CHAR (charset, code); \
850 if (charset_map_loaded) \
852 unsigned char *orig = coding->source; \
855 coding_set_source (coding); \
856 offset = coding->source - orig; \
858 src_base += offset; \
864 #define ASSURE_DESTINATION(bytes) \
866 if (dst + (bytes) >= dst_end) \
868 int more_bytes = charbuf_end - charbuf + (bytes); \
870 dst = alloc_destination (coding, more_bytes, dst); \
871 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object.  For a buffer
   source the address comes from BUF_GAP_END_ADDR plus src_pos_byte when
   CODING->src_pos is negative, and from BUF_BYTE_ADDRESS otherwise.
   For a string source it is the string data plus src_pos_byte.  Any
   other source is left untouched.  */
878 coding_set_source (coding
)
879 struct coding_system
*coding
;
881 if (BUFFERP (coding
->src_object
))
883 struct buffer
*buf
= XBUFFER (coding
->src_object
);
/* NOTE(review): a negative src_pos presumably means the source text
   is addressed relative to the end of the gap -- confirm.  */
885 if (coding
->src_pos
< 0)
886 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
888 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
890 else if (STRINGP (coding
->src_object
))
892 coding
->source
= (XSTRING (coding
->src_object
)->data
893 + coding
->src_pos_byte
);
896 /* Otherwise, the source is C string and is never relocated
897 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when
   CODING->dst_object is a buffer.  If CODING->src_pos is negative, the
   destination is BEG_ADDR + dst_pos_byte - 1 and the available size is
   measured up to GAP_END_ADDR minus the not-yet-consumed source bytes
   (src_bytes - consumed).  Otherwise both values are derived from
   BUF_BEG_ADDR and BUF_GAP_END_ADDR of the destination buffer.
   NOTE(review): the branch tests src_pos rather than dst_pos;
   presumably a negative src_pos marks source text that lives in the
   same buffer -- confirm.  */
902 coding_set_destination (coding
)
903 struct coding_system
*coding
;
905 if (BUFFERP (coding
->dst_object
))
907 if (coding
->src_pos
< 0)
909 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
910 coding
->dst_bytes
= (GAP_END_ADDR
911 - (coding
->src_bytes
- coding
->consumed
)
912 - coding
->destination
);
916 /* We are sure that coding->dst_pos_byte is before the gap
918 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
919 + coding
->dst_pos_byte
- 1);
920 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
921 - coding
->destination
);
925 /* Otherwise, the destination is C string and is never relocated
926 automatically. Thus we don't have to update anything. */
/* Enlarge CODING->destination by BYTES with xrealloc and add BYTES to
   CODING->dst_bytes.  */
932 coding_alloc_by_realloc (coding
, bytes
)
933 struct coding_system
*coding
;
/* The new size is the current dst_bytes plus BYTES.  */
936 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
937 coding
->dst_bytes
+ bytes
);
938 coding
->dst_bytes
+= bytes
;
/* Gap-based counterpart of coding_alloc_by_realloc for a buffer
   destination.  When source and destination are the same buffer,
   GAP_SIZE is reduced and ZV, Z, ZV_BYTE and Z_BYTE are advanced by the
   number of unconsumed source bytes (src_bytes - consumed), and the
   same amounts are undone afterwards.  Otherwise the destination buffer
   is made current with set_buffer_internal and the previously current
   buffer is restored at the end.
   NOTE(review): the statement that actually enlarges the gap by BYTES
   is not visible in this excerpt; presumably it sits between each
   adjust/restore pair -- confirm.  */
942 coding_alloc_by_making_gap (coding
, bytes
)
943 struct coding_system
*coding
;
946 if (BUFFERP (coding
->dst_object
)
947 && EQ (coding
->src_object
, coding
->dst_object
))
/* Number of source bytes that have not been consumed yet.  */
949 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
951 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* Undo the temporary adjustment made above.  */
953 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
957 Lisp_Object this_buffer
;
/* Remember the current buffer so that it can be restored below.  */
959 this_buffer
= Fcurrent_buffer ();
960 set_buffer_internal (XBUFFER (coding
->dst_object
));
962 set_buffer_internal (XBUFFER (this_buffer
));
/* Grow the destination of CODING by NBYTES.  A buffer destination is
   handled by coding_alloc_by_making_gap; coding_alloc_by_realloc
   handles the remaining case.  Afterwards CODING->result is set to
   CODING_RESULT_SUCCESS, the destination is recomputed with
   coding_set_destination, and DST is re-derived from its offset into
   the old destination, since the destination address may have changed.
   NOTE(review): the return statement is not visible in this excerpt;
   callers such as ASSURE_DESTINATION assign the result to `dst', so
   the updated DST is presumably returned -- confirm.  */
967 static unsigned char *
968 alloc_destination (coding
, nbytes
, dst
)
969 struct coding_system
*coding
;
/* Remember where DST sits relative to the start of the destination.  */
973 EMACS_INT offset
= dst
- coding
->destination
;
975 if (BUFFERP (coding
->dst_object
))
976 coding_alloc_by_making_gap (coding
, nbytes
);
978 coding_alloc_by_realloc (coding
, nbytes
);
979 coding
->result
= CODING_RESULT_SUCCESS
;
980 coding_set_destination (coding
);
981 dst
= coding
->destination
+ offset
;
985 /** Macros for annotations. */
987 /* Maximum length of annotation data (sum of annotations for
988 composition and charset). */
989 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
991 /* An annotation data is stored in the array coding->charbuf in this
993 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
994 LENGTH is the number of elements in the annotation.
995 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
996 FROM and TO specify the range of text annotated. They are relative
997 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
999 The format of the following elements depend on ANNOTATION_MASK.
1001 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1003 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1004 METHOD is one of enum composition_method.
1005 Optional COMPOSITION-COMPONENTS are characters and composition
1008 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1011 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1013 *(buf)++ = -(len); \
1014 *(buf)++ = (mask); \
1015 *(buf)++ = (from); \
1017 coding->annotated = 1; \
1020 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1022 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1027 #define ADD_CHARSET_DATA(buf, from, to, id) \
1029 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1034 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1041 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1042 Check if a text is encoded in UTF-8. If it is, return 1, else
1045 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1046 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1047 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1048 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1049 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1050 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* UTF-8 detector.  Marks CATEGORY_MASK_UTF_8 as checked in DETECT_INFO
   and skips the first CODING->head_ascii bytes of the source.  Each
   following sequence is classified by its leading octet (2- to 5-octet
   forms) once every trailing octet has passed UTF_8_EXTRA_OCTET_P, and
   each accepted multi-octet form sets FOUND to CATEGORY_MASK_UTF_8.
   An invalid sequence, or an incomplete one when CODING->mode has
   CODING_MODE_LAST_BLOCK set, ORs CATEGORY_MASK_UTF_8 into
   DETECT_INFO->rejected; otherwise FOUND is ORed into
   DETECT_INFO->found.  */
1053 detect_coding_utf_8 (coding
, detect_info
)
1054 struct coding_system
*coding
;
1055 struct coding_detection_info
*detect_info
;
1057 unsigned char *src
= coding
->source
, *src_base
= src
;
1058 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1059 int multibytep
= coding
->src_multibyte
;
1060 int consumed_chars
= 0;
1064 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1065 /* A coding system of this category is always ASCII compatible. */
1066 src
+= coding
->head_ascii
;
1070 int c
, c1
, c2
, c3
, c4
;
/* A single-octet (ASCII) byte needs no further checks.  */
1074 if (UTF_8_1_OCTET_P (c
))
/* Two-octet form: one trailing octet.  */
1078 if (! UTF_8_EXTRA_OCTET_P (c1
))
1080 if (UTF_8_2_OCTET_LEADING_P (c
))
1082 found
= CATEGORY_MASK_UTF_8
;
/* Three-octet form: a second trailing octet.  */
1086 if (! UTF_8_EXTRA_OCTET_P (c2
))
1088 if (UTF_8_3_OCTET_LEADING_P (c
))
1090 found
= CATEGORY_MASK_UTF_8
;
/* Four-octet form: a third trailing octet.  */
1094 if (! UTF_8_EXTRA_OCTET_P (c3
))
1096 if (UTF_8_4_OCTET_LEADING_P (c
))
1098 found
= CATEGORY_MASK_UTF_8
;
/* Five-octet form: a fourth trailing octet.  */
1102 if (! UTF_8_EXTRA_OCTET_P (c4
))
1104 if (UTF_8_5_OCTET_LEADING_P (c
))
1106 found
= CATEGORY_MASK_UTF_8
;
/* The byte sequence is not valid UTF-8.  */
1111 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
/* A truncated sequence is only an error when no more input follows.  */
1115 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1117 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1120 detect_info
->found
|= found
;
1126 decode_coding_utf_8 (coding
)
1127 struct coding_system
*coding
;
1129 unsigned char *src
= coding
->source
+ coding
->consumed
;
1130 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1131 unsigned char *src_base
;
1132 int *charbuf
= coding
->charbuf
;
1133 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1134 int consumed_chars
= 0, consumed_chars_base
;
1135 int multibytep
= coding
->src_multibyte
;
1136 Lisp_Object attr
, eol_type
, charset_list
;
1138 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1142 int c
, c1
, c2
, c3
, c4
, c5
;
1145 consumed_chars_base
= consumed_chars
;
1147 if (charbuf
>= charbuf_end
)
1151 if (UTF_8_1_OCTET_P(c1
))
1156 if (EQ (eol_type
, Qdos
))
1160 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1161 goto no_more_source
;
1166 else if (EQ (eol_type
, Qmac
))
1173 if (! UTF_8_EXTRA_OCTET_P (c2
))
1175 if (UTF_8_2_OCTET_LEADING_P (c1
))
1177 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1178 /* Reject overlong sequences here and below. Encoders
1179 producing them are incorrect, they can be misleading,
1180 and they mess up read/write invariance. */
1187 if (! UTF_8_EXTRA_OCTET_P (c3
))
1189 if (UTF_8_3_OCTET_LEADING_P (c1
))
1191 c
= (((c1
& 0xF) << 12)
1192 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1194 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1200 if (! UTF_8_EXTRA_OCTET_P (c4
))
1202 if (UTF_8_4_OCTET_LEADING_P (c1
))
1204 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1205 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1212 if (! UTF_8_EXTRA_OCTET_P (c5
))
1214 if (UTF_8_5_OCTET_LEADING_P (c1
))
1216 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1217 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1219 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1234 consumed_chars
= consumed_chars_base
;
1236 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1241 coding
->consumed_char
+= consumed_chars_base
;
1242 coding
->consumed
= src_base
- coding
->source
;
1243 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1248 encode_coding_utf_8 (coding
)
1249 struct coding_system
*coding
;
1251 int multibytep
= coding
->dst_multibyte
;
1252 int *charbuf
= coding
->charbuf
;
1253 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1254 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1255 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1256 int produced_chars
= 0;
1261 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1263 while (charbuf
< charbuf_end
)
1265 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1267 ASSURE_DESTINATION (safe_room
);
1269 if (CHAR_BYTE8_P (c
))
1271 c
= CHAR_TO_BYTE8 (c
);
1276 CHAR_STRING_ADVANCE (c
, pend
);
1277 for (p
= str
; p
< pend
; p
++)
1284 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1286 while (charbuf
< charbuf_end
)
1288 ASSURE_DESTINATION (safe_room
);
1290 dst
+= CHAR_STRING (c
, dst
);
1294 coding
->result
= CODING_RESULT_SUCCESS
;
1295 coding
->produced_char
+= produced_chars
;
1296 coding
->produced
= dst
- coding
->destination
;
1301 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1302 Check if a text is encoded in one of UTF-16 based coding systems.
1303 If it is, return 1, else return 0. */
/* Nonzero if the 16-bit code unit VAL is in 0xD800..0xDBFF
   (the upper six bits are 110110).  */
1305 #define UTF_16_HIGH_SURROGATE_P(val) \
1306 (((val) & 0xFC00) == 0xD800)
/* Nonzero if the 16-bit code unit VAL is in 0xDC00..0xDFFF
   (the upper six bits are 110111).  */
1308 #define UTF_16_LOW_SURROGATE_P(val) \
1309 (((val) & 0xFC00) == 0xDC00)
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  VAL is evaluated more than once, so it
   must not have side effects.  */
1311 #define UTF_16_INVALID_P(val) \
1312 (((val) == 0xFFFE) \
1313 || ((val) == 0xFFFF) \
1314 || UTF_16_LOW_SURROGATE_P (val))
1318 detect_coding_utf_16 (coding
, detect_info
)
1319 struct coding_system
*coding
;
1320 struct coding_detection_info
*detect_info
;
1322 unsigned char *src
= coding
->source
, *src_base
= src
;
1323 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1324 int multibytep
= coding
->src_multibyte
;
1325 int consumed_chars
= 0;
1328 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1330 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1331 && (coding
->src_bytes
& 1))
1333 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1339 if ((c1
== 0xFF) && (c2
== 0xFE))
1341 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1342 | CATEGORY_MASK_UTF_16_AUTO
);
1343 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE
;
1345 else if ((c1
== 0xFE) && (c2
== 0xFF))
1347 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1348 | CATEGORY_MASK_UTF_16_AUTO
);
1349 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE
;
1356 decode_coding_utf_16 (coding
)
1357 struct coding_system
*coding
;
1359 unsigned char *src
= coding
->source
+ coding
->consumed
;
1360 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1361 unsigned char *src_base
;
1362 int *charbuf
= coding
->charbuf
;
1363 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1364 int consumed_chars
= 0, consumed_chars_base
;
1365 int multibytep
= coding
->src_multibyte
;
1366 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1367 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1368 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1369 Lisp_Object attr
, eol_type
, charset_list
;
1371 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1373 if (bom
== utf_16_with_bom
)
1382 if (endian
== utf_16_big_endian
1383 ? c
!= 0xFEFF : c
!= 0xFFFE)
1385 /* The first two bytes are not BOM. Treat them as bytes
1386 for a normal character. */
1390 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1392 else if (bom
== utf_16_detect_bom
)
1394 /* We have already tried to detect BOM and failed in
1396 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1404 consumed_chars_base
= consumed_chars
;
1406 if (charbuf
+ 2 >= charbuf_end
)
1411 c
= (endian
== utf_16_big_endian
1412 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1415 if (! UTF_16_LOW_SURROGATE_P (c
))
1417 if (endian
== utf_16_big_endian
)
1418 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1420 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1424 if (UTF_16_HIGH_SURROGATE_P (c
))
1425 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1431 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1432 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1438 if (UTF_16_HIGH_SURROGATE_P (c
))
1439 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1446 coding
->consumed_char
+= consumed_chars_base
;
1447 coding
->consumed
= src_base
- coding
->source
;
1448 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1452 encode_coding_utf_16 (coding
)
1453 struct coding_system
*coding
;
1455 int multibytep
= coding
->dst_multibyte
;
1456 int *charbuf
= coding
->charbuf
;
1457 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1458 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1459 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1461 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1462 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1463 int produced_chars
= 0;
1464 Lisp_Object attrs
, eol_type
, charset_list
;
1467 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1469 if (bom
!= utf_16_without_bom
)
1471 ASSURE_DESTINATION (safe_room
);
1473 EMIT_TWO_BYTES (0xFE, 0xFF);
1475 EMIT_TWO_BYTES (0xFF, 0xFE);
1476 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1479 while (charbuf
< charbuf_end
)
1481 ASSURE_DESTINATION (safe_room
);
1483 if (c
>= MAX_UNICODE_CHAR
)
1484 c
= coding
->default_char
;
1489 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1491 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1498 c1
= (c
>> 10) + 0xD800;
1499 c2
= (c
& 0x3FF) + 0xDC00;
1501 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1503 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1506 coding
->result
= CODING_RESULT_SUCCESS
;
1507 coding
->produced
= dst
- coding
->destination
;
1508 coding
->produced_char
+= produced_chars
;
1513 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1515 /* Emacs' internal format for representation of multiple character
1516 sets is a kind of multi-byte encoding, i.e. characters are
1517 represented by variable-length sequences of one-byte codes.
1519 ASCII characters and control characters (e.g. `tab', `newline') are
1520 represented by one-byte sequences which are their ASCII codes, in
1521 the range 0x00 through 0x7F.
1523 8-bit characters of the range 0x80..0x9F are represented by
1524 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1527 8-bit characters of the range 0xA0..0xFF are represented by
1528 one-byte sequences which are their 8-bit code.
1530 The other characters are represented by a sequence of `base
1531 leading-code', optional `extended leading-code', and one or two
1532 `position-code's. The length of the sequence is determined by the
1533 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1534 whereas extended leading-code and position-code take the range 0xA0
1535 through 0xFF. See `charset.h' for more details about leading-code
1538 --- CODE RANGE of Emacs' internal format ---
1542 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1543 eight-bit-graphic 0xA0..0xBF
1544 ELSE 0x81..0x9D + [0xA0..0xFF]+
1545 ---------------------------------------------
1547 As this is the internal character representation, the format is
1548 usually not used externally (i.e. in a file or in a data sent to a
1549 process). But, it is possible to have a text externally in this
1550 format (i.e. by encoding by the coding system `emacs-mule').
1552 In that case, a sequence of one-byte codes has a slightly different
1555 At first, all characters in eight-bit-control are represented by
1556 one-byte sequences which are their 8-bit code.
1558 Next, character composition data are represented by the byte
1559 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1561 METHOD is 0xF2 plus one of the composition methods (enum
1562 composition_method),
1564 BYTES is 0xA0 plus a byte length of this composition data,
1566 CHARS is 0xA0 plus the number of characters composed by this
1569 COMPONENTs are characters in multibyte form or composition
1570 rules encoded by two bytes of ASCII codes.
1572 In addition, for backward compatibility, the following formats are
1573 also recognized as composition data on decoding.
1576 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1579 MSEQ is a multibyte form but in one of these special formats:
1580 ASCII: 0xA0 ASCII_CODE+0x80,
1581 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1582 RULE is a one byte code of the range 0xA0..0xF0 that
1583 represents a composition rule.
1586 char emacs_mule_bytes
[256];
1589 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1590 struct coding_system
*coding
;
1592 int *nbytes
, *nchars
, *id
;
1594 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1595 int multibytep
= coding
->src_multibyte
;
1596 unsigned char *src_base
= src
;
1597 struct charset
*charset
;
1600 int consumed_chars
= 0;
1603 switch (emacs_mule_bytes
[c
])
1606 if (! (charset
= emacs_mule_charset
[c
]))
1613 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1614 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1617 if (! (charset
= emacs_mule_charset
[c
]))
1624 if (! (charset
= emacs_mule_charset
[c
]))
1627 code
= (c
& 0x7F) << 8;
1635 if (! (charset
= emacs_mule_charset
[c
]))
1638 code
= (c
& 0x7F) << 8;
1645 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1646 ? charset_ascii
: charset_eight_bit
);
1652 c
= DECODE_CHAR (charset
, code
);
1655 *nbytes
= src
- src_base
;
1656 *nchars
= consumed_chars
;
1669 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1670 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1674 detect_coding_emacs_mule (coding
, detect_info
)
1675 struct coding_system
*coding
;
1676 struct coding_detection_info
*detect_info
;
1678 unsigned char *src
= coding
->source
, *src_base
= src
;
1679 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1680 int multibytep
= coding
->src_multibyte
;
1681 int consumed_chars
= 0;
1686 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1687 /* A coding system of this category is always ASCII compatible. */
1688 src
+= coding
->head_ascii
;
1698 /* Perhaps the start of a composite character. We simply skip
1699 it because analyzing it is too heavy for detecting. But,
1700 at least, we check that the composite character
1701 consists of more than 4 bytes. */
1702 unsigned char *src_base
;
1712 if (src
- src_base
<= 4)
1714 found
= CATEGORY_MASK_EMACS_MULE
;
1722 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1727 unsigned char *src_base
= src
- 1;
1734 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1736 found
= CATEGORY_MASK_EMACS_MULE
;
1739 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1743 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1745 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1748 detect_info
->found
|= found
;
1753 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1755 /* Decode a character represented as a component of composition
1756 sequence of Emacs 20/21 style at SRC. Set C to that character and
1757 update SRC to the head of next character (or an encoded composition
1758 rule). If SRC doesn't point to a composition component, set C to -1.
1759 If SRC points to an invalid byte sequence, global exit by a return
1762 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1766 int nbytes, nchars; \
1768 if (src == src_end) \
1770 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1775 goto invalid_code; \
1779 consumed_chars += nchars; \
1784 /* Decode a composition rule represented as a component of composition
1785 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1786 and increment BUF. If SRC points an invalid byte sequence, set C
1789 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1791 int c, gref, nref; \
1793 if (src >= src_end) \
1794 goto invalid_code; \
1795 ONE_MORE_BYTE_NO_CHECK (c); \
1797 if (c < 0 || c >= 81) \
1798 goto invalid_code; \
1800 gref = c / 9, nref = c % 9; \
1801 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1805 /* Decode a composition rule represented as a component of composition
1806 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1807 and increment BUF. If SRC points an invalid byte sequence, set C
1810 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1814 if (src + 1>= src_end) \
1815 goto invalid_code; \
1816 ONE_MORE_BYTE_NO_CHECK (gref); \
1818 ONE_MORE_BYTE_NO_CHECK (nref); \
1820 if (gref < 0 || gref >= 81 \
1821 || nref < 0 || nref >= 81) \
1822 goto invalid_code; \
1823 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1827 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1829 /* Emacs 21 style format. The first three bytes at SRC are \
1830 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1831 the byte length of this composition information, CHARS is the \
1832 number of characters composed by this composition. */ \
1833 enum composition_method method = c - 0xF2; \
1834 int *charbuf_base = charbuf; \
1836 int consumed_chars_limit; \
1837 int nbytes, nchars; \
1839 ONE_MORE_BYTE (c); \
1840 nbytes = c - 0xA0; \
1842 goto invalid_code; \
1843 ONE_MORE_BYTE (c); \
1844 nchars = c - 0xA0; \
1845 from = coding->produced + char_offset; \
1846 to = from + nchars; \
1847 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1848 consumed_chars_limit = consumed_chars_base + nbytes; \
1849 if (method != COMPOSITION_RELATIVE) \
1852 while (consumed_chars < consumed_chars_limit) \
1854 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1855 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1857 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1860 if (consumed_chars < consumed_chars_limit) \
1861 goto invalid_code; \
1862 charbuf_base[0] -= i; \
1867 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1869 /* Emacs 20 style format for relative composition. */ \
1870 /* Store multibyte form of characters to be composed. */ \
1871 enum composition_method method = COMPOSITION_RELATIVE; \
1872 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1873 int *buf = components; \
1878 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1879 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1880 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1882 goto invalid_code; \
1883 from = coding->produced_char + char_offset; \
1885 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1886 for (j = 0; j < i; j++) \
1887 *charbuf++ = components[j]; \
1891 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1893 /* Emacs 20 style format for rule-base composition. */ \
1894 /* Store multibyte form of characters to be composed. */ \
1895 enum composition_method method = COMPOSITION_WITH_RULE; \
1896 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1897 int *buf = components; \
1901 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1902 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1904 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1905 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 if (i < 1 || (buf - components) % 2 == 0) \
1908 goto invalid_code; \
1909 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1910 goto no_more_source; \
1911 from = coding->produced_char + char_offset; \
1913 ADD_COMPOSITION_DATA (buf, from, to, method); \
1914 for (j = 0; j < i; j++) \
1915 *charbuf++ = components[j]; \
1916 for (j = 0; j < i; j += 2) \
1917 *charbuf++ = components[j]; \
1922 decode_coding_emacs_mule (coding
)
1923 struct coding_system
*coding
;
1925 unsigned char *src
= coding
->source
+ coding
->consumed
;
1926 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1927 unsigned char *src_base
;
1928 int *charbuf
= coding
->charbuf
;
1929 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1930 int consumed_chars
= 0, consumed_chars_base
;
1931 int multibytep
= coding
->src_multibyte
;
1932 Lisp_Object attrs
, eol_type
, charset_list
;
1933 int char_offset
= coding
->produced_char
;
1934 int last_offset
= char_offset
;
1935 int last_id
= charset_ascii
;
1937 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1944 consumed_chars_base
= consumed_chars
;
1946 if (charbuf
>= charbuf_end
)
1955 if (EQ (eol_type
, Qdos
))
1959 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1960 goto no_more_source
;
1965 else if (EQ (eol_type
, Qmac
))
1974 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1975 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1976 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1978 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1980 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1984 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1990 consumed_chars
= consumed_chars_base
;
1991 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2000 if (last_id
!= charset_ascii
)
2001 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2003 last_offset
= char_offset
;
2007 consumed_chars
+= nchars
;
2014 consumed_chars
= consumed_chars_base
;
2016 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2022 if (last_id
!= charset_ascii
)
2023 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2024 coding
->consumed_char
+= consumed_chars_base
;
2025 coding
->consumed
= src_base
- coding
->source
;
2026 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2030 #define EMACS_MULE_LEADING_CODES(id, codes) \
2033 codes[0] = id, codes[1] = 0; \
2034 else if (id < 0xE0) \
2035 codes[0] = 0x9A, codes[1] = id; \
2036 else if (id < 0xF0) \
2037 codes[0] = 0x9B, codes[1] = id; \
2038 else if (id < 0xF5) \
2039 codes[0] = 0x9C, codes[1] = id; \
2041 codes[0] = 0x9D, codes[1] = id; \
2046 encode_coding_emacs_mule (coding
)
2047 struct coding_system
*coding
;
2049 int multibytep
= coding
->dst_multibyte
;
2050 int *charbuf
= coding
->charbuf
;
2051 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2052 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2053 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2055 int produced_chars
= 0;
2056 Lisp_Object attrs
, eol_type
, charset_list
;
2058 int preferred_charset_id
= -1;
2060 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2062 while (charbuf
< charbuf_end
)
2064 ASSURE_DESTINATION (safe_room
);
2069 /* Handle an annotation. */
2072 case CODING_ANNOTATE_COMPOSITION_MASK
:
2073 /* Not yet implemented. */
2075 case CODING_ANNOTATE_CHARSET_MASK
:
2076 preferred_charset_id
= charbuf
[3];
2077 if (preferred_charset_id
>= 0
2078 && NILP (Fmemq (make_number (preferred_charset_id
),
2080 preferred_charset_id
= -1;
2089 if (ASCII_CHAR_P (c
))
2090 EMIT_ONE_ASCII_BYTE (c
);
2091 else if (CHAR_BYTE8_P (c
))
2093 c
= CHAR_TO_BYTE8 (c
);
2098 struct charset
*charset
;
2102 unsigned char leading_codes
[2];
2104 if (preferred_charset_id
>= 0)
2106 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2107 if (! CHAR_CHARSET_P (c
, charset
))
2108 charset
= char_charset (c
, charset_list
, NULL
);
2111 charset
= char_charset (c
, charset_list
, &code
);
2114 c
= coding
->default_char
;
2115 if (ASCII_CHAR_P (c
))
2117 EMIT_ONE_ASCII_BYTE (c
);
2120 charset
= char_charset (c
, charset_list
, &code
);
2122 dimension
= CHARSET_DIMENSION (charset
);
2123 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2124 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2125 EMIT_ONE_BYTE (leading_codes
[0]);
2126 if (leading_codes
[1])
2127 EMIT_ONE_BYTE (leading_codes
[1]);
2129 EMIT_ONE_BYTE (code
);
2132 EMIT_ONE_BYTE (code
>> 8);
2133 EMIT_ONE_BYTE (code
& 0xFF);
2137 coding
->result
= CODING_RESULT_SUCCESS
;
2138 coding
->produced_char
+= produced_chars
;
2139 coding
->produced
= dst
- coding
->destination
;
2144 /*** 7. ISO2022 handlers ***/
2146 /* The following note describes the coding system ISO2022 briefly.
2147 Since the intention of this note is to help understand the
2148 functions in this file, some parts are NOT ACCURATE or are OVERLY
2149 SIMPLIFIED. For thorough understanding, please refer to the
2150 original document of ISO2022. This is equivalent to the standard
2151 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2153 ISO2022 provides many mechanisms to encode several character sets
2154 in 7-bit and 8-bit environments. For 7-bit environments, all text
2155 is encoded using bytes less than 128. This may make the encoded
2156 text a little bit longer, but the text passes more easily through
2157 several types of gateway, some of which strip off the MSB (Most
2160 There are two kinds of character sets: control character sets and
2161 graphic character sets. The former contain control characters such
2162 as `newline' and `escape' to provide control functions (control
2163 functions are also provided by escape sequences). The latter
2164 contain graphic characters such as 'A' and '-'. Emacs recognizes
2165 two control character sets and many graphic character sets.
2167 Graphic character sets are classified into one of the following
2168 four classes, according to the number of bytes (DIMENSION) and
2169 number of characters in one dimension (CHARS) of the set:
2170 - DIMENSION1_CHARS94
2171 - DIMENSION1_CHARS96
2172 - DIMENSION2_CHARS94
2173 - DIMENSION2_CHARS96
2175 In addition, each character set is assigned an identification tag,
2176 unique for each set, called the "final character" (denoted as <F>
2177 hereafter). The <F> of each character set is decided by ECMA(*)
2178 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2179 (0x30..0x3F are for private use only).
2181 Note (*): ECMA = European Computer Manufacturers Association
2183 Here are examples of graphic character sets [NAME(<F>)]:
2184 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2185 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2186 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2187 o DIMENSION2_CHARS96 -- none for the moment
2189 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2190 C0 [0x00..0x1F] -- control character plane 0
2191 GL [0x20..0x7F] -- graphic character plane 0
2192 C1 [0x80..0x9F] -- control character plane 1
2193 GR [0xA0..0xFF] -- graphic character plane 1
2195 A control character set is directly designated and invoked to C0 or
2196 C1 by an escape sequence. The most common case is that:
2197 - ISO646's control character set is designated/invoked to C0, and
2198 - ISO6429's control character set is designated/invoked to C1,
2199 and usually these designations/invocations are omitted in encoded
2200 text. In a 7-bit environment, only C0 can be used, and a control
2201 character for C1 is encoded by an appropriate escape sequence to
2202 fit into the environment. All control characters for C1 are
2203 defined to have corresponding escape sequences.
2205 A graphic character set is at first designated to one of four
2206 graphic registers (G0 through G3), then these graphic registers are
2207 invoked to GL or GR. These designations and invocations can be
2208 done independently. The most common case is that G0 is invoked to
2209 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2210 these invocations and designations are omitted in encoded text.
2211 In a 7-bit environment, only GL can be used.
2213 When a graphic character set of CHARS94 is invoked to GL, codes
2214 0x20 and 0x7F of the GL area work as control characters SPACE and
2215 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2218 There are two ways of invocation: locking-shift and single-shift.
2219 With locking-shift, the invocation lasts until the next different
2220 invocation, whereas with single-shift, the invocation affects the
2221 following character only and doesn't affect the locking-shift
2222 state. Invocations are done by the following control characters or
2225 ----------------------------------------------------------------------
2226 abbrev function cntrl escape seq description
2227 ----------------------------------------------------------------------
2228 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2229 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2230 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2231 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2232 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2233 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2234 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2235 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2236 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2237 ----------------------------------------------------------------------
2238 (*) These are not used by any known coding system.
2240 Control characters for these functions are defined by macros
2241 ISO_CODE_XXX in `coding.h'.
2243 Designations are done by the following escape sequences:
2244 ----------------------------------------------------------------------
2245 escape sequence description
2246 ----------------------------------------------------------------------
2247 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2248 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2249 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2250 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2251 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2252 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2253 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2254 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2255 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2256 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2257 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2258 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2259 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2260 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2261 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2262 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2263 ----------------------------------------------------------------------
2265 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2266 of dimension 1, chars 94, and final character <F>, etc...
2268 Note (*): Although these designations are not allowed in ISO2022,
2269 Emacs accepts them on decoding, and produces them on encoding
2270 CHARS96 character sets in a coding system which is characterized as
2271 7-bit environment, non-locking-shift, and non-single-shift.
2273 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2274 '(' must be omitted. We refer to this as "short-form" hereafter.
2276 Now you may notice that there are a lot of ways of encoding the
2277 same multilingual text in ISO2022. Actually, there exist many
2278 coding systems such as Compound Text (used in X11's inter client
2279 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2280 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2281 localized platforms), and all of these are variants of ISO2022.
2283 In addition to the above, Emacs handles two more kinds of escape
2284 sequences: ISO6429's direction specification and Emacs' private
2285 sequence for specifying character composition.
2287 ISO6429's direction specification takes the following form:
2288 o CSI ']' -- end of the current direction
2289 o CSI '0' ']' -- end of the current direction
2290 o CSI '1' ']' -- start of left-to-right text
2291 o CSI '2' ']' -- start of right-to-left text
2292 The control character CSI (0x9B: control sequence introducer) is
2293 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2295 Character composition specification takes the following form:
2296 o ESC '0' -- start relative composition
2297 o ESC '1' -- end composition
2298 o ESC '2' -- start rule-base composition (*)
2299 o ESC '3' -- start relative composition with alternate chars (**)
2300 o ESC '4' -- start rule-base composition with alternate chars (**)
2301 Since these are not standard escape sequences of any ISO standard,
2302 the use of them with these meanings is restricted to Emacs only.
2304 (*) This form is used only in Emacs 20.7 and older versions,
2305 but newer versions can safely decode it.
2306 (**) This form is used only in Emacs 21.1 and newer versions,
2307 and older versions can't decode it.
2309 Here's a list of example usages of these composition escape
2310 sequences (categorized by `enum composition_method').
2312 COMPOSITION_RELATIVE:
2313 ESC 0 CHAR [ CHAR ] ESC 1
2314 COMPOSITION_WITH_RULE:
2315 ESC 2 CHAR [ RULE CHAR ] ESC 1
2316 COMPOSITION_WITH_ALTCHARS:
2317 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2318 COMPOSITION_WITH_RULE_ALTCHARS:
2319 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2321 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID does not exceed CODING's max_charset_id and
   CODING's safe_charsets entry for ID is non-negative.  ID is
   evaluated twice, so it must not have side effects.
   NOTE(review): if the elements of safe_charsets are unsigned, the
   `>= 0' test is always true -- confirm the element type.  */
2323 #define SAFE_CHARSET_P(coding, id) \
2324 ((id) <= (coding)->max_charset_id \
2325 && (coding)->safe_charsets[id] >= 0)
/* Nonzero if CODING_ISO_INITIAL for index 1 of the coding system
   registered for CATEGORY in coding_categories is non-negative.
   NOTE(review): presumably index 1 is graphic register G1, i.e. some
   charset is initially designated there -- confirm against the
   definition of CODING_ISO_INITIAL.  */
2328 #define SHIFT_OUT_OK(category) \
2329 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2332 setup_iso_safe_charsets (attrs
)
2335 Lisp_Object charset_list
, safe_charsets
;
2336 Lisp_Object request
;
2337 Lisp_Object reg_usage
;
2340 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2343 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2344 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2345 && ! EQ (charset_list
, Viso_2022_charset_list
))
2347 CODING_ATTR_CHARSET_LIST (attrs
)
2348 = charset_list
= Viso_2022_charset_list
;
2349 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2352 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2356 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2358 int id
= XINT (XCAR (tail
));
2359 if (max_charset_id
< id
)
2360 max_charset_id
= id
;
2363 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2365 request
= AREF (attrs
, coding_attr_iso_request
);
2366 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2367 reg94
= XINT (XCAR (reg_usage
));
2368 reg96
= XINT (XCDR (reg_usage
));
2370 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2374 struct charset
*charset
;
2377 charset
= CHARSET_FROM_ID (XINT (id
));
2378 reg
= Fcdr (Fassq (id
, request
));
2380 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2381 else if (charset
->iso_chars_96
)
2384 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2389 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2392 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2397 Check if a text is encoded in one of ISO-2022 based coding systems.
2398 If it is, return 1, else return 0. */
2401 detect_coding_iso_2022 (coding
, detect_info
)
2402 struct coding_system
*coding
;
2403 struct coding_detection_info
*detect_info
;
2405 unsigned char *src
= coding
->source
, *src_base
= src
;
2406 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2407 int multibytep
= coding
->src_multibyte
;
2408 int single_shifting
= 0;
2411 int consumed_chars
= 0;
2416 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2418 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2420 struct coding_system
*this = &(coding_categories
[i
]);
2421 Lisp_Object attrs
, val
;
2423 attrs
= CODING_ID_ATTRS (this->id
);
2424 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2425 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2426 setup_iso_safe_charsets (attrs
);
2427 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2428 this->max_charset_id
= XSTRING (val
)->size
- 1;
2429 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2432 /* A coding system of this category is always ASCII compatible. */
2433 src
+= coding
->head_ascii
;
2435 while (rejected
!= CATEGORY_MASK_ISO
)
2441 if (inhibit_iso_escape_detection
)
2443 single_shifting
= 0;
2445 if (c
>= '(' && c
<= '/')
2447 /* Designation sequence for a charset of dimension 1. */
2449 if (c1
< ' ' || c1
>= 0x80
2450 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2451 /* Invalid designation sequence. Just ignore. */
2456 /* Designation sequence for a charset of dimension 2. */
2458 if (c
>= '@' && c
<= 'B')
2459 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2460 id
= iso_charset_table
[1][0][c
];
2461 else if (c
>= '(' && c
<= '/')
2464 if (c1
< ' ' || c1
>= 0x80
2465 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2466 /* Invalid designation sequence. Just ignore. */
2470 /* Invalid designation sequence. Just ignore it. */
2473 else if (c
== 'N' || c
== 'O')
2475 /* ESC <Fe> for SS2 or SS3. */
2476 single_shifting
= 1;
2477 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2480 else if (c
>= '0' && c
<= '4')
2482 /* ESC <Fp> for start/end composition. */
2483 found
|= CATEGORY_MASK_ISO
;
2488 /* Invalid escape sequence. Just ignore it. */
2492 /* We found a valid designation sequence for CHARSET. */
2493 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2494 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2496 found
|= CATEGORY_MASK_ISO_7
;
2498 rejected
|= CATEGORY_MASK_ISO_7
;
2499 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2501 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2503 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2504 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2506 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2508 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2509 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2511 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2513 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2518 /* Locking shift out/in. */
2519 if (inhibit_iso_escape_detection
)
2521 single_shifting
= 0;
2522 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2523 found
|= CATEGORY_MASK_ISO_ELSE
;
2527 /* Control sequence introducer. */
2528 single_shifting
= 0;
2529 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2530 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2531 goto check_extra_latin
;
2537 if (inhibit_iso_escape_detection
)
2539 single_shifting
= 1;
2540 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2541 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2542 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2543 found
|= CATEGORY_MASK_ISO_8_1
;
2544 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2545 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2546 found
|= CATEGORY_MASK_ISO_8_2
;
2547 goto check_extra_latin
;
2552 single_shifting
= 0;
2557 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2558 found
|= CATEGORY_MASK_ISO_8_1
;
2559 /* Check the length of succeeding codes of the range
2560 0xA0..0xFF. If the byte length is even, we include
2561 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2562 only when we are not single shifting. */
2563 if (! single_shifting
2564 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2567 while (src
< src_end
)
2575 if (i
& 1 && src
< src_end
)
2576 rejected
|= CATEGORY_MASK_ISO_8_2
;
2578 found
|= CATEGORY_MASK_ISO_8_2
;
2583 single_shifting
= 0;
2584 if (! VECTORP (Vlatin_extra_code_table
)
2585 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2587 rejected
= CATEGORY_MASK_ISO
;
2590 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2591 & CODING_ISO_FLAG_LATIN_EXTRA
)
2592 found
|= CATEGORY_MASK_ISO_8_1
;
2594 rejected
|= CATEGORY_MASK_ISO_8_1
;
2595 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2596 & CODING_ISO_FLAG_LATIN_EXTRA
)
2597 found
|= CATEGORY_MASK_ISO_8_2
;
2599 rejected
|= CATEGORY_MASK_ISO_8_2
;
2602 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2606 detect_info
->rejected
|= rejected
;
2607 detect_info
->found
|= (found
& ~rejected
);
2612 /* Set designation state into CODING. */
2613 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2617 if (final < '0' || final >= 128 \
2618 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2619 || !SAFE_CHARSET_P (coding, id)) \
2621 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2622 goto invalid_code; \
2624 prev = CODING_ISO_DESIGNATION (coding, reg); \
2625 if (id == charset_jisx0201_roman) \
2627 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2628 id = charset_ascii; \
2630 else if (id == charset_jisx0208_1978) \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2633 id = charset_jisx0208; \
2635 CODING_ISO_DESIGNATION (coding, reg) = id; \
2636 /* If there was an invalid designation to REG previously, and this \
2637 designation is ASCII to REG, we should keep this designation \
2639 if (prev == -2 && id == charset_ascii) \
2640 goto invalid_code; \
2644 #define MAYBE_FINISH_COMPOSITION() \
2647 if (composition_state == COMPOSING_NO) \
2649 /* It is assured that we have enough room for producing \
2650 characters stored in the table `components'. */ \
2651 if (charbuf + component_idx > charbuf_end) \
2652 goto no_more_source; \
2653 composition_state = COMPOSING_NO; \
2654 if (method == COMPOSITION_RELATIVE \
2655 || method == COMPOSITION_WITH_ALTCHARS) \
2657 for (i = 0; i < component_idx; i++) \
2658 *charbuf++ = components[i]; \
2659 char_offset += component_idx; \
2663 for (i = 0; i < component_idx; i += 2) \
2664 *charbuf++ = components[i]; \
2665 char_offset += (component_idx / 2) + 1; \
2670 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2671 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2672 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2673 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2674 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2677 #define DECODE_COMPOSITION_START(c1) \
2680 && composition_state == COMPOSING_COMPONENT_RULE) \
2682 component_len = component_idx; \
2683 composition_state = COMPOSING_CHAR; \
2689 MAYBE_FINISH_COMPOSITION (); \
2690 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2691 goto no_more_source; \
2692 for (p = src; p < src_end - 1; p++) \
2693 if (*p == ISO_CODE_ESC && p[1] == '1') \
2695 if (p == src_end - 1) \
2697 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2698 goto invalid_code; \
2699 goto no_more_source; \
2702 /* This is surely the start of a composition. */ \
2703 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2704 : c1 == '2' ? COMPOSITION_WITH_RULE \
2705 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2706 : COMPOSITION_WITH_RULE_ALTCHARS); \
2707 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2708 : COMPOSING_COMPONENT_CHAR); \
2709 component_idx = component_len = 0; \
2714 /* Handle composition end sequence ESC 1. */
2716 #define DECODE_COMPOSITION_END() \
2718 int nchars = (component_len > 0 ? component_idx - component_len \
2719 : method == COMPOSITION_RELATIVE ? component_idx \
2720 : (component_idx + 1) / 2); \
2722 int *saved_charbuf = charbuf; \
2723 int from = coding->produced_char + char_offset; \
2724 int to = from + nchars; \
2726 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2727 if (method != COMPOSITION_RELATIVE) \
2729 if (component_len == 0) \
2730 for (i = 0; i < component_idx; i++) \
2731 *charbuf++ = components[i]; \
2733 for (i = 0; i < component_len; i++) \
2734 *charbuf++ = components[i]; \
2735 *saved_charbuf = saved_charbuf - charbuf; \
2737 if (method == COMPOSITION_WITH_RULE) \
2738 for (i = 0; i < component_idx; i += 2, char_offset++) \
2739 *charbuf++ = components[i]; \
2741 for (i = component_len; i < component_idx; i++, char_offset++) \
2742 *charbuf++ = components[i]; \
2743 coding->annotated = 1; \
2744 composition_state = COMPOSING_NO; \
2748 /* Decode a composition rule from the byte C1 (and maybe one more byte
2749 from SRC) and store the resulting encoded composition rule back
2750 into C1. */
2752 #define DECODE_COMPOSITION_RULE(c1) \
2755 if (c1 < 81) /* old format (before ver.21) */ \
2757 int gref = (c1) / 9; \
2758 int nref = (c1) % 9; \
2759 if (gref == 4) gref = 10; \
2760 if (nref == 4) nref = 10; \
2761 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2763 else if (c1 < 93) /* new format (after ver.21) */ \
2765 ONE_MORE_BYTE (c2); \
2766 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2773 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2776 decode_coding_iso_2022 (coding
)
2777 struct coding_system
*coding
;
2779 unsigned char *src
= coding
->source
+ coding
->consumed
;
2780 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2781 unsigned char *src_base
;
2782 int *charbuf
= coding
->charbuf
;
2784 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2785 int consumed_chars
= 0, consumed_chars_base
;
2786 int multibytep
= coding
->src_multibyte
;
2787 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2788 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2789 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2790 struct charset
*charset
;
2792 /* For handling composition sequence. */
2793 #define COMPOSING_NO 0
2794 #define COMPOSING_CHAR 1
2795 #define COMPOSING_RULE 2
2796 #define COMPOSING_COMPONENT_CHAR 3
2797 #define COMPOSING_COMPONENT_RULE 4
2799 int composition_state
= COMPOSING_NO
;
2800 enum composition_method method
;
2801 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2804 Lisp_Object attrs
, eol_type
, charset_list
;
2805 int char_offset
= coding
->produced_char
;
2806 int last_offset
= char_offset
;
2807 int last_id
= charset_ascii
;
2809 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2810 setup_iso_safe_charsets (attrs
);
2817 consumed_chars_base
= consumed_chars
;
2819 if (charbuf
>= charbuf_end
)
2824 /* We produce at most one character. */
2825 switch (iso_code_class
[c1
])
2827 case ISO_0x20_or_0x7F
:
2828 if (composition_state
!= COMPOSING_NO
)
2830 if (composition_state
== COMPOSING_RULE
2831 || composition_state
== COMPOSING_COMPONENT_RULE
)
2833 DECODE_COMPOSITION_RULE (c1
);
2834 components
[component_idx
++] = c1
;
2835 composition_state
--;
2839 if (charset_id_0
< 0
2840 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2841 /* This is SPACE or DEL. */
2842 charset
= CHARSET_FROM_ID (charset_ascii
);
2844 charset
= CHARSET_FROM_ID (charset_id_0
);
2847 case ISO_graphic_plane_0
:
2848 if (composition_state
!= COMPOSING_NO
)
2850 if (composition_state
== COMPOSING_RULE
2851 || composition_state
== COMPOSING_COMPONENT_RULE
)
2853 DECODE_COMPOSITION_RULE (c1
);
2854 components
[component_idx
++] = c1
;
2855 composition_state
--;
2859 charset
= CHARSET_FROM_ID (charset_id_0
);
2862 case ISO_0xA0_or_0xFF
:
2863 if (charset_id_1
< 0
2864 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2865 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2867 /* This is a graphic character, we fall down ... */
2869 case ISO_graphic_plane_1
:
2870 if (charset_id_1
< 0)
2872 charset
= CHARSET_FROM_ID (charset_id_1
);
2875 case ISO_carriage_return
:
2878 if (EQ (eol_type
, Qdos
))
2882 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
2883 goto no_more_source
;
2888 else if (EQ (eol_type
, Qmac
))
2894 MAYBE_FINISH_COMPOSITION ();
2895 charset
= CHARSET_FROM_ID (charset_ascii
);
2899 MAYBE_FINISH_COMPOSITION ();
2903 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2904 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2906 CODING_ISO_INVOCATION (coding
, 0) = 1;
2907 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2911 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2913 CODING_ISO_INVOCATION (coding
, 0) = 0;
2914 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2917 case ISO_single_shift_2_7
:
2918 case ISO_single_shift_2
:
2919 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2921 /* SS2 is handled as an escape sequence of ESC 'N' */
2923 goto label_escape_sequence
;
2925 case ISO_single_shift_3
:
2926 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2928 /* SS3 is handled as an escape sequence of ESC 'O' */
2930 goto label_escape_sequence
;
2932 case ISO_control_sequence_introducer
:
2933 /* CSI is handled as an escape sequence of ESC '[' ... */
2935 goto label_escape_sequence
;
2939 label_escape_sequence
:
2940 /* Escape sequences handled here are invocation,
2941 designation, direction specification, and character
2942 composition specification. */
2945 case '&': /* revision of following character set */
2947 if (!(c1
>= '@' && c1
<= '~'))
2950 if (c1
!= ISO_CODE_ESC
)
2953 goto label_escape_sequence
;
2955 case '$': /* designation of 2-byte character set */
2956 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2959 if (c1
>= '@' && c1
<= 'B')
2960 { /* designation of JISX0208.1978, GB2312.1980,
2962 DECODE_DESIGNATION (0, 2, 0, c1
);
2964 else if (c1
>= 0x28 && c1
<= 0x2B)
2965 { /* designation of DIMENSION2_CHARS94 character set */
2967 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2969 else if (c1
>= 0x2C && c1
<= 0x2F)
2970 { /* designation of DIMENSION2_CHARS96 character set */
2972 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2976 /* We must update these variables now. */
2977 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2978 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2981 case 'n': /* invocation of locking-shift-2 */
2982 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2983 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2985 CODING_ISO_INVOCATION (coding
, 0) = 2;
2986 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2989 case 'o': /* invocation of locking-shift-3 */
2990 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2991 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2993 CODING_ISO_INVOCATION (coding
, 0) = 3;
2994 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2997 case 'N': /* invocation of single-shift-2 */
2998 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2999 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3001 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3003 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3007 case 'O': /* invocation of single-shift-3 */
3008 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3009 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3011 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3013 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3017 case '0': case '2': case '3': case '4': /* start composition */
3018 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3020 DECODE_COMPOSITION_START (c1
);
3023 case '1': /* end composition */
3024 if (composition_state
== COMPOSING_NO
)
3026 DECODE_COMPOSITION_END ();
3029 case '[': /* specification of direction */
/* Test the DIRECTION flag bit.  The inner parentheses are required:
   unary `!' binds tighter than `&', so without them the negation
   would apply to the whole flag word instead of to the masked bit.  */
3030 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3032 /* For the moment, nested direction is not supported.
3033 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3034 left-to-right, and nonzero means right-to-left. */
3038 case ']': /* end of the current direction */
3039 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3041 case '0': /* end of the current direction */
3042 case '1': /* start of left-to-right direction */
3045 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3050 case '2': /* start of right-to-left direction */
3053 coding
->mode
|= CODING_MODE_DIRECTION
;
3067 /* CTEXT extended segment:
3068 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3069 We keep these bytes as is for the moment.
3070 They may be decoded by post-read-conversion. */
3074 ONE_MORE_BYTE (dim
);
3077 size
= ((M
- 128) * 128) + (L
- 128);
3078 if (charbuf
+ 8 + size
> charbuf_end
)
3080 *charbuf
++ = ISO_CODE_ESC
;
3084 *charbuf
++ = BYTE8_TO_CHAR (M
);
3085 *charbuf
++ = BYTE8_TO_CHAR (L
);
3089 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3094 /* XFree86 extension for embedding UTF-8 in CTEXT:
3095 ESC % G --UTF-8-BYTES-- ESC % @
3096 We keep these bytes as is for the moment.
3097 They may be decoded by post-read-conversion. */
3100 if (p
+ 6 > charbuf_end
)
3102 *p
++ = ISO_CODE_ESC
;
3105 while (p
< charbuf_end
)
3108 if (c1
== ISO_CODE_ESC
3109 && src
+ 1 < src_end
3113 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3115 if (p
+ 3 > charbuf_end
)
3117 *p
++ = ISO_CODE_ESC
;
3128 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3130 if (c1
>= 0x28 && c1
<= 0x2B)
3131 { /* designation of DIMENSION1_CHARS94 character set */
3133 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3135 else if (c1
>= 0x2C && c1
<= 0x2F)
3136 { /* designation of DIMENSION1_CHARS96 character set */
3138 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3142 /* We must update these variables now. */
3143 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3144 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3149 if (charset
->id
!= charset_ascii
3150 && last_id
!= charset
->id
)
3152 if (last_id
!= charset_ascii
)
3153 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3154 last_id
= charset
->id
;
3155 last_offset
= char_offset
;
3158 /* Now we know CHARSET and 1st position code C1 of a character.
3159 Produce a decoded character while getting 2nd position code
3162 if (CHARSET_DIMENSION (charset
) > 1)
3165 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3166 /* C2 is not in a valid range. */
3168 c1
= (c1
<< 8) | (c2
& 0x7F);
3169 if (CHARSET_DIMENSION (charset
) > 2)
3172 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3173 /* C2 is not in a valid range. */
3175 c1
= (c1
<< 8) | (c2
& 0x7F);
3179 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3182 MAYBE_FINISH_COMPOSITION ();
3183 for (; src_base
< src
; src_base
++, char_offset
++)
3185 if (ASCII_BYTE_P (*src_base
))
3186 *charbuf
++ = *src_base
;
3188 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3192 else if (composition_state
== COMPOSING_NO
)
3199 components
[component_idx
++] = c
;
3200 if (method
== COMPOSITION_WITH_RULE
3201 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3203 composition_state
++;
3208 MAYBE_FINISH_COMPOSITION ();
3210 consumed_chars
= consumed_chars_base
;
3212 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3222 if (last_id
!= charset_ascii
)
3223 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3224 coding
->consumed_char
+= consumed_chars_base
;
3225 coding
->consumed
= src_base
- coding
->source
;
3226 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3230 /* ISO2022 encoding stuff. */
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JISX0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3267 if (revision >= 0) \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
3304 #define ENCODE_SINGLE_SHIFT_2 \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3314 #define ENCODE_SINGLE_SHIFT_3 \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3324 /* The following four macros produce codes (control character or
3325 escape sequence) for ISO2022 locking-shift functions (shift-in,
3326 shift-out, locking-shift-2, and locking-shift-3). */
3328 #define ENCODE_SHIFT_IN \
3330 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3331 CODING_ISO_INVOCATION (coding, 0) = 0; \
3335 #define ENCODE_SHIFT_OUT \
3337 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3338 CODING_ISO_INVOCATION (coding, 0) = 1; \
3342 #define ENCODE_LOCKING_SHIFT_2 \
3344 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3345 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Invoke graphic register 3 to graphic plane 0: emit the
   locking-shift-3 sequence ESC 'o' (the sequence the decoder handles
   as locking-shift-3; ESC 'n' is locking-shift-2) and record the new
   invocation in CODING.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3362 int id = CHARSET_ID (charset); \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3406 int id = CHARSET_ID (charset); \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3439 dst = encode_invocation_designation (charset, coding, dst, \
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3460 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3461 struct charset
*charset
;
3462 struct coding_system
*coding
;
3466 int multibytep
= coding
->dst_multibyte
;
3467 int produced_chars
= *p_nchars
;
3468 int reg
; /* graphic register number */
3469 int id
= CHARSET_ID (charset
);
3471 /* At first, check designations. */
3472 for (reg
= 0; reg
< 4; reg
++)
3473 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg
= CODING_ISO_REQUEST (coding
, id
);
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3486 ENCODE_DESIGNATION (charset
, reg
, coding
);
3489 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3490 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3496 case 0: /* graphic register 0 */
3500 case 1: /* graphic register 1 */
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3506 ENCODE_SINGLE_SHIFT_2
;
3508 ENCODE_LOCKING_SHIFT_2
;
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3513 ENCODE_SINGLE_SHIFT_3
;
3515 ENCODE_LOCKING_SHIFT_3
;
3520 *p_nchars
= produced_chars
;
/* The following three macros produce codes for indicating the
   direction of text.  */

/* Emit the control sequence introducer.  When the SEVEN_BITS flag is
   set in the ISO flags of CODING, the two-byte form ESC '[' is used;
   otherwise the single 8-bit control character CSI is emitted.  The
   flag is tested as a bit (`&'), since other flag bits may be set in
   the same word.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');			\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Emit the sequence that starts right-to-left text: the control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Emit the control sequence introducer followed by '0' ']', which the
   decoder handles by clearing CODING_MODE_DIRECTION (i.e. back to
   left-to-right).  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like macro, so it is used without an argument list.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3554 struct charset *charset; \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
3575 static unsigned char *
3576 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3577 struct coding_system
*coding
;
3578 int *charbuf
, *charbuf_end
;
3581 struct charset
*charset
;
3582 /* Table of charsets to be designated to each graphic register. */
3584 int c
, found
= 0, reg
;
3585 int produced_chars
= 0;
3586 int multibytep
= coding
->dst_multibyte
;
3588 Lisp_Object charset_list
;
3590 attrs
= CODING_ID_ATTRS (coding
->id
);
3591 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3592 if (EQ (charset_list
, Qiso_2022
))
3593 charset_list
= Viso_2022_charset_list
;
3595 for (reg
= 0; reg
< 4; reg
++)
3605 charset
= char_charset (c
, charset_list
, NULL
);
3606 id
= CHARSET_ID (charset
);
3607 reg
= CODING_ISO_REQUEST (coding
, id
);
3608 if (reg
>= 0 && r
[reg
] < 0)
3617 for (reg
= 0; reg
< 4; reg
++)
3619 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3629 encode_coding_iso_2022 (coding
)
3630 struct coding_system
*coding
;
3632 int multibytep
= coding
->dst_multibyte
;
3633 int *charbuf
= coding
->charbuf
;
3634 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3635 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3636 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3639 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding
));
3641 int produced_chars
= 0;
3642 Lisp_Object attrs
, eol_type
, charset_list
;
3643 int ascii_compatible
;
3645 int preferred_charset_id
= -1;
3647 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3648 setup_iso_safe_charsets (attrs
);
3649 /* Charset list may have been changed. */
3650 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3651 coding
->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3654 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3656 while (charbuf
< charbuf_end
)
3658 ASSURE_DESTINATION (safe_room
);
3660 if (bol_designation
)
3662 unsigned char *dst_prev
= dst
;
3664 /* We have to produce designation sequences if any now. */
3665 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3666 bol_designation
= 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars
+= dst
- dst_prev
;
3675 /* Handle an annotation. */
3678 case CODING_ANNOTATE_COMPOSITION_MASK
:
3679 /* Not yet implemented. */
3681 case CODING_ANNOTATE_CHARSET_MASK
:
3682 preferred_charset_id
= charbuf
[3];
3683 if (preferred_charset_id
>= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id
),
3686 preferred_charset_id
= -1;
3695 /* Now encode the character C. */
3696 if (c
< 0x20 || c
== 0x7F)
3699 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3701 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3707 for (i
= 0; i
< 4; i
++)
3708 CODING_ISO_DESIGNATION (coding
, i
)
3709 = CODING_ISO_INITIAL (coding
, i
);
3712 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3714 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c
);
3718 else if (ASCII_CHAR_P (c
))
3720 if (ascii_compatible
)
3721 EMIT_ONE_ASCII_BYTE (c
);
3724 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3725 ENCODE_ISO_CHARACTER (charset
, c
);
3728 else if (CHAR_BYTE8_P (c
))
3730 c
= CHAR_TO_BYTE8 (c
);
3735 struct charset
*charset
;
3737 if (preferred_charset_id
>= 0)
3739 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3740 if (! CHAR_CHARSET_P (c
, charset
))
3741 charset
= char_charset (c
, charset_list
, NULL
);
3744 charset
= char_charset (c
, charset_list
, NULL
);
3747 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3749 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3750 charset
= CHARSET_FROM_ID (charset_ascii
);
3754 c
= coding
->default_char
;
3755 charset
= char_charset (c
, charset_list
, NULL
);
3758 ENCODE_ISO_CHARACTER (charset
, c
);
3762 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3765 ASSURE_DESTINATION (safe_room
);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3768 coding
->result
= CODING_RESULT_SUCCESS
;
3769 CODING_ISO_BOL (coding
) = bol_designation
;
3770 coding
->produced_char
+= produced_chars
;
3771 coding
->produced
= dst
- coding
->destination
;
3776 /*** 8. Shift-JIS and BIG5 handlers ***/
3778 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3779 quite widely. So, for the moment, Emacs supports them directly in
3780 C code. But, in the future, they may be supported only by CCL. */
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in two bytes, but its two position-codes are divided and
3787 shifted so that they fit in the ranges below.
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and each of its characters is encoded in two bytes.
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3817 detect_coding_sjis (coding
, detect_info
)
3818 struct coding_system
*coding
;
3819 struct coding_detection_info
*detect_info
;
3821 unsigned char *src
= coding
->source
, *src_base
= src
;
3822 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3823 int multibytep
= coding
->src_multibyte
;
3824 int consumed_chars
= 0;
3829 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src
+= coding
->head_ascii
;
3840 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3843 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3845 found
= CATEGORY_MASK_SJIS
;
3847 else if (c
>= 0xA0 && c
< 0xE0)
3848 found
= CATEGORY_MASK_SJIS
;
3852 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3856 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3858 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3861 detect_info
->found
|= found
;
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3870 detect_coding_big5 (coding
, detect_info
)
3871 struct coding_system
*coding
;
3872 struct coding_detection_info
*detect_info
;
3874 unsigned char *src
= coding
->source
, *src_base
= src
;
3875 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3876 int multibytep
= coding
->src_multibyte
;
3877 int consumed_chars
= 0;
3882 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src
+= coding
->head_ascii
;
3896 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3898 found
= CATEGORY_MASK_BIG5
;
3903 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3907 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3909 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3912 detect_info
->found
|= found
;
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 Decode SJIS text (BIG5 text is handled by decode_coding_big5 ()). */
3920 decode_coding_sjis (coding
)
3921 struct coding_system
*coding
;
3923 unsigned char *src
= coding
->source
+ coding
->consumed
;
3924 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3925 unsigned char *src_base
;
3926 int *charbuf
= coding
->charbuf
;
3927 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3928 int consumed_chars
= 0, consumed_chars_base
;
3929 int multibytep
= coding
->src_multibyte
;
3930 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3931 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3932 int char_offset
= coding
->produced_char
;
3933 int last_offset
= char_offset
;
3934 int last_id
= charset_ascii
;
3936 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3939 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3940 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3941 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3948 consumed_chars_base
= consumed_chars
;
3950 if (charbuf
>= charbuf_end
)
3957 if (EQ (eol_type
, Qdos
))
3961 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
3962 goto no_more_source
;
3967 else if (EQ (eol_type
, Qmac
))
3972 struct charset
*charset
;
3975 charset
= charset_roman
;
3980 if (c
< 0xA0 || c
>= 0xE0)
3982 /* SJIS -> JISX0208 */
3984 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3988 charset
= charset_kanji
;
3992 /* SJIS -> JISX0201-Kana */
3994 charset
= charset_kana
;
3997 if (charset
->id
!= charset_ascii
3998 && last_id
!= charset
->id
)
4000 if (last_id
!= charset_ascii
)
4001 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4002 last_id
= charset
->id
;
4003 last_offset
= char_offset
;
4005 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4013 consumed_chars
= consumed_chars_base
;
4015 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4021 if (last_id
!= charset_ascii
)
4022 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4023 coding
->consumed_char
+= consumed_chars_base
;
4024 coding
->consumed
= src_base
- coding
->source
;
4025 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4029 decode_coding_big5 (coding
)
4030 struct coding_system
*coding
;
4032 unsigned char *src
= coding
->source
+ coding
->consumed
;
4033 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4034 unsigned char *src_base
;
4035 int *charbuf
= coding
->charbuf
;
4036 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4037 int consumed_chars
= 0, consumed_chars_base
;
4038 int multibytep
= coding
->src_multibyte
;
4039 struct charset
*charset_roman
, *charset_big5
;
4040 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4041 int char_offset
= coding
->produced_char
;
4042 int last_offset
= char_offset
;
4043 int last_id
= charset_ascii
;
4045 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4047 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4048 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4055 consumed_chars_base
= consumed_chars
;
4057 if (charbuf
>= charbuf_end
)
4064 if (EQ (eol_type
, Qdos
))
4068 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4069 goto no_more_source
;
4074 else if (EQ (eol_type
, Qmac
))
4079 struct charset
*charset
;
4081 charset
= charset_roman
;
4085 if (c
< 0xA1 || c
> 0xFE)
4088 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4091 charset
= charset_big5
;
4093 if (charset
->id
!= charset_ascii
4094 && last_id
!= charset
->id
)
4096 if (last_id
!= charset_ascii
)
4097 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4098 last_id
= charset
->id
;
4099 last_offset
= char_offset
;
4101 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4110 consumed_chars
= consumed_chars_base
;
4112 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4118 if (last_id
!= charset_ascii
)
4119 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4120 coding
->consumed_char
+= consumed_chars_base
;
4121 coding
->consumed
= src_base
- coding
->source
;
4122 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4125 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4126 This function can encode charsets `ascii', `katakana-jisx0201',
4127 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4128 are sure that all these charsets are registered as official charsets
4129 (i.e. do not have extended leading-codes). Characters of other
4130 charsets are produced without any encoding. This function encodes
4131 SJIS text; BIG5 text is encoded by encode_coding_big5 (). */
4134 encode_coding_sjis (coding
)
4135 struct coding_system
*coding
;
4137 int multibytep
= coding
->dst_multibyte
;
4138 int *charbuf
= coding
->charbuf
;
4139 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4140 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4141 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4143 int produced_chars
= 0;
4144 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4145 int ascii_compatible
;
4146 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4149 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4151 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4152 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4153 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4155 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4157 while (charbuf
< charbuf_end
)
4159 ASSURE_DESTINATION (safe_room
);
4161 /* Now encode the character C. */
4162 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4163 EMIT_ONE_ASCII_BYTE (c
);
4164 else if (CHAR_BYTE8_P (c
))
4166 c
= CHAR_TO_BYTE8 (c
);
4172 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4176 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4178 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4179 charset
= CHARSET_FROM_ID (charset_ascii
);
4183 c
= coding
->default_char
;
4184 charset
= char_charset (c
, charset_list
, &code
);
4187 if (code
== CHARSET_INVALID_CODE (charset
))
4189 if (charset
== charset_kanji
)
4193 c1
= code
>> 8, c2
= code
& 0xFF;
4194 EMIT_TWO_BYTES (c1
, c2
);
4196 else if (charset
== charset_kana
)
4197 EMIT_ONE_BYTE (code
| 0x80);
4199 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4202 coding
->result
= CODING_RESULT_SUCCESS
;
4203 coding
->produced_char
+= produced_chars
;
4204 coding
->produced
= dst
- coding
->destination
;
4209 encode_coding_big5 (coding
)
4210 struct coding_system
*coding
;
4212 int multibytep
= coding
->dst_multibyte
;
4213 int *charbuf
= coding
->charbuf
;
4214 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4215 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4216 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4218 int produced_chars
= 0;
4219 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4220 int ascii_compatible
;
4221 struct charset
*charset_roman
, *charset_big5
;
4224 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4226 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4227 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4228 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4230 while (charbuf
< charbuf_end
)
4232 ASSURE_DESTINATION (safe_room
);
4234 /* Now encode the character C. */
4235 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4236 EMIT_ONE_ASCII_BYTE (c
);
4237 else if (CHAR_BYTE8_P (c
))
4239 c
= CHAR_TO_BYTE8 (c
);
4245 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4249 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4251 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4252 charset
= CHARSET_FROM_ID (charset_ascii
);
4256 c
= coding
->default_char
;
4257 charset
= char_charset (c
, charset_list
, &code
);
4260 if (code
== CHARSET_INVALID_CODE (charset
))
4262 if (charset
== charset_big5
)
4266 c1
= code
>> 8, c2
= code
& 0xFF;
4267 EMIT_TWO_BYTES (c1
, c2
);
4270 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4273 coding
->result
= CODING_RESULT_SUCCESS
;
4274 coding
->produced_char
+= produced_chars
;
4275 coding
->produced
= dst
- coding
->destination
;
4280 /*** 9. CCL handlers ***/
4282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4283 Check if a text is encoded in a coding system of which
4284 encoder/decoder are written in CCL program. If it is, return
4285 CATEGORY_MASK_CCL, else return 0. */
4288 detect_coding_ccl (coding
, detect_info
)
4289 struct coding_system
*coding
;
4290 struct coding_detection_info
*detect_info
;
4292 unsigned char *src
= coding
->source
, *src_base
= src
;
4293 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4294 int multibytep
= coding
->src_multibyte
;
4295 int consumed_chars
= 0;
4297 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4298 int head_ascii
= coding
->head_ascii
;
4301 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4303 coding
= &coding_categories
[coding_category_ccl
];
4304 attrs
= CODING_ID_ATTRS (coding
->id
);
4305 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4314 if ((valids
[c
] > 1))
4315 found
= CATEGORY_MASK_CCL
;
4317 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4321 detect_info
->found
|= found
;
4326 decode_coding_ccl (coding
)
4327 struct coding_system
*coding
;
4329 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4330 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4331 int *charbuf
= coding
->charbuf
;
4332 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4333 int consumed_chars
= 0;
4334 int multibytep
= coding
->src_multibyte
;
4335 struct ccl_program ccl
;
4336 int source_charbuf
[1024];
4337 int source_byteidx
[1024];
4338 Lisp_Object attrs
, eol_type
, charset_list
;
4340 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4341 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4343 while (src
< src_end
)
4345 const unsigned char *p
= src
;
4346 int *source
, *source_end
;
4350 while (i
< 1024 && p
< src_end
)
4352 source_byteidx
[i
] = p
- src
;
4353 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4356 while (i
< 1024 && p
< src_end
)
4357 source_charbuf
[i
++] = *p
++;
4359 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4362 source
= source_charbuf
;
4363 source_end
= source
+ i
;
4364 while (source
< source_end
)
4366 ccl_driver (&ccl
, source
, charbuf
,
4367 source_end
- source
, charbuf_end
- charbuf
,
4369 source
+= ccl
.consumed
;
4370 charbuf
+= ccl
.produced
;
4371 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4374 if (source
< source_end
)
4375 src
+= source_byteidx
[source
- source_charbuf
];
4378 consumed_chars
+= source
- source_charbuf
;
4380 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4381 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4387 case CCL_STAT_SUSPEND_BY_SRC
:
4388 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4390 case CCL_STAT_SUSPEND_BY_DST
:
4393 case CCL_STAT_INVALID_CMD
:
4394 coding
->result
= CODING_RESULT_INTERRUPT
;
4397 coding
->result
= CODING_RESULT_SUCCESS
;
4400 coding
->consumed_char
+= consumed_chars
;
4401 coding
->consumed
= src
- coding
->source
;
4402 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4406 encode_coding_ccl (coding
)
4407 struct coding_system
*coding
;
4409 struct ccl_program ccl
;
4410 int multibytep
= coding
->dst_multibyte
;
4411 int *charbuf
= coding
->charbuf
;
4412 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4413 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4414 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4415 unsigned char *adjusted_dst_end
= dst_end
- 1;
4416 int destination_charbuf
[1024];
4417 int i
, produced_chars
= 0;
4418 Lisp_Object attrs
, eol_type
, charset_list
;
4420 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4421 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4423 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4424 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4426 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4428 int dst_bytes
= dst_end
- dst
;
4429 if (dst_bytes
> 1024)
4432 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4433 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4434 charbuf
+= ccl
.consumed
;
4436 for (i
= 0; i
< ccl
.produced
; i
++)
4437 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4440 for (i
= 0; i
< ccl
.produced
; i
++)
4441 *dst
++ = destination_charbuf
[i
] & 0xFF;
4442 produced_chars
+= ccl
.produced
;
4448 case CCL_STAT_SUSPEND_BY_SRC
:
4449 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4451 case CCL_STAT_SUSPEND_BY_DST
:
4452 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4455 case CCL_STAT_INVALID_CMD
:
4456 coding
->result
= CODING_RESULT_INTERRUPT
;
4459 coding
->result
= CODING_RESULT_SUCCESS
;
4463 coding
->produced_char
+= produced_chars
;
4464 coding
->produced
= dst
- coding
->destination
;
4470 /*** 10, 11. no-conversion handlers ***/
4472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4475 decode_coding_raw_text (coding
)
4476 struct coding_system
*coding
;
4478 coding
->chars_at_source
= 1;
4479 coding
->consumed_char
= 0;
4480 coding
->consumed
= 0;
4481 coding
->result
= CODING_RESULT_SUCCESS
;
4485 encode_coding_raw_text (coding
)
4486 struct coding_system
*coding
;
4488 int multibytep
= coding
->dst_multibyte
;
4489 int *charbuf
= coding
->charbuf
;
4490 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4491 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4492 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4493 int produced_chars
= 0;
4498 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4500 if (coding
->src_multibyte
)
4501 while (charbuf
< charbuf_end
)
4503 ASSURE_DESTINATION (safe_room
);
4505 if (ASCII_CHAR_P (c
))
4506 EMIT_ONE_ASCII_BYTE (c
);
4507 else if (CHAR_BYTE8_P (c
))
4509 c
= CHAR_TO_BYTE8 (c
);
4514 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4516 CHAR_STRING_ADVANCE (c
, p1
);
4519 EMIT_ONE_BYTE (*p0
);
4525 while (charbuf
< charbuf_end
)
4527 ASSURE_DESTINATION (safe_room
);
4534 if (coding
->src_multibyte
)
4536 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4538 while (charbuf
< charbuf_end
)
4540 ASSURE_DESTINATION (safe_room
);
4542 if (ASCII_CHAR_P (c
))
4544 else if (CHAR_BYTE8_P (c
))
4545 *dst
++ = CHAR_TO_BYTE8 (c
);
4547 CHAR_STRING_ADVANCE (c
, dst
);
4553 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4554 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4555 *dst
++ = *charbuf
++;
4556 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4559 coding
->result
= CODING_RESULT_SUCCESS
;
4560 coding
->produced_char
+= produced_chars
;
4561 coding
->produced
= dst
- coding
->destination
;
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Check if a text is encoded in a charset-based coding system. If it
4567 is, return 1, else return 0. */
4570 detect_coding_charset (coding
, detect_info
)
4571 struct coding_system
*coding
;
4572 struct coding_detection_info
*detect_info
;
4574 unsigned char *src
= coding
->source
, *src_base
= src
;
4575 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4576 int multibytep
= coding
->src_multibyte
;
4577 int consumed_chars
= 0;
4578 Lisp_Object attrs
, valids
;
4581 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4583 coding
= &coding_categories
[coding_category_charset
];
4584 attrs
= CODING_ID_ATTRS (coding
->id
);
4585 valids
= AREF (attrs
, coding_attr_charset_valids
);
4587 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4588 src
+= coding
->head_ascii
;
4595 if (NILP (AREF (valids
, c
)))
4598 found
= CATEGORY_MASK_CHARSET
;
4600 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4604 detect_info
->found
|= found
;
4609 decode_coding_charset (coding
)
4610 struct coding_system
*coding
;
4612 unsigned char *src
= coding
->source
+ coding
->consumed
;
4613 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4614 unsigned char *src_base
;
4615 int *charbuf
= coding
->charbuf
;
4616 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4617 int consumed_chars
= 0, consumed_chars_base
;
4618 int multibytep
= coding
->src_multibyte
;
4619 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4620 int char_offset
= coding
->produced_char
;
4621 int last_offset
= char_offset
;
4622 int last_id
= charset_ascii
;
4624 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4625 valids
= AREF (attrs
, coding_attr_charset_valids
);
4632 consumed_chars_base
= consumed_chars
;
4634 if (charbuf
>= charbuf_end
)
4640 /* Here we assume that no charset maps '\r' to something
4642 if (EQ (eol_type
, Qdos
))
4646 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4647 goto no_more_source
;
4652 else if (EQ (eol_type
, Qmac
))
4658 struct charset
*charset
;
4663 val
= AREF (valids
, c
);
4668 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4669 dim
= CHARSET_DIMENSION (charset
);
4673 code
= (code
<< 8) | c
;
4676 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4681 /* VAL is a list of charset IDs. It is assured that the
4682 list is sorted by charset dimensions (smaller one
4686 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4687 dim
= CHARSET_DIMENSION (charset
);
4691 code
= (code
<< 8) | c
;
4694 CODING_DECODE_CHAR (coding
, src
, src_base
,
4695 src_end
, charset
, code
, c
);
4703 if (charset
->id
!= charset_ascii
4704 && last_id
!= charset
->id
)
4706 if (last_id
!= charset_ascii
)
4707 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4708 last_id
= charset
->id
;
4709 last_offset
= char_offset
;
4718 consumed_chars
= consumed_chars_base
;
4720 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4726 if (last_id
!= charset_ascii
)
4727 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4728 coding
->consumed_char
+= consumed_chars_base
;
4729 coding
->consumed
= src_base
- coding
->source
;
4730 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4734 encode_coding_charset (coding
)
4735 struct coding_system
*coding
;
4737 int multibytep
= coding
->dst_multibyte
;
4738 int *charbuf
= coding
->charbuf
;
4739 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4740 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4741 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4742 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4743 int produced_chars
= 0;
4744 Lisp_Object attrs
, eol_type
, charset_list
;
4745 int ascii_compatible
;
4748 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4749 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4751 while (charbuf
< charbuf_end
)
4753 struct charset
*charset
;
4756 ASSURE_DESTINATION (safe_room
);
4758 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4759 EMIT_ONE_ASCII_BYTE (c
);
4760 else if (CHAR_BYTE8_P (c
))
4762 c
= CHAR_TO_BYTE8 (c
);
4767 charset
= char_charset (c
, charset_list
, &code
);
4770 if (CHARSET_DIMENSION (charset
) == 1)
4771 EMIT_ONE_BYTE (code
);
4772 else if (CHARSET_DIMENSION (charset
) == 2)
4773 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4774 else if (CHARSET_DIMENSION (charset
) == 3)
4775 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4777 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4778 (code
>> 8) & 0xFF, code
& 0xFF);
4782 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4783 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4785 c
= coding
->default_char
;
4791 coding
->result
= CODING_RESULT_SUCCESS
;
4792 coding
->produced_char
+= produced_chars
;
4793 coding
->produced
= dst
- coding
->destination
;
4798 /*** 10. C library functions ***/
4800 /* Set up coding context CODING from information about CODING_SYSTEM.
4801 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4802 CODING_SYSTEM is invalid, signal an error. */
4805 setup_coding_system (coding_system
, coding
)
4806 Lisp_Object coding_system
;
4807 struct coding_system
*coding
;
4810 Lisp_Object eol_type
;
4811 Lisp_Object coding_type
;
4814 if (NILP (coding_system
))
4815 coding_system
= Qno_conversion
;
4817 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4819 attrs
= CODING_ID_ATTRS (coding
->id
);
4820 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4823 coding
->head_ascii
= -1;
4824 coding
->common_flags
4825 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4827 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4828 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4829 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4830 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4832 coding_type
= CODING_ATTR_TYPE (attrs
);
4833 if (EQ (coding_type
, Qundecided
))
4835 coding
->detector
= NULL
;
4836 coding
->decoder
= decode_coding_raw_text
;
4837 coding
->encoder
= encode_coding_raw_text
;
4838 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4840 else if (EQ (coding_type
, Qiso_2022
))
4843 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4845 /* Invoke graphic register 0 to plane 0. */
4846 CODING_ISO_INVOCATION (coding
, 0) = 0;
4847 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4848 CODING_ISO_INVOCATION (coding
, 1)
4849 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4850 /* Setup the initial status of designation. */
4851 for (i
= 0; i
< 4; i
++)
4852 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4853 /* Not single shifting initially. */
4854 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4855 /* Beginning of buffer should also be regarded as bol. */
4856 CODING_ISO_BOL (coding
) = 1;
4857 coding
->detector
= detect_coding_iso_2022
;
4858 coding
->decoder
= decode_coding_iso_2022
;
4859 coding
->encoder
= encode_coding_iso_2022
;
4860 if (flags
& CODING_ISO_FLAG_SAFE
)
4861 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4862 coding
->common_flags
4863 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4864 | CODING_REQUIRE_FLUSHING_MASK
);
4865 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4866 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4867 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4868 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4869 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4871 setup_iso_safe_charsets (attrs
);
4872 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4873 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4874 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4876 CODING_ISO_FLAGS (coding
) = flags
;
4878 else if (EQ (coding_type
, Qcharset
))
4880 coding
->detector
= detect_coding_charset
;
4881 coding
->decoder
= decode_coding_charset
;
4882 coding
->encoder
= encode_coding_charset
;
4883 coding
->common_flags
4884 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4886 else if (EQ (coding_type
, Qutf_8
))
4888 coding
->detector
= detect_coding_utf_8
;
4889 coding
->decoder
= decode_coding_utf_8
;
4890 coding
->encoder
= encode_coding_utf_8
;
4891 coding
->common_flags
4892 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4894 else if (EQ (coding_type
, Qutf_16
))
4896 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4897 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4898 : EQ (val
, Qt
) ? utf_16_with_bom
4899 : utf_16_without_bom
);
4900 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4901 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4902 : utf_16_little_endian
);
4903 CODING_UTF_16_SURROGATE (coding
) = 0;
4904 coding
->detector
= detect_coding_utf_16
;
4905 coding
->decoder
= decode_coding_utf_16
;
4906 coding
->encoder
= encode_coding_utf_16
;
4907 coding
->common_flags
4908 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4909 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4910 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4912 else if (EQ (coding_type
, Qccl
))
4914 coding
->detector
= detect_coding_ccl
;
4915 coding
->decoder
= decode_coding_ccl
;
4916 coding
->encoder
= encode_coding_ccl
;
4917 coding
->common_flags
4918 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4919 | CODING_REQUIRE_FLUSHING_MASK
);
4921 else if (EQ (coding_type
, Qemacs_mule
))
4923 coding
->detector
= detect_coding_emacs_mule
;
4924 coding
->decoder
= decode_coding_emacs_mule
;
4925 coding
->encoder
= encode_coding_emacs_mule
;
4926 coding
->common_flags
4927 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4928 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4929 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4931 Lisp_Object tail
, safe_charsets
;
4932 int max_charset_id
= 0;
4934 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4936 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4937 max_charset_id
= XFASTINT (XCAR (tail
));
4938 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4940 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4942 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4943 coding
->max_charset_id
= max_charset_id
;
4944 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4947 else if (EQ (coding_type
, Qshift_jis
))
4949 coding
->detector
= detect_coding_sjis
;
4950 coding
->decoder
= decode_coding_sjis
;
4951 coding
->encoder
= encode_coding_sjis
;
4952 coding
->common_flags
4953 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4955 else if (EQ (coding_type
, Qbig5
))
4957 coding
->detector
= detect_coding_big5
;
4958 coding
->decoder
= decode_coding_big5
;
4959 coding
->encoder
= encode_coding_big5
;
4960 coding
->common_flags
4961 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4963 else /* EQ (coding_type, Qraw_text) */
4965 coding
->detector
= NULL
;
4966 coding
->decoder
= decode_coding_raw_text
;
4967 coding
->encoder
= encode_coding_raw_text
;
4968 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4974 /* Return raw-text or one of its subsidiaries that has the same
4975 eol_type as CODING-SYSTEM.  A coding system whose type attribute
   is already raw-text is returned unchanged.  */
4978 raw_text_coding_system (coding_system
)
4979 Lisp_Object coding_system
;
4981 Lisp_Object spec
, attrs
;
4982 Lisp_Object eol_type
, raw_text_eol_type
;
4984 spec
= CODING_SYSTEM_SPEC (coding_system
);
/* Element 0 of the spec is read as the attribute vector.  */
4985 attrs
= AREF (spec
, 0);
/* Nothing to convert when the argument is already of type raw-text.  */
4987 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4988 return coding_system
;
/* Element 2 of the spec is read as the eol-type.  */
4990 eol_type
= AREF (spec
, 2);
/* NOTE(review): the statement controlled by this VECTORP test is not
   visible in this excerpt; presumably a vector (undecided) eol-type
   yields plain raw-text -- confirm against the full file.  */
4991 if (VECTORP (eol_type
))
4993 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4994 raw_text_eol_type
= AREF (spec
, 2);
/* Select raw-text's subsidiary by index: 0 when the eol-type is Qunix,
   1 when it is Qdos, 2 for anything else.  */
4995 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4996 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4997 : AREF (raw_text_eol_type
, 2));
5001 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5002 does, return one of the subsidiaries that has the same eol-spec as
5003 PARENT. Otherwise, return CODING_SYSTEM.

   NOTE(review): in the lines visible here the eol-spec is taken from
   buffer_defaults.buffer_file_coding_system, and PARENT is not
   referenced at all.  Either this comment or the code looks out of
   date -- confirm which behavior is intended.  */
5006 coding_inherit_eol_type (coding_system
, parent
)
5007 Lisp_Object coding_system
, parent
;
5009 Lisp_Object spec
, attrs
, eol_type
;
5011 spec
= CODING_SYSTEM_SPEC (coding_system
);
/* NOTE(review): attrs is assigned here but not read in any line
   visible in this excerpt.  */
5012 attrs
= AREF (spec
, 0);
5013 eol_type
= AREF (spec
, 2);
/* A vector eol-type holds the subsidiaries that are indexed below.  */
5014 if (VECTORP (eol_type
))
5016 Lisp_Object parent_spec
;
5017 Lisp_Object parent_eol_type
;
/* The left-hand side of this assignment is missing from the excerpt;
   the value comes from the default buffer-file-coding-system.  */
5020 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5021 parent_eol_type
= AREF (parent_spec
, 2);
/* Map the reference eol-type onto a subsidiary: Qunix -> 0, Qdos -> 1,
   Qmac -> 2.  Any other value leaves coding_system untouched.  */
5022 if (EQ (parent_eol_type
, Qunix
))
5023 coding_system
= AREF (eol_type
, 0);
5024 else if (EQ (parent_eol_type
, Qdos
))
5025 coding_system
= AREF (eol_type
, 1);
5026 else if (EQ (parent_eol_type
, Qmac
))
5027 coding_system
= AREF (eol_type
, 2);
5029 return coding_system
;
5032 /* Emacs has a mechanism to automatically detect a coding system if it
5033 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5034 it's impossible to distinguish some coding systems accurately
5035 because they use the same range of codes. So, at first, coding
5036 systems are grouped into the following categories:
5038 o coding-category-emacs-mule
5040 The category for a coding system which has the same code range
5041 as Emacs' internal format. Assigned the coding-system (Lisp
5042 symbol) `emacs-mule' by default.
5044 o coding-category-sjis
5046 The category for a coding system which has the same code range
5047 as SJIS. Assigned the coding-system (Lisp
5048 symbol) `japanese-shift-jis' by default.
5050 o coding-category-iso-7
5052 The category for a coding system which has the same code range
5053 as ISO2022 of 7-bit environment. This doesn't use any locking
5054 shift and single shift functions. This can encode/decode all
5055 charsets. Assigned the coding-system (Lisp symbol)
5056 `iso-2022-7bit' by default.
5058 o coding-category-iso-7-tight
5060 Same as coding-category-iso-7 except that this can
5061 encode/decode only the specified charsets.
5063 o coding-category-iso-8-1
5065 The category for a coding system which has the same code range
5066 as ISO2022 of 8-bit environment and graphic plane 1 used only
5067 for DIMENSION1 charset. This doesn't use any locking shift
5068 and single shift functions. Assigned the coding-system (Lisp
5069 symbol) `iso-latin-1' by default.
5071 o coding-category-iso-8-2
5073 The category for a coding system which has the same code range
5074 as ISO2022 of 8-bit environment and graphic plane 1 used only
5075 for DIMENSION2 charset. This doesn't use any locking shift
5076 and single shift functions. Assigned the coding-system (Lisp
5077 symbol) `japanese-iso-8bit' by default.
5079 o coding-category-iso-7-else
5081 The category for a coding system which has the same code range
5082 as ISO2022 of 7-bit environment but uses locking shift or
5083 single shift functions. Assigned the coding-system (Lisp
5084 symbol) `iso-2022-7bit-lock' by default.
5086 o coding-category-iso-8-else
5088 The category for a coding system which has the same code range
5089 as ISO2022 of 8-bit environment but uses locking shift or
5090 single shift functions. Assigned the coding-system (Lisp
5091 symbol) `iso-2022-8bit-ss2' by default.
5093 o coding-category-big5
5095 The category for a coding system which has the same code range
5096 as BIG5. Assigned the coding-system (Lisp symbol)
5097 `cn-big5' by default.
5099 o coding-category-utf-8
5101 The category for a coding system which has the same code range
5102 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5103 symbol) `utf-8' by default.
5105 o coding-category-utf-16-be
5107 The category for a coding system in which a text has an
5108 Unicode signature (cf. Unicode Standard) in the order of BIG
5109 endian at the head. Assigned the coding-system (Lisp symbol)
5110 `utf-16-be' by default.
5112 o coding-category-utf-16-le
5114 The category for a coding system in which a text has an
5115 Unicode signature (cf. Unicode Standard) in the order of
5116 LITTLE endian at the head. Assigned the coding-system (Lisp
5117 symbol) `utf-16-le' by default.
5119 o coding-category-ccl
5121 The category for a coding system of which encoder/decoder is
5122 written in CCL programs. The default value is nil, i.e., no
5123 coding system is assigned.
5125 o coding-category-binary
5127 The category for a coding system not categorized in any of the
5128 above. Assigned the coding-system (Lisp symbol)
5129 `no-conversion' by default.
5131 Each of them is a Lisp symbol and the value is an actual
5132 `coding-system's (this is also a Lisp symbol) assigned by a user.
5133 What Emacs does actually is to detect a category of coding system.
5134 Then, it uses a `coding-system' assigned to it. If Emacs can't
5135 decide only one possible category, it selects a category of the
5136 highest priority. Priorities of categories are also specified by a
5137 user in a Lisp variable `coding-category-list'.
5141 #define EOL_SEEN_NONE 0
5142 #define EOL_SEEN_LF 1
5143 #define EOL_SEEN_CR 2
5144 #define EOL_SEEN_CRLF 4
5146 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5147 SOURCE is encoded. If CATEGORY is one of
5148 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5149 two-byte, else they are encoded by one-byte.
5151 Return one of EOL_SEEN_XXX. */
5153 #define MAX_EOL_CHECK_COUNT 3
/* Scan SRC_BYTES bytes at SOURCE for line ends and report the kind
   seen as one of the EOL_SEEN_* values.  Several declarations and
   statements of this function are missing from this excerpt.  */
5156 detect_eol (source
, src_bytes
, category
)
5157 unsigned char *source
;
5158 EMACS_INT src_bytes
;
5159 enum coding_category category
;
5161 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5164 int eol_seen
= EOL_SEEN_NONE
;
/* CATEGORY is used as a bit index here; UTF-16 categories take the
   two-bytes-per-unit path below.  */
5166 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
/* NOTE(review): CATEGORY is compared with the bitwise OR of two enum
   constants, not with each constant in turn.  Since CATEGORY is a bit
   index (see the shift above), the OR-ed value is unlikely to equal
   either category -- this looks unintended; confirm.  */
5170 msb
= category
== (coding_category_utf_16_le
5171 | coding_category_utf_16_le_nosig
);
/* Two-byte scan: needs at least two bytes remaining.  */
5174 while (src
+ 1 < src_end
)
5177 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5182 this_eol
= EOL_SEEN_LF
;
/* A CR counts as CRLF only when a complete following unit exists and
   that unit is a line feed.  */
5183 else if (src
+ 3 >= src_end
5184 || src
[msb
+ 2] != 0
5185 || src
[lsb
+ 2] != '\n')
5186 this_eol
= EOL_SEEN_CR
;
5188 this_eol
= EOL_SEEN_CRLF
;
5190 if (eol_seen
== EOL_SEEN_NONE
)
5191 /* This is the first end-of-line. */
5192 eol_seen
= this_eol
;
5193 else if (eol_seen
!= this_eol
)
5195 /* The found type is different from what found before. */
/* Inconsistent line ends are reported as EOL_SEEN_LF.  */
5196 eol_seen
= EOL_SEEN_LF
;
/* The statement run when the count reaches MAX_EOL_CHECK_COUNT is not
   visible here; presumably the scan stops -- confirm.  */
5199 if (++total
== MAX_EOL_CHECK_COUNT
)
/* One-byte scan, same classification as above.  */
5207 while (src
< src_end
)
5210 if (c
== '\n' || c
== '\r')
5215 this_eol
= EOL_SEEN_LF
;
5216 else if (src
>= src_end
|| *src
!= '\n')
5217 this_eol
= EOL_SEEN_CR
;
/* CRLF: also step over the line feed.  */
5219 this_eol
= EOL_SEEN_CRLF
, src
++;
5221 if (eol_seen
== EOL_SEEN_NONE
)
5222 /* This is the first end-of-line. */
5223 eol_seen
= this_eol
;
5224 else if (eol_seen
!= this_eol
)
5226 /* The found type is different from what found before. */
5227 eol_seen
= EOL_SEEN_LF
;
5230 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Replace CODING->id with the id of the subsidiary coding system
   that matches EOL_SEEN (a combination of EOL_SEEN_* bits).  */
5240 adjust_coding_eol_type (coding
, eol_seen
)
5241 struct coding_system
*coding
;
5244 Lisp_Object eol_type
;
/* eol_type is indexed with AREF below without a type check in this
   function; both call sites visible in this file test
   VECTORP (CODING_ID_EOL_TYPE (coding->id)) before calling.  */
5246 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* The tests are ordered LF, CRLF, CR, so when several bits are set
   the earliest match wins.  Subsidiary indices: 0, 1 and 2.  */
5247 if (eol_seen
& EOL_SEEN_LF
)
5248 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5249 else if (eol_seen
& EOL_SEEN_CRLF
)
5250 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5251 else if (eol_seen
& EOL_SEEN_CR
)
5252 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5255 /* Detect how a text specified in CODING is encoded. If a coding
5256 system is detected, update fields of CODING by the detected coding
5260 detect_coding (coding
)
5261 struct coding_system
*coding
;
5263 unsigned char *src
, *src_end
;
5264 Lisp_Object attrs
, coding_type
;
5266 coding
->consumed
= coding
->consumed_char
= 0;
5267 coding
->produced
= coding
->produced_char
= 0;
5268 coding_set_source (coding
);
5270 src_end
= coding
->source
+ coding
->src_bytes
;
5272 /* If we have not yet decided the text encoding type, detect it
5274 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5278 for (src
= coding
->source
; src
< src_end
; src
++)
5281 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5283 || c
== ISO_CODE_SO
)))
5286 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5288 if (coding
->head_ascii
< coding
->src_bytes
)
5290 struct coding_detection_info detect_info
;
5291 enum coding_category category
;
5292 struct coding_system
*this;
5294 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5295 for (i
= 0; i
< coding_category_raw_text
; i
++)
5297 category
= coding_priorities
[i
];
5298 this = coding_categories
+ category
;
5301 /* No coding system of this category is defined. */
5302 detect_info
.rejected
|= (1 << category
);
5304 else if (category
>= coding_category_raw_text
)
5306 else if (detect_info
.checked
& (1 << category
))
5308 if (detect_info
.found
& (1 << category
))
5311 else if ((*(this->detector
)) (coding
, &detect_info
)
5312 && detect_info
.found
& (1 << category
))
5315 if (i
< coding_category_raw_text
)
5316 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5317 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5318 setup_coding_system (Qraw_text
, coding
);
5319 else if (detect_info
.rejected
)
5320 for (i
= 0; i
< coding_category_raw_text
; i
++)
5321 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5323 this = coding_categories
+ coding_priorities
[i
];
5324 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5329 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qutf_16
))
5331 Lisp_Object coding_systems
;
5332 struct coding_detection_info detect_info
;
5335 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5336 detect_info
.found
= detect_info
.rejected
= 0;
5337 if (CONSP (coding_systems
)
5338 && detect_coding_utf_16 (coding
, &detect_info
)
5339 && (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
5340 | CATEGORY_MASK_UTF_16_BE
)))
5342 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5343 setup_coding_system (XCAR (coding_systems
), coding
);
5345 setup_coding_system (XCDR (coding_systems
), coding
);
5349 attrs
= CODING_ID_ATTRS (coding
->id
);
5350 coding_type
= CODING_ATTR_TYPE (attrs
);
5352 /* If we have not yet decided the EOL type, detect it now. But, the
5353 detection is impossible for a CCL based coding system, in which
5354 case, we detect the EOL type after decoding. */
5355 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5356 && ! EQ (coding_type
, Qccl
))
5358 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5359 XINT (CODING_ATTR_CATEGORY (attrs
)));
5361 if (eol_seen
!= EOL_SEEN_NONE
)
5362 adjust_coding_eol_type (coding
, eol_seen
);
5369 struct coding_system
*coding
;
5371 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5373 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5374 unsigned char *pend
= p
+ coding
->produced
;
5375 int eol_seen
= EOL_SEEN_NONE
;
5377 for (; p
< pend
; p
++)
5380 eol_seen
|= EOL_SEEN_LF
;
5381 else if (*p
== '\r')
5383 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5385 eol_seen
|= EOL_SEEN_CRLF
;
5389 eol_seen
|= EOL_SEEN_CR
;
5392 if (eol_seen
!= EOL_SEEN_NONE
)
5393 adjust_coding_eol_type (coding
, eol_seen
);
5396 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5398 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5399 unsigned char *pend
= p
+ coding
->produced
;
5401 for (; p
< pend
; p
++)
5405 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5407 unsigned char *p
, *pbeg
, *pend
;
5408 Lisp_Object undo_list
;
5410 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5411 coding
->dst_pos_byte
+ coding
->produced
);
5412 undo_list
= current_buffer
->undo_list
;
5413 current_buffer
->undo_list
= Qt
;
5414 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5415 current_buffer
->undo_list
= undo_list
;
5417 pend
= pbeg
+ coding
->produced
;
5419 for (p
= pend
- 1; p
>= pbeg
; p
--)
5422 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5425 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5426 coding
->produced
= pend
- pbeg
;
5427 insert_from_gap (coding
->produced_char
, coding
->produced
);
/* Pass the contents of CODING->charbuf through TABLE, storing each
   translated value back in place.  The declaration of TABLE and the
   line that loads C are missing from this excerpt.  */
5432 translate_chars (coding
, table
)
5433 struct coding_system
*coding
;
/* Work on the first CODING->charbuf_used elements only.  */
5436 int *charbuf
= coding
->charbuf
;
5437 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
/* NOTE(review): the statement controlled by this test is not visible;
   presumably nothing is translated when the characters are still at
   the source -- confirm.  */
5440 if (coding
->chars_at_source
)
5443 while (charbuf
< charbuf_end
)
/* Overwrite the current slot with its translation and advance.  */
5449 *charbuf
++ = translate_char (table
, c
);
/* Store converted text at CODING->destination, update the produced
   counters of CODING, and return the number of characters produced.
   Many structural lines (braces, else keywords, some declarations)
   are missing from this excerpt.  */
5454 produce_chars (coding
)
5455 struct coding_system
*coding
;
/* Writing resumes after what earlier calls already produced.  */
5457 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5458 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5460 int produced_chars
= 0;
5462 if (! coding
->chars_at_source
)
5464 /* Characters are in coding->charbuf. */
5465 int *buf
= coding
->charbuf
;
5466 int *buf_end
= buf
+ coding
->charbuf_used
;
5467 unsigned char *adjusted_dst_end
;
/* Source and destination are the same buffer: the writable area ends
   where the already consumed source bytes end.  */
5469 if (BUFFERP (coding
->src_object
)
5470 && EQ (coding
->src_object
, coding
->dst_object
))
5471 dst_end
= coding
->source
+ coding
->consumed
;
/* Keep MAX_MULTIBYTE_LENGTH bytes of slack before the real end.  */
5472 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5474 while (buf
< buf_end
)
/* Out of slack: enlarge the destination, then recompute both end
   pointers from the updated CODING fields.  */
5478 if (dst
>= adjusted_dst_end
)
5480 dst
= alloc_destination (coding
,
5481 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5483 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5484 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
/* Multibyte form is written when the destination is multibyte or C is
   not a byte8 character; the remaining store writes a single raw byte.  */
5488 if (coding
->dst_multibyte
5489 || ! CHAR_BYTE8_P (c
))
5490 CHAR_STRING_ADVANCE (c
, dst
);
5492 *dst
++ = CHAR_TO_BYTE8 (c
);
5496 /* This is an annotation datum. */
/* From here on the input is the byte sequence at CODING->source.  */
5502 unsigned char *src
= coding
->source
;
5503 unsigned char *src_end
= src
+ coding
->src_bytes
;
5504 Lisp_Object eol_type
;
5506 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* Source and destination differ in multibyteness.  */
5508 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5510 if (coding
->src_multibyte
)
5517 unsigned char *src_base
= src
;
5523 if (EQ (eol_type
, Qdos
))
/* Report that more source is needed and leave via the label.  */
5527 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5528 goto no_more_source
;
5533 else if (EQ (eol_type
, Qmac
))
/* Record progress before the destination may be reallocated.  */
5538 coding
->consumed
= src
- coding
->source
;
5540 if (EQ (coding
->src_object
, coding
->dst_object
))
5544 dst
= alloc_destination (coding
, src_end
- src
+ 1,
/* Re-derive every cached pointer from CODING after the allocation.  */
5546 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5547 coding_set_source (coding
);
5548 src
= coding
->source
+ coding
->consumed
;
5549 src_end
= coding
->source
+ coding
->src_bytes
;
5559 while (src
< src_end
)
5566 if (EQ (eol_type
, Qdos
))
5572 else if (EQ (eol_type
, Qmac
))
/* Fewer than two free bytes remain in the destination.  */
5575 if (dst
>= dst_end
- 1)
5577 coding
->consumed
= src
- coding
->source
;
5579 if (EQ (coding
->src_object
, coding
->dst_object
))
5581 if (dst
>= dst_end
- 1)
5583 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5585 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5586 coding_set_source (coding
);
5587 src
= coding
->source
+ coding
->consumed
;
5588 src_end
= coding
->source
+ coding
->src_bytes
;
/* Distinct source and destination objects.  */
5596 if (!EQ (coding
->src_object
, coding
->dst_object
))
/* Number of bytes by which the source exceeds the destination size.  */
5598 int require
= coding
->src_bytes
- coding
->dst_bytes
;
/* Keep the read position as an offset so it can be rebuilt after the
   allocation below.  */
5602 EMACS_INT offset
= src
- coding
->source
;
5604 dst
= alloc_destination (coding
, require
, dst
);
5605 coding_set_source (coding
);
5606 src
= coding
->source
+ offset
;
5607 src_end
= coding
->source
+ coding
->src_bytes
;
/* Start from the source character count.  */
5610 produced_chars
= coding
->src_chars
;
5611 while (src
< src_end
)
5617 if (EQ (eol_type
, Qdos
))
5624 else if (EQ (eol_type
, Qmac
))
/* The whole source is marked as consumed.  */
5630 coding
->consumed
= coding
->src_bytes
;
5631 coding
->consumed_char
= coding
->src_chars
;
/* Bytes written by this call only.  */
5634 produced
= dst
- (coding
->destination
+ coding
->produced
);
5635 if (BUFFERP (coding
->dst_object
))
5636 insert_from_gap (produced_chars
, produced
);
/* Accumulate onto the running totals kept in CODING.  */
5637 coding
->produced
+= produced
;
5638 coding
->produced_char
+= produced_chars
;
5639 return produced_chars
;
5642 /* Compose text in CODING->object according to the annotation data at
5643 CHARBUF. CHARBUF is an array:
5644 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5648 produce_composition (coding
, charbuf
)
5649 struct coding_system
*coding
;
5654 enum composition_method method
;
5655 Lisp_Object components
;
5658 from
= coding
->dst_pos
+ charbuf
[2];
5659 to
= coding
->dst_pos
+ charbuf
[3];
5660 method
= (enum composition_method
) (charbuf
[4]);
5662 if (method
== COMPOSITION_RELATIVE
)
5666 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5671 for (i
= 0; i
< len
; i
++)
5672 args
[i
] = make_number (charbuf
[i
]);
5673 components
= (method
== COMPOSITION_WITH_ALTCHARS
5674 ? Fstring (len
, args
) : Fvector (len
, args
));
5676 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5680 /* Put `charset' property on text in CODING->object according to
5681 the annotation data at CHARBUF. CHARBUF is an array:
5682 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5686 produce_charset (coding
, charbuf
)
5687 struct coding_system
*coding
;
5690 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5691 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5692 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5694 Fput_text_property (make_number (from
), make_number (to
),
5695 Qcharset
, CHARSET_NAME (charset
),
5696 coding
->dst_object
);
5700 #define CHARBUF_SIZE 0x4000
5702 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5704 int size = CHARBUF_SIZE;; \
5706 coding->charbuf = NULL; \
5707 while (size > 1024) \
5709 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5710 if (coding->charbuf) \
5714 if (! coding->charbuf) \
5716 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5717 return coding->result; \
5719 coding->charbuf_size = size; \
5724 produce_annotation (coding
)
5725 struct coding_system
*coding
;
5727 int *charbuf
= coding
->charbuf
;
5728 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5730 if (NILP (coding
->dst_object
))
5733 while (charbuf
< charbuf_end
)
5739 int len
= -*charbuf
;
5742 case CODING_ANNOTATE_COMPOSITION_MASK
:
5743 produce_composition (coding
, charbuf
);
5745 case CODING_ANNOTATE_CHARSET_MASK
:
5746 produce_charset (coding
, charbuf
);
5756 /* Decode the data at CODING->src_object into CODING->dst_object.
5757 CODING->src_object is a buffer, a string, or nil.
5758 CODING->dst_object is a buffer.
5760 If CODING->src_object is a buffer, it must be the current buffer.
5761 In this case, if CODING->src_pos is positive, it is a position of
5762 the source text in the buffer, otherwise, the source text is in the
5763 gap area of the buffer, and CODING->src_pos specifies the offset of
5764 the text from GPT (which must be the same as PT). If this is the
5765 same buffer as CODING->dst_object, CODING->src_pos must be
5768 If CODING->src_object is a string, CODING->src_pos is an index to
5771 If CODING->src_object is nil, CODING->source must already point to
5772 the non-relocatable memory area. In this case, CODING->src_pos is
5773 an offset from CODING->source.
5775 The decoded data is inserted at the current point of the buffer
5780 decode_coding (coding
)
5781 struct coding_system
*coding
;
5785 if (BUFFERP (coding
->src_object
)
5786 && coding
->src_pos
> 0
5787 && coding
->src_pos
< GPT
5788 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5789 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5791 if (BUFFERP (coding
->dst_object
))
5793 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5794 set_buffer_internal (XBUFFER (coding
->dst_object
));
5796 move_gap_both (PT
, PT_BYTE
);
5799 coding
->consumed
= coding
->consumed_char
= 0;
5800 coding
->produced
= coding
->produced_char
= 0;
5801 coding
->chars_at_source
= 0;
5802 coding
->result
= CODING_RESULT_SUCCESS
;
5805 ALLOC_CONVERSION_WORK_AREA (coding
);
5807 attrs
= CODING_ID_ATTRS (coding
->id
);
5811 coding_set_source (coding
);
5812 coding
->annotated
= 0;
5813 (*(coding
->decoder
)) (coding
);
5814 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5815 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5816 else if (!NILP (Vstandard_translation_table_for_decode
))
5817 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5818 coding_set_destination (coding
);
5819 produce_chars (coding
);
5820 if (coding
->annotated
)
5821 produce_annotation (coding
);
5823 while (coding
->consumed
< coding
->src_bytes
5824 && ! coding
->result
);
5826 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5827 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5828 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5829 decode_eol (coding
);
5831 coding
->carryover_bytes
= 0;
5832 if (coding
->consumed
< coding
->src_bytes
)
5834 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5837 coding_set_source (coding
);
5838 coding_set_destination (coding
);
5839 src
= coding
->source
+ coding
->consumed
;
5841 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5843 /* Flush out unprocessed data as binary chars. We are sure
5844 that the number of data is less than the size of
5846 while (nbytes
-- > 0)
5850 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5852 produce_chars (coding
);
5856 /* Record unprocessed bytes in coding->carryover. We are
5857 sure that the number of data is less than the size of
5858 coding->carryover. */
5859 unsigned char *p
= coding
->carryover
;
5861 coding
->carryover_bytes
= nbytes
;
5862 while (nbytes
-- > 0)
5865 coding
->consumed
= coding
->src_bytes
;
5868 return coding
->result
;
5872 /* Extract an annotation datum from a composition starting at POS and
5873 ending before LIMIT of CODING->src_object (buffer or string), store
5874 the data in BUF, set *STOP to a starting position of the next
5875 composition (if any) or to LIMIT, and return the address of the
5876 next element of BUF.
5878 If such an annotation is not found, set *STOP to a starting
5879 position of a composition after POS (if any) or to LIMIT, and
5883 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5884 EMACS_INT pos
, limit
;
5885 struct coding_system
*coding
;
5889 EMACS_INT start
, end
;
5892 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5895 else if (start
> pos
)
5901 /* We found a composition. Store the corresponding
5902 annotation data in BUF. */
5904 enum composition_method method
= COMPOSITION_METHOD (prop
);
5905 int nchars
= COMPOSITION_LENGTH (prop
);
5907 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5908 if (method
!= COMPOSITION_RELATIVE
)
5910 Lisp_Object components
;
5913 components
= COMPOSITION_COMPONENTS (prop
);
5914 if (VECTORP (components
))
5916 len
= XVECTOR (components
)->size
;
5917 for (i
= 0; i
< len
; i
++)
5918 *buf
++ = XINT (AREF (components
, i
));
5920 else if (STRINGP (components
))
5922 len
= XSTRING (components
)->size
;
5926 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5930 else if (INTEGERP (components
))
5933 *buf
++ = XINT (components
);
5935 else if (CONSP (components
))
5937 for (len
= 0; CONSP (components
);
5938 len
++, components
= XCDR (components
))
5939 *buf
++ = XINT (XCAR (components
));
5947 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5958 /* Extract an annotation datum from a text property `charset' at POS of
5959 CODING->src_object (buffer or string), store the data in BUF, set
5960 *STOP to the position where the value of `charset' property changes
5961 (limiting by LIMIT), and return the address of the next element of
5964 If the property value is nil, set *STOP to the position where the
5965 property value is non-nil (limiting by LIMIT), and return BUF. */
/* Handle the `charset' text property found at POS of
   CODING->src_object.  The declarations of BUF, STOP and ID, and
   whatever separates the two halves below, are missing from this
   excerpt.  */
5968 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5969 EMACS_INT pos
, limit
;
5970 struct coding_system
*coding
;
5974 Lisp_Object val
, next
;
/* Read the `charset' property at POS.  */
5977 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
/* Only a non-nil value that satisfies CHARSETP supplies an id.  */
5978 if (! NILP (val
) && CHARSETP (val
))
5979 id
= XINT (CHARSET_SYMBOL_ID (val
));
/* NOTE(review): the lines between the test above and this call are
   not visible, so it cannot be told from here what ID holds when the
   test fails -- confirm.  */
5982 ADD_CHARSET_DATA (buf
, 0, 0, id
);
/* *STOP becomes the next position, bounded by LIMIT, at which the
   `charset' property changes.  */
5983 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
5985 make_number (limit
));
5986 *stop
= XINT (next
);
/* Fetch characters from CODING->source into CODING->charbuf, calling
   the annotation handlers at their stop positions, and record the
   progress in CODING.  Several structural lines of this function are
   missing from this excerpt.  */
5992 consume_chars (coding
)
5993 struct coding_system
*coding
;
5995 int *buf
= coding
->charbuf
;
5996 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
/* Reading resumes after what earlier calls already consumed.  */
5997 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5998 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5999 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6000 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6001 int multibytep
= coding
->src_multibyte
;
6002 Lisp_Object eol_type
;
6004 EMACS_INT stop
, stop_composition
, stop_charset
;
6006 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* The statement controlled by this test is not visible here.  */
6007 if (VECTORP (eol_type
))
6010 /* Note: composition handling is not yet implemented. */
/* NOTE(review): this clears the composition bit immediately before
   the test below; if the clearing is unconditional, the first
   assignment under that test can never run -- confirm.  */
6011 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6013 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6014 stop
= stop_composition
= pos
;
6016 stop
= stop_composition
= end_pos
;
6017 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6018 stop
= stop_charset
= pos
;
6020 stop_charset
= end_pos
;
6022 /* Compensate for CRLF and annotation. */
6023 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6024 while (buf
< buf_end
)
/* Each handler may append data to BUF and updates its own stop
   position.  */
6030 if (pos
== stop_composition
)
6031 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6032 buf
, &stop_composition
);
6033 if (pos
== stop_charset
)
6034 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6035 buf
, &stop_charset
);
/* The next stop is the nearer of the two.  */
6036 stop
= (stop_composition
< stop_charset
6037 ? stop_composition
: stop_charset
);
6042 EMACS_INT bytes
= MULTIBYTE_LENGTH (src
, src_end
);
6045 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6050 c
= STRING_CHAR_ADVANCE (src
), pos
++;
/* The handling of a carriage return under selective display is not
   visible in this excerpt.  */
6051 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6053 if (! EQ (eol_type
, Qunix
))
6057 if (EQ (eol_type
, Qdos
))
/* Record how far this call got, in bytes, characters and charbuf
   elements.  */
6066 coding
->consumed
= src
- coding
->source
;
6067 coding
->consumed_char
= pos
- coding
->src_pos
;
6068 coding
->charbuf_used
= buf
- coding
->charbuf
;
6069 coding
->chars_at_source
= 0;
6073 /* Encode the text at CODING->src_object into CODING->dst_object.
6074 CODING->src_object is a buffer or a string.
6075 CODING->dst_object is a buffer or nil.
6077 If CODING->src_object is a buffer, it must be the current buffer.
6078 In this case, if CODING->src_pos is positive, it is a position of
6079 the source text in the buffer, otherwise, the source text is in the
6080 gap area of the buffer, and coding->src_pos specifies the offset of
6081 the text from GPT (which must be the same as PT). If this is the
6082 same buffer as CODING->dst_object, CODING->src_pos must be
6083 negative and CODING should not have `pre-write-conversion'.
6085 If CODING->src_object is a string, CODING should not have
6086 `pre-write-conversion'.
6088 If CODING->dst_object is a buffer, the encoded data is inserted at
6089 the current point of that buffer.
6091 If CODING->dst_object is nil, the encoded data is placed at the
6092 memory area specified by CODING->destination. */
/* Run the encoder of CODING over its source text and return
   CODING->result.  The return type, the declaration of ATTRS and the
   opening of the loop are missing from this excerpt.  */
6095 encode_coding (coding
)
6096 struct coding_system
*coding
;
6100 attrs
= CODING_ID_ATTRS (coding
->id
);
/* For a buffer destination, switch to it and take dst_multibyte from
   that buffer's enable_multibyte_characters.  */
6102 if (BUFFERP (coding
->dst_object
))
6104 set_buffer_internal (XBUFFER (coding
->dst_object
));
6105 coding
->dst_multibyte
6106 = ! NILP (current_buffer
->enable_multibyte_characters
);
/* Reset the progress counters and the result.  */
6109 coding
->consumed
= coding
->consumed_char
= 0;
6110 coding
->produced
= coding
->produced_char
= 0;
6111 coding
->result
= CODING_RESULT_SUCCESS
;
6114 ALLOC_CONVERSION_WORK_AREA (coding
);
/* One pass: fill charbuf, translate it, then encode it.  */
6117 coding_set_source (coding
);
6118 consume_chars (coding
);
/* The table from the coding system's attributes takes precedence over
   the standard encode table.  */
6120 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6121 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6122 else if (!NILP (Vstandard_translation_table_for_encode
))
6123 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6125 coding_set_destination (coding
);
6126 (*(coding
->encoder
)) (coding
);
/* NOTE(review): this condition does not look at CODING->result,
   whereas the corresponding loop in decode_coding also stops on a
   non-zero result -- confirm whether an encoder error should end the
   loop.  */
6127 } while (coding
->consumed_char
< coding
->src_chars
);
6129 if (BUFFERP (coding
->dst_object
))
6130 insert_from_gap (coding
->produced_char
, coding
->produced
);
6132 return (coding
->result
);
6136 /* Stack of working buffers used in code conversion. A nil element
6137 means that the code conversion of that level is not using a working
6139 Lisp_Object Vcode_conversion_work_buf_list
;
6141 /* A working buffer used by the top level conversion. */
6142 Lisp_Object Vcode_conversion_reused_work_buf
;
6145 /* Return a working buffer that can be freely used by the following
6146 code conversion. MULTIBYTEP specifies the multibyteness of the
6150 make_conversion_work_buffer (multibytep
, depth
)
6151 int multibytep
, depth
;
6153 struct buffer
*current
= current_buffer
;
6154 Lisp_Object buf
, name
;
6158 if (NILP (Vcode_conversion_reused_work_buf
))
6159 Vcode_conversion_reused_work_buf
6160 = Fget_buffer_create (build_string (" *code-conversion-work<0>*"));
6161 buf
= Vcode_conversion_reused_work_buf
;
6167 name
= build_string (" *code-conversion-work*");
6168 name
= Fgenerate_new_buffer_name (name
, Qnil
);
6174 sprintf (str
, " *code-conversion-work*<%d>", depth
);
6175 name
= build_string (str
);
6177 buf
= Fget_buffer_create (name
);
6179 set_buffer_internal (XBUFFER (buf
));
6180 current_buffer
->undo_list
= Qt
;
6182 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
, Qnil
);
6183 set_buffer_internal (current
);
6188 code_conversion_restore (buffer
)
6191 Lisp_Object workbuf
;
6193 workbuf
= XCAR (Vcode_conversion_work_buf_list
);
6194 if (! NILP (workbuf
)
6195 && ! EQ (workbuf
, Vcode_conversion_reused_work_buf
)
6196 && ! NILP (Fbuffer_live_p (workbuf
)))
6197 Fkill_buffer (workbuf
);
6198 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
6199 set_buffer_internal (XBUFFER (buffer
));
6204 code_conversion_save (buffer
, with_work_buf
, multibyte
)
6206 int with_work_buf
, multibyte
;
6208 Lisp_Object workbuf
;
6212 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
6214 workbuf
= make_conversion_work_buffer (multibyte
, depth
);
6218 Vcode_conversion_work_buf_list
6219 = Fcons (workbuf
, Vcode_conversion_work_buf_list
);
6220 record_unwind_protect (code_conversion_restore
, buffer
);
6225 decode_coding_gap (coding
, chars
, bytes
)
6226 struct coding_system
*coding
;
6227 EMACS_INT chars
, bytes
;
6229 int count
= specpdl_ptr
- specpdl
;
6232 buffer
= Fcurrent_buffer ();
6233 code_conversion_save (buffer
, 0, 0);
6235 coding
->src_object
= buffer
;
6236 coding
->src_chars
= chars
;
6237 coding
->src_bytes
= bytes
;
6238 coding
->src_pos
= -chars
;
6239 coding
->src_pos_byte
= -bytes
;
6240 coding
->src_multibyte
= chars
< bytes
;
6241 coding
->dst_object
= coding
->src_object
;
6242 coding
->dst_pos
= PT
;
6243 coding
->dst_pos_byte
= PT_BYTE
;
6244 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6245 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6247 if (CODING_REQUIRE_DETECTION (coding
))
6248 detect_coding (coding
);
6250 decode_coding (coding
);
6252 unbind_to (count
, Qnil
);
6253 return coding
->result
;
6257 encode_coding_gap (coding
, chars
, bytes
)
6258 struct coding_system
*coding
;
6259 EMACS_INT chars
, bytes
;
6261 int count
= specpdl_ptr
- specpdl
;
6264 buffer
= Fcurrent_buffer ();
6265 code_conversion_save (buffer
, 0, 0);
6267 coding
->src_object
= buffer
;
6268 coding
->src_chars
= chars
;
6269 coding
->src_bytes
= bytes
;
6270 coding
->src_pos
= -chars
;
6271 coding
->src_pos_byte
= -bytes
;
6272 coding
->src_multibyte
= chars
< bytes
;
6273 coding
->dst_object
= coding
->src_object
;
6274 coding
->dst_pos
= PT
;
6275 coding
->dst_pos_byte
= PT_BYTE
;
6277 encode_coding (coding
);
6279 unbind_to (count
, Qnil
);
6280 return coding
->result
;
6284 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6285 SRC_OBJECT into DST_OBJECT by coding context CODING.
6287 SRC_OBJECT is a buffer, a string, or Qnil.
6289 If it is a buffer, the text is at point of the buffer. FROM and TO
6290 are positions in the buffer.
6292 If it is a string, the text is at the beginning of the string.
6293 FROM and TO are indices to the string.
6295 If it is nil, the text is at coding->source. FROM and TO are
6296 indices to coding->source.
6298 DST_OBJECT is a buffer, Qt, or Qnil.
6300 If it is a buffer, the decoded text is inserted at point of the
6301 buffer. If the buffer is the same as SRC_OBJECT, the source text
6304 If it is Qt, a string is made from the decoded text, and
6305 set in CODING->dst_object.
6307 If it is Qnil, the decoded text is stored at CODING->destination.
6308 The caller must allocate CODING->dst_bytes bytes at
6309 CODING->destination by xmalloc. If the decoded text is longer than
6310 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6314 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6316 struct coding_system
*coding
;
6317 Lisp_Object src_object
;
6318 EMACS_INT from
, from_byte
, to
, to_byte
;
6319 Lisp_Object dst_object
;
6321 int count
= specpdl_ptr
- specpdl
;
6322 unsigned char *destination
;
6323 EMACS_INT dst_bytes
;
6324 EMACS_INT chars
= to
- from
;
6325 EMACS_INT bytes
= to_byte
- from_byte
;
6328 int saved_pt
= -1, saved_pt_byte
;
6330 buffer
= Fcurrent_buffer ();
6332 if (NILP (dst_object
))
6334 destination
= coding
->destination
;
6335 dst_bytes
= coding
->dst_bytes
;
6338 coding
->src_object
= src_object
;
6339 coding
->src_chars
= chars
;
6340 coding
->src_bytes
= bytes
;
6341 coding
->src_multibyte
= chars
< bytes
;
6343 if (STRINGP (src_object
))
6345 coding
->src_pos
= from
;
6346 coding
->src_pos_byte
= from_byte
;
6348 else if (BUFFERP (src_object
))
6350 set_buffer_internal (XBUFFER (src_object
));
6352 move_gap_both (from
, from_byte
);
6353 if (EQ (src_object
, dst_object
))
6355 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6356 TEMP_SET_PT_BOTH (from
, from_byte
);
6357 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6358 coding
->src_pos
= -chars
;
6359 coding
->src_pos_byte
= -bytes
;
6363 coding
->src_pos
= from
;
6364 coding
->src_pos_byte
= from_byte
;
6368 if (CODING_REQUIRE_DETECTION (coding
))
6369 detect_coding (coding
);
6370 attrs
= CODING_ID_ATTRS (coding
->id
);
6372 if (EQ (dst_object
, Qt
)
6373 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6374 && NILP (dst_object
)))
6376 coding
->dst_object
= code_conversion_save (buffer
, 1, 1);
6377 coding
->dst_pos
= BEG
;
6378 coding
->dst_pos_byte
= BEG_BYTE
;
6379 coding
->dst_multibyte
= 1;
6381 else if (BUFFERP (dst_object
))
6383 code_conversion_save (buffer
, 0, 0);
6384 coding
->dst_object
= dst_object
;
6385 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6386 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6387 coding
->dst_multibyte
6388 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6392 code_conversion_save (buffer
, 0, 0);
6393 coding
->dst_object
= Qnil
;
6394 coding
->dst_multibyte
= 1;
6397 decode_coding (coding
);
6399 if (BUFFERP (coding
->dst_object
))
6400 set_buffer_internal (XBUFFER (coding
->dst_object
));
6402 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6404 struct gcpro gcpro1
, gcpro2
;
6405 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6408 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6409 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6410 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6411 make_number (coding
->produced_char
));
6414 coding
->produced_char
+= Z
- prev_Z
;
6415 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6418 if (EQ (dst_object
, Qt
))
6420 coding
->dst_object
= Fbuffer_string ();
6422 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6424 set_buffer_internal (XBUFFER (coding
->dst_object
));
6425 if (dst_bytes
< coding
->produced
)
6428 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6431 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6432 unbind_to (count
, Qnil
);
6435 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6436 move_gap_both (BEGV
, BEGV_BYTE
);
6437 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6438 coding
->destination
= destination
;
6444 /* This is the case of:
6445 (BUFFERP (src_object) && EQ (src_object, dst_object))
6446 As we have moved PT while replacing the original buffer
6447 contents, we must recover it now. */
6448 set_buffer_internal (XBUFFER (src_object
));
6449 if (saved_pt
< from
)
6450 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6451 else if (saved_pt
< from
+ chars
)
6452 TEMP_SET_PT_BOTH (from
, from_byte
);
6453 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6454 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6455 saved_pt_byte
+ (coding
->produced
- bytes
));
6457 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6458 saved_pt_byte
+ (coding
->produced
- bytes
));
6461 unbind_to (count
, Qnil
);
6466 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6468 struct coding_system
*coding
;
6469 Lisp_Object src_object
;
6470 EMACS_INT from
, from_byte
, to
, to_byte
;
6471 Lisp_Object dst_object
;
6473 int count
= specpdl_ptr
- specpdl
;
6474 EMACS_INT chars
= to
- from
;
6475 EMACS_INT bytes
= to_byte
- from_byte
;
6478 int saved_pt
= -1, saved_pt_byte
;
6480 buffer
= Fcurrent_buffer ();
6482 coding
->src_object
= src_object
;
6483 coding
->src_chars
= chars
;
6484 coding
->src_bytes
= bytes
;
6485 coding
->src_multibyte
= chars
< bytes
;
6487 attrs
= CODING_ID_ATTRS (coding
->id
);
6489 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6491 coding
->src_object
= code_conversion_save (buffer
, 1,
6492 coding
->src_multibyte
);
6493 set_buffer_internal (XBUFFER (coding
->src_object
));
6494 if (STRINGP (src_object
))
6495 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6496 else if (BUFFERP (src_object
))
6497 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6499 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6501 if (EQ (src_object
, dst_object
))
6503 set_buffer_internal (XBUFFER (src_object
));
6504 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6505 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6506 set_buffer_internal (XBUFFER (coding
->src_object
));
6509 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6510 make_number (BEG
), make_number (Z
));
6511 coding
->src_object
= Fcurrent_buffer ();
6513 move_gap_both (BEG
, BEG_BYTE
);
6514 coding
->src_chars
= Z
- BEG
;
6515 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6516 coding
->src_pos
= BEG
;
6517 coding
->src_pos_byte
= BEG_BYTE
;
6518 coding
->src_multibyte
= Z
< Z_BYTE
;
6520 else if (STRINGP (src_object
))
6522 code_conversion_save (buffer
, 0, 0);
6523 coding
->src_pos
= from
;
6524 coding
->src_pos_byte
= from_byte
;
6526 else if (BUFFERP (src_object
))
6528 code_conversion_save (buffer
, 0, 0);
6529 set_buffer_internal (XBUFFER (src_object
));
6530 if (EQ (src_object
, dst_object
))
6532 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6533 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6534 coding
->src_pos
= 0;
6535 coding
->src_pos_byte
= 0;
6539 if (from
< GPT
&& to
>= GPT
)
6540 move_gap_both (from
, from_byte
);
6541 coding
->src_pos
= from
;
6542 coding
->src_pos_byte
= from_byte
;
6546 code_conversion_save (buffer
, 0, 0);
6548 if (BUFFERP (dst_object
))
6550 coding
->dst_object
= dst_object
;
6551 if (EQ (src_object
, dst_object
))
6553 coding
->dst_pos
= from
;
6554 coding
->dst_pos_byte
= from_byte
;
6558 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6559 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6561 coding
->dst_multibyte
6562 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6564 else if (EQ (dst_object
, Qt
))
6566 coding
->dst_object
= Qnil
;
6567 coding
->dst_bytes
= coding
->src_chars
;
6568 if (coding
->dst_bytes
== 0)
6569 coding
->dst_bytes
= 1;
6570 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6571 coding
->dst_multibyte
= 0;
6575 coding
->dst_object
= Qnil
;
6576 coding
->dst_multibyte
= 0;
6579 encode_coding (coding
);
6581 if (EQ (dst_object
, Qt
))
6583 if (BUFFERP (coding
->dst_object
))
6584 coding
->dst_object
= Fbuffer_string ();
6588 = make_unibyte_string ((char *) coding
->destination
,
6590 xfree (coding
->destination
);
6596 /* This is the case of:
6597 (BUFFERP (src_object) && EQ (src_object, dst_object))
6598 As we have moved PT while replacing the original buffer
6599 contents, we must recover it now. */
6600 set_buffer_internal (XBUFFER (src_object
));
6601 if (saved_pt
< from
)
6602 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6603 else if (saved_pt
< from
+ chars
)
6604 TEMP_SET_PT_BOTH (from
, from_byte
);
6605 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6606 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6607 saved_pt_byte
+ (coding
->produced
- bytes
));
6609 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6610 saved_pt_byte
+ (coding
->produced
- bytes
));
6613 unbind_to (count
, Qnil
);
6618 preferred_coding_system ()
6620 int id
= coding_categories
[coding_priorities
[0]].id
;
6622 return CODING_ID_NAME (id
);
/*** 11. Emacs Lisp library functions ***/
6629 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6630 doc
: /* Return t if OBJECT is nil or a coding-system.
6631 See the documentation of `define-coding-system' for information
6632 about coding-system objects. */)
6636 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6639 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6640 Sread_non_nil_coding_system
, 1, 1, 0,
6641 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6648 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6649 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6651 while (XSTRING (val
)->size
== 0);
6652 return (Fintern (val
, Qnil
));
6655 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6656 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6657 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6658 (prompt
, default_coding_system
)
6659 Lisp_Object prompt
, default_coding_system
;
6662 if (SYMBOLP (default_coding_system
))
6663 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6664 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6665 Qt
, Qnil
, Qcoding_system_history
,
6666 default_coding_system
, Qnil
);
6667 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6670 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6672 doc
: /* Check validity of CODING-SYSTEM.
6673 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6675 Lisp_Object coding_system
;
6677 CHECK_SYMBOL (coding_system
);
6678 if (!NILP (Fcoding_system_p (coding_system
)))
6679 return coding_system
;
6681 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6685 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6686 HIGHEST is nonzero, return the coding system of the highest
6687 priority among the detected coding systems. Otherwize return a
6688 list of detected coding systems sorted by their priorities. If
6689 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6690 multibyte form but contains only ASCII and eight-bit chars.
6691 Otherwise, the bytes are raw bytes.
6693 CODING-SYSTEM controls the detection as below:
6695 If it is nil, detect both text-format and eol-format. If the
6696 text-format part of CODING-SYSTEM is already specified
6697 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6698 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6699 detect only text-format. */
6702 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6704 int src_bytes
, highest
;
6706 Lisp_Object coding_system
;
6708 unsigned char *src_end
= src
+ src_bytes
;
6709 Lisp_Object attrs
, eol_type
;
6711 struct coding_system coding
;
6713 struct coding_detection_info detect_info
;
6715 if (NILP (coding_system
))
6716 coding_system
= Qundecided
;
6717 setup_coding_system (coding_system
, &coding
);
6718 attrs
= CODING_ID_ATTRS (coding
.id
);
6719 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6720 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6722 coding
.source
= src
;
6723 coding
.src_bytes
= src_bytes
;
6724 coding
.src_multibyte
= multibytep
;
6725 coding
.consumed
= 0;
6726 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6728 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6730 /* At first, detect text-format if necessary. */
6731 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6733 enum coding_category category
;
6734 struct coding_system
*this;
6737 for (; src
< src_end
; src
++)
6741 || (c
< 0x20 && (c
== ISO_CODE_ESC
6743 || c
== ISO_CODE_SO
)))
6746 coding
.head_ascii
= src
- coding
.source
;
6749 for (i
= 0; i
< coding_category_raw_text
; i
++)
6751 category
= coding_priorities
[i
];
6752 this = coding_categories
+ category
;
6756 /* No coding system of this category is defined. */
6757 detect_info
.rejected
|= (1 << category
);
6759 else if (category
>= coding_category_raw_text
)
6761 else if (detect_info
.checked
& (1 << category
))
6764 && (detect_info
.found
& (1 << category
)))
6769 if ((*(this->detector
)) (&coding
, &detect_info
)
6771 && (detect_info
.found
& (1 << category
)))
6777 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6779 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6780 id
= coding_categories
[coding_category_raw_text
].id
;
6781 val
= Fcons (make_number (id
), Qnil
);
6783 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6785 detect_info
.found
= CATEGORY_MASK_ANY
;
6786 id
= coding_categories
[coding_category_undecided
].id
;
6787 val
= Fcons (make_number (id
), Qnil
);
6791 if (detect_info
.found
)
6793 detect_info
.found
= 1 << category
;
6794 val
= Fcons (make_number (this->id
), Qnil
);
6797 for (i
= 0; i
< coding_category_raw_text
; i
++)
6798 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6800 detect_info
.found
= 1 << coding_priorities
[i
];
6801 id
= coding_categories
[coding_priorities
[i
]].id
;
6802 val
= Fcons (make_number (id
), Qnil
);
6808 int mask
= detect_info
.rejected
| detect_info
.found
;
6812 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6814 category
= coding_priorities
[i
];
6815 if (! (mask
& (1 << category
)))
6817 found
|= 1 << category
;
6818 id
= coding_categories
[category
].id
;
6819 val
= Fcons (make_number (id
), val
);
6822 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6824 category
= coding_priorities
[i
];
6825 if (detect_info
.found
& (1 << category
))
6827 id
= coding_categories
[category
].id
;
6828 val
= Fcons (make_number (id
), val
);
6831 detect_info
.found
|= found
;
6836 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6837 val
= Fcons (make_number (coding
.id
), Qnil
);
6840 /* Then, detect eol-format if necessary. */
6842 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6845 if (VECTORP (eol_type
))
6847 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6848 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6849 coding_category_raw_text
);
6850 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6851 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6852 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6853 coding_category_utf_16_be
);
6854 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6855 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6856 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6857 coding_category_utf_16_le
);
6861 if (EQ (eol_type
, Qunix
))
6862 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6863 else if (EQ (eol_type
, Qdos
))
6864 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6866 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6869 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6871 enum coding_category category
;
6874 id
= XINT (XCAR (tail
));
6875 attrs
= CODING_ID_ATTRS (id
);
6876 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6877 eol_type
= CODING_ID_EOL_TYPE (id
);
6878 if (VECTORP (eol_type
))
6880 if (category
== coding_category_utf_16_be
6881 || category
== coding_category_utf_16_be_nosig
)
6882 this_eol
= utf_16_be_eol
;
6883 else if (category
== coding_category_utf_16_le
6884 || category
== coding_category_utf_16_le_nosig
)
6885 this_eol
= utf_16_le_eol
;
6887 this_eol
= normal_eol
;
6889 if (this_eol
== EOL_SEEN_LF
)
6890 XSETCAR (tail
, AREF (eol_type
, 0));
6891 else if (this_eol
== EOL_SEEN_CRLF
)
6892 XSETCAR (tail
, AREF (eol_type
, 1));
6893 else if (this_eol
== EOL_SEEN_CR
)
6894 XSETCAR (tail
, AREF (eol_type
, 2));
6896 XSETCAR (tail
, CODING_ID_NAME (id
));
6899 XSETCAR (tail
, CODING_ID_NAME (id
));
6903 return (highest
? XCAR (val
) : val
);
6907 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6909 doc
: /* Detect coding system of the text in the region between START and END.
6910 Return a list of possible coding systems ordered by priority.
6912 If only ASCII characters are found, it returns a list of single element
6913 `undecided' or its subsidiary coding system according to a detected
6916 If optional argument HIGHEST is non-nil, return the coding system of
6917 highest priority. */)
6918 (start
, end
, highest
)
6919 Lisp_Object start
, end
, highest
;
6922 int from_byte
, to_byte
;
6924 CHECK_NUMBER_COERCE_MARKER (start
);
6925 CHECK_NUMBER_COERCE_MARKER (end
);
6927 validate_region (&start
, &end
);
6928 from
= XINT (start
), to
= XINT (end
);
6929 from_byte
= CHAR_TO_BYTE (from
);
6930 to_byte
= CHAR_TO_BYTE (to
);
6932 if (from
< GPT
&& to
>= GPT
)
6933 move_gap_both (to
, to_byte
);
6935 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6936 to_byte
- from_byte
,
6938 !NILP (current_buffer
6939 ->enable_multibyte_characters
),
6943 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6945 doc
: /* Detect coding system of the text in STRING.
6946 Return a list of possible coding systems ordered by priority.
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority. */)
6955 Lisp_Object string
, highest
;
6957 CHECK_STRING (string
);
6959 return detect_coding_system (XSTRING (string
)->data
,
6960 STRING_BYTES (XSTRING (string
)),
6962 STRING_MULTIBYTE (string
),
6968 char_encodable_p (c
, attrs
)
6973 struct charset
*charset
;
6975 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6976 CONSP (tail
); tail
= XCDR (tail
))
6978 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6979 if (CHAR_CHARSET_P (c
, charset
))
6982 return (! NILP (tail
));
6986 /* Return a list of coding systems that safely encode the text between
6987 START and END. If EXCLUDE is non-nil, it is a list of coding
6988 systems not to check. The returned list doesn't contain any such
6989 coding systems. In any case, if the text contains only ASCII or is
6990 unibyte, return t. */
6992 DEFUN ("find-coding-systems-region-internal",
6993 Ffind_coding_systems_region_internal
,
6994 Sfind_coding_systems_region_internal
, 2, 3, 0,
6995 doc
: /* Internal use only. */)
6996 (start
, end
, exclude
)
6997 Lisp_Object start
, end
, exclude
;
6999 Lisp_Object coding_attrs_list
, safe_codings
;
7000 EMACS_INT start_byte
, end_byte
;
7001 const unsigned char *p
, *pbeg
, *pend
;
7003 Lisp_Object tail
, elt
;
7005 if (STRINGP (start
))
7007 if (!STRING_MULTIBYTE (start
)
7008 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
7011 end_byte
= STRING_BYTES (XSTRING (start
));
7015 CHECK_NUMBER_COERCE_MARKER (start
);
7016 CHECK_NUMBER_COERCE_MARKER (end
);
7017 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7018 args_out_of_range (start
, end
);
7019 if (NILP (current_buffer
->enable_multibyte_characters
))
7021 start_byte
= CHAR_TO_BYTE (XINT (start
));
7022 end_byte
= CHAR_TO_BYTE (XINT (end
));
7023 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7026 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7028 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7029 move_gap_both (XINT (start
), start_byte
);
7031 move_gap_both (XINT (end
), end_byte
);
7035 coding_attrs_list
= Qnil
;
7036 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7038 || NILP (Fmemq (XCAR (tail
), exclude
)))
7042 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7043 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7044 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7045 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7048 if (STRINGP (start
))
7049 p
= pbeg
= XSTRING (start
)->data
;
7051 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7052 pend
= p
+ (end_byte
- start_byte
);
7054 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7055 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7059 if (ASCII_BYTE_P (*p
))
7063 c
= STRING_CHAR_ADVANCE (p
);
7065 charset_map_loaded
= 0;
7066 for (tail
= coding_attrs_list
; CONSP (tail
);)
7071 else if (char_encodable_p (c
, elt
))
7073 else if (CONSP (XCDR (tail
)))
7075 XSETCAR (tail
, XCAR (XCDR (tail
)));
7076 XSETCDR (tail
, XCDR (XCDR (tail
)));
7080 XSETCAR (tail
, Qnil
);
7084 if (charset_map_loaded
)
7086 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7088 if (STRINGP (start
))
7089 pbeg
= XSTRING (start
)->data
;
7091 pbeg
= BYTE_POS_ADDR (start_byte
);
7092 p
= pbeg
+ p_offset
;
7093 pend
= pbeg
+ pend_offset
;
7098 safe_codings
= Qnil
;
7099 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7100 if (! NILP (XCAR (tail
)))
7101 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7103 return safe_codings
;
7107 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7108 Scheck_coding_systems_region
, 3, 3, 0,
7109 doc
: /* Check if the region is encodable by coding systems.
7111 START and END are buffer positions specifying the region.
7112 CODING-SYSTEM-LIST is a list of coding systems to check.
7114 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7115 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7116 whole region, POS0, POS1, ... are buffer positions where non-encodable
7117 characters are found.
7119 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7122 START may be a string. In that case, check if the string is
7123 encodable, and the value contains indices to the string instead of
7124 buffer positions. END is ignored. */)
7125 (start
, end
, coding_system_list
)
7126 Lisp_Object start
, end
, coding_system_list
;
7129 EMACS_INT start_byte
, end_byte
;
7131 const unsigned char *p
, *pbeg
, *pend
;
7133 Lisp_Object tail
, elt
;
7135 if (STRINGP (start
))
7137 if (!STRING_MULTIBYTE (start
)
7138 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
7141 end_byte
= STRING_BYTES (XSTRING (start
));
7146 CHECK_NUMBER_COERCE_MARKER (start
);
7147 CHECK_NUMBER_COERCE_MARKER (end
);
7148 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7149 args_out_of_range (start
, end
);
7150 if (NILP (current_buffer
->enable_multibyte_characters
))
7152 start_byte
= CHAR_TO_BYTE (XINT (start
));
7153 end_byte
= CHAR_TO_BYTE (XINT (end
));
7154 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7157 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7159 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7160 move_gap_both (XINT (start
), start_byte
);
7162 move_gap_both (XINT (end
), end_byte
);
7168 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7171 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7176 if (STRINGP (start
))
7177 p
= pbeg
= XSTRING (start
)->data
;
7179 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7180 pend
= p
+ (end_byte
- start_byte
);
7182 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7183 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7187 if (ASCII_BYTE_P (*p
))
7191 c
= STRING_CHAR_ADVANCE (p
);
7193 charset_map_loaded
= 0;
7194 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7196 elt
= XCDR (XCAR (tail
));
7197 if (! char_encodable_p (c
, XCAR (elt
)))
7198 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7200 if (charset_map_loaded
)
7202 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7204 if (STRINGP (start
))
7205 pbeg
= XSTRING (start
)->data
;
7207 pbeg
= BYTE_POS_ADDR (start_byte
);
7208 p
= pbeg
+ p_offset
;
7209 pend
= pbeg
+ pend_offset
;
7217 for (; CONSP (tail
); tail
= XCDR (tail
))
7220 if (CONSP (XCDR (XCDR (elt
))))
7221 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7231 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7232 Lisp_Object start
, end
, coding_system
, dst_object
;
7233 int encodep
, norecord
;
7235 struct coding_system coding
;
7236 EMACS_INT from
, from_byte
, to
, to_byte
;
7237 Lisp_Object src_object
;
7239 CHECK_NUMBER_COERCE_MARKER (start
);
7240 CHECK_NUMBER_COERCE_MARKER (end
);
7241 if (NILP (coding_system
))
7242 coding_system
= Qno_conversion
;
7244 CHECK_CODING_SYSTEM (coding_system
);
7245 src_object
= Fcurrent_buffer ();
7246 if (NILP (dst_object
))
7247 dst_object
= src_object
;
7248 else if (! EQ (dst_object
, Qt
))
7249 CHECK_BUFFER (dst_object
);
7251 validate_region (&start
, &end
);
7252 from
= XFASTINT (start
);
7253 from_byte
= CHAR_TO_BYTE (from
);
7254 to
= XFASTINT (end
);
7255 to_byte
= CHAR_TO_BYTE (to
);
7257 setup_coding_system (coding_system
, &coding
);
7258 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7261 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7264 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7267 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7269 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7270 error ("Code conversion error: %d", coding
.result
);
7272 return (BUFFERP (dst_object
)
7273 ? make_number (coding
.produced_char
)
7274 : coding
.dst_object
);
7278 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7279 3, 4, "r\nzCoding system: ",
7280 doc
: /* Decode the current region from the specified coding system.
7281 When called from a program, takes four arguments:
7282 START, END, CODING-SYSTEM, and DESTINATION.
7283 START and END are buffer positions.
7285 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7286 If nil, the region between START and END is replace by the decoded text.
7287 If buffer, the decoded text is inserted in the buffer.
7288 If t, the decoded text is returned.
7290 This function sets `last-coding-system-used' to the precise coding system
7291 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7292 not fully specified.)
7293 It returns the length of the decoded text. */)
7294 (start
, end
, coding_system
, destination
)
7295 Lisp_Object start
, end
, coding_system
, destination
;
7297 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7300 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7301 3, 4, "r\nzCoding system: ",
7302 doc
: /* Encode the current region by specified coding system.
7303 When called from a program, takes three arguments:
7304 START, END, and CODING-SYSTEM. START and END are buffer positions.
7306 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7307 If nil, the region between START and END is replace by the encoded text.
7308 If buffer, the encoded text is inserted in the buffer.
7309 If t, the encoded text is returned.
7311 This function sets `last-coding-system-used' to the precise coding system
7312 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7313 not fully specified.)
7314 It returns the length of the encoded text. */)
7315 (start
, end
, coding_system
, destination
)
7316 Lisp_Object start
, end
, coding_system
, destination
;
7318 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7322 code_convert_string (string
, coding_system
, dst_object
,
7323 encodep
, nocopy
, norecord
)
7324 Lisp_Object string
, coding_system
, dst_object
;
7325 int encodep
, nocopy
, norecord
;
7327 struct coding_system coding
;
7328 EMACS_INT chars
, bytes
;
7330 CHECK_STRING (string
);
7331 if (NILP (coding_system
))
7334 Vlast_coding_system_used
= Qno_conversion
;
7335 if (NILP (dst_object
))
7336 return (nocopy
? Fcopy_sequence (string
) : string
);
7339 if (NILP (coding_system
))
7340 coding_system
= Qno_conversion
;
7342 CHECK_CODING_SYSTEM (coding_system
);
7343 if (NILP (dst_object
))
7345 else if (! EQ (dst_object
, Qt
))
7346 CHECK_BUFFER (dst_object
);
7348 setup_coding_system (coding_system
, &coding
);
7349 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7350 chars
= XSTRING (string
)->size
;
7351 bytes
= STRING_BYTES (XSTRING (string
));
7353 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7355 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7357 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7359 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7360 error ("Code conversion error: %d", coding
.result
);
7362 return (BUFFERP (dst_object
)
7363 ? make_number (coding
.produced_char
)
7364 : coding
.dst_object
);
7368 /* Encode or decode STRING according to CODING_SYSTEM.
7369 Do not set Vlast_coding_system_used.
7371 This function is called only from macros DECODE_FILE and
7372 ENCODE_FILE, thus we ignore character composition. */
7375 code_convert_string_norecord (string
, coding_system
, encodep
)
7376 Lisp_Object string
, coding_system
;
7379 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7383 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7385 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7387 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7388 if the decoding operation is trivial.
7390 Optional fourth arg BUFFER non-nil meant that the decoded text is
7391 inserted in BUFFER instead of returned as a string. In this case,
7392 the return value is BUFFER.
7394 This function sets `last-coding-system-used' to the precise coding system
7395 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7396 not fully specified. */)
7397 (string
, coding_system
, nocopy
, buffer
)
7398 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7400 return code_convert_string (string
, coding_system
, buffer
,
7401 0, ! NILP (nocopy
), 0);
7404 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7406 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7408 Optional third arg NOCOPY non-nil means it is OK to return STRING
7409 itself if the encoding operation is trivial.
7411 Optional fourth arg BUFFER non-nil means that the encoded text is
7412 inserted in BUFFER instead of returned as a string. In this case,
7413 the return value is the number of characters produced.
7415 This function sets `last-coding-system-used' to the precise coding system
7416 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7417 not fully specified.) */)
7418 (string
, coding_system
, nocopy
, buffer
)
7419 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* Delegate to code_convert_string: BUFFER is the destination, the
   first 1 selects encoding, and NOCOPY is reduced to a C boolean.
   The last argument is 0, matching Fdecode_coding_string, so that
   `last-coding-system-used' is recorded as the doc string promises;
   code_convert_string_norecord is the variant that passes 1 there.  */
7421 return code_convert_string (string
, coding_system
, buffer
,
7422 1, ! NILP (nocopy
), 0);
7426 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7427 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7428 Return the corresponding character. */)
7432 Lisp_Object spec
, attrs
, val
;
7433 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7436 CHECK_NATNUM (code
);
7437 c
= XFASTINT (code
);
7438 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7439 attrs
= AREF (spec
, 0);
7441 if (ASCII_BYTE_P (c
)
7442 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7445 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7446 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7447 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7448 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7451 charset
= charset_roman
;
7452 else if (c
>= 0xA0 && c
< 0xDF)
7454 charset
= charset_kana
;
7459 int s1
= c
>> 8, s2
= c
& 0xFF;
7461 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7462 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7463 error ("Invalid code: %d", code
);
7465 charset
= charset_kanji
;
7467 c
= DECODE_CHAR (charset
, c
);
7469 error ("Invalid code: %d", code
);
7470 return make_number (c
);
7474 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7475 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7476 Return the corresponding code in SJIS. */)
7480 Lisp_Object spec
, attrs
, charset_list
;
7482 struct charset
*charset
;
7485 CHECK_CHARACTER (ch
);
/* The attributes come from the coding system named by
   Vsjis_coding_system.  */
7487 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7488 attrs
= AREF (spec
, 0);
7490 if (ASCII_CHAR_P (c
)
7491 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* Look C up in the coding system's charset list; char_charset stores
   the code point in CODE.  */
7494 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7495 charset
= char_charset (c
, charset_list
, &code
);
7496 if (code
== CHARSET_INVALID_CODE (charset
))
7497 error ("Can't encode by shift_jis encoding: %d", c
);
7500 return make_number (code
);
7503 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7504 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7505 Return the corresponding character. */)
7509 Lisp_Object spec
, attrs
, val
;
7510 struct charset
*charset_roman
, *charset_big5
, *charset
;
7513 CHECK_NATNUM (code
);
7514 c
= XFASTINT (code
);
7515 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7516 attrs
= AREF (spec
, 0);
7518 if (ASCII_BYTE_P (c
)
7519 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7522 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7523 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7524 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7527 charset
= charset_roman
;
7530 int b1
= c
>> 8, b2
= c
& 0x7F;
7531 if (b1
< 0xA1 || b1
> 0xFE
7532 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7533 error ("Invalid code: %d", code
);
7534 charset
= charset_big5
;
7536 c
= DECODE_CHAR (charset
, (unsigned )c
);
7538 error ("Invalid code: %d", code
);
7539 return make_number (c
);
7542 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7543 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7544 Return the corresponding character code in Big5. */)
7548 Lisp_Object spec
, attrs
, charset_list
;
7549 struct charset
*charset
;
7553 CHECK_CHARACTER (ch
);
/* The attributes come from the coding system named by
   Vbig5_coding_system.  */
7555 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7556 attrs
= AREF (spec
, 0);
7557 if (ASCII_CHAR_P (c
)
7558 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* Look C up in the coding system's charset list; char_charset stores
   the code point in CODE.  */
7561 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7562 charset
= char_charset (c
, charset_list
, &code
);
7563 if (code
== CHARSET_INVALID_CODE (charset
))
7564 error ("Can't encode by Big5 encoding: %d", c
);
7566 return make_number (code
);
7570 DEFUN ("set-terminal-coding-system-internal",
7571 Fset_terminal_coding_system_internal
,
7572 Sset_terminal_coding_system_internal
, 1, 1, 0,
7573 doc
: /* Internal use only. */)
7575 Lisp_Object coding_system
;
/* CODING_SYSTEM must be a symbol naming a valid coding system.  */
7577 CHECK_SYMBOL (coding_system
);
7578 setup_coding_system (Fcheck_coding_system (coding_system
),
7581 /* We had better not send unsafe characters to terminal. */
7582 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7583 /* Character composition should be disabled. */
7584 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Source is flagged multibyte, destination unibyte.  */
7585 terminal_coding
.src_multibyte
= 1;
7586 terminal_coding
.dst_multibyte
= 0;
7590 DEFUN ("set-safe-terminal-coding-system-internal",
7591 Fset_safe_terminal_coding_system_internal
,
7592 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7593 doc
: /* Internal use only. */)
7595 Lisp_Object coding_system
;
/* CODING_SYSTEM must be a symbol naming a valid coding system; it is
   installed into safe_terminal_coding.  */
7597 CHECK_SYMBOL (coding_system
);
7598 setup_coding_system (Fcheck_coding_system (coding_system
),
7599 &safe_terminal_coding
);
7600 /* Character composition should be disabled. */
7601 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Source is flagged multibyte, destination unibyte.  */
7602 safe_terminal_coding
.src_multibyte
= 1;
7603 safe_terminal_coding
.dst_multibyte
= 0;
7607 DEFUN ("terminal-coding-system",
7608 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7609 doc
: /* Return coding system specified for terminal output. */)
/* The name is looked up from the id stored in terminal_coding.  */
7612 return CODING_ID_NAME (terminal_coding
.id
);
7615 DEFUN ("set-keyboard-coding-system-internal",
7616 Fset_keyboard_coding_system_internal
,
7617 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7618 doc
: /* Internal use only. */)
7620 Lisp_Object coding_system
;
/* CODING_SYSTEM must be a symbol naming a valid coding system.  */
7622 CHECK_SYMBOL (coding_system
);
7623 setup_coding_system (Fcheck_coding_system (coding_system
),
7625 /* Character composition should be disabled. */
7626 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7630 DEFUN ("keyboard-coding-system",
7631 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7632 doc
: /* Return coding system specified for decoding keyboard input. */)
/* The name is looked up from the id stored in keyboard_coding.  */
7635 return CODING_ID_NAME (keyboard_coding
.id
);
7639 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7640 Sfind_operation_coding_system
, 1, MANY
, 0,
7641 doc
: /* Choose a coding system for an operation based on the target name.
7642 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7643 DECODING-SYSTEM is the coding system to use for decoding
7644 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7645 for encoding (in case OPERATION does encoding).
7647 The first argument OPERATION specifies an I/O primitive:
7648 For file I/O, `insert-file-contents' or `write-region'.
7649 For process I/O, `call-process', `call-process-region', or `start-process'.
7650 For network I/O, `open-network-stream'.
7652 The remaining arguments should be the same arguments that were passed
7653 to the primitive. Depending on which primitive, one of those arguments
7654 is selected as the TARGET. For example, if OPERATION does file I/O,
7655 whichever argument specifies the file name is TARGET.
7657 TARGET has a meaning which depends on OPERATION:
7658 For file I/O, TARGET is a file name.
7659 For process I/O, TARGET is a process name.
7660 For network I/O, TARGET is a service name or a port number.
7662 This function looks up what is specified for TARGET in
7663 `file-coding-system-alist', `process-coding-system-alist',
7664 or `network-coding-system-alist' depending on OPERATION.
7665 They may specify a coding system, a cons of coding systems,
7666 or a function symbol to call.
7667 In the last case, we call the function with one argument,
7668 which is a list of all the arguments given to this function.
7670 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7675 Lisp_Object operation
, target_idx
, target
, val
;
7676 register Lisp_Object chain
;
7679 error ("Too few arguments");
/* OPERATION must be a symbol whose `target-idx' property is an
   integer: the zero-based position of TARGET among the arguments that
   follow OPERATION.  */
7680 operation
= args
[0];
7681 if (!SYMBOLP (operation
)
7682 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7683 error ("Invalid first argument");
/* TARGET is read from args[target_idx + 1], so at least
   target_idx + 2 arguments are required.  */
7684 if (nargs
<= 1 + XINT (target_idx
))
7685 error ("Too few arguments for operation: %s",
7686 XSYMBOL (operation
)->name
->data
);
7687 target
= args
[XINT (target_idx
) + 1];
/* TARGET must be a string, except that open-network-stream also
   accepts an integer.  */
7688 if (!(STRINGP (target
)
7689 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7690 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist to search: file operations use the file alist,
   open-network-stream the network alist, everything else the process
   alist.  */
7692 chain
= ((EQ (operation
, Qinsert_file_contents
)
7693 || EQ (operation
, Qwrite_region
))
7694 ? Vfile_coding_system_alist
7695 : (EQ (operation
, Qopen_network_stream
)
7696 ? Vnetwork_coding_system_alist
7697 : Vprocess_coding_system_alist
));
7701 for (; CONSP (chain
); chain
= XCDR (chain
))
/* An entry matches when its car is a string pattern matching a string
   TARGET, or is EQ to an integer TARGET.  */
7707 && ((STRINGP (target
)
7708 && STRINGP (XCAR (elt
))
7709 && fast_string_match (XCAR (elt
), target
) >= 0)
7710 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7713 /* Here, if VAL is both a valid coding system and a valid
7714 function symbol, we return VAL as a coding system. */
7717 if (! SYMBOLP (val
))
7719 if (! NILP (Fcoding_system_p (val
)))
7720 return Fcons (val
, val
);
7721 if (! NILP (Ffboundp (val
)))
/* VAL names a function: call it with the full argument list.  */
7723 val
= call1 (val
, Flist (nargs
, args
));
7726 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7727 return Fcons (val
, val
);
7735 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7736 Sset_coding_system_priority
, 0, MANY
, 0,
7737 doc
: /* Assign higher priority to the coding systems given as arguments.
7738 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
/* CHANGED[cat] is nonzero once category CAT has been placed by one of
   the arguments; PRIORITIES is the new order being built.  */
7744 int changed
[coding_category_max
];
7745 enum coding_category priorities
[coding_category_max
];
7747 bzero (changed
, sizeof changed
);
/* First pass: the categories of the arguments, in argument order,
   fill the first J slots of PRIORITIES.  */
7749 for (i
= j
= 0; i
< nargs
; i
++)
7751 enum coding_category category
;
7752 Lisp_Object spec
, attrs
;
7754 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7755 attrs
= AREF (spec
, 0);
7756 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7757 if (changed
[category
])
7758 /* Ignore this coding system because a coding system of the
7759 same category already had a higher priority. */
7761 changed
[category
] = 1;
7762 priorities
[j
++] = category
;
/* If the category already has a representative coding system and it
   is a different one, set the category up with this argument.  */
7763 if (coding_categories
[category
].id
>= 0
7764 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7765 setup_coding_system (args
[i
], &coding_categories
[category
]);
7768 /* Now we have decided top J priorities. Reflect the order of the
7769 original priorities to the remaining priorities. */
7771 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
/* Advance J past categories that were already placed above.  */
7773 while (j
< coding_category_max
7774 && changed
[coding_priorities
[j
]])
7776 if (j
== coding_category_max
)
7778 priorities
[i
] = coding_priorities
[j
];
/* Install the new order.  */
7781 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7785 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7786 Scoding_system_priority_list
, 0, 1, 0,
7787 doc
: /* Return a list of coding systems ordered by their priorities.
7788 HIGHESTP non-nil means just return the highest priority one. */)
7790 Lisp_Object highestp
;
/* Walk the categories in priority order, collecting the base name of
   each category's coding system.  */
7795 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7797 enum coding_category category
= coding_priorities
[i
];
7798 int id
= coding_categories
[category
].id
;
7803 attrs
= CODING_ID_ATTRS (id
);
/* With HIGHESTP, the first name found is the answer.  */
7804 if (! NILP (highestp
))
7805 return CODING_ATTR_BASE_NAME (attrs
);
7806 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
/* VAL was built by consing onto the front; reverse it so the highest
   priority comes first.  */
7808 return Fnreverse (val
);
/* Suffixes appended to a base coding system name by make_subsidiaries
   to form its three subsidiary names.  */
7811 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
/* Return a vector of three symbols named by appending each entry of
   `suffixes' to the name of the symbol BASE.  */
7814 make_subsidiaries (base
)
7817 Lisp_Object subsidiaries
;
7818 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
/* Room for the base name plus the longest suffix ("-unix", five
   bytes) and its terminating NUL.  */
7819 char *buf
= (char *) alloca (base_name_len
+ 6);
7822 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7823 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7824 for (i
= 0; i
< 3; i
++)
/* Copy the suffix, including its NUL, right after the base name and
   intern the result.  */
7826 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7827 ASET (subsidiaries
, i
, intern (buf
));
7829 return subsidiaries
;
/* Build the attribute vector of a new coding system from ARGS and
   register the coding system -- and, when no eol-type is given, its
   three eol-specific subsidiaries -- in Vcoding_system_hash_table,
   Vcoding_system_list and Vcoding_system_alist.  The attributes that
   are set and the coding category depend on the coding type.  */
7833 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7834 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7835 doc
: /* For internal use only.
7836 usage: (define-coding-system-internal ...) */)
7842 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
7843 Lisp_Object attrs
; /* Vector of attributes. */
7844 Lisp_Object eol_type
;
7845 Lisp_Object aliases
;
7846 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7847 enum coding_category category
;
7848 Lisp_Object tail
, val
;
7849 int max_charset_id
= 0;
7852 if (nargs
< coding_arg_max
)
7855 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
/* Attributes common to every coding type: base name, mnemonic
   (a string or a character) and the type symbol.  */
7857 name
= args
[coding_arg_name
];
7858 CHECK_SYMBOL (name
);
7859 CODING_ATTR_BASE_NAME (attrs
) = name
;
7861 val
= args
[coding_arg_mnemonic
];
7862 if (! STRINGP (val
))
7863 CHECK_CHARACTER (val
);
7864 CODING_ATTR_MNEMONIC (attrs
) = val
;
7866 coding_type
= args
[coding_arg_coding_type
];
7867 CHECK_SYMBOL (coding_type
);
7868 CODING_ATTR_TYPE (attrs
) = coding_type
;
/* Charset list.  The symbols iso-2022 and emacs-mule select the
   corresponding predefined lists and are only valid for the matching
   coding type; otherwise a copy of the list is walked and each element
   is replaced by its charset id.  MAX_CHARSET_ID tracks the largest id
   seen.  */
7870 charset_list
= args
[coding_arg_charset_list
];
7871 if (SYMBOLP (charset_list
))
7873 if (EQ (charset_list
, Qiso_2022
))
7875 if (! EQ (coding_type
, Qiso_2022
))
7876 error ("Invalid charset-list");
7877 charset_list
= Viso_2022_charset_list
;
7879 else if (EQ (charset_list
, Qemacs_mule
))
7881 if (! EQ (coding_type
, Qemacs_mule
))
7882 error ("Invalid charset-list");
7883 charset_list
= Vemacs_mule_charset_list
;
7885 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7886 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7887 max_charset_id
= XFASTINT (XCAR (tail
));
7891 charset_list
= Fcopy_sequence (charset_list
);
7892 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7894 struct charset
*charset
;
7897 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7898 if (EQ (coding_type
, Qiso_2022
)
7899 ? CHARSET_ISO_FINAL (charset
) < 0
7900 : EQ (coding_type
, Qemacs_mule
)
7901 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7903 error ("Can't handle charset `%s'",
7904 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7906 XCAR (tail
) = make_number (charset
->id
);
7907 if (max_charset_id
< charset
->id
)
7908 max_charset_id
= charset
->id
;
7911 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
/* SAFE_CHARSETS is a string with one byte per charset id up to
   MAX_CHARSET_ID; the bytes of the listed charsets are set to 0.  */
7913 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7915 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7916 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7917 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7919 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
/* Translation tables, conversion functions, default character and
   property list are stored in ATTRS as given.  */
7921 val
= args
[coding_arg_decode_translation_table
];
7923 CHECK_CHAR_TABLE (val
);
7924 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7926 val
= args
[coding_arg_encode_translation_table
];
7928 CHECK_CHAR_TABLE (val
);
7929 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7931 val
= args
[coding_arg_post_read_conversion
];
7933 CODING_ATTR_POST_READ (attrs
) = val
;
7935 val
= args
[coding_arg_pre_write_conversion
];
7937 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7939 val
= args
[coding_arg_default_char
];
7941 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7944 CHECK_CHARACTER (val
);
7945 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7948 val
= args
[coding_arg_plist
];
7950 CODING_ATTR_PLIST (attrs
) = val
;
/* From here on the attributes and the coding category depend on
   CODING_TYPE.  */
7952 if (EQ (coding_type
, Qcharset
))
7955 /* Generate a lisp vector of 256 elements. Each element is nil,
7956 integer, or a list of charset IDs.
7958 If Nth element is nil, the byte code N is invalid in this
7961 If Nth element is a number NUM, N is the first byte of a
7962 charset whose ID is NUM.
7964 If Nth element is a list of charset IDs, N is the first byte
7965 of one of them. The list is sorted by dimensions of the
7966 charsets. A charset of smaller dimension comes firtst.
7968 for (list
= Qnil
, tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7970 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7972 if (charset
->method
== CHARSET_METHOD_SUPERSET
)
7974 val
= CHARSET_SUPERSET (charset
);
7975 for (; CONSP (val
); val
= XCDR (val
))
7976 list
= Fcons (XCAR (XCAR (val
)), list
);
7979 list
= Fcons (XCAR (tail
), list
);
7982 val
= Fmake_vector (make_number (256), Qnil
);
7984 for (tail
= Fnreverse (list
); CONSP (tail
); tail
= XCDR (tail
))
7986 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7987 int dim
= CHARSET_DIMENSION (charset
);
7988 int idx
= (dim
- 1) * 4;
7990 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
7991 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
7993 for (i
= charset
->code_space
[idx
];
7994 i
<= charset
->code_space
[idx
+ 1]; i
++)
7996 Lisp_Object tmp
, tmp2
;
7999 tmp
= AREF (val
, i
);
8002 else if (NUMBERP (tmp
))
8004 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8006 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8008 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8012 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8014 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8019 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8022 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8023 XSETCAR (tmp2
, XCAR (tail
));
8029 ASET (attrs
, coding_attr_charset_valids
, val
);
8030 category
= coding_category_charset
;
8032 else if (EQ (coding_type
, Qccl
))
8036 if (nargs
< coding_arg_ccl_max
)
8039 val
= args
[coding_arg_ccl_decoder
];
8040 CHECK_CCL_PROGRAM (val
);
8042 val
= Fcopy_sequence (val
);
8043 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8045 val
= args
[coding_arg_ccl_encoder
];
8046 CHECK_CCL_PROGRAM (val
);
8048 val
= Fcopy_sequence (val
);
8049 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8051 val
= args
[coding_arg_ccl_valids
];
8052 valids
= Fmake_string (make_number (256), make_number (0));
8053 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8060 from
= to
= XINT (val
);
8061 if (from
< 0 || from
> 255)
8062 args_out_of_range_3 (val
, make_number (0), make_number (255));
8067 CHECK_NUMBER (XCAR (val
));
8068 CHECK_NUMBER (XCDR (val
));
8069 from
= XINT (XCAR (val
));
8070 if (from
< 0 || from
> 255)
8071 args_out_of_range_3 (XCAR (val
),
8072 make_number (0), make_number (255));
8073 to
= XINT (XCDR (val
));
8074 if (to
< from
|| to
> 255)
8075 args_out_of_range_3 (XCDR (val
),
8076 XCAR (val
), make_number (255));
8078 for (i
= from
; i
<= to
; i
++)
8079 XSTRING (valids
)->data
[i
] = 1;
8081 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8083 category
= coding_category_ccl
;
8085 else if (EQ (coding_type
, Qutf_16
))
8087 Lisp_Object bom
, endian
;
8089 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8091 if (nargs
< coding_arg_utf16_max
)
8094 bom
= args
[coding_arg_utf16_bom
];
8095 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8098 CHECK_CODING_SYSTEM (XCAR (bom
));
8099 CHECK_CODING_SYSTEM (XCDR (bom
));
8101 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8103 endian
= args
[coding_arg_utf16_endian
];
8104 CHECK_SYMBOL (endian
);
8107 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8108 error ("Invalid endian: %s", XSYMBOL (endian
)->name
->data
);
8109 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8111 category
= (CONSP (bom
)
8112 ? coding_category_utf_16_auto
8114 ? (EQ (endian
, Qbig
)
8115 ? coding_category_utf_16_be_nosig
8116 : coding_category_utf_16_le_nosig
)
8117 : (EQ (endian
, Qbig
)
8118 ? coding_category_utf_16_be
8119 : coding_category_utf_16_le
));
8121 else if (EQ (coding_type
, Qiso_2022
))
8123 Lisp_Object initial
, reg_usage
, request
, flags
;
8126 if (nargs
< coding_arg_iso2022_max
)
8129 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8130 CHECK_VECTOR (initial
);
8131 for (i
= 0; i
< 4; i
++)
8133 val
= Faref (initial
, make_number (i
));
8136 struct charset
*charset
;
8138 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8139 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8140 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8141 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8144 ASET (initial
, i
, make_number (-1));
8147 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8148 CHECK_CONS (reg_usage
);
8149 CHECK_NATNUM (XCAR (reg_usage
));
8150 CHECK_NATNUM (XCDR (reg_usage
));
8152 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8153 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8159 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
8160 CHECK_NATNUM (XCDR (val
));
8161 if (XINT (XCDR (val
)) >= 4)
8162 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8163 XCAR (val
) = make_number (id
);
8166 flags
= args
[coding_arg_iso2022_flags
];
8167 CHECK_NATNUM (flags
);
8169 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8170 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8172 ASET (attrs
, coding_attr_iso_initial
, initial
);
8173 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8174 ASET (attrs
, coding_attr_iso_request
, request
);
8175 ASET (attrs
, coding_attr_iso_flags
, flags
);
8176 setup_iso_safe_charsets (attrs
);
8178 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8179 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8180 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8181 ? coding_category_iso_7_else
8182 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8183 ? coding_category_iso_7
8184 : coding_category_iso_7_tight
);
8187 int id
= XINT (AREF (initial
, 1));
8189 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8190 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8192 ? coding_category_iso_8_else
8193 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8194 ? coding_category_iso_8_1
8195 : coding_category_iso_8_2
);
8197 if (category
!= coding_category_iso_8_1
8198 && category
!= coding_category_iso_8_2
)
8199 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8201 else if (EQ (coding_type
, Qemacs_mule
))
8203 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8204 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8205 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8206 category
= coding_category_emacs_mule
;
8208 else if (EQ (coding_type
, Qshift_jis
))
8211 struct charset
*charset
;
8213 if (XINT (Flength (charset_list
)) != 3)
8214 error ("There should be just three charsets");
8216 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8217 if (CHARSET_DIMENSION (charset
) != 1)
8218 error ("Dimension of charset %s is not one",
8219 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8220 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8221 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8223 charset_list
= XCDR (charset_list
);
8224 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8225 if (CHARSET_DIMENSION (charset
) != 1)
8226 error ("Dimension of charset %s is not one",
8227 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8229 charset_list
= XCDR (charset_list
);
8230 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8231 if (CHARSET_DIMENSION (charset
) != 2)
8232 error ("Dimension of charset %s is not two",
8233 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8235 category
= coding_category_sjis
;
8236 Vsjis_coding_system
= name
;
8238 else if (EQ (coding_type
, Qbig5
))
8240 struct charset
*charset
;
8242 if (XINT (Flength (charset_list
)) != 2)
8243 error ("There should be just two charsets");
8245 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8246 if (CHARSET_DIMENSION (charset
) != 1)
8247 error ("Dimension of charset %s is not one",
8248 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8249 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8250 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8252 charset_list
= XCDR (charset_list
);
8253 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8254 if (CHARSET_DIMENSION (charset
) != 2)
8255 error ("Dimension of charset %s is not two",
8256 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8258 category
= coding_category_big5
;
8259 Vbig5_coding_system
= name
;
8261 else if (EQ (coding_type
, Qraw_text
))
8263 category
= coding_category_raw_text
;
8264 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8266 else if (EQ (coding_type
, Qutf_8
))
8268 category
= coding_category_utf_8
;
8269 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8271 else if (EQ (coding_type
, Qundecided
))
8272 category
= coding_category_undecided
;
8274 error ("Invalid coding system type: %s",
8275 XSYMBOL (coding_type
)->name
->data
);
8277 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8279 eol_type
= args
[coding_arg_eol_type
];
8280 if (! NILP (eol_type
)
8281 && ! EQ (eol_type
, Qunix
)
8282 && ! EQ (eol_type
, Qdos
)
8283 && ! EQ (eol_type
, Qmac
))
8284 error ("Invalid eol-type");
8286 aliases
= Fcons (name
, Qnil
);
8288 if (NILP (eol_type
))
8290 eol_type
= make_subsidiaries (name
);
8291 for (i
= 0; i
< 3; i
++)
8293 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8295 this_name
= AREF (eol_type
, i
);
8296 this_aliases
= Fcons (this_name
, Qnil
);
8297 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8298 this_spec
= Fmake_vector (make_number (3), attrs
);
8299 ASET (this_spec
, 1, this_aliases
);
8300 ASET (this_spec
, 2, this_eol_type
);
8301 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8302 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8303 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8304 Vcoding_system_alist
);
8308 spec_vec
= Fmake_vector (make_number (3), attrs
);
8309 ASET (spec_vec
, 1, aliases
);
8310 ASET (spec_vec
, 2, eol_type
);
8312 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8313 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8314 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8315 Vcoding_system_alist
);
8318 int id
= coding_categories
[category
].id
;
8320 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8321 setup_coding_system (name
, &coding_categories
[category
]);
8327 return Fsignal (Qwrong_number_of_arguments
,
8328 Fcons (intern ("define-coding-system-internal"),
8329 make_number (nargs
)));
8332 /* Fixme: should this record the alias relationships for
8333 diagnostics? Should it update coding-system-list? */
8334 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8335 Sdefine_coding_system_alias
, 2, 2, 0,
8336 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8337 (alias
, coding_system
)
8338 Lisp_Object alias
, coding_system
;
8340 Lisp_Object spec
, aliases
, eol_type
;
8342 CHECK_SYMBOL (alias
);
8343 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Append ALIAS to the end of the alias list held in slot 1 of the
   spec vector (the list is modified in place).  */
8344 aliases
= AREF (spec
, 1);
8345 while (!NILP (XCDR (aliases
)))
8346 aliases
= XCDR (aliases
);
8347 XCDR (aliases
) = Fcons (alias
, Qnil
);
/* When the eol-type is a vector of subsidiaries, define matching
   subsidiaries for ALIAS, each aliased to the corresponding original
   subsidiary.  */
8349 eol_type
= AREF (spec
, 2);
8350 if (VECTORP (eol_type
))
8352 Lisp_Object subsidiaries
;
8355 subsidiaries
= make_subsidiaries (alias
);
8356 for (i
= 0; i
< 3; i
++)
8357 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8358 AREF (eol_type
, i
));
/* NOTE(review): this stores the alias's subsidiaries into SPEC, which
   is the same vector the original coding system uses -- confirm that
   this is intended.  */
8360 ASET (spec
, 2, subsidiaries
);
/* Register ALIAS under the shared spec and add its name to the
   completion alist.  */
8363 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8364 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8365 Vcoding_system_alist
);
8370 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8372 doc
: /* Return the base of CODING-SYSTEM.
8373 Any alias or subsidiary coding system is not a base coding system. */)
8375 Lisp_Object coding_system
;
8377 Lisp_Object spec
, attrs
;
/* nil is answered with `no-conversion' without any lookup.  */
8379 if (NILP (coding_system
))
8380 return (Qno_conversion
);
/* Otherwise the base name is read from the attribute vector in slot 0
   of the spec.  */
8381 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8382 attrs
= AREF (spec
, 0);
8383 return CODING_ATTR_BASE_NAME (attrs
);
8386 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8388 doc
: /* Return the property list of CODING-SYSTEM. */)
8390 Lisp_Object coding_system
;
8392 Lisp_Object spec
, attrs
;
/* nil is treated as `no-conversion'.  */
8394 if (NILP (coding_system
))
8395 coding_system
= Qno_conversion
;
/* The property list is read from the attribute vector in slot 0 of
   the spec.  */
8396 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8397 attrs
= AREF (spec
, 0);
8398 return CODING_ATTR_PLIST (attrs
);
8402 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8404 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8406 Lisp_Object coding_system
;
/* nil is treated as `no-conversion'.  */
8410 if (NILP (coding_system
))
8411 coding_system
= Qno_conversion
;
/* The alias list is slot 1 of the spec; it is returned as is, not
   copied.  */
8412 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8413 return AREF (spec
, 1);
8416 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8417 Scoding_system_eol_type
, 1, 1, 0,
8418 doc
: /* Return eol-type of CODING-SYSTEM.
8419 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8421 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8422 and CR respectively.
8424 A vector value indicates that a format of end-of-line should be
8425 detected automatically. Nth element of the vector is the subsidiary
8426 coding system whose eol-type is N. */)
8428 Lisp_Object coding_system
;
8430 Lisp_Object spec
, eol_type
;
/* nil is treated as `no-conversion'.  */
8433 if (NILP (coding_system
))
8434 coding_system
= Qno_conversion
;
8435 if (! CODING_SYSTEM_P (coding_system
))
/* The eol-type is slot 2 of the spec.  A vector of subsidiaries is
   returned as a copy; a symbol is mapped unix -> 0, dos -> 1, anything
   else -> 2.  */
8437 spec
= CODING_SYSTEM_SPEC (coding_system
);
8438 eol_type
= AREF (spec
, 2);
8439 if (VECTORP (eol_type
))
8440 return Fcopy_sequence (eol_type
);
8441 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8442 return make_number (n
);
8448 /*** 9. Post-amble ***/
/* Every coding category starts without a coding system (id -1), and
   the initial priority order is the category order itself.  */
8455 for (i
= 0; i
< coding_category_max
; i
++)
8457 coding_categories
[i
].id
= -1;
8458 coding_priorities
[i
] = i
;
8461 /* ISO2022 specific initialize routine. */
/* Classify the byte ranges first; 0x20, 0x7F, 0xA0 and 0xFF get their
   own classes, and the individual control codes assigned afterwards
   override the range defaults.  */
8462 for (i
= 0; i
< 0x20; i
++)
8463 iso_code_class
[i
] = ISO_control_0
;
8464 for (i
= 0x21; i
< 0x7F; i
++)
8465 iso_code_class
[i
] = ISO_graphic_plane_0
;
8466 for (i
= 0x80; i
< 0xA0; i
++)
8467 iso_code_class
[i
] = ISO_control_1
;
8468 for (i
= 0xA1; i
< 0xFF; i
++)
8469 iso_code_class
[i
] = ISO_graphic_plane_1
;
8470 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8471 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8472 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
8473 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8474 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8475 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8476 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8477 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8478 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8479 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8481 inhibit_pre_post_conversion
= 0;
/* emacs_mule_bytes is indexed by byte value; the four private leading
   codes are set to 3 or 4 after the loop below.  */
8483 for (i
= 0; i
< 256; i
++)
8485 emacs_mule_bytes
[i
] = 1;
8487 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8488 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8489 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8490 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8498 staticpro (&Vcoding_system_hash_table
);
8499 Vcoding_system_hash_table
= Fmakehash (Qeq
);
8501 staticpro (&Vsjis_coding_system
);
8502 Vsjis_coding_system
= Qnil
;
8504 staticpro (&Vbig5_coding_system
);
8505 Vbig5_coding_system
= Qnil
;
8507 staticpro (&Vcode_conversion_work_buf_list
);
8508 Vcode_conversion_work_buf_list
= Qnil
;
8510 staticpro (&Vcode_conversion_reused_work_buf
);
8511 Vcode_conversion_reused_work_buf
= Qnil
;
8513 DEFSYM (Qcharset
, "charset");
8514 DEFSYM (Qtarget_idx
, "target-idx");
8515 DEFSYM (Qcoding_system_history
, "coding-system-history");
8516 Fset (Qcoding_system_history
, Qnil
);
8518 /* Target FILENAME is the first argument. */
8519 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8520 /* Target FILENAME is the third argument. */
8521 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8523 DEFSYM (Qcall_process
, "call-process");
8524 /* Target PROGRAM is the first argument. */
8525 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8527 DEFSYM (Qcall_process_region
, "call-process-region");
8528 /* Target PROGRAM is the third argument. */
8529 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8531 DEFSYM (Qstart_process
, "start-process");
8532 /* Target PROGRAM is the third argument. */
8533 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8535 DEFSYM (Qopen_network_stream
, "open-network-stream");
8536 /* Target SERVICE is the fourth argument. */
8537 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8539 DEFSYM (Qcoding_system
, "coding-system");
8540 DEFSYM (Qcoding_aliases
, "coding-aliases");
8542 DEFSYM (Qeol_type
, "eol-type");
8543 DEFSYM (Qunix
, "unix");
8544 DEFSYM (Qdos
, "dos");
8546 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8547 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8548 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8549 DEFSYM (Qdefault_char
, "default-char");
8550 DEFSYM (Qundecided
, "undecided");
8551 DEFSYM (Qno_conversion
, "no-conversion");
8552 DEFSYM (Qraw_text
, "raw-text");
8554 DEFSYM (Qiso_2022
, "iso-2022");
8556 DEFSYM (Qutf_8
, "utf-8");
8558 DEFSYM (Qutf_16
, "utf-16");
8559 DEFSYM (Qbig
, "big");
8560 DEFSYM (Qlittle
, "little");
8562 DEFSYM (Qshift_jis
, "shift-jis");
8563 DEFSYM (Qbig5
, "big5");
8565 DEFSYM (Qcoding_system_p
, "coding-system-p");
8567 DEFSYM (Qcoding_system_error
, "coding-system-error");
8568 Fput (Qcoding_system_error
, Qerror_conditions
,
8569 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8570 Fput (Qcoding_system_error
, Qerror_message
,
8571 build_string ("Invalid coding system"));
8573 /* Intern this now in case it isn't already done.
8574 Setting this variable twice is harmless.
8575 But don't staticpro it here--that is done in alloc.c. */
8576 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8578 DEFSYM (Qtranslation_table
, "translation-table");
8579 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8580 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8581 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8582 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8584 DEFSYM (Qvalid_codes
, "valid-codes");
8586 DEFSYM (Qemacs_mule
, "emacs-mule");
8588 Vcoding_category_table
8589 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8590 staticpro (&Vcoding_category_table
);
8591 /* The following are targets of code detection. */
8592 ASET (Vcoding_category_table
, coding_category_iso_7
,
8593 intern ("coding-category-iso-7"));
8594 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8595 intern ("coding-category-iso-7-tight"));
8596 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8597 intern ("coding-category-iso-8-1"));
8598 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8599 intern ("coding-category-iso-8-2"));
8600 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8601 intern ("coding-category-iso-7-else"));
8602 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8603 intern ("coding-category-iso-8-else"));
8604 ASET (Vcoding_category_table
, coding_category_utf_8
,
8605 intern ("coding-category-utf-8"));
8606 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8607 intern ("coding-category-utf-16-be"));
8608 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8609 intern ("coding-category-utf-16-le"));
8610 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8611 intern ("coding-category-utf-16-be-nosig"));
8612 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8613 intern ("coding-category-utf-16-le-nosig"));
8614 ASET (Vcoding_category_table
, coding_category_charset
,
8615 intern ("coding-category-charset"));
8616 ASET (Vcoding_category_table
, coding_category_sjis
,
8617 intern ("coding-category-sjis"));
8618 ASET (Vcoding_category_table
, coding_category_big5
,
8619 intern ("coding-category-big5"));
8620 ASET (Vcoding_category_table
, coding_category_ccl
,
8621 intern ("coding-category-ccl"));
8622 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8623 intern ("coding-category-emacs-mule"));
8624 /* The following are NOT targets of code detection. */
8625 ASET (Vcoding_category_table
, coding_category_raw_text
,
8626 intern ("coding-category-raw-text"));
8627 ASET (Vcoding_category_table
, coding_category_undecided
,
8628 intern ("coding-category-undecided"));
8630 defsubr (&Scoding_system_p
);
8631 defsubr (&Sread_coding_system
);
8632 defsubr (&Sread_non_nil_coding_system
);
8633 defsubr (&Scheck_coding_system
);
8634 defsubr (&Sdetect_coding_region
);
8635 defsubr (&Sdetect_coding_string
);
8636 defsubr (&Sfind_coding_systems_region_internal
);
8637 defsubr (&Scheck_coding_systems_region
);
8638 defsubr (&Sdecode_coding_region
);
8639 defsubr (&Sencode_coding_region
);
8640 defsubr (&Sdecode_coding_string
);
8641 defsubr (&Sencode_coding_string
);
8642 defsubr (&Sdecode_sjis_char
);
8643 defsubr (&Sencode_sjis_char
);
8644 defsubr (&Sdecode_big5_char
);
8645 defsubr (&Sencode_big5_char
);
8646 defsubr (&Sset_terminal_coding_system_internal
);
8647 defsubr (&Sset_safe_terminal_coding_system_internal
);
8648 defsubr (&Sterminal_coding_system
);
8649 defsubr (&Sset_keyboard_coding_system_internal
);
8650 defsubr (&Skeyboard_coding_system
);
8651 defsubr (&Sfind_operation_coding_system
);
8652 defsubr (&Sset_coding_system_priority
);
8653 defsubr (&Sdefine_coding_system_internal
);
8654 defsubr (&Sdefine_coding_system_alias
);
8655 defsubr (&Scoding_system_base
);
8656 defsubr (&Scoding_system_plist
);
8657 defsubr (&Scoding_system_aliases
);
8658 defsubr (&Scoding_system_eol_type
);
8659 defsubr (&Scoding_system_priority_list
);
8661 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8662 doc
: /* List of coding systems.
8664 Do not alter the value of this variable manually. This variable should be
8665 updated by the functions `define-coding-system' and
8666 `define-coding-system-alias'. */);
8667 Vcoding_system_list
= Qnil
;
8669 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8670 doc
: /* Alist of coding system names.
8671 Each element is a one-element list containing a coding system name.
8672 This variable is given to `completing-read' as TABLE argument.
8674 Do not alter the value of this variable manually. This variable should be
8675 updated by the functions `define-coding-system' and
8676 `define-coding-system-alias'. */);
8677 Vcoding_system_alist
= Qnil
;
8679 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8680 doc
: /* List of coding-categories (symbols) ordered by priority.
8682 On detecting a coding system, Emacs tries code detection algorithms
8683 associated with each coding-category one by one in this order. When
8684 one algorithm agrees with a byte sequence of source text, the coding
8685 system bound to the corresponding coding-category is selected. */);
8689 Vcoding_category_list
= Qnil
;
8690 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8691 Vcoding_category_list
8692 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8693 Vcoding_category_list
);
8696 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8697 doc
: /* Specify the coding system for read operations.
8698 It is useful to bind this variable with `let', but do not set it globally.
8699 If the value is a coding system, it is used for decoding on read operation.
8700 If not, an appropriate element is used from one of the coding system alists:
8701 There are three such tables, `file-coding-system-alist',
8702 `process-coding-system-alist', and `network-coding-system-alist'. */);
8703 Vcoding_system_for_read
= Qnil
;
8705 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8706 doc
: /* Specify the coding system for write operations.
8707 Programs bind this variable with `let', but you should not set it globally.
8708 If the value is a coding system, it is used for encoding of output,
8709 when writing it to a file and when sending it to a file or subprocess.
8711 If this does not specify a coding system, an appropriate element
8712 is used from one of the coding system alists:
8713 There are three such tables, `file-coding-system-alist',
8714 `process-coding-system-alist', and `network-coding-system-alist'.
8715 For output to files, if the above procedure does not specify a coding system,
8716 the value of `buffer-file-coding-system' is used. */);
8717 Vcoding_system_for_write
= Qnil
;
8719 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8721 Coding system used in the latest file or process I/O. */);
8722 Vlast_coding_system_used
= Qnil
;
8724 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8726 *Non-nil means always inhibit code conversion of end-of-line format.
8727 See info node `Coding Systems' and info node `Text and Binary' concerning
8728 such conversion. */);
8729 inhibit_eol_conversion
= 0;
8731 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8733 Non-nil means process buffer inherits coding system of process output.
8734 Bind it to t if the process output is to be treated as if it were a file
8735 read from some filesystem. */);
8736 inherit_process_coding_system
= 0;
8738 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8740 Alist to decide a coding system to use for a file I/O operation.
8741 The format is ((PATTERN . VAL) ...),
8742 where PATTERN is a regular expression matching a file name,
8743 VAL is a coding system, a cons of coding systems, or a function symbol.
8744 If VAL is a coding system, it is used for both decoding and encoding
8746 If VAL is a cons of coding systems, the car part is used for decoding,
8747 and the cdr part is used for encoding.
8748 If VAL is a function symbol, the function must return a coding system
8749 or a cons of coding systems which are used as above. The function gets
8750 the arguments with which `find-operation-coding-system' was called.
8752 See also the function `find-operation-coding-system'
8753 and the variable `auto-coding-alist'. */);
8754 Vfile_coding_system_alist
= Qnil
;
8756 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8758 Alist to decide a coding system to use for a process I/O operation.
8759 The format is ((PATTERN . VAL) ...),
8760 where PATTERN is a regular expression matching a program name,
8761 VAL is a coding system, a cons of coding systems, or a function symbol.
8762 If VAL is a coding system, it is used for both decoding what is received
8763 from the program and encoding what is sent to the program.
8764 If VAL is a cons of coding systems, the car part is used for decoding,
8765 and the cdr part is used for encoding.
8766 If VAL is a function symbol, the function must return a coding system
8767 or a cons of coding systems which are used as above.
8769 See also the function `find-operation-coding-system'. */);
8770 Vprocess_coding_system_alist
= Qnil
;
8772 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8774 Alist to decide a coding system to use for a network I/O operation.
8775 The format is ((PATTERN . VAL) ...),
8776 where PATTERN is a regular expression matching a network service name
8777 or is a port number to connect to,
8778 VAL is a coding system, a cons of coding systems, or a function symbol.
8779 If VAL is a coding system, it is used for both decoding what is received
8780 from the network stream and encoding what is sent to the network stream.
8781 If VAL is a cons of coding systems, the car part is used for decoding,
8782 and the cdr part is used for encoding.
8783 If VAL is a function symbol, the function must return a coding system
8784 or a cons of coding systems which are used as above.
8786 See also the function `find-operation-coding-system'. */);
8787 Vnetwork_coding_system_alist
= Qnil
;
8789 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8790 doc
: /* Coding system to use with system messages.
8791 Also used for decoding keyboard input on X Window system. */);
8792 Vlocale_coding_system
= Qnil
;
8794 /* The eol mnemonics are reset in startup.el system-dependently. */
8795 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8797 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8798 eol_mnemonic_unix
= build_string (":");
8800 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8802 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8803 eol_mnemonic_dos
= build_string ("\\");
8805 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8807 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8808 eol_mnemonic_mac
= build_string ("/");
8810 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8812 *String displayed in mode line when end-of-line format is not yet determined. */);
8813 eol_mnemonic_undecided
= build_string (":");
8815 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8817 *Non-nil enables character translation while encoding and decoding. */);
8818 Venable_character_translation
= Qt
;
8820 DEFVAR_LISP ("standard-translation-table-for-decode",
8821 &Vstandard_translation_table_for_decode
,
8822 doc
: /* Table for translating characters while decoding. */);
8823 Vstandard_translation_table_for_decode
= Qnil
;
8825 DEFVAR_LISP ("standard-translation-table-for-encode",
8826 &Vstandard_translation_table_for_encode
,
8827 doc
: /* Table for translating characters while encoding. */);
8828 Vstandard_translation_table_for_encode
= Qnil
;
8830 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8831 doc
: /* Alist of charsets vs revision numbers.
8832 While encoding, if a charset (car part of an element) is found,
8833 designate it with the escape sequence identifying revision (cdr part
8834 of the element). */);
8835 Vcharset_revision_table
= Qnil
;
8837 DEFVAR_LISP ("default-process-coding-system",
8838 &Vdefault_process_coding_system
,
8839 doc
: /* Cons of coding systems used for process I/O by default.
8840 The car part is used for decoding a process output,
8841 the cdr part is used for encoding a text to be sent to a process. */);
8842 Vdefault_process_coding_system
= Qnil
;
8844 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8846 Table of extra Latin codes in the range 128..159 (inclusive).
8847 This is a vector of length 256.
8848 If Nth element is non-nil, the existence of code N in a file
8849 \(or output of subprocess) doesn't prevent it from being detected as
8850 a coding system of ISO 2022 variant which has a flag
8851 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8852 or reading output of a subprocess.
8853 Only 128th through 159th elements have a meaning. */);
8854 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8856 DEFVAR_LISP ("select-safe-coding-system-function",
8857 &Vselect_safe_coding_system_function
,
8859 Function to call to select safe coding system for encoding a text.
8861 If set, this function is called to force a user to select a proper
8862 coding system which can encode the text in the case that a default
8863 coding system used in each operation can't encode the text.
8865 The default value is `select-safe-coding-system' (which see). */);
8866 Vselect_safe_coding_system_function
= Qnil
;
8868 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8869 &inhibit_iso_escape_detection
,
8871 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8873 By default, on reading a file, Emacs tries to detect how the text is
8874 encoded. This code detection is sensitive to escape sequences. If
8875 the sequence is valid as ISO2022, the code is determined as one of
8876 the ISO2022 encodings, and the file is decoded by the corresponding
8877 coding system (e.g. `iso-2022-7bit').
8879 However, there may be a case that you want to read escape sequences in
8880 a file as is. In such a case, you can set this variable to non-nil.
8881 Then, as the code detection ignores any escape sequences, no file is
8882 detected as encoded in some ISO2022 encoding. The result is that all
8883 escape sequences become visible in a buffer.
8885 The default value is nil, and it is strongly recommended not to change
8886 it. That is because many Emacs Lisp source files that contain
8887 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8888 in Emacs's distribution, and they won't be decoded correctly on
8889 reading if you suppress escape sequence detection.
8891 The other way to read escape sequences in a file without decoding is
8892 to explicitly specify some coding system that doesn't use ISO2022's
8893 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8894 inhibit_iso_escape_detection
= 0;
8897 Lisp_Object args
[coding_arg_max
];
8898 Lisp_Object plist
[14];
8901 for (i
= 0; i
< coding_arg_max
; i
++)
8904 plist
[0] = intern (":name");
8905 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8906 plist
[2] = intern (":mnemonic");
8907 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8908 plist
[4] = intern (":coding-type");
8909 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8910 plist
[6] = intern (":ascii-compatible-p");
8911 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8912 plist
[8] = intern (":default-char");
8913 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8914 plist
[10] = intern (":docstring");
8915 plist
[11] = build_string ("Do no conversion.\n\
8917 When you visit a file with this coding, the file is read into a\n\
8918 unibyte buffer as is, thus each byte of a file is treated as a\n\
8920 plist
[12] = intern (":eol-type");
8921 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8922 args
[coding_arg_plist
] = Flist (14, plist
);
8923 Fdefine_coding_system_internal (coding_arg_max
, args
);
8926 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8927 setup_coding_system (Qno_conversion
, &terminal_coding
);
8928 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8932 emacs_strerror (error_number
)
8937 synchronize_system_messages_locale ();
8938 str
= strerror (error_number
);
8940 if (! NILP (Vlocale_coding_system
))
8942 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8943 Vlocale_coding_system
,
8945 str
= (char *) XSTRING (dec
)->data
;