1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
222 if (charbuf
>= charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce an encoded text until it
258 reaches the head of not-yet-encoded source text.
260 Below is a template of these functions. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
268 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 Lisp_Object Qinsufficient_source
, Qinconsistent_eol
, Qinvalid_source
;
322 Lisp_Object Qinterrupted
, Qinsufficient_memory
;
324 int coding_system_require_warning
;
326 Lisp_Object Vselect_safe_coding_system_function
;
328 /* Mnemonic string for each format of end-of-line. */
329 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
330 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
332 Lisp_Object eol_mnemonic_undecided
;
336 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
338 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
340 /* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342 Lisp_Object Qemacs_mule
, Qraw_text
;
343 Lisp_Object Qutf_8_emacs
;
345 /* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
347 /* Coding-system for reading files and receiving data from process. */
348 Lisp_Object Vcoding_system_for_read
;
349 /* Coding-system for writing files and sending data to process. */
350 Lisp_Object Vcoding_system_for_write
;
351 /* Coding-system actually used in the latest I/O. */
352 Lisp_Object Vlast_coding_system_used
;
353 /* Set to non-nil when an error is detected during code conversion. */
354 Lisp_Object Vlast_code_conversion_error
;
355 /* A vector of length 256 which contains information about special
356 Latin codes (especially for dealing with Microsoft codes). */
357 Lisp_Object Vlatin_extra_code_table
;
359 /* Flag to inhibit code conversion of end-of-line format. */
360 int inhibit_eol_conversion
;
362 /* Flag to inhibit ISO2022 escape sequence detection. */
363 int inhibit_iso_escape_detection
;
365 /* Flag to make buffer-file-coding-system inherit from process-coding. */
366 int inherit_process_coding_system
;
368 /* Coding system to be used to encode text for terminal display. */
369 struct coding_system terminal_coding
;
371 /* Coding system to be used to encode text for terminal display when
372 terminal coding system is nil. */
373 struct coding_system safe_terminal_coding
;
375 /* Coding system of what is sent from terminal keyboard. */
376 struct coding_system keyboard_coding
;
378 Lisp_Object Vfile_coding_system_alist
;
379 Lisp_Object Vprocess_coding_system_alist
;
380 Lisp_Object Vnetwork_coding_system_alist
;
382 Lisp_Object Vlocale_coding_system
;
386 /* Flag to tell if we look up translation table on character code
388 Lisp_Object Venable_character_translation
;
389 /* Standard translation table to look up on decoding (reading). */
390 Lisp_Object Vstandard_translation_table_for_decode
;
391 /* Standard translation table to look up on encoding (writing). */
392 Lisp_Object Vstandard_translation_table_for_encode
;
394 Lisp_Object Qtranslation_table
;
395 Lisp_Object Qtranslation_table_id
;
396 Lisp_Object Qtranslation_table_for_decode
;
397 Lisp_Object Qtranslation_table_for_encode
;
399 /* Alist of charsets vs revision number. */
400 static Lisp_Object Vcharset_revision_table
;
402 /* Default coding systems used for process I/O. */
403 Lisp_Object Vdefault_process_coding_system
;
405 /* Char table for translating Quail and self-inserting input. */
406 Lisp_Object Vtranslation_table_for_input
;
408 /* Two special coding systems. */
409 Lisp_Object Vsjis_coding_system
;
410 Lisp_Object Vbig5_coding_system
;
413 static void record_conversion_result (struct coding_system
*coding
,
414 enum coding_result_code result
);
415 static int detect_coding_utf_8
P_ ((struct coding_system
*,
416 struct coding_detection_info
*info
));
417 static void decode_coding_utf_8
P_ ((struct coding_system
*));
418 static int encode_coding_utf_8
P_ ((struct coding_system
*));
420 static int detect_coding_utf_16
P_ ((struct coding_system
*,
421 struct coding_detection_info
*info
));
422 static void decode_coding_utf_16
P_ ((struct coding_system
*));
423 static int encode_coding_utf_16
P_ ((struct coding_system
*));
425 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
426 struct coding_detection_info
*info
));
427 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
428 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
430 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
431 struct coding_detection_info
*info
));
432 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
433 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
435 static int detect_coding_sjis
P_ ((struct coding_system
*,
436 struct coding_detection_info
*info
));
437 static void decode_coding_sjis
P_ ((struct coding_system
*));
438 static int encode_coding_sjis
P_ ((struct coding_system
*));
440 static int detect_coding_big5
P_ ((struct coding_system
*,
441 struct coding_detection_info
*info
));
442 static void decode_coding_big5
P_ ((struct coding_system
*));
443 static int encode_coding_big5
P_ ((struct coding_system
*));
445 static int detect_coding_ccl
P_ ((struct coding_system
*,
446 struct coding_detection_info
*info
));
447 static void decode_coding_ccl
P_ ((struct coding_system
*));
448 static int encode_coding_ccl
P_ ((struct coding_system
*));
450 static void decode_coding_raw_text
P_ ((struct coding_system
*));
451 static int encode_coding_raw_text
P_ ((struct coding_system
*));
454 /* ISO2022 section */
456 #define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
462 #define CODING_ISO_REQUEST(coding, charset_id) \
463 ((charset_id <= (coding)->max_charset_id \
464 ? (coding)->safe_charsets[charset_id] \
468 #define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470 #define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472 #define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474 #define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476 #define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481 /* Control characters of ISO2022. */
482 /* code */ /* function */
483 #define ISO_CODE_LF 0x0A /* line-feed */
484 #define ISO_CODE_CR 0x0D /* carriage-return */
485 #define ISO_CODE_SO 0x0E /* shift-out */
486 #define ISO_CODE_SI 0x0F /* shift-in */
487 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488 #define ISO_CODE_ESC 0x1B /* escape */
489 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
490 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
491 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493 /* All code (1-byte) of ISO2022 is classified into one of the following types. */
495 enum iso_code_class_type
497 ISO_control_0
, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 4 codes. */
500 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
501 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
502 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
503 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
504 ISO_control_1
, /* Control codes in the range
505 0x80..0x9F, except for the
506 following 3 codes. */
507 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
508 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
509 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
510 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
511 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
512 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
513 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
516 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
517 `iso-flags' attribute of an iso2022 coding system. */
519 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
520 instead of the correct short-form sequence (e.g. ESC $ A). */
521 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523 /* If set, reset graphic planes and registers at end-of-line to the initial state. */
525 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527 /* If set, reset graphic planes and registers before any control
528 characters to the initial state. */
529 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531 /* If set, encode by 7-bit environment. */
532 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534 /* If set, use locking-shift function. */
535 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537 /* If set, use single-shift function. Overwrite
538 CODING_ISO_FLAG_LOCKING_SHIFT. */
539 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541 /* If set, use designation escape sequence. */
542 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544 /* If set, produce revision number sequence. */
545 #define CODING_ISO_FLAG_REVISION 0x0080
547 /* If set, produce ISO6429's direction specifying sequence. */
548 #define CODING_ISO_FLAG_DIRECTION 0x0100
550 /* If set, assume designation states are reset at beginning of line on
552 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554 /* If set, designation sequence should be placed at beginning of line
556 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558 /* If set, do not encode unsafe characters on output. */
559 #define CODING_ISO_FLAG_SAFE 0x0800
561 /* If set, extra latin codes (128..159) are accepted as a valid code
563 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575 /* A character to be produced on output if encoding of the original
576 character is prohibited by CODING_ISO_FLAG_SAFE. */
577 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
581 #define CODING_UTF_16_BOM(coding) \
582 ((coding)->spec.utf_16.bom)
584 #define CODING_UTF_16_ENDIAN(coding) \
585 ((coding)->spec.utf_16.endian)
587 #define CODING_UTF_16_SURROGATE(coding) \
588 ((coding)->spec.utf_16.surrogate)
592 #define CODING_CCL_DECODER(coding) \
593 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
594 #define CODING_CCL_ENCODER(coding) \
595 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
596 #define CODING_CCL_VALIDS(coding) \
597 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599 /* Index for each coding category in `coding_categories' */
603 coding_category_iso_7
,
604 coding_category_iso_7_tight
,
605 coding_category_iso_8_1
,
606 coding_category_iso_8_2
,
607 coding_category_iso_7_else
,
608 coding_category_iso_8_else
,
609 coding_category_utf_8
,
610 coding_category_utf_16_auto
,
611 coding_category_utf_16_be
,
612 coding_category_utf_16_le
,
613 coding_category_utf_16_be_nosig
,
614 coding_category_utf_16_le_nosig
,
615 coding_category_charset
,
616 coding_category_sjis
,
617 coding_category_big5
,
619 coding_category_emacs_mule
,
620 /* All above are targets of code detection. */
621 coding_category_raw_text
,
622 coding_category_undecided
,
626 /* Definitions of flag bits used in detect_coding_XXXX. */
627 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
628 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
629 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
630 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
631 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
632 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
633 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
634 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
635 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
636 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
637 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
638 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
639 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
640 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
641 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
642 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
643 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
644 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646 /* This value is returned if detect_coding_mask () finds nothing other
647 than ASCII characters. */
648 #define CATEGORY_MASK_ANY \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_8_1 \
652 | CATEGORY_MASK_ISO_8_2 \
653 | CATEGORY_MASK_ISO_7_ELSE \
654 | CATEGORY_MASK_ISO_8_ELSE \
655 | CATEGORY_MASK_UTF_8 \
656 | CATEGORY_MASK_UTF_16_BE \
657 | CATEGORY_MASK_UTF_16_LE \
658 | CATEGORY_MASK_UTF_16_BE_NOSIG \
659 | CATEGORY_MASK_UTF_16_LE_NOSIG \
660 | CATEGORY_MASK_CHARSET \
661 | CATEGORY_MASK_SJIS \
662 | CATEGORY_MASK_BIG5 \
663 | CATEGORY_MASK_CCL \
664 | CATEGORY_MASK_EMACS_MULE)
667 #define CATEGORY_MASK_ISO_7BIT \
668 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670 #define CATEGORY_MASK_ISO_8BIT \
671 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673 #define CATEGORY_MASK_ISO_ELSE \
674 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676 #define CATEGORY_MASK_ISO_ESCAPE \
677 (CATEGORY_MASK_ISO_7 \
678 | CATEGORY_MASK_ISO_7_TIGHT \
679 | CATEGORY_MASK_ISO_7_ELSE \
680 | CATEGORY_MASK_ISO_8_ELSE)
682 #define CATEGORY_MASK_ISO \
683 ( CATEGORY_MASK_ISO_7BIT \
684 | CATEGORY_MASK_ISO_8BIT \
685 | CATEGORY_MASK_ISO_ELSE)
687 #define CATEGORY_MASK_UTF_16 \
688 (CATEGORY_MASK_UTF_16_BE \
689 | CATEGORY_MASK_UTF_16_LE \
690 | CATEGORY_MASK_UTF_16_BE_NOSIG \
691 | CATEGORY_MASK_UTF_16_LE_NOSIG)
694 /* List of symbols `coding-category-xxx' ordered by priority. This
695 variable is exposed to Emacs Lisp. */
696 static Lisp_Object Vcoding_category_list
;
698 /* Table of coding categories (Lisp symbols). This variable is for
700 static Lisp_Object Vcoding_category_table
;
702 /* Table of coding-categories ordered by priority. */
703 static enum coding_category coding_priorities
[coding_category_max
];
705 /* Nth element is a coding context for the coding system bound to the
706 Nth coding category. */
707 static struct coding_system coding_categories
[coding_category_max
];
709 /*** Commonly used macros and functions ***/
/* Smaller / larger of A and B.  Each argument may be evaluated twice,
   so do not pass expressions with side effects.  */
712 #define min(a, b) ((a) < (b) ? (a) : (b))
715 #define max(a, b) ((a) > (b) ? (a) : (b))
718 #define CODING_GET_INFO(coding, attrs, charset_list) \
720 (attrs) = CODING_ID_ATTRS ((coding)->id); \
721 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
725 /* Safely get one byte from the source text pointed by SRC which ends
726 at SRC_END, and set C to that byte. If there are not enough bytes
727 in the source, it jumps to `no_more_source'. If multibytep is
728 nonzero, and a multibyte character is found at SRC, set C to the
729 negative value of the character code. The caller should declare
730 and set these variables appropriately in advance:
731 src, src_end, multibytep */
733 #define ONE_MORE_BYTE(c) \
735 if (src == src_end) \
737 if (src_base < src) \
738 record_conversion_result \
739 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
740 goto no_more_source; \
743 if (multibytep && (c & 0x80)) \
745 if ((c & 0xFE) == 0xC0) \
746 c = ((c & 1) << 6) | *src++; \
749 c = - string_char (--src, &src, NULL); \
750 record_conversion_result \
751 (coding, CODING_RESULT_INVALID_SRC); \
758 #define ONE_MORE_BYTE_NO_CHECK(c) \
761 if (multibytep && (c & 0x80)) \
763 if ((c & 0xFE) == 0xC0) \
764 c = ((c & 1) << 6) | *src++; \
767 c = - string_char (--src, &src, NULL); \
768 record_conversion_result \
769 (coding, CODING_RESULT_INVALID_SRC); \
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance. */
783 #define EMIT_ONE_ASCII_BYTE(c) \
790 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte form. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately in advance. */
805 #define EMIT_ONE_BYTE(c) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822 #define EMIT_TWO_BYTES(c1, c2) \
824 produced_chars += 2; \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
/* Store RESULT in CODING->result.  For each of the error result codes
   handled below, also store the corresponding error symbol in
   Vlast_code_conversion_error so that Lisp code can see why the
   conversion stopped.  */
861 record_conversion_result (struct coding_system
*coding
,
862 enum coding_result_code result
)
864 coding
->result
= result
;
867 case CODING_RESULT_INSUFFICIENT_SRC
:
868 Vlast_code_conversion_error
= Qinsufficient_source
;
870 case CODING_RESULT_INCONSISTENT_EOL
:
871 Vlast_code_conversion_error
= Qinconsistent_eol
;
873 case CODING_RESULT_INVALID_SRC
:
874 Vlast_code_conversion_error
= Qinvalid_source
;
876 case CODING_RESULT_INTERRUPT
:
877 Vlast_code_conversion_error
= Qinterrupted
;
879 case CODING_RESULT_INSUFFICIENT_MEM
:
880 Vlast_code_conversion_error
= Qinsufficient_memory
;
885 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
887 charset_map_loaded = 0; \
888 c = DECODE_CHAR (charset, code); \
889 if (charset_map_loaded) \
891 const unsigned char *orig = coding->source; \
894 coding_set_source (coding); \
895 offset = coding->source - orig; \
897 src_base += offset; \
903 #define ASSURE_DESTINATION(bytes) \
905 if (dst + (bytes) >= dst_end) \
907 int more_bytes = charbuf_end - charbuf + (bytes); \
909 dst = alloc_destination (coding, more_bytes, dst); \
910 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte.

   Buffer source: when CODING->src_pos is negative the address is
   taken relative to the end of the buffer gap (BUF_GAP_END_ADDR);
   BUF_BYTE_ADDRESS of CODING->src_pos_byte is used as well
   (presumably for the non-negative case).
   String source: the address is SDATA of the string plus
   CODING->src_pos_byte.
   Anything else: CODING->source is left untouched.  */
917 coding_set_source (coding
)
918 struct coding_system
*coding
;
920 if (BUFFERP (coding
->src_object
))
922 struct buffer
*buf
= XBUFFER (coding
->src_object
);
924 if (coding
->src_pos
< 0)
925 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
927 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
929 else if (STRINGP (coding
->src_object
))
931 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
934 /* Otherwise, the source is C string and is never relocated
935 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when the
   destination (CODING->dst_object) is a buffer.

   Two computations appear below.  The first uses BEG_ADDR and
   GAP_END_ADDR and subtracts the not-yet-consumed source bytes
   (CODING->src_bytes - CODING->consumed) from the available room.
   The second uses BUF_BEG_ADDR and BUF_GAP_END_ADDR of the
   destination buffer.  In both, CODING->destination is the buffer
   start plus CODING->dst_pos_byte - 1.  */
940 coding_set_destination (coding
)
941 struct coding_system
*coding
;
943 if (BUFFERP (coding
->dst_object
))
/* NOTE(review): this tests CODING->src_pos, not a destination field;
   confirm that is intended.  */
945 if (coding
->src_pos
< 0)
947 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
948 coding
->dst_bytes
= (GAP_END_ADDR
949 - (coding
->src_bytes
- coding
->consumed
)
950 - coding
->destination
);
954 /* We are sure that coding->dst_pos_byte is before the gap
956 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
957 + coding
->dst_pos_byte
- 1);
958 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
959 - coding
->destination
);
963 /* Otherwise, the destination is C string and is never relocated
964 automatically. Thus we don't have to update anything. */
/* Enlarge the destination area of CODING by BYTES bytes: pass
   CODING->destination to xrealloc with the new size
   CODING->dst_bytes + BYTES, store the returned pointer back, and add
   BYTES to CODING->dst_bytes.  */
970 coding_alloc_by_realloc (coding
, bytes
)
971 struct coding_system
*coding
;
974 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
975 coding
->dst_bytes
+ bytes
);
976 coding
->dst_bytes
+= bytes
;
/* Buffer-destination counterpart of coding_alloc_by_realloc.

   When the destination is a buffer that is also the source object,
   ADD (the unconsumed source bytes, CODING->src_bytes -
   CODING->consumed) is temporarily moved out of GAP_SIZE and into
   ZV, Z, ZV_BYTE and Z_BYTE, and then moved back again.

   The other path remembers the current buffer, switches to the
   destination buffer with set_buffer_internal, and afterwards
   switches back.

   NOTE(review): presumably the gap is enlarged by BYTES between each
   pair of adjustments; confirm against the full definition.  */
980 coding_alloc_by_making_gap (coding
, bytes
)
981 struct coding_system
*coding
;
984 if (BUFFERP (coding
->dst_object
)
985 && EQ (coding
->src_object
, coding
->dst_object
))
987 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
989 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
991 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
995 Lisp_Object this_buffer
;
997 this_buffer
= Fcurrent_buffer ();
998 set_buffer_internal (XBUFFER (coding
->dst_object
));
1000 set_buffer_internal (XBUFFER (this_buffer
));
/* Enlarge the destination of CODING by NBYTES while keeping DST valid.

   DST is first converted to an offset from CODING->destination.  The
   destination is then enlarged: coding_alloc_by_making_gap is used
   when the destination object is a buffer, and
   coding_alloc_by_realloc is called as well (presumably for the
   non-buffer case).  The conversion result is reset to
   CODING_RESULT_SUCCESS, coding_set_destination refreshes the
   destination fields, and DST is recomputed as CODING->destination
   plus the saved offset, because the area may have moved.

   NOTE(review): the recomputed DST is presumably the return value;
   confirm against the full definition.  */
1005 static unsigned char *
1006 alloc_destination (coding
, nbytes
, dst
)
1007 struct coding_system
*coding
;
1011 EMACS_INT offset
= dst
- coding
->destination
;
1013 if (BUFFERP (coding
->dst_object
))
1014 coding_alloc_by_making_gap (coding
, nbytes
);
1016 coding_alloc_by_realloc (coding
, nbytes
);
1017 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1018 coding_set_destination (coding
);
1019 dst
= coding
->destination
+ offset
;
1023 /** Macros for annotations. */
1025 /* Maximum length of annotation data (sum of annotations for
1026 composition and charset). */
1027 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029 /* An annotation data is stored in the array coding->charbuf in this
1031 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1032 LENGTH is the number of elements in the annotation.
1033 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1034 FROM and TO specify the range of text annotated. They are relative
1035 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037 The format of the following elements depend on ANNOTATION_MASK.
1039 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1042 METHOD is one of enum composition_method.
1043 Optional COMPOSITION-COMPONENTS are characters and composition
1046 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1049 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1051 *(buf)++ = -(len); \
1052 *(buf)++ = (mask); \
1053 *(buf)++ = (from); \
1055 coding->annotated = 1; \
1058 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1060 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1065 #define ADD_CHARSET_DATA(buf, from, to, id) \
1067 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1072 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1079 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1080 Check if a text is encoded in UTF-8. If it is, return 1, else
1083 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1084 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1085 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1086 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1087 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1088 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1091 detect_coding_utf_8 (coding
, detect_info
)
1092 struct coding_system
*coding
;
1093 struct coding_detection_info
*detect_info
;
1095 const unsigned char *src
= coding
->source
, *src_base
;
1096 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1097 int multibytep
= coding
->src_multibyte
;
1098 int consumed_chars
= 0;
1101 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1102 /* A coding system of this category is always ASCII compatible. */
1103 src
+= coding
->head_ascii
;
1107 int c
, c1
, c2
, c3
, c4
;
1111 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1114 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1116 if (UTF_8_2_OCTET_LEADING_P (c
))
1118 found
= CATEGORY_MASK_UTF_8
;
1122 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1124 if (UTF_8_3_OCTET_LEADING_P (c
))
1126 found
= CATEGORY_MASK_UTF_8
;
1130 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1132 if (UTF_8_4_OCTET_LEADING_P (c
))
1134 found
= CATEGORY_MASK_UTF_8
;
1138 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1140 if (UTF_8_5_OCTET_LEADING_P (c
))
1142 found
= CATEGORY_MASK_UTF_8
;
1147 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1151 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1153 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1156 detect_info
->found
|= found
;
1162 decode_coding_utf_8 (coding
)
1163 struct coding_system
*coding
;
1165 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1166 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1167 const unsigned char *src_base
;
1168 int *charbuf
= coding
->charbuf
;
1169 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1170 int consumed_chars
= 0, consumed_chars_base
;
1171 int multibytep
= coding
->src_multibyte
;
1172 Lisp_Object attr
, charset_list
;
1174 CODING_GET_INFO (coding
, attr
, charset_list
);
1178 int c
, c1
, c2
, c3
, c4
, c5
;
1181 consumed_chars_base
= consumed_chars
;
1183 if (charbuf
>= charbuf_end
)
1191 else if (UTF_8_1_OCTET_P(c1
))
1198 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1200 if (UTF_8_2_OCTET_LEADING_P (c1
))
1202 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1203 /* Reject overlong sequences here and below. Encoders
1204 producing them are incorrect, they can be misleading,
1205 and they mess up read/write invariance. */
1212 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1214 if (UTF_8_3_OCTET_LEADING_P (c1
))
1216 c
= (((c1
& 0xF) << 12)
1217 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1219 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1225 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1227 if (UTF_8_4_OCTET_LEADING_P (c1
))
1229 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1230 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1237 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1239 if (UTF_8_5_OCTET_LEADING_P (c1
))
1241 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1242 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1244 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1259 consumed_chars
= consumed_chars_base
;
1261 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1266 coding
->consumed_char
+= consumed_chars_base
;
1267 coding
->consumed
= src_base
- coding
->source
;
1268 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1273 encode_coding_utf_8 (coding
)
1274 struct coding_system
*coding
;
1276 int multibytep
= coding
->dst_multibyte
;
1277 int *charbuf
= coding
->charbuf
;
1278 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1279 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1280 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1281 int produced_chars
= 0;
1286 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1288 while (charbuf
< charbuf_end
)
1290 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1292 ASSURE_DESTINATION (safe_room
);
1294 if (CHAR_BYTE8_P (c
))
1296 c
= CHAR_TO_BYTE8 (c
);
1301 CHAR_STRING_ADVANCE (c
, pend
);
1302 for (p
= str
; p
< pend
; p
++)
1309 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1311 while (charbuf
< charbuf_end
)
1313 ASSURE_DESTINATION (safe_room
);
1315 dst
+= CHAR_STRING (c
, dst
);
1319 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1320 coding
->produced_char
+= produced_chars
;
1321 coding
->produced
= dst
- coding
->destination
;
1326 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1327 Check if a text is encoded in one of UTF-16 based coding systems.
1328 If it is, return 1, else return 0. */
1330 #define UTF_16_HIGH_SURROGATE_P(val) \
1331 (((val) & 0xFC00) == 0xD800)
1333 #define UTF_16_LOW_SURROGATE_P(val) \
1334 (((val) & 0xFC00) == 0xDC00)
1336 #define UTF_16_INVALID_P(val) \
1337 (((val) == 0xFFFE) \
1338 || ((val) == 0xFFFF) \
1339 || UTF_16_LOW_SURROGATE_P (val))
1343 detect_coding_utf_16 (coding
, detect_info
)
1344 struct coding_system
*coding
;
1345 struct coding_detection_info
*detect_info
;
1347 const unsigned char *src
= coding
->source
, *src_base
= src
;
1348 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1349 int multibytep
= coding
->src_multibyte
;
1350 int consumed_chars
= 0;
1353 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1354 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1355 && (coding
->src_chars
& 1))
1357 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1363 if ((c1
== 0xFF) && (c2
== 0xFE))
1365 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1366 | CATEGORY_MASK_UTF_16_AUTO
);
1367 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1368 | CATEGORY_MASK_UTF_16_BE_NOSIG
1369 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1371 else if ((c1
== 0xFE) && (c2
== 0xFF))
1373 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1374 | CATEGORY_MASK_UTF_16_AUTO
);
1375 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1376 | CATEGORY_MASK_UTF_16_BE_NOSIG
1377 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1379 else if (c1
>= 0 && c2
>= 0)
1381 unsigned char b1
[256], b2
[256];
1382 int b1_variants
= 1, b2_variants
= 1;
1385 bzero (b1
, 256), bzero (b2
, 256);
1387 for (n
= 0; n
< 256 && src
< src_end
; n
++)
1392 if (c1
< 0 || c2
< 0)
1394 if (! b1
[c1
++]) b1_variants
++;
1395 if (! b2
[c2
++]) b2_variants
++;
1397 if (b1_variants
< b2_variants
)
1398 detect_info
->found
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1400 detect_info
->found
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1401 detect_info
->rejected
1402 |= (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_LE
);
1409 decode_coding_utf_16 (coding
)
1410 struct coding_system
*coding
;
1412 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1413 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1414 const unsigned char *src_base
;
1415 int *charbuf
= coding
->charbuf
;
1416 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1417 int consumed_chars
= 0, consumed_chars_base
;
1418 int multibytep
= coding
->src_multibyte
;
1419 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1420 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1421 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1422 Lisp_Object attr
, charset_list
;
1424 CODING_GET_INFO (coding
, attr
, charset_list
);
1426 if (bom
== utf_16_with_bom
)
1435 if (endian
== utf_16_big_endian
1436 ? c
!= 0xFEFF : c
!= 0xFFFE)
1438 /* The first two bytes are not BOM. Treat them as bytes
1439 for a normal character. */
1443 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1445 else if (bom
== utf_16_detect_bom
)
1447 /* We have already tried to detect BOM and failed in
1449 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1457 consumed_chars_base
= consumed_chars
;
1459 if (charbuf
+ 2 >= charbuf_end
)
1471 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1475 c
= (endian
== utf_16_big_endian
1476 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1479 if (! UTF_16_LOW_SURROGATE_P (c
))
1481 if (endian
== utf_16_big_endian
)
1482 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1484 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1488 if (UTF_16_HIGH_SURROGATE_P (c
))
1489 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1495 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1496 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1502 if (UTF_16_HIGH_SURROGATE_P (c
))
1503 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1510 coding
->consumed_char
+= consumed_chars_base
;
1511 coding
->consumed
= src_base
- coding
->source
;
1512 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1516 encode_coding_utf_16 (coding
)
1517 struct coding_system
*coding
;
1519 int multibytep
= coding
->dst_multibyte
;
1520 int *charbuf
= coding
->charbuf
;
1521 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1522 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1523 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1525 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1526 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1527 int produced_chars
= 0;
1528 Lisp_Object attrs
, charset_list
;
1531 CODING_GET_INFO (coding
, attrs
, charset_list
);
1533 if (bom
!= utf_16_without_bom
)
1535 ASSURE_DESTINATION (safe_room
);
1537 EMIT_TWO_BYTES (0xFE, 0xFF);
1539 EMIT_TWO_BYTES (0xFF, 0xFE);
1540 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1543 while (charbuf
< charbuf_end
)
1545 ASSURE_DESTINATION (safe_room
);
1547 if (c
>= MAX_UNICODE_CHAR
)
1548 c
= coding
->default_char
;
1553 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1555 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1562 c1
= (c
>> 10) + 0xD800;
1563 c2
= (c
& 0x3FF) + 0xDC00;
1565 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1567 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1570 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1571 coding
->produced
= dst
- coding
->destination
;
1572 coding
->produced_char
+= produced_chars
;
1577 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579 /* Emacs' internal format for representation of multiple character
1580 sets is a kind of multi-byte encoding, i.e. characters are
1581 represented by variable-length sequences of one-byte codes.
1583 ASCII characters and control characters (e.g. `tab', `newline') are
1584 represented by one-byte sequences which are their ASCII codes, in
1585 the range 0x00 through 0x7F.
1587 8-bit characters of the range 0x80..0x9F are represented by
1588 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1591 8-bit characters of the range 0xA0..0xFF are represented by
1592 one-byte sequences which are their 8-bit code.
1594 The other characters are represented by a sequence of `base
1595 leading-code', optional `extended leading-code', and one or two
1596 `position-code's. The length of the sequence is determined by the
1597 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1598 whereas extended leading-code and position-code take the range 0xA0
1599 through 0xFF. See `charset.h' for more details about leading-code
1602 --- CODE RANGE of Emacs' internal format ---
1606 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1607 eight-bit-graphic 0xA0..0xFF
1608 ELSE 0x81..0x9D + [0xA0..0xFF]+
1609 ---------------------------------------------
1611 As this is the internal character representation, the format is
1612 usually not used externally (i.e. in a file or in data sent to a
1613 process). But, it is possible to have a text externally in this
1614 format (i.e. by encoding by the coding system `emacs-mule').
1616 In that case, a sequence of one-byte codes has a slightly different
1619 At first, all characters in eight-bit-control are represented by
1620 one-byte sequences which are their 8-bit code.
1622 Next, character composition data are represented by the byte
1623 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625 METHOD is 0xF2 plus one of composition method (enum
1626 composition_method),
1628 BYTES is 0xA0 plus a byte length of this composition data,
1630 CHARS is 0xA0 plus a number of characters composed by this
1633 COMPONENTs are characters of multibyte form or composition
1634 rules encoded by two bytes of ASCII codes.
1636 In addition, for backward compatibility, the following formats are
1637 also recognized as composition data on decoding.
1640 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1643 MSEQ is a multibyte form but in these special formats:
1644 ASCII: 0xA0 ASCII_CODE+0x80,
1645 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1646 RULE is a one byte code of the range 0xA0..0xF0 that
1647 represents a composition rule.
1650 char emacs_mule_bytes
[256];
1653 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1654 struct coding_system
*coding
;
1655 const unsigned char *src
;
1656 int *nbytes
, *nchars
, *id
;
1658 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1659 const unsigned char *src_base
= src
;
1660 int multibytep
= coding
->src_multibyte
;
1661 struct charset
*charset
;
1664 int consumed_chars
= 0;
1670 charset
= emacs_mule_charset
[0];
1674 switch (emacs_mule_bytes
[c
])
1677 if (! (charset
= emacs_mule_charset
[c
]))
1686 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1687 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1690 if (c
< 0 || ! (charset
= emacs_mule_charset
[c
]))
1699 if (! (charset
= emacs_mule_charset
[c
]))
1704 code
= (c
& 0x7F) << 8;
1714 if (c
< 0 || ! (charset
= emacs_mule_charset
[c
]))
1719 code
= (c
& 0x7F) << 8;
1728 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1729 ? charset_ascii
: charset_eight_bit
);
1735 c
= DECODE_CHAR (charset
, code
);
1739 *nbytes
= src
- src_base
;
1740 *nchars
= consumed_chars
;
1753 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1754 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1758 detect_coding_emacs_mule (coding
, detect_info
)
1759 struct coding_system
*coding
;
1760 struct coding_detection_info
*detect_info
;
1762 const unsigned char *src
= coding
->source
, *src_base
;
1763 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1764 int multibytep
= coding
->src_multibyte
;
1765 int consumed_chars
= 0;
1769 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1770 /* A coding system of this category is always ASCII compatible. */
1771 src
+= coding
->head_ascii
;
1781 /* Perhaps the start of a composite character. We simply skip
1782 it because analyzing it is too heavy for detecting. But,
1783 at least, we check that the composite character
1784 consists of more than 4 bytes. */
1785 const unsigned char *src_base
;
1795 if (src
- src_base
<= 4)
1797 found
= CATEGORY_MASK_EMACS_MULE
;
1805 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1810 const unsigned char *src_base
= src
- 1;
1817 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1819 found
= CATEGORY_MASK_EMACS_MULE
;
1822 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1826 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1828 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1831 detect_info
->found
|= found
;
1836 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1838 /* Decode a character represented as a component of composition
1839 sequence of Emacs 20/21 style at SRC. Set C to that character and
1840 update SRC to the head of next character (or an encoded composition
1841 rule). If SRC doesn't point to a composition component, set C to -1.
1842 If SRC points to an invalid byte sequence, global exit by a return
1845 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1849 int nbytes, nchars; \
1851 if (src == src_end) \
1853 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1858 goto invalid_code; \
1862 consumed_chars += nchars; \
1867 /* Decode a composition rule represented as a component of composition
1868 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1869 and increment BUF. If SRC points to an invalid byte sequence, set C
1872 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1874 int c, gref, nref; \
1876 if (src >= src_end) \
1877 goto invalid_code; \
1878 ONE_MORE_BYTE_NO_CHECK (c); \
1880 if (c < 0 || c >= 81) \
1881 goto invalid_code; \
1883 gref = c / 9, nref = c % 9; \
1884 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1888 /* Decode a composition rule represented as a component of composition
1889 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1890 and increment BUF. If SRC points to an invalid byte sequence, set C
1893 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1897 if (src + 1>= src_end) \
1898 goto invalid_code; \
1899 ONE_MORE_BYTE_NO_CHECK (gref); \
1901 ONE_MORE_BYTE_NO_CHECK (nref); \
1903 if (gref < 0 || gref >= 81 \
1904 || nref < 0 || nref >= 81) \
1905 goto invalid_code; \
1906 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1910 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1912 /* Emacs 21 style format. The first three bytes at SRC are \
1913 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1914 the byte length of this composition information, CHARS is the \
1915 number of characters composed by this composition. */ \
1916 enum composition_method method = c - 0xF2; \
1917 int *charbuf_base = charbuf; \
1919 int consumed_chars_limit; \
1920 int nbytes, nchars; \
1922 ONE_MORE_BYTE (c); \
1924 goto invalid_code; \
1925 nbytes = c - 0xA0; \
1927 goto invalid_code; \
1928 ONE_MORE_BYTE (c); \
1930 goto invalid_code; \
1931 nchars = c - 0xA0; \
1932 from = coding->produced + char_offset; \
1933 to = from + nchars; \
1934 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1935 consumed_chars_limit = consumed_chars_base + nbytes; \
1936 if (method != COMPOSITION_RELATIVE) \
1939 while (consumed_chars < consumed_chars_limit) \
1941 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1942 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1944 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1947 if (consumed_chars < consumed_chars_limit) \
1948 goto invalid_code; \
1949 charbuf_base[0] -= i; \
1954 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1956 /* Emacs 20 style format for relative composition. */ \
1957 /* Store multibyte form of characters to be composed. */ \
1958 enum composition_method method = COMPOSITION_RELATIVE; \
1959 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1960 int *buf = components; \
1965 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1966 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1967 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1969 goto invalid_code; \
1970 from = coding->produced_char + char_offset; \
1972 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1973 for (j = 0; j < i; j++) \
1974 *charbuf++ = components[j]; \
1978 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1980 /* Emacs 20 style format for rule-base composition. */ \
1981 /* Store multibyte form of characters to be composed. */ \
1982 enum composition_method method = COMPOSITION_WITH_RULE; \
1983 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1984 int *buf = components; \
1988 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1989 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1991 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1992 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1994 if (i < 1 || (buf - components) % 2 == 0) \
1995 goto invalid_code; \
1996 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1997 goto no_more_source; \
1998 from = coding->produced_char + char_offset; \
2000 ADD_COMPOSITION_DATA (buf, from, to, method); \
2001 for (j = 0; j < i; j++) \
2002 *charbuf++ = components[j]; \
2003 for (j = 0; j < i; j += 2) \
2004 *charbuf++ = components[j]; \
2009 decode_coding_emacs_mule (coding
)
2010 struct coding_system
*coding
;
2012 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2013 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2014 const unsigned char *src_base
;
2015 int *charbuf
= coding
->charbuf
;
2016 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
2017 int consumed_chars
= 0, consumed_chars_base
;
2018 int multibytep
= coding
->src_multibyte
;
2019 Lisp_Object attrs
, charset_list
;
2020 int char_offset
= coding
->produced_char
;
2021 int last_offset
= char_offset
;
2022 int last_id
= charset_ascii
;
2024 CODING_GET_INFO (coding
, attrs
, charset_list
);
2031 consumed_chars_base
= consumed_chars
;
2033 if (charbuf
>= charbuf_end
)
2052 if (c
- 0xF2 >= COMPOSITION_RELATIVE
2053 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
2054 DECODE_EMACS_MULE_21_COMPOSITION (c
);
2056 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
2058 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
2062 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
2068 consumed_chars
= consumed_chars_base
;
2069 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2078 if (last_id
!= charset_ascii
)
2079 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2081 last_offset
= char_offset
;
2085 consumed_chars
+= nchars
;
2092 consumed_chars
= consumed_chars_base
;
2094 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2100 if (last_id
!= charset_ascii
)
2101 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2102 coding
->consumed_char
+= consumed_chars_base
;
2103 coding
->consumed
= src_base
- coding
->source
;
2104 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2108 #define EMACS_MULE_LEADING_CODES(id, codes) \
2111 codes[0] = id, codes[1] = 0; \
2112 else if (id < 0xE0) \
2113 codes[0] = 0x9A, codes[1] = id; \
2114 else if (id < 0xF0) \
2115 codes[0] = 0x9B, codes[1] = id; \
2116 else if (id < 0xF5) \
2117 codes[0] = 0x9C, codes[1] = id; \
2119 codes[0] = 0x9D, codes[1] = id; \
2124 encode_coding_emacs_mule (coding
)
2125 struct coding_system
*coding
;
2127 int multibytep
= coding
->dst_multibyte
;
2128 int *charbuf
= coding
->charbuf
;
2129 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2130 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2131 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2133 int produced_chars
= 0;
2134 Lisp_Object attrs
, charset_list
;
2136 int preferred_charset_id
= -1;
2138 CODING_GET_INFO (coding
, attrs
, charset_list
);
2139 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2141 CODING_ATTR_CHARSET_LIST (attrs
)
2142 = charset_list
= Vemacs_mule_charset_list
;
2145 while (charbuf
< charbuf_end
)
2147 ASSURE_DESTINATION (safe_room
);
2152 /* Handle an annotation. */
2155 case CODING_ANNOTATE_COMPOSITION_MASK
:
2156 /* Not yet implemented. */
2158 case CODING_ANNOTATE_CHARSET_MASK
:
2159 preferred_charset_id
= charbuf
[3];
2160 if (preferred_charset_id
>= 0
2161 && NILP (Fmemq (make_number (preferred_charset_id
),
2163 preferred_charset_id
= -1;
2172 if (ASCII_CHAR_P (c
))
2173 EMIT_ONE_ASCII_BYTE (c
);
2174 else if (CHAR_BYTE8_P (c
))
2176 c
= CHAR_TO_BYTE8 (c
);
2181 struct charset
*charset
;
2185 unsigned char leading_codes
[2];
2187 if (preferred_charset_id
>= 0)
2189 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2190 if (! CHAR_CHARSET_P (c
, charset
))
2191 charset
= char_charset (c
, charset_list
, NULL
);
2194 charset
= char_charset (c
, charset_list
, &code
);
2197 c
= coding
->default_char
;
2198 if (ASCII_CHAR_P (c
))
2200 EMIT_ONE_ASCII_BYTE (c
);
2203 charset
= char_charset (c
, charset_list
, &code
);
2205 dimension
= CHARSET_DIMENSION (charset
);
2206 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2207 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2208 EMIT_ONE_BYTE (leading_codes
[0]);
2209 if (leading_codes
[1])
2210 EMIT_ONE_BYTE (leading_codes
[1]);
2212 EMIT_ONE_BYTE (code
| 0x80);
2216 EMIT_ONE_BYTE (code
>> 8);
2217 EMIT_ONE_BYTE (code
& 0xFF);
2221 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2222 coding
->produced_char
+= produced_chars
;
2223 coding
->produced
= dst
- coding
->destination
;
2228 /*** 7. ISO2022 handlers ***/
2230 /* The following note describes the coding system ISO2022 briefly.
2231 Since the intention of this note is to help understand the
2232 functions in this file, some parts are NOT ACCURATE or are OVERLY
2233 SIMPLIFIED. For thorough understanding, please refer to the
2234 original document of ISO2022. This is equivalent to the standard
2235 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2237 ISO2022 provides many mechanisms to encode several character sets
2238 in 7-bit and 8-bit environments. For 7-bit environments, all text
2239 is encoded using bytes less than 128. This may make the encoded
2240 text a little bit longer, but the text passes more easily through
2241 several types of gateway, some of which strip off the MSB (Most
2244 There are two kinds of character sets: control character sets and
2245 graphic character sets. The former contain control characters such
2246 as `newline' and `escape' to provide control functions (control
2247 functions are also provided by escape sequences). The latter
2248 contain graphic characters such as 'A' and '-'. Emacs recognizes
2249 two control character sets and many graphic character sets.
2251 Graphic character sets are classified into one of the following
2252 four classes, according to the number of bytes (DIMENSION) and
2253 number of characters in one dimension (CHARS) of the set:
2254 - DIMENSION1_CHARS94
2255 - DIMENSION1_CHARS96
2256 - DIMENSION2_CHARS94
2257 - DIMENSION2_CHARS96
2259 In addition, each character set is assigned an identification tag,
2260 unique for each set, called the "final character" (denoted as <F>
2261 hereafter). The <F> of each character set is decided by ECMA(*)
2262 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2263 (0x30..0x3F are for private use only).
2265 Note (*): ECMA = European Computer Manufacturers Association
2267 Here are examples of graphic character sets [NAME(<F>)]:
2268 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2269 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2270 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2271 o DIMENSION2_CHARS96 -- none for the moment
2273 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2274 C0 [0x00..0x1F] -- control character plane 0
2275 GL [0x20..0x7F] -- graphic character plane 0
2276 C1 [0x80..0x9F] -- control character plane 1
2277 GR [0xA0..0xFF] -- graphic character plane 1
2279 A control character set is directly designated and invoked to C0 or
2280 C1 by an escape sequence. The most common case is that:
2281 - ISO646's control character set is designated/invoked to C0, and
2282 - ISO6429's control character set is designated/invoked to C1,
2283 and usually these designations/invocations are omitted in encoded
2284 text. In a 7-bit environment, only C0 can be used, and a control
2285 character for C1 is encoded by an appropriate escape sequence to
2286 fit into the environment. All control characters for C1 are
2287 defined to have corresponding escape sequences.
2289 A graphic character set is at first designated to one of four
2290 graphic registers (G0 through G3), then these graphic registers are
2291 invoked to GL or GR. These designations and invocations can be
2292 done independently. The most common case is that G0 is invoked to
2293 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2294 these invocations and designations are omitted in encoded text.
2295 In a 7-bit environment, only GL can be used.
2297 When a graphic character set of CHARS94 is invoked to GL, codes
2298 0x20 and 0x7F of the GL area work as control characters SPACE and
2299 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2302 There are two ways of invocation: locking-shift and single-shift.
2303 With locking-shift, the invocation lasts until the next different
2304 invocation, whereas with single-shift, the invocation affects the
2305 following character only and doesn't affect the locking-shift
2306 state. Invocations are done by the following control characters or
2309 ----------------------------------------------------------------------
2310 abbrev function cntrl escape seq description
2311 ----------------------------------------------------------------------
2312 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2313 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2314 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2315 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2316 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2317 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2318 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2319 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2320 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2321 ----------------------------------------------------------------------
2322 (*) These are not used by any known coding system.
2324 Control characters for these functions are defined by macros
2325 ISO_CODE_XXX in `coding.h'.
2327 Designations are done by the following escape sequences:
2328 ----------------------------------------------------------------------
2329 escape sequence description
2330 ----------------------------------------------------------------------
2331 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2332 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2333 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2334 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2335 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2336 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2337 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2338 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2339 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2340 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2341 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2342 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2343 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2344 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2345 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2346 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2347 ----------------------------------------------------------------------
2349 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2350 of dimension 1, chars 94, and final character <F>, etc...
2352 Note (*): Although these designations are not allowed in ISO2022,
2353 Emacs accepts them on decoding, and produces them on encoding
2354 CHARS96 character sets in a coding system which is characterized as
2355 7-bit environment, non-locking-shift, and non-single-shift.
2357 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2358 '(' must be omitted. We refer to this as "short-form" hereafter.
2360 Now you may notice that there are a lot of ways of encoding the
2361 same multilingual text in ISO2022. Actually, there exist many
2362 coding systems such as Compound Text (used in X11's inter client
2363 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2364 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2365 localized platforms), and all of these are variants of ISO2022.
2367 In addition to the above, Emacs handles two more kinds of escape
2368 sequences: ISO6429's direction specification and Emacs' private
2369 sequence for specifying character composition.
2371 ISO6429's direction specification takes the following form:
2372 o CSI ']' -- end of the current direction
2373 o CSI '0' ']' -- end of the current direction
2374 o CSI '1' ']' -- start of left-to-right text
2375 o CSI '2' ']' -- start of right-to-left text
2376 The control character CSI (0x9B: control sequence introducer) is
2377 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2379 Character composition specification takes the following form:
2380 o ESC '0' -- start relative composition
2381 o ESC '1' -- end composition
2382 o ESC '2' -- start rule-base composition (*)
2383 o ESC '3' -- start relative composition with alternate chars (**)
2384 o ESC '4' -- start rule-base composition with alternate chars (**)
2385 Since these are not standard escape sequences of any ISO standard,
2386 the use of them with these meanings is restricted to Emacs only.
2388 (*) This form is used only in Emacs 20.7 and older versions,
2389 but newer versions can safely decode it.
2390 (**) This form is used only in Emacs 21.1 and newer versions,
2391 and older versions can't decode it.
2393 Here's a list of example usages of these composition escape
2394 sequences (categorized by `enum composition_method').
2396 COMPOSITION_RELATIVE:
2397 ESC 0 CHAR [ CHAR ] ESC 1
2398 COMPOSITION_WITH_RULE:
2399 ESC 2 CHAR [ RULE CHAR ] ESC 1
2400 COMPOSITION_WITH_ALTCHARS:
2401 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2402 COMPOSITION_WITH_RULE_ALTCHARS:
2403 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table of 256 entries, one per byte value, giving the ISO code class
   of that byte.  The ISO-2022 decoder switches on iso_code_class[c1].  */
2405 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID may be used with CODING: ID must not exceed
   CODING->max_charset_id, and the entry for ID in
   CODING->safe_charsets must be non-negative.  ID is evaluated
   twice.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((coding)->max_charset_id >= (id)             \
   && 0 <= (coding)->safe_charsets[id])
/* Nonzero if the coding system registered in coding_categories for
   CATEGORY has a non-negative initial designation for register 1.  */
#define SHIFT_OUT_OK(category)                                  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
/* Build the safe-charsets string for the ISO-2022 coding system whose
   attribute vector is ATTRS, and store it in ATTRS under
   coding_attr_safe_charsets.  The string has one byte per charset id
   (length is the largest id in the charset list plus one).  For each
   charset in the list the byte is set to a register value: the one
   found for that id in the coding system's request alist, or REG96 /
   REG94 taken from the usage pair.  When the coding system has
   CODING_ISO_FLAG_FULL_SUPPORT and its charset list is not already
   Viso_2022_charset_list, the list is replaced by that global list
   and the stored safe-charsets value is reset to nil first.  */
2416 setup_iso_safe_charsets (attrs
)
2419 Lisp_Object charset_list
, safe_charsets
;
2420 Lisp_Object request
;
2421 Lisp_Object reg_usage
;
2424 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2427 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* Full-support coding systems work from the global ISO-2022 charset
   list; switching lists also clears the stored safe-charsets.  */
2428 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2429 && ! EQ (charset_list
, Viso_2022_charset_list
))
2431 CODING_ATTR_CHARSET_LIST (attrs
)
2432 = charset_list
= Viso_2022_charset_list
;
2433 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
/* Test whether a safe-charsets string is already stored in ATTRS.  */
2436 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
/* Find the largest charset id; it determines the string length.  */
2440 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2442 int id
= XINT (XCAR (tail
));
2443 if (max_charset_id
< id
)
2444 max_charset_id
= id
;
2447 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2449 request
= AREF (attrs
, coding_attr_iso_request
);
2450 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2451 reg94
= XINT (XCAR (reg_usage
));
2452 reg96
= XINT (XCDR (reg_usage
));
/* Fill in one entry per charset in the list.  */
2454 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2458 struct charset
*charset
;
2461 charset
= CHARSET_FROM_ID (XINT (id
));
2462 reg
= Fcdr (Fassq (id
, request
));
2464 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2465 else if (charset
->iso_chars_96
)
2468 SSET (safe_charsets
, XINT (id
), reg96
);
2473 SSET (safe_charsets
, XINT (id
), reg94
);
2476 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2480 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2481 Check if a text is encoded in one of ISO-2022 based coding systems.
2482 If it is, return 1, else return 0. */
/* Scan the source bytes of CODING and classify them against the
   ISO-2022 based coding categories (coding_category_iso_7 through
   coding_category_iso_8_else).  CATEGORY_MASK_ISO is added to
   DETECT_INFO->checked.  While scanning, categories that are ruled
   out are accumulated in `rejected' and plausible ones in `found';
   at the end DETECT_INFO->rejected and DETECT_INFO->found are
   updated from them, with rejected categories removed from the
   found set.  */
2485 detect_coding_iso_2022 (coding
, detect_info
)
2486 struct coding_system
*coding
;
2487 struct coding_detection_info
*detect_info
;
2489 const unsigned char *src
= coding
->source
, *src_base
= src
;
2490 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2491 int multibytep
= coding
->src_multibyte
;
2492 int single_shifting
= 0;
2495 int consumed_chars
= 0;
2500 detect_info
->checked
|= CATEGORY_MASK_ISO
;
/* Make the safe-charsets table of every ISO category available
   through its coding_system structure before scanning.  */
2502 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2504 struct coding_system
*this = &(coding_categories
[i
]);
2505 Lisp_Object attrs
, val
;
2507 attrs
= CODING_ID_ATTRS (this->id
);
2508 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2509 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2510 setup_iso_safe_charsets (attrs
);
2511 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2512 this->max_charset_id
= SCHARS (val
) - 1;
2513 this->safe_charsets
= (char *) SDATA (val
);
2516 /* A coding system of this category is always ASCII compatible. */
2517 src
+= coding
->head_ascii
;
/* Keep scanning until every ISO category has been rejected.  */
2519 while (rejected
!= CATEGORY_MASK_ISO
)
2526 if (inhibit_iso_escape_detection
)
2528 single_shifting
= 0;
2530 if (c
>= '(' && c
<= '/')
2532 /* Designation sequence for a charset of dimension 1. */
2534 if (c1
< ' ' || c1
>= 0x80
2535 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2536 /* Invalid designation sequence. Just ignore. */
2541 /* Designation sequence for a charset of dimension 2. */
2543 if (c
>= '@' && c
<= 'B')
2544 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2545 id
= iso_charset_table
[1][0][c
];
2546 else if (c
>= '(' && c
<= '/')
2549 if (c1
< ' ' || c1
>= 0x80
2550 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2551 /* Invalid designation sequence. Just ignore. */
2555 /* Invalid designation sequence. Just ignore it. */
2558 else if (c
== 'N' || c
== 'O')
2560 /* ESC <Fe> for SS2 or SS3. */
2561 single_shifting
= 1;
2562 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2565 else if (c
>= '0' && c
<= '4')
2567 /* ESC <Fp> for start/end composition. */
2568 found
|= CATEGORY_MASK_ISO
;
2573 /* Invalid escape sequence. Just ignore it. */
2577 /* We found a valid designation sequence for CHARSET. */
2578 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2579 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2581 found
|= CATEGORY_MASK_ISO_7
;
2583 rejected
|= CATEGORY_MASK_ISO_7
;
2584 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2586 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2588 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2589 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2591 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2593 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2594 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2596 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2598 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2603 /* Locking shift out/in. */
2604 if (inhibit_iso_escape_detection
)
2606 single_shifting
= 0;
2607 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2608 found
|= CATEGORY_MASK_ISO_ELSE
;
2612 /* Control sequence introducer. */
2613 single_shifting
= 0;
2614 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2615 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2616 goto check_extra_latin
;
2622 if (inhibit_iso_escape_detection
)
2624 single_shifting
= 1;
2625 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2626 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2627 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2628 found
|= CATEGORY_MASK_ISO_8_1
;
2629 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2630 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2631 found
|= CATEGORY_MASK_ISO_8_2
;
2632 goto check_extra_latin
;
2639 single_shifting
= 0;
2644 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2645 found
|= CATEGORY_MASK_ISO_8_1
;
2646 /* Check the length of succeeding codes of the range
2647 0xA0..0xFF. If the byte length is even, we include
2648 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2649 only when we are not single shifting. */
2650 if (! single_shifting
2651 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2654 while (src
< src_end
)
2662 if (i
& 1 && src
< src_end
)
2663 rejected
|= CATEGORY_MASK_ISO_8_2
;
2665 found
|= CATEGORY_MASK_ISO_8_2
;
2670 single_shifting
= 0;
2671 if (! VECTORP (Vlatin_extra_code_table
)
2672 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2674 rejected
= CATEGORY_MASK_ISO
;
2677 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2678 & CODING_ISO_FLAG_LATIN_EXTRA
)
2679 found
|= CATEGORY_MASK_ISO_8_1
;
2681 rejected
|= CATEGORY_MASK_ISO_8_1
;
2682 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2683 & CODING_ISO_FLAG_LATIN_EXTRA
)
2684 found
|= CATEGORY_MASK_ISO_8_2
;
2686 rejected
|= CATEGORY_MASK_ISO_8_2
;
2689 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
/* Publish the result: a category counts as found only if it was not
   also rejected.  */
2693 detect_info
->rejected
|= rejected
;
2694 detect_info
->found
|= (found
& ~rejected
);
/* Set designation state into CODING: designate to graphic register
   REG the charset selected by DIM (dimension), CHARS_96 (0 for a
   94-character set, 1 for a 96-character set) and final byte FINAL.

   If FINAL is out of range, names no known charset, or names a
   charset that is not safe for CODING, REG is marked with -2 and
   control jumps to `invalid_code'.  JISX0201-Roman is replaced by
   ASCII when CODING_ISO_FLAG_USE_ROMAN is set, and JISX0208-1978 by
   JISX0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

   Must be expanded where `coding' and the label `invalid_code' are
   in scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if ((final) < '0' || (final) >= 128                                 \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        goto invalid_code;                                              \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      goto invalid_code;                                                \
  } while (0)
/* Flush a composition that is still open.  Works on the decoder's
   locals (composition_state, method, components, component_idx,
   charbuf, charbuf_end, char_offset).  Jumps to `no_more_source' when
   CHARBUF cannot take COMPONENT_IDX more elements; otherwise resets
   composition_state to COMPOSING_NO and copies the collected
   components to CHARBUF -- every element in the loop guarded by the
   COMPOSITION_RELATIVE / COMPOSITION_WITH_ALTCHARS test, every second
   element in the other loop -- advancing CHAR_OFFSET to match.  */
2731 #define MAYBE_FINISH_COMPOSITION() \
2734 if (composition_state == COMPOSING_NO) \
2736 /* It is assured that we have enough room for producing \
2737 characters stored in the table `components'. */ \
2738 if (charbuf + component_idx > charbuf_end) \
2739 goto no_more_source; \
2740 composition_state = COMPOSING_NO; \
2741 if (method == COMPOSITION_RELATIVE \
2742 || method == COMPOSITION_WITH_ALTCHARS) \
2744 for (i = 0; i < component_idx; i++) \
2745 *charbuf++ = components[i]; \
2746 char_offset += component_idx; \
2750 for (i = 0; i < component_idx; i += 2) \
2751 *charbuf++ = components[i]; \
2752 char_offset += (component_idx / 2) + 1; \
2757 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2758 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2759 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2760 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2761 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 */
/* Process a composition start sequence whose final byte is C1.
   Works on the decoder's locals and labels.  One branch, guarded by a
   test involving COMPOSING_COMPONENT_RULE, records the current
   COMPONENT_IDX in COMPONENT_LEN and switches to COMPOSING_CHAR.
   The other branch finishes any open composition, jumps to
   `no_more_source' if CHARBUF lacks room for
   MAX_COMPOSITION_COMPONENTS elements, and scans ahead from SRC for
   the terminating ESC '1'.  If the scan reaches the last source byte
   without finding it, control goes to `invalid_code' when this is the
   last block (CODING_MODE_LAST_BLOCK) and to `no_more_source'
   otherwise.  Finally METHOD is chosen from C1, composition_state
   becomes COMPOSING_CHAR for C1 <= '2' and COMPOSING_COMPONENT_CHAR
   otherwise, and COMPONENT_IDX / COMPONENT_LEN are cleared.  */
2764 #define DECODE_COMPOSITION_START(c1) \
2767 && composition_state == COMPOSING_COMPONENT_RULE) \
2769 component_len = component_idx; \
2770 composition_state = COMPOSING_CHAR; \
2774 const unsigned char *p; \
2776 MAYBE_FINISH_COMPOSITION (); \
2777 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2778 goto no_more_source; \
2779 for (p = src; p < src_end - 1; p++) \
2780 if (*p == ISO_CODE_ESC && p[1] == '1') \
2782 if (p == src_end - 1) \
2784 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2785 goto invalid_code; \
2786 goto no_more_source; \
2789 /* This is surely the start of a composition. */ \
2790 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2791 : c1 == '2' ? COMPOSITION_WITH_RULE \
2792 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2793 : COMPOSITION_WITH_RULE_ALTCHARS); \
2794 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2795 : COMPOSING_COMPONENT_CHAR); \
2796 component_idx = component_len = 0; \
2801 /* Handle composition end sequence ESC 1. */
/* Close the composition being collected.  NCHARS, the number of
   composed characters, is COMPONENT_IDX - COMPONENT_LEN when
   COMPONENT_LEN is positive, COMPONENT_IDX for COMPOSITION_RELATIVE,
   and (COMPONENT_IDX + 1) / 2 otherwise.  An annotation covering
   CHAR_OFFSET .. CHAR_OFFSET + NCHARS is added to CHARBUF with
   ADD_COMPOSITION_DATA; for methods other than COMPOSITION_RELATIVE
   the components are appended to it and the element at SAVED_CHARBUF
   is set to SAVED_CHARBUF - CHARBUF.  The characters themselves are
   then copied to CHARBUF while CHAR_OFFSET advances, CODING->annotated
   is set, and composition_state returns to COMPOSING_NO.  */
2803 #define DECODE_COMPOSITION_END() \
2805 int nchars = (component_len > 0 ? component_idx - component_len \
2806 : method == COMPOSITION_RELATIVE ? component_idx \
2807 : (component_idx + 1) / 2); \
2809 int *saved_charbuf = charbuf; \
2810 int from = char_offset; \
2811 int to = from + nchars; \
2813 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2814 if (method != COMPOSITION_RELATIVE) \
2816 if (component_len == 0) \
2817 for (i = 0; i < component_idx; i++) \
2818 *charbuf++ = components[i]; \
2820 for (i = 0; i < component_len; i++) \
2821 *charbuf++ = components[i]; \
2822 *saved_charbuf = saved_charbuf - charbuf; \
2824 if (method == COMPOSITION_WITH_RULE) \
2825 for (i = 0; i < component_idx; i += 2, char_offset++) \
2826 *charbuf++ = components[i]; \
2828 for (i = component_len; i < component_idx; i++, char_offset++) \
2829 *charbuf++ = components[i]; \
2830 coding->annotated = 1; \
2831 composition_state = COMPOSING_NO; \
2835 /* Decode a composition rule from the byte C1 (and maybe one more byte
2836 from SRC, read with ONE_MORE_BYTE into C2) and leave the encoded
2837 rule, as built by COMPOSITION_ENCODE_RULE, in C1.  Values below 81
   use the old one-byte format, in which the global and new reference
   points are C1 / 9 and C1 % 9 with 4 remapped to 10; values below 93
   use the new two-byte format.  */
2839 #define DECODE_COMPOSITION_RULE(c1) \
2842 if (c1 < 81) /* old format (before ver.21) */ \
2844 int gref = (c1) / 9; \
2845 int nref = (c1) % 9; \
2846 if (gref == 4) gref = 10; \
2847 if (nref == 4) nref = 10; \
2848 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2850 else if (c1 < 93) /* new format (after ver.21) */ \
2852 ONE_MORE_BYTE (c2); \
2853 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2860 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode the ISO-2022 byte sequence in CODING->source, starting at
   offset CODING->consumed, into character codes stored in
   CODING->charbuf.  Each source byte is dispatched on its class in
   iso_code_class; escape sequences for designation, locking and
   single shifts, text direction and character composition are
   interpreted along the way.  Charset changes are recorded with
   ADD_CHARSET_DATA.  On exit CODING->consumed_char, CODING->consumed
   and CODING->charbuf_used are updated.  */
2863 decode_coding_iso_2022 (coding
)
2864 struct coding_system
*coding
;
2866 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2867 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2868 const unsigned char *src_base
;
2869 int *charbuf
= coding
->charbuf
;
2871 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2872 int consumed_chars
= 0, consumed_chars_base
;
2873 int multibytep
= coding
->src_multibyte
;
2874 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2875 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2876 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2877 struct charset
*charset
;
2879 /* For handling composition sequence. */
2880 #define COMPOSING_NO 0
2881 #define COMPOSING_CHAR 1
2882 #define COMPOSING_RULE 2
2883 #define COMPOSING_COMPONENT_CHAR 3
2884 #define COMPOSING_COMPONENT_RULE 4
2886 int composition_state
= COMPOSING_NO
;
2887 enum composition_method method
;
2888 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2891 Lisp_Object attrs
, charset_list
;
2892 int char_offset
= coding
->produced_char
;
2893 int last_offset
= char_offset
;
2894 int last_id
= charset_ascii
;
2896 CODING_GET_INFO (coding
, attrs
, charset_list
);
2897 setup_iso_safe_charsets (attrs
);
2904 consumed_chars_base
= consumed_chars
;
2906 if (charbuf
>= charbuf_end
)
2913 /* We produce at most one character. */
2914 switch (iso_code_class
[c1
])
2916 case ISO_0x20_or_0x7F
:
2917 if (composition_state
!= COMPOSING_NO
)
2919 if (composition_state
== COMPOSING_RULE
2920 || composition_state
== COMPOSING_COMPONENT_RULE
)
2922 DECODE_COMPOSITION_RULE (c1
);
2923 components
[component_idx
++] = c1
;
2924 composition_state
--;
2928 if (charset_id_0
< 0
2929 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2930 /* This is SPACE or DEL. */
2931 charset
= CHARSET_FROM_ID (charset_ascii
);
2933 charset
= CHARSET_FROM_ID (charset_id_0
);
2936 case ISO_graphic_plane_0
:
2937 if (composition_state
!= COMPOSING_NO
)
2939 if (composition_state
== COMPOSING_RULE
2940 || composition_state
== COMPOSING_COMPONENT_RULE
)
2942 DECODE_COMPOSITION_RULE (c1
);
2943 components
[component_idx
++] = c1
;
2944 composition_state
--;
2948 charset
= CHARSET_FROM_ID (charset_id_0
);
2951 case ISO_0xA0_or_0xFF
:
2952 if (charset_id_1
< 0
2953 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2954 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2956 /* This is a graphic character, we fall down ... */
2958 case ISO_graphic_plane_1
:
2959 if (charset_id_1
< 0)
2961 charset
= CHARSET_FROM_ID (charset_id_1
);
2965 MAYBE_FINISH_COMPOSITION ();
2966 charset
= CHARSET_FROM_ID (charset_ascii
);
2970 MAYBE_FINISH_COMPOSITION ();
2974 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2975 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2977 CODING_ISO_INVOCATION (coding
, 0) = 1;
2978 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2982 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2984 CODING_ISO_INVOCATION (coding
, 0) = 0;
2985 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2988 case ISO_single_shift_2_7
:
2989 case ISO_single_shift_2
:
2990 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2992 /* SS2 is handled as an escape sequence of ESC 'N' */
2994 goto label_escape_sequence
;
2996 case ISO_single_shift_3
:
2997 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2999 /* SS3 is handled as an escape sequence of ESC 'O' */
3001 goto label_escape_sequence
;
3003 case ISO_control_sequence_introducer
:
3004 /* CSI is handled as an escape sequence of ESC '[' ... */
3006 goto label_escape_sequence
;
3010 label_escape_sequence
:
3011 /* Escape sequences handled here are invocation,
3012 designation, direction specification, and character
3013 composition specification. */
3016 case '&': /* revision of following character set */
3018 if (!(c1
>= '@' && c1
<= '~'))
3021 if (c1
!= ISO_CODE_ESC
)
3024 goto label_escape_sequence
;
3026 case '$': /* designation of 2-byte character set */
3027 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3030 if (c1
>= '@' && c1
<= 'B')
3031 { /* designation of JISX0208.1978, GB2312.1980,
3033 DECODE_DESIGNATION (0, 2, 0, c1
);
3035 else if (c1
>= 0x28 && c1
<= 0x2B)
3036 { /* designation of DIMENSION2_CHARS94 character set */
3038 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
3040 else if (c1
>= 0x2C && c1
<= 0x2F)
3041 { /* designation of DIMENSION2_CHARS96 character set */
3043 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
3047 /* We must update these variables now. */
3048 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3049 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3052 case 'n': /* invocation of locking-shift-2 */
3053 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3054 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3056 CODING_ISO_INVOCATION (coding
, 0) = 2;
3057 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3060 case 'o': /* invocation of locking-shift-3 */
3061 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3062 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3064 CODING_ISO_INVOCATION (coding
, 0) = 3;
3065 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3068 case 'N': /* invocation of single-shift-2 */
3069 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3070 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3072 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3074 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3078 case 'O': /* invocation of single-shift-3 */
3079 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3080 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3082 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3084 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3088 case '0': case '2': case '3': case '4': /* start composition */
3089 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3091 DECODE_COMPOSITION_START (c1
);
3094 case '1': /* end composition */
3095 if (composition_state
== COMPOSING_NO
)
3097 DECODE_COMPOSITION_END ();
3100 case '[': /* specification of direction */
/* The flag test is parenthesized because `!' binds tighter than `&';
   without the parentheses the negation applied to the whole flag word
   instead of to the CODING_ISO_FLAG_DIRECTION bit.  */
3101 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3103 /* For the moment, nested direction is not supported.
3104 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3105 left-to-right, and nonzero means right-to-left. */
3109 case ']': /* end of the current direction */
3110 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3112 case '0': /* end of the current direction */
3113 case '1': /* start of left-to-right direction */
3116 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3121 case '2': /* start of right-to-left direction */
3124 coding
->mode
|= CODING_MODE_DIRECTION
;
3138 /* CTEXT extended segment:
3139 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3140 We keep these bytes as is for the moment.
3141 They may be decoded by post-read-conversion. */
3145 ONE_MORE_BYTE (dim
);
3148 size
= ((M
- 128) * 128) + (L
- 128);
3149 if (charbuf
+ 8 + size
> charbuf_end
)
3151 *charbuf
++ = ISO_CODE_ESC
;
3155 *charbuf
++ = BYTE8_TO_CHAR (M
);
3156 *charbuf
++ = BYTE8_TO_CHAR (L
);
3160 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3165 /* XFree86 extension for embedding UTF-8 in CTEXT:
3166 ESC % G --UTF-8-BYTES-- ESC % @
3167 We keep these bytes as is for the moment.
3168 They may be decoded by post-read-conversion. */
3171 if (p
+ 6 > charbuf_end
)
3173 *p
++ = ISO_CODE_ESC
;
3176 while (p
< charbuf_end
)
3179 if (c1
== ISO_CODE_ESC
3180 && src
+ 1 < src_end
3184 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3186 if (p
+ 3 > charbuf_end
)
3188 *p
++ = ISO_CODE_ESC
;
3199 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3201 if (c1
>= 0x28 && c1
<= 0x2B)
3202 { /* designation of DIMENSION1_CHARS94 character set */
3204 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3206 else if (c1
>= 0x2C && c1
<= 0x2F)
3207 { /* designation of DIMENSION1_CHARS96 character set */
3209 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3213 /* We must update these variables now. */
3214 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3215 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3220 if (charset
->id
!= charset_ascii
3221 && last_id
!= charset
->id
)
3223 if (last_id
!= charset_ascii
)
3224 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3225 last_id
= charset
->id
;
3226 last_offset
= char_offset
;
3229 /* Now we know CHARSET and 1st position code C1 of a character.
3230 Produce a decoded character while getting 2nd position code
3233 if (CHARSET_DIMENSION (charset
) > 1)
3236 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3237 /* C2 is not in a valid range. */
3239 c1
= (c1
<< 8) | (c2
& 0x7F);
3240 if (CHARSET_DIMENSION (charset
) > 2)
3243 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3244 /* C2 is not in a valid range. */
3246 c1
= (c1
<< 8) | (c2
& 0x7F);
3250 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3253 MAYBE_FINISH_COMPOSITION ();
3254 for (; src_base
< src
; src_base
++, char_offset
++)
3256 if (ASCII_BYTE_P (*src_base
))
3257 *charbuf
++ = *src_base
;
3259 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3262 else if (composition_state
== COMPOSING_NO
)
3269 components
[component_idx
++] = c
;
3270 if (method
== COMPOSITION_WITH_RULE
3271 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3272 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3273 composition_state
++;
3278 MAYBE_FINISH_COMPOSITION ();
3280 consumed_chars
= consumed_chars_base
;
3282 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3292 if (last_id
!= charset_ascii
)
3293 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3294 coding
->consumed_char
+= consumed_chars_base
;
3295 coding
->consumed
= src_base
- coding
->source
;
3296 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3300 /* ISO2022 encoding stuff. */
3303 /* It is not enough to say just "ISO2022" on encoding; we have to
3304 specify more details. In Emacs, each coding system of ISO2022
3305 variant has the following specifications:
3306 1. Initial designation to G0 thru G3.
3307 2. Allows short-form designation?
3308 3. ASCII should be designated to G0 before control characters?
3309 4. ASCII should be designated to G0 at end of line?
3310 5. 7-bit environment or 8-bit environment?
3311 6. Use locking-shift?
3312 7. Use Single-shift?
3313 And the following two are only for Japanese:
3314 8. Use ASCII in place of JISX0201-1976-Roman?
3315 9. Use JISX0208-1983 in place of JISX0208-1978?
3316 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3317 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
details. */
3321 /* Produce codes (escape sequence) for designating CHARSET to graphic
3322 register REG at DST, and increment DST. If <final-char> of CHARSET is
3323 '@', 'A', or 'B' and the coding system CODING allows, produce
3324 designation sequence of short-form.
   When CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative ISO revision, the sequence ESC '&' ('@' + revision) is
   emitted first.  The intermediate byte is taken from "()*+" for
   94-character sets and from ",-./" for 96-character sets, indexed by
   REG; two-dimensional charsets get '$' before it.  Finally CODING's
   designation for REG is set to the id of CHARSET.  */
3326 #define ENCODE_DESIGNATION(charset, reg, coding) \
3328 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3329 char *intermediate_char_94 = "()*+"; \
3330 char *intermediate_char_96 = ",-./"; \
3331 int revision = -1; \
3334 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3335 revision = CHARSET_ISO_REVISION (charset); \
3337 if (revision >= 0) \
3339 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3340 EMIT_ONE_BYTE ('@' + revision); \
3342 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3343 if (CHARSET_DIMENSION (charset) == 1) \
3345 if (! CHARSET_ISO_CHARS_96 (charset)) \
3346 c = intermediate_char_94[reg]; \
3348 c = intermediate_char_96[reg]; \
3349 EMIT_ONE_ASCII_BYTE (c); \
3353 EMIT_ONE_ASCII_BYTE ('$'); \
3354 if (! CHARSET_ISO_CHARS_96 (charset)) \
3356 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3358 || final_char < '@' || final_char > 'B') \
3359 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3362 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3364 EMIT_ONE_ASCII_BYTE (final_char); \
3366 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3370 /* The following two macros produce codes (control character or escape
3371 sequence) for ISO2022 single-shift functions (single-shift-2 and
single-shift-3). */
/* Emit single-shift-2: the escape sequence ESC 'N' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS, otherwise the single byte ISO_CODE_SS2.
   Then record in CODING that a single shift is pending.  Expands to
   one statement; expects `coding' and the EMIT_* output state in
   scope.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
/* Emit single-shift-3: the escape sequence ESC 'O' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS, otherwise the single byte ISO_CODE_SS3.
   Then record in CODING that a single shift is pending.  Expands to
   one statement; expects `coding' and the EMIT_* output state in
   scope.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3394 /* The following four macros produce codes (control character or
3395 escape sequence) for ISO2022 locking-shift functions (shift-in,
3396 shift-out, locking-shift-2, and locking-shift-3). */
/* Emit the shift-in control ISO_CODE_SI and record in CODING that
   graphic register 0 is now invoked to graphic plane 0.  Expands to
   one statement.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)
/* Emit the shift-out control ISO_CODE_SO and record in CODING that
   graphic register 1 is now invoked to graphic plane 0.  Expands to
   one statement.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)
/* Emit locking-shift-2 (ESC 'n') and record in CODING that graphic
   register 2 is now invoked to graphic plane 0.  Expands to one
   statement.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)
/* Emit locking-shift-3 (ESC 'o') and record in CODING that graphic
   register 3 is now invoked to graphic plane 0.  Expands to one
   statement.

   The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   decoder in this file likewise treats 'n' as locking-shift-2 and
   'o' as locking-shift-3.  Emitting 'n' here would make a reader
   invoke G2 while CODING believes G3 is invoked.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3426 /* Produce codes for a DIMENSION1 character whose character set is
3427 CHARSET and whose position-code is C1. Designation and invocation
3428 sequences are also produced in advance if necessary.
   When CODING has CODING_ISO_FLAG_USE_ROMAN, ASCII is replaced by
   JISX0201-Roman (CHARSET itself is reassigned).  While a single
   shift is pending, C1 is emitted with the high bit cleared in a
   7-bit environment and set otherwise, and the pending state is
   cleared.  A charset invoked to graphic plane 0 is emitted as
   C1 & 0x7F, one invoked to plane 1 as C1 | 0x80.  Otherwise
   encode_invocation_designation is called to update DST.  */
3430 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3432 int id = CHARSET_ID (charset); \
3434 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3435 && id == charset_ascii) \
3437 id = charset_jisx0201_roman; \
3438 charset = CHARSET_FROM_ID (id); \
3441 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3443 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3444 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3446 EMIT_ONE_BYTE (c1 | 0x80); \
3447 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3450 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3452 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3455 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3457 EMIT_ONE_BYTE (c1 | 0x80); \
3461 /* Since CHARSET is not yet invoked to any graphic planes, we \
3462 must invoke it, or, at first, designate it to some graphic \
3463 register. Then repeat the loop to actually produce the \
3465 dst = encode_invocation_designation (charset, coding, dst, \
3470 /* Produce codes for a DIMENSION2 character whose character set is
3471 CHARSET and whose position-codes are C1 and C2. Designation and
3472 invocation codes are also produced in advance if necessary.
   When CODING has CODING_ISO_FLAG_USE_OLDJIS, JISX0208 is replaced by
   JISX0208-1978 (CHARSET itself is reassigned).  While a single
   shift is pending, both bytes are emitted with the high bit cleared
   in a 7-bit environment and set otherwise, and the pending state is
   cleared.  A charset invoked to graphic plane 0 is emitted with the
   high bits cleared, one invoked to plane 1 with them set.
   Otherwise encode_invocation_designation is called to update DST.  */
3474 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3476 int id = CHARSET_ID (charset); \
3478 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3479 && id == charset_jisx0208) \
3481 id = charset_jisx0208_1978; \
3482 charset = CHARSET_FROM_ID (id); \
3485 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3487 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3488 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3490 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3491 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3494 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3496 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3499 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3501 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3505 /* Since CHARSET is not yet invoked to any graphic planes, we \
3506 must invoke it, or, at first, designate it to some graphic \
3507 register. Then repeat the loop to actually produce the \
3509 dst = encode_invocation_designation (charset, coding, dst, \
/* Encode character C in CHARSET: obtain its code point with
   ENCODE_CHAR, then emit it with ENCODE_ISO_CHARACTER_DIMENSION1 when
   CHARSET has dimension 1, or with ENCODE_ISO_CHARACTER_DIMENSION2
   (high byte, low byte of the code) otherwise.  Expands to one
   statement.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) == 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
  } while (0)
3525 /* Produce designation and invocation codes at a place pointed by DST
3526 to use CHARSET. The element `spec.iso_2022' of *CODING is updated. */
/* Make CHARSET usable for output through CODING, writing any needed
   escape sequences or shift codes at DST.  The four graphic registers
   are searched for CHARSET first; if none holds it, CHARSET is
   designated with ENCODE_DESIGNATION to the register obtained from
   CODING_ISO_REQUEST.  If that register is not currently invoked to
   graphic plane 0 or 1, an invocation is produced according to the
   register number (for registers 2 and 3 a single shift when CODING
   has CODING_ISO_FLAG_SINGLE_SHIFT, a locking shift otherwise).
   *P_NCHARS is loaded into produced_chars on entry and written back
   before leaving.  */
3530 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3531 struct charset
*charset
;
3532 struct coding_system
*coding
;
3536 int multibytep
= coding
->dst_multibyte
;
3537 int produced_chars
= *p_nchars
;
3538 int reg
; /* graphic register number */
3539 int id
= CHARSET_ID (charset
);
3541 /* At first, check designations. */
3542 for (reg
= 0; reg
< 4; reg
++)
3543 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3548 /* CHARSET is not yet designated to any graphic registers. */
3549 /* At first check the requested designation. */
3550 reg
= CODING_ISO_REQUEST (coding
, id
);
3552 /* Since CHARSET requests no special designation, designate it
3553 to graphic register 0. */
3556 ENCODE_DESIGNATION (charset
, reg
, coding
);
3559 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3560 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3562 /* Since the graphic register REG is not invoked to any graphic
3563 planes, invoke it to graphic plane 0. */
3566 case 0: /* graphic register 0 */
3570 case 1: /* graphic register 1 */
3574 case 2: /* graphic register 2 */
3575 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3576 ENCODE_SINGLE_SHIFT_2
;
3578 ENCODE_LOCKING_SHIFT_2
;
3581 case 3: /* graphic register 3 */
3582 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3583 ENCODE_SINGLE_SHIFT_3
;
3585 ENCODE_LOCKING_SHIFT_3
;
3590 *p_nchars
= produced_chars
;
3594 /* The following three macros produce codes for indicating direction
of text. */
/* Emit a control sequence introducer: the escape sequence ESC '['
   when CODING has CODING_ISO_FLAG_SEVEN_BITS, otherwise the single
   byte ISO_CODE_CSI.  The flag word is a bit mask, so the 7-bit flag
   is tested with `&'; comparing the whole word with `==' would fail
   whenever any other flag is also set.  Expands to one statement and
   takes no arguments.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  Expands to one statement.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
/* Emit the sequence that ends the current direction (returning to
   left-to-right text): a control sequence introducer followed by
   '0' ']'.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   macro, so it is used without an argument list.  Expands to one
   statement.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3619 /* Produce codes for designation and invocation to reset the graphic
3620 planes and registers to initial state.
   The invocation of graphic plane 0 is examined first (it is compared
   against register 0).  Then, for each of the four registers whose
   initial charset is defined (non-negative) and differs from the
   current designation, that initial charset is designated again with
   ENCODE_DESIGNATION.  */
3621 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3624 struct charset *charset; \
3626 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3628 for (reg = 0; reg < 4; reg++) \
3629 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3630 && (CODING_ISO_DESIGNATION (coding, reg) \
3631 != CODING_ISO_INITIAL (coding, reg))) \
3633 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3634 ENCODE_DESIGNATION (charset, reg, coding); \
3639 /* Produce designation sequences of charsets in the line started from
3640 SRC to a place pointed by DST, and return updated DST.
3642 If the current block ends before any end-of-line, we may fail to
3643 find all the necessary designations. */
3645 static unsigned char *
3646 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3647 struct coding_system
*coding
;
3648 int *charbuf
, *charbuf_end
;
3651 struct charset
*charset
;
3652 /* Table of charsets to be designated to each graphic register. */
3654 int c
, found
= 0, reg
;
3655 int produced_chars
= 0;
3656 int multibytep
= coding
->dst_multibyte
;
3658 Lisp_Object charset_list
;
3660 attrs
= CODING_ID_ATTRS (coding
->id
);
3661 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* A charset list given as the symbol Qiso_2022 stands for the global
   list Viso_2022_charset_list.  */
3662 if (EQ (charset_list
, Qiso_2022
))
3663 charset_list
= Viso_2022_charset_list
;
3665 for (reg
= 0; reg
< 4; reg
++)
/* For a character C, look up its charset and the register that
   charset requests; the register's slot in R is considered only
   while it is still negative.  */
3675 charset
= char_charset (c
, charset_list
, NULL
);
3676 id
= CHARSET_ID (charset
);
3677 reg
= CODING_ISO_REQUEST (coding
, id
);
3678 if (reg
>= 0 && r
[reg
] < 0)
/* Emit a designation for every register whose entry in R differs from
   the charset currently designated to it in CODING.  */
3687 for (reg
= 0; reg
< 4; reg
++)
3689 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3690 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3696 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3699 encode_coding_iso_2022 (coding
)
3700 struct coding_system
*coding
;
3702 int multibytep
= coding
->dst_multibyte
;
3703 int *charbuf
= coding
->charbuf
;
3704 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3705 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3706 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3709 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3710 && CODING_ISO_BOL (coding
));
3711 int produced_chars
= 0;
3712 Lisp_Object attrs
, eol_type
, charset_list
;
3713 int ascii_compatible
;
3715 int preferred_charset_id
= -1;
3717 CODING_GET_INFO (coding
, attrs
, charset_list
);
3718 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
3719 if (VECTORP (eol_type
))
3722 setup_iso_safe_charsets (attrs
);
3723 /* Charset list may have been changed. */
3724 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3725 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3727 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3729 while (charbuf
< charbuf_end
)
3731 ASSURE_DESTINATION (safe_room
);
3733 if (bol_designation
)
3735 unsigned char *dst_prev
= dst
;
3737 /* We have to produce designation sequences if any now. */
3738 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3739 bol_designation
= 0;
3740 /* We are sure that designation sequences are all ASCII bytes. */
3741 produced_chars
+= dst
- dst_prev
;
3748 /* Handle an annotation. */
3751 case CODING_ANNOTATE_COMPOSITION_MASK
:
3752 /* Not yet implemented. */
3754 case CODING_ANNOTATE_CHARSET_MASK
:
3755 preferred_charset_id
= charbuf
[3];
3756 if (preferred_charset_id
>= 0
3757 && NILP (Fmemq (make_number (preferred_charset_id
),
3759 preferred_charset_id
= -1;
3768 /* Now encode the character C. */
3769 if (c
< 0x20 || c
== 0x7F)
3772 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3774 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3775 ENCODE_RESET_PLANE_AND_REGISTER ();
3776 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3780 for (i
= 0; i
< 4; i
++)
3781 CODING_ISO_DESIGNATION (coding
, i
)
3782 = CODING_ISO_INITIAL (coding
, i
);
3785 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3787 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3788 ENCODE_RESET_PLANE_AND_REGISTER ();
3789 EMIT_ONE_ASCII_BYTE (c
);
3791 else if (ASCII_CHAR_P (c
))
3793 if (ascii_compatible
)
3794 EMIT_ONE_ASCII_BYTE (c
);
3797 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3798 ENCODE_ISO_CHARACTER (charset
, c
);
3801 else if (CHAR_BYTE8_P (c
))
3803 c
= CHAR_TO_BYTE8 (c
);
3808 struct charset
*charset
;
3810 if (preferred_charset_id
>= 0)
3812 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3813 if (! CHAR_CHARSET_P (c
, charset
))
3814 charset
= char_charset (c
, charset_list
, NULL
);
3817 charset
= char_charset (c
, charset_list
, NULL
);
3820 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3822 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3823 charset
= CHARSET_FROM_ID (charset_ascii
);
3827 c
= coding
->default_char
;
3828 charset
= char_charset (c
, charset_list
, NULL
);
3831 ENCODE_ISO_CHARACTER (charset
, c
);
3835 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3836 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3838 ASSURE_DESTINATION (safe_room
);
3839 ENCODE_RESET_PLANE_AND_REGISTER ();
3841 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
3842 CODING_ISO_BOL (coding
) = bol_designation
;
3843 coding
->produced_char
+= produced_chars
;
3844 coding
->produced
= dst
- coding
->destination
;
3849 /*** 8. Shift-JIS and BIG5 handlers ***/
3851 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3852 quite widely. So, for the moment, Emacs supports them in the bare
3853 C code. But, in the future, they may be supported only by CCL. */
3855 /* SJIS is a coding system encoding three character sets: ASCII, right
3856 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3857 as is. A character of charset katakana-jisx0201 is encoded by
3858 "position-code + 0x80". A character of charset japanese-jisx0208
3859 is encoded in two bytes, but the two position-codes are divided and
3860 shifted so that they fit in the ranges below.
3862 --- CODE RANGE of SJIS ---
3863 (character set) (range)
3865 KATAKANA-JISX0201 0xA0 .. 0xDF
3866 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3867 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3868 -------------------------------
3872 /* BIG5 is a coding system encoding two character sets: ASCII and
3873 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3874 character set and is encoded in two bytes.
3876 --- CODE RANGE of BIG5 ---
3877 (character set) (range)
3879 Big5 (1st byte) 0xA1 .. 0xFE
3880 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3881 --------------------------
3885 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3886 Check if a text is encoded in SJIS. If it is, return
3887 CATEGORY_MASK_SJIS, else return 0. */
3890 detect_coding_sjis (coding
, detect_info
)
3891 struct coding_system
*coding
;
3892 struct coding_detection_info
*detect_info
;
3894 const unsigned char *src
= coding
->source
, *src_base
;
3895 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3896 int multibytep
= coding
->src_multibyte
;
3897 int consumed_chars
= 0;
3901 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3902 /* A coding system of this category is always ASCII compatible. */
3903 src
+= coding
->head_ascii
;
3911 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3914 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3916 found
= CATEGORY_MASK_SJIS
;
3918 else if (c
>= 0xA0 && c
< 0xE0)
3919 found
= CATEGORY_MASK_SJIS
;
3923 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3927 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3929 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3932 detect_info
->found
|= found
;
3936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3937 Check if a text is encoded in BIG5. If it is, return
3938 CATEGORY_MASK_BIG5, else return 0. */
3941 detect_coding_big5 (coding
, detect_info
)
3942 struct coding_system
*coding
;
3943 struct coding_detection_info
*detect_info
;
3945 const unsigned char *src
= coding
->source
, *src_base
;
3946 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3947 int multibytep
= coding
->src_multibyte
;
3948 int consumed_chars
= 0;
3952 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3953 /* A coding system of this category is always ASCII compatible. */
3954 src
+= coding
->head_ascii
;
3965 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3967 found
= CATEGORY_MASK_BIG5
;
3972 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3976 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3978 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3981 detect_info
->found
|= found
;
3985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3986 This function decodes SJIS text; BIG5 text is handled by
3986 decode_coding_big5 (). */
3989 decode_coding_sjis (coding
)
3990 struct coding_system
*coding
;
3992 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3993 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3994 const unsigned char *src_base
;
3995 int *charbuf
= coding
->charbuf
;
3996 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3997 int consumed_chars
= 0, consumed_chars_base
;
3998 int multibytep
= coding
->src_multibyte
;
3999 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4000 Lisp_Object attrs
, charset_list
, val
;
4001 int char_offset
= coding
->produced_char
;
4002 int last_offset
= char_offset
;
4003 int last_id
= charset_ascii
;
4005 CODING_GET_INFO (coding
, attrs
, charset_list
);
4008 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4009 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4010 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4015 struct charset
*charset
;
4018 consumed_chars_base
= consumed_chars
;
4020 if (charbuf
>= charbuf_end
)
4027 charset
= charset_roman
;
4032 if (c
< 0xA0 || c
>= 0xE0)
4034 /* SJIS -> JISX0208 */
4036 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4040 charset
= charset_kanji
;
4044 /* SJIS -> JISX0201-Kana */
4046 charset
= charset_kana
;
4051 if (charset
->id
!= charset_ascii
4052 && last_id
!= charset
->id
)
4054 if (last_id
!= charset_ascii
)
4055 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4056 last_id
= charset
->id
;
4057 last_offset
= char_offset
;
4059 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4066 consumed_chars
= consumed_chars_base
;
4068 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4074 if (last_id
!= charset_ascii
)
4075 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4076 coding
->consumed_char
+= consumed_chars_base
;
4077 coding
->consumed
= src_base
- coding
->source
;
4078 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4082 decode_coding_big5 (coding
)
4083 struct coding_system
*coding
;
4085 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4086 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4087 const unsigned char *src_base
;
4088 int *charbuf
= coding
->charbuf
;
4089 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4090 int consumed_chars
= 0, consumed_chars_base
;
4091 int multibytep
= coding
->src_multibyte
;
4092 struct charset
*charset_roman
, *charset_big5
;
4093 Lisp_Object attrs
, charset_list
, val
;
4094 int char_offset
= coding
->produced_char
;
4095 int last_offset
= char_offset
;
4096 int last_id
= charset_ascii
;
4098 CODING_GET_INFO (coding
, attrs
, charset_list
);
4100 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4101 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4106 struct charset
*charset
;
4109 consumed_chars_base
= consumed_chars
;
4111 if (charbuf
>= charbuf_end
)
4119 charset
= charset_roman
;
4123 if (c
< 0xA1 || c
> 0xFE)
4126 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4129 charset
= charset_big5
;
4131 if (charset
->id
!= charset_ascii
4132 && last_id
!= charset
->id
)
4134 if (last_id
!= charset_ascii
)
4135 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4136 last_id
= charset
->id
;
4137 last_offset
= char_offset
;
4139 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4146 consumed_chars
= consumed_chars_base
;
4148 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4154 if (last_id
!= charset_ascii
)
4155 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4156 coding
->consumed_char
+= consumed_chars_base
;
4157 coding
->consumed
= src_base
- coding
->source
;
4158 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4161 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4162 This function can encode charsets `ascii', `katakana-jisx0201',
4163 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4164 are sure that all these charsets are registered as official charset
4165 (i.e. do not have extended leading-codes). Characters of other
4166 charsets are produced without any encoding. This function encodes
4167 SJIS text; BIG5 text is handled by encode_coding_big5 (). */
4170 encode_coding_sjis (coding
)
4171 struct coding_system
*coding
;
4173 int multibytep
= coding
->dst_multibyte
;
4174 int *charbuf
= coding
->charbuf
;
4175 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4176 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4177 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4179 int produced_chars
= 0;
4180 Lisp_Object attrs
, charset_list
, val
;
4181 int ascii_compatible
;
4182 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4185 CODING_GET_INFO (coding
, attrs
, charset_list
);
4187 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4188 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4189 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4191 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4193 while (charbuf
< charbuf_end
)
4195 ASSURE_DESTINATION (safe_room
);
4197 /* Now encode the character C. */
4198 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4199 EMIT_ONE_ASCII_BYTE (c
);
4200 else if (CHAR_BYTE8_P (c
))
4202 c
= CHAR_TO_BYTE8 (c
);
4208 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4212 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4214 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4215 charset
= CHARSET_FROM_ID (charset_ascii
);
4219 c
= coding
->default_char
;
4220 charset
= char_charset (c
, charset_list
, &code
);
4223 if (code
== CHARSET_INVALID_CODE (charset
))
4225 if (charset
== charset_kanji
)
4229 c1
= code
>> 8, c2
= code
& 0xFF;
4230 EMIT_TWO_BYTES (c1
, c2
);
4232 else if (charset
== charset_kana
)
4233 EMIT_ONE_BYTE (code
| 0x80);
4235 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4238 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4239 coding
->produced_char
+= produced_chars
;
4240 coding
->produced
= dst
- coding
->destination
;
4245 encode_coding_big5 (coding
)
4246 struct coding_system
*coding
;
4248 int multibytep
= coding
->dst_multibyte
;
4249 int *charbuf
= coding
->charbuf
;
4250 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4251 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4252 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4254 int produced_chars
= 0;
4255 Lisp_Object attrs
, charset_list
, val
;
4256 int ascii_compatible
;
4257 struct charset
*charset_roman
, *charset_big5
;
4260 CODING_GET_INFO (coding
, attrs
, charset_list
);
4262 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4263 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4264 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4266 while (charbuf
< charbuf_end
)
4268 ASSURE_DESTINATION (safe_room
);
4270 /* Now encode the character C. */
4271 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4272 EMIT_ONE_ASCII_BYTE (c
);
4273 else if (CHAR_BYTE8_P (c
))
4275 c
= CHAR_TO_BYTE8 (c
);
4281 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4285 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4287 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4288 charset
= CHARSET_FROM_ID (charset_ascii
);
4292 c
= coding
->default_char
;
4293 charset
= char_charset (c
, charset_list
, &code
);
4296 if (code
== CHARSET_INVALID_CODE (charset
))
4298 if (charset
== charset_big5
)
4302 c1
= code
>> 8, c2
= code
& 0xFF;
4303 EMIT_TWO_BYTES (c1
, c2
);
4306 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4309 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4310 coding
->produced_char
+= produced_chars
;
4311 coding
->produced
= dst
- coding
->destination
;
4316 /*** 9. CCL handlers ***/
4318 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4319 Check if a text is encoded in a coding system of which
4320 encoder/decoder are written in CCL program. If it is, return
4321 CATEGORY_MASK_CCL, else return 0. */
4324 detect_coding_ccl (coding
, detect_info
)
4325 struct coding_system
*coding
;
4326 struct coding_detection_info
*detect_info
;
4328 const unsigned char *src
= coding
->source
, *src_base
;
4329 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4330 int multibytep
= coding
->src_multibyte
;
4331 int consumed_chars
= 0;
4333 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4334 int head_ascii
= coding
->head_ascii
;
4337 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4339 coding
= &coding_categories
[coding_category_ccl
];
4340 attrs
= CODING_ID_ATTRS (coding
->id
);
4341 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4350 if (c
< 0 || ! valids
[c
])
4352 if ((valids
[c
] > 1))
4353 found
= CATEGORY_MASK_CCL
;
4355 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4359 detect_info
->found
|= found
;
4364 decode_coding_ccl (coding
)
4365 struct coding_system
*coding
;
4367 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4368 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4369 int *charbuf
= coding
->charbuf
;
4370 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4371 int consumed_chars
= 0;
4372 int multibytep
= coding
->src_multibyte
;
4373 struct ccl_program ccl
;
4374 int source_charbuf
[1024];
4375 int source_byteidx
[1024];
4376 Lisp_Object attrs
, charset_list
;
4378 CODING_GET_INFO (coding
, attrs
, charset_list
);
4379 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4381 while (src
< src_end
)
4383 const unsigned char *p
= src
;
4384 int *source
, *source_end
;
4388 while (i
< 1024 && p
< src_end
)
4390 source_byteidx
[i
] = p
- src
;
4391 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4394 while (i
< 1024 && p
< src_end
)
4395 source_charbuf
[i
++] = *p
++;
4397 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4400 source
= source_charbuf
;
4401 source_end
= source
+ i
;
4402 while (source
< source_end
)
4404 ccl_driver (&ccl
, source
, charbuf
,
4405 source_end
- source
, charbuf_end
- charbuf
,
4407 source
+= ccl
.consumed
;
4408 charbuf
+= ccl
.produced
;
4409 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4412 if (source
< source_end
)
4413 src
+= source_byteidx
[source
- source_charbuf
];
4416 consumed_chars
+= source
- source_charbuf
;
4418 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4419 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4425 case CCL_STAT_SUSPEND_BY_SRC
:
4426 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4428 case CCL_STAT_SUSPEND_BY_DST
:
4431 case CCL_STAT_INVALID_CMD
:
4432 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4435 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4438 coding
->consumed_char
+= consumed_chars
;
4439 coding
->consumed
= src
- coding
->source
;
4440 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4444 encode_coding_ccl (coding
)
4445 struct coding_system
*coding
;
4447 struct ccl_program ccl
;
4448 int multibytep
= coding
->dst_multibyte
;
4449 int *charbuf
= coding
->charbuf
;
4450 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4451 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4452 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4453 unsigned char *adjusted_dst_end
= dst_end
- 1;
4454 int destination_charbuf
[1024];
4455 int i
, produced_chars
= 0;
4456 Lisp_Object attrs
, charset_list
;
4458 CODING_GET_INFO (coding
, attrs
, charset_list
);
4459 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4461 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4462 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4464 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4466 int dst_bytes
= dst_end
- dst
;
4467 if (dst_bytes
> 1024)
4470 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4471 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4472 charbuf
+= ccl
.consumed
;
4474 for (i
= 0; i
< ccl
.produced
; i
++)
4475 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4478 for (i
= 0; i
< ccl
.produced
; i
++)
4479 *dst
++ = destination_charbuf
[i
] & 0xFF;
4480 produced_chars
+= ccl
.produced
;
4486 case CCL_STAT_SUSPEND_BY_SRC
:
4487 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4489 case CCL_STAT_SUSPEND_BY_DST
:
4490 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
4493 case CCL_STAT_INVALID_CMD
:
4494 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4497 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4501 coding
->produced_char
+= produced_chars
;
4502 coding
->produced
= dst
- coding
->destination
;
4508 /*** 10, 11. no-conversion handlers ***/
4510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4513 decode_coding_raw_text (coding
)
4514 struct coding_system
*coding
;
4516 coding
->chars_at_source
= 1;
4517 coding
->consumed_char
= 0;
4518 coding
->consumed
= 0;
4519 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4523 encode_coding_raw_text (coding
)
4524 struct coding_system
*coding
;
4526 int multibytep
= coding
->dst_multibyte
;
4527 int *charbuf
= coding
->charbuf
;
4528 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4529 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4530 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4531 int produced_chars
= 0;
4536 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4538 if (coding
->src_multibyte
)
4539 while (charbuf
< charbuf_end
)
4541 ASSURE_DESTINATION (safe_room
);
4543 if (ASCII_CHAR_P (c
))
4544 EMIT_ONE_ASCII_BYTE (c
);
4545 else if (CHAR_BYTE8_P (c
))
4547 c
= CHAR_TO_BYTE8 (c
);
4552 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4554 CHAR_STRING_ADVANCE (c
, p1
);
4557 EMIT_ONE_BYTE (*p0
);
4563 while (charbuf
< charbuf_end
)
4565 ASSURE_DESTINATION (safe_room
);
4572 if (coding
->src_multibyte
)
4574 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4576 while (charbuf
< charbuf_end
)
4578 ASSURE_DESTINATION (safe_room
);
4580 if (ASCII_CHAR_P (c
))
4582 else if (CHAR_BYTE8_P (c
))
4583 *dst
++ = CHAR_TO_BYTE8 (c
);
4585 CHAR_STRING_ADVANCE (c
, dst
);
4591 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4592 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4593 *dst
++ = *charbuf
++;
4594 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4597 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4598 coding
->produced_char
+= produced_chars
;
4599 coding
->produced
= dst
- coding
->destination
;
4603 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4604 Check if a text is encoded in a charset-based coding system. If it
4605 is, return 1, else return 0. */
4608 detect_coding_charset (coding
, detect_info
)
4609 struct coding_system
*coding
;
4610 struct coding_detection_info
*detect_info
;
4612 const unsigned char *src
= coding
->source
, *src_base
;
4613 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4614 int multibytep
= coding
->src_multibyte
;
4615 int consumed_chars
= 0;
4616 Lisp_Object attrs
, valids
;
4619 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4621 coding
= &coding_categories
[coding_category_charset
];
4622 attrs
= CODING_ID_ATTRS (coding
->id
);
4623 valids
= AREF (attrs
, coding_attr_charset_valids
);
4625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4626 src
+= coding
->head_ascii
;
4636 if (NILP (AREF (valids
, c
)))
4639 found
= CATEGORY_MASK_CHARSET
;
4641 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4645 detect_info
->found
|= found
;
4650 decode_coding_charset (coding
)
4651 struct coding_system
*coding
;
4653 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4654 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4655 const unsigned char *src_base
;
4656 int *charbuf
= coding
->charbuf
;
4657 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4658 int consumed_chars
= 0, consumed_chars_base
;
4659 int multibytep
= coding
->src_multibyte
;
4660 Lisp_Object attrs
, charset_list
, valids
;
4661 int char_offset
= coding
->produced_char
;
4662 int last_offset
= char_offset
;
4663 int last_id
= charset_ascii
;
4665 CODING_GET_INFO (coding
, attrs
, charset_list
);
4666 valids
= AREF (attrs
, coding_attr_charset_valids
);
4672 struct charset
*charset
;
4678 consumed_chars_base
= consumed_chars
;
4680 if (charbuf
>= charbuf_end
)
4688 val
= AREF (valids
, c
);
4693 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4694 dim
= CHARSET_DIMENSION (charset
);
4698 code
= (code
<< 8) | c
;
4701 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4706 /* VAL is a list of charset IDs. It is assured that the
4707 list is sorted by charset dimensions (smaller one
4711 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4712 dim
= CHARSET_DIMENSION (charset
);
4716 code
= (code
<< 8) | c
;
4719 CODING_DECODE_CHAR (coding
, src
, src_base
,
4720 src_end
, charset
, code
, c
);
4728 if (charset
->id
!= charset_ascii
4729 && last_id
!= charset
->id
)
4731 if (last_id
!= charset_ascii
)
4732 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4733 last_id
= charset
->id
;
4734 last_offset
= char_offset
;
4743 consumed_chars
= consumed_chars_base
;
4745 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4751 if (last_id
!= charset_ascii
)
4752 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4753 coding
->consumed_char
+= consumed_chars_base
;
4754 coding
->consumed
= src_base
- coding
->source
;
4755 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4759 encode_coding_charset (coding
)
4760 struct coding_system
*coding
;
4762 int multibytep
= coding
->dst_multibyte
;
4763 int *charbuf
= coding
->charbuf
;
4764 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4765 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4766 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4767 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4768 int produced_chars
= 0;
4769 Lisp_Object attrs
, charset_list
;
4770 int ascii_compatible
;
4773 CODING_GET_INFO (coding
, attrs
, charset_list
);
4774 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4776 while (charbuf
< charbuf_end
)
4778 struct charset
*charset
;
4781 ASSURE_DESTINATION (safe_room
);
4783 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4784 EMIT_ONE_ASCII_BYTE (c
);
4785 else if (CHAR_BYTE8_P (c
))
4787 c
= CHAR_TO_BYTE8 (c
);
4792 charset
= char_charset (c
, charset_list
, &code
);
4795 if (CHARSET_DIMENSION (charset
) == 1)
4796 EMIT_ONE_BYTE (code
);
4797 else if (CHARSET_DIMENSION (charset
) == 2)
4798 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4799 else if (CHARSET_DIMENSION (charset
) == 3)
4800 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4802 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4803 (code
>> 8) & 0xFF, code
& 0xFF);
4807 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4808 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4810 c
= coding
->default_char
;
4816 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4817 coding
->produced_char
+= produced_chars
;
4818 coding
->produced
= dst
- coding
->destination
;
4823 /*** 10. C library functions ***/
4825 /* Setup coding context CODING from information about CODING_SYSTEM.
4826 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4827 CODING_SYSTEM is invalid, signal an error. */
4830 setup_coding_system (coding_system
, coding
)
4831 Lisp_Object coding_system
;
4832 struct coding_system
*coding
;
4835 Lisp_Object eol_type
;
4836 Lisp_Object coding_type
;
4839 if (NILP (coding_system
))
4840 coding_system
= Qno_conversion
;
4842 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4844 attrs
= CODING_ID_ATTRS (coding
->id
);
4845 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4848 coding
->head_ascii
= -1;
4849 coding
->common_flags
4850 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4851 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4852 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4853 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4854 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4855 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
4856 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4858 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4859 coding
->max_charset_id
= SCHARS (val
) - 1;
4860 coding
->safe_charsets
= (char *) SDATA (val
);
4861 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4863 coding_type
= CODING_ATTR_TYPE (attrs
);
4864 if (EQ (coding_type
, Qundecided
))
4866 coding
->detector
= NULL
;
4867 coding
->decoder
= decode_coding_raw_text
;
4868 coding
->encoder
= encode_coding_raw_text
;
4869 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4871 else if (EQ (coding_type
, Qiso_2022
))
4874 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4876 /* Invoke graphic register 0 to plane 0. */
4877 CODING_ISO_INVOCATION (coding
, 0) = 0;
4878 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4879 CODING_ISO_INVOCATION (coding
, 1)
4880 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4881 /* Setup the initial status of designation. */
4882 for (i
= 0; i
< 4; i
++)
4883 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4884 /* Not single shifting initially. */
4885 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4886 /* Beginning of buffer should also be regarded as bol. */
4887 CODING_ISO_BOL (coding
) = 1;
4888 coding
->detector
= detect_coding_iso_2022
;
4889 coding
->decoder
= decode_coding_iso_2022
;
4890 coding
->encoder
= encode_coding_iso_2022
;
4891 if (flags
& CODING_ISO_FLAG_SAFE
)
4892 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4893 coding
->common_flags
4894 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4895 | CODING_REQUIRE_FLUSHING_MASK
);
4896 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4897 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4898 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4899 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4900 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4902 setup_iso_safe_charsets (attrs
);
4903 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4904 coding
->max_charset_id
= SCHARS (val
) - 1;
4905 coding
->safe_charsets
= (char *) SDATA (val
);
4907 CODING_ISO_FLAGS (coding
) = flags
;
4909 else if (EQ (coding_type
, Qcharset
))
4911 coding
->detector
= detect_coding_charset
;
4912 coding
->decoder
= decode_coding_charset
;
4913 coding
->encoder
= encode_coding_charset
;
4914 coding
->common_flags
4915 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4917 else if (EQ (coding_type
, Qutf_8
))
4919 coding
->detector
= detect_coding_utf_8
;
4920 coding
->decoder
= decode_coding_utf_8
;
4921 coding
->encoder
= encode_coding_utf_8
;
4922 coding
->common_flags
4923 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4925 else if (EQ (coding_type
, Qutf_16
))
4927 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4928 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4929 : EQ (val
, Qt
) ? utf_16_with_bom
4930 : utf_16_without_bom
);
4931 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4932 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4933 : utf_16_little_endian
);
4934 CODING_UTF_16_SURROGATE (coding
) = 0;
4935 coding
->detector
= detect_coding_utf_16
;
4936 coding
->decoder
= decode_coding_utf_16
;
4937 coding
->encoder
= encode_coding_utf_16
;
4938 coding
->common_flags
4939 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4940 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4941 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4943 else if (EQ (coding_type
, Qccl
))
4945 coding
->detector
= detect_coding_ccl
;
4946 coding
->decoder
= decode_coding_ccl
;
4947 coding
->encoder
= encode_coding_ccl
;
4948 coding
->common_flags
4949 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4950 | CODING_REQUIRE_FLUSHING_MASK
);
4952 else if (EQ (coding_type
, Qemacs_mule
))
4954 coding
->detector
= detect_coding_emacs_mule
;
4955 coding
->decoder
= decode_coding_emacs_mule
;
4956 coding
->encoder
= encode_coding_emacs_mule
;
4957 coding
->common_flags
4958 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4959 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4960 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4962 Lisp_Object tail
, safe_charsets
;
4963 int max_charset_id
= 0;
4965 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4967 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4968 max_charset_id
= XFASTINT (XCAR (tail
));
4969 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4971 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4973 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
4974 coding
->max_charset_id
= max_charset_id
;
4975 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
4978 else if (EQ (coding_type
, Qshift_jis
))
4980 coding
->detector
= detect_coding_sjis
;
4981 coding
->decoder
= decode_coding_sjis
;
4982 coding
->encoder
= encode_coding_sjis
;
4983 coding
->common_flags
4984 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4986 else if (EQ (coding_type
, Qbig5
))
4988 coding
->detector
= detect_coding_big5
;
4989 coding
->decoder
= decode_coding_big5
;
4990 coding
->encoder
= encode_coding_big5
;
4991 coding
->common_flags
4992 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4994 else /* EQ (coding_type, Qraw_text) */
4996 coding
->detector
= NULL
;
4997 coding
->decoder
= decode_coding_raw_text
;
4998 coding
->encoder
= encode_coding_raw_text
;
5004 /* Return raw-text or one of its subsidiaries that has the same
5005 eol_type as CODING-SYSTEM. */
5008 raw_text_coding_system (coding_system
)
5009 Lisp_Object coding_system
;
5011 Lisp_Object spec
, attrs
;
5012 Lisp_Object eol_type
, raw_text_eol_type
;
5014 if (NILP (coding_system
))
5016 spec
= CODING_SYSTEM_SPEC (coding_system
);
5017 attrs
= AREF (spec
, 0);
5019 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5020 return coding_system
;
5022 eol_type
= AREF (spec
, 2);
5023 if (VECTORP (eol_type
))
5025 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5026 raw_text_eol_type
= AREF (spec
, 2);
5027 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5028 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5029 : AREF (raw_text_eol_type
, 2));
5033 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5034 does, return one of the subsidiary that has the same eol-spec as
5035 PARENT. Otherwise, return CODING_SYSTEM. */
5038 coding_inherit_eol_type (coding_system
, parent
)
5039 Lisp_Object coding_system
, parent
;
5041 Lisp_Object spec
, eol_type
;
5043 if (NILP (coding_system
))
5044 coding_system
= Qraw_text
;
5045 spec
= CODING_SYSTEM_SPEC (coding_system
);
5046 eol_type
= AREF (spec
, 2);
5047 if (VECTORP (eol_type
)
5050 Lisp_Object parent_spec
;
5051 Lisp_Object parent_eol_type
;
5054 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5055 parent_eol_type
= AREF (parent_spec
, 2);
5056 if (EQ (parent_eol_type
, Qunix
))
5057 coding_system
= AREF (eol_type
, 0);
5058 else if (EQ (parent_eol_type
, Qdos
))
5059 coding_system
= AREF (eol_type
, 1);
5060 else if (EQ (parent_eol_type
, Qmac
))
5061 coding_system
= AREF (eol_type
, 2);
5063 return coding_system
;
5066 /* Emacs has a mechanism to automatically detect a coding system if it
5067 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5068 it's impossible to distinguish some coding systems accurately
5069 because they use the same range of codes. So, at first, coding
5070 systems are categorized into the following categories:
5072 o coding-category-emacs-mule
5074 The category for a coding system which has the same code range
5075 as Emacs' internal format. Assigned the coding-system (Lisp
5076 symbol) `emacs-mule' by default.
5078 o coding-category-sjis
5080 The category for a coding system which has the same code range
5081 as SJIS. Assigned the coding-system (Lisp
5082 symbol) `japanese-shift-jis' by default.
5084 o coding-category-iso-7
5086 The category for a coding system which has the same code range
5087 as ISO2022 of 7-bit environment. This doesn't use any locking
5088 shift and single shift functions. This can encode/decode all
5089 charsets. Assigned the coding-system (Lisp symbol)
5090 `iso-2022-7bit' by default.
5092 o coding-category-iso-7-tight
5094 Same as coding-category-iso-7 except that this can
5095 encode/decode only the specified charsets.
5097 o coding-category-iso-8-1
5099 The category for a coding system which has the same code range
5100 as ISO2022 of 8-bit environment and graphic plane 1 used only
5101 for DIMENSION1 charset. This doesn't use any locking shift
5102 and single shift functions. Assigned the coding-system (Lisp
5103 symbol) `iso-latin-1' by default.
5105 o coding-category-iso-8-2
5107 The category for a coding system which has the same code range
5108 as ISO2022 of 8-bit environment and graphic plane 1 used only
5109 for DIMENSION2 charset. This doesn't use any locking shift
5110 and single shift functions. Assigned the coding-system (Lisp
5111 symbol) `japanese-iso-8bit' by default.
5113 o coding-category-iso-7-else
5115 The category for a coding system which has the same code range
5116 as ISO2022 of 7-bit environment but uses locking shift or
5117 single shift functions. Assigned the coding-system (Lisp
5118 symbol) `iso-2022-7bit-lock' by default.
5120 o coding-category-iso-8-else
5122 The category for a coding system which has the same code range
5123 as ISO2022 of 8-bit environment but uses locking shift or
5124 single shift functions. Assigned the coding-system (Lisp
5125 symbol) `iso-2022-8bit-ss2' by default.
5127 o coding-category-big5
5129 The category for a coding system which has the same code range
5130 as BIG5. Assigned the coding-system (Lisp symbol)
5131 `cn-big5' by default.
5133 o coding-category-utf-8
5135 The category for a coding system which has the same code range
5136 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5137 symbol) `utf-8' by default.
5139 o coding-category-utf-16-be
5141 The category for a coding system in which a text has an
5142 Unicode signature (cf. Unicode Standard) in the order of BIG
5143 endian at the head. Assigned the coding-system (Lisp symbol)
5144 `utf-16-be' by default.
5146 o coding-category-utf-16-le
5148 The category for a coding system in which a text has an
5149 Unicode signature (cf. Unicode Standard) in the order of
5150 LITTLE endian at the head. Assigned the coding-system (Lisp
5151 symbol) `utf-16-le' by default.
5153 o coding-category-ccl
5155 The category for a coding system of which encoder/decoder is
5156 written in CCL programs. The default value is nil, i.e., no
5157 coding system is assigned.
5159 o coding-category-binary
5161 The category for a coding system not categorized in any of the
5162 above. Assigned the coding-system (Lisp symbol)
5163 `no-conversion' by default.
5165 Each of them is a Lisp symbol and the value is an actual
5166 `coding-system's (this is also a Lisp symbol) assigned by a user.
5167 What Emacs does actually is to detect a category of coding system.
5168 Then, it uses a `coding-system' assigned to it. If Emacs can't
5169 decide only one possible category, it selects a category of the
5170 highest priority. Priorities of categories are also specified by a
5171 user in a Lisp variable `coding-category-list'. */
5175 #define EOL_SEEN_NONE 0
5176 #define EOL_SEEN_LF 1
5177 #define EOL_SEEN_CR 2
5178 #define EOL_SEEN_CRLF 4
5180 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5181 SOURCE is encoded. If CATEGORY is one of
5182 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5183 two-byte, else they are encoded by one-byte.
5185 Return one of EOL_SEEN_XXX. */
5187 #define MAX_EOL_CHECK_COUNT 3
5190 detect_eol (source
, src_bytes
, category
)
5191 unsigned char *source
;
5192 EMACS_INT src_bytes
;
5193 enum coding_category category
;
5195 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5198 int eol_seen
= EOL_SEEN_NONE
;
5200 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5204 msb
= category
== (coding_category_utf_16_le
5205 | coding_category_utf_16_le_nosig
);
5208 while (src
+ 1 < src_end
)
5211 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5216 this_eol
= EOL_SEEN_LF
;
5217 else if (src
+ 3 >= src_end
5218 || src
[msb
+ 2] != 0
5219 || src
[lsb
+ 2] != '\n')
5220 this_eol
= EOL_SEEN_CR
;
5222 this_eol
= EOL_SEEN_CRLF
;
5224 if (eol_seen
== EOL_SEEN_NONE
)
5225 /* This is the first end-of-line. */
5226 eol_seen
= this_eol
;
5227 else if (eol_seen
!= this_eol
)
5229 /* The found type is different from what found before. */
5230 eol_seen
= EOL_SEEN_LF
;
5233 if (++total
== MAX_EOL_CHECK_COUNT
)
5241 while (src
< src_end
)
5244 if (c
== '\n' || c
== '\r')
5249 this_eol
= EOL_SEEN_LF
;
5250 else if (src
>= src_end
|| *src
!= '\n')
5251 this_eol
= EOL_SEEN_CR
;
5253 this_eol
= EOL_SEEN_CRLF
, src
++;
5255 if (eol_seen
== EOL_SEEN_NONE
)
5256 /* This is the first end-of-line. */
5257 eol_seen
= this_eol
;
5258 else if (eol_seen
!= this_eol
)
5260 /* The found type is different from what found before. */
5261 eol_seen
= EOL_SEEN_LF
;
5264 if (++total
== MAX_EOL_CHECK_COUNT
)
5274 adjust_coding_eol_type (coding
, eol_seen
)
5275 struct coding_system
*coding
;
5278 Lisp_Object eol_type
;
5280 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5281 if (eol_seen
& EOL_SEEN_LF
)
5283 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5286 else if (eol_seen
& EOL_SEEN_CRLF
)
5288 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5291 else if (eol_seen
& EOL_SEEN_CR
)
5293 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5299 /* Detect how a text specified in CODING is encoded. If a coding
5300 system is detected, update fields of CODING by the detected coding
5304 detect_coding (coding
)
5305 struct coding_system
*coding
;
5307 const unsigned char *src
, *src_end
;
5308 Lisp_Object attrs
, coding_type
;
5310 coding
->consumed
= coding
->consumed_char
= 0;
5311 coding
->produced
= coding
->produced_char
= 0;
5312 coding_set_source (coding
);
5314 src_end
= coding
->source
+ coding
->src_bytes
;
5316 /* If we have not yet decided the text encoding type, detect it
5318 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5322 for (i
= 0, src
= coding
->source
; src
< src_end
; i
++, src
++)
5325 if (c
& 0x80 || (c
< 0x20 && (c
== 0
5326 || c
== ISO_CODE_ESC
5328 || c
== ISO_CODE_SO
)))
5331 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5333 if (coding
->head_ascii
< coding
->src_bytes
)
5335 struct coding_detection_info detect_info
;
5336 enum coding_category category
;
5337 struct coding_system
*this;
5339 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5340 for (i
= 0; i
< coding_category_raw_text
; i
++)
5342 category
= coding_priorities
[i
];
5343 this = coding_categories
+ category
;
5346 /* No coding system of this category is defined. */
5347 detect_info
.rejected
|= (1 << category
);
5349 else if (category
>= coding_category_raw_text
)
5351 else if (detect_info
.checked
& (1 << category
))
5353 if (detect_info
.found
& (1 << category
))
5356 else if ((*(this->detector
)) (coding
, &detect_info
)
5357 && detect_info
.found
& (1 << category
))
5359 if (category
== coding_category_utf_16_auto
)
5361 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5362 category
= coding_category_utf_16_le
;
5364 category
= coding_category_utf_16_be
;
5369 if (i
< coding_category_raw_text
)
5370 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5371 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5372 setup_coding_system (Qraw_text
, coding
);
5373 else if (detect_info
.rejected
)
5374 for (i
= 0; i
< coding_category_raw_text
; i
++)
5375 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5377 this = coding_categories
+ coding_priorities
[i
];
5378 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5383 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
5384 == coding_category_utf_16_auto
)
5386 Lisp_Object coding_systems
;
5387 struct coding_detection_info detect_info
;
5390 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5391 detect_info
.found
= detect_info
.rejected
= 0;
5392 if (CONSP (coding_systems
)
5393 && detect_coding_utf_16 (coding
, &detect_info
))
5395 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5396 setup_coding_system (XCAR (coding_systems
), coding
);
5397 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
5398 setup_coding_system (XCDR (coding_systems
), coding
);
5406 struct coding_system
*coding
;
5408 Lisp_Object eol_type
;
5409 unsigned char *p
, *pbeg
, *pend
;
5411 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5412 if (EQ (eol_type
, Qunix
))
5415 if (NILP (coding
->dst_object
))
5416 pbeg
= coding
->destination
;
5418 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
5419 pend
= pbeg
+ coding
->produced
;
5421 if (VECTORP (eol_type
))
5423 int eol_seen
= EOL_SEEN_NONE
;
5425 for (p
= pbeg
; p
< pend
; p
++)
5428 eol_seen
|= EOL_SEEN_LF
;
5429 else if (*p
== '\r')
5431 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5433 eol_seen
|= EOL_SEEN_CRLF
;
5437 eol_seen
|= EOL_SEEN_CR
;
5440 if (eol_seen
!= EOL_SEEN_NONE
5441 && eol_seen
!= EOL_SEEN_LF
5442 && eol_seen
!= EOL_SEEN_CRLF
5443 && eol_seen
!= EOL_SEEN_CR
)
5444 eol_seen
= EOL_SEEN_LF
;
5445 if (eol_seen
!= EOL_SEEN_NONE
)
5446 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
5449 if (EQ (eol_type
, Qmac
))
5451 for (p
= pbeg
; p
< pend
; p
++)
5455 else if (EQ (eol_type
, Qdos
))
5459 if (NILP (coding
->dst_object
))
5461 for (p
= pend
- 2; p
>= pbeg
; p
--)
5464 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
-- - p
- 1);
5470 for (p
= pend
- 2; p
>= pbeg
; p
--)
5473 int pos_byte
= coding
->dst_pos_byte
+ (p
- pbeg
);
5474 int pos
= BYTE_TO_CHAR (pos_byte
);
5476 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
5480 coding
->produced
-= n
;
5481 coding
->produced_char
-= n
;
5486 translate_chars (coding
, table
)
5487 struct coding_system
*coding
;
5490 int *charbuf
= coding
->charbuf
;
5491 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5494 if (coding
->chars_at_source
)
5497 while (charbuf
< charbuf_end
)
5503 *charbuf
++ = translate_char (table
, c
);
/* Write decoded characters at CODING->destination + CODING->produced.
   When CODING->chars_at_source is zero the characters are taken from
   CODING->charbuf; otherwise the bytes are copied from CODING->source
   while CR / CRLF are handled according to the eol-type of CODING.
   The destination is enlarged through alloc_destination when it runs
   short.  Finally CODING->produced and CODING->produced_char are
   advanced (the text is handed to insert_from_gap first when the
   destination is a buffer), and the number of produced characters is
   returned.
   NOTE(review): short lines (braces, `else', brief statements) are
   absent from this listing, so the exact control structure of the
   end-of-line branches must be confirmed against the full file.  */
5508 produce_chars (coding
)
5509 struct coding_system
*coding
;
5511 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5512 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5514 int produced_chars
= 0;
5516 if (! coding
->chars_at_source
)
5518 /* Characters are in coding->charbuf. */
5519 int *buf
= coding
->charbuf
;
5520 int *buf_end
= buf
+ coding
->charbuf_used
;
5521 unsigned char *adjusted_dst_end
;
5523 if (BUFFERP (coding
->src_object
)
5524 && EQ (coding
->src_object
, coding
->dst_object
))
5525 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
5526 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5528 while (buf
< buf_end
)
5532 if (dst
>= adjusted_dst_end
)
5534 dst
= alloc_destination (coding
,
5535 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5537 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5538 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5542 if (coding
->dst_multibyte
5543 || ! CHAR_BYTE8_P (c
))
5544 CHAR_STRING_ADVANCE (c
, dst
);
5546 *dst
++ = CHAR_TO_BYTE8 (c
);
5550 /* This is an annotation datum. (-C) is the length of it. */
5557 const unsigned char *src
= coding
->source
;
5558 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5559 Lisp_Object eol_type
;
5561 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5563 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5565 if (coding
->src_multibyte
)
5572 const unsigned char *src_base
= src
;
5578 if (EQ (eol_type
, Qdos
))
5582 record_conversion_result
5583 (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5584 goto no_more_source
;
5589 else if (EQ (eol_type
, Qmac
))
5594 coding
->consumed
= src
- coding
->source
;
5596 if (EQ (coding
->src_object
, coding
->dst_object
))
5597 dst_end
= (unsigned char *) src
;
5600 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5602 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5603 coding_set_source (coding
);
5604 src
= coding
->source
+ coding
->consumed
;
5605 src_end
= coding
->source
+ coding
->src_bytes
;
5615 while (src
< src_end
)
5622 if (EQ (eol_type
, Qdos
))
5628 else if (EQ (eol_type
, Qmac
))
5631 if (dst
>= dst_end
- 1)
5633 coding
->consumed
= src
- coding
->source
;
5635 if (EQ (coding
->src_object
, coding
->dst_object
))
5636 dst_end
= (unsigned char *) src
;
5637 if (dst
>= dst_end
- 1)
5639 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5641 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5642 coding_set_source (coding
);
5643 src
= coding
->source
+ coding
->consumed
;
5644 src_end
= coding
->source
+ coding
->src_bytes
;
5652 if (!EQ (coding
->src_object
, coding
->dst_object
))
5654 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5658 EMACS_INT offset
= src
- coding
->source
;
5660 dst
= alloc_destination (coding
, require
, dst
);
5661 coding_set_source (coding
);
5662 src
= coding
->source
+ offset
;
5663 src_end
= coding
->source
+ coding
->src_bytes
;
5666 produced_chars
= coding
->src_chars
;
5667 while (src
< src_end
)
5673 if (EQ (eol_type
, Qdos
))
5680 else if (EQ (eol_type
, Qmac
))
5686 coding
->consumed
= coding
->src_bytes
;
5687 coding
->consumed_char
= coding
->src_chars
;
5690 produced
= dst
- (coding
->destination
+ coding
->produced
);
5691 if (BUFFERP (coding
->dst_object
))
5692 insert_from_gap (produced_chars
, produced
);
5693 coding
->produced
+= produced
;
5694 coding
->produced_char
+= produced_chars
;
5695 return produced_chars
;
5698 /* Compose text in CODING->dst_object according to the annotation
5699 data at CHARBUF. CHARBUF is an array:
5700 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
   FROM and TO are offsets relative to CODING->dst_pos.  For
   COMPOSITION_RELATIVE no component list is built; for the other
   methods the components are collected into a string
   (COMPOSITION_WITH_ALTCHARS) or a vector, and the result is handed
   to compose_text.
   NOTE(review): the statements that set LEN and advance CHARBUF past
   the header are missing from this listing -- confirm the header
   length against the full file.  */
5704 produce_composition (coding
, charbuf
)
5705 struct coding_system
*coding
;
5710 enum composition_method method
;
5711 Lisp_Object components
;
5714 from
= coding
->dst_pos
+ charbuf
[2];
5715 to
= coding
->dst_pos
+ charbuf
[3];
5716 method
= (enum composition_method
) (charbuf
[4]);
5718 if (method
== COMPOSITION_RELATIVE
)
5722 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5727 for (i
= 0; i
< len
; i
++)
5728 args
[i
] = make_number (charbuf
[i
]);
5729 components
= (method
== COMPOSITION_WITH_ALTCHARS
5730 ? Fstring (len
, args
) : Fvector (len
, args
));
5732 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5736 /* Put `charset' property on text in CODING->object according to
5737 the annotation data at CHARBUF. CHARBUF is an array:
5738 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5742 produce_charset (coding
, charbuf
)
5743 struct coding_system
*coding
;
5746 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5747 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5748 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5750 Fput_text_property (make_number (from
), make_number (to
),
5751 Qcharset
, CHARSET_NAME (charset
),
5752 coding
->dst_object
);
/* Number of `int' elements first tried for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf on the stack with alloca, trying
   CHARBUF_SIZE elements first and a quarter of the previous size on
   each failure, giving up once the size drops to 1024 or below.  On
   success CODING->charbuf_size is set to the element count.  On
   failure CODING_RESULT_INSUFFICIENT_MEM is recorded and the
   ENCLOSING FUNCTION RETURNS CODING->result, so this macro can be
   used only in functions returning that value.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)				\
  do {									\
    int size = CHARBUF_SIZE;						\
									\
    (coding)->charbuf = NULL;						\
    while (size > 1024)							\
      {									\
	(coding)->charbuf = (int *) alloca (sizeof (int) * size);	\
	if ((coding)->charbuf)						\
	  break;							\
	size >>= 2;							\
      }									\
    if (! (coding)->charbuf)						\
      {									\
	record_conversion_result ((coding),				\
				  CODING_RESULT_INSUFFICIENT_MEM);	\
	return (coding)->result;					\
      }									\
    (coding)->charbuf_size = size;					\
  } while (0)
5780 produce_annotation (coding
)
5781 struct coding_system
*coding
;
5783 int *charbuf
= coding
->charbuf
;
5784 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5786 if (NILP (coding
->dst_object
))
5789 while (charbuf
< charbuf_end
)
5795 int len
= -*charbuf
;
5798 case CODING_ANNOTATE_COMPOSITION_MASK
:
5799 produce_composition (coding
, charbuf
);
5801 case CODING_ANNOTATE_CHARSET_MASK
:
5802 produce_charset (coding
, charbuf
);
5812 /* Decode the data at CODING->src_object into CODING->dst_object.
5813 CODING->src_object is a buffer, a string, or nil.
5814 CODING->dst_object is a buffer.
5816 If CODING->src_object is a buffer, it must be the current buffer.
5817 In this case, if CODING->src_pos is positive, it is a position of
5818 the source text in the buffer, otherwise, the source text is in the
5819 gap area of the buffer, and CODING->src_pos specifies the offset of
5820 the text from GPT (which must be the same as PT). If this is the
5821 same buffer as CODING->dst_object, CODING->src_pos must be
5824 If CODING->src_object is a string, CODING->src_pos in an index to
5827 If CODING->src_object is nil, CODING->source must already point to
5828 the non-relocatable memory area. In this case, CODING->src_pos is
5829 an offset from CODING->source.
5831 The decoded data is inserted at the current point of the buffer
5836 decode_coding (coding
)
5837 struct coding_system
*coding
;
5840 Lisp_Object undo_list
;
5842 if (BUFFERP (coding
->src_object
)
5843 && coding
->src_pos
> 0
5844 && coding
->src_pos
< GPT
5845 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5846 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5849 if (BUFFERP (coding
->dst_object
))
5851 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5852 set_buffer_internal (XBUFFER (coding
->dst_object
));
5854 move_gap_both (PT
, PT_BYTE
);
5855 undo_list
= current_buffer
->undo_list
;
5856 current_buffer
->undo_list
= Qt
;
5859 coding
->consumed
= coding
->consumed_char
= 0;
5860 coding
->produced
= coding
->produced_char
= 0;
5861 coding
->chars_at_source
= 0;
5862 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5865 ALLOC_CONVERSION_WORK_AREA (coding
);
5867 attrs
= CODING_ID_ATTRS (coding
->id
);
5871 coding_set_source (coding
);
5872 coding
->annotated
= 0;
5873 (*(coding
->decoder
)) (coding
);
5874 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5875 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5876 else if (!NILP (Vstandard_translation_table_for_decode
))
5877 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5878 coding_set_destination (coding
);
5879 produce_chars (coding
);
5880 if (coding
->annotated
)
5881 produce_annotation (coding
);
5883 while (coding
->consumed
< coding
->src_bytes
5884 && ! coding
->result
);
5886 coding
->carryover_bytes
= 0;
5887 if (coding
->consumed
< coding
->src_bytes
)
5889 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5890 const unsigned char *src
;
5892 coding_set_source (coding
);
5893 coding_set_destination (coding
);
5894 src
= coding
->source
+ coding
->consumed
;
5896 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5898 /* Flush out unprocessed data as binary chars. We are sure
5899 that the number of data is less than the size of
5901 coding
->charbuf_used
= 0;
5902 while (nbytes
-- > 0)
5906 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5908 produce_chars (coding
);
5912 /* Record unprocessed bytes in coding->carryover. We are
5913 sure that the number of data is less than the size of
5914 coding->carryover. */
5915 unsigned char *p
= coding
->carryover
;
5917 coding
->carryover_bytes
= nbytes
;
5918 while (nbytes
-- > 0)
5921 coding
->consumed
= coding
->src_bytes
;
5924 if (BUFFERP (coding
->dst_object
))
5926 current_buffer
->undo_list
= undo_list
;
5927 record_insert (coding
->dst_pos
, coding
->produced_char
);
5929 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5930 decode_eol (coding
);
5931 return coding
->result
;
5935 /* Extract an annotation datum from a composition starting at POS and
5936 ending before LIMIT of CODING->src_object (buffer or string), store
5937 the data in BUF, set *STOP to a starting position of the next
5938 composition (if any) or to LIMIT, and return the address of the
5939 next element of BUF.
5941 If such an annotation is not found, set *STOP to a starting
5942 position of a composition after POS (if any) or to LIMIT, and
5946 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5947 EMACS_INT pos
, limit
;
5948 struct coding_system
*coding
;
5952 EMACS_INT start
, end
;
5955 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5958 else if (start
> pos
)
5964 /* We found a composition. Store the corresponding
5965 annotation data in BUF. */
5967 enum composition_method method
= COMPOSITION_METHOD (prop
);
5968 int nchars
= COMPOSITION_LENGTH (prop
);
5970 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5971 if (method
!= COMPOSITION_RELATIVE
)
5973 Lisp_Object components
;
5976 components
= COMPOSITION_COMPONENTS (prop
);
5977 if (VECTORP (components
))
5979 len
= XVECTOR (components
)->size
;
5980 for (i
= 0; i
< len
; i
++)
5981 *buf
++ = XINT (AREF (components
, i
));
5983 else if (STRINGP (components
))
5985 len
= SCHARS (components
);
5989 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5993 else if (INTEGERP (components
))
5996 *buf
++ = XINT (components
);
5998 else if (CONSP (components
))
6000 for (len
= 0; CONSP (components
);
6001 len
++, components
= XCDR (components
))
6002 *buf
++ = XINT (XCAR (components
));
6010 if (find_composition (end
, limit
, &start
, &end
, &prop
,
6021 /* Extract an annotation datum from a text property `charset' at POS of
6022 CODING->src_object (buffer of string), store the data in BUF, set
6023 *STOP to the position where the value of `charset' property changes
6024 (limiting by LIMIT), and return the address of the next element of
6027 If the property value is nil, set *STOP to the position where the
6028 property value is non-nil (limiting by LIMIT), and return BUF. */
6031 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
6032 EMACS_INT pos
, limit
;
6033 struct coding_system
*coding
;
6037 Lisp_Object val
, next
;
6040 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
6041 if (! NILP (val
) && CHARSETP (val
))
6042 id
= XINT (CHARSET_SYMBOL_ID (val
));
6045 ADD_CHARSET_DATA (buf
, 0, 0, id
);
6046 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
6048 make_number (limit
));
6049 *stop
= XINT (next
);
6055 consume_chars (coding
)
6056 struct coding_system
*coding
;
6058 int *buf
= coding
->charbuf
;
6059 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6060 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6061 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6062 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6063 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6064 int multibytep
= coding
->src_multibyte
;
6065 Lisp_Object eol_type
;
6067 EMACS_INT stop
, stop_composition
, stop_charset
;
6069 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6070 if (VECTORP (eol_type
))
6073 /* Note: composition handling is not yet implemented. */
6074 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6076 if (NILP (coding
->src_object
))
6077 stop
= stop_composition
= stop_charset
= end_pos
;
6080 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6081 stop
= stop_composition
= pos
;
6083 stop
= stop_composition
= end_pos
;
6084 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6085 stop
= stop_charset
= pos
;
6087 stop_charset
= end_pos
;
6090 /* Compensate for CRLF and conversion. */
6091 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6092 while (buf
< buf_end
)
6098 if (pos
== stop_composition
)
6099 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6100 buf
, &stop_composition
);
6101 if (pos
== stop_charset
)
6102 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6103 buf
, &stop_charset
);
6104 stop
= (stop_composition
< stop_charset
6105 ? stop_composition
: stop_charset
);
6112 if (! CODING_FOR_UNIBYTE (coding
)
6113 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6114 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6119 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6120 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6122 if (! EQ (eol_type
, Qunix
))
6126 if (EQ (eol_type
, Qdos
))
6135 coding
->consumed
= src
- coding
->source
;
6136 coding
->consumed_char
= pos
- coding
->src_pos
;
6137 coding
->charbuf_used
= buf
- coding
->charbuf
;
6138 coding
->chars_at_source
= 0;
6142 /* Encode the text at CODING->src_object into CODING->dst_object.
6143 CODING->src_object is a buffer or a string.
6144 CODING->dst_object is a buffer or nil.
6146 If CODING->src_object is a buffer, it must be the current buffer.
6147 In this case, if CODING->src_pos is positive, it is a position of
6148 the source text in the buffer, otherwise. the source text is in the
6149 gap area of the buffer, and coding->src_pos specifies the offset of
6150 the text from GPT (which must be the same as PT). If this is the
6151 same buffer as CODING->dst_object, CODING->src_pos must be
6152 negative and CODING should not have `pre-write-conversion'.
6154 If CODING->src_object is a string, CODING should not have
6155 `pre-write-conversion'.
6157 If CODING->dst_object is a buffer, the encoded data is inserted at
6158 the current point of that buffer.
6160 If CODING->dst_object is nil, the encoded data is placed at the
6161 memory area specified by CODING->destination. */
6164 encode_coding (coding
)
6165 struct coding_system
*coding
;
6169 attrs
= CODING_ID_ATTRS (coding
->id
);
6171 if (BUFFERP (coding
->dst_object
))
6173 set_buffer_internal (XBUFFER (coding
->dst_object
));
6174 coding
->dst_multibyte
6175 = ! NILP (current_buffer
->enable_multibyte_characters
);
6178 coding
->consumed
= coding
->consumed_char
= 0;
6179 coding
->produced
= coding
->produced_char
= 0;
6180 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
6183 ALLOC_CONVERSION_WORK_AREA (coding
);
6186 coding_set_source (coding
);
6187 consume_chars (coding
);
6189 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6190 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6191 else if (!NILP (Vstandard_translation_table_for_encode
))
6192 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6194 coding_set_destination (coding
);
6195 (*(coding
->encoder
)) (coding
);
6196 } while (coding
->consumed_char
< coding
->src_chars
);
6198 if (BUFFERP (coding
->dst_object
))
6199 insert_from_gap (coding
->produced_char
, coding
->produced
);
6201 return (coding
->result
);
6205 /* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create and,
   for additional buffers, to Fgenerate_new_buffer_name. */
6206 static Lisp_Object Vcode_conversion_workbuf_name
;
6208 /* A working buffer used by the top level conversion. Once it is
6209 created, it is never destroyed. It has the name
6210 Vcode_conversion_workbuf_name. The other working buffers are
6211 destroyed after the use is finished, and their names are modified
6212 versions of Vcode_conversion_workbuf_name. */
6213 static Lisp_Object Vcode_conversion_reused_workbuf
;
6215 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when the reused buffer is
   released. */
6216 static int reused_workbuf_in_use
;
6219 /* Return a working buffer for code conversion. MULTIBYTE specifies the
6220 multibyteness of the returned buffer. */
6223 make_conversion_work_buffer (multibyte
)
6225 Lisp_Object name
, workbuf
;
6226 struct buffer
*current
;
/* The post-increment tests the previous value of the in-use flag: when
   it was already non-zero the reusable buffer is taken, so a buffer with
   a freshly generated name (derived from Vcode_conversion_workbuf_name)
   is created instead.  */
6228 if (reused_workbuf_in_use
++)
6230 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
6231 workbuf
= Fget_buffer_create (name
);
/* NOTE(review): the lines between 6231 and 6235 are absent from this
   listing; what follows is presumably the else branch (flag was zero),
   which uses the base name and remembers the buffer in
   Vcode_conversion_reused_workbuf if that is still nil.  */
6235 name
= Vcode_conversion_workbuf_name
;
6236 workbuf
= Fget_buffer_create (name
);
6237 if (NILP (Vcode_conversion_reused_workbuf
))
6238 Vcode_conversion_reused_workbuf
= workbuf
;
/* Temporarily make WORKBUF current to set its undo_list to Qt and its
   enable_multibyte_characters according to MULTIBYTE, then switch back
   to the buffer that was current on entry.  */
6240 current
= current_buffer
;
6241 set_buffer_internal (XBUFFER (workbuf
));
6243 current_buffer
->undo_list
= Qt
;
6244 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
6245 set_buffer_internal (current
);
/* Unwind handler installed by code_conversion_save.  ARG is a cons
   (CURRENT . WORKBUF): CURRENT is the buffer that was current when the
   handler was recorded, WORKBUF is a work buffer or nil.  */
6251 code_conversion_restore (arg
)
6254 Lisp_Object current
, workbuf
;
6256 current
= XCAR (arg
);
6257 workbuf
= XCDR (arg
);
6258 if (! NILP (workbuf
))
/* The reusable work buffer is only marked as free again; any other work
   buffer is killed if it is still live.  */
6260 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
6261 reused_workbuf_in_use
= 0;
6262 else if (! NILP (Fbuffer_live_p (workbuf
)))
6263 Fkill_buffer (workbuf
);
/* Finally make CURRENT the current buffer again.  */
6265 set_buffer_internal (XBUFFER (current
));
/* Record an unwind-protect entry that calls code_conversion_restore with
   (CURRENT-BUFFER . WORKBUF), so that unbinding restores the current
   buffer and releases the work buffer, if any.
   NOTE(review): the guard around the make_conversion_work_buffer call
   and the return statement are not present in this listing; presumably
   the work buffer is made only when WITH_WORK_BUF is non-zero and
   WORKBUF is returned (callers assign the result to a Lisp_Object) --
   confirm against the full source.  */
6270 code_conversion_save (with_work_buf
, multibyte
)
6271 int with_work_buf
, multibyte
;
6273 Lisp_Object workbuf
= Qnil
;
6276 workbuf
= make_conversion_work_buffer (multibyte
);
6277 record_unwind_protect (code_conversion_restore
,
6278 Fcons (Fcurrent_buffer (), workbuf
));
/* Decode CHARS characters / BYTES bytes with CODING, using the current
   buffer as both source and destination and point as the destination
   position.  Returns CODING->result.  */
6283 decode_coding_gap (coding
, chars
, bytes
)
6284 struct coding_system
*coding
;
6285 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so the unwind entry recorded by
   code_conversion_save can be run by unbind_to at the end.  */
6287 int count
= specpdl_ptr
- specpdl
;
6290 code_conversion_save (0, 0);
6292 coding
->src_object
= Fcurrent_buffer ();
6293 coding
->src_chars
= chars
;
6294 coding
->src_bytes
= bytes
;
/* NOTE(review): the source position is set to the negated lengths;
   presumably this tells coding_set_source that the text is not at an
   ordinary buffer position (the function name suggests the gap) --
   confirm.  */
6295 coding
->src_pos
= -chars
;
6296 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source has multibyte sequences.  */
6297 coding
->src_multibyte
= chars
< bytes
;
6298 coding
->dst_object
= coding
->src_object
;
6299 coding
->dst_pos
= PT
;
6300 coding
->dst_pos_byte
= PT_BYTE
;
6301 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6302 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
/* Run detection first when CODING asks for it, then decode.  */
6304 if (CODING_REQUIRE_DETECTION (coding
))
6305 detect_coding (coding
);
6307 decode_coding (coding
);
/* If the coding system has a post-read function, call it with point at
   the destination position and the number of produced characters as
   argument, then adjust the produced counts by however much the buffer
   size (Z / Z_BYTE) changed during the call.  */
6309 attrs
= CODING_ID_ATTRS (coding
->id
);
6310 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6312 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6315 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6316 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6317 make_number (coding
->produced_char
));
6319 coding
->produced_char
+= Z
- prev_Z
;
6320 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6323 unbind_to (count
, Qnil
);
6324 return coding
->result
;
/* Encode CHARS characters / BYTES bytes with CODING, using the current
   buffer as both source and destination and point as the destination
   position.  Returns CODING->result.  Unlike decode_coding_gap, the
   visible code here sets neither dst_multibyte nor
   CODING_MODE_LAST_BLOCK and runs no detection or post-processing.  */
6328 encode_coding_gap (coding
, chars
, bytes
)
6329 struct coding_system
*coding
;
6330 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so the unwind entry recorded by
   code_conversion_save can be run by unbind_to at the end.  */
6332 int count
= specpdl_ptr
- specpdl
;
6334 code_conversion_save (0, 0);
6336 coding
->src_object
= Fcurrent_buffer ();
6337 coding
->src_chars
= chars
;
6338 coding
->src_bytes
= bytes
;
/* NOTE(review): negated lengths as source position, same convention as
   in decode_coding_gap -- confirm its meaning in coding_set_source.  */
6339 coding
->src_pos
= -chars
;
6340 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source has multibyte sequences.  */
6341 coding
->src_multibyte
= chars
< bytes
;
6342 coding
->dst_object
= coding
->src_object
;
6343 coding
->dst_pos
= PT
;
6344 coding
->dst_pos_byte
= PT_BYTE
;
6346 encode_coding (coding
);
6348 unbind_to (count
, Qnil
);
6349 return coding
->result
;
6353 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6354 SRC_OBJECT into DST_OBJECT by coding context CODING.
6356 SRC_OBJECT is a buffer, a string, or Qnil.
6358 If it is a buffer, the text is at point of the buffer. FROM and TO
6359 are positions in the buffer.
6361 If it is a string, the text is at the beginning of the string.
6362 FROM and TO are indices to the string.
6364 If it is nil, the text is at coding->source. FROM and TO are
6365 indices to coding->source.
6367 DST_OBJECT is a buffer, Qt, or Qnil.
6369 If it is a buffer, the decoded text is inserted at point of the
6370 buffer. If the buffer is the same as SRC_OBJECT, the source text
6373 If it is Qt, a string is made from the decoded text, and
6374 set in CODING->dst_object.
6376 If it is Qnil, the decoded text is stored at CODING->destination.
6377 The caller must allocate CODING->dst_bytes bytes at
6378 CODING->destination by xmalloc. If the decoded text is longer than
6379 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6383 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6385 struct coding_system
*coding
;
6386 Lisp_Object src_object
;
6387 EMACS_INT from
, from_byte
, to
, to_byte
;
6388 Lisp_Object dst_object
;
6390 int count
= specpdl_ptr
- specpdl
;
6391 unsigned char *destination
;
6392 EMACS_INT dst_bytes
;
6393 EMACS_INT chars
= to
- from
;
6394 EMACS_INT bytes
= to_byte
- from_byte
;
6397 int saved_pt
= -1, saved_pt_byte
;
6399 buffer
= Fcurrent_buffer ();
6401 if (NILP (dst_object
))
6403 destination
= coding
->destination
;
6404 dst_bytes
= coding
->dst_bytes
;
6407 coding
->src_object
= src_object
;
6408 coding
->src_chars
= chars
;
6409 coding
->src_bytes
= bytes
;
6410 coding
->src_multibyte
= chars
< bytes
;
6412 if (STRINGP (src_object
))
6414 coding
->src_pos
= from
;
6415 coding
->src_pos_byte
= from_byte
;
6417 else if (BUFFERP (src_object
))
6419 set_buffer_internal (XBUFFER (src_object
));
6421 move_gap_both (from
, from_byte
);
6422 if (EQ (src_object
, dst_object
))
6424 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6425 TEMP_SET_PT_BOTH (from
, from_byte
);
6426 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6427 coding
->src_pos
= -chars
;
6428 coding
->src_pos_byte
= -bytes
;
6432 coding
->src_pos
= from
;
6433 coding
->src_pos_byte
= from_byte
;
6437 if (CODING_REQUIRE_DETECTION (coding
))
6438 detect_coding (coding
);
6439 attrs
= CODING_ID_ATTRS (coding
->id
);
6441 if (EQ (dst_object
, Qt
)
6442 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6443 && NILP (dst_object
)))
6445 coding
->dst_object
= code_conversion_save (1, 1);
6446 coding
->dst_pos
= BEG
;
6447 coding
->dst_pos_byte
= BEG_BYTE
;
6448 coding
->dst_multibyte
= 1;
6450 else if (BUFFERP (dst_object
))
6452 code_conversion_save (0, 0);
6453 coding
->dst_object
= dst_object
;
6454 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6455 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6456 coding
->dst_multibyte
6457 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6461 code_conversion_save (0, 0);
6462 coding
->dst_object
= Qnil
;
6463 coding
->dst_multibyte
= 1;
6466 decode_coding (coding
);
6468 if (BUFFERP (coding
->dst_object
))
6469 set_buffer_internal (XBUFFER (coding
->dst_object
));
6471 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6473 struct gcpro gcpro1
, gcpro2
;
6474 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6477 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6478 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6479 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6480 make_number (coding
->produced_char
));
6483 coding
->produced_char
+= Z
- prev_Z
;
6484 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6487 if (EQ (dst_object
, Qt
))
6489 coding
->dst_object
= Fbuffer_string ();
6491 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6493 set_buffer_internal (XBUFFER (coding
->dst_object
));
6494 if (dst_bytes
< coding
->produced
)
6497 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6500 record_conversion_result (coding
,
6501 CODING_RESULT_INSUFFICIENT_DST
);
6502 unbind_to (count
, Qnil
);
6505 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6506 move_gap_both (BEGV
, BEGV_BYTE
);
6507 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6508 coding
->destination
= destination
;
6514 /* This is the case of:
6515 (BUFFERP (src_object) && EQ (src_object, dst_object))
6516 As we have moved PT while replacing the original buffer
6517 contents, we must recover it now. */
6518 set_buffer_internal (XBUFFER (src_object
));
6519 if (saved_pt
< from
)
6520 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6521 else if (saved_pt
< from
+ chars
)
6522 TEMP_SET_PT_BOTH (from
, from_byte
);
6523 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6524 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6525 saved_pt_byte
+ (coding
->produced
- bytes
));
6527 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6528 saved_pt_byte
+ (coding
->produced
- bytes
));
6531 unbind_to (count
, coding
->dst_object
);
6536 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6538 struct coding_system
*coding
;
6539 Lisp_Object src_object
;
6540 EMACS_INT from
, from_byte
, to
, to_byte
;
6541 Lisp_Object dst_object
;
6543 int count
= specpdl_ptr
- specpdl
;
6544 EMACS_INT chars
= to
- from
;
6545 EMACS_INT bytes
= to_byte
- from_byte
;
6548 int saved_pt
= -1, saved_pt_byte
;
6550 buffer
= Fcurrent_buffer ();
6552 coding
->src_object
= src_object
;
6553 coding
->src_chars
= chars
;
6554 coding
->src_bytes
= bytes
;
6555 coding
->src_multibyte
= chars
< bytes
;
6557 attrs
= CODING_ID_ATTRS (coding
->id
);
6559 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6561 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
6562 set_buffer_internal (XBUFFER (coding
->src_object
));
6563 if (STRINGP (src_object
))
6564 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6565 else if (BUFFERP (src_object
))
6566 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6568 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6570 if (EQ (src_object
, dst_object
))
6572 set_buffer_internal (XBUFFER (src_object
));
6573 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6574 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6575 set_buffer_internal (XBUFFER (coding
->src_object
));
6578 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6579 make_number (BEG
), make_number (Z
));
6580 coding
->src_object
= Fcurrent_buffer ();
6582 move_gap_both (BEG
, BEG_BYTE
);
6583 coding
->src_chars
= Z
- BEG
;
6584 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6585 coding
->src_pos
= BEG
;
6586 coding
->src_pos_byte
= BEG_BYTE
;
6587 coding
->src_multibyte
= Z
< Z_BYTE
;
6589 else if (STRINGP (src_object
))
6591 code_conversion_save (0, 0);
6592 coding
->src_pos
= from
;
6593 coding
->src_pos_byte
= from_byte
;
6595 else if (BUFFERP (src_object
))
6597 code_conversion_save (0, 0);
6598 set_buffer_internal (XBUFFER (src_object
));
6599 if (EQ (src_object
, dst_object
))
6601 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6602 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6603 coding
->src_pos
= 0;
6604 coding
->src_pos_byte
= 0;
6608 if (from
< GPT
&& to
>= GPT
)
6609 move_gap_both (from
, from_byte
);
6610 coding
->src_pos
= from
;
6611 coding
->src_pos_byte
= from_byte
;
6615 code_conversion_save (0, 0);
6617 if (BUFFERP (dst_object
))
6619 coding
->dst_object
= dst_object
;
6620 if (EQ (src_object
, dst_object
))
6622 coding
->dst_pos
= from
;
6623 coding
->dst_pos_byte
= from_byte
;
6627 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6628 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6630 coding
->dst_multibyte
6631 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6633 else if (EQ (dst_object
, Qt
))
6635 coding
->dst_object
= Qnil
;
6636 coding
->dst_bytes
= coding
->src_chars
;
6637 if (coding
->dst_bytes
== 0)
6638 coding
->dst_bytes
= 1;
6639 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6640 coding
->dst_multibyte
= 0;
6644 coding
->dst_object
= Qnil
;
6645 coding
->dst_multibyte
= 0;
6648 encode_coding (coding
);
6650 if (EQ (dst_object
, Qt
))
6652 if (BUFFERP (coding
->dst_object
))
6653 coding
->dst_object
= Fbuffer_string ();
6657 = make_unibyte_string ((char *) coding
->destination
,
6659 xfree (coding
->destination
);
6665 /* This is the case of:
6666 (BUFFERP (src_object) && EQ (src_object, dst_object))
6667 As we have moved PT while replacing the original buffer
6668 contents, we must recover it now. */
6669 set_buffer_internal (XBUFFER (src_object
));
6670 if (saved_pt
< from
)
6671 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6672 else if (saved_pt
< from
+ chars
)
6673 TEMP_SET_PT_BOTH (from
, from_byte
);
6674 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6675 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6676 saved_pt_byte
+ (coding
->produced
- bytes
));
6678 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6679 saved_pt_byte
+ (coding
->produced
- bytes
));
6682 unbind_to (count
, Qnil
);
/* Return the name (via CODING_ID_NAME) of the coding system registered
   in coding_categories for the category stored first in
   coding_priorities.  */
6687 preferred_coding_system ()
6689 int id
= coding_categories
[coding_priorities
[0]].id
;
6691 return CODING_ID_NAME (id
);
6696 /*** 8. Emacs Lisp library functions ***/
/* Lisp predicate: the result is Qt when the argument is nil or satisfies
   CODING_SYSTEM_P, and Qnil otherwise.  */
6698 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6699 doc
: /* Return t if OBJECT is nil or a coding-system.
6700 See the documentation of `define-coding-system' for information
6701 about coding-system objects. */)
6705 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
/* Read a coding system name with Fcompleting_read over
   Vcoding_system_alist (using Qcoding_system_history as history) and
   return it interned as a symbol.  The trailing `while (...);' shows the
   read is the body of a do-while that repeats as long as the input is
   empty (the `do' line itself is absent from this listing).  */
6708 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6709 Sread_non_nil_coding_system
, 1, 1, 0,
6710 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6717 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6718 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6720 while (SCHARS (val
) == 0);
6721 return (Fintern (val
, Qnil
));
/* Read a coding system name with Fcompleting_read over
   Vcoding_system_alist, passing DEFAULT-CODING-SYSTEM as the default,
   and return the interned symbol.  */
6724 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6725 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6726 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6727 (prompt
, default_coding_system
)
6728 Lisp_Object prompt
, default_coding_system
;
/* A symbol default is replaced by its name string before being handed to
   Fcompleting_read.  */
6731 if (SYMBOLP (default_coding_system
))
6732 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
6733 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6734 Qt
, Qnil
, Qcoding_system_history
,
6735 default_coding_system
, Qnil
);
/* NOTE(review): an empty VAL yields Qnil here, not the default as the
   doc string says; presumably Fcompleting_read already substitutes the
   default for empty input, so VAL is empty only when no default was
   given -- confirm.  */
6736 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
/* Validate CODING-SYSTEM: it must pass CHECK_SYMBOL, and it is returned
   unchanged when Fcoding_system_p accepts it; otherwise
   Qcoding_system_error is signalled with a one-element list holding the
   offending value.  */
6739 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6741 doc
: /* Check validity of CODING-SYSTEM.
6742 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6744 Lisp_Object coding_system
;
6746 CHECK_SYMBOL (coding_system
);
/* Fcoding_system_p also returns t for nil, so nil is treated as valid
   here.  */
6747 if (!NILP (Fcoding_system_p (coding_system
)))
6748 return coding_system
;
6750 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6754 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6755 HIGHEST is nonzero, return the coding system of the highest
6756 priority among the detected coding systems. Otherwise return a
6757 list of detected coding systems sorted by their priorities. If
6758 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6759 multibyte form but contains only ASCII and eight-bit chars.
6760 Otherwise, the bytes are raw bytes.
6762 CODING-SYSTEM controls the detection as below:
6764 If it is nil, detect both text-format and eol-format. If the
6765 text-format part of CODING-SYSTEM is already specified
6766 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6767 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6768 detect only text-format. */
6771 detect_coding_system (src
, src_chars
, src_bytes
, highest
, multibytep
,
6773 const unsigned char *src
;
6774 int src_chars
, src_bytes
, highest
;
6776 Lisp_Object coding_system
;
6778 const unsigned char *src_end
= src
+ src_bytes
;
6779 Lisp_Object attrs
, eol_type
;
6781 struct coding_system coding
;
6783 struct coding_detection_info detect_info
;
6784 enum coding_category base_category
;
6786 if (NILP (coding_system
))
6787 coding_system
= Qundecided
;
6788 setup_coding_system (coding_system
, &coding
);
6789 attrs
= CODING_ID_ATTRS (coding
.id
);
6790 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6791 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6793 coding
.source
= src
;
6794 coding
.src_chars
= src_chars
;
6795 coding
.src_bytes
= src_bytes
;
6796 coding
.src_multibyte
= multibytep
;
6797 coding
.consumed
= 0;
6798 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6800 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6802 /* At first, detect text-format if necessary. */
6803 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6804 if (base_category
== coding_category_undecided
)
6806 enum coding_category category
;
6807 struct coding_system
*this;
6810 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6811 for (i
= 0; src
< src_end
; i
++, src
++)
6814 if (c
& 0x80 || (c
< 0x20 && (c
== 0
6815 || c
== ISO_CODE_ESC
6817 || c
== ISO_CODE_SO
)))
6820 coding
.head_ascii
= src
- coding
.source
;
6823 for (i
= 0; i
< coding_category_raw_text
; i
++)
6825 category
= coding_priorities
[i
];
6826 this = coding_categories
+ category
;
6830 /* No coding system of this category is defined. */
6831 detect_info
.rejected
|= (1 << category
);
6833 else if (category
>= coding_category_raw_text
)
6835 else if (detect_info
.checked
& (1 << category
))
6838 && (detect_info
.found
& (1 << category
)))
6843 if ((*(this->detector
)) (&coding
, &detect_info
)
6845 && (detect_info
.found
& (1 << category
)))
6847 if (category
== coding_category_utf_16_auto
)
6849 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6850 category
= coding_category_utf_16_le
;
6852 category
= coding_category_utf_16_be
;
6859 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6861 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6862 id
= coding_categories
[coding_category_raw_text
].id
;
6863 val
= Fcons (make_number (id
), Qnil
);
6865 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6867 detect_info
.found
= CATEGORY_MASK_ANY
;
6868 id
= coding_categories
[coding_category_undecided
].id
;
6869 val
= Fcons (make_number (id
), Qnil
);
6873 if (detect_info
.found
)
6875 detect_info
.found
= 1 << category
;
6876 val
= Fcons (make_number (this->id
), Qnil
);
6879 for (i
= 0; i
< coding_category_raw_text
; i
++)
6880 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6882 detect_info
.found
= 1 << coding_priorities
[i
];
6883 id
= coding_categories
[coding_priorities
[i
]].id
;
6884 val
= Fcons (make_number (id
), Qnil
);
6890 int mask
= detect_info
.rejected
| detect_info
.found
;
6894 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6896 category
= coding_priorities
[i
];
6897 if (! (mask
& (1 << category
)))
6899 found
|= 1 << category
;
6900 id
= coding_categories
[category
].id
;
6901 val
= Fcons (make_number (id
), val
);
6904 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6906 category
= coding_priorities
[i
];
6907 if (detect_info
.found
& (1 << category
))
6909 id
= coding_categories
[category
].id
;
6910 val
= Fcons (make_number (id
), val
);
6913 detect_info
.found
|= found
;
6916 else if (base_category
== coding_category_utf_16_auto
)
6918 if (detect_coding_utf_16 (&coding
, &detect_info
))
6920 enum coding_category category
;
6921 struct coding_system
*this;
6923 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6924 this = coding_categories
+ coding_category_utf_16_le
;
6925 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6926 this = coding_categories
+ coding_category_utf_16_be
;
6927 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
6928 this = coding_categories
+ coding_category_utf_16_be_nosig
;
6930 this = coding_categories
+ coding_category_utf_16_le_nosig
;
6931 val
= Fcons (make_number (this->id
), Qnil
);
6936 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6937 val
= Fcons (make_number (coding
.id
), Qnil
);
6940 /* Then, detect eol-format if necessary. */
6942 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6945 if (VECTORP (eol_type
))
6947 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6948 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6949 coding_category_raw_text
);
6950 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6951 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6952 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6953 coding_category_utf_16_be
);
6954 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6955 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6956 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6957 coding_category_utf_16_le
);
6961 if (EQ (eol_type
, Qunix
))
6962 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6963 else if (EQ (eol_type
, Qdos
))
6964 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6966 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6969 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6971 enum coding_category category
;
6974 id
= XINT (XCAR (tail
));
6975 attrs
= CODING_ID_ATTRS (id
);
6976 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6977 eol_type
= CODING_ID_EOL_TYPE (id
);
6978 if (VECTORP (eol_type
))
6980 if (category
== coding_category_utf_16_be
6981 || category
== coding_category_utf_16_be_nosig
)
6982 this_eol
= utf_16_be_eol
;
6983 else if (category
== coding_category_utf_16_le
6984 || category
== coding_category_utf_16_le_nosig
)
6985 this_eol
= utf_16_le_eol
;
6987 this_eol
= normal_eol
;
6989 if (this_eol
== EOL_SEEN_LF
)
6990 XSETCAR (tail
, AREF (eol_type
, 0));
6991 else if (this_eol
== EOL_SEEN_CRLF
)
6992 XSETCAR (tail
, AREF (eol_type
, 1));
6993 else if (this_eol
== EOL_SEEN_CR
)
6994 XSETCAR (tail
, AREF (eol_type
, 2));
6996 XSETCAR (tail
, CODING_ID_NAME (id
));
6999 XSETCAR (tail
, CODING_ID_NAME (id
));
7003 return (highest
? XCAR (val
) : val
);
7007 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
7009 doc
: /* Detect coding system of the text in the region between START and END.
7010 Return a list of possible coding systems ordered by priority.
7012 If only ASCII characters are found, it returns a list of single element
7013 `undecided' or its subsidiary coding system according to a detected
7016 If optional argument HIGHEST is non-nil, return the coding system of
7017 highest priority. */)
7018 (start
, end
, highest
)
7019 Lisp_Object start
, end
, highest
;
7022 int from_byte
, to_byte
;
7024 CHECK_NUMBER_COERCE_MARKER (start
);
7025 CHECK_NUMBER_COERCE_MARKER (end
);
7027 validate_region (&start
, &end
);
7028 from
= XINT (start
), to
= XINT (end
);
7029 from_byte
= CHAR_TO_BYTE (from
);
7030 to_byte
= CHAR_TO_BYTE (to
);
7032 if (from
< GPT
&& to
>= GPT
)
7033 move_gap_both (to
, to_byte
);
7035 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
7036 to
- from
, to_byte
- from_byte
,
7038 !NILP (current_buffer
7039 ->enable_multibyte_characters
),
/* Lisp entry point that runs detect_coding_system over the data of
   STRING.  */
7043 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
7045 doc
: /* Detect coding system of the text in STRING.
7046 Return a list of possible coding systems ordered by priority.
7048 If only ASCII characters are found, it returns a list of single element
7049 `undecided' or its subsidiary coding system according to a detected
7052 If optional argument HIGHEST is non-nil, return the coding system of
7053 highest priority. */)
7055 Lisp_Object string
, highest
;
7057 CHECK_STRING (string
);
/* Arguments passed below: the string's bytes, its character and byte
   counts, HIGHEST as a C boolean, and the string's multibyteness.  The
   remaining argument(s) of the call are on a line absent from this
   listing.  */
7059 return detect_coding_system (SDATA (string
),
7060 SCHARS (string
), SBYTES (string
),
7061 !NILP (highest
), STRING_MULTIBYTE (string
),
/* Check character C against the charset list of the coding-system
   attribute vector ATTRS (CODING_ATTR_CHARSET_LIST).  The result is
   non-zero iff the scan over that list stopped before reaching its end.
   NOTE(review): the statement executed when CHAR_CHARSET_P succeeds is
   on a line absent from this listing; presumably it is a `break', which
   makes the result "C belongs to one of the listed charsets" -- confirm.  */
7067 char_encodable_p (c
, attrs
)
7072 struct charset
*charset
;
7074 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7075 CONSP (tail
); tail
= XCDR (tail
))
/* Each list element is a charset id; map it to its struct charset.  */
7077 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7078 if (CHAR_CHARSET_P (c
, charset
))
7081 return (! NILP (tail
));
7085 /* Return a list of coding systems that safely encode the text between
7086 START and END. If EXCLUDE is non-nil, it is a list of coding
7087 systems not to check. The returned list doesn't contain any such
7088 coding systems. In any case, if the text contains only ASCII or is
7089 unibyte, return t. */
7091 DEFUN ("find-coding-systems-region-internal",
7092 Ffind_coding_systems_region_internal
,
7093 Sfind_coding_systems_region_internal
, 2, 3, 0,
7094 doc
: /* Internal use only. */)
7095 (start
, end
, exclude
)
7096 Lisp_Object start
, end
, exclude
;
7098 Lisp_Object coding_attrs_list
, safe_codings
;
7099 EMACS_INT start_byte
, end_byte
;
7100 const unsigned char *p
, *pbeg
, *pend
;
7102 Lisp_Object tail
, elt
;
7104 if (STRINGP (start
))
7106 if (!STRING_MULTIBYTE (start
)
7107 || SCHARS (start
) == SBYTES (start
))
7110 end_byte
= SBYTES (start
);
7114 CHECK_NUMBER_COERCE_MARKER (start
);
7115 CHECK_NUMBER_COERCE_MARKER (end
);
7116 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7117 args_out_of_range (start
, end
);
7118 if (NILP (current_buffer
->enable_multibyte_characters
))
7120 start_byte
= CHAR_TO_BYTE (XINT (start
));
7121 end_byte
= CHAR_TO_BYTE (XINT (end
));
7122 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7125 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7127 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7128 move_gap_both (XINT (start
), start_byte
);
7130 move_gap_both (XINT (end
), end_byte
);
7134 coding_attrs_list
= Qnil
;
7135 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7137 || NILP (Fmemq (XCAR (tail
), exclude
)))
7141 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7142 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7143 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7144 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7147 if (STRINGP (start
))
7148 p
= pbeg
= SDATA (start
);
7150 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7151 pend
= p
+ (end_byte
- start_byte
);
7153 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7154 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7158 if (ASCII_BYTE_P (*p
))
7162 c
= STRING_CHAR_ADVANCE (p
);
7164 charset_map_loaded
= 0;
7165 for (tail
= coding_attrs_list
; CONSP (tail
);)
7170 else if (char_encodable_p (c
, elt
))
7172 else if (CONSP (XCDR (tail
)))
7174 XSETCAR (tail
, XCAR (XCDR (tail
)));
7175 XSETCDR (tail
, XCDR (XCDR (tail
)));
7179 XSETCAR (tail
, Qnil
);
7183 if (charset_map_loaded
)
7185 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7187 if (STRINGP (start
))
7188 pbeg
= SDATA (start
);
7190 pbeg
= BYTE_POS_ADDR (start_byte
);
7191 p
= pbeg
+ p_offset
;
7192 pend
= pbeg
+ pend_offset
;
7197 safe_codings
= Qnil
;
7198 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7199 if (! NILP (XCAR (tail
)))
7200 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7202 return safe_codings
;
7206 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7207 Sunencodable_char_position
, 3, 5, 0,
7209 Return position of first un-encodable character in a region.
7210 START and END specify the region and CODING-SYSTEM specifies the
7211 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7213 If optional 4th argument COUNT is non-nil, it specifies at most how
7214 many un-encodable characters to search. In this case, the value is a
7217 If optional 5th argument STRING is non-nil, it is a string to search
7218 for un-encodable characters. In that case, START and END are indexes
7220 (start
, end
, coding_system
, count
, string
)
7221 Lisp_Object start
, end
, coding_system
, count
, string
;
7224 struct coding_system coding
;
7225 Lisp_Object attrs
, charset_list
;
7226 Lisp_Object positions
;
7228 const unsigned char *p
, *stop
, *pend
;
7229 int ascii_compatible
;
7231 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7232 attrs
= CODING_ID_ATTRS (coding
.id
);
7233 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7235 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7236 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7240 validate_region (&start
, &end
);
7241 from
= XINT (start
);
7243 if (NILP (current_buffer
->enable_multibyte_characters
)
7244 || (ascii_compatible
7245 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7247 p
= CHAR_POS_ADDR (from
);
7248 pend
= CHAR_POS_ADDR (to
);
7249 if (from
< GPT
&& to
>= GPT
)
7256 CHECK_STRING (string
);
7257 CHECK_NATNUM (start
);
7259 from
= XINT (start
);
7262 || to
> SCHARS (string
))
7263 args_out_of_range_3 (string
, start
, end
);
7264 if (! STRING_MULTIBYTE (string
))
7266 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7267 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7268 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7276 CHECK_NATNUM (count
);
7285 if (ascii_compatible
)
7286 while (p
< stop
&& ASCII_BYTE_P (*p
))
7296 c
= STRING_CHAR_ADVANCE (p
);
7297 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7298 && ! char_charset (c
, charset_list
, NULL
))
7300 positions
= Fcons (make_number (from
), positions
);
7309 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
/* For each coding system in CODING-SYSTEM-LIST, collect the positions of
   the characters in the region (or in the string START) for which
   char_encodable_p fails with that coding system's attributes.  Several
   lines of this definition (early returns, braces, else branches, some
   declarations) are absent from this listing; the comments below
   describe only what the visible lines establish.  */
7313 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7314 Scheck_coding_systems_region
, 3, 3, 0,
7315 doc
: /* Check if the region is encodable by coding systems.
7317 START and END are buffer positions specifying the region.
7318 CODING-SYSTEM-LIST is a list of coding systems to check.
7320 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7321 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7322 whole region, POS0, POS1, ... are buffer positions where non-encodable
7323 characters are found.
7325 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7328 START may be a string. In that case, check if the string is
7329 encodable, and the value contains indices to the string instead of
7330 buffer positions. END is ignored. */)
7331 (start
, end
, coding_system_list
)
7332 Lisp_Object start
, end
, coding_system_list
;
7335 EMACS_INT start_byte
, end_byte
;
7337 const unsigned char *p
, *pbeg
, *pend
;
7339 Lisp_Object tail
, elt
;
7341 if (STRINGP (start
))
/* NOTE(review): this condition looks inverted.  The corresponding test
   in Ffind_coding_systems_region_internal is
   `!STRING_MULTIBYTE (start) || SCHARS (start) == SBYTES (start)'.
   If SCHARS equals SBYTES for every unibyte string (TODO confirm), the
   `&&' / `!=' form here can never be true, so the early exit for
   unibyte or ASCII-only strings is never taken.  */
7343 if (!STRING_MULTIBYTE (start
)
7344 && SCHARS (start
) != SBYTES (start
))
7347 end_byte
= SBYTES (start
);
/* Buffer case: coerce markers, range-check START/END against BEG and Z,
   and convert the character positions to byte positions.  */
7352 CHECK_NUMBER_COERCE_MARKER (start
);
7353 CHECK_NUMBER_COERCE_MARKER (end
);
7354 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7355 args_out_of_range (start
, end
);
7356 if (NILP (current_buffer
->enable_multibyte_characters
))
7358 start_byte
= CHAR_TO_BYTE (XINT (start
));
7359 end_byte
= CHAR_TO_BYTE (XINT (end
));
/* Equal character and byte lengths: the action taken is on a line absent
   from this listing (presumably an early return because the text is all
   single-byte -- confirm).  */
7360 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* When the gap lies strictly inside the region, move it to whichever
   end of the region is nearer so the region's bytes become contiguous.  */
7363 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7365 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7366 move_gap_both (XINT (start
), start_byte
);
7368 move_gap_both (XINT (end
), end_byte
);
/* Build LIST with one entry per requested coding system.  Each entry
   starts with the coding system followed by element 0 of its
   CODING_SYSTEM_SPEC (used below as the attribute vector); the rest of
   this Fcons call is on lines absent from this listing.  */
7374 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7377 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
/* Set up [P, PEND) over the bytes to scan; PBEG keeps the start so the
   pointers can be recomputed later.  */
7382 if (STRINGP (start
))
7383 p
= pbeg
= SDATA (start
);
7385 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7386 pend
= p
+ (end_byte
- start_byte
);
/* Skip the leading run of ASCII bytes (advancing POS with P) and trim
   the trailing run of ASCII bytes.  */
7388 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7389 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7393 if (ASCII_BYTE_P (*p
))
7397 c
= STRING_CHAR_ADVANCE (p
);
/* Clear the flag before the checks so that a change made during
   char_encodable_p can be noticed afterwards.  */
7399 charset_map_loaded
= 0;
7400 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
/* ELT is the entry's tail (ATTRS POS...); push POS right after ATTRS
   when C is not encodable with ATTRS.  */
7402 elt
= XCDR (XCAR (tail
));
7403 if (! char_encodable_p (c
, XCAR (elt
)))
7404 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* If the flag got set, recompute PBEG, P and PEND from their saved
   offsets (presumably because loading a charset map may relocate the
   text being scanned -- TODO confirm).  */
7406 if (charset_map_loaded
)
7408 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7410 if (STRINGP (start
))
7411 pbeg
= SDATA (start
);
7413 pbeg
= BYTE_POS_ADDR (start_byte
);
7414 p
= pbeg
+ p_offset
;
7415 pend
= pbeg
+ pend_offset
;
/* Build the result from the entries that recorded at least one position.
   Positions were pushed at the front, so Fnreverse restores the order in
   which they were found.  */
7423 for (; CONSP (tail
); tail
= XCDR (tail
))
7426 if (CONSP (XCDR (XCDR (elt
))))
7427 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
/* Convert the text of the current buffer between START and END
   according to CODING_SYSTEM (nil is treated as `no-conversion').

   The source object is always the current buffer.  DST_OBJECT nil
   means the current buffer is also the destination; otherwise it
   must be t or a buffer.  Callers in this file pass ENCODEP = 1 to
   encode and ENCODEP = 0 to decode.

   The value is the number of produced characters when the
   destination is a buffer, and coding.dst_object otherwise.

   NOTE(review): NORECORD presumably suppresses the update of
   Vlast_coding_system_used; the guarding statement is not visible
   in this listing -- confirm against the full source.  */
7437 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7438 Lisp_Object start
, end
, coding_system
, dst_object
;
7439 int encodep
, norecord
;
7441 struct coding_system coding
;
7442 EMACS_INT from
, from_byte
, to
, to_byte
;
7443 Lisp_Object src_object
;
7445 CHECK_NUMBER_COERCE_MARKER (start
);
7446 CHECK_NUMBER_COERCE_MARKER (end
);
7447 if (NILP (coding_system
))
7448 coding_system
= Qno_conversion
;
7450 CHECK_CODING_SYSTEM (coding_system
);
7451 src_object
= Fcurrent_buffer ();
/* A nil destination means converting in place in the source buffer.  */
7452 if (NILP (dst_object
))
7453 dst_object
= src_object
;
7454 else if (! EQ (dst_object
, Qt
))
7455 CHECK_BUFFER (dst_object
);
/* Translate the character positions into byte positions as well.  */
7457 validate_region (&start
, &end
);
7458 from
= XFASTINT (start
);
7459 from_byte
= CHAR_TO_BYTE (from
);
7460 to
= XFASTINT (end
);
7461 to_byte
= CHAR_TO_BYTE (to
);
/* The whole region is handed over at once, so it is the last block.  */
7463 setup_coding_system (coding_system
, &coding
);
7464 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7467 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7470 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7473 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7475 return (BUFFERP (dst_object
)
7476 ? make_number (coding
.produced_char
)
7477 : coding
.dst_object
);
7481 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7482 3, 4, "r\nzCoding system: ",
7483 doc
: /* Decode the current region from the specified coding system.
7484 When called from a program, takes four arguments:
7485 START, END, CODING-SYSTEM, and DESTINATION.
7486 START and END are buffer positions.
7488 Optional 4th argument DESTINATION specifies where the decoded text goes.
7489 If nil, the region between START and END is replaced by the decoded text.
7490 If a buffer, the decoded text is inserted in that buffer.
7491 If t, the decoded text is returned.
7493 This function sets `last-coding-system-used' to the precise coding system
7494 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7495 not fully specified.)
7496 Unless DESTINATION is t, it returns the length of the decoded text. */)
7497 (start
, end
, coding_system
, destination
)
7498 Lisp_Object start
, end
, coding_system
, destination
;
/* ENCODEP = 0 (decode), NORECORD = 0.  */
7500 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7503 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7504 3, 4, "r\nzCoding system: ",
7505 doc
: /* Encode the current region by specified coding system.
7506 When called from a program, takes four arguments:
7507 START, END, CODING-SYSTEM, and DESTINATION. START and END are buffer positions.
7509 Optional 4th argument DESTINATION specifies where the encoded text goes.
7510 If nil, the region between START and END is replaced by the encoded text.
7511 If a buffer, the encoded text is inserted in that buffer.
7512 If t, the encoded text is returned.
7514 This function sets `last-coding-system-used' to the precise coding system
7515 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7516 not fully specified.)
7517 Unless DESTINATION is t, it returns the length of the encoded text. */)
7518 (start
, end
, coding_system
, destination
)
7519 Lisp_Object start
, end
, coding_system
, destination
;
/* ENCODEP = 1 (encode), NORECORD = 0.  */
7521 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
/* Encode or decode STRING according to CODING_SYSTEM (nil is treated
   as `no-conversion').  Callers in this file pass ENCODEP = 1 to
   encode and ENCODEP = 0 to decode.

   DST_OBJECT is nil, t, or a buffer.  The value is the number of
   produced characters when DST_OBJECT is a buffer, and
   coding.dst_object otherwise.

   NOCOPY nonzero means it is OK to return STRING itself when the
   conversion is trivial (CODING_SYSTEM and DST_OBJECT both nil);
   otherwise a fresh copy of STRING is returned in that case.

   NOTE(review): NORECORD presumably suppresses the updates of
   Vlast_coding_system_used; the guarding statements are not visible
   in this listing -- confirm against the full source.  */
7525 code_convert_string (string
, coding_system
, dst_object
,
7526 encodep
, nocopy
, norecord
)
7527 Lisp_Object string
, coding_system
, dst_object
;
7528 int encodep
, nocopy
, norecord
;
7530 struct coding_system coding
;
7531 EMACS_INT chars
, bytes
;
7533 CHECK_STRING (string
);
7534 if (NILP (coding_system
))
7537 Vlast_coding_system_used
= Qno_conversion
;
7538 if (NILP (dst_object
))
/* Trivial conversion.  The original test was inverted: it copied
   STRING exactly when the caller allowed sharing (NOCOPY set) and
   returned STRING itself when a copy was required.  */
7539 return (nocopy
? string
: Fcopy_sequence (string
));
7542 if (NILP (coding_system
))
7543 coding_system
= Qno_conversion
;
7545 CHECK_CODING_SYSTEM (coding_system
);
7546 if (NILP (dst_object
))
7548 else if (! EQ (dst_object
, Qt
))
7549 CHECK_BUFFER (dst_object
);
/* The whole string is handed over at once, so it is the last block.  */
7551 setup_coding_system (coding_system
, &coding
);
7552 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7553 chars
= SCHARS (string
);
7554 bytes
= SBYTES (string
);
7556 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7558 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7560 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7562 return (BUFFERP (dst_object
)
7563 ? make_number (coding
.produced_char
)
7564 : coding
.dst_object
);
7568 /* Encode or decode STRING according to CODING_SYSTEM.
7569 Do not set Vlast_coding_system_used.
7571 This function is called only from macros DECODE_FILE and
7572 ENCODE_FILE, thus we ignore character composition.
     It forwards to code_convert_string with DST_OBJECT = t,
     NOCOPY = 0 and NORECORD = 1.  */
7575 code_convert_string_norecord (string
, coding_system
, encodep
)
7576 Lisp_Object string
, coding_system
;
7579 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7583 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7585 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7587 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7588 if the decoding operation is trivial.
7590 Optional fourth arg BUFFER non-nil means that the decoded text is
7591 inserted in BUFFER instead of returned as a string. In this case,
7592 the return value is the length of the decoded text.
7594 This function sets `last-coding-system-used' to the precise coding system
7595 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7596 not fully specified.) */)
7597 (string
, coding_system
, nocopy
, buffer
)
7598 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* ENCODEP = 0 (decode), NORECORD = 0.  */
7600 return code_convert_string (string
, coding_system
, buffer
,
7601 0, ! NILP (nocopy
), 0);
7604 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7606 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7608 Optional third arg NOCOPY non-nil means it is OK to return STRING
7609 itself if the encoding operation is trivial.
7611 Optional fourth arg BUFFER non-nil means that the encoded text is
7612 inserted in BUFFER instead of returned as a string. In this case,
7613 the return value is the length of the encoded text.
7615 This function sets `last-coding-system-used' to the precise coding system
7616 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7617 not fully specified.) */)
7618 (string
, coding_system
, nocopy
, buffer
)
7619 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* ENCODEP = 1 (encode).
   NOTE(review): the last argument is NORECORD = 1, whereas
   decode-coding-string passes 0 and the doc string above promises
   that `last-coding-system-used' is set.  If code_convert_string
   honors NORECORD, this contradicts the documentation -- confirm.  */
7621 return code_convert_string (string
, coding_system
, buffer
,
7622 1, ! NILP (nocopy
), 1);
7626 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7627 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7628 Return the corresponding character. */)
7632 Lisp_Object spec
, attrs
, val
;
7633 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7636 CHECK_NATNUM (code
);
7637 c
= XFASTINT (code
);
7638 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7639 attrs
= AREF (spec
, 0);
7641 if (ASCII_BYTE_P (c
)
7642 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7645 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7646 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7647 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7648 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7651 charset
= charset_roman
;
7652 else if (c
>= 0xA0 && c
< 0xDF)
7654 charset
= charset_kana
;
7659 int s1
= c
>> 8, s2
= c
& 0xFF;
7661 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7662 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7663 error ("Invalid code: %d", code
);
7665 charset
= charset_kanji
;
7667 c
= DECODE_CHAR (charset
, c
);
7669 error ("Invalid code: %d", code
);
7670 return make_number (c
);
7674 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7675 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7676 Return the corresponding code in SJIS.
     An error is signaled if CHAR cannot be encoded by shift_jis. */)
7680 Lisp_Object spec
, attrs
, charset_list
;
7682 struct charset
*charset
;
7685 CHECK_CHARACTER (ch
);
7687 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7688 attrs
= AREF (spec
, 0);
7690 if (ASCII_CHAR_P (c
)
7691 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* Look C up in the charsets of the shift_jis coding system;
   char_charset stores the code point in CODE, and an invalid code
   means that C is not encodable.  */
7694 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7695 charset
= char_charset (c
, charset_list
, &code
);
7696 if (code
== CHARSET_INVALID_CODE (charset
))
7697 error ("Can't encode by shift_jis encoding: %d", c
);
7700 return make_number (code
);
7703 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7704 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7705 Return the corresponding character. */)
7709 Lisp_Object spec
, attrs
, val
;
7710 struct charset
*charset_roman
, *charset_big5
, *charset
;
7713 CHECK_NATNUM (code
);
7714 c
= XFASTINT (code
);
7715 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7716 attrs
= AREF (spec
, 0);
7718 if (ASCII_BYTE_P (c
)
7719 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7722 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7723 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7724 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7727 charset
= charset_roman
;
7730 int b1
= c
>> 8, b2
= c
& 0x7F;
7731 if (b1
< 0xA1 || b1
> 0xFE
7732 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7733 error ("Invalid code: %d", code
);
7734 charset
= charset_big5
;
7736 c
= DECODE_CHAR (charset
, (unsigned )c
);
7738 error ("Invalid code: %d", code
);
7739 return make_number (c
);
7742 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7743 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7744 Return the corresponding character code in Big5.
     An error is signaled if CHAR cannot be encoded by Big5. */)
7748 Lisp_Object spec
, attrs
, charset_list
;
7749 struct charset
*charset
;
7753 CHECK_CHARACTER (ch
);
7755 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7756 attrs
= AREF (spec
, 0);
7757 if (ASCII_CHAR_P (c
)
7758 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* Look C up in the charsets of the Big5 coding system; char_charset
   stores the code point in CODE, and an invalid code means that C
   is not encodable.  */
7761 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7762 charset
= char_charset (c
, charset_list
, &code
);
7763 if (code
== CHARSET_INVALID_CODE (charset
))
7764 error ("Can't encode by Big5 encoding: %d", c
);
7766 return make_number (code
);
7770 DEFUN ("set-terminal-coding-system-internal",
7771 Fset_terminal_coding_system_internal
,
7772 Sset_terminal_coding_system_internal
, 1, 1, 0,
7773 doc
: /* Internal use only. */)
7775 Lisp_Object coding_system
;
7777 CHECK_SYMBOL (coding_system
);
7778 setup_coding_system (Fcheck_coding_system (coding_system
),
7781 /* We had better not send unsafe characters to terminal. */
7782 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7783 /* Character composition should be disabled. */
7784 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7785 terminal_coding
.src_multibyte
= 1;
7786 terminal_coding
.dst_multibyte
= 0;
7790 DEFUN ("set-safe-terminal-coding-system-internal",
7791 Fset_safe_terminal_coding_system_internal
,
7792 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7793 doc
: /* Internal use only. */)
7795 Lisp_Object coding_system
;
7797 CHECK_SYMBOL (coding_system
);
/* Unlike set-terminal-coding-system-internal, this does not turn on
   CODING_MODE_SAFE_ENCODING.  */
7798 setup_coding_system (Fcheck_coding_system (coding_system
),
7799 &safe_terminal_coding
);
7800 /* Character composition should be disabled. */
7801 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7802 safe_terminal_coding
.src_multibyte
= 1;
7803 safe_terminal_coding
.dst_multibyte
= 0;
7807 DEFUN ("terminal-coding-system",
7808 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7809 doc
: /* Return the coding system specified for terminal output. */)
/* The value is the name recorded for terminal_coding's ID.  */
7812 return CODING_ID_NAME (terminal_coding
.id
);
7815 DEFUN ("set-keyboard-coding-system-internal",
7816 Fset_keyboard_coding_system_internal
,
7817 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7818 doc
: /* Internal use only. */)
7820 Lisp_Object coding_system
;
7822 CHECK_SYMBOL (coding_system
);
7823 setup_coding_system (Fcheck_coding_system (coding_system
),
7825 /* Character composition should be disabled. */
7826 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7830 DEFUN ("keyboard-coding-system",
7831 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7832 doc
: /* Return the coding system specified for decoding keyboard input. */)
/* The value is the name recorded for keyboard_coding's ID.  */
7835 return CODING_ID_NAME (keyboard_coding
.id
);
7839 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7840 Sfind_operation_coding_system
, 1, MANY
, 0,
7841 doc
: /* Choose a coding system for an operation based on the target name.
7842 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7843 DECODING-SYSTEM is the coding system to use for decoding
7844 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7845 for encoding (in case OPERATION does encoding).
7847 The first argument OPERATION specifies an I/O primitive:
7848 For file I/O, `insert-file-contents' or `write-region'.
7849 For process I/O, `call-process', `call-process-region', or `start-process'.
7850 For network I/O, `open-network-stream'.
7852 The remaining arguments should be the same arguments that were passed
7853 to the primitive. Depending on which primitive, one of those arguments
7854 is selected as the TARGET. For example, if OPERATION does file I/O,
7855 whichever argument specifies the file name is TARGET.
7857 TARGET has a meaning which depends on OPERATION:
7858 For file I/O, TARGET is a file name.
7859 For process I/O, TARGET is a process name.
7860 For network I/O, TARGET is a service name or a port number.
7862 This function looks up what is specified for TARGET in
7863 `file-coding-system-alist', `process-coding-system-alist',
7864 or `network-coding-system-alist' depending on OPERATION.
7865 They may specify a coding system, a cons of coding systems,
7866 or a function symbol to call.
7867 In the last case, we call the function with one argument,
7868 which is a list of all the arguments given to this function.
7870 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7875 Lisp_Object operation
, target_idx
, target
, val
;
7876 register Lisp_Object chain
;
7879 error ("Too few arguments");
7880 operation
= args
[0];
/* The index of the TARGET argument is stored on OPERATION's
   Qtarget_idx property.  */
7881 if (!SYMBOLP (operation
)
7882 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7883 error ("Invalid first arguement");
/* TARGET is read from args[target_idx + 1], so at least
   target_idx + 2 arguments are required.  The former test
   `nargs < 1 + target_idx' let nargs == target_idx + 1 through and
   then read one element past the end of ARGS.  */
7884 if (nargs
<= 1 + XINT (target_idx
))
7885 error ("Too few arguments for operation: %s",
7886 SDATA (SYMBOL_NAME (operation
)));
7887 target
= args
[XINT (target_idx
) + 1];
/* TARGET must be a string, except that open-network-stream also
   accepts an integer (a port number).  */
7888 if (!(STRINGP (target
)
7889 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7890 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist that corresponds to the kind of I/O.  */
7892 chain
= ((EQ (operation
, Qinsert_file_contents
)
7893 || EQ (operation
, Qwrite_region
))
7894 ? Vfile_coding_system_alist
7895 : (EQ (operation
, Qopen_network_stream
)
7896 ? Vnetwork_coding_system_alist
7897 : Vprocess_coding_system_alist
));
7901 for (; CONSP (chain
); chain
= XCDR (chain
))
/* A string key is matched against TARGET as a regular expression;
   an integer key must be identical to TARGET.  */
7907 && ((STRINGP (target
)
7908 && STRINGP (XCAR (elt
))
7909 && fast_string_match (XCAR (elt
), target
) >= 0)
7910 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7913 /* Here, if VAL is both a valid coding system and a valid
7914 function symbol, we return VAL as a coding system. */
7917 if (! SYMBOLP (val
))
7919 if (! NILP (Fcoding_system_p (val
)))
7920 return Fcons (val
, val
);
7921 if (! NILP (Ffboundp (val
)))
7923 val
= call1 (val
, Flist (nargs
, args
));
7926 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7927 return Fcons (val
, val
);
7935 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7936 Sset_coding_system_priority
, 0, MANY
, 0,
7937 doc
: /* Assign higher priority to the coding systems given as arguments.
7938 If multiple coding systems belong to the same category,
7939 all but the first one are ignored.
7941 usage: (set-coding-system-priority ...) */)
/* CHANGED[CAT] is nonzero once category CAT has been given a new
   priority; PRIORITIES collects the new order of categories.  */
7947 int changed
[coding_category_max
];
7948 enum coding_category priorities
[coding_category_max
];
7950 bzero (changed
, sizeof changed
);
7952 for (i
= j
= 0; i
< nargs
; i
++)
7954 enum coding_category category
;
7955 Lisp_Object spec
, attrs
;
7957 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7958 attrs
= AREF (spec
, 0);
7959 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7960 if (changed
[category
])
7961 /* Ignore this coding system because a coding system of the
7962 same category already had a higher priority. */
7964 changed
[category
] = 1;
7965 priorities
[j
++] = category
;
/* Make ARGS[I] the coding system of its category if the category
   already has one and it is a different one.  */
7966 if (coding_categories
[category
].id
>= 0
7967 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7968 setup_coding_system (args
[i
], &coding_categories
[category
]);
7969 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7972 /* Now we have decided top J priorities. Reflect the order of the
7973 original priorities to the remaining priorities. */
7975 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7977 while (j
< coding_category_max
7978 && changed
[coding_priorities
[j
]])
7980 if (j
== coding_category_max
)
7982 priorities
[i
] = coding_priorities
[j
];
7985 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7987 /* Update `coding-category-list'. */
7988 Vcoding_category_list
= Qnil
;
/* Cons from the lowest priority upward so that the highest priority
   category ends up first in the list.  */
7989 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7990 Vcoding_category_list
7991 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7992 Vcoding_category_list
);
7997 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7998 Scoding_system_priority_list
, 0, 1, 0,
7999 doc
: /* Return a list of coding systems ordered by their priorities.
8000 HIGHESTP non-nil means just return the highest priority one.
     Each coding system is represented by its base name. */)
8002 Lisp_Object highestp
;
/* Walk the categories from the highest priority to the lowest.  */
8007 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
8009 enum coding_category category
= coding_priorities
[i
];
8010 int id
= coding_categories
[category
].id
;
8015 attrs
= CODING_ID_ATTRS (id
);
8016 if (! NILP (highestp
))
8017 return CODING_ATTR_BASE_NAME (attrs
);
8018 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
/* VAL was built by consing at the front, so reverse it.  */
8020 return Fnreverse (val
);
/* Suffixes appended to a base coding system name to form the names
   of its three subsidiary coding systems.  */
8023 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
/* Return a vector of three symbols whose names are the name of the
   symbol BASE followed by "-unix", "-dos" and "-mac", in this
   order.  */
8026 make_subsidiaries (base
)
8029 Lisp_Object subsidiaries
;
8030 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
/* The longest suffix, "-unix", needs five bytes plus the
   terminating null byte.  */
8031 char *buf
= (char *) alloca (base_name_len
+ 6);
8034 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
8035 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
8036 for (i
= 0; i
< 3; i
++)
/* Copy the suffix including its terminating null byte.  */
8038 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
8039 ASET (subsidiaries
, i
, intern (buf
));
8041 return subsidiaries
;
/* Define a coding system from ARGS, which are indexed by the
   coding_arg_* constants.  An attribute vector is filled in from
   the common arguments, then the arguments specific to the coding
   type are handled and the category is decided.  Finally the spec
   vector [ATTRS ALIASES EOL_TYPE] is registered in
   Vcoding_system_hash_table and the name is added to
   Vcoding_system_list and Vcoding_system_alist.  If the eol-type
   argument is nil, three subsidiary coding systems made by
   make_subsidiaries are registered as well.  When too few arguments
   are given, Qwrong_number_of_arguments is signaled.  */
8045 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
8046 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
8047 doc
: /* For internal use only.
8048 usage: (define-coding-system-internal ...) */)
8054 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
8055 Lisp_Object attrs
; /* Vector of attributes. */
8056 Lisp_Object eol_type
;
8057 Lisp_Object aliases
;
8058 Lisp_Object coding_type
, charset_list
, safe_charsets
;
8059 enum coding_category category
;
8060 Lisp_Object tail
, val
;
8061 int max_charset_id
= 0;
8064 if (nargs
< coding_arg_max
)
8067 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
8069 name
= args
[coding_arg_name
];
8070 CHECK_SYMBOL (name
);
8071 CODING_ATTR_BASE_NAME (attrs
) = name
;
/* The mnemonic is either a string or a character.  */
8073 val
= args
[coding_arg_mnemonic
];
8074 if (! STRINGP (val
))
8075 CHECK_CHARACTER (val
);
8076 CODING_ATTR_MNEMONIC (attrs
) = val
;
8078 coding_type
= args
[coding_arg_coding_type
];
8079 CHECK_SYMBOL (coding_type
);
8080 CODING_ATTR_TYPE (attrs
) = coding_type
;
/* The charset list is either one of the symbols `iso-2022' and
   `emacs-mule', which stand for a predefined list and require the
   same coding type, or an explicit list of charsets.  */
8082 charset_list
= args
[coding_arg_charset_list
];
8083 if (SYMBOLP (charset_list
))
8085 if (EQ (charset_list
, Qiso_2022
))
8087 if (! EQ (coding_type
, Qiso_2022
))
8088 error ("Invalid charset-list");
8089 charset_list
= Viso_2022_charset_list
;
8091 else if (EQ (charset_list
, Qemacs_mule
))
8093 if (! EQ (coding_type
, Qemacs_mule
))
8094 error ("Invalid charset-list");
8095 charset_list
= Vemacs_mule_charset_list
;
8097 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8098 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8099 max_charset_id
= XFASTINT (XCAR (tail
));
/* For an explicit list, work on a copy and replace each element by
   the ID of the charset.  */
8103 charset_list
= Fcopy_sequence (charset_list
);
8104 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8106 struct charset
*charset
;
8109 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8110 if (EQ (coding_type
, Qiso_2022
)
8111 ? CHARSET_ISO_FINAL (charset
) < 0
8112 : EQ (coding_type
, Qemacs_mule
)
8113 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8115 error ("Can't handle charset `%s'",
8116 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8118 XSETCAR (tail
, make_number (charset
->id
));
8119 if (max_charset_id
< charset
->id
)
8120 max_charset_id
= charset
->id
;
8123 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
/* SAFE_CHARSETS is a string indexed by charset ID; the byte of each
   charset in CHARSET_LIST is set to 0.  */
8125 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8127 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8128 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8129 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8131 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8133 val
= args
[coding_arg_decode_translation_table
];
8135 CHECK_CHAR_TABLE (val
);
8136 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8138 val
= args
[coding_arg_encode_translation_table
];
8140 CHECK_CHAR_TABLE (val
);
8141 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8143 val
= args
[coding_arg_post_read_conversion
];
8145 CODING_ATTR_POST_READ (attrs
) = val
;
8147 val
= args
[coding_arg_pre_write_conversion
];
8149 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8151 val
= args
[coding_arg_default_char
];
8153 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8156 CHECK_CHARACTER (val
);
8157 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
8160 val
= args
[coding_arg_for_unibyte
];
8161 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8163 val
= args
[coding_arg_plist
];
8165 CODING_ATTR_PLIST (attrs
) = val
;
8167 if (EQ (coding_type
, Qcharset
))
8169 /* Generate a lisp vector of 256 elements. Each element is nil,
8170 integer, or a list of charset IDs.
8172 If Nth element is nil, the byte code N is invalid in this coding system.
8175 If Nth element is a number NUM, N is the first byte of a
8176 charset whose ID is NUM.
8178 If Nth element is a list of charset IDs, N is the first byte
8179 of one of them. The list is sorted by dimensions of the
8180 charsets. A charset of smaller dimension comes first. */
8181 val
= Fmake_vector (make_number (256), Qnil
);
8183 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8185 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8186 int dim
= CHARSET_DIMENSION (charset
);
8187 int idx
= (dim
- 1) * 4;
8189 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8190 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8192 for (i
= charset
->code_space
[idx
];
8193 i
<= charset
->code_space
[idx
+ 1]; i
++)
8195 Lisp_Object tmp
, tmp2
;
8198 tmp
= AREF (val
, i
);
8201 else if (NUMBERP (tmp
))
8203 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8205 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8207 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8211 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8213 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8218 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8221 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8222 XSETCAR (tmp2
, XCAR (tail
));
8228 ASET (attrs
, coding_attr_charset_valids
, val
);
8229 category
= coding_category_charset
;
8231 else if (EQ (coding_type
, Qccl
))
8235 if (nargs
< coding_arg_ccl_max
)
8238 val
= args
[coding_arg_ccl_decoder
];
8239 CHECK_CCL_PROGRAM (val
);
8241 val
= Fcopy_sequence (val
);
8242 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8244 val
= args
[coding_arg_ccl_encoder
];
8245 CHECK_CCL_PROGRAM (val
);
8247 val
= Fcopy_sequence (val
);
8248 ASET (attrs
, coding_attr_ccl_encoder
, val
);
/* VALIDS is a string of 256 bytes; the byte of each code listed in
   the argument (a single code or a FROM/TO pair) is set to 1.  */
8250 val
= args
[coding_arg_ccl_valids
];
8251 valids
= Fmake_string (make_number (256), make_number (0));
8252 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8259 from
= to
= XINT (val
);
8260 if (from
< 0 || from
> 255)
8261 args_out_of_range_3 (val
, make_number (0), make_number (255));
8266 CHECK_NATNUM_CAR (val
);
8267 CHECK_NATNUM_CDR (val
);
8268 from
= XINT (XCAR (val
));
8270 args_out_of_range_3 (XCAR (val
),
8271 make_number (0), make_number (255));
8272 to
= XINT (XCDR (val
));
8273 if (to
< from
|| to
> 255)
8274 args_out_of_range_3 (XCDR (val
),
8275 XCAR (val
), make_number (255));
8277 for (i
= from
; i
<= to
; i
++)
8278 SSET (valids
, i
, 1);
8280 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8282 category
= coding_category_ccl
;
8284 else if (EQ (coding_type
, Qutf_16
))
8286 Lisp_Object bom
, endian
;
8288 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8290 if (nargs
< coding_arg_utf16_max
)
8293 bom
= args
[coding_arg_utf16_bom
];
8294 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8298 CHECK_CODING_SYSTEM (val
);
8300 CHECK_CODING_SYSTEM (val
);
8302 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8304 endian
= args
[coding_arg_utf16_endian
];
8305 CHECK_SYMBOL (endian
);
8308 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8309 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8310 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8312 category
= (CONSP (bom
)
8313 ? coding_category_utf_16_auto
8315 ? (EQ (endian
, Qbig
)
8316 ? coding_category_utf_16_be_nosig
8317 : coding_category_utf_16_le_nosig
)
8318 : (EQ (endian
, Qbig
)
8319 ? coding_category_utf_16_be
8320 : coding_category_utf_16_le
));
8322 else if (EQ (coding_type
, Qiso_2022
))
8324 Lisp_Object initial
, reg_usage
, request
, flags
;
8327 if (nargs
< coding_arg_iso2022_max
)
/* INITIAL has four elements, one per graphic register; a charset
   is replaced by its ID, and -1 is stored otherwise.  */
8330 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8331 CHECK_VECTOR (initial
);
8332 for (i
= 0; i
< 4; i
++)
8334 val
= Faref (initial
, make_number (i
));
8337 struct charset
*charset
;
8339 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8340 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8341 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8342 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8345 ASET (initial
, i
, make_number (-1));
8348 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8349 CHECK_CONS (reg_usage
);
8350 CHECK_NUMBER_CAR (reg_usage
);
8351 CHECK_NUMBER_CDR (reg_usage
);
8353 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8354 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8362 CHECK_CHARSET_GET_ID (tmp
, id
);
8363 CHECK_NATNUM_CDR (val
);
8364 if (XINT (XCDR (val
)) >= 4)
8365 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8366 XSETCAR (val
, make_number (id
));
8369 flags
= args
[coding_arg_iso2022_flags
];
8370 CHECK_NATNUM (flags
);
8372 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8373 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8375 ASET (attrs
, coding_attr_iso_initial
, initial
);
8376 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8377 ASET (attrs
, coding_attr_iso_request
, request
);
8378 ASET (attrs
, coding_attr_iso_flags
, flags
);
8379 setup_iso_safe_charsets (attrs
);
8381 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8382 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8383 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8384 ? coding_category_iso_7_else
8385 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8386 ? coding_category_iso_7
8387 : coding_category_iso_7_tight
);
8390 int id
= XINT (AREF (initial
, 1));
8392 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8393 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8395 ? coding_category_iso_8_else
8396 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8397 ? coding_category_iso_8_1
8398 : coding_category_iso_8_2
);
8400 if (category
!= coding_category_iso_8_1
8401 && category
!= coding_category_iso_8_2
)
8402 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8404 else if (EQ (coding_type
, Qemacs_mule
))
8406 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8407 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8408 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8409 category
= coding_category_emacs_mule
;
8411 else if (EQ (coding_type
, Qshift_jis
))
8414 struct charset
*charset
;
/* shift_jis needs exactly three charsets, of dimensions one, one
   and two, in this order.  */
8416 if (XINT (Flength (charset_list
)) != 3)
8417 error ("There should be just three charsets");
8419 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8420 if (CHARSET_DIMENSION (charset
) != 1)
8421 error ("Dimension of charset %s is not one",
8422 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8423 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8424 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8426 charset_list
= XCDR (charset_list
);
8427 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8428 if (CHARSET_DIMENSION (charset
) != 1)
8429 error ("Dimension of charset %s is not one",
8430 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8432 charset_list
= XCDR (charset_list
);
8433 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8434 if (CHARSET_DIMENSION (charset
) != 2)
8435 error ("Dimension of charset %s is not two",
8436 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8438 category
= coding_category_sjis
;
8439 Vsjis_coding_system
= name
;
8441 else if (EQ (coding_type
, Qbig5
))
8443 struct charset
*charset
;
/* Big5 needs exactly two charsets, of dimensions one and two, in
   this order.  */
8445 if (XINT (Flength (charset_list
)) != 2)
8446 error ("There should be just two charsets");
8448 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8449 if (CHARSET_DIMENSION (charset
) != 1)
8450 error ("Dimension of charset %s is not one",
8451 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8452 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8453 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8455 charset_list
= XCDR (charset_list
);
8456 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8457 if (CHARSET_DIMENSION (charset
) != 2)
8458 error ("Dimension of charset %s is not two",
8459 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8461 category
= coding_category_big5
;
8462 Vbig5_coding_system
= name
;
8464 else if (EQ (coding_type
, Qraw_text
))
8466 category
= coding_category_raw_text
;
8467 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8469 else if (EQ (coding_type
, Qutf_8
))
8471 category
= coding_category_utf_8
;
8472 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8474 else if (EQ (coding_type
, Qundecided
))
8475 category
= coding_category_undecided
;
8477 error ("Invalid coding system type: %s",
8478 SDATA (SYMBOL_NAME (coding_type
)));
/* Record the category both as an attribute and at the head of the
   property list.  */
8480 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8481 CODING_ATTR_PLIST (attrs
)
8482 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8483 CODING_ATTR_PLIST (attrs
)));
8485 eol_type
= args
[coding_arg_eol_type
];
8486 if (! NILP (eol_type
)
8487 && ! EQ (eol_type
, Qunix
)
8488 && ! EQ (eol_type
, Qdos
)
8489 && ! EQ (eol_type
, Qmac
))
8490 error ("Invalid eol-type");
8492 aliases
= Fcons (name
, Qnil
);
/* With no explicit eol-type, register the three subsidiaries
   (unix, dos, mac), which share ATTRS with the base.  */
8494 if (NILP (eol_type
))
8496 eol_type
= make_subsidiaries (name
);
8497 for (i
= 0; i
< 3; i
++)
8499 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8501 this_name
= AREF (eol_type
, i
);
8502 this_aliases
= Fcons (this_name
, Qnil
);
8503 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8504 this_spec
= Fmake_vector (make_number (3), attrs
);
8505 ASET (this_spec
, 1, this_aliases
);
8506 ASET (this_spec
, 2, this_eol_type
);
8507 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8508 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8509 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8510 Vcoding_system_alist
);
8514 spec_vec
= Fmake_vector (make_number (3), attrs
);
8515 ASET (spec_vec
, 1, aliases
);
8516 ASET (spec_vec
, 2, eol_type
);
8518 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8519 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8520 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8521 Vcoding_system_alist
);
/* Make NAME the coding system of its category if the category has
   none yet, or refresh the category if NAME is already the one.  */
8524 int id
= coding_categories
[category
].id
;
8526 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8527 setup_coding_system (name
, &coding_categories
[category
]);
8533 return Fsignal (Qwrong_number_of_arguments
,
8534 Fcons (intern ("define-coding-system-internal"),
8535 make_number (nargs
)));
8539 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8540 Sdefine_coding_system_alias
, 2, 2, 0,
8541 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8542 (alias
, coding_system
)
8543 Lisp_Object alias
, coding_system
;
8545 Lisp_Object spec
, aliases
, eol_type
;
8547 CHECK_SYMBOL (alias
);
8548 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8549 aliases
= AREF (spec
, 1);
8550 /* ALIASES should be a list of length more than zero, and the first
8551 element is a base coding system. Append ALIAS at the tail of the
     list. */
8553 while (!NILP (XCDR (aliases
)))
8554 aliases
= XCDR (aliases
);
8555 XSETCDR (aliases
, Fcons (alias
, Qnil
));
/* If CODING-SYSTEM has subsidiaries (its eol-type is a vector),
   define the matching subsidiaries of ALIAS as aliases of them.  */
8557 eol_type
= AREF (spec
, 2);
8558 if (VECTORP (eol_type
))
8560 Lisp_Object subsidiaries
;
8563 subsidiaries
= make_subsidiaries (alias
);
8564 for (i
= 0; i
< 3; i
++)
8565 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8566 AREF (eol_type
, i
));
/* ALIAS shares the spec vector of CODING-SYSTEM.  */
8569 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8570 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
8571 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8572 Vcoding_system_alist
);
/* Lisp primitive `coding-system-base'.  A nil CODING-SYSTEM yields
   `no-conversion'; otherwise the base name is read out of the attribute
   vector held in slot 0 of the coding system's spec.
   NOTE(review): this listing has lost the DEFUN argument-count line, the
   parameter list and the function braces (gaps in the embedded line
   numbers); confirm them against the real source.  */
8577 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8579 doc
: /* Return the base of CODING-SYSTEM.
8580 Any alias or subsidiary coding system is not a base coding system. */)
8582 Lisp_Object coding_system
;
8584 Lisp_Object spec
, attrs
;
/* nil is answered directly with `no-conversion', without a spec lookup.  */
8586 if (NILP (coding_system
))
8587 return (Qno_conversion
);
/* Load the spec of CODING_SYSTEM into SPEC.  NOTE(review): the macro
   presumably signals an error for an invalid coding system -- confirm
   against its definition.  */
8588 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Slot 0 of a spec is the attribute vector.  */
8589 attrs
= AREF (spec
, 0);
8590 return CODING_ATTR_BASE_NAME (attrs
);
/* Lisp primitive `coding-system-plist'.  A nil CODING-SYSTEM is treated
   as `no-conversion'; the property list is then read out of the
   attribute vector held in slot 0 of the coding system's spec.
   NOTE(review): this listing has lost the DEFUN argument-count line, the
   parameter list and the function braces (gaps in the embedded line
   numbers); confirm them against the real source.  */
8593 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8595 doc
: /* Return the property list of CODING-SYSTEM. */)
8597 Lisp_Object coding_system
;
8599 Lisp_Object spec
, attrs
;
/* nil is shorthand for `no-conversion'.  */
8601 if (NILP (coding_system
))
8602 coding_system
= Qno_conversion
;
/* Load the spec of CODING_SYSTEM into SPEC.  NOTE(review): the macro
   presumably signals an error for an invalid coding system -- confirm
   against its definition.  */
8603 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Slot 0 of a spec is the attribute vector.  */
8604 attrs
= AREF (spec
, 0);
8605 return CODING_ATTR_PLIST (attrs
);
/* Lisp primitive `coding-system-aliases'.  A nil CODING-SYSTEM is
   treated as `no-conversion'.  The value is slot 1 of the spec itself,
   i.e. the same list object that `define-coding-system-alias' extends
   in place, not a copy.
   NOTE(review): this listing has lost the DEFUN argument-count line, the
   parameter list, the declaration of `spec' and the function braces
   (gaps in the embedded line numbers); confirm them against the real
   source.  */
8609 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8611 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8613 Lisp_Object coding_system
;
/* nil is shorthand for `no-conversion'.  */
8617 if (NILP (coding_system
))
8618 coding_system
= Qno_conversion
;
/* Load the spec of CODING_SYSTEM into SPEC.  NOTE(review): the macro
   presumably signals an error for an invalid coding system -- confirm
   against its definition.  */
8619 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Slot 1 of a spec is the alias list.  */
8620 return AREF (spec
, 1);
/* Lisp primitive `coding-system-eol-type'.  A nil CODING-SYSTEM is
   treated as `no-conversion'.  The eol-type is read from slot 2 of the
   spec: a vector is returned as a fresh copy, while a symbol is mapped
   to the integer 0, 1 or 2.
   NOTE(review): this listing has lost the parameter list, the
   declaration of `n', the function braces and the statement controlled
   by the CODING_SYSTEM_P test (gaps in the embedded line numbers);
   confirm them against the real source.  */
8623 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8624 Scoding_system_eol_type
, 1, 1, 0,
8625 doc
: /* Return eol-type of CODING-SYSTEM.
8626 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
8628 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8629 and CR respectively.
8631 A vector value indicates that a format of end-of-line should be
8632 detected automatically. Nth element of the vector is the subsidiary
8633 coding system whose eol-type is N. */)
8635 Lisp_Object coding_system
;
8637 Lisp_Object spec
, eol_type
;
/* nil is shorthand for `no-conversion'.  */
8640 if (NILP (coding_system
))
8641 coding_system
= Qno_conversion
;
/* NOTE(review): the statement this test controls (embedded line 8643)
   is missing from the listing; as printed, the `if' would wrongly guard
   the spec lookup below.  Check what is returned for a non-coding-system
   argument in the real source.  */
8642 if (! CODING_SYSTEM_P (coding_system
))
8644 spec
= CODING_SYSTEM_SPEC (coding_system
);
/* Slot 2 of a spec is the eol-type.  */
8645 eol_type
= AREF (spec
, 2);
/* A vector of subsidiaries is handed back as a copy rather than the
   stored object -- presumably so callers cannot modify the spec.  */
8646 if (VECTORP (eol_type
))
8647 return Fcopy_sequence (eol_type
);
/* Otherwise map the symbol to its number: `unix' is 0, `dos' is 1, and
   anything else (i.e. `mac') is 2.  */
8648 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8649 return make_number (n
);
8655 /*** 9. Post-amble ***/
8662 for (i
= 0; i
< coding_category_max
; i
++)
8664 coding_categories
[i
].id
= -1;
8665 coding_priorities
[i
] = i
;
8668 /* ISO2022 specific initialize routine. */
8669 for (i
= 0; i
< 0x20; i
++)
8670 iso_code_class
[i
] = ISO_control_0
;
8671 for (i
= 0x21; i
< 0x7F; i
++)
8672 iso_code_class
[i
] = ISO_graphic_plane_0
;
8673 for (i
= 0x80; i
< 0xA0; i
++)
8674 iso_code_class
[i
] = ISO_control_1
;
8675 for (i
= 0xA1; i
< 0xFF; i
++)
8676 iso_code_class
[i
] = ISO_graphic_plane_1
;
8677 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8678 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8679 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8680 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8681 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8682 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8683 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8684 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8685 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8687 for (i
= 0; i
< 256; i
++)
8689 emacs_mule_bytes
[i
] = 1;
8691 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8692 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8693 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8694 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8702 staticpro (&Vcoding_system_hash_table
);
8704 Lisp_Object args
[2];
8707 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
8710 staticpro (&Vsjis_coding_system
);
8711 Vsjis_coding_system
= Qnil
;
8713 staticpro (&Vbig5_coding_system
);
8714 Vbig5_coding_system
= Qnil
;
8716 staticpro (&Vcode_conversion_reused_workbuf
);
8717 Vcode_conversion_reused_workbuf
= Qnil
;
8719 staticpro (&Vcode_conversion_workbuf_name
);
8720 Vcode_conversion_workbuf_name
= build_string (" *code-conversion-work*");
8722 reused_workbuf_in_use
= 0;
8724 DEFSYM (Qcharset
, "charset");
8725 DEFSYM (Qtarget_idx
, "target-idx");
8726 DEFSYM (Qcoding_system_history
, "coding-system-history");
8727 Fset (Qcoding_system_history
, Qnil
);
8729 /* Target FILENAME is the first argument. */
8730 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8731 /* Target FILENAME is the third argument. */
8732 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8734 DEFSYM (Qcall_process
, "call-process");
8735 /* Target PROGRAM is the first argument. */
8736 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8738 DEFSYM (Qcall_process_region
, "call-process-region");
8739 /* Target PROGRAM is the third argument. */
8740 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8742 DEFSYM (Qstart_process
, "start-process");
8743 /* Target PROGRAM is the third argument. */
8744 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8746 DEFSYM (Qopen_network_stream
, "open-network-stream");
8747 /* Target SERVICE is the fourth argument. */
8748 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8750 DEFSYM (Qcoding_system
, "coding-system");
8751 DEFSYM (Qcoding_aliases
, "coding-aliases");
8753 DEFSYM (Qeol_type
, "eol-type");
8754 DEFSYM (Qunix
, "unix");
8755 DEFSYM (Qdos
, "dos");
8757 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8758 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8759 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8760 DEFSYM (Qdefault_char
, "default-char");
8761 DEFSYM (Qundecided
, "undecided");
8762 DEFSYM (Qno_conversion
, "no-conversion");
8763 DEFSYM (Qraw_text
, "raw-text");
8765 DEFSYM (Qiso_2022
, "iso-2022");
8767 DEFSYM (Qutf_8
, "utf-8");
8768 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
8770 DEFSYM (Qutf_16
, "utf-16");
8771 DEFSYM (Qbig
, "big");
8772 DEFSYM (Qlittle
, "little");
8774 DEFSYM (Qshift_jis
, "shift-jis");
8775 DEFSYM (Qbig5
, "big5");
8777 DEFSYM (Qcoding_system_p
, "coding-system-p");
8779 DEFSYM (Qcoding_system_error
, "coding-system-error");
8780 Fput (Qcoding_system_error
, Qerror_conditions
,
8781 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8782 Fput (Qcoding_system_error
, Qerror_message
,
8783 build_string ("Invalid coding system"));
8785 /* Intern this now in case it isn't already done.
8786 Setting this variable twice is harmless.
8787 But don't staticpro it here--that is done in alloc.c. */
8788 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8790 DEFSYM (Qtranslation_table
, "translation-table");
8791 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8792 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8793 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8794 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8796 DEFSYM (Qvalid_codes
, "valid-codes");
8798 DEFSYM (Qemacs_mule
, "emacs-mule");
8800 DEFSYM (QCcategory
, ":category");
8802 Vcoding_category_table
8803 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8804 staticpro (&Vcoding_category_table
);
8805 /* The following categories are targets of code detection. */
8806 ASET (Vcoding_category_table
, coding_category_iso_7
,
8807 intern ("coding-category-iso-7"));
8808 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8809 intern ("coding-category-iso-7-tight"));
8810 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8811 intern ("coding-category-iso-8-1"));
8812 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8813 intern ("coding-category-iso-8-2"));
8814 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8815 intern ("coding-category-iso-7-else"));
8816 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8817 intern ("coding-category-iso-8-else"));
8818 ASET (Vcoding_category_table
, coding_category_utf_8
,
8819 intern ("coding-category-utf-8"));
8820 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8821 intern ("coding-category-utf-16-be"));
8822 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8823 intern ("coding-category-utf-16-auto"));
8824 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8825 intern ("coding-category-utf-16-le"));
8826 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8827 intern ("coding-category-utf-16-be-nosig"));
8828 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8829 intern ("coding-category-utf-16-le-nosig"));
8830 ASET (Vcoding_category_table
, coding_category_charset
,
8831 intern ("coding-category-charset"));
8832 ASET (Vcoding_category_table
, coding_category_sjis
,
8833 intern ("coding-category-sjis"));
8834 ASET (Vcoding_category_table
, coding_category_big5
,
8835 intern ("coding-category-big5"));
8836 ASET (Vcoding_category_table
, coding_category_ccl
,
8837 intern ("coding-category-ccl"));
8838 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8839 intern ("coding-category-emacs-mule"));
8840 /* The following categories are NOT targets of code detection. */
8841 ASET (Vcoding_category_table
, coding_category_raw_text
,
8842 intern ("coding-category-raw-text"));
8843 ASET (Vcoding_category_table
, coding_category_undecided
,
8844 intern ("coding-category-undecided"));
8846 DEFSYM (Qinsufficient_source
, "insufficient-source");
8847 DEFSYM (Qinconsistent_eol
, "inconsistent-eol");
8848 DEFSYM (Qinvalid_source
, "invalid-source");
8849 DEFSYM (Qinterrupted
, "interrupted");
8850 DEFSYM (Qinsufficient_memory
, "insufficient-memory");
8852 defsubr (&Scoding_system_p
);
8853 defsubr (&Sread_coding_system
);
8854 defsubr (&Sread_non_nil_coding_system
);
8855 defsubr (&Scheck_coding_system
);
8856 defsubr (&Sdetect_coding_region
);
8857 defsubr (&Sdetect_coding_string
);
8858 defsubr (&Sfind_coding_systems_region_internal
);
8859 defsubr (&Sunencodable_char_position
);
8860 defsubr (&Scheck_coding_systems_region
);
8861 defsubr (&Sdecode_coding_region
);
8862 defsubr (&Sencode_coding_region
);
8863 defsubr (&Sdecode_coding_string
);
8864 defsubr (&Sencode_coding_string
);
8865 defsubr (&Sdecode_sjis_char
);
8866 defsubr (&Sencode_sjis_char
);
8867 defsubr (&Sdecode_big5_char
);
8868 defsubr (&Sencode_big5_char
);
8869 defsubr (&Sset_terminal_coding_system_internal
);
8870 defsubr (&Sset_safe_terminal_coding_system_internal
);
8871 defsubr (&Sterminal_coding_system
);
8872 defsubr (&Sset_keyboard_coding_system_internal
);
8873 defsubr (&Skeyboard_coding_system
);
8874 defsubr (&Sfind_operation_coding_system
);
8875 defsubr (&Sset_coding_system_priority
);
8876 defsubr (&Sdefine_coding_system_internal
);
8877 defsubr (&Sdefine_coding_system_alias
);
8878 defsubr (&Scoding_system_base
);
8879 defsubr (&Scoding_system_plist
);
8880 defsubr (&Scoding_system_aliases
);
8881 defsubr (&Scoding_system_eol_type
);
8882 defsubr (&Scoding_system_priority_list
);
8884 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8885 doc
: /* List of coding systems.
8887 Do not alter the value of this variable manually. This variable should be
8888 updated by the functions `define-coding-system' and
8889 `define-coding-system-alias'. */);
8890 Vcoding_system_list
= Qnil
;
8892 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8893 doc
: /* Alist of coding system names.
8894 Each element is a one-element list containing a coding system name.
8895 This variable is given to `completing-read' as TABLE argument.
8897 Do not alter the value of this variable manually. This variable should be
8898 updated by the functions `define-coding-system' and
8899 `define-coding-system-alias'. */);
8900 Vcoding_system_alist
= Qnil
;
8902 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8903 doc
: /* List of coding-categories (symbols) ordered by priority.
8905 On detecting a coding system, Emacs tries code detection algorithms
8906 associated with each coding-category one by one in this order. When
8907 one algorithm agrees with a byte sequence of source text, the coding
8908 system bound to the corresponding coding-category is selected. */);
8912 Vcoding_category_list
= Qnil
;
8913 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8914 Vcoding_category_list
8915 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8916 Vcoding_category_list
);
8919 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8920 doc
: /* Specify the coding system for read operations.
8921 It is useful to bind this variable with `let', but do not set it globally.
8922 If the value is a coding system, it is used for decoding on read operation.
8923 If not, an appropriate element is used from one of the coding system alists:
8924 There are three such tables, `file-coding-system-alist',
8925 `process-coding-system-alist', and `network-coding-system-alist'. */);
8926 Vcoding_system_for_read
= Qnil
;
8928 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8929 doc
: /* Specify the coding system for write operations.
8930 Programs bind this variable with `let', but you should not set it globally.
8931 If the value is a coding system, it is used for encoding of output,
8932 when writing it to a file and when sending it to a file or subprocess.
8934 If this does not specify a coding system, an appropriate element
8935 is used from one of the coding system alists:
8936 There are three such tables, `file-coding-system-alist',
8937 `process-coding-system-alist', and `network-coding-system-alist'.
8938 For output to files, if the above procedure does not specify a coding system,
8939 the value of `buffer-file-coding-system' is used. */);
8940 Vcoding_system_for_write
= Qnil
;
8942 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8944 Coding system used in the latest file or process I/O. */);
8945 Vlast_coding_system_used
= Qnil
;
8947 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error
,
8949 Error status of the last code conversion.
8951 When an error was detected in the last code conversion, this variable
8952 is set to one of the following symbols.
8953 `insufficient-source'
8957 `insufficient-memory'
8958 When no error was detected, the value doesn't change. So, to check
8959 the error status of a code conversion by this variable, you must
8960 explicitly set this variable to nil before performing code
8962 Vlast_code_conversion_error
= Qnil
;
8964 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8966 *Non-nil means always inhibit code conversion of end-of-line format.
8967 See info node `Coding Systems' and info node `Text and Binary' concerning
8968 such conversion. */);
8969 inhibit_eol_conversion
= 0;
8971 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8973 Non-nil means process buffer inherits coding system of process output.
8974 Bind it to t if the process output is to be treated as if it were a file
8975 read from some filesystem. */);
8976 inherit_process_coding_system
= 0;
8978 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8980 Alist to decide a coding system to use for a file I/O operation.
8981 The format is ((PATTERN . VAL) ...),
8982 where PATTERN is a regular expression matching a file name,
8983 VAL is a coding system, a cons of coding systems, or a function symbol.
8984 If VAL is a coding system, it is used for both decoding and encoding
8986 If VAL is a cons of coding systems, the car part is used for decoding,
8987 and the cdr part is used for encoding.
8988 If VAL is a function symbol, the function must return a coding system
8989 or a cons of coding systems which are used as above. The function gets
8990 the arguments with which `find-operation-coding-system' was called.
8992 See also the function `find-operation-coding-system'
8993 and the variable `auto-coding-alist'. */);
8994 Vfile_coding_system_alist
= Qnil
;
8996 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8998 Alist to decide a coding system to use for a process I/O operation.
8999 The format is ((PATTERN . VAL) ...),
9000 where PATTERN is a regular expression matching a program name,
9001 VAL is a coding system, a cons of coding systems, or a function symbol.
9002 If VAL is a coding system, it is used for both decoding what is received
9003 from the program and encoding what is sent to the program.
9004 If VAL is a cons of coding systems, the car part is used for decoding,
9005 and the cdr part is used for encoding.
9006 If VAL is a function symbol, the function must return a coding system
9007 or a cons of coding systems which are used as above.
9009 See also the function `find-operation-coding-system'. */);
9010 Vprocess_coding_system_alist
= Qnil
;
9012 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
9014 Alist to decide a coding system to use for a network I/O operation.
9015 The format is ((PATTERN . VAL) ...),
9016 where PATTERN is a regular expression matching a network service name
9017 or is a port number to connect to,
9018 VAL is a coding system, a cons of coding systems, or a function symbol.
9019 If VAL is a coding system, it is used for both decoding what is received
9020 from the network stream and encoding what is sent to the network stream.
9021 If VAL is a cons of coding systems, the car part is used for decoding,
9022 and the cdr part is used for encoding.
9023 If VAL is a function symbol, the function must return a coding system
9024 or a cons of coding systems which are used as above.
9026 See also the function `find-operation-coding-system'. */);
9027 Vnetwork_coding_system_alist
= Qnil
;
9029 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
9030 doc
: /* Coding system to use with system messages.
9031 Also used for decoding keyboard input on X Window system. */);
9032 Vlocale_coding_system
= Qnil
;
9034 /* The eol mnemonics are reset in startup.el system-dependently. */
9035 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
9037 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9038 eol_mnemonic_unix
= build_string (":");
9040 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
9042 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9043 eol_mnemonic_dos
= build_string ("\\");
9045 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
9047 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9048 eol_mnemonic_mac
= build_string ("/");
9050 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
9052 *String displayed in mode line when end-of-line format is not yet determined. */);
9053 eol_mnemonic_undecided
= build_string (":");
9055 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
9057 *Non-nil enables character translation while encoding and decoding. */);
9058 Venable_character_translation
= Qt
;
9060 DEFVAR_LISP ("standard-translation-table-for-decode",
9061 &Vstandard_translation_table_for_decode
,
9062 doc
: /* Table for translating characters while decoding. */);
9063 Vstandard_translation_table_for_decode
= Qnil
;
9065 DEFVAR_LISP ("standard-translation-table-for-encode",
9066 &Vstandard_translation_table_for_encode
,
9067 doc
: /* Table for translating characters while encoding. */);
9068 Vstandard_translation_table_for_encode
= Qnil
;
9070 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
9071 doc
: /* Alist of charsets vs revision numbers.
9072 While encoding, if a charset (car part of an element) is found,
9073 designate it with the escape sequence identifying revision (cdr part
9074 of the element). */);
9075 Vcharset_revision_table
= Qnil
;
9077 DEFVAR_LISP ("default-process-coding-system",
9078 &Vdefault_process_coding_system
,
9079 doc
: /* Cons of coding systems used for process I/O by default.
9080 The car part is used for decoding a process output,
9081 the cdr part is used for encoding a text to be sent to a process. */);
9082 Vdefault_process_coding_system
= Qnil
;
9084 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
9086 Table of extra Latin codes in the range 128..159 (inclusive).
9087 This is a vector of length 256.
9088 If Nth element is non-nil, the existence of code N in a file
9089 \(or output of subprocess) doesn't prevent it from being detected as
9090 a coding system of ISO 2022 variant which has a flag
9091 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9092 or reading output of a subprocess.
9093 Only the 128th through 159th elements have a meaning. */);
9094 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
9096 DEFVAR_LISP ("select-safe-coding-system-function",
9097 &Vselect_safe_coding_system_function
,
9099 Function to call to select safe coding system for encoding a text.
9101 If set, this function is called to force a user to select a proper
9102 coding system which can encode the text in the case that a default
9103 coding system used in each operation can't encode the text.
9105 The default value is `select-safe-coding-system' (which see). */);
9106 Vselect_safe_coding_system_function
= Qnil
;
9108 DEFVAR_BOOL ("coding-system-require-warning",
9109 &coding_system_require_warning
,
9110 doc
: /* Internal use only.
9111 If non-nil, on writing a file, `select-safe-coding-system-function' is
9112 called even if `coding-system-for-write' is non-nil. The command
9113 `universal-coding-system-argument' binds this variable to t temporarily. */);
9114 coding_system_require_warning
= 0;
9117 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9118 &inhibit_iso_escape_detection
,
9120 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9122 By default, on reading a file, Emacs tries to detect how the text is
9123 encoded. This code detection is sensitive to escape sequences. If
9124 the sequence is valid as ISO2022, the code is determined as one of
9125 the ISO2022 encodings, and the file is decoded by the corresponding
9126 coding system (e.g. `iso-2022-7bit').
9128 However, there may be a case that you want to read escape sequences in
9129 a file as is. In such a case, you can set this variable to non-nil.
9130 Then, as the code detection ignores any escape sequences, no file is
9131 detected as encoded in some ISO2022 encoding. The result is that all
9132 escape sequences become visible in a buffer.
9134 The default value is nil, and it is strongly recommended not to change
9135 it. That is because many Emacs Lisp source files that contain
9136 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9137 in Emacs's distribution, and they won't be decoded correctly on
9138 reading if you suppress escape sequence detection.
9140 The other way to read escape sequences in a file without decoding is
9141 to explicitly specify some coding system that doesn't use ISO2022's
9142 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
9143 inhibit_iso_escape_detection
= 0;
9145 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9146 doc
: /* Char table for translating self-inserting characters.
9147 This is applied to the result of input methods, not their input. See also
9148 `keyboard-translate-table'. */);
9149 Vtranslation_table_for_input
= Qnil
;
9152 Lisp_Object args
[coding_arg_max
];
9153 Lisp_Object plist
[16];
9156 for (i
= 0; i
< coding_arg_max
; i
++)
9159 plist
[0] = intern (":name");
9160 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9161 plist
[2] = intern (":mnemonic");
9162 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9163 plist
[4] = intern (":coding-type");
9164 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9165 plist
[6] = intern (":ascii-compatible-p");
9166 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9167 plist
[8] = intern (":default-char");
9168 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9169 plist
[10] = intern (":for-unibyte");
9170 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9171 plist
[12] = intern (":docstring");
9172 plist
[13] = build_string ("Do no conversion.\n\
9174 When you visit a file with this coding, the file is read into a\n\
9175 unibyte buffer as is, thus each byte of a file is treated as a\n\
9177 plist
[14] = intern (":eol-type");
9178 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9179 args
[coding_arg_plist
] = Flist (16, plist
);
9180 Fdefine_coding_system_internal (coding_arg_max
, args
);
9183 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9184 setup_coding_system (Qno_conversion
, &terminal_coding
);
9185 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9190 for (i
= 0; i
< coding_category_max
; i
++)
9191 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9196 emacs_strerror (error_number
)
9201 synchronize_system_messages_locale ();
9202 str
= strerror (error_number
);
9204 if (! NILP (Vlocale_coding_system
))
9206 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9207 Vlocale_coding_system
,
9209 str
= (char *) SDATA (dec
);