1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs; see the file COPYING. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA. */
28 /*** TABLE OF CONTENTS ***
32 2. Emacs' internal format (emacs-utf-8) handlers
35 5. Charset-base coding systems handlers
36 6. emacs-mule (old Emacs' internal format) handlers
38 8. Shift-JIS and BIG5 handlers
40 10. C library functions
41 11. Emacs Lisp library functions
46 /*** 0. General comments ***
51 A coding system is an object for an encoding mechanism that contains
52 information about how to convert byte sequences to character
53 sequences and vice versa. When we say "decode", it means converting
54 a byte sequence of a specific coding system into a character
55 sequence that is represented by Emacs' internal coding system
56 `emacs-utf-8', and when we say "encode", it means converting a
57 character sequence of emacs-utf-8 to a byte sequence of a specific
60 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
61 C level, a coding system is represented by a vector of attributes
62 stored in the hash table Vcoding_system_hash_table. The conversion from
63 coding system symbol to attributes vector is done by looking up
64 Vcoding_system_hash_table by the symbol.
66 Coding systems are classified into the following types depending on
67 the encoding mechanism. Here's a brief description of the types.
73 o Charset-base coding system
75 A coding system defined by one or more (coded) character sets.
76 Decoding and encoding are done by a code converter defined for each
79 o Old Emacs internal format (emacs-mule)
81 The coding system adopted by old versions of Emacs (20 and 21).
83 o ISO2022-base coding system
85 The most famous coding system for multiple character sets. X's
86 Compound Text, various EUCs (Extended Unix Code), and coding systems
87 used in the Internet communication such as ISO-2022-JP are all
90 o SJIS (or Shift-JIS or MS-Kanji-Code)
92 A coding system to encode character sets: ASCII, JISX0201, and
93 JISX0208. Widely used for PC's in Japan. Details are described in
98 A coding system to encode character sets: ASCII and Big5. Widely
99 used for Chinese (mainly in Taiwan and Hong Kong). Details are
100 described in section 8. In this file, when we write "big5" (all
101 lowercase), we mean the coding system, and when we write "Big5"
102 (capitalized), we mean the character set.
106 If a user wants to decode/encode text encoded in a coding system
107 not listed above, he can supply a decoder and an encoder for it in
108 CCL (Code Conversion Language) programs. Emacs executes the CCL
109 program while decoding/encoding.
113 A coding system for text containing raw eight-bit data. Emacs
114 treats each byte of source text as a character (except for
115 end-of-line conversion).
119 Like raw text, but don't do end-of-line conversion.
124 How text end-of-line is encoded depends on operating system. For
125 instance, Unix's format is just one byte of LF (line-feed) code,
126 whereas DOS's format is two-byte sequence of `carriage-return' and
127 `line-feed' codes. MacOS's format is usually one byte of
130 Since text character encoding and end-of-line encoding are
131 independent, any coding system described above can take any format
132 of end-of-line (except for no-conversion).
136 Before using a coding system for code conversion (i.e. decoding and
137 encoding), we setup a structure of type `struct coding_system'.
138 This structure keeps various information about a specific code
139 conversion (e.g. the location of source and destination data).
146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
148 These functions check if a byte sequence specified as a source in
149 CODING conforms to the format of XXX, and update the members of
152 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
154 Below is the template of these functions. */
158 detect_coding_XXX (coding
, detect_info
)
159 struct coding_system
*coding
;
160 struct coding_detection_info
*detect_info
;
162 const unsigned char *src
= coding
->source
;
163 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
164 int multibytep
= coding
->src_multibyte
;
165 int consumed_chars
= 0;
171 /* Get one byte from the source. If the source is exhausted, jump
172 to no_more_source:. */
175 if (! __C_conforms_to_XXX___ (c
))
177 /* Record XXX as found only when C is positive evidence for it.  */
if (__C_strongly_suggests_XXX__ (c))
178 found = CATEGORY_MASK_XXX;
180 /* The byte sequence is invalid for XXX. */
181 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
185 /* The source was exhausted successfully. */
186 detect_info
->found
|= found
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size.
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 const unsigned char *src
= coding
->source
+ coding
->consumed
;
211 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 const unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
218 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
224 if (charbuf >= charbuf_end)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce an encoded text until it
260 reaches the head of the not-yet-encoded source text.
262 Below is a template of these functions. */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
270 int *charbuf_end = coding->charbuf + coding->charbuf_used;
/* CHARBUF_END is one past the last of the CODING->charbuf_used
characters stored in CODING->charbuf.  CHARBUF itself is a plain
`int *', so the base must be taken from CODING.  */
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
;
307 extern Lisp_Object Qmac
; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qbig
, Qlittle
;
314 Lisp_Object Qcoding_system_history
;
315 Lisp_Object Qvalid_codes
;
316 Lisp_Object QCcategory
, QCmnemonic
, QCdefalut_char
;
317 Lisp_Object QCdecode_translation_table
, QCencode_translation_table
;
318 Lisp_Object QCpost_read_conversion
, QCpre_write_conversion
;
319 Lisp_Object QCascii_compatible_p
;
321 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
322 Lisp_Object Qcall_process
, Qcall_process_region
;
323 Lisp_Object Qstart_process
, Qopen_network_stream
;
324 Lisp_Object Qtarget_idx
;
326 Lisp_Object Qinsufficient_source
, Qinconsistent_eol
, Qinvalid_source
;
327 Lisp_Object Qinterrupted
, Qinsufficient_memory
;
329 /* If a symbol has this property, evaluate the value to define the
330 symbol as a coding system. */
331 static Lisp_Object Qcoding_system_define_form
;
333 int coding_system_require_warning
;
335 Lisp_Object Vselect_safe_coding_system_function
;
337 /* Mnemonic string for each format of end-of-line. */
338 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
339 /* Mnemonic string to indicate format of end-of-line is not yet
341 Lisp_Object eol_mnemonic_undecided
;
343 /* Format of end-of-line decided by system. This is Qunix on
344 Unix and Mac, Qdos on DOS/Windows.
345 This has an effect only for external encoding (i.e. for output to
346 file and process), not for in-buffer or Lisp string encoding. */
347 static Lisp_Object system_eol_type
;
351 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
353 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
355 /* Coding system emacs-mule and raw-text are for converting only
356 end-of-line format. */
357 Lisp_Object Qemacs_mule
, Qraw_text
;
358 Lisp_Object Qutf_8_emacs
;
360 /* Coding-systems are handed between Emacs Lisp programs and C internal
361 routines by the following three variables. */
362 /* Coding-system for reading files and receiving data from process. */
363 Lisp_Object Vcoding_system_for_read
;
364 /* Coding-system for writing files and sending data to process. */
365 Lisp_Object Vcoding_system_for_write
;
366 /* Coding-system actually used in the latest I/O. */
367 Lisp_Object Vlast_coding_system_used
;
368 /* Set to non-nil when an error is detected during code conversion. */
369 Lisp_Object Vlast_code_conversion_error
;
370 /* A vector of length 256 which contains information about special
371 Latin codes (especially for dealing with Microsoft codes). */
372 Lisp_Object Vlatin_extra_code_table
;
374 /* Flag to inhibit code conversion of end-of-line format. */
375 int inhibit_eol_conversion
;
377 /* Flag to inhibit ISO2022 escape sequence detection. */
378 int inhibit_iso_escape_detection
;
380 /* Flag to make buffer-file-coding-system inherit from process-coding. */
381 int inherit_process_coding_system
;
383 /* Coding system to be used to encode text for terminal display. */
384 struct coding_system terminal_coding
;
386 /* Coding system to be used to encode text for terminal display when
387 terminal coding system is nil. */
388 struct coding_system safe_terminal_coding
;
390 /* Coding system of what is sent from terminal keyboard. */
391 struct coding_system keyboard_coding
;
393 Lisp_Object Vfile_coding_system_alist
;
394 Lisp_Object Vprocess_coding_system_alist
;
395 Lisp_Object Vnetwork_coding_system_alist
;
397 Lisp_Object Vlocale_coding_system
;
401 /* Flag to tell if we look up translation table on character code
403 Lisp_Object Venable_character_translation
;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode
;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode
;
409 Lisp_Object Qtranslation_table
;
410 Lisp_Object Qtranslation_table_id
;
411 Lisp_Object Qtranslation_table_for_decode
;
412 Lisp_Object Qtranslation_table_for_encode
;
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table
;
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system
;
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input
;
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system
;
425 Lisp_Object Vbig5_coding_system
;
427 /* ISO2022 section */
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 ((charset_id <= (coding)->max_charset_id \
437 ? (coding)->safe_charsets[charset_id] \
441 #define CODING_ISO_FLAGS(coding) \
442 ((coding)->spec.iso_2022.flags)
443 #define CODING_ISO_DESIGNATION(coding, reg) \
444 ((coding)->spec.iso_2022.current_designation[reg])
445 #define CODING_ISO_INVOCATION(coding, plane) \
446 ((coding)->spec.iso_2022.current_invocation[plane])
447 #define CODING_ISO_SINGLE_SHIFTING(coding) \
448 ((coding)->spec.iso_2022.single_shifting)
449 #define CODING_ISO_BOL(coding) \
450 ((coding)->spec.iso_2022.bol)
451 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
452 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
454 /* Control characters of ISO2022. */
455 /* code */ /* function */
456 #define ISO_CODE_LF 0x0A /* line-feed */
457 #define ISO_CODE_CR 0x0D /* carriage-return */
458 #define ISO_CODE_SO 0x0E /* shift-out */
459 #define ISO_CODE_SI 0x0F /* shift-in */
460 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
461 #define ISO_CODE_ESC 0x1B /* escape */
462 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
463 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
464 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
466 /* All code (1-byte) of ISO2022 is classified into one of the
468 enum iso_code_class_type
470 ISO_control_0
, /* Control codes in the range
471 0x00..0x1F and 0x7F, except for the
472 following 4 codes. */
473 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
474 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
475 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
476 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
477 ISO_control_1
, /* Control codes in the range
478 0x80..0x9F, except for the
479 following 3 codes. */
480 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
481 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
482 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
483 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
484 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
485 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
486 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
489 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
490 `iso-flags' attribute of an iso2022 coding system. */
492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
493 instead of the correct short-form sequence (e.g. ESC $ A). */
494 #define CODING_ISO_FLAG_LONG_FORM 0x0001
496 /* If set, reset graphic planes and registers at end-of-line to the
498 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
500 /* If set, reset graphic planes and registers before any control
501 characters to the initial state. */
502 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
504 /* If set, encode by 7-bit environment. */
505 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
507 /* If set, use locking-shift function. */
508 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
510 /* If set, use single-shift function. Overrides
511 CODING_ISO_FLAG_LOCKING_SHIFT. */
512 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
514 /* If set, use designation escape sequence. */
515 #define CODING_ISO_FLAG_DESIGNATION 0x0040
517 /* If set, produce revision number sequence. */
518 #define CODING_ISO_FLAG_REVISION 0x0080
520 /* If set, produce ISO6429's direction specifying sequence. */
521 #define CODING_ISO_FLAG_DIRECTION 0x0100
523 /* If set, assume designation states are reset at beginning of line on
525 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
527 /* If set, designation sequence should be placed at beginning of line
529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
531 /* If set, do not encode unsafe characters on output. */
532 #define CODING_ISO_FLAG_SAFE 0x0800
534 /* If set, extra latin codes (128..159) are accepted as a valid code
536 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
538 #define CODING_ISO_FLAG_COMPOSITION 0x2000
540 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
542 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
544 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
546 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
548 /* A character to be produced on output if encoding of the original
549 character is prohibited by CODING_ISO_FLAG_SAFE. */
550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
554 #define CODING_UTF_16_BOM(coding) \
555 ((coding)->spec.utf_16.bom)
557 #define CODING_UTF_16_ENDIAN(coding) \
558 ((coding)->spec.utf_16.endian)
560 #define CODING_UTF_16_SURROGATE(coding) \
561 ((coding)->spec.utf_16.surrogate)
565 #define CODING_CCL_DECODER(coding) \
566 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
567 #define CODING_CCL_ENCODER(coding) \
568 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
569 #define CODING_CCL_VALIDS(coding) \
570 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
572 /* Index for each coding category in `coding_categories' */
576 coding_category_iso_7
,
577 coding_category_iso_7_tight
,
578 coding_category_iso_8_1
,
579 coding_category_iso_8_2
,
580 coding_category_iso_7_else
,
581 coding_category_iso_8_else
,
582 coding_category_utf_8
,
583 coding_category_utf_16_auto
,
584 coding_category_utf_16_be
,
585 coding_category_utf_16_le
,
586 coding_category_utf_16_be_nosig
,
587 coding_category_utf_16_le_nosig
,
588 coding_category_charset
,
589 coding_category_sjis
,
590 coding_category_big5
,
592 coding_category_emacs_mule
,
593 /* All above are targets of code detection. */
594 coding_category_raw_text
,
595 coding_category_undecided
,
599 /* Definitions of flag bits used in detect_coding_XXXX. */
600 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
601 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
602 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
603 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
604 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
605 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
606 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
607 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
608 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
609 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
610 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
611 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
612 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
613 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
614 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
615 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
616 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
617 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
619 /* This value is returned if detect_coding_mask () finds nothing other
620 than ASCII characters. */
621 #define CATEGORY_MASK_ANY \
622 (CATEGORY_MASK_ISO_7 \
623 | CATEGORY_MASK_ISO_7_TIGHT \
624 | CATEGORY_MASK_ISO_8_1 \
625 | CATEGORY_MASK_ISO_8_2 \
626 | CATEGORY_MASK_ISO_7_ELSE \
627 | CATEGORY_MASK_ISO_8_ELSE \
628 | CATEGORY_MASK_UTF_8 \
629 | CATEGORY_MASK_UTF_16_BE \
630 | CATEGORY_MASK_UTF_16_LE \
631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
632 | CATEGORY_MASK_UTF_16_LE_NOSIG \
633 | CATEGORY_MASK_CHARSET \
634 | CATEGORY_MASK_SJIS \
635 | CATEGORY_MASK_BIG5 \
636 | CATEGORY_MASK_CCL \
637 | CATEGORY_MASK_EMACS_MULE)
640 #define CATEGORY_MASK_ISO_7BIT \
641 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
643 #define CATEGORY_MASK_ISO_8BIT \
644 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
646 #define CATEGORY_MASK_ISO_ELSE \
647 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
649 #define CATEGORY_MASK_ISO_ESCAPE \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE)
655 #define CATEGORY_MASK_ISO \
656 ( CATEGORY_MASK_ISO_7BIT \
657 | CATEGORY_MASK_ISO_8BIT \
658 | CATEGORY_MASK_ISO_ELSE)
660 #define CATEGORY_MASK_UTF_16 \
661 (CATEGORY_MASK_UTF_16_BE \
662 | CATEGORY_MASK_UTF_16_LE \
663 | CATEGORY_MASK_UTF_16_BE_NOSIG \
664 | CATEGORY_MASK_UTF_16_LE_NOSIG)
667 /* List of symbols `coding-category-xxx' ordered by priority. This
668 variable is exposed to Emacs Lisp. */
669 static Lisp_Object Vcoding_category_list
;
671 /* Table of coding categories (Lisp symbols). This variable is for
673 static Lisp_Object Vcoding_category_table
;
675 /* Table of coding-categories ordered by priority. */
676 static enum coding_category coding_priorities
[coding_category_max
];
678 /* Nth element is a coding context for the coding system bound to the
679 Nth coding category. */
680 static struct coding_system coding_categories
[coding_category_max
];
682 /*** Commonly used macros and functions ***/
685 #define min(a, b) ((a) < (b) ? (a) : (b))
688 #define max(a, b) ((a) > (b) ? (a) : (b))
691 #define CODING_GET_INFO(coding, attrs, charset_list) \
693 (attrs) = CODING_ID_ATTRS ((coding)->id); \
694 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
698 /* Safely get one byte from the source text pointed by SRC which ends
699 at SRC_END, and set C to that byte. If there are not enough bytes
700 in the source, it jumps to `no_more_source'. If multibytep is
701 nonzero, and a multibyte character is found at SRC, set C to the
702 negative value of the character code. The caller should declare
703 and set these variables appropriately in advance:
704 src, src_base, src_end, multibytep, coding */
706 #define ONE_MORE_BYTE(c) \
708 if (src == src_end) \
710 if (src_base < src) \
711 record_conversion_result \
712 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
713 goto no_more_source; \
716 if (multibytep && (c & 0x80)) \
718 if ((c & 0xFE) == 0xC0) \
719 c = ((c & 1) << 6) | *src++; \
723 c = - string_char (src, &src, NULL); \
724 record_conversion_result \
725 (coding, CODING_RESULT_INVALID_SRC); \
732 #define ONE_MORE_BYTE_NO_CHECK(c) \
735 if (multibytep && (c & 0x80)) \
737 if ((c & 0xFE) == 0xC0) \
738 c = ((c & 1) << 6) | *src++; \
742 c = - string_char (src, &src, NULL); \
743 record_conversion_result \
744 (coding, CODING_RESULT_INVALID_SRC); \
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. The caller should
753 assure that C is 0..127, and declare and set the variable `dst'
754 appropriately in advance.
758 #define EMIT_ONE_ASCII_BYTE(c) \
765 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
767 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
769 produced_chars += 2; \
770 *dst++ = (c1), *dst++ = (c2); \
774 /* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
776 nonzero, store in an appropriate multibyte form. The caller should
777 declare and set the variables `dst' and `multibytep' appropriately
780 #define EMIT_ONE_BYTE(c) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
795 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
797 #define EMIT_TWO_BYTES(c1, c2) \
799 produced_chars += 2; \
806 ch = BYTE8_TO_CHAR (ch); \
807 CHAR_STRING_ADVANCE (ch, dst); \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
821 #define EMIT_THREE_BYTES(c1, c2, c3) \
823 EMIT_ONE_BYTE (c1); \
824 EMIT_TWO_BYTES (c2, c3); \
828 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
830 EMIT_TWO_BYTES (c1, c2); \
831 EMIT_TWO_BYTES (c3, c4); \
835 /* Prototypes for static functions. */
836 static void record_conversion_result
P_ ((struct coding_system
*coding
,
837 enum coding_result_code result
));
838 static int detect_coding_utf_8
P_ ((struct coding_system
*,
839 struct coding_detection_info
*info
));
840 static void decode_coding_utf_8
P_ ((struct coding_system
*));
841 static int encode_coding_utf_8
P_ ((struct coding_system
*));
843 static int detect_coding_utf_16
P_ ((struct coding_system
*,
844 struct coding_detection_info
*info
));
845 static void decode_coding_utf_16
P_ ((struct coding_system
*));
846 static int encode_coding_utf_16
P_ ((struct coding_system
*));
848 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
849 struct coding_detection_info
*info
));
850 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
851 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
853 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
854 struct coding_detection_info
*info
));
855 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
856 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
858 static int detect_coding_sjis
P_ ((struct coding_system
*,
859 struct coding_detection_info
*info
));
860 static void decode_coding_sjis
P_ ((struct coding_system
*));
861 static int encode_coding_sjis
P_ ((struct coding_system
*));
863 static int detect_coding_big5
P_ ((struct coding_system
*,
864 struct coding_detection_info
*info
));
865 static void decode_coding_big5
P_ ((struct coding_system
*));
866 static int encode_coding_big5
P_ ((struct coding_system
*));
868 static int detect_coding_ccl
P_ ((struct coding_system
*,
869 struct coding_detection_info
*info
));
870 static void decode_coding_ccl
P_ ((struct coding_system
*));
871 static int encode_coding_ccl
P_ ((struct coding_system
*));
873 static void decode_coding_raw_text
P_ ((struct coding_system
*));
874 static int encode_coding_raw_text
P_ ((struct coding_system
*));
876 static void coding_set_source
P_ ((struct coding_system
*));
877 static void coding_set_destination
P_ ((struct coding_system
*));
878 static void coding_alloc_by_realloc
P_ ((struct coding_system
*, EMACS_INT
));
879 static void coding_alloc_by_making_gap
P_ ((struct coding_system
*,
881 static unsigned char *alloc_destination
P_ ((struct coding_system
*,
882 EMACS_INT
, unsigned char *));
883 static void setup_iso_safe_charsets
P_ ((Lisp_Object
));
884 static unsigned char *encode_designation_at_bol
P_ ((struct coding_system
*,
887 static int detect_eol
P_ ((const unsigned char *,
888 EMACS_INT
, enum coding_category
));
889 static Lisp_Object adjust_coding_eol_type
P_ ((struct coding_system
*, int));
890 static void decode_eol
P_ ((struct coding_system
*));
891 static Lisp_Object get_translation_table
P_ ((Lisp_Object
, int, int *));
892 static Lisp_Object get_translation
P_ ((Lisp_Object
, int *, int *,
894 static int produce_chars
P_ ((struct coding_system
*, Lisp_Object
, int));
895 static INLINE
void produce_composition
P_ ((struct coding_system
*, int *,
897 static INLINE
void produce_charset
P_ ((struct coding_system
*, int *,
899 static void produce_annotation
P_ ((struct coding_system
*, EMACS_INT
));
900 static int decode_coding
P_ ((struct coding_system
*));
901 static INLINE
int *handle_composition_annotation
P_ ((EMACS_INT
, EMACS_INT
,
902 struct coding_system
*,
903 int *, EMACS_INT
*));
904 static INLINE
int *handle_charset_annotation
P_ ((EMACS_INT
, EMACS_INT
,
905 struct coding_system
*,
906 int *, EMACS_INT
*));
907 static void consume_chars
P_ ((struct coding_system
*, Lisp_Object
, int));
908 static int encode_coding
P_ ((struct coding_system
*));
909 static Lisp_Object make_conversion_work_buffer
P_ ((int));
910 static Lisp_Object code_conversion_restore
P_ ((Lisp_Object
));
911 static INLINE
int char_encodable_p
P_ ((int, Lisp_Object
));
912 static Lisp_Object make_subsidiaries
P_ ((Lisp_Object
));
915 record_conversion_result (struct coding_system
*coding
,
916 enum coding_result_code result
)
918 coding
->result
= result
;
921 case CODING_RESULT_INSUFFICIENT_SRC
:
922 Vlast_code_conversion_error
= Qinsufficient_source
;
924 case CODING_RESULT_INCONSISTENT_EOL
:
925 Vlast_code_conversion_error
= Qinconsistent_eol
;
927 case CODING_RESULT_INVALID_SRC
:
928 Vlast_code_conversion_error
= Qinvalid_source
;
930 case CODING_RESULT_INTERRUPT
:
931 Vlast_code_conversion_error
= Qinterrupted
;
933 case CODING_RESULT_INSUFFICIENT_MEM
:
934 Vlast_code_conversion_error
= Qinsufficient_memory
;
937 Vlast_code_conversion_error
= intern ("Unknown error");
/* Decode CODE of CHARSET into the character C.

   The flag charset_map_loaded is cleared before DECODE_CHAR runs and
   tested afterwards.  When it has been set, the source address of
   CODING is recomputed with coding_set_source, and SRC, SRC_BASE and
   SRC_END are shifted by the distance CODING->source moved so that
   they keep pointing at the same source bytes.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                        \
    charset_map_loaded = 0;                                                   \
    c = DECODE_CHAR (charset, code);                                          \
    if (charset_map_loaded)                                                   \
      {                                                                       \
        const unsigned char *orig = coding->source;                           \
        EMACS_INT offset;                                                     \
                                                                              \
        coding_set_source (coding);                                           \
        offset = coding->source - orig;                                       \
        src += offset;                                                        \
        src_base += offset;                                                   \
        src_end += offset;                                                    \
      }                                                                       \
  } while (0)
/* Make sure BYTES more bytes can be stored at DST.

   Expects the locals `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end' of the calling encoder.  When DST + BYTES reaches
   DST_END, the destination is enlarged through alloc_destination by
   BYTES plus one byte per element still pending in the character
   buffer, and DST / DST_END are refreshed from the new destination.  */
#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
973 coding_set_source (coding
)
974 struct coding_system
*coding
;
976 if (BUFFERP (coding
->src_object
))
978 struct buffer
*buf
= XBUFFER (coding
->src_object
);
980 if (coding
->src_pos
< 0)
981 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
983 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
985 else if (STRINGP (coding
->src_object
))
987 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
990 /* Otherwise, the source is C string and is never relocated
991 automatically. Thus we don't have to update anything. */
996 coding_set_destination (coding
)
997 struct coding_system
*coding
;
999 if (BUFFERP (coding
->dst_object
))
1001 if (coding
->src_pos
< 0)
1003 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
1004 coding
->dst_bytes
= (GAP_END_ADDR
1005 - (coding
->src_bytes
- coding
->consumed
)
1006 - coding
->destination
);
1010 /* We are sure that coding->dst_pos_byte is before the gap
1012 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
1013 + coding
->dst_pos_byte
- 1);
1014 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
1015 - coding
->destination
);
1019 /* Otherwise, the destination is C string and is never relocated
1020 automatically. Thus we don't have to update anything. */
1026 coding_alloc_by_realloc (coding
, bytes
)
1027 struct coding_system
*coding
;
1030 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
1031 coding
->dst_bytes
+ bytes
);
1032 coding
->dst_bytes
+= bytes
;
1036 coding_alloc_by_making_gap (coding
, bytes
)
1037 struct coding_system
*coding
;
1040 if (BUFFERP (coding
->dst_object
)
1041 && EQ (coding
->src_object
, coding
->dst_object
))
1043 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
1045 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
1047 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
1051 Lisp_Object this_buffer
;
1053 this_buffer
= Fcurrent_buffer ();
1054 set_buffer_internal (XBUFFER (coding
->dst_object
));
1056 set_buffer_internal (XBUFFER (this_buffer
));
1061 static unsigned char *
1062 alloc_destination (coding
, nbytes
, dst
)
1063 struct coding_system
*coding
;
1067 EMACS_INT offset
= dst
- coding
->destination
;
1069 if (BUFFERP (coding
->dst_object
))
1070 coding_alloc_by_making_gap (coding
, nbytes
);
1072 coding_alloc_by_realloc (coding
, nbytes
);
1073 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1074 coding_set_destination (coding
);
1075 dst
= coding
->destination
+ offset
;
/** Macros for annotations.  */

/* Maximum length of annotation data (sum of annotations for
   composition and charset).  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)

/* An annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the three-element annotation header at BUF, advance BUF past
   it, and flag the coding system (the local `coding') as annotated.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Store a composition annotation (header plus METHOD) at BUF.  */
#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)

/* Store a charset annotation (header plus charset ID) at BUF.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
/* Predicates classifying one octet C of a UTF-8 style byte sequence:
   a single-octet (ASCII) byte, a trailing octet (10xxxxxx), or the
   leading octet of a 2-, 3-, 4- or 5-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-8.  If it is, return 1, else
   return 0.  */
1145 detect_coding_utf_8 (coding
, detect_info
)
1146 struct coding_system
*coding
;
1147 struct coding_detection_info
*detect_info
;
1149 const unsigned char *src
= coding
->source
, *src_base
;
1150 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1151 int multibytep
= coding
->src_multibyte
;
1152 int consumed_chars
= 0;
1155 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1156 /* A coding system of this category is always ASCII compatible. */
1157 src
+= coding
->head_ascii
;
1161 int c
, c1
, c2
, c3
, c4
;
1165 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1168 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1170 if (UTF_8_2_OCTET_LEADING_P (c
))
1172 found
= CATEGORY_MASK_UTF_8
;
1176 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1178 if (UTF_8_3_OCTET_LEADING_P (c
))
1180 found
= CATEGORY_MASK_UTF_8
;
1184 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1186 if (UTF_8_4_OCTET_LEADING_P (c
))
1188 found
= CATEGORY_MASK_UTF_8
;
1192 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1194 if (UTF_8_5_OCTET_LEADING_P (c
))
1196 found
= CATEGORY_MASK_UTF_8
;
1201 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1205 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1207 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1210 detect_info
->found
|= found
;
1216 decode_coding_utf_8 (coding
)
1217 struct coding_system
*coding
;
1219 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1220 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1221 const unsigned char *src_base
;
1222 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1223 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
1224 int consumed_chars
= 0, consumed_chars_base
;
1225 int multibytep
= coding
->src_multibyte
;
1226 Lisp_Object attr
, charset_list
;
1228 CODING_GET_INFO (coding
, attr
, charset_list
);
1232 int c
, c1
, c2
, c3
, c4
, c5
;
1235 consumed_chars_base
= consumed_chars
;
1237 if (charbuf
>= charbuf_end
)
1245 else if (UTF_8_1_OCTET_P(c1
))
1252 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1254 if (UTF_8_2_OCTET_LEADING_P (c1
))
1256 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1257 /* Reject overlong sequences here and below. Encoders
1258 producing them are incorrect, they can be misleading,
1259 and they mess up read/write invariance. */
1266 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1268 if (UTF_8_3_OCTET_LEADING_P (c1
))
1270 c
= (((c1
& 0xF) << 12)
1271 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1273 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1279 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1281 if (UTF_8_4_OCTET_LEADING_P (c1
))
1283 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1284 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1291 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1293 if (UTF_8_5_OCTET_LEADING_P (c1
))
1295 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1296 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1298 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1313 consumed_chars
= consumed_chars_base
;
1315 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1320 coding
->consumed_char
+= consumed_chars_base
;
1321 coding
->consumed
= src_base
- coding
->source
;
1322 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1327 encode_coding_utf_8 (coding
)
1328 struct coding_system
*coding
;
1330 int multibytep
= coding
->dst_multibyte
;
1331 int *charbuf
= coding
->charbuf
;
1332 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1333 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1334 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1335 int produced_chars
= 0;
1340 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1342 while (charbuf
< charbuf_end
)
1344 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1346 ASSURE_DESTINATION (safe_room
);
1348 if (CHAR_BYTE8_P (c
))
1350 c
= CHAR_TO_BYTE8 (c
);
1355 CHAR_STRING_ADVANCE (c
, pend
);
1356 for (p
= str
; p
< pend
; p
++)
1363 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1365 while (charbuf
< charbuf_end
)
1367 ASSURE_DESTINATION (safe_room
);
1369 if (CHAR_BYTE8_P (c
))
1370 *dst
++ = CHAR_TO_BYTE8 (c
);
1372 dst
+= CHAR_STRING (c
, dst
);
1376 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1377 coding
->produced_char
+= produced_chars
;
1378 coding
->produced
= dst
- coding
->destination
;
1383 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1384 Check if a text is encoded in one of UTF-16 based coding systems.
1385 If it is, return 1, else return 0. */
/* Non-zero if the 16-bit unit VAL is a high (leading) surrogate,
   0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if the 16-bit unit VAL is a low (trailing) surrogate,
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* Non-zero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1400 detect_coding_utf_16 (coding
, detect_info
)
1401 struct coding_system
*coding
;
1402 struct coding_detection_info
*detect_info
;
1404 const unsigned char *src
= coding
->source
, *src_base
= src
;
1405 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1406 int multibytep
= coding
->src_multibyte
;
1407 int consumed_chars
= 0;
1410 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1411 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1412 && (coding
->src_chars
& 1))
1414 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1420 if ((c1
== 0xFF) && (c2
== 0xFE))
1422 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1423 | CATEGORY_MASK_UTF_16_AUTO
);
1424 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1425 | CATEGORY_MASK_UTF_16_BE_NOSIG
1426 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1428 else if ((c1
== 0xFE) && (c2
== 0xFF))
1430 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1431 | CATEGORY_MASK_UTF_16_AUTO
);
1432 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1433 | CATEGORY_MASK_UTF_16_BE_NOSIG
1434 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1436 else if (c1
>= 0 && c2
>= 0)
1438 detect_info
->rejected
1439 |= (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_LE
);
1446 decode_coding_utf_16 (coding
)
1447 struct coding_system
*coding
;
1449 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1450 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1451 const unsigned char *src_base
;
1452 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1453 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
1454 int consumed_chars
= 0, consumed_chars_base
;
1455 int multibytep
= coding
->src_multibyte
;
1456 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1457 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1458 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1459 Lisp_Object attr
, charset_list
;
1461 CODING_GET_INFO (coding
, attr
, charset_list
);
1463 if (bom
== utf_16_with_bom
)
1472 if (endian
== utf_16_big_endian
1473 ? c
!= 0xFEFF : c
!= 0xFFFE)
1475 /* The first two bytes are not BOM. Treat them as bytes
1476 for a normal character. */
1480 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1482 else if (bom
== utf_16_detect_bom
)
1484 /* We have already tried to detect BOM and failed in
1486 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1494 consumed_chars_base
= consumed_chars
;
1496 if (charbuf
+ 2 >= charbuf_end
)
1508 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1512 c
= (endian
== utf_16_big_endian
1513 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1516 if (! UTF_16_LOW_SURROGATE_P (c
))
1518 if (endian
== utf_16_big_endian
)
1519 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1521 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1525 if (UTF_16_HIGH_SURROGATE_P (c
))
1526 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1532 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1533 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1534 *charbuf
++ = 0x10000 + c
;
1539 if (UTF_16_HIGH_SURROGATE_P (c
))
1540 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1547 coding
->consumed_char
+= consumed_chars_base
;
1548 coding
->consumed
= src_base
- coding
->source
;
1549 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1553 encode_coding_utf_16 (coding
)
1554 struct coding_system
*coding
;
1556 int multibytep
= coding
->dst_multibyte
;
1557 int *charbuf
= coding
->charbuf
;
1558 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1559 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1560 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1562 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1563 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1564 int produced_chars
= 0;
1565 Lisp_Object attrs
, charset_list
;
1568 CODING_GET_INFO (coding
, attrs
, charset_list
);
1570 if (bom
!= utf_16_without_bom
)
1572 ASSURE_DESTINATION (safe_room
);
1574 EMIT_TWO_BYTES (0xFE, 0xFF);
1576 EMIT_TWO_BYTES (0xFF, 0xFE);
1577 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1580 while (charbuf
< charbuf_end
)
1582 ASSURE_DESTINATION (safe_room
);
1584 if (c
>= MAX_UNICODE_CHAR
)
1585 c
= coding
->default_char
;
1590 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1592 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1599 c1
= (c
>> 10) + 0xD800;
1600 c2
= (c
& 0x3FF) + 0xDC00;
1602 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1604 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1607 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1608 coding
->produced
= dst
- coding
->destination
;
1609 coding
->produced_char
+= produced_chars
;
1614 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1616 /* Emacs' internal format for representation of multiple character
1617 sets is a kind of multi-byte encoding, i.e. characters are
1618 represented by variable-length sequences of one-byte codes.
1620 ASCII characters and control characters (e.g. `tab', `newline') are
1621 represented by one-byte sequences which are their ASCII codes, in
1622 the range 0x00 through 0x7F.
1624 8-bit characters of the range 0x80..0x9F are represented by
1625 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1628 8-bit characters of the range 0xA0..0xFF are represented by
1629 one-byte sequences which are their 8-bit code.
1631 The other characters are represented by a sequence of `base
1632 leading-code', optional `extended leading-code', and one or two
1633 `position-code's. The length of the sequence is determined by the
1634 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1635 whereas extended leading-code and position-code take the range 0xA0
1636 through 0xFF. See `charset.h' for more details about leading-code
1639 --- CODE RANGE of Emacs' internal format ---
1643 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1644 eight-bit-graphic 0xA0..0xFF
1645 ELSE 0x81..0x9D + [0xA0..0xFF]+
1646 ---------------------------------------------
1648 As this is the internal character representation, the format is
1649 usually not used externally (i.e. in a file or in a data sent to a
1650 process). But, it is possible to have a text externally in this
1651 format (i.e. by encoding by the coding system `emacs-mule').
1653 In that case, a sequence of one-byte codes has a slightly different
1656 At first, all characters in eight-bit-control are represented by
1657 one-byte sequences which are their 8-bit code.
1659 Next, character composition data are represented by the byte
1660 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1662 METHOD is 0xF2 plus one of composition method (enum
1663 composition_method),
1665 BYTES is 0xA0 plus a byte length of this composition data,
1667 CHARS is 0xA0 plus a number of characters composed by this
1670 COMPONENTs are characters of multibyte form or composition
1671 rules encoded by two-byte of ASCII codes.
1673 In addition, for backward compatibility, the following formats are
1674 also recognized as composition data on decoding.
1677 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1680 MSEQ is a multibyte form but in these special format:
1681 ASCII: 0xA0 ASCII_CODE+0x80,
1682 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1683 RULE is a one byte code of the range 0xA0..0xF0 that
1684 represents a composition rule.
1687 char emacs_mule_bytes
[256];
1690 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1691 struct coding_system
*coding
;
1692 const unsigned char *src
;
1693 int *nbytes
, *nchars
, *id
;
1695 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1696 const unsigned char *src_base
= src
;
1697 int multibytep
= coding
->src_multibyte
;
1698 struct charset
*charset
;
1701 int consumed_chars
= 0;
1707 charset
= emacs_mule_charset
[0];
1713 /* Old style component character of a composition. */
1723 switch (emacs_mule_bytes
[c
])
1726 if (! (charset
= emacs_mule_charset
[c
]))
1735 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1736 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1739 if (c
< 0xA0 || ! (charset
= emacs_mule_charset
[c
]))
1748 if (! (charset
= emacs_mule_charset
[c
]))
1753 code
= (c
& 0x7F) << 8;
1763 if (c
< 0 || ! (charset
= emacs_mule_charset
[c
]))
1768 code
= (c
& 0x7F) << 8;
1777 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1778 ? charset_ascii
: charset_eight_bit
);
1784 c
= DECODE_CHAR (charset
, code
);
1788 *nbytes
= src
- src_base
;
1789 *nchars
= consumed_chars
;
1802 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1803 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1807 detect_coding_emacs_mule (coding
, detect_info
)
1808 struct coding_system
*coding
;
1809 struct coding_detection_info
*detect_info
;
1811 const unsigned char *src
= coding
->source
, *src_base
;
1812 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1813 int multibytep
= coding
->src_multibyte
;
1814 int consumed_chars
= 0;
1818 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1819 /* A coding system of this category is always ASCII compatible. */
1820 src
+= coding
->head_ascii
;
1830 /* Perhaps the start of composite character. We simply skip
1831 it because analyzing it is too heavy for detecting. But,
1832 at least, we check that the composite character
1833 consists of more than 4 bytes. */
1834 const unsigned char *src_base
;
1844 if (src
- src_base
<= 4)
1846 found
= CATEGORY_MASK_EMACS_MULE
;
1854 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1859 int more_bytes
= emacs_mule_bytes
[*src_base
] - 1;
1861 while (more_bytes
> 0)
1866 src
--; /* Unread the last byte. */
1871 if (more_bytes
!= 0)
1873 found
= CATEGORY_MASK_EMACS_MULE
;
1876 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1880 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1882 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1885 detect_info
->found
|= found
;
1890 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1892 /* Decode a character represented as a component of composition
1893 sequence of Emacs 20/21 style at SRC. Set C to that character and
1894 update SRC to the head of next character (or an encoded composition
1895 rule). If SRC doesn't point to a composition component, set C to -1.
1896 If SRC points to an invalid byte sequence, global exit by a return
1899 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1903 int nbytes, nchars; \
1905 if (src == src_end) \
1907 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1912 goto invalid_code; \
1916 consumed_chars += nchars; \
1921 /* Decode a composition rule represented as a component of composition
1922 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1923 and increment BUF. If SRC points an invalid byte sequence, set C
1926 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1928 int c, gref, nref; \
1930 if (src >= src_end) \
1931 goto invalid_code; \
1932 ONE_MORE_BYTE_NO_CHECK (c); \
1934 if (c < 0 || c >= 81) \
1935 goto invalid_code; \
1937 gref = c / 9, nref = c % 9; \
1938 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1942 /* Decode a composition rule represented as a component of composition
1943 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1944 and increment BUF. If SRC points an invalid byte sequence, set C
1947 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1951 if (src + 1>= src_end) \
1952 goto invalid_code; \
1953 ONE_MORE_BYTE_NO_CHECK (gref); \
1955 ONE_MORE_BYTE_NO_CHECK (nref); \
1957 if (gref < 0 || gref >= 81 \
1958 || nref < 0 || nref >= 81) \
1959 goto invalid_code; \
1960 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1964 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1966 /* Emacs 21 style format. The first three bytes at SRC are \
1967 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1968 the byte length of this composition information, CHARS is the \
1969 number of characters composed by this composition. */ \
1970 enum composition_method method = c - 0xF2; \
1971 int *charbuf_base = charbuf; \
1972 int consumed_chars_limit; \
1973 int nbytes, nchars; \
1975 ONE_MORE_BYTE (c); \
1977 goto invalid_code; \
1978 nbytes = c - 0xA0; \
1980 goto invalid_code; \
1981 ONE_MORE_BYTE (c); \
1983 goto invalid_code; \
1984 nchars = c - 0xA0; \
1985 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
1986 consumed_chars_limit = consumed_chars_base + nbytes; \
1987 if (method != COMPOSITION_RELATIVE) \
1990 while (consumed_chars < consumed_chars_limit) \
1992 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1993 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1995 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1998 if (consumed_chars < consumed_chars_limit) \
1999 goto invalid_code; \
2000 charbuf_base[0] -= i; \
2005 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2007 /* Emacs 20 style format for relative composition. */ \
2008 /* Store multibyte form of characters to be composed. */ \
2009 enum composition_method method = COMPOSITION_RELATIVE; \
2010 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2011 int *buf = components; \
2015 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2016 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2017 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2019 goto invalid_code; \
2020 ADD_COMPOSITION_DATA (charbuf, i, method); \
2021 for (j = 0; j < i; j++) \
2022 *charbuf++ = components[j]; \
2026 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2028 /* Emacs 20 style format for rule-base composition. */ \
2029 /* Store multibyte form of characters to be composed. */ \
2030 enum composition_method method = COMPOSITION_WITH_RULE; \
2031 int *charbuf_base = charbuf; \
2032 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2033 int *buf = components; \
2036 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2037 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2041 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2042 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2044 if (i <= 1 || (buf - components) % 2 == 0) \
2045 goto invalid_code; \
2046 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2047 goto no_more_source; \
2048 ADD_COMPOSITION_DATA (charbuf, i, method); \
2050 for (j = 0; j < i; j++) \
2051 *charbuf++ = components[j]; \
2052 charbuf_base[0] -= i; \
2053 for (j = 0; j < i; j += 2) \
2054 *charbuf++ = components[j]; \
2059 decode_coding_emacs_mule (coding
)
2060 struct coding_system
*coding
;
2062 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2063 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2064 const unsigned char *src_base
;
2065 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
2067 = coding
->charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
2068 int consumed_chars
= 0, consumed_chars_base
;
2069 int multibytep
= coding
->src_multibyte
;
2070 Lisp_Object attrs
, charset_list
;
2071 int char_offset
= coding
->produced_char
;
2072 int last_offset
= char_offset
;
2073 int last_id
= charset_ascii
;
2075 CODING_GET_INFO (coding
, attrs
, charset_list
);
2082 consumed_chars_base
= consumed_chars
;
2084 if (charbuf
>= charbuf_end
)
2103 if (c
- 0xF2 >= COMPOSITION_RELATIVE
2104 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
2105 DECODE_EMACS_MULE_21_COMPOSITION (c
);
2107 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
2109 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
2113 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
2119 consumed_chars
= consumed_chars_base
;
2120 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2129 if (last_id
!= charset_ascii
)
2130 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
2132 last_offset
= char_offset
;
2136 consumed_chars
+= nchars
;
2145 consumed_chars
= consumed_chars_base
;
2147 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2153 if (last_id
!= charset_ascii
)
2154 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
2155 coding
->consumed_char
+= consumed_chars_base
;
2156 coding
->consumed
= src_base
- coding
->source
;
2157 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Store in CODES[0..1] the emacs-mule leading code(s) for the charset
   whose emacs-mule ID is ID.

   An ID below 0xA0 is itself the single leading code and CODES[1] is
   set to 0.  Any other ID is an extended leading code stored in
   CODES[1], preceded in CODES[0] by the base leading code selected by
   its range: 0x9A for IDs below 0xE0, 0x9B below 0xF0, 0x9C below
   0xF5, and 0x9D otherwise.  ID is evaluated once.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    int leading_id_ = (id);                             \
                                                        \
    if (leading_id_ < 0xA0)                             \
      (codes)[0] = leading_id_, (codes)[1] = 0;         \
    else if (leading_id_ < 0xE0)                        \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF0)                        \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF5)                        \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_;      \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_;      \
  } while (0)
2177 encode_coding_emacs_mule (coding
)
2178 struct coding_system
*coding
;
2180 int multibytep
= coding
->dst_multibyte
;
2181 int *charbuf
= coding
->charbuf
;
2182 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2183 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2184 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2186 int produced_chars
= 0;
2187 Lisp_Object attrs
, charset_list
;
2189 int preferred_charset_id
= -1;
2191 CODING_GET_INFO (coding
, attrs
, charset_list
);
2192 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2194 CODING_ATTR_CHARSET_LIST (attrs
)
2195 = charset_list
= Vemacs_mule_charset_list
;
2198 while (charbuf
< charbuf_end
)
2200 ASSURE_DESTINATION (safe_room
);
2205 /* Handle an annotation. */
2208 case CODING_ANNOTATE_COMPOSITION_MASK
:
2209 /* Not yet implemented. */
2211 case CODING_ANNOTATE_CHARSET_MASK
:
2212 preferred_charset_id
= charbuf
[3];
2213 if (preferred_charset_id
>= 0
2214 && NILP (Fmemq (make_number (preferred_charset_id
),
2216 preferred_charset_id
= -1;
2225 if (ASCII_CHAR_P (c
))
2226 EMIT_ONE_ASCII_BYTE (c
);
2227 else if (CHAR_BYTE8_P (c
))
2229 c
= CHAR_TO_BYTE8 (c
);
2234 struct charset
*charset
;
2238 unsigned char leading_codes
[2];
2240 if (preferred_charset_id
>= 0)
2242 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2243 if (! CHAR_CHARSET_P (c
, charset
))
2244 charset
= char_charset (c
, charset_list
, NULL
);
2247 charset
= char_charset (c
, charset_list
, &code
);
2250 c
= coding
->default_char
;
2251 if (ASCII_CHAR_P (c
))
2253 EMIT_ONE_ASCII_BYTE (c
);
2256 charset
= char_charset (c
, charset_list
, &code
);
2258 dimension
= CHARSET_DIMENSION (charset
);
2259 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2260 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2261 EMIT_ONE_BYTE (leading_codes
[0]);
2262 if (leading_codes
[1])
2263 EMIT_ONE_BYTE (leading_codes
[1]);
2265 EMIT_ONE_BYTE (code
| 0x80);
2269 EMIT_ONE_BYTE (code
>> 8);
2270 EMIT_ONE_BYTE (code
& 0xFF);
2274 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2275 coding
->produced_char
+= produced_chars
;
2276 coding
->produced
= dst
- coding
->destination
;
2281 /*** 7. ISO2022 handlers ***/
2283 /* The following note describes the coding system ISO2022 briefly.
2284 Since the intention of this note is to help understand the
2285 functions in this file, some parts are NOT ACCURATE or are OVERLY
2286 SIMPLIFIED. For thorough understanding, please refer to the
2287 original document of ISO2022. This is equivalent to the standard
2288 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2290 ISO2022 provides many mechanisms to encode several character sets
2291 in 7-bit and 8-bit environments. For 7-bit environments, all text
2292 is encoded using bytes less than 128. This may make the encoded
2293 text a little bit longer, but the text passes more easily through
2294 several types of gateway, some of which strip off the MSB (Most
2297 There are two kinds of character sets: control character sets and
2298 graphic character sets. The former contain control characters such
2299 as `newline' and `escape' to provide control functions (control
2300 functions are also provided by escape sequences). The latter
2301 contain graphic characters such as 'A' and '-'. Emacs recognizes
2302 two control character sets and many graphic character sets.
2304 Graphic character sets are classified into one of the following
2305 four classes, according to the number of bytes (DIMENSION) and
2306 number of characters in one dimension (CHARS) of the set:
2307 - DIMENSION1_CHARS94
2308 - DIMENSION1_CHARS96
2309 - DIMENSION2_CHARS94
2310 - DIMENSION2_CHARS96
2312 In addition, each character set is assigned an identification tag,
2313 unique for each set, called the "final character" (denoted as <F>
2314 hereafter). The <F> of each character set is decided by ECMA(*)
2315 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2316 (0x30..0x3F are for private use only).
2318 Note (*): ECMA = European Computer Manufacturers Association
2320 Here are examples of graphic character sets [NAME(<F>)]:
2321 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2322 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2323 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2324 o DIMENSION2_CHARS96 -- none for the moment
2326 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2327 C0 [0x00..0x1F] -- control character plane 0
2328 GL [0x20..0x7F] -- graphic character plane 0
2329 C1 [0x80..0x9F] -- control character plane 1
2330 GR [0xA0..0xFF] -- graphic character plane 1
2332 A control character set is directly designated and invoked to C0 or
2333 C1 by an escape sequence. The most common case is that:
2334 - ISO646's control character set is designated/invoked to C0, and
2335 - ISO6429's control character set is designated/invoked to C1,
2336 and usually these designations/invocations are omitted in encoded
2337 text. In a 7-bit environment, only C0 can be used, and a control
2338 character for C1 is encoded by an appropriate escape sequence to
2339 fit into the environment. All control characters for C1 are
2340 defined to have corresponding escape sequences.
2342 A graphic character set is at first designated to one of four
2343 graphic registers (G0 through G3), then these graphic registers are
2344 invoked to GL or GR. These designations and invocations can be
2345 done independently. The most common case is that G0 is invoked to
2346 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2347 these invocations and designations are omitted in encoded text.
2348 In a 7-bit environment, only GL can be used.
2350 When a graphic character set of CHARS94 is invoked to GL, codes
2351 0x20 and 0x7F of the GL area work as control characters SPACE and
2352 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used.
2355 There are two ways of invocation: locking-shift and single-shift.
2356 With locking-shift, the invocation lasts until the next different
2357 invocation, whereas with single-shift, the invocation affects the
2358 following character only and doesn't affect the locking-shift
2359 state. Invocations are done by the following control characters or escape sequences:
2362 ----------------------------------------------------------------------
2363 abbrev function cntrl escape seq description
2364 ----------------------------------------------------------------------
2365 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2366 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2367 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2368 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2369 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2370 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2371 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
2372 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2373 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2374 ----------------------------------------------------------------------
2375 (*) These are not used by any known coding system.
2377 Control characters for these functions are defined by macros
2378 ISO_CODE_XXX in `coding.h'.
2380 Designations are done by the following escape sequences:
2381 ----------------------------------------------------------------------
2382 escape sequence description
2383 ----------------------------------------------------------------------
2384 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2385 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2386 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2387 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2388 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2389 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2390 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2391 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2392 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2393 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2394 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2395 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2396 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2397 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2398 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2399 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2400 ----------------------------------------------------------------------
2402 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2403 of dimension 1, chars 94, and final character <F>, etc...
2405 Note (*): Although these designations are not allowed in ISO2022,
2406 Emacs accepts them on decoding, and produces them on encoding
2407 CHARS96 character sets in a coding system which is characterized as
2408 7-bit environment, non-locking-shift, and non-single-shift.
2410 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2411 '(' must be omitted. We refer to this as "short-form" hereafter.
2413 Now you may notice that there are a lot of ways of encoding the
2414 same multilingual text in ISO2022. Actually, there exist many
2415 coding systems such as Compound Text (used in X11's inter-client
2416 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2417 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2418 localized platforms), and all of these are variants of ISO2022.
2420 In addition to the above, Emacs handles two more kinds of escape
2421 sequences: ISO6429's direction specification and Emacs' private
2422 sequence for specifying character composition.
2424 ISO6429's direction specification takes the following form:
2425 o CSI ']' -- end of the current direction
2426 o CSI '0' ']' -- end of the current direction
2427 o CSI '1' ']' -- start of left-to-right text
2428 o CSI '2' ']' -- start of right-to-left text
2429 The control character CSI (0x9B: control sequence introducer) is
2430 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2432 Character composition specification takes the following form:
2433 o ESC '0' -- start relative composition
2434 o ESC '1' -- end composition
2435 o ESC '2' -- start rule-base composition (*)
2436 o ESC '3' -- start relative composition with alternate chars (**)
2437 o ESC '4' -- start rule-base composition with alternate chars (**)
2438 Since these are not standard escape sequences of any ISO standard,
2439 the use of them with these meanings is restricted to Emacs only.
2441 (*) This form is used only in Emacs 20.7 and older versions,
2442 but newer versions can safely decode it.
2443 (**) This form is used only in Emacs 21.1 and newer versions,
2444 and older versions can't decode it.
2446 Here's a list of example usages of these composition escape
2447 sequences (categorized by `enum composition_method').
2449 COMPOSITION_RELATIVE:
2450 ESC 0 CHAR [ CHAR ] ESC 1
2451 COMPOSITION_WITH_RULE:
2452 ESC 2 CHAR [ RULE CHAR ] ESC 1
2453 COMPOSITION_WITH_ALTCHARS:
2454 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2455 COMPOSITION_WITH_RULE_ALTCHARS:
2456 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Classification table with one `enum iso_code_class_type' entry per
   byte value (256 slots).  decode_coding_iso_2022 switches on
   iso_code_class[c1] to decide how to handle the byte C1.  */
2458 enum iso_code_class_type iso_code_class
[256];
/* Yield nonzero when charset ID has a usable slot in CODING's
   safe-charset table: ID must not exceed CODING->max_charset_id, and
   the table element stored for ID must be non-negative.  The table is
   consulted only after the range test has passed.  */
#define SAFE_CHARSET_P(coding, id)			\
  (! ((id) > (coding)->max_charset_id		\
      || (coding)->safe_charsets[id] < 0))
/* Yield nonzero unless CODING_ISO_INITIAL reports a negative value for
   graphic register 1 of the coding system stored in
   coding_categories[CATEGORY].  */
#define SHIFT_OUT_OK(category)	\
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
/* Compute the safe-charsets string of the ISO-2022 coding system whose
   attribute vector is ATTRS and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string gets max_charset_id + 1 elements, where max_charset_id is
   raised to at least the largest charset id found in the coding
   system's charset list.  For each listed charset, the element at
   index XINT (id) is set with SSET to a register number: the value
   paired with the id in the coding_attr_iso_request alist, or REG96 /
   REG94 (the cdr / car of coding_attr_iso_usage).

   NOTE(review): this excerpt omits some lines of the function (braces,
   a few declarations, the second argument of Fmake_string and the
   tests selecting between the three SSET calls), so the exact
   conditions are not all visible here.  */
2469 setup_iso_safe_charsets (attrs
)
2472 Lisp_Object charset_list
, safe_charsets
;
2473 Lisp_Object request
;
2474 Lisp_Object reg_usage
;
2477 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2480 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* With CODING_ISO_FLAG_FULL_SUPPORT set and a charset list other than
   Viso_2022_charset_list, switch the coding system to that list and
   reset the safe-charsets attribute to nil.  */
2481 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2482 && ! EQ (charset_list
, Viso_2022_charset_list
))
2484 CODING_ATTR_CHARSET_LIST (attrs
)
2485 = charset_list
= Viso_2022_charset_list
;
2486 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
/* Test whether the safe-charsets attribute already holds a string.
   NOTE(review): the statement governed by this test is not visible
   here; presumably an early return -- confirm.  */
2489 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
/* Scan the charset list for the largest charset id.  */
2493 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2495 int id
= XINT (XCAR (tail
));
2496 if (max_charset_id
< id
)
2497 max_charset_id
= id
;
/* Allocate the table: one element per charset id up to the maximum.  */
2500 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2502 request
= AREF (attrs
, coding_attr_iso_request
);
2503 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
/* REG94 is the car and REG96 the cdr of the usage pair.  */
2504 reg94
= XINT (XCAR (reg_usage
));
2505 reg96
= XINT (XCDR (reg_usage
));
/* Fill in the table entry of every charset in the list.  */
2507 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2511 struct charset
*charset
;
2514 charset
= CHARSET_FROM_ID (XINT (id
));
2515 reg
= Fcdr (Fassq (id
, request
));
2517 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2518 else if (charset
->iso_chars_96
)
2521 SSET (safe_charsets
, XINT (id
), reg96
);
2526 SSET (safe_charsets
, XINT (id
), reg94
);
/* Publish the finished table in ATTRS.  */
2529 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2533 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534 Check if a text is encoded in one of the ISO-2022 based coding systems.
2535 If it is, return 1, else return 0. */
2538 detect_coding_iso_2022 (coding
, detect_info
)
2539 struct coding_system
*coding
;
2540 struct coding_detection_info
*detect_info
;
2542 const unsigned char *src
= coding
->source
, *src_base
= src
;
2543 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2544 int multibytep
= coding
->src_multibyte
;
2545 int single_shifting
= 0;
2548 int consumed_chars
= 0;
2553 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2555 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2557 struct coding_system
*this = &(coding_categories
[i
]);
2558 Lisp_Object attrs
, val
;
2560 attrs
= CODING_ID_ATTRS (this->id
);
2561 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2562 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2563 setup_iso_safe_charsets (attrs
);
2564 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2565 this->max_charset_id
= SCHARS (val
) - 1;
2566 this->safe_charsets
= (char *) SDATA (val
);
2569 /* A coding system of this category is always ASCII compatible. */
2570 src
+= coding
->head_ascii
;
2572 while (rejected
!= CATEGORY_MASK_ISO
)
2579 if (inhibit_iso_escape_detection
)
2581 single_shifting
= 0;
2583 if (c
>= '(' && c
<= '/')
2585 /* Designation sequence for a charset of dimension 1. */
2587 if (c1
< ' ' || c1
>= 0x80
2588 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2589 /* Invalid designation sequence. Just ignore. */
2594 /* Designation sequence for a charset of dimension 2. */
2596 if (c
>= '@' && c
<= 'B')
2597 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2598 id
= iso_charset_table
[1][0][c
];
2599 else if (c
>= '(' && c
<= '/')
2602 if (c1
< ' ' || c1
>= 0x80
2603 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2604 /* Invalid designation sequence. Just ignore. */
2608 /* Invalid designation sequence. Just ignore it. */
2611 else if (c
== 'N' || c
== 'O')
2613 /* ESC <Fe> for SS2 or SS3. */
2614 single_shifting
= 1;
2615 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2618 else if (c
>= '0' && c
<= '4')
2620 /* ESC <Fp> for start/end composition. */
2621 found
|= CATEGORY_MASK_ISO
;
2626 /* Invalid escape sequence. Just ignore it. */
2630 /* We found a valid designation sequence for CHARSET. */
2631 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2632 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2634 found
|= CATEGORY_MASK_ISO_7
;
2636 rejected
|= CATEGORY_MASK_ISO_7
;
2637 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2639 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2641 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2642 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2644 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2646 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2647 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2649 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2651 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2656 /* Locking shift out/in. */
2657 if (inhibit_iso_escape_detection
)
2659 single_shifting
= 0;
2660 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2661 found
|= CATEGORY_MASK_ISO_ELSE
;
2665 /* Control sequence introducer. */
2666 single_shifting
= 0;
2667 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2668 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2669 goto check_extra_latin
;
2674 if (inhibit_iso_escape_detection
)
2676 single_shifting
= 0;
2677 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2678 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2679 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2680 found
|= CATEGORY_MASK_ISO_8_1
, single_shifting
= 1;
2681 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2682 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2683 found
|= CATEGORY_MASK_ISO_8_2
, single_shifting
= 1;
2684 if (single_shifting
)
2686 goto check_extra_latin
;
2693 single_shifting
= 0;
2698 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2699 found
|= CATEGORY_MASK_ISO_8_1
;
2700 /* Check the length of succeeding codes of the range
2701 0xA0..0xFF. If the byte length is even, we include
2702 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2703 only when we are not single shifting. */
2704 if (! single_shifting
2705 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2708 while (src
< src_end
)
2716 if (i
& 1 && src
< src_end
)
2717 rejected
|= CATEGORY_MASK_ISO_8_2
;
2719 found
|= CATEGORY_MASK_ISO_8_2
;
2724 single_shifting
= 0;
2725 if (! VECTORP (Vlatin_extra_code_table
)
2726 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2728 rejected
= CATEGORY_MASK_ISO
;
2731 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2732 & CODING_ISO_FLAG_LATIN_EXTRA
)
2733 found
|= CATEGORY_MASK_ISO_8_1
;
2735 rejected
|= CATEGORY_MASK_ISO_8_1
;
2736 rejected
|= CATEGORY_MASK_ISO_8_2
;
2739 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2743 detect_info
->rejected
|= rejected
;
2744 detect_info
->found
|= (found
& ~rejected
);
2749 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2750 escape sequence should be kept. */
/* DECODE_DESIGNATION looks up the charset id for (DIM, CHARS_96, FINAL)
   with ISO_CHARSET_TABLE and records it as the designation of graphic
   register REG in CODING.  When FINAL is below '0' or at least 128,
   the lookup yields a negative id, or the id fails SAFE_CHARSET_P,
   the designation of REG is set to -2 instead.  charset_jisx0201_roman
   is replaced by charset_ascii under CODING_ISO_FLAG_USE_ROMAN, and
   charset_jisx0208_1978 by charset_jisx0208 under
   CODING_ISO_FLAG_USE_OLDJIS.  The macro uses the variables `coding',
   `id' and `prev' of the expansion site.
   NOTE(review): several lines of this macro (braces and the statements
   governed by some of the tests) are not visible in this excerpt.  */
2751 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2755 if (final < '0' || final >= 128 \
2756 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2757 || !SAFE_CHARSET_P (coding, id)) \
2759 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2763 prev = CODING_ISO_DESIGNATION (coding, reg); \
2764 if (id == charset_jisx0201_roman) \
2766 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2767 id = charset_ascii; \
2769 else if (id == charset_jisx0208_1978) \
2771 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2772 id = charset_jisx0208; \
2774 CODING_ISO_DESIGNATION (coding, reg) = id; \
2775 /* If there was an invalid designation to REG previously, and this \
2776 designation is ASCII to REG, we should keep this designation \
2778 if (prev == -2 && id == charset_ascii) \
/* MAYBE_FINISH_COMPOSITION flushes the characters buffered in
   `components' to CHARBUF and resets composition_state to
   COMPOSING_NO.  It jumps to no_more_source when
   charbuf + component_idx would pass charbuf_end.  For
   COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every buffered
   element is copied and char_offset grows by component_idx; for the
   other methods every second element is copied (i += 2) and
   char_offset grows by component_idx / 2 + 1.
   NOTE(review): the statement governed by the COMPOSING_NO test is not
   visible in this excerpt; presumably it leaves the macro early.  */
2783 #define MAYBE_FINISH_COMPOSITION() \
2786 if (composition_state == COMPOSING_NO) \
2788 /* It is assured that we have enough room for producing \
2789 characters stored in the table `components'. */ \
2790 if (charbuf + component_idx > charbuf_end) \
2791 goto no_more_source; \
2792 composition_state = COMPOSING_NO; \
2793 if (method == COMPOSITION_RELATIVE \
2794 || method == COMPOSITION_WITH_ALTCHARS) \
2796 for (i = 0; i < component_idx; i++) \
2797 *charbuf++ = components[i]; \
2798 char_offset += component_idx; \
2802 for (i = 0; i < component_idx; i += 2) \
2803 *charbuf++ = components[i]; \
2804 char_offset += (component_idx / 2) + 1; \
2809 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2810 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2811 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2812 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2813 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 */
/* DECODE_COMPOSITION_START handles the byte C1 that followed ESC.
   One visible path (entered when composition_state is
   COMPOSING_COMPONENT_RULE, among conditions not all shown) marks the
   end of the alternate components: component_len takes the current
   component_idx and the state becomes COMPOSING_CHAR.
   The other path first calls MAYBE_FINISH_COMPOSITION, requires room
   for MAX_COMPOSITION_COMPONENTS entries in CHARBUF, and scans the
   source from SRC for the terminating ESC '1'.  When the scan reaches
   src_end - 1 it records CODING_RESULT_INSUFFICIENT_SRC and jumps to
   no_more_source.  Otherwise METHOD is chosen from C1 ('0', '2', '3',
   else) and the state becomes COMPOSING_CHAR for C1 <= '2' or
   COMPOSING_COMPONENT_CHAR for larger values, with both component
   counters reset to 0.
   NOTE(review): many lines of this macro are not visible in this
   excerpt, including the start of the first condition.  */
2816 #define DECODE_COMPOSITION_START(c1) \
2819 && composition_state == COMPOSING_COMPONENT_RULE) \
2821 component_len = component_idx; \
2822 composition_state = COMPOSING_CHAR; \
2826 const unsigned char *p; \
2828 MAYBE_FINISH_COMPOSITION (); \
2829 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2830 goto no_more_source; \
2831 for (p = src; p < src_end - 1; p++) \
2832 if (*p == ISO_CODE_ESC && p[1] == '1') \
2834 if (p == src_end - 1) \
2836 /* The current composition doesn't end in the current \
2838 record_conversion_result \
2839 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2840 goto no_more_source; \
2843 /* This is surely the start of a composition. */ \
2844 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2845 : c1 == '2' ? COMPOSITION_WITH_RULE \
2846 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2847 : COMPOSITION_WITH_RULE_ALTCHARS); \
2848 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2849 : COMPOSING_COMPONENT_CHAR); \
2850 component_idx = component_len = 0; \
2855 /* Handle composition end sequence ESC 1. */
/* DECODE_COMPOSITION_END closes the current composition.  NCHARS is
   component_idx - component_len when alternate components were
   recorded (component_len > 0), else component_idx for
   COMPOSITION_RELATIVE, else (component_idx + 1) / 2.  The macro emits
   the annotation with ADD_COMPOSITION_DATA, copies the component
   entries for non-relative methods, stores saved_charbuf - charbuf
   into the slot at SAVED_CHARBUF, then copies the composed characters
   themselves while counting each one in char_offset.  Finally it sets
   coding->annotated and resets composition_state to COMPOSING_NO.
   NOTE(review): braces and `else' lines of this macro are not visible
   in this excerpt, so the exact pairing of the loops is unconfirmed.  */
2857 #define DECODE_COMPOSITION_END() \
2859 int nchars = (component_len > 0 ? component_idx - component_len \
2860 : method == COMPOSITION_RELATIVE ? component_idx \
2861 : (component_idx + 1) / 2); \
2863 int *saved_charbuf = charbuf; \
2865 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2866 if (method != COMPOSITION_RELATIVE) \
2868 if (component_len == 0) \
2869 for (i = 0; i < component_idx; i++) \
2870 *charbuf++ = components[i]; \
2872 for (i = 0; i < component_len; i++) \
2873 *charbuf++ = components[i]; \
2874 *saved_charbuf = saved_charbuf - charbuf; \
2876 if (method == COMPOSITION_WITH_RULE) \
2877 for (i = 0; i < component_idx; i += 2, char_offset++) \
2878 *charbuf++ = components[i]; \
2880 for (i = component_len; i < component_idx; i++, char_offset++) \
2881 *charbuf++ = components[i]; \
2882 coding->annotated = 1; \
2883 composition_state = COMPOSING_NO; \
2887 /* Decode a composition rule from the byte C1 (and maybe one more byte
2888 from SRC) and store one encoded composition rule in
2889 coding->cmp_data. */
/* DECODE_COMPOSITION_RULE overwrites C1, which must be an lvalue, with
   the value of COMPOSITION_ENCODE_RULE.  Values below 81 are the old
   one-byte format: gref = c1 / 9 and nref = c1 % 9, with a value of 4
   remapped to 10 in either.  Values 81..92 are the new format: one
   more byte C2 is read with ONE_MORE_BYTE and the rule is built from
   c1 - 81 and c2 - 32.
   NOTE(review): the handling of c1 >= 93 is not visible in this
   excerpt.  C1 appears unparenthesized in the comparisons, so pass a
   plain variable.  */
2891 #define DECODE_COMPOSITION_RULE(c1) \
2894 if (c1 < 81) /* old format (before ver.21) */ \
2896 int gref = (c1) / 9; \
2897 int nref = (c1) % 9; \
2898 if (gref == 4) gref = 10; \
2899 if (nref == 4) nref = 10; \
2900 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2902 else if (c1 < 93) /* new format (after ver.21) */ \
2904 ONE_MORE_BYTE (c2); \
2905 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2912 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2915 decode_coding_iso_2022 (coding
)
2916 struct coding_system
*coding
;
2918 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2919 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2920 const unsigned char *src_base
;
2921 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
2923 = coding
->charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2924 int consumed_chars
= 0, consumed_chars_base
;
2925 int multibytep
= coding
->src_multibyte
;
2926 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2927 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2928 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2929 int charset_id_2
, charset_id_3
;
2930 struct charset
*charset
;
2932 /* For handling composition sequence. */
2933 #define COMPOSING_NO 0
2934 #define COMPOSING_CHAR 1
2935 #define COMPOSING_RULE 2
2936 #define COMPOSING_COMPONENT_CHAR 3
2937 #define COMPOSING_COMPONENT_RULE 4
2939 int composition_state
= COMPOSING_NO
;
2940 enum composition_method method
;
2941 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2944 Lisp_Object attrs
, charset_list
;
2945 int char_offset
= coding
->produced_char
;
2946 int last_offset
= char_offset
;
2947 int last_id
= charset_ascii
;
2949 CODING_GET_INFO (coding
, attrs
, charset_list
);
2950 setup_iso_safe_charsets (attrs
);
2957 consumed_chars_base
= consumed_chars
;
2959 if (charbuf
>= charbuf_end
)
2966 /* We produce at most one character. */
2967 switch (iso_code_class
[c1
])
2969 case ISO_0x20_or_0x7F
:
2970 if (composition_state
!= COMPOSING_NO
)
2972 if (composition_state
== COMPOSING_RULE
2973 || composition_state
== COMPOSING_COMPONENT_RULE
)
2975 DECODE_COMPOSITION_RULE (c1
);
2976 components
[component_idx
++] = c1
;
2977 composition_state
--;
2981 if (charset_id_0
< 0
2982 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2983 /* This is SPACE or DEL. */
2984 charset
= CHARSET_FROM_ID (charset_ascii
);
2986 charset
= CHARSET_FROM_ID (charset_id_0
);
2989 case ISO_graphic_plane_0
:
2990 if (composition_state
!= COMPOSING_NO
)
2992 if (composition_state
== COMPOSING_RULE
2993 || composition_state
== COMPOSING_COMPONENT_RULE
)
2995 DECODE_COMPOSITION_RULE (c1
);
2996 components
[component_idx
++] = c1
;
2997 composition_state
--;
3001 if (charset_id_0
< 0)
3002 charset
= CHARSET_FROM_ID (charset_ascii
);
3004 charset
= CHARSET_FROM_ID (charset_id_0
);
3007 case ISO_0xA0_or_0xFF
:
3008 if (charset_id_1
< 0
3009 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
3010 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3012 /* This is a graphic character, we fall through ... */
3014 case ISO_graphic_plane_1
:
3015 if (charset_id_1
< 0)
3017 charset
= CHARSET_FROM_ID (charset_id_1
);
3021 MAYBE_FINISH_COMPOSITION ();
3022 charset
= CHARSET_FROM_ID (charset_ascii
);
3026 MAYBE_FINISH_COMPOSITION ();
3030 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3031 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
3033 CODING_ISO_INVOCATION (coding
, 0) = 1;
3034 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3038 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
3040 CODING_ISO_INVOCATION (coding
, 0) = 0;
3041 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3044 case ISO_single_shift_2_7
:
3045 case ISO_single_shift_2
:
3046 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3048 /* SS2 is handled as an escape sequence of ESC 'N' */
3050 goto label_escape_sequence
;
3052 case ISO_single_shift_3
:
3053 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3055 /* SS3 is handled as an escape sequence of ESC 'O' */
3057 goto label_escape_sequence
;
3059 case ISO_control_sequence_introducer
:
3060 /* CSI is handled as an escape sequence of ESC '[' ... */
3062 goto label_escape_sequence
;
3066 label_escape_sequence
:
3067 /* Escape sequences handled here are invocation,
3068 designation, direction specification, and character
3069 composition specification. */
3072 case '&': /* revision of following character set */
3074 if (!(c1
>= '@' && c1
<= '~'))
3077 if (c1
!= ISO_CODE_ESC
)
3080 goto label_escape_sequence
;
3082 case '$': /* designation of 2-byte character set */
3083 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3089 if (c1
>= '@' && c1
<= 'B')
3090 { /* designation of JISX0208.1978, GB2312.1980,
3092 reg
= 0, chars96
= 0;
3094 else if (c1
>= 0x28 && c1
<= 0x2B)
3095 { /* designation of DIMENSION2_CHARS94 character set */
3096 reg
= c1
- 0x28, chars96
= 0;
3099 else if (c1
>= 0x2C && c1
<= 0x2F)
3100 { /* designation of DIMENSION2_CHARS96 character set */
3101 reg
= c1
- 0x2C, chars96
= 1;
3106 DECODE_DESIGNATION (reg
, 2, chars96
, c1
);
3107 /* We must update these variables now. */
3109 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3111 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3117 case 'n': /* invocation of locking-shift-2 */
3118 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3119 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3121 CODING_ISO_INVOCATION (coding
, 0) = 2;
3122 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3125 case 'o': /* invocation of locking-shift-3 */
3126 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3127 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3129 CODING_ISO_INVOCATION (coding
, 0) = 3;
3130 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3133 case 'N': /* invocation of single-shift-2 */
3134 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3135 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3137 charset_id_2
= CODING_ISO_DESIGNATION (coding
, 2);
3138 if (charset_id_2
< 0)
3139 charset
= CHARSET_FROM_ID (charset_ascii
);
3141 charset
= CHARSET_FROM_ID (charset_id_2
);
3143 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3147 case 'O': /* invocation of single-shift-3 */
3148 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3149 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3151 charset_id_3
= CODING_ISO_DESIGNATION (coding
, 3);
3152 if (charset_id_3
< 0)
3153 charset
= CHARSET_FROM_ID (charset_ascii
);
3155 charset
= CHARSET_FROM_ID (charset_id_3
);
3157 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3161 case '0': case '2': case '3': case '4': /* start composition */
3162 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3164 DECODE_COMPOSITION_START (c1
);
3167 case '1': /* end composition */
3168 if (composition_state
== COMPOSING_NO
)
3170 DECODE_COMPOSITION_END ();
3173 case '[': /* specification of direction */
/* Test whether CODING_ISO_FLAG_DIRECTION is clear.  The inner
   parentheses are required: unary `!' binds tighter than binary `&',
   so without them the expression is (!flags) & FLAG, which is zero
   whenever any flag bit at all is set.  */
3174 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3176 /* For the moment, nested direction is not supported.
3177 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3178 left-to-right, and nonzero means right-to-left. */
3182 case ']': /* end of the current direction */
3183 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3185 case '0': /* end of the current direction */
3186 case '1': /* start of left-to-right direction */
3189 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3194 case '2': /* start of right-to-left direction */
3197 coding
->mode
|= CODING_MODE_DIRECTION
;
3211 /* CTEXT extended segment:
3212 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3213 We keep these bytes as is for the moment.
3214 They may be decoded by post-read-conversion. */
3218 ONE_MORE_BYTE (dim
);
3221 size
= ((M
- 128) * 128) + (L
- 128);
3222 if (charbuf
+ 8 + size
> charbuf_end
)
3224 *charbuf
++ = ISO_CODE_ESC
;
3228 *charbuf
++ = BYTE8_TO_CHAR (M
);
3229 *charbuf
++ = BYTE8_TO_CHAR (L
);
3233 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3238 /* XFree86 extension for embedding UTF-8 in CTEXT:
3239 ESC % G --UTF-8-BYTES-- ESC % @
3240 We keep these bytes as is for the moment.
3241 They may be decoded by post-read-conversion. */
3244 if (p
+ 6 > charbuf_end
)
3246 *p
++ = ISO_CODE_ESC
;
3249 while (p
< charbuf_end
)
3252 if (c1
== ISO_CODE_ESC
3253 && src
+ 1 < src_end
3260 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3262 if (p
+ 3 > charbuf_end
)
3264 *p
++ = ISO_CODE_ESC
;
3275 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3280 if (c1
>= 0x28 && c1
<= 0x2B)
3281 { /* designation of DIMENSION1_CHARS94 character set */
3282 reg
= c1
- 0x28, chars96
= 0;
3285 else if (c1
>= 0x2C && c1
<= 0x2F)
3286 { /* designation of DIMENSION1_CHARS96 character set */
3287 reg
= c1
- 0x2C, chars96
= 1;
3292 DECODE_DESIGNATION (reg
, 1, chars96
, c1
);
3293 /* We must update these variables now. */
3295 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3297 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3305 if (charset
->id
!= charset_ascii
3306 && last_id
!= charset
->id
)
3308 if (last_id
!= charset_ascii
)
3309 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3310 last_id
= charset
->id
;
3311 last_offset
= char_offset
;
3314 /* Now we know CHARSET and 1st position code C1 of a character.
3315 Produce a decoded character while getting 2nd position code C2 if necessary. */
3318 if (CHARSET_DIMENSION (charset
) > 1)
3321 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3322 /* C2 is not in a valid range. */
3324 c1
= (c1
<< 8) | (c2
& 0x7F);
3325 if (CHARSET_DIMENSION (charset
) > 2)
3328 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3329 /* C2 is not in a valid range. */
3331 c1
= (c1
<< 8) | (c2
& 0x7F);
3335 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3338 MAYBE_FINISH_COMPOSITION ();
3339 for (; src_base
< src
; src_base
++, char_offset
++)
3341 if (ASCII_BYTE_P (*src_base
))
3342 *charbuf
++ = *src_base
;
3344 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3347 else if (composition_state
== COMPOSING_NO
)
3354 components
[component_idx
++] = c
;
3355 if (method
== COMPOSITION_WITH_RULE
3356 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3357 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3358 composition_state
++;
3363 MAYBE_FINISH_COMPOSITION ();
3365 consumed_chars
= consumed_chars_base
;
3367 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3377 if (last_id
!= charset_ascii
)
3378 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3379 coding
->consumed_char
+= consumed_chars_base
;
3380 coding
->consumed
= src_base
- coding
->source
;
3381 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3385 /* ISO2022 encoding stuff. */
3388 /* It is not enough to say just "ISO2022" on encoding, we have to
3389 specify more details. In Emacs, each coding system of ISO2022
3390 variant has the following specifications:
3391 1. Initial designation to G0 thru G3.
3392 2. Allows short-form designation?
3393 3. ASCII should be designated to G0 before control characters?
3394 4. ASCII should be designated to G0 at end of line?
3395 5. 7-bit environment or 8-bit environment?
3396 6. Use locking-shift?
3397 7. Use single-shift?
3398 And the following two are only for Japanese:
3399 8. Use ASCII in place of JISX0201-1976-Roman?
3400 9. Use JISX0208-1983 in place of JISX0208-1978?
3401 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3402 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more details. */
3406 /* Produce codes (escape sequence) for designating CHARSET to graphic
3407 register REG at DST, and increment DST. If <final-char> of CHARSET is
3408 '@', 'A', or 'B' and the coding system CODING allows, produce
3409 designation sequence of short-form. */
/* Steps visible in ENCODE_DESIGNATION:
   - when CODING_ISO_FLAG_REVISION is set and CHARSET_ISO_REVISION is
     non-negative, emit ESC '&' followed by '@' + revision;
   - emit ESC;
   - for a dimension-1 charset, emit one intermediate byte selected by
     REG from "()*+" (not CHARS_96) or ",-./" (CHARS_96);
   - otherwise emit '$' followed by an intermediate byte; for a
     non-CHARS_96 set the "()*+" byte is emitted when
     CODING_ISO_FLAG_LONG_FORM is set or FINAL_CHAR lies outside
     '@'..'B';
   - emit the final byte and record CHARSET's id as the designation of
     REG in CODING.
   NOTE(review): braces, `else' lines and one line of the long-form
   condition are not visible in this excerpt.  */
3411 #define ENCODE_DESIGNATION(charset, reg, coding) \
3413 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3414 char *intermediate_char_94 = "()*+"; \
3415 char *intermediate_char_96 = ",-./"; \
3416 int revision = -1; \
3419 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3420 revision = CHARSET_ISO_REVISION (charset); \
3422 if (revision >= 0) \
3424 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3425 EMIT_ONE_BYTE ('@' + revision); \
3427 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3428 if (CHARSET_DIMENSION (charset) == 1) \
3430 if (! CHARSET_ISO_CHARS_96 (charset)) \
3431 c = intermediate_char_94[reg]; \
3433 c = intermediate_char_96[reg]; \
3434 EMIT_ONE_ASCII_BYTE (c); \
3438 EMIT_ONE_ASCII_BYTE ('$'); \
3439 if (! CHARSET_ISO_CHARS_96 (charset)) \
3441 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3443 || final_char < '@' || final_char > 'B') \
3444 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3447 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3449 EMIT_ONE_ASCII_BYTE (final_char); \
3451 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3455 /* The following two macros produce codes (control character or escape
3456 sequence) for ISO2022 single-shift functions (single-shift-2 and single-shift-3). */
/* ENCODE_SINGLE_SHIFT_2 emits single-shift-2 and marks CODING (a
   variable of the expansion site) as single-shifting.  With
   CODING_ISO_FLAG_SEVEN_BITS set the two-byte form ESC 'N' is emitted;
   the other visible emission is the single byte ISO_CODE_SS2.
   NOTE(review): the line joining the two emissions (presumably `else')
   is not visible in this excerpt.  */
3459 #define ENCODE_SINGLE_SHIFT_2 \
3461 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3462 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3464 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3465 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* ENCODE_SINGLE_SHIFT_3 emits single-shift-3 and marks CODING (a
   variable of the expansion site) as single-shifting.  With
   CODING_ISO_FLAG_SEVEN_BITS set the two-byte form ESC 'O' is emitted;
   the other visible emission is the single byte ISO_CODE_SS3.
   NOTE(review): the line joining the two emissions (presumably `else')
   is not visible in this excerpt.  */
3469 #define ENCODE_SINGLE_SHIFT_3 \
3471 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3472 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3474 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3475 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3479 /* The following four macros produce codes (control character or
3480 escape sequence) for ISO2022 locking-shift functions (shift-in,
3481 shift-out, locking-shift-2, and locking-shift-3). */
/* ENCODE_SHIFT_IN emits the byte ISO_CODE_SI and records 0 as the
   register invoked to graphic plane 0 of CODING (a variable of the
   expansion site).  */
3483 #define ENCODE_SHIFT_IN \
3485 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3486 CODING_ISO_INVOCATION (coding, 0) = 0; \
/* ENCODE_SHIFT_OUT emits the byte ISO_CODE_SO and records 1 as the
   register invoked to graphic plane 0 of CODING (a variable of the
   expansion site).  */
3490 #define ENCODE_SHIFT_OUT \
3492 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3493 CODING_ISO_INVOCATION (coding, 0) = 1; \
/* ENCODE_LOCKING_SHIFT_2 emits ESC 'n' (locking-shift-2) and records 2
   as the register invoked to graphic plane 0 of CODING (a variable of
   the expansion site).  */
3497 #define ENCODE_LOCKING_SHIFT_2 \
3499 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3500 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit the locking-shift-3 sequence and record 3 as the register
   invoked to graphic plane 0 of CODING.  `coding' is taken from the
   expansion site.

   The final byte must be 'o': ESC 'n' is locking-shift-2, so emitting
   it here would make a decoder invoke G2 while the encoder's own state
   says G3.  The body is wrapped in do { } while (0) so the macro
   behaves as a single statement.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
3511 /* Produce codes for a DIMENSION1 character whose character set is
3512 CHARSET and whose position-code is C1. Designation and invocation
3513 sequences are also produced in advance if necessary. */
/* Steps visible in ENCODE_ISO_CHARACTER_DIMENSION1:
   - with CODING_ISO_FLAG_USE_ROMAN set, charset_ascii is replaced by
     charset_jisx0201_roman (CHARSET itself is reassigned, so it must
     be an lvalue);
   - while CODING is single-shifting, C1 is emitted with the high bit
     cleared (CODING_ISO_FLAG_SEVEN_BITS set) or set, and the
     single-shifting state is cleared;
   - when the charset is the one invoked to plane 0, C1 is emitted with
     the high bit cleared; when invoked to plane 1, with it set;
   - otherwise encode_invocation_designation is called and its result
     stored in DST.
   `coding' and `dst' are variables of the expansion site.
   NOTE(review): C1 is used unparenthesized (c1 & 0x7F), unlike
   ENCODE_ISO_CHARACTER_DIMENSION2, so pass only a simple expression.
   Braces, `else'/`break' lines and the end of the final call are not
   visible in this excerpt.  */
3515 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3517 int id = CHARSET_ID (charset); \
3519 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3520 && id == charset_ascii) \
3522 id = charset_jisx0201_roman; \
3523 charset = CHARSET_FROM_ID (id); \
3526 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3528 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3531 EMIT_ONE_BYTE (c1 | 0x80); \
3532 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3535 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3537 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3540 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3542 EMIT_ONE_BYTE (c1 | 0x80); \
3546 /* Since CHARSET is not yet invoked to any graphic planes, we \
3547 must invoke it, or, at first, designate it to some graphic \
3548 register. Then repeat the loop to actually produce the \
3550 dst = encode_invocation_designation (charset, coding, dst, \
3555 /* Produce codes for a DIMENSION2 character whose character set is
3556 CHARSET and whose position-codes are C1 and C2. Designation and
3557 invocation codes are also produced in advance if necessary. */
3559 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3561 int id = CHARSET_ID (charset); \
3563 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3564 && id == charset_jisx0208) \
3566 id = charset_jisx0208_1978; \
3567 charset = CHARSET_FROM_ID (id); \
3570 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3572 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3573 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3575 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3576 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3579 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3581 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3584 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3586 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3590 /* Since CHARSET is not yet invoked to any graphic planes, we \
3591 must invoke it, or, at first, designate it to some graphic \
3592 register. Then repeat the loop to actually produce the \
3594 dst = encode_invocation_designation (charset, coding, dst, \
/* Produce codes for character C of CHARSET.  The code point of C in
   CHARSET is obtained with ENCODE_CHAR.  For a charset of dimension 1
   the code is handed to ENCODE_ISO_CHARACTER_DIMENSION1 as the single
   position code; for any other dimension the code is split into its
   high part (code >> 8) and low byte (code & 0xFF) and handed to
   ENCODE_ISO_CHARACTER_DIMENSION2.  The temporary `code' lives in its
   own block so the macro can be expanded more than once per scope.  */
#define ENCODE_ISO_CHARACTER(charset, c) \
  do { \
    int code = ENCODE_CHAR ((charset),(c)); \
 \
    if (CHARSET_DIMENSION (charset) == 1) \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
    else \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
  } while (0)
3610 /* Produce designation and invocation codes at a place pointed by DST
3611 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
Return the updated DST; *P_NCHARS is set to the updated count of produced characters. */
3615 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3616 struct charset
*charset
;
3617 struct coding_system
*coding
;
3621 int multibytep
= coding
->dst_multibyte
;
3622 int produced_chars
= *p_nchars
;
3623 int reg
; /* graphic register number */
3624 int id
= CHARSET_ID (charset
);
3626 /* At first, check designations. */
3627 for (reg
= 0; reg
< 4; reg
++)
3628 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3633 /* CHARSET is not yet designated to any graphic registers. */
3634 /* At first check the requested designation. */
3635 reg
= CODING_ISO_REQUEST (coding
, id
);
3637 /* Since CHARSET requests no special designation, designate it
3638 to graphic register 0. */
3641 ENCODE_DESIGNATION (charset
, reg
, coding
);
3644 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3645 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3647 /* Since the graphic register REG is not invoked to any graphic
3648 planes, invoke it to graphic plane 0. */
3651 case 0: /* graphic register 0 */
3655 case 1: /* graphic register 1 */
3659 case 2: /* graphic register 2 */
3660 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3661 ENCODE_SINGLE_SHIFT_2
;
3663 ENCODE_LOCKING_SHIFT_2
;
3666 case 3: /* graphic register 3 */
3667 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3668 ENCODE_SINGLE_SHIFT_3
;
3670 ENCODE_LOCKING_SHIFT_3
;
3675 *p_nchars
= produced_chars
;
3679 /* The following three macros produce codes for indicating text direction. */
/* Produce the Control Sequence Introducer: the escape sequence
   ESC '[' when the coding system is restricted to seven bits,
   otherwise the single byte ISO_CODE_CSI.  The seven-bit property is
   tested with `&', as everywhere else in this file; comparing the
   whole flag word with `==' would select the 8-bit byte as soon as
   any other flag is also set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
    else \
      EMIT_ONE_BYTE (ISO_CODE_CSI); \
  } while (0)

/* Produce the control sequence that starts right-to-left text:
   CSI followed by the two ASCII bytes '2' ']'.  The introducer macro
   is object-like, so it is expanded without an argument list.  */
#define ENCODE_DIRECTION_R2L() \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    EMIT_TWO_ASCII_BYTES ('2', ']'); \
  } while (0)

/* Produce the control sequence that returns to left-to-right text:
   CSI followed by the two ASCII bytes '0' ']'.  */
#define ENCODE_DIRECTION_L2R() \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    EMIT_TWO_ASCII_BYTES ('0', ']'); \
  } while (0)
3704 /* Produce codes for designation and invocation to reset the graphic
3705 planes and registers to initial state. */
3706 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3709 struct charset *charset; \
3711 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3713 for (reg = 0; reg < 4; reg++) \
3714 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3715 && (CODING_ISO_DESIGNATION (coding, reg) \
3716 != CODING_ISO_INITIAL (coding, reg))) \
3718 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3719 ENCODE_DESIGNATION (charset, reg, coding); \
3724 /* Produce designation sequences of charsets in the line started from
3725 SRC to a place pointed by DST, and return updated DST.
3727 If the current block ends before any end-of-line, we may fail to
3728 find all the necessary designations. */
3730 static unsigned char *
3731 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3732 struct coding_system
*coding
;
3733 int *charbuf
, *charbuf_end
;
3736 struct charset
*charset
;
3737 /* Table of charsets to be designated to each graphic register. */
3739 int c
, found
= 0, reg
;
3740 int produced_chars
= 0;
3741 int multibytep
= coding
->dst_multibyte
;
3743 Lisp_Object charset_list
;
3745 attrs
= CODING_ID_ATTRS (coding
->id
);
3746 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3747 if (EQ (charset_list
, Qiso_2022
))
3748 charset_list
= Viso_2022_charset_list
;
3750 for (reg
= 0; reg
< 4; reg
++)
3760 charset
= char_charset (c
, charset_list
, NULL
);
3761 id
= CHARSET_ID (charset
);
3762 reg
= CODING_ISO_REQUEST (coding
, id
);
3763 if (reg
>= 0 && r
[reg
] < 0)
3772 for (reg
= 0; reg
< 4; reg
++)
3774 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3775 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3781 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3784 encode_coding_iso_2022 (coding
)
3785 struct coding_system
*coding
;
3787 int multibytep
= coding
->dst_multibyte
;
3788 int *charbuf
= coding
->charbuf
;
3789 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3790 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3791 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3794 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3795 && CODING_ISO_BOL (coding
));
3796 int produced_chars
= 0;
3797 Lisp_Object attrs
, eol_type
, charset_list
;
3798 int ascii_compatible
;
3800 int preferred_charset_id
= -1;
3802 CODING_GET_INFO (coding
, attrs
, charset_list
);
3803 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
3804 if (VECTORP (eol_type
))
3807 setup_iso_safe_charsets (attrs
);
3808 /* Charset list may have been changed. */
3809 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3810 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3812 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3814 while (charbuf
< charbuf_end
)
3816 ASSURE_DESTINATION (safe_room
);
3818 if (bol_designation
)
3820 unsigned char *dst_prev
= dst
;
3822 /* We have to produce designation sequences if any now. */
3823 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3824 bol_designation
= 0;
3825 /* We are sure that designation sequences are all ASCII bytes. */
3826 produced_chars
+= dst
- dst_prev
;
3833 /* Handle an annotation. */
3836 case CODING_ANNOTATE_COMPOSITION_MASK
:
3837 /* Not yet implemented. */
3839 case CODING_ANNOTATE_CHARSET_MASK
:
3840 preferred_charset_id
= charbuf
[2];
3841 if (preferred_charset_id
>= 0
3842 && NILP (Fmemq (make_number (preferred_charset_id
),
3844 preferred_charset_id
= -1;
3853 /* Now encode the character C. */
3854 if (c
< 0x20 || c
== 0x7F)
3857 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3859 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3860 ENCODE_RESET_PLANE_AND_REGISTER ();
3861 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3865 for (i
= 0; i
< 4; i
++)
3866 CODING_ISO_DESIGNATION (coding
, i
)
3867 = CODING_ISO_INITIAL (coding
, i
);
3870 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3872 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3873 ENCODE_RESET_PLANE_AND_REGISTER ();
3874 EMIT_ONE_ASCII_BYTE (c
);
3876 else if (ASCII_CHAR_P (c
))
3878 if (ascii_compatible
)
3879 EMIT_ONE_ASCII_BYTE (c
);
3882 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3883 ENCODE_ISO_CHARACTER (charset
, c
);
3886 else if (CHAR_BYTE8_P (c
))
3888 c
= CHAR_TO_BYTE8 (c
);
3893 struct charset
*charset
;
3895 if (preferred_charset_id
>= 0)
3897 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3898 if (! CHAR_CHARSET_P (c
, charset
))
3899 charset
= char_charset (c
, charset_list
, NULL
);
3902 charset
= char_charset (c
, charset_list
, NULL
);
3905 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3907 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3908 charset
= CHARSET_FROM_ID (charset_ascii
);
3912 c
= coding
->default_char
;
3913 charset
= char_charset (c
, charset_list
, NULL
);
3916 ENCODE_ISO_CHARACTER (charset
, c
);
3920 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3921 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3923 ASSURE_DESTINATION (safe_room
);
3924 ENCODE_RESET_PLANE_AND_REGISTER ();
3926 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
3927 CODING_ISO_BOL (coding
) = bol_designation
;
3928 coding
->produced_char
+= produced_chars
;
3929 coding
->produced
= dst
- coding
->destination
;
3934 /*** 8,9. SJIS and BIG5 handlers ***/
3936 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3937 quite widely. So, for the moment, Emacs supports them in the bare
3938 C code. But, in the future, they may be supported only by CCL. */
3940 /* SJIS is a coding system encoding three character sets: ASCII, right
3941 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3942 as is. A character of charset katakana-jisx0201 is encoded by
3943 "position-code + 0x80". A character of charset japanese-jisx0208
3944 is encoded in two bytes, with its two position-codes divided and
3945 shifted so that they fit in the ranges below.
3947 --- CODE RANGE of SJIS ---
3948 (character set) (range)
3950 KATAKANA-JISX0201 0xA0 .. 0xDF
3951 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3952 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3953 -------------------------------
3957 /* BIG5 is a coding system encoding two character sets: ASCII and
3958 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3959 character set and is encoded in two bytes.
3961 --- CODE RANGE of BIG5 ---
3962 (character set) (range)
3964 Big5 (1st byte) 0xA1 .. 0xFE
3965 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3966 --------------------------
3970 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3971 Check if a text is encoded in SJIS. If it is, return
3972 CATEGORY_MASK_SJIS, else return 0. */
3975 detect_coding_sjis (coding
, detect_info
)
3976 struct coding_system
*coding
;
3977 struct coding_detection_info
*detect_info
;
3979 const unsigned char *src
= coding
->source
, *src_base
;
3980 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3981 int multibytep
= coding
->src_multibyte
;
3982 int consumed_chars
= 0;
3986 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3987 /* A coding system of this category is always ASCII compatible. */
3988 src
+= coding
->head_ascii
;
3996 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3999 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
4001 found
= CATEGORY_MASK_SJIS
;
4003 else if (c
>= 0xA0 && c
< 0xE0)
4004 found
= CATEGORY_MASK_SJIS
;
4008 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4012 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4014 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4017 detect_info
->found
|= found
;
4021 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4022 Check if a text is encoded in BIG5. If it is, return
4023 CATEGORY_MASK_BIG5, else return 0. */
4026 detect_coding_big5 (coding
, detect_info
)
4027 struct coding_system
*coding
;
4028 struct coding_detection_info
*detect_info
;
4030 const unsigned char *src
= coding
->source
, *src_base
;
4031 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4032 int multibytep
= coding
->src_multibyte
;
4033 int consumed_chars
= 0;
4037 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
4038 /* A coding system of this category is always ASCII compatible. */
4039 src
+= coding
->head_ascii
;
4050 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
4052 found
= CATEGORY_MASK_BIG5
;
4057 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4061 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4063 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4066 detect_info
->found
|= found
;
4070 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4071 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
4074 decode_coding_sjis (coding
)
4075 struct coding_system
*coding
;
4077 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4078 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4079 const unsigned char *src_base
;
4080 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4082 = coding
->charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4083 int consumed_chars
= 0, consumed_chars_base
;
4084 int multibytep
= coding
->src_multibyte
;
4085 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4086 struct charset
*charset_kanji2
;
4087 Lisp_Object attrs
, charset_list
, val
;
4088 int char_offset
= coding
->produced_char
;
4089 int last_offset
= char_offset
;
4090 int last_id
= charset_ascii
;
4092 CODING_GET_INFO (coding
, attrs
, charset_list
);
4095 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4096 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4097 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4098 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4103 struct charset
*charset
;
4106 consumed_chars_base
= consumed_chars
;
4108 if (charbuf
>= charbuf_end
)
4115 charset
= charset_roman
;
4116 else if (c
== 0x80 || c
== 0xA0)
4118 else if (c
>= 0xA1 && c
<= 0xDF)
4120 /* SJIS -> JISX0201-Kana */
4122 charset
= charset_kana
;
4126 /* SJIS -> JISX0208 */
4128 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4132 charset
= charset_kanji
;
4134 else if (c
<= 0xFC && charset_kanji2
)
4136 /* SJIS -> JISX0213-2 */
4138 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4142 charset
= charset_kanji2
;
4146 if (charset
->id
!= charset_ascii
4147 && last_id
!= charset
->id
)
4149 if (last_id
!= charset_ascii
)
4150 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4151 last_id
= charset
->id
;
4152 last_offset
= char_offset
;
4154 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4161 consumed_chars
= consumed_chars_base
;
4163 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4169 if (last_id
!= charset_ascii
)
4170 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4171 coding
->consumed_char
+= consumed_chars_base
;
4172 coding
->consumed
= src_base
- coding
->source
;
4173 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4177 decode_coding_big5 (coding
)
4178 struct coding_system
*coding
;
4180 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4181 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4182 const unsigned char *src_base
;
4183 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4185 = coding
->charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4186 int consumed_chars
= 0, consumed_chars_base
;
4187 int multibytep
= coding
->src_multibyte
;
4188 struct charset
*charset_roman
, *charset_big5
;
4189 Lisp_Object attrs
, charset_list
, val
;
4190 int char_offset
= coding
->produced_char
;
4191 int last_offset
= char_offset
;
4192 int last_id
= charset_ascii
;
4194 CODING_GET_INFO (coding
, attrs
, charset_list
);
4196 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4197 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4202 struct charset
*charset
;
4205 consumed_chars_base
= consumed_chars
;
4207 if (charbuf
>= charbuf_end
)
4215 charset
= charset_roman
;
4219 if (c
< 0xA1 || c
> 0xFE)
4222 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4225 charset
= charset_big5
;
4227 if (charset
->id
!= charset_ascii
4228 && last_id
!= charset
->id
)
4230 if (last_id
!= charset_ascii
)
4231 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4232 last_id
= charset
->id
;
4233 last_offset
= char_offset
;
4235 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4242 consumed_chars
= consumed_chars_base
;
4244 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4250 if (last_id
!= charset_ascii
)
4251 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4252 coding
->consumed_char
+= consumed_chars_base
;
4253 coding
->consumed
= src_base
- coding
->source
;
4254 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4257 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4258 This function can encode charsets `ascii', `katakana-jisx0201',
4259 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4260 are sure that all these charsets are registered as official charset
4261 (i.e. do not have extended leading-codes). Characters of other
4262 charsets are produced without any encoding. If SJIS_P is 1, encode
4263 SJIS text, else encode BIG5 text. */
4266 encode_coding_sjis (coding
)
4267 struct coding_system
*coding
;
4269 int multibytep
= coding
->dst_multibyte
;
4270 int *charbuf
= coding
->charbuf
;
4271 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4272 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4273 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4275 int produced_chars
= 0;
4276 Lisp_Object attrs
, charset_list
, val
;
4277 int ascii_compatible
;
4278 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4279 struct charset
*charset_kanji2
;
4282 CODING_GET_INFO (coding
, attrs
, charset_list
);
4284 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4285 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4286 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4287 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4289 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4291 while (charbuf
< charbuf_end
)
4293 ASSURE_DESTINATION (safe_room
);
4295 /* Now encode the character C. */
4296 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4297 EMIT_ONE_ASCII_BYTE (c
);
4298 else if (CHAR_BYTE8_P (c
))
4300 c
= CHAR_TO_BYTE8 (c
);
4306 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4310 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4312 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4313 charset
= CHARSET_FROM_ID (charset_ascii
);
4317 c
= coding
->default_char
;
4318 charset
= char_charset (c
, charset_list
, &code
);
4321 if (code
== CHARSET_INVALID_CODE (charset
))
4323 if (charset
== charset_kanji
)
4327 c1
= code
>> 8, c2
= code
& 0xFF;
4328 EMIT_TWO_BYTES (c1
, c2
);
4330 else if (charset
== charset_kana
)
4331 EMIT_ONE_BYTE (code
| 0x80);
4332 else if (charset_kanji2
&& charset
== charset_kanji2
)
4337 if (c1
== 0x21 || (c1
>= 0x23 && c1
< 0x25)
4338 || (c1
>= 0x2C && c1
<= 0x2F) || c1
>= 0x6E)
4340 JIS_TO_SJIS2 (code
);
4341 c1
= code
>> 8, c2
= code
& 0xFF;
4342 EMIT_TWO_BYTES (c1
, c2
);
4345 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4348 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4351 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4352 coding
->produced_char
+= produced_chars
;
4353 coding
->produced
= dst
- coding
->destination
;
4358 encode_coding_big5 (coding
)
4359 struct coding_system
*coding
;
4361 int multibytep
= coding
->dst_multibyte
;
4362 int *charbuf
= coding
->charbuf
;
4363 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4364 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4365 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4367 int produced_chars
= 0;
4368 Lisp_Object attrs
, charset_list
, val
;
4369 int ascii_compatible
;
4370 struct charset
*charset_roman
, *charset_big5
;
4373 CODING_GET_INFO (coding
, attrs
, charset_list
);
4375 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4376 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4377 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4379 while (charbuf
< charbuf_end
)
4381 ASSURE_DESTINATION (safe_room
);
4383 /* Now encode the character C. */
4384 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4385 EMIT_ONE_ASCII_BYTE (c
);
4386 else if (CHAR_BYTE8_P (c
))
4388 c
= CHAR_TO_BYTE8 (c
);
4394 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4398 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4400 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4401 charset
= CHARSET_FROM_ID (charset_ascii
);
4405 c
= coding
->default_char
;
4406 charset
= char_charset (c
, charset_list
, &code
);
4409 if (code
== CHARSET_INVALID_CODE (charset
))
4411 if (charset
== charset_big5
)
4415 c1
= code
>> 8, c2
= code
& 0xFF;
4416 EMIT_TWO_BYTES (c1
, c2
);
4419 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4422 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4423 coding
->produced_char
+= produced_chars
;
4424 coding
->produced
= dst
- coding
->destination
;
4429 /*** 10. CCL handlers ***/
4431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4432 Check if a text is encoded in a coding system of which
4433 encoder/decoder are written in CCL program. If it is, return
4434 CATEGORY_MASK_CCL, else return 0. */
4437 detect_coding_ccl (coding
, detect_info
)
4438 struct coding_system
*coding
;
4439 struct coding_detection_info
*detect_info
;
4441 const unsigned char *src
= coding
->source
, *src_base
;
4442 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4443 int multibytep
= coding
->src_multibyte
;
4444 int consumed_chars
= 0;
4446 unsigned char *valids
;
4447 int head_ascii
= coding
->head_ascii
;
4450 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4452 coding
= &coding_categories
[coding_category_ccl
];
4453 valids
= CODING_CCL_VALIDS (coding
);
4454 attrs
= CODING_ID_ATTRS (coding
->id
);
4455 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4464 if (c
< 0 || ! valids
[c
])
4466 if ((valids
[c
] > 1))
4467 found
= CATEGORY_MASK_CCL
;
4469 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4473 detect_info
->found
|= found
;
4478 decode_coding_ccl (coding
)
4479 struct coding_system
*coding
;
4481 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4482 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4483 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4484 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
4485 int consumed_chars
= 0;
4486 int multibytep
= coding
->src_multibyte
;
4487 struct ccl_program ccl
;
4488 int source_charbuf
[1024];
4489 int source_byteidx
[1024];
4490 Lisp_Object attrs
, charset_list
;
4492 CODING_GET_INFO (coding
, attrs
, charset_list
);
4493 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4495 while (src
< src_end
)
4497 const unsigned char *p
= src
;
4498 int *source
, *source_end
;
4502 while (i
< 1024 && p
< src_end
)
4504 source_byteidx
[i
] = p
- src
;
4505 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4508 while (i
< 1024 && p
< src_end
)
4509 source_charbuf
[i
++] = *p
++;
4511 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4514 source
= source_charbuf
;
4515 source_end
= source
+ i
;
4516 while (source
< source_end
)
4518 ccl_driver (&ccl
, source
, charbuf
,
4519 source_end
- source
, charbuf_end
- charbuf
,
4521 source
+= ccl
.consumed
;
4522 charbuf
+= ccl
.produced
;
4523 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4526 if (source
< source_end
)
4527 src
+= source_byteidx
[source
- source_charbuf
];
4530 consumed_chars
+= source
- source_charbuf
;
4532 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4533 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4539 case CCL_STAT_SUSPEND_BY_SRC
:
4540 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4542 case CCL_STAT_SUSPEND_BY_DST
:
4545 case CCL_STAT_INVALID_CMD
:
4546 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4549 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4552 coding
->consumed_char
+= consumed_chars
;
4553 coding
->consumed
= src
- coding
->source
;
4554 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4558 encode_coding_ccl (coding
)
4559 struct coding_system
*coding
;
4561 struct ccl_program ccl
;
4562 int multibytep
= coding
->dst_multibyte
;
4563 int *charbuf
= coding
->charbuf
;
4564 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4565 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4566 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4567 int destination_charbuf
[1024];
4568 int i
, produced_chars
= 0;
4569 Lisp_Object attrs
, charset_list
;
4571 CODING_GET_INFO (coding
, attrs
, charset_list
);
4572 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4574 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4575 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4577 while (charbuf
< charbuf_end
)
4579 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4580 charbuf_end
- charbuf
, 1024, charset_list
);
4583 ASSURE_DESTINATION (ccl
.produced
* 2);
4584 for (i
= 0; i
< ccl
.produced
; i
++)
4585 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4589 ASSURE_DESTINATION (ccl
.produced
);
4590 for (i
= 0; i
< ccl
.produced
; i
++)
4591 *dst
++ = destination_charbuf
[i
] & 0xFF;
4592 produced_chars
+= ccl
.produced
;
4594 charbuf
+= ccl
.consumed
;
4595 if (ccl
.status
== CCL_STAT_QUIT
4596 || ccl
.status
== CCL_STAT_INVALID_CMD
)
4602 case CCL_STAT_SUSPEND_BY_SRC
:
4603 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4605 case CCL_STAT_SUSPEND_BY_DST
:
4606 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
4609 case CCL_STAT_INVALID_CMD
:
4610 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4613 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4617 coding
->produced_char
+= produced_chars
;
4618 coding
->produced
= dst
- coding
->destination
;
4624 /*** 10, 11. no-conversion handlers ***/
4626 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decoder entry for raw text.  In the visible body no bytes are
   examined or copied: it only sets the `chars_at_source' flag of
   CODING, zeroes both consumed counters, and records
   CODING_RESULT_SUCCESS.
   NOTE(review): presumably `chars_at_source' tells the caller to take
   the characters directly from the source -- confirm against the code
   that reads this flag.  */
4629 decode_coding_raw_text (coding
)
4630 struct coding_system
*coding
;
4632 coding
->chars_at_source
= 1;
/* Nothing has been consumed by this function itself.  */
4633 coding
->consumed_char
= 0;
4634 coding
->consumed
= 0;
4635 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4639 encode_coding_raw_text (coding
)
4640 struct coding_system
*coding
;
4642 int multibytep
= coding
->dst_multibyte
;
4643 int *charbuf
= coding
->charbuf
;
4644 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4645 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4646 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4647 int produced_chars
= 0;
4652 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4654 if (coding
->src_multibyte
)
4655 while (charbuf
< charbuf_end
)
4657 ASSURE_DESTINATION (safe_room
);
4659 if (ASCII_CHAR_P (c
))
4660 EMIT_ONE_ASCII_BYTE (c
);
4661 else if (CHAR_BYTE8_P (c
))
4663 c
= CHAR_TO_BYTE8 (c
);
4668 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4670 CHAR_STRING_ADVANCE (c
, p1
);
4673 EMIT_ONE_BYTE (*p0
);
4679 while (charbuf
< charbuf_end
)
4681 ASSURE_DESTINATION (safe_room
);
4688 if (coding
->src_multibyte
)
4690 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4692 while (charbuf
< charbuf_end
)
4694 ASSURE_DESTINATION (safe_room
);
4696 if (ASCII_CHAR_P (c
))
4698 else if (CHAR_BYTE8_P (c
))
4699 *dst
++ = CHAR_TO_BYTE8 (c
);
4701 CHAR_STRING_ADVANCE (c
, dst
);
4707 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4708 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4709 *dst
++ = *charbuf
++;
4710 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4713 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4714 coding
->produced_char
+= produced_chars
;
4715 coding
->produced
= dst
- coding
->destination
;
4719 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4720 Check if a text is encoded in a charset-based coding system. If it
4721 is, return 1, else return 0. */
4724 detect_coding_charset (coding
, detect_info
)
4725 struct coding_system
*coding
;
4726 struct coding_detection_info
*detect_info
;
4728 const unsigned char *src
= coding
->source
, *src_base
;
4729 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4730 int multibytep
= coding
->src_multibyte
;
4731 int consumed_chars
= 0;
4732 Lisp_Object attrs
, valids
;
4734 int head_ascii
= coding
->head_ascii
;
4736 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4738 coding
= &coding_categories
[coding_category_charset
];
4739 attrs
= CODING_ID_ATTRS (coding
->id
);
4740 valids
= AREF (attrs
, coding_attr_charset_valids
);
4742 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4749 struct charset
*charset
;
4756 val
= AREF (valids
, c
);
4760 found
= CATEGORY_MASK_CHARSET
;
4763 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4764 dim
= CHARSET_DIMENSION (charset
);
4765 for (idx
= 1; idx
< dim
; idx
++)
4770 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 2]
4771 || c
> charset
->code_space
[(dim
- 1 - idx
) * 2 + 1])
4780 for (; CONSP (val
); val
= XCDR (val
))
4782 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4783 dim
= CHARSET_DIMENSION (charset
);
4789 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
4790 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
4805 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4809 detect_info
->found
|= found
;
4814 decode_coding_charset (coding
)
4815 struct coding_system
*coding
;
4817 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4818 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4819 const unsigned char *src_base
;
4820 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4822 = coding
->charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4823 int consumed_chars
= 0, consumed_chars_base
;
4824 int multibytep
= coding
->src_multibyte
;
4825 Lisp_Object attrs
, charset_list
, valids
;
4826 int char_offset
= coding
->produced_char
;
4827 int last_offset
= char_offset
;
4828 int last_id
= charset_ascii
;
4830 CODING_GET_INFO (coding
, attrs
, charset_list
);
4831 valids
= AREF (attrs
, coding_attr_charset_valids
);
4837 struct charset
*charset
;
4843 consumed_chars_base
= consumed_chars
;
4845 if (charbuf
>= charbuf_end
)
4853 val
= AREF (valids
, c
);
4858 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4859 dim
= CHARSET_DIMENSION (charset
);
4863 code
= (code
<< 8) | c
;
4866 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4871 /* VAL is a list of charset IDs. It is assured that the
4872 list is sorted by charset dimensions (smaller one
4876 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4877 dim
= CHARSET_DIMENSION (charset
);
4881 code
= (code
<< 8) | c
;
4884 CODING_DECODE_CHAR (coding
, src
, src_base
,
4885 src_end
, charset
, code
, c
);
4893 if (charset
->id
!= charset_ascii
4894 && last_id
!= charset
->id
)
4896 if (last_id
!= charset_ascii
)
4897 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4898 last_id
= charset
->id
;
4899 last_offset
= char_offset
;
4908 consumed_chars
= consumed_chars_base
;
4910 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4916 if (last_id
!= charset_ascii
)
4917 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4918 coding
->consumed_char
+= consumed_chars_base
;
4919 coding
->consumed
= src_base
- coding
->source
;
4920 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4924 encode_coding_charset (coding
)
4925 struct coding_system
*coding
;
4927 int multibytep
= coding
->dst_multibyte
;
4928 int *charbuf
= coding
->charbuf
;
4929 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4930 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4931 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4932 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4933 int produced_chars
= 0;
4934 Lisp_Object attrs
, charset_list
;
4935 int ascii_compatible
;
4938 CODING_GET_INFO (coding
, attrs
, charset_list
);
4939 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4941 while (charbuf
< charbuf_end
)
4943 struct charset
*charset
;
4946 ASSURE_DESTINATION (safe_room
);
4948 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4949 EMIT_ONE_ASCII_BYTE (c
);
4950 else if (CHAR_BYTE8_P (c
))
4952 c
= CHAR_TO_BYTE8 (c
);
4957 charset
= char_charset (c
, charset_list
, &code
);
4960 if (CHARSET_DIMENSION (charset
) == 1)
4961 EMIT_ONE_BYTE (code
);
4962 else if (CHARSET_DIMENSION (charset
) == 2)
4963 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4964 else if (CHARSET_DIMENSION (charset
) == 3)
4965 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4967 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4968 (code
>> 8) & 0xFF, code
& 0xFF);
4972 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4973 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4975 c
= coding
->default_char
;
4981 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4982 coding
->produced_char
+= produced_chars
;
4983 coding
->produced
= dst
- coding
->destination
;
4988 /*** 7. C library functions ***/
4990 /* Setup coding context CODING from information about CODING_SYSTEM.
4991 If CODING_SYSTEM is nil, `undecided' is assumed. If
4992 CODING_SYSTEM is invalid, signal an error. */
4995 setup_coding_system (coding_system
, coding
)
4996 Lisp_Object coding_system
;
4997 struct coding_system
*coding
;
5000 Lisp_Object eol_type
;
5001 Lisp_Object coding_type
;
5004 if (NILP (coding_system
))
5005 coding_system
= Qundecided
;
5007 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
5009 attrs
= CODING_ID_ATTRS (coding
->id
);
5010 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5013 coding
->head_ascii
= -1;
5014 if (VECTORP (eol_type
))
5015 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5016 | CODING_REQUIRE_DETECTION_MASK
);
5017 else if (! EQ (eol_type
, Qunix
))
5018 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5019 | CODING_REQUIRE_ENCODING_MASK
);
5021 coding
->common_flags
= 0;
5022 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5023 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5024 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5025 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5026 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
5027 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
5029 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5030 coding
->max_charset_id
= SCHARS (val
) - 1;
5031 coding
->safe_charsets
= (char *) SDATA (val
);
5032 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
5034 coding_type
= CODING_ATTR_TYPE (attrs
);
5035 if (EQ (coding_type
, Qundecided
))
5037 coding
->detector
= NULL
;
5038 coding
->decoder
= decode_coding_raw_text
;
5039 coding
->encoder
= encode_coding_raw_text
;
5040 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5042 else if (EQ (coding_type
, Qiso_2022
))
5045 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5047 /* Invoke graphic register 0 to plane 0. */
5048 CODING_ISO_INVOCATION (coding
, 0) = 0;
5049 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5050 CODING_ISO_INVOCATION (coding
, 1)
5051 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
5052 /* Setup the initial status of designation. */
5053 for (i
= 0; i
< 4; i
++)
5054 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
5055 /* Not single shifting initially. */
5056 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
5057 /* Beginning of buffer should also be regarded as bol. */
5058 CODING_ISO_BOL (coding
) = 1;
5059 coding
->detector
= detect_coding_iso_2022
;
5060 coding
->decoder
= decode_coding_iso_2022
;
5061 coding
->encoder
= encode_coding_iso_2022
;
5062 if (flags
& CODING_ISO_FLAG_SAFE
)
5063 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
5064 coding
->common_flags
5065 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5066 | CODING_REQUIRE_FLUSHING_MASK
);
5067 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
5068 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
5069 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
5070 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
5071 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5073 setup_iso_safe_charsets (attrs
);
5074 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5075 coding
->max_charset_id
= SCHARS (val
) - 1;
5076 coding
->safe_charsets
= (char *) SDATA (val
);
5078 CODING_ISO_FLAGS (coding
) = flags
;
5080 else if (EQ (coding_type
, Qcharset
))
5082 coding
->detector
= detect_coding_charset
;
5083 coding
->decoder
= decode_coding_charset
;
5084 coding
->encoder
= encode_coding_charset
;
5085 coding
->common_flags
5086 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5088 else if (EQ (coding_type
, Qutf_8
))
5090 coding
->detector
= detect_coding_utf_8
;
5091 coding
->decoder
= decode_coding_utf_8
;
5092 coding
->encoder
= encode_coding_utf_8
;
5093 coding
->common_flags
5094 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5096 else if (EQ (coding_type
, Qutf_16
))
5098 val
= AREF (attrs
, coding_attr_utf_16_bom
);
5099 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
5100 : EQ (val
, Qt
) ? utf_16_with_bom
5101 : utf_16_without_bom
);
5102 val
= AREF (attrs
, coding_attr_utf_16_endian
);
5103 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
5104 : utf_16_little_endian
);
5105 CODING_UTF_16_SURROGATE (coding
) = 0;
5106 coding
->detector
= detect_coding_utf_16
;
5107 coding
->decoder
= decode_coding_utf_16
;
5108 coding
->encoder
= encode_coding_utf_16
;
5109 coding
->common_flags
5110 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5111 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
5112 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5114 else if (EQ (coding_type
, Qccl
))
5116 coding
->detector
= detect_coding_ccl
;
5117 coding
->decoder
= decode_coding_ccl
;
5118 coding
->encoder
= encode_coding_ccl
;
5119 coding
->common_flags
5120 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5121 | CODING_REQUIRE_FLUSHING_MASK
);
5123 else if (EQ (coding_type
, Qemacs_mule
))
5125 coding
->detector
= detect_coding_emacs_mule
;
5126 coding
->decoder
= decode_coding_emacs_mule
;
5127 coding
->encoder
= encode_coding_emacs_mule
;
5128 coding
->common_flags
5129 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5130 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
5131 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
5133 Lisp_Object tail
, safe_charsets
;
5134 int max_charset_id
= 0;
5136 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5138 if (max_charset_id
< XFASTINT (XCAR (tail
)))
5139 max_charset_id
= XFASTINT (XCAR (tail
));
5140 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
5142 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5144 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
5145 coding
->max_charset_id
= max_charset_id
;
5146 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
5149 else if (EQ (coding_type
, Qshift_jis
))
5151 coding
->detector
= detect_coding_sjis
;
5152 coding
->decoder
= decode_coding_sjis
;
5153 coding
->encoder
= encode_coding_sjis
;
5154 coding
->common_flags
5155 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5157 else if (EQ (coding_type
, Qbig5
))
5159 coding
->detector
= detect_coding_big5
;
5160 coding
->decoder
= decode_coding_big5
;
5161 coding
->encoder
= encode_coding_big5
;
5162 coding
->common_flags
5163 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5165 else /* EQ (coding_type, Qraw_text) */
5167 coding
->detector
= NULL
;
5168 coding
->decoder
= decode_coding_raw_text
;
5169 coding
->encoder
= encode_coding_raw_text
;
5170 if (! EQ (eol_type
, Qunix
))
5172 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5173 if (! VECTORP (eol_type
))
5174 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5182 /* Return a list of charsets supported by CODING. */
5185 coding_charset_list (coding
)
5186 struct coding_system
*coding
;
5188 Lisp_Object attrs
, charset_list
;
5190 CODING_GET_INFO (coding
, attrs
, charset_list
);
5191 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5193 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5195 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5196 charset_list
= Viso_2022_charset_list
;
5198 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5200 charset_list
= Vemacs_mule_charset_list
;
5202 return charset_list
;
5206 /* Return raw-text or one of its subsidiaries that has the same
5207 eol_type as CODING-SYSTEM. */
5210 raw_text_coding_system (coding_system
)
5211 Lisp_Object coding_system
;
5213 Lisp_Object spec
, attrs
;
5214 Lisp_Object eol_type
, raw_text_eol_type
;
5216 if (NILP (coding_system
))
5218 spec
= CODING_SYSTEM_SPEC (coding_system
);
5219 attrs
= AREF (spec
, 0);
5221 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5222 return coding_system
;
5224 eol_type
= AREF (spec
, 2);
5225 if (VECTORP (eol_type
))
5227 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5228 raw_text_eol_type
= AREF (spec
, 2);
5229 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5230 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5231 : AREF (raw_text_eol_type
, 2));
5235 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5236 does, return the subsidiary of CODING_SYSTEM that has the same
5237 eol-spec as PARENT. Otherwise, return CODING_SYSTEM. If PARENT is
5238 nil, inherit end-of-line format from the system's setting
5239 (system_eol_type). */
5242 coding_inherit_eol_type (coding_system
, parent
)
5243 Lisp_Object coding_system
, parent
;
5245 Lisp_Object spec
, eol_type
;
5247 if (NILP (coding_system
))
5248 coding_system
= Qraw_text
;
5249 spec
= CODING_SYSTEM_SPEC (coding_system
);
5250 eol_type
= AREF (spec
, 2);
5251 if (VECTORP (eol_type
))
5253 Lisp_Object parent_eol_type
;
5255 if (! NILP (parent
))
5257 Lisp_Object parent_spec
;
5259 parent_spec
= CODING_SYSTEM_SPEC (parent
);
5260 parent_eol_type
= AREF (parent_spec
, 2);
5263 parent_eol_type
= system_eol_type
;
5264 if (EQ (parent_eol_type
, Qunix
))
5265 coding_system
= AREF (eol_type
, 0);
5266 else if (EQ (parent_eol_type
, Qdos
))
5267 coding_system
= AREF (eol_type
, 1);
5268 else if (EQ (parent_eol_type
, Qmac
))
5269 coding_system
= AREF (eol_type
, 2);
5271 return coding_system
;
5274 /* Emacs has a mechanism to automatically detect a coding system if it
5275 is one of Emacs' internal format, ISO2022, SJIS, or BIG5. However,
5276 it is impossible to distinguish some coding systems accurately
5277 because they use the same range of codes. So, coding systems are
5278 first grouped into the following categories:
5280 o coding-category-emacs-mule
5282 The category for a coding system which has the same code range
5283 as Emacs' internal format. Assigned the coding-system (Lisp
5284 symbol) `emacs-mule' by default.
5286 o coding-category-sjis
5288 The category for a coding system which has the same code range
5289 as SJIS. Assigned the coding-system (Lisp
5290 symbol) `japanese-shift-jis' by default.
5292 o coding-category-iso-7
5294 The category for a coding system which has the same code range
5295 as ISO2022 of 7-bit environment. This doesn't use any locking
5296 shift and single shift functions. This can encode/decode all
5297 charsets. Assigned the coding-system (Lisp symbol)
5298 `iso-2022-7bit' by default.
5300 o coding-category-iso-7-tight
5302 Same as coding-category-iso-7 except that this can
5303 encode/decode only the specified charsets.
5305 o coding-category-iso-8-1
5307 The category for a coding system which has the same code range
5308 as ISO2022 of 8-bit environment and graphic plane 1 used only
5309 for DIMENSION1 charset. This doesn't use any locking shift
5310 and single shift functions. Assigned the coding-system (Lisp
5311 symbol) `iso-latin-1' by default.
5313 o coding-category-iso-8-2
5315 The category for a coding system which has the same code range
5316 as ISO2022 of 8-bit environment and graphic plane 1 used only
5317 for DIMENSION2 charset. This doesn't use any locking shift
5318 and single shift functions. Assigned the coding-system (Lisp
5319 symbol) `japanese-iso-8bit' by default.
5321 o coding-category-iso-7-else
5323 The category for a coding system which has the same code range
5324 as ISO2022 of 7-bit environment but uses locking shift or
5325 single shift functions. Assigned the coding-system (Lisp
5326 symbol) `iso-2022-7bit-lock' by default.
5328 o coding-category-iso-8-else
5330 The category for a coding system which has the same code range
5331 as ISO2022 of 8-bit environment but uses locking shift or
5332 single shift functions. Assigned the coding-system (Lisp
5333 symbol) `iso-2022-8bit-ss2' by default.
5335 o coding-category-big5
5337 The category for a coding system which has the same code range
5338 as BIG5. Assigned the coding-system (Lisp symbol)
5339 `cn-big5' by default.
5341 o coding-category-utf-8
5343 The category for a coding system which has the same code range
5344 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5345 symbol) `utf-8' by default.
5347 o coding-category-utf-16-be
5349 The category for a coding system in which a text has a
5350 Unicode signature (cf. Unicode Standard) in big-endian byte
5351 order at the head. Assigned the coding-system (Lisp symbol)
5352 `utf-16-be' by default.
5354 o coding-category-utf-16-le
5356 The category for a coding system in which a text has a
5357 Unicode signature (cf. Unicode Standard) in little-endian
5358 byte order at the head. Assigned the coding-system (Lisp
5359 symbol) `utf-16-le' by default.
5361 o coding-category-ccl
5363 The category for a coding system of which encoder/decoder is
5364 written in CCL programs. The default value is nil, i.e., no
5365 coding system is assigned.
5367 o coding-category-binary
5369 The category for a coding system not categorized in any of the
5370 above. Assigned the coding-system (Lisp symbol)
5371 `no-conversion' by default.
5373 Each of them is a Lisp symbol whose value is the actual
5374 `coding-system' (also a Lisp symbol) assigned to it by a user.
5375 What Emacs actually does is detect the category of a coding system
5376 and then use the `coding-system' assigned to that category. If Emacs
5377 can't narrow the text down to a single category, it selects the
5378 category of the highest priority. Priorities of categories are also
5379 specified by a user in the Lisp variable `coding-category-list'.
5383 #define EOL_SEEN_NONE 0
5384 #define EOL_SEEN_LF 1
5385 #define EOL_SEEN_CR 2
5386 #define EOL_SEEN_CRLF 4
5388 /* Detect how end-of-line of a text of length SRC_BYTES pointed to by
5389 SOURCE is encoded. If CATEGORY is one of
5390 coding_category_utf_16_XXXX, assume that CR and LF are each encoded
5391 as two bytes; otherwise they are encoded as one byte.
5393 Return one of EOL_SEEN_XXX. */
5395 #define MAX_EOL_CHECK_COUNT 3
5398 detect_eol (source
, src_bytes
, category
)
5399 const unsigned char *source
;
5400 EMACS_INT src_bytes
;
5401 enum coding_category category
;
5403 const unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5406 int eol_seen
= EOL_SEEN_NONE
;
5408 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5412 msb
= category
== (coding_category_utf_16_le
5413 | coding_category_utf_16_le_nosig
);
5416 while (src
+ 1 < src_end
)
5419 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5424 this_eol
= EOL_SEEN_LF
;
5425 else if (src
+ 3 >= src_end
5426 || src
[msb
+ 2] != 0
5427 || src
[lsb
+ 2] != '\n')
5428 this_eol
= EOL_SEEN_CR
;
5430 this_eol
= EOL_SEEN_CRLF
;
5432 if (eol_seen
== EOL_SEEN_NONE
)
5433 /* This is the first end-of-line. */
5434 eol_seen
= this_eol
;
5435 else if (eol_seen
!= this_eol
)
5437 /* The found type is different from what found before. */
5438 eol_seen
= EOL_SEEN_LF
;
5441 if (++total
== MAX_EOL_CHECK_COUNT
)
5449 while (src
< src_end
)
5452 if (c
== '\n' || c
== '\r')
5457 this_eol
= EOL_SEEN_LF
;
5458 else if (src
>= src_end
|| *src
!= '\n')
5459 this_eol
= EOL_SEEN_CR
;
5461 this_eol
= EOL_SEEN_CRLF
, src
++;
5463 if (eol_seen
== EOL_SEEN_NONE
)
5464 /* This is the first end-of-line. */
5465 eol_seen
= this_eol
;
5466 else if (eol_seen
!= this_eol
)
5468 /* The found type is different from what found before. */
5469 eol_seen
= EOL_SEEN_LF
;
5472 if (++total
== MAX_EOL_CHECK_COUNT
)
5482 adjust_coding_eol_type (coding
, eol_seen
)
5483 struct coding_system
*coding
;
5486 Lisp_Object eol_type
;
5488 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5489 if (eol_seen
& EOL_SEEN_LF
)
5491 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5494 else if (eol_seen
& EOL_SEEN_CRLF
)
5496 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5499 else if (eol_seen
& EOL_SEEN_CR
)
5501 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5507 /* Detect how a text specified in CODING is encoded. If a coding
5508 system is detected, update fields of CODING by the detected coding
5512 detect_coding (coding
)
5513 struct coding_system
*coding
;
5515 const unsigned char *src
, *src_end
;
5517 coding
->consumed
= coding
->consumed_char
= 0;
5518 coding
->produced
= coding
->produced_char
= 0;
5519 coding_set_source (coding
);
5521 src_end
= coding
->source
+ coding
->src_bytes
;
5523 /* If we have not yet decided the text encoding type, detect it
5525 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5528 struct coding_detection_info detect_info
;
5530 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5531 for (i
= 0, src
= coding
->source
; src
< src_end
; i
++, src
++)
5537 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
5538 && ! inhibit_iso_escape_detection
5539 && ! detect_info
.checked
)
5541 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5542 if (detect_coding_iso_2022 (coding
, &detect_info
))
5544 /* We have scanned the whole data. */
5545 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
5546 /* We didn't find an 8-bit code. */
5552 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5554 if (coding
->head_ascii
< coding
->src_bytes
5555 || detect_info
.found
)
5557 enum coding_category category
;
5558 struct coding_system
*this;
5560 if (coding
->head_ascii
== coding
->src_bytes
)
5561 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5562 for (i
= 0; i
< coding_category_raw_text
; i
++)
5564 category
= coding_priorities
[i
];
5565 this = coding_categories
+ category
;
5566 if (detect_info
.found
& (1 << category
))
5570 for (i
= 0; i
< coding_category_raw_text
; i
++)
5572 category
= coding_priorities
[i
];
5573 this = coding_categories
+ category
;
5576 /* No coding system of this category is defined. */
5577 detect_info
.rejected
|= (1 << category
);
5579 else if (category
>= coding_category_raw_text
)
5581 else if (detect_info
.checked
& (1 << category
))
5583 if (detect_info
.found
& (1 << category
))
5586 else if ((*(this->detector
)) (coding
, &detect_info
)
5587 && detect_info
.found
& (1 << category
))
5589 if (category
== coding_category_utf_16_auto
)
5591 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5592 category
= coding_category_utf_16_le
;
5594 category
= coding_category_utf_16_be
;
5600 if (i
< coding_category_raw_text
)
5601 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5602 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5603 setup_coding_system (Qraw_text
, coding
);
5604 else if (detect_info
.rejected
)
5605 for (i
= 0; i
< coding_category_raw_text
; i
++)
5606 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5608 this = coding_categories
+ coding_priorities
[i
];
5609 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5614 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
5615 == coding_category_utf_16_auto
)
5617 Lisp_Object coding_systems
;
5618 struct coding_detection_info detect_info
;
5621 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5622 detect_info
.found
= detect_info
.rejected
= 0;
5623 if (CONSP (coding_systems
)
5624 && detect_coding_utf_16 (coding
, &detect_info
))
5626 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5627 setup_coding_system (XCAR (coding_systems
), coding
);
5628 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
5629 setup_coding_system (XCDR (coding_systems
), coding
);
5637 struct coding_system
*coding
;
5639 Lisp_Object eol_type
;
5640 unsigned char *p
, *pbeg
, *pend
;
5642 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5643 if (EQ (eol_type
, Qunix
))
5646 if (NILP (coding
->dst_object
))
5647 pbeg
= coding
->destination
;
5649 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
5650 pend
= pbeg
+ coding
->produced
;
5652 if (VECTORP (eol_type
))
5654 int eol_seen
= EOL_SEEN_NONE
;
5656 for (p
= pbeg
; p
< pend
; p
++)
5659 eol_seen
|= EOL_SEEN_LF
;
5660 else if (*p
== '\r')
5662 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5664 eol_seen
|= EOL_SEEN_CRLF
;
5668 eol_seen
|= EOL_SEEN_CR
;
5671 if (eol_seen
!= EOL_SEEN_NONE
5672 && eol_seen
!= EOL_SEEN_LF
5673 && eol_seen
!= EOL_SEEN_CRLF
5674 && eol_seen
!= EOL_SEEN_CR
)
5675 eol_seen
= EOL_SEEN_LF
;
5676 if (eol_seen
!= EOL_SEEN_NONE
)
5677 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
5680 if (EQ (eol_type
, Qmac
))
5682 for (p
= pbeg
; p
< pend
; p
++)
5686 else if (EQ (eol_type
, Qdos
))
5690 if (NILP (coding
->dst_object
))
5692 /* Start deleting '\r' from the tail to minimize the memory
5694 for (p
= pend
- 2; p
>= pbeg
; p
--)
5697 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
-- - p
- 1);
5703 int pos_byte
= coding
->dst_pos_byte
;
5704 int pos
= coding
->dst_pos
;
5705 int pos_end
= pos
+ coding
->produced_char
- 1;
5707 while (pos
< pos_end
)
5709 p
= BYTE_POS_ADDR (pos_byte
);
5710 if (*p
== '\r' && p
[1] == '\n')
5712 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
5717 pos_byte
+= BYTES_BY_CHAR_HEAD (*p
);
5720 coding
->produced
-= n
;
5721 coding
->produced_char
-= n
;
5726 /* Return a translation table (or list of them) from coding system
5727 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5728 decoding (ENCODEP is zero). */
5731 get_translation_table (attrs
, encodep
, max_lookup
)
5733 int encodep
, *max_lookup
;
5735 Lisp_Object standard
, translation_table
;
5739 translation_table
= CODING_ATTR_ENCODE_TBL (attrs
),
5740 standard
= Vstandard_translation_table_for_encode
;
5742 translation_table
= CODING_ATTR_DECODE_TBL (attrs
),
5743 standard
= Vstandard_translation_table_for_decode
;
5744 if (NILP (translation_table
))
5745 translation_table
= standard
;
5748 if (SYMBOLP (translation_table
))
5749 translation_table
= Fget (translation_table
, Qtranslation_table
);
5750 else if (CONSP (translation_table
))
5752 translation_table
= Fcopy_sequence (translation_table
);
5753 for (val
= translation_table
; CONSP (val
); val
= XCDR (val
))
5754 if (SYMBOLP (XCAR (val
)))
5755 XSETCAR (val
, Fget (XCAR (val
), Qtranslation_table
));
5757 if (CHAR_TABLE_P (standard
))
5759 if (CONSP (translation_table
))
5760 translation_table
= nconc2 (translation_table
,
5761 Fcons (standard
, Qnil
));
5763 translation_table
= Fcons (translation_table
,
5764 Fcons (standard
, Qnil
));
5771 if (CHAR_TABLE_P (translation_table
)
5772 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table
)) > 1)
5774 val
= XCHAR_TABLE (translation_table
)->extras
[1];
5775 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
5776 *max_lookup
= XFASTINT (val
);
5778 else if (CONSP (translation_table
))
5780 Lisp_Object tail
, val
;
5782 for (tail
= translation_table
; CONSP (tail
); tail
= XCDR (tail
))
5783 if (CHAR_TABLE_P (XCAR (tail
))
5784 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail
))) > 1)
5786 val
= XCHAR_TABLE (XCAR (tail
))->extras
[1];
5787 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
5788 *max_lookup
= XFASTINT (val
);
5792 return translation_table
;
5795 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5798 if (CHAR_TABLE_P (table)) \
5800 trans = CHAR_TABLE_REF (table, c); \
5801 if (CHARACTERP (trans)) \
5802 c = XFASTINT (trans), trans = Qnil; \
5804 else if (CONSP (table)) \
5808 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5809 if (CHAR_TABLE_P (XCAR (tail))) \
5811 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5812 if (CHARACTERP (trans)) \
5813 c = XFASTINT (trans), trans = Qnil; \
5814 else if (! NILP (trans)) \
5822 get_translation (val
, buf
, buf_end
, last_block
, from_nchars
, to_nchars
)
5826 int *from_nchars
, *to_nchars
;
5828 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5832 Lisp_Object from
, tail
;
5835 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
5840 for (i
= 0; i
< len
; i
++)
5842 if (buf
+ i
== buf_end
)
5848 if (XINT (AREF (from
, i
)) != buf
[i
])
5862 *buf
= XINT (AREF (val
, 0)), *to_nchars
= ASIZE (val
);
5870 produce_chars (coding
, translation_table
, last_block
)
5871 struct coding_system
*coding
;
5872 Lisp_Object translation_table
;
5875 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5876 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5878 int produced_chars
= 0;
5881 if (! coding
->chars_at_source
)
5883 /* Characters are in coding->charbuf. */
5884 int *buf
= coding
->charbuf
;
5885 int *buf_end
= buf
+ coding
->charbuf_used
;
5887 if (BUFFERP (coding
->src_object
)
5888 && EQ (coding
->src_object
, coding
->dst_object
))
5889 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
5891 while (buf
< buf_end
)
5897 int from_nchars
= 1, to_nchars
= 1;
5898 Lisp_Object trans
= Qnil
;
5900 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
5903 trans
= get_translation (trans
, buf
, buf_end
, last_block
,
5904 &from_nchars
, &to_nchars
);
5910 if (dst
+ MAX_MULTIBYTE_LENGTH
* to_nchars
> dst_end
)
5912 dst
= alloc_destination (coding
,
5914 + MAX_MULTIBYTE_LENGTH
* to_nchars
,
5916 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5919 for (i
= 0; i
< to_nchars
; i
++)
5922 c
= XINT (AREF (trans
, i
));
5923 if (coding
->dst_multibyte
5924 || ! CHAR_BYTE8_P (c
))
5925 CHAR_STRING_ADVANCE (c
, dst
);
5927 *dst
++ = CHAR_TO_BYTE8 (c
);
5929 produced_chars
+= to_nchars
;
5931 while (--from_nchars
> 0)
5935 /* This is an annotation datum. (-C) is the length. */
5938 carryover
= buf_end
- buf
;
5942 const unsigned char *src
= coding
->source
;
5943 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5944 Lisp_Object eol_type
;
5946 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5948 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5950 if (coding
->src_multibyte
)
5957 const unsigned char *src_base
= src
;
5963 if (EQ (eol_type
, Qdos
))
5967 record_conversion_result
5968 (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5969 goto no_more_source
;
5974 else if (EQ (eol_type
, Qmac
))
5979 coding
->consumed
= src
- coding
->source
;
5981 if (EQ (coding
->src_object
, coding
->dst_object
))
5982 dst_end
= (unsigned char *) src
;
5985 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5987 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5988 coding_set_source (coding
);
5989 src
= coding
->source
+ coding
->consumed
;
5990 src_end
= coding
->source
+ coding
->src_bytes
;
6000 while (src
< src_end
)
6007 if (EQ (eol_type
, Qdos
))
6013 else if (EQ (eol_type
, Qmac
))
6016 if (dst
>= dst_end
- 1)
6018 coding
->consumed
= src
- coding
->source
;
6020 if (EQ (coding
->src_object
, coding
->dst_object
))
6021 dst_end
= (unsigned char *) src
;
6022 if (dst
>= dst_end
- 1)
6024 dst
= alloc_destination (coding
, src_end
- src
+ 2,
6026 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6027 coding_set_source (coding
);
6028 src
= coding
->source
+ coding
->consumed
;
6029 src_end
= coding
->source
+ coding
->src_bytes
;
6037 if (!EQ (coding
->src_object
, coding
->dst_object
))
6039 int require
= coding
->src_bytes
- coding
->dst_bytes
;
6043 EMACS_INT offset
= src
- coding
->source
;
6045 dst
= alloc_destination (coding
, require
, dst
);
6046 coding_set_source (coding
);
6047 src
= coding
->source
+ offset
;
6048 src_end
= coding
->source
+ coding
->src_bytes
;
6051 produced_chars
= coding
->src_chars
;
6052 while (src
< src_end
)
6058 if (EQ (eol_type
, Qdos
))
6065 else if (EQ (eol_type
, Qmac
))
6071 coding
->consumed
= coding
->src_bytes
;
6072 coding
->consumed_char
= coding
->src_chars
;
6075 produced
= dst
- (coding
->destination
+ coding
->produced
);
6076 if (BUFFERP (coding
->dst_object
))
6077 insert_from_gap (produced_chars
, produced
);
6078 coding
->produced
+= produced
;
6079 coding
->produced_char
+= produced_chars
;
6083 /* Compose text in CODING->dst_object according to the annotation data
6084 at CHARBUF. CHARBUF is an array:
6085 [ -LENGTH ANNOTATION_MASK NCHARS METHOD [ COMPONENTS... ] ]
6089 produce_composition (coding
, charbuf
, pos
)
6090 struct coding_system
*coding
;
6096 enum composition_method method
;
6097 Lisp_Object components
;
6100 to
= pos
+ charbuf
[2];
6103 method
= (enum composition_method
) (charbuf
[3]);
6105 if (method
== COMPOSITION_RELATIVE
)
6107 else if (method
>= COMPOSITION_WITH_RULE
6108 && method
<= COMPOSITION_WITH_RULE_ALTCHARS
)
6110 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
6115 for (i
= 0; i
< len
; i
++)
6117 args
[i
] = make_number (charbuf
[i
]);
6121 components
= (method
== COMPOSITION_WITH_ALTCHARS
6122 ? Fstring (len
, args
) : Fvector (len
, args
));
6126 compose_text (pos
, to
, components
, Qnil
, coding
->dst_object
);
6130 /* Put `charset' property on text in CODING->dst_object according to
6131 the annotation data at CHARBUF. CHARBUF is an array:
6132 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6136 produce_charset (coding
, charbuf
, pos
)
6137 struct coding_system
*coding
;
6141 EMACS_INT from
= pos
- charbuf
[2];
6142 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[3]);
6144 Fput_text_property (make_number (from
), make_number (pos
),
6145 Qcharset
, CHARSET_NAME (charset
),
6146 coding
->dst_object
);
6150 #define CHARBUF_SIZE 0x4000
6152 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6154 int size = CHARBUF_SIZE;; \
6156 coding->charbuf = NULL; \
6157 while (size > 1024) \
6159 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6160 if (coding->charbuf) \
6164 if (! coding->charbuf) \
6166 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6167 return coding->result; \
6169 coding->charbuf_size = size; \
/* Scan CODING->charbuf up to CODING->charbuf_used and dispatch the
   annotation entries found there: CODING_ANNOTATE_COMPOSITION_MASK
   entries are handed to produce_composition and
   CODING_ANNOTATE_CHARSET_MASK entries to produce_charset, both with
   POS.  The length of an entry is the negation of its first element.
   CODING->dst_object being nil is tested before the scan; presumably
   nothing is produced in that case -- TODO confirm.  */
6174 produce_annotation (coding
, pos
)
6175 struct coding_system
*coding
;
6178 int *charbuf
= coding
->charbuf
;
6179 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
6181 if (NILP (coding
->dst_object
))
6184 while (charbuf
< charbuf_end
)
6190 int len
= -*charbuf
;
6193 case CODING_ANNOTATE_COMPOSITION_MASK
:
6194 produce_composition (coding
, charbuf
, pos
);
6196 case CODING_ANNOTATE_CHARSET_MASK
:
6197 produce_charset (coding
, charbuf
, pos
);
6207 /* Decode the data at CODING->src_object into CODING->dst_object.
6208 CODING->src_object is a buffer, a string, or nil.
6209 CODING->dst_object is a buffer.
6211 If CODING->src_object is a buffer, it must be the current buffer.
6212 In this case, if CODING->src_pos is positive, it is a position of
6213 the source text in the buffer, otherwise, the source text is in the
6214 gap area of the buffer, and CODING->src_pos specifies the offset of
6215 the text from GPT (which must be the same as PT). If this is the
6216 same buffer as CODING->dst_object, CODING->src_pos must be
6219 If CODING->src_object is a string, CODING->src_pos is an index to
6222 If CODING->src_object is nil, CODING->source must already point to
6223 the non-relocatable memory area. In this case, CODING->src_pos is
6224 an offset from CODING->source.
6226 The decoded data is inserted at the current point of the buffer
/* Implementation outline of decode_coding:
   1. If the source is a buffer range that straddles the gap, move
      the gap to the start of the source.
   2. If the destination is a buffer, make it current, move the gap
      to point, and set its undo list to Qt while decoding; the saved
      list is put back at the end and one record_insert is made for
      the whole produced text.
   3. Reset the consumed/produced counters, allocate the work area,
      and repeatedly run CODING->decoder followed by produce_chars
      (and produce_annotation when CODING->annotated is set) while
      source bytes remain and the result is CODING_RESULT_SUCCESS or
      CODING_RESULT_INVALID_SRC.
   4. Source bytes still unconsumed are either flushed through
      BYTE8_TO_CHAR (when CODING_MODE_LAST_BLOCK is set) or kept in
      CODING->carryover.
   5. Run decode_eol unless the eol type is Qunix.
   The value returned is CODING->result.  */
6231 decode_coding (coding
)
6232 struct coding_system
*coding
;
6235 Lisp_Object undo_list
;
6236 Lisp_Object translation_table
;
/* The source text straddles the gap: move the gap to its start.  */
6240 if (BUFFERP (coding
->src_object
)
6241 && coding
->src_pos
> 0
6242 && coding
->src_pos
< GPT
6243 && coding
->src_pos
+ coding
->src_chars
> GPT
)
6244 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
6247 if (BUFFERP (coding
->dst_object
))
6249 if (current_buffer
!= XBUFFER (coding
->dst_object
))
6250 set_buffer_internal (XBUFFER (coding
->dst_object
));
6252 move_gap_both (PT
, PT_BYTE
);
6253 undo_list
= current_buffer
->undo_list
;
6254 current_buffer
->undo_list
= Qt
;
6257 coding
->consumed
= coding
->consumed_char
= 0;
6258 coding
->produced
= coding
->produced_char
= 0;
6259 coding
->chars_at_source
= 0;
6260 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
6263 ALLOC_CONVERSION_WORK_AREA (coding
);
6265 attrs
= CODING_ID_ATTRS (coding
->id
);
6266 translation_table
= get_translation_table (attrs
, 0, NULL
);
6271 EMACS_INT pos
= coding
->dst_pos
+ coding
->produced_char
;
6273 coding_set_source (coding
);
6274 coding
->annotated
= 0;
6275 coding
->charbuf_used
= carryover
;
6276 (*(coding
->decoder
)) (coding
);
6277 coding_set_destination (coding
);
6278 carryover
= produce_chars (coding
, translation_table
, 0);
6279 if (coding
->annotated
)
6280 produce_annotation (coding
, pos
);
6281 for (i
= 0; i
< carryover
; i
++)
6283 = coding
->charbuf
[coding
->charbuf_used
- carryover
+ i
];
6285 while (coding
->consumed
< coding
->src_bytes
6286 && (coding
->result
== CODING_RESULT_SUCCESS
6287 || coding
->result
== CODING_RESULT_INVALID_SRC
));
6291 coding_set_destination (coding
);
6292 coding
->charbuf_used
= carryover
;
6293 produce_chars (coding
, translation_table
, 1);
6296 coding
->carryover_bytes
= 0;
6297 if (coding
->consumed
< coding
->src_bytes
)
6299 int nbytes
= coding
->src_bytes
- coding
->consumed
;
6300 const unsigned char *src
;
6302 coding_set_source (coding
);
6303 coding_set_destination (coding
);
6304 src
= coding
->source
+ coding
->consumed
;
6306 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
6308 /* Flush out unprocessed data as binary chars. We are sure
6309 that the number of data is less than the size of
6311 coding
->charbuf_used
= 0;
6312 while (nbytes
-- > 0)
6317 c
= BYTE8_TO_CHAR (c
);
6318 coding
->charbuf
[coding
->charbuf_used
++] = c
;
6320 produce_chars (coding
, Qnil
, 1);
6324 /* Record unprocessed bytes in coding->carryover. We are
6325 sure that the number of data is less than the size of
6326 coding->carryover. */
6327 unsigned char *p
= coding
->carryover
;
6329 coding
->carryover_bytes
= nbytes
;
6330 while (nbytes
-- > 0)
6333 coding
->consumed
= coding
->src_bytes
;
6336 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
6337 decode_eol (coding
);
6338 if (BUFFERP (coding
->dst_object
))
6340 current_buffer
->undo_list
= undo_list
;
6341 record_insert (coding
->dst_pos
, coding
->produced_char
);
6343 return coding
->result
;
6347 /* Extract an annotation datum from a composition starting at POS and
6348 ending before LIMIT of CODING->src_object (buffer or string), store
6349 the data in BUF, set *STOP to a starting position of the next
6350 composition (if any) or to LIMIT, and return the address of the
6351 next element of BUF.
6353 If such an annotation is not found, set *STOP to a starting
6354 position of a composition after POS (if any) or to LIMIT, and
/* Implementation notes: the composition is located with
   find_composition on CODING->src_object.  When one is found, its
   method and length are written with ADD_COMPOSITION_DATA; for
   methods other than COMPOSITION_RELATIVE the components are then
   appended to BUF one int at a time, whether they are stored as a
   vector, a string, a single integer, or a list.  A second
   find_composition call, starting at END, looks for the following
   composition.  */
6358 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
6359 EMACS_INT pos
, limit
;
6360 struct coding_system
*coding
;
6364 EMACS_INT start
, end
;
6367 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
6370 else if (start
> pos
)
6376 /* We found a composition. Store the corresponding
6377 annotation data in BUF. */
6379 enum composition_method method
= COMPOSITION_METHOD (prop
);
6380 int nchars
= COMPOSITION_LENGTH (prop
);
6382 ADD_COMPOSITION_DATA (buf
, nchars
, method
);
6383 if (method
!= COMPOSITION_RELATIVE
)
6385 Lisp_Object components
;
6388 components
= COMPOSITION_COMPONENTS (prop
);
6389 if (VECTORP (components
))
6391 len
= XVECTOR (components
)->size
;
6392 for (i
= 0; i
< len
; i
++)
6393 *buf
++ = XINT (AREF (components
, i
));
6395 else if (STRINGP (components
))
6397 len
= SCHARS (components
);
6401 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
6405 else if (INTEGERP (components
))
6408 *buf
++ = XINT (components
);
6410 else if (CONSP (components
))
6412 for (len
= 0; CONSP (components
);
6413 len
++, components
= XCDR (components
))
6414 *buf
++ = XINT (XCAR (components
));
6422 if (find_composition (end
, limit
, &start
, &end
, &prop
,
6433 /* Extract an annotation datum from a text property `charset' at POS of
6434 CODING->src_object (buffer or string), store the data in BUF, set
6435 *STOP to the position where the value of `charset' property changes
6436 (limiting by LIMIT), and return the address of the next element of
6439 If the property value is nil, set *STOP to the position where the
6440 property value is non-nil (limiting by LIMIT), and return BUF. */
/* Implementation notes: the Qcharset text property is read at POS
   from CODING->src_object.  When the value is non-nil and satisfies
   CHARSETP, the charset ID is taken with CHARSET_SYMBOL_ID and
   written to BUF by ADD_CHARSET_DATA.  *STOP receives the position
   returned by Fnext_single_property_change, bounded by LIMIT.  */
6443 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
6444 EMACS_INT pos
, limit
;
6445 struct coding_system
*coding
;
6449 Lisp_Object val
, next
;
6452 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
6453 if (! NILP (val
) && CHARSETP (val
))
6454 id
= XINT (CHARSET_SYMBOL_ID (val
));
6457 ADD_CHARSET_DATA (buf
, 0, id
);
6458 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
6460 make_number (limit
));
6461 *stop
= XINT (next
);
/* Read characters from CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf for the encoder.

   TRANSLATION_TABLE, if non-nil, is looked up for each character; in
   that case MAX_LOOKUP ints of scratch space are alloca'ed so that a
   sequence of source characters can be matched by get_translation.

   Annotation data is added to the buffer by
   handle_composition_annotation and handle_charset_annotation when
   POS reaches the corresponding stop position; both stops are END_POS
   when CODING->src_object is nil.

   On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read, and
   CODING->chars_at_source is cleared.  */
6467 consume_chars (coding
, translation_table
, max_lookup
)
6468 struct coding_system
*coding
;
6469 Lisp_Object translation_table
;
6472 int *buf
= coding
->charbuf
;
6473 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6474 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6475 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6476 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6477 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6478 int multibytep
= coding
->src_multibyte
;
6479 Lisp_Object eol_type
;
6481 EMACS_INT stop
, stop_composition
, stop_charset
;
6482 int *lookup_buf
= NULL
;
6484 if (! NILP (translation_table
))
6485 lookup_buf
= alloca (sizeof (int) * max_lookup
);
6487 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6488 if (VECTORP (eol_type
))
6491 /* Note: composition handling is not yet implemented. */
6492 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6494 if (NILP (coding
->src_object
))
6495 stop
= stop_composition
= stop_charset
= end_pos
;
6498 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6499 stop
= stop_composition
= pos
;
6501 stop
= stop_composition
= end_pos
;
6502 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6503 stop
= stop_charset
= pos
;
6505 stop_charset
= end_pos
;
6508 /* Compensate for CRLF and conversion. */
6509 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6510 while (buf
< buf_end
)
6518 if (pos
== stop_composition
)
6519 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6520 buf
, &stop_composition
);
6521 if (pos
== stop_charset
)
6522 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6523 buf
, &stop_charset
);
6524 stop
= (stop_composition
< stop_charset
6525 ? stop_composition
: stop_charset
);
6532 if (coding
->encoder
== encode_coding_raw_text
)
6534 else if ((bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6535 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6537 c
= BYTE8_TO_CHAR (*src
), src
++, pos
++;
6540 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6541 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6543 if (! EQ (eol_type
, Qunix
))
6547 if (EQ (eol_type
, Qdos
))
6555 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
6560 int from_nchars
= 1, to_nchars
= 1;
6561 int *lookup_buf_end
;
6562 const unsigned char *p
= src
;
6566 for (i
= 1; i
< max_lookup
&& p
< src_end
; i
++)
6567 lookup_buf
[i
] = STRING_CHAR_ADVANCE (p
);
6568 lookup_buf_end
= lookup_buf
+ i
;
6569 trans
= get_translation (trans
, lookup_buf
, lookup_buf_end
, 1,
6570 &from_nchars
, &to_nchars
);
6572 || buf
+ to_nchars
> buf_end
)
6574 *buf
++ = *lookup_buf
;
6575 for (i
= 1; i
< to_nchars
; i
++)
6576 *buf
++ = XINT (AREF (trans
, i
));
6577 for (i
= 1; i
< from_nchars
; i
++, pos
++)
6578 src
+= MULTIBYTE_LENGTH_NO_CHECK (src
);
6582 coding
->consumed
= src
- coding
->source
;
6583 coding
->consumed_char
= pos
- coding
->src_pos
;
6584 coding
->charbuf_used
= buf
- coding
->charbuf
;
6585 coding
->chars_at_source
= 0;
6589 /* Encode the text at CODING->src_object into CODING->dst_object.
6590 CODING->src_object is a buffer or a string.
6591 CODING->dst_object is a buffer or nil.
6593 If CODING->src_object is a buffer, it must be the current buffer.
6594 In this case, if CODING->src_pos is positive, it is a position of
6595 the source text in the buffer, otherwise, the source text is in the
6596 gap area of the buffer, and coding->src_pos specifies the offset of
6597 the text from GPT (which must be the same as PT). If this is the
6598 same buffer as CODING->dst_object, CODING->src_pos must be
6599 negative and CODING should not have `pre-write-conversion'.
6601 If CODING->src_object is a string, CODING should not have
6602 `pre-write-conversion'.
6604 If CODING->dst_object is a buffer, the encoded data is inserted at
6605 the current point of that buffer.
6607 If CODING->dst_object is nil, the encoded data is placed at the
6608 memory area specified by CODING->destination. */
/* Implementation outline of encode_coding:
   1. Fetch the translation table for encoding together with its
      maximum lookup length; the raw-text encoder uses no table.
   2. If the destination is a buffer, make it current and take
      CODING->dst_multibyte from its multibyteness.
   3. Reset the consumed/produced counters and allocate the work
      area.
   4. Alternate consume_chars and CODING->encoder until
      CODING->consumed_char reaches CODING->src_chars.
   5. For a buffer destination, call insert_from_gap with the
      produced character and byte counts.
   The value returned is CODING->result.  */
6611 encode_coding (coding
)
6612 struct coding_system
*coding
;
6615 Lisp_Object translation_table
;
6618 attrs
= CODING_ID_ATTRS (coding
->id
);
6619 if (coding
->encoder
== encode_coding_raw_text
)
6620 translation_table
= Qnil
, max_lookup
= 0;
6622 translation_table
= get_translation_table (attrs
, 1, &max_lookup
);
6624 if (BUFFERP (coding
->dst_object
))
6626 set_buffer_internal (XBUFFER (coding
->dst_object
));
6627 coding
->dst_multibyte
6628 = ! NILP (current_buffer
->enable_multibyte_characters
);
6631 coding
->consumed
= coding
->consumed_char
= 0;
6632 coding
->produced
= coding
->produced_char
= 0;
6633 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
6636 ALLOC_CONVERSION_WORK_AREA (coding
);
6639 coding_set_source (coding
);
6640 consume_chars (coding
, translation_table
, max_lookup
);
6641 coding_set_destination (coding
);
6642 (*(coding
->encoder
)) (coding
);
6643 } while (coding
->consumed_char
< coding
->src_chars
);
6645 if (BUFFERP (coding
->dst_object
))
6646 insert_from_gap (coding
->produced_char
, coding
->produced
);
6648 return (coding
->result
);
6652 /* Name (or base name) of work buffer for code conversion. */
6653 static Lisp_Object Vcode_conversion_workbuf_name
;
6655 /* A working buffer used by the top level conversion. Once it is
6656 created, it is never destroyed. It has the name
6657 Vcode_conversion_workbuf_name. The other working buffers are
6658 destroyed after the use is finished, and their names are modified
6659 versions of Vcode_conversion_workbuf_name. */
6660 static Lisp_Object Vcode_conversion_reused_workbuf
;
6662 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6663 static int reused_workbuf_in_use
;
6666 /* Return a working buffer of code conversion. MULTIBYTE specifies the
6667 multibyteness of returning buffer. */
/* Implementation notes: reused_workbuf_in_use is incremented on every
   call.  If it was already non-zero, a buffer with a fresh name
   derived from Vcode_conversion_workbuf_name is created; otherwise
   the buffer named Vcode_conversion_workbuf_name is used and
   remembered in Vcode_conversion_reused_workbuf the first time.  The
   chosen buffer is made current only long enough to set its
   undo_list to Qt and its multibyteness from MULTIBYTE; the caller's
   current buffer is then made current again.  */
6670 make_conversion_work_buffer (multibyte
)
6673 Lisp_Object name
, workbuf
;
6674 struct buffer
*current
;
6676 if (reused_workbuf_in_use
++)
6678 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
6679 workbuf
= Fget_buffer_create (name
);
6683 name
= Vcode_conversion_workbuf_name
;
6684 workbuf
= Fget_buffer_create (name
);
6685 if (NILP (Vcode_conversion_reused_workbuf
))
6686 Vcode_conversion_reused_workbuf
= workbuf
;
6688 current
= current_buffer
;
6689 set_buffer_internal (XBUFFER (workbuf
));
6691 current_buffer
->undo_list
= Qt
;
6692 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
6693 set_buffer_internal (current
);
/* Unwind handler registered by code_conversion_save.  ARG is a cons
   (CURRENT . WORKBUF).  A non-nil WORKBUF is released: if it is
   Vcode_conversion_reused_workbuf, reused_workbuf_in_use is reset to
   0, otherwise the buffer is killed if it is still live.  Finally
   the buffer CURRENT is made current again.  */
6699 code_conversion_restore (arg
)
6702 Lisp_Object current
, workbuf
;
6703 struct gcpro gcpro1
;
6706 current
= XCAR (arg
);
6707 workbuf
= XCDR (arg
);
6708 if (! NILP (workbuf
))
6710 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
6711 reused_workbuf_in_use
= 0;
6712 else if (! NILP (Fbuffer_live_p (workbuf
)))
6713 Fkill_buffer (workbuf
);
6715 set_buffer_internal (XBUFFER (current
));
/* Arrange for the current buffer to be restored, and for a work
   buffer to be released, when the specpdl is unwound: a cons of the
   current buffer and WORKBUF is registered with
   code_conversion_restore.  WORKBUF starts as Qnil and may be
   replaced by make_conversion_work_buffer (MULTIBYTE) -- presumably
   only when WITH_WORK_BUF is non-zero, TODO confirm.  Callers assign
   the value of this function to CODING->dst_object or
   CODING->src_object.  */
6721 code_conversion_save (with_work_buf
, multibyte
)
6722 int with_work_buf
, multibyte
;
6724 Lisp_Object workbuf
= Qnil
;
6727 workbuf
= make_conversion_work_buffer (multibyte
);
6728 record_unwind_protect (code_conversion_restore
,
6729 Fcons (Fcurrent_buffer (), workbuf
));
/* Decode CHARS characters (BYTES bytes) of text that is addressed
   relative to the gap of the current buffer: CODING->src_pos and
   CODING->src_pos_byte are set to -CHARS and -BYTES.  The current
   buffer is both source and destination, and the destination
   position is point.

   detect_coding is run first when CODING_REQUIRE_DETECTION holds,
   and CODING_MODE_LAST_BLOCK is always set before decoding.  If the
   coding system has a post-read function, it is called at the
   destination position with the number of produced characters, and
   CODING->produced_char / CODING->produced are adjusted by the
   resulting change of Z / Z_BYTE.

   The specpdl is unwound to its depth on entry and CODING->result is
   returned.  */
6734 decode_coding_gap (coding
, chars
, bytes
)
6735 struct coding_system
*coding
;
6736 EMACS_INT chars
, bytes
;
6738 int count
= specpdl_ptr
- specpdl
;
6741 code_conversion_save (0, 0);
6743 coding
->src_object
= Fcurrent_buffer ();
6744 coding
->src_chars
= chars
;
6745 coding
->src_bytes
= bytes
;
6746 coding
->src_pos
= -chars
;
6747 coding
->src_pos_byte
= -bytes
;
6748 coding
->src_multibyte
= chars
< bytes
;
6749 coding
->dst_object
= coding
->src_object
;
6750 coding
->dst_pos
= PT
;
6751 coding
->dst_pos_byte
= PT_BYTE
;
6752 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6754 if (CODING_REQUIRE_DETECTION (coding
))
6755 detect_coding (coding
);
6757 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6758 decode_coding (coding
);
6760 attrs
= CODING_ID_ATTRS (coding
->id
);
6761 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6763 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6766 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6767 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6768 make_number (coding
->produced_char
));
6770 coding
->produced_char
+= Z
- prev_Z
;
6771 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6774 unbind_to (count
, Qnil
);
6775 return coding
->result
;
/* Encode CHARS characters (BYTES bytes) of text that is addressed
   relative to the gap of the current buffer: CODING->src_pos and
   CODING->src_pos_byte are set to -CHARS and -BYTES.  The current
   buffer is both source and destination, and the destination
   position is point.  CODING->dst_multibyte is not set here;
   encode_coding derives it from the destination buffer.

   The specpdl is unwound to its depth on entry and CODING->result is
   returned.  */
6779 encode_coding_gap (coding
, chars
, bytes
)
6780 struct coding_system
*coding
;
6781 EMACS_INT chars
, bytes
;
6783 int count
= specpdl_ptr
- specpdl
;
6785 code_conversion_save (0, 0);
6787 coding
->src_object
= Fcurrent_buffer ();
6788 coding
->src_chars
= chars
;
6789 coding
->src_bytes
= bytes
;
6790 coding
->src_pos
= -chars
;
6791 coding
->src_pos_byte
= -bytes
;
6792 coding
->src_multibyte
= chars
< bytes
;
6793 coding
->dst_object
= coding
->src_object
;
6794 coding
->dst_pos
= PT
;
6795 coding
->dst_pos_byte
= PT_BYTE
;
6797 encode_coding (coding
);
6799 unbind_to (count
, Qnil
);
6800 return coding
->result
;
6804 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6805 SRC_OBJECT into DST_OBJECT by coding context CODING.
6807 SRC_OBJECT is a buffer, a string, or Qnil.
6809 If it is a buffer, the text is at point of the buffer. FROM and TO
6810 are positions in the buffer.
6812 If it is a string, the text is at the beginning of the string.
6813 FROM and TO are indices to the string.
6815 If it is nil, the text is at coding->source. FROM and TO are
6816 indices to coding->source.
6818 DST_OBJECT is a buffer, Qt, or Qnil.
6820 If it is a buffer, the decoded text is inserted at point of the
6821 buffer. If the buffer is the same as SRC_OBJECT, the source text
6824 If it is Qt, a string is made from the decoded text, and
6825 set in CODING->dst_object.
6827 If it is Qnil, the decoded text is stored at CODING->destination.
6828 The caller must allocate CODING->dst_bytes bytes at
6829 CODING->destination by xmalloc. If the decoded text is longer than
6830 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
/* Implementation outline of decode_coding_object:
   1. Record the source description in CODING.  When source and
      destination are the same buffer, point is saved, the source
      range is deleted with del_range_both, and the source is then
      addressed with negative positions (-CHARS / -BYTES).
   2. Run detect_coding if CODING_REQUIRE_DETECTION holds.
   3. Choose the destination: a multibyte work buffer when DST_OBJECT
      is Qt, or when it is nil and the coding system has a post-read
      function; DST_OBJECT itself when it is a buffer (at that
      buffer's point); otherwise Qnil.
   4. Call decode_coding, then the post-read function (through
      safe_call1) if there is one, adjusting the produced counts by
      the change of Z / Z_BYTE.
   5. Deliver the result: Fbuffer_string for Qt; for a nil DST_OBJECT
      decoded through a work buffer, copy CODING->produced bytes from
      BEGV_ADDR to the destination area, which is xrealloc'ed first if
      DST_BYTES is smaller than CODING->produced.
   6. In the same-buffer case, put point back relative to the
      replaced text.
   NOTE(review): saved_pt and saved_pt_byte are plain int although
   they are compared with the EMACS_INT position FROM -- confirm that
   buffer positions always fit in int here.  */
6834 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6836 struct coding_system
*coding
;
6837 Lisp_Object src_object
;
6838 EMACS_INT from
, from_byte
, to
, to_byte
;
6839 Lisp_Object dst_object
;
6841 int count
= specpdl_ptr
- specpdl
;
6842 unsigned char *destination
;
6843 EMACS_INT dst_bytes
;
6844 EMACS_INT chars
= to
- from
;
6845 EMACS_INT bytes
= to_byte
- from_byte
;
6848 int saved_pt
= -1, saved_pt_byte
;
6850 buffer
= Fcurrent_buffer ();
6852 if (NILP (dst_object
))
6854 destination
= coding
->destination
;
6855 dst_bytes
= coding
->dst_bytes
;
6858 coding
->src_object
= src_object
;
6859 coding
->src_chars
= chars
;
6860 coding
->src_bytes
= bytes
;
6861 coding
->src_multibyte
= chars
< bytes
;
6863 if (STRINGP (src_object
))
6865 coding
->src_pos
= from
;
6866 coding
->src_pos_byte
= from_byte
;
6868 else if (BUFFERP (src_object
))
6870 set_buffer_internal (XBUFFER (src_object
));
6872 move_gap_both (from
, from_byte
);
6873 if (EQ (src_object
, dst_object
))
6875 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6876 TEMP_SET_PT_BOTH (from
, from_byte
);
6877 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6878 coding
->src_pos
= -chars
;
6879 coding
->src_pos_byte
= -bytes
;
6883 coding
->src_pos
= from
;
6884 coding
->src_pos_byte
= from_byte
;
6888 if (CODING_REQUIRE_DETECTION (coding
))
6889 detect_coding (coding
);
6890 attrs
= CODING_ID_ATTRS (coding
->id
);
6892 if (EQ (dst_object
, Qt
)
6893 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6894 && NILP (dst_object
)))
6896 coding
->dst_object
= code_conversion_save (1, 1);
6897 coding
->dst_pos
= BEG
;
6898 coding
->dst_pos_byte
= BEG_BYTE
;
6899 coding
->dst_multibyte
= 1;
6901 else if (BUFFERP (dst_object
))
6903 code_conversion_save (0, 0);
6904 coding
->dst_object
= dst_object
;
6905 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6906 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6907 coding
->dst_multibyte
6908 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6912 code_conversion_save (0, 0);
6913 coding
->dst_object
= Qnil
;
6914 coding
->dst_multibyte
= 1;
6917 decode_coding (coding
);
6919 if (BUFFERP (coding
->dst_object
))
6920 set_buffer_internal (XBUFFER (coding
->dst_object
));
6922 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6924 struct gcpro gcpro1
, gcpro2
;
6925 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6928 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6929 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6930 val
= safe_call1 (CODING_ATTR_POST_READ (attrs
),
6931 make_number (coding
->produced_char
));
6934 coding
->produced_char
+= Z
- prev_Z
;
6935 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6938 if (EQ (dst_object
, Qt
))
6940 coding
->dst_object
= Fbuffer_string ();
6942 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6944 set_buffer_internal (XBUFFER (coding
->dst_object
));
6945 if (dst_bytes
< coding
->produced
)
6948 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6951 record_conversion_result (coding
,
6952 CODING_RESULT_INSUFFICIENT_DST
);
6953 unbind_to (count
, Qnil
);
6956 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6957 move_gap_both (BEGV
, BEGV_BYTE
);
6958 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6959 coding
->destination
= destination
;
6965 /* This is the case of:
6966 (BUFFERP (src_object) && EQ (src_object, dst_object))
6967 As we have moved PT while replacing the original buffer
6968 contents, we must recover it now. */
6969 set_buffer_internal (XBUFFER (src_object
));
6970 if (saved_pt
< from
)
6971 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6972 else if (saved_pt
< from
+ chars
)
6973 TEMP_SET_PT_BOTH (from
, from_byte
);
6974 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6975 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6976 saved_pt_byte
+ (coding
->produced
- bytes
));
6978 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6979 saved_pt_byte
+ (coding
->produced
- bytes
));
6982 unbind_to (count
, coding
->dst_object
);
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   Implementation outline:
   1. If the coding system has a pre-write function, the source text
      is first copied into a work buffer (from a string, a buffer, or
      CODING->source), the function is called through safe_call with
      BEG and Z, and the whole of whatever buffer is current
      afterwards becomes the source.  If that is not the work buffer,
      it is killed at the end (kill_src_buffer).
   2. Otherwise the source is the string or buffer range itself.
      When source and destination are the same buffer, the range is
      removed with del_range_1 and the object it returns is used as
      the source, starting at position 0.
   3. Choose the destination: DST_OBJECT when it is a buffer (at FROM
      in the same-buffer case, otherwise at that buffer's point); for
      Qt, an xmalloc'ed area of CODING->src_chars bytes (at least 1)
      that is turned into a unibyte string and freed afterwards;
      otherwise Qnil.
   4. Call encode_coding.
   5. In the same-buffer case, put point back relative to the
      replaced text.
   NOTE(review): saved_pt and saved_pt_byte are plain int although
   they are compared with the EMACS_INT position FROM -- confirm that
   buffer positions always fit in int here.  */
6987 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6989 struct coding_system
*coding
;
6990 Lisp_Object src_object
;
6991 EMACS_INT from
, from_byte
, to
, to_byte
;
6992 Lisp_Object dst_object
;
6994 int count
= specpdl_ptr
- specpdl
;
6995 EMACS_INT chars
= to
- from
;
6996 EMACS_INT bytes
= to_byte
- from_byte
;
6999 int saved_pt
= -1, saved_pt_byte
;
7000 int kill_src_buffer
= 0;
7002 buffer
= Fcurrent_buffer ();
7004 coding
->src_object
= src_object
;
7005 coding
->src_chars
= chars
;
7006 coding
->src_bytes
= bytes
;
7007 coding
->src_multibyte
= chars
< bytes
;
7009 attrs
= CODING_ID_ATTRS (coding
->id
);
7011 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
7013 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
7014 set_buffer_internal (XBUFFER (coding
->src_object
));
7015 if (STRINGP (src_object
))
7016 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
7017 else if (BUFFERP (src_object
))
7018 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
7020 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
7022 if (EQ (src_object
, dst_object
))
7024 set_buffer_internal (XBUFFER (src_object
));
7025 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7026 del_range_both (from
, from_byte
, to
, to_byte
, 1);
7027 set_buffer_internal (XBUFFER (coding
->src_object
));
7031 Lisp_Object args
[3];
7033 args
[0] = CODING_ATTR_PRE_WRITE (attrs
);
7034 args
[1] = make_number (BEG
);
7035 args
[2] = make_number (Z
);
7036 safe_call (3, args
);
7038 if (XBUFFER (coding
->src_object
) != current_buffer
)
7039 kill_src_buffer
= 1;
7040 coding
->src_object
= Fcurrent_buffer ();
7042 move_gap_both (BEG
, BEG_BYTE
);
7043 coding
->src_chars
= Z
- BEG
;
7044 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
7045 coding
->src_pos
= BEG
;
7046 coding
->src_pos_byte
= BEG_BYTE
;
7047 coding
->src_multibyte
= Z
< Z_BYTE
;
7049 else if (STRINGP (src_object
))
7051 code_conversion_save (0, 0);
7052 coding
->src_pos
= from
;
7053 coding
->src_pos_byte
= from_byte
;
7055 else if (BUFFERP (src_object
))
7057 code_conversion_save (0, 0);
7058 set_buffer_internal (XBUFFER (src_object
));
7059 if (EQ (src_object
, dst_object
))
7061 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7062 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
7063 coding
->src_pos
= 0;
7064 coding
->src_pos_byte
= 0;
7068 if (from
< GPT
&& to
>= GPT
)
7069 move_gap_both (from
, from_byte
);
7070 coding
->src_pos
= from
;
7071 coding
->src_pos_byte
= from_byte
;
7075 code_conversion_save (0, 0);
7077 if (BUFFERP (dst_object
))
7079 coding
->dst_object
= dst_object
;
7080 if (EQ (src_object
, dst_object
))
7082 coding
->dst_pos
= from
;
7083 coding
->dst_pos_byte
= from_byte
;
7087 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
7088 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
7090 coding
->dst_multibyte
7091 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
7093 else if (EQ (dst_object
, Qt
))
7095 coding
->dst_object
= Qnil
;
7096 coding
->dst_bytes
= coding
->src_chars
;
7097 if (coding
->dst_bytes
== 0)
7098 coding
->dst_bytes
= 1;
7099 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
7100 coding
->dst_multibyte
= 0;
7104 coding
->dst_object
= Qnil
;
7105 coding
->dst_multibyte
= 0;
7108 encode_coding (coding
);
7110 if (EQ (dst_object
, Qt
))
7112 if (BUFFERP (coding
->dst_object
))
7113 coding
->dst_object
= Fbuffer_string ();
7117 = make_unibyte_string ((char *) coding
->destination
,
7119 xfree (coding
->destination
);
7125 /* This is the case of:
7126 (BUFFERP (src_object) && EQ (src_object, dst_object))
7127 As we have moved PT while replacing the original buffer
7128 contents, we must recover it now. */
7129 set_buffer_internal (XBUFFER (src_object
));
7130 if (saved_pt
< from
)
7131 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
7132 else if (saved_pt
< from
+ chars
)
7133 TEMP_SET_PT_BOTH (from
, from_byte
);
7134 else if (! NILP (current_buffer
->enable_multibyte_characters
))
7135 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
7136 saved_pt_byte
+ (coding
->produced
- bytes
));
7138 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
7139 saved_pt_byte
+ (coding
->produced
- bytes
));
7142 if (kill_src_buffer
)
7143 Fkill_buffer (coding
->src_object
);
7144 unbind_to (count
, Qnil
);
/* Return CODING_ID_NAME of the coding system registered for the
   category that comes first in coding_priorities.  */
7149 preferred_coding_system ()
7151 int id
= coding_categories
[coding_priorities
[0]].id
;
7153 return CODING_ID_NAME (id
);
7158 /*** 11. Emacs Lisp library functions ***/
/* Lisp primitive `coding-system-p'.  The tests visible below are
   CODING_SYSTEM_ID (obj) >= 0 and whether OBJ has a nil
   Qcoding_system_define_form property.  */
7160 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
7161 doc
: /* Return t if OBJECT is nil or a coding-system.
7162 See the documentation of `define-coding-system' for information
7163 about coding-system objects. */)
7168 || CODING_SYSTEM_ID (obj
) >= 0)
7171 || NILP (Fget (obj
, Qcoding_system_define_form
)))
/* Lisp primitive `read-non-nil-coding-system'.  The name is read with
   Fcompleting_read over Vcoding_system_alist, requiring a match and
   using Qcoding_system_history as the history; the trailing `while'
   repeats the prompt as long as the entered string is empty.  The
   string is then interned and returned as a symbol.  */
7176 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
7177 Sread_non_nil_coding_system
, 1, 1, 0,
7178 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7185 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
7186 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
7188 while (SCHARS (val
) == 0);
7189 return (Fintern (val
, Qnil
));
/* Lisp primitive `read-coding-system'.  A symbol given as
   DEFAULT-CODING-SYSTEM is replaced by its name string before it is
   passed to Fcompleting_read as the default.  Empty input yields
   Qnil; otherwise the entered string is interned.
   NOTE(review): the XSETSTRING / XPNTR round trip looks equivalent
   to assigning SYMBOL_NAME (default_coding_system) directly --
   confirm before simplifying.  */
7192 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
7193 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7194 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
7195 (prompt
, default_coding_system
)
7196 Lisp_Object prompt
, default_coding_system
;
7199 if (SYMBOLP (default_coding_system
))
7200 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
7201 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
7202 Qt
, Qnil
, Qcoding_system_history
,
7203 default_coding_system
, Qnil
);
7204 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
/* Lisp primitive `check-coding-system'.  If CODING-SYSTEM carries a
   non-nil Qcoding_system_define_form property, the property is
   cleared and the stored form is evaluated with safe_eval before the
   check.  CODING-SYSTEM is returned when Fcoding_system_p accepts
   it; otherwise Qcoding_system_error is signaled with CODING-SYSTEM
   as data.  */
7207 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
7209 doc
: /* Check validity of CODING-SYSTEM.
7210 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7211 It is valid if it is nil or a symbol defined as a coding system by the
7212 function `define-coding-system'. */)
7214 Lisp_Object coding_system
;
7216 Lisp_Object define_form
;
7218 define_form
= Fget (coding_system
, Qcoding_system_define_form
);
7219 if (! NILP (define_form
))
7221 Fput (coding_system
, Qcoding_system_define_form
, Qnil
);
7222 safe_eval (define_form
);
7224 if (!NILP (Fcoding_system_p (coding_system
)))
7225 return coding_system
;
7226 xsignal1 (Qcoding_system_error
, coding_system
);
7230 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7231 HIGHEST is nonzero, return the coding system of the highest
7232 priority among the detected coding systems. Otherwise return a
7233 list of detected coding systems sorted by their priorities. If
7234 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7235 multibyte form but contains only ASCII and eight-bit chars.
7236 Otherwise, the bytes are raw bytes.
7238 CODING-SYSTEM controls the detection as below:
7240 If it is nil, detect both text-format and eol-format. If the
7241 text-format part of CODING-SYSTEM is already specified
7242 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7243 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7244 detect only text-format. */
7247 detect_coding_system (src
, src_chars
, src_bytes
, highest
, multibytep
,
7249 const unsigned char *src
;
7250 int src_chars
, src_bytes
, highest
;
7252 Lisp_Object coding_system
;
7254 const unsigned char *src_end
= src
+ src_bytes
;
7255 Lisp_Object attrs
, eol_type
;
7257 struct coding_system coding
;
7259 struct coding_detection_info detect_info
;
7260 enum coding_category base_category
;
7262 if (NILP (coding_system
))
7263 coding_system
= Qundecided
;
7264 setup_coding_system (coding_system
, &coding
);
7265 attrs
= CODING_ID_ATTRS (coding
.id
);
7266 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
7267 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
7269 coding
.source
= src
;
7270 coding
.src_chars
= src_chars
;
7271 coding
.src_bytes
= src_bytes
;
7272 coding
.src_multibyte
= multibytep
;
7273 coding
.consumed
= 0;
7274 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7276 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
7278 /* At first, detect text-format if necessary. */
7279 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7280 if (base_category
== coding_category_undecided
)
7282 enum coding_category category
;
7283 struct coding_system
*this;
7286 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7287 for (i
= 0; src
< src_end
; i
++, src
++)
7293 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
7294 && ! inhibit_iso_escape_detection
)
7296 coding
.head_ascii
= src
- coding
.source
;
7297 if (detect_coding_iso_2022 (&coding
, &detect_info
))
7299 /* We have scanned the whole data. */
7300 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
7301 /* We didn't find an 8-bit code. */
7307 coding
.head_ascii
= src
- coding
.source
;
7310 || detect_info
.found
)
7313 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7314 for (i
= 0; i
< coding_category_raw_text
; i
++)
7316 category
= coding_priorities
[i
];
7317 this = coding_categories
+ category
;
7318 if (detect_info
.found
& (1 << category
))
7322 for (i
= 0; i
< coding_category_raw_text
; i
++)
7324 category
= coding_priorities
[i
];
7325 this = coding_categories
+ category
;
7329 /* No coding system of this category is defined. */
7330 detect_info
.rejected
|= (1 << category
);
7332 else if (category
>= coding_category_raw_text
)
7334 else if (detect_info
.checked
& (1 << category
))
7337 && (detect_info
.found
& (1 << category
)))
7342 if ((*(this->detector
)) (&coding
, &detect_info
)
7344 && (detect_info
.found
& (1 << category
)))
7346 if (category
== coding_category_utf_16_auto
)
7348 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
7349 category
= coding_category_utf_16_le
;
7351 category
= coding_category_utf_16_be
;
7359 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
7361 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
7362 id
= coding_categories
[coding_category_raw_text
].id
;
7363 val
= Fcons (make_number (id
), Qnil
);
7365 else if (! detect_info
.rejected
&& ! detect_info
.found
)
7367 detect_info
.found
= CATEGORY_MASK_ANY
;
7368 id
= coding_categories
[coding_category_undecided
].id
;
7369 val
= Fcons (make_number (id
), Qnil
);
7373 if (detect_info
.found
)
7375 detect_info
.found
= 1 << category
;
7376 val
= Fcons (make_number (this->id
), Qnil
);
7379 for (i
= 0; i
< coding_category_raw_text
; i
++)
7380 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
7382 detect_info
.found
= 1 << coding_priorities
[i
];
7383 id
= coding_categories
[coding_priorities
[i
]].id
;
7384 val
= Fcons (make_number (id
), Qnil
);
7390 int mask
= detect_info
.rejected
| detect_info
.found
;
7394 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
7396 category
= coding_priorities
[i
];
7397 if (! (mask
& (1 << category
)))
7399 found
|= 1 << category
;
7400 id
= coding_categories
[category
].id
;
7402 val
= Fcons (make_number (id
), val
);
7405 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
7407 category
= coding_priorities
[i
];
7408 if (detect_info
.found
& (1 << category
))
7410 id
= coding_categories
[category
].id
;
7411 val
= Fcons (make_number (id
), val
);
7414 detect_info
.found
|= found
;
7417 else if (base_category
== coding_category_utf_16_auto
)
7419 if (detect_coding_utf_16 (&coding
, &detect_info
))
7421 struct coding_system
*this;
7423 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
7424 this = coding_categories
+ coding_category_utf_16_le
;
7425 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
7426 this = coding_categories
+ coding_category_utf_16_be
;
7427 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
7428 this = coding_categories
+ coding_category_utf_16_be_nosig
;
7430 this = coding_categories
+ coding_category_utf_16_le_nosig
;
7431 val
= Fcons (make_number (this->id
), Qnil
);
7436 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
7437 val
= Fcons (make_number (coding
.id
), Qnil
);
7440 /* Then, detect eol-format if necessary. */
7442 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
7445 if (VECTORP (eol_type
))
7447 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
7448 normal_eol
= detect_eol (coding
.source
, src_bytes
,
7449 coding_category_raw_text
);
7450 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
7451 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
7452 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
7453 coding_category_utf_16_be
);
7454 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
7455 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
7456 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
7457 coding_category_utf_16_le
);
7461 if (EQ (eol_type
, Qunix
))
7462 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
7463 else if (EQ (eol_type
, Qdos
))
7464 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
7466 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
7469 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
7471 enum coding_category category
;
7474 id
= XINT (XCAR (tail
));
7475 attrs
= CODING_ID_ATTRS (id
);
7476 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7477 eol_type
= CODING_ID_EOL_TYPE (id
);
7478 if (VECTORP (eol_type
))
7480 if (category
== coding_category_utf_16_be
7481 || category
== coding_category_utf_16_be_nosig
)
7482 this_eol
= utf_16_be_eol
;
7483 else if (category
== coding_category_utf_16_le
7484 || category
== coding_category_utf_16_le_nosig
)
7485 this_eol
= utf_16_le_eol
;
7487 this_eol
= normal_eol
;
7489 if (this_eol
== EOL_SEEN_LF
)
7490 XSETCAR (tail
, AREF (eol_type
, 0));
7491 else if (this_eol
== EOL_SEEN_CRLF
)
7492 XSETCAR (tail
, AREF (eol_type
, 1));
7493 else if (this_eol
== EOL_SEEN_CR
)
7494 XSETCAR (tail
, AREF (eol_type
, 2));
7496 XSETCAR (tail
, CODING_ID_NAME (id
));
7499 XSETCAR (tail
, CODING_ID_NAME (id
));
7503 return (highest
? XCAR (val
) : val
);
7507 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
7509 doc
: /* Detect coding system of the text in the region between START and END.
7510 Return a list of possible coding systems ordered by priority.
7512 If only ASCII characters are found (except for such ISO-2022 control
7513 characters ISO-2022 as ESC), it returns a list of single element
7514 `undecided' or its subsidiary coding system according to a detected
7517 If optional argument HIGHEST is non-nil, return the coding system of
7518 highest priority. */)
7519 (start
, end
, highest
)
7520 Lisp_Object start
, end
, highest
;
7523 int from_byte
, to_byte
;
7525 CHECK_NUMBER_COERCE_MARKER (start
);
7526 CHECK_NUMBER_COERCE_MARKER (end
);
7528 validate_region (&start
, &end
);
7529 from
= XINT (start
), to
= XINT (end
);
7530 from_byte
= CHAR_TO_BYTE (from
);
7531 to_byte
= CHAR_TO_BYTE (to
);
7533 if (from
< GPT
&& to
>= GPT
)
7534 move_gap_both (to
, to_byte
);
7536 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
7537 to
- from
, to_byte
- from_byte
,
7539 !NILP (current_buffer
7540 ->enable_multibyte_characters
),
7544 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
7546 doc
: /* Detect coding system of the text in STRING.
7547 Return a list of possible coding systems ordered by priority.
7549 If only ASCII characters are found (except for such ISO-2022 control
7550 characters ISO-2022 as ESC), it returns a list of single element
7551 `undecided' or its subsidiary coding system according to a detected
7554 If optional argument HIGHEST is non-nil, return the coding system of
7555 highest priority. */)
7557 Lisp_Object string
, highest
;
7559 CHECK_STRING (string
);
7561 return detect_coding_system (SDATA (string
),
7562 SCHARS (string
), SBYTES (string
),
7563 !NILP (highest
), STRING_MULTIBYTE (string
),
7569 char_encodable_p (c
, attrs
)
7574 struct charset
*charset
;
7575 Lisp_Object translation_table
;
7577 translation_table
= CODING_ATTR_TRANS_TBL (attrs
);
7578 if (! NILP (translation_table
))
7579 c
= translate_char (translation_table
, c
);
7580 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7581 CONSP (tail
); tail
= XCDR (tail
))
7583 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7584 if (CHAR_CHARSET_P (c
, charset
))
7587 return (! NILP (tail
));
7591 /* Return a list of coding systems that safely encode the text between
7592 START and END. If EXCLUDE is non-nil, it is a list of coding
7593 systems not to check. The returned list doesn't contain any such
7594 coding systems. In any case, if the text contains only ASCII or is
7595 unibyte, return t. */
7597 DEFUN ("find-coding-systems-region-internal",
7598 Ffind_coding_systems_region_internal
,
7599 Sfind_coding_systems_region_internal
, 2, 3, 0,
7600 doc
: /* Internal use only. */)
7601 (start
, end
, exclude
)
7602 Lisp_Object start
, end
, exclude
;
7604 Lisp_Object coding_attrs_list
, safe_codings
;
7605 EMACS_INT start_byte
, end_byte
;
7606 const unsigned char *p
, *pbeg
, *pend
;
7608 Lisp_Object tail
, elt
;
7610 if (STRINGP (start
))
7612 if (!STRING_MULTIBYTE (start
)
7613 || SCHARS (start
) == SBYTES (start
))
7616 end_byte
= SBYTES (start
);
7620 CHECK_NUMBER_COERCE_MARKER (start
);
7621 CHECK_NUMBER_COERCE_MARKER (end
);
7622 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7623 args_out_of_range (start
, end
);
7624 if (NILP (current_buffer
->enable_multibyte_characters
))
7626 start_byte
= CHAR_TO_BYTE (XINT (start
));
7627 end_byte
= CHAR_TO_BYTE (XINT (end
));
7628 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7631 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7633 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7634 move_gap_both (XINT (start
), start_byte
);
7636 move_gap_both (XINT (end
), end_byte
);
7640 coding_attrs_list
= Qnil
;
7641 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7643 || NILP (Fmemq (XCAR (tail
), exclude
)))
7647 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7648 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7649 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7651 ASET (attrs
, coding_attr_trans_tbl
,
7652 get_translation_table (attrs
, 1, NULL
));
7653 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7657 if (STRINGP (start
))
7658 p
= pbeg
= SDATA (start
);
7660 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7661 pend
= p
+ (end_byte
- start_byte
);
7663 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7664 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7668 if (ASCII_BYTE_P (*p
))
7672 c
= STRING_CHAR_ADVANCE (p
);
7674 charset_map_loaded
= 0;
7675 for (tail
= coding_attrs_list
; CONSP (tail
);)
7680 else if (char_encodable_p (c
, elt
))
7682 else if (CONSP (XCDR (tail
)))
7684 XSETCAR (tail
, XCAR (XCDR (tail
)));
7685 XSETCDR (tail
, XCDR (XCDR (tail
)));
7689 XSETCAR (tail
, Qnil
);
7693 if (charset_map_loaded
)
7695 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7697 if (STRINGP (start
))
7698 pbeg
= SDATA (start
);
7700 pbeg
= BYTE_POS_ADDR (start_byte
);
7701 p
= pbeg
+ p_offset
;
7702 pend
= pbeg
+ pend_offset
;
7707 safe_codings
= list2 (Qraw_text
, Qno_conversion
);
7708 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7709 if (! NILP (XCAR (tail
)))
7710 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7712 return safe_codings
;
7716 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7717 Sunencodable_char_position
, 3, 5, 0,
7719 Return position of first un-encodable character in a region.
7720 START and END specfiy the region and CODING-SYSTEM specifies the
7721 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7723 If optional 4th argument COUNT is non-nil, it specifies at most how
7724 many un-encodable characters to search. In this case, the value is a
7727 If optional 5th argument STRING is non-nil, it is a string to search
7728 for un-encodable characters. In that case, START and END are indexes
7730 (start
, end
, coding_system
, count
, string
)
7731 Lisp_Object start
, end
, coding_system
, count
, string
;
7734 struct coding_system coding
;
7735 Lisp_Object attrs
, charset_list
, translation_table
;
7736 Lisp_Object positions
;
7738 const unsigned char *p
, *stop
, *pend
;
7739 int ascii_compatible
;
7741 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7742 attrs
= CODING_ID_ATTRS (coding
.id
);
7743 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7745 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7746 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7747 translation_table
= get_translation_table (attrs
, 1, NULL
);
7751 validate_region (&start
, &end
);
7752 from
= XINT (start
);
7754 if (NILP (current_buffer
->enable_multibyte_characters
)
7755 || (ascii_compatible
7756 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7758 p
= CHAR_POS_ADDR (from
);
7759 pend
= CHAR_POS_ADDR (to
);
7760 if (from
< GPT
&& to
>= GPT
)
7767 CHECK_STRING (string
);
7768 CHECK_NATNUM (start
);
7770 from
= XINT (start
);
7773 || to
> SCHARS (string
))
7774 args_out_of_range_3 (string
, start
, end
);
7775 if (! STRING_MULTIBYTE (string
))
7777 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7778 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7779 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7787 CHECK_NATNUM (count
);
7796 if (ascii_compatible
)
7797 while (p
< stop
&& ASCII_BYTE_P (*p
))
7807 c
= STRING_CHAR_ADVANCE (p
);
7808 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7809 && ! char_charset (translate_char (translation_table
, c
),
7810 charset_list
, NULL
))
7812 positions
= Fcons (make_number (from
), positions
);
7821 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
7825 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7826 Scheck_coding_systems_region
, 3, 3, 0,
7827 doc
: /* Check if the region is encodable by coding systems.
7829 START and END are buffer positions specifying the region.
7830 CODING-SYSTEM-LIST is a list of coding systems to check.
7832 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7833 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7834 whole region, POS0, POS1, ... are buffer positions where non-encodable
7835 characters are found.
7837 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7840 START may be a string. In that case, check if the string is
7841 encodable, and the value contains indices to the string instead of
7842 buffer positions. END is ignored. */)
7843 (start
, end
, coding_system_list
)
7844 Lisp_Object start
, end
, coding_system_list
;
7847 EMACS_INT start_byte
, end_byte
;
7849 const unsigned char *p
, *pbeg
, *pend
;
7851 Lisp_Object tail
, elt
, attrs
;
7853 if (STRINGP (start
))
7855 if (!STRING_MULTIBYTE (start
)
7856 && SCHARS (start
) != SBYTES (start
))
7859 end_byte
= SBYTES (start
);
7864 CHECK_NUMBER_COERCE_MARKER (start
);
7865 CHECK_NUMBER_COERCE_MARKER (end
);
7866 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7867 args_out_of_range (start
, end
);
7868 if (NILP (current_buffer
->enable_multibyte_characters
))
7870 start_byte
= CHAR_TO_BYTE (XINT (start
));
7871 end_byte
= CHAR_TO_BYTE (XINT (end
));
7872 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7875 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7877 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7878 move_gap_both (XINT (start
), start_byte
);
7880 move_gap_both (XINT (end
), end_byte
);
7886 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7889 attrs
= AREF (CODING_SYSTEM_SPEC (elt
), 0);
7890 ASET (attrs
, coding_attr_trans_tbl
,
7891 get_translation_table (attrs
, 1, NULL
));
7892 list
= Fcons (Fcons (elt
, Fcons (attrs
, Qnil
)), list
);
7895 if (STRINGP (start
))
7896 p
= pbeg
= SDATA (start
);
7898 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7899 pend
= p
+ (end_byte
- start_byte
);
7901 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7902 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7906 if (ASCII_BYTE_P (*p
))
7910 c
= STRING_CHAR_ADVANCE (p
);
7912 charset_map_loaded
= 0;
7913 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7915 elt
= XCDR (XCAR (tail
));
7916 if (! char_encodable_p (c
, XCAR (elt
)))
7917 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7919 if (charset_map_loaded
)
7921 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7923 if (STRINGP (start
))
7924 pbeg
= SDATA (start
);
7926 pbeg
= BYTE_POS_ADDR (start_byte
);
7927 p
= pbeg
+ p_offset
;
7928 pend
= pbeg
+ pend_offset
;
7936 for (; CONSP (tail
); tail
= XCDR (tail
))
7939 if (CONSP (XCDR (XCDR (elt
))))
7940 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7949 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7950 Lisp_Object start
, end
, coding_system
, dst_object
;
7951 int encodep
, norecord
;
7953 struct coding_system coding
;
7954 EMACS_INT from
, from_byte
, to
, to_byte
;
7955 Lisp_Object src_object
;
7957 CHECK_NUMBER_COERCE_MARKER (start
);
7958 CHECK_NUMBER_COERCE_MARKER (end
);
7959 if (NILP (coding_system
))
7960 coding_system
= Qno_conversion
;
7962 CHECK_CODING_SYSTEM (coding_system
);
7963 src_object
= Fcurrent_buffer ();
7964 if (NILP (dst_object
))
7965 dst_object
= src_object
;
7966 else if (! EQ (dst_object
, Qt
))
7967 CHECK_BUFFER (dst_object
);
7969 validate_region (&start
, &end
);
7970 from
= XFASTINT (start
);
7971 from_byte
= CHAR_TO_BYTE (from
);
7972 to
= XFASTINT (end
);
7973 to_byte
= CHAR_TO_BYTE (to
);
7975 setup_coding_system (coding_system
, &coding
);
7976 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7979 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7982 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7985 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7987 return (BUFFERP (dst_object
)
7988 ? make_number (coding
.produced_char
)
7989 : coding
.dst_object
);
7993 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7994 3, 4, "r\nzCoding system: ",
7995 doc
: /* Decode the current region from the specified coding system.
7996 When called from a program, takes four arguments:
7997 START, END, CODING-SYSTEM, and DESTINATION.
7998 START and END are buffer positions.
8000 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8001 If nil, the region between START and END is replace by the decoded text.
8002 If buffer, the decoded text is inserted in the buffer.
8003 If t, the decoded text is returned.
8005 This function sets `last-coding-system-used' to the precise coding system
8006 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8007 not fully specified.)
8008 It returns the length of the decoded text. */)
8009 (start
, end
, coding_system
, destination
)
8010 Lisp_Object start
, end
, coding_system
, destination
;
8012 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
8015 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
8016 3, 4, "r\nzCoding system: ",
8017 doc
: /* Encode the current region by specified coding system.
8018 When called from a program, takes three arguments:
8019 START, END, and CODING-SYSTEM. START and END are buffer positions.
8021 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8022 If nil, the region between START and END is replace by the encoded text.
8023 If buffer, the encoded text is inserted in the buffer.
8024 If t, the encoded text is returned.
8026 This function sets `last-coding-system-used' to the precise coding system
8027 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8028 not fully specified.)
8029 It returns the length of the encoded text. */)
8030 (start
, end
, coding_system
, destination
)
8031 Lisp_Object start
, end
, coding_system
, destination
;
8033 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
8037 code_convert_string (string
, coding_system
, dst_object
,
8038 encodep
, nocopy
, norecord
)
8039 Lisp_Object string
, coding_system
, dst_object
;
8040 int encodep
, nocopy
, norecord
;
8042 struct coding_system coding
;
8043 EMACS_INT chars
, bytes
;
8045 CHECK_STRING (string
);
8046 if (NILP (coding_system
))
8049 Vlast_coding_system_used
= Qno_conversion
;
8050 if (NILP (dst_object
))
8051 return (nocopy
? Fcopy_sequence (string
) : string
);
8054 if (NILP (coding_system
))
8055 coding_system
= Qno_conversion
;
8057 CHECK_CODING_SYSTEM (coding_system
);
8058 if (NILP (dst_object
))
8060 else if (! EQ (dst_object
, Qt
))
8061 CHECK_BUFFER (dst_object
);
8063 setup_coding_system (coding_system
, &coding
);
8064 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8065 chars
= SCHARS (string
);
8066 bytes
= SBYTES (string
);
8068 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
8070 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
8072 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
8074 return (BUFFERP (dst_object
)
8075 ? make_number (coding
.produced_char
)
8076 : coding
.dst_object
);
8080 /* Encode or decode STRING according to CODING_SYSTEM.
8081 Do not set Vlast_coding_system_used.
8083 This function is called only from macros DECODE_FILE and
8084 ENCODE_FILE, thus we ignore character composition. */
8087 code_convert_string_norecord (string
, coding_system
, encodep
)
8088 Lisp_Object string
, coding_system
;
8091 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
8095 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
8097 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8099 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8100 if the decoding operation is trivial.
8102 Optional fourth arg BUFFER non-nil meant that the decoded text is
8103 inserted in BUFFER instead of returned as a string. In this case,
8104 the return value is BUFFER.
8106 This function sets `last-coding-system-used' to the precise coding system
8107 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8108 not fully specified. */)
8109 (string
, coding_system
, nocopy
, buffer
)
8110 Lisp_Object string
, coding_system
, nocopy
, buffer
;
8112 return code_convert_string (string
, coding_system
, buffer
,
8113 0, ! NILP (nocopy
), 0);
8116 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
8118 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
8120 Optional third arg NOCOPY non-nil means it is OK to return STRING
8121 itself if the encoding operation is trivial.
8123 Optional fourth arg BUFFER non-nil meant that the encoded text is
8124 inserted in BUFFER instead of returned as a string. In this case,
8125 the return value is BUFFER.
8127 This function sets `last-coding-system-used' to the precise coding system
8128 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8129 not fully specified.) */)
8130 (string
, coding_system
, nocopy
, buffer
)
8131 Lisp_Object string
, coding_system
, nocopy
, buffer
;
8133 return code_convert_string (string
, coding_system
, buffer
,
8134 1, ! NILP (nocopy
), 1);
8138 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
8139 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
8140 Return the corresponding character. */)
8144 Lisp_Object spec
, attrs
, val
;
8145 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
8148 CHECK_NATNUM (code
);
8149 c
= XFASTINT (code
);
8150 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
8151 attrs
= AREF (spec
, 0);
8153 if (ASCII_BYTE_P (c
)
8154 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
8157 val
= CODING_ATTR_CHARSET_LIST (attrs
);
8158 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
8159 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
8160 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
8163 charset
= charset_roman
;
8164 else if (c
>= 0xA0 && c
< 0xDF)
8166 charset
= charset_kana
;
8171 int s1
= c
>> 8, s2
= c
& 0xFF;
8173 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
8174 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
8175 error ("Invalid code: %d", code
);
8177 charset
= charset_kanji
;
8179 c
= DECODE_CHAR (charset
, c
);
8181 error ("Invalid code: %d", code
);
8182 return make_number (c
);
8186 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
8187 doc
: /* Encode a Japanese character CH to shift_jis encoding.
8188 Return the corresponding code in SJIS. */)
8192 Lisp_Object spec
, attrs
, charset_list
;
8194 struct charset
*charset
;
8197 CHECK_CHARACTER (ch
);
8199 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
8200 attrs
= AREF (spec
, 0);
8202 if (ASCII_CHAR_P (c
)
8203 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
8206 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
8207 charset
= char_charset (c
, charset_list
, &code
);
8208 if (code
== CHARSET_INVALID_CODE (charset
))
8209 error ("Can't encode by shift_jis encoding: %d", c
);
8212 return make_number (code
);
8215 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
8216 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
8217 Return the corresponding character. */)
8221 Lisp_Object spec
, attrs
, val
;
8222 struct charset
*charset_roman
, *charset_big5
, *charset
;
8225 CHECK_NATNUM (code
);
8226 c
= XFASTINT (code
);
8227 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
8228 attrs
= AREF (spec
, 0);
8230 if (ASCII_BYTE_P (c
)
8231 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
8234 val
= CODING_ATTR_CHARSET_LIST (attrs
);
8235 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
8236 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
8239 charset
= charset_roman
;
8242 int b1
= c
>> 8, b2
= c
& 0x7F;
8243 if (b1
< 0xA1 || b1
> 0xFE
8244 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
8245 error ("Invalid code: %d", code
);
8246 charset
= charset_big5
;
8248 c
= DECODE_CHAR (charset
, (unsigned )c
);
8250 error ("Invalid code: %d", code
);
8251 return make_number (c
);
8254 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
8255 doc
: /* Encode the Big5 character CH to BIG5 coding system.
8256 Return the corresponding character code in Big5. */)
8260 Lisp_Object spec
, attrs
, charset_list
;
8261 struct charset
*charset
;
8265 CHECK_CHARACTER (ch
);
8267 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
8268 attrs
= AREF (spec
, 0);
8269 if (ASCII_CHAR_P (c
)
8270 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
8273 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
8274 charset
= char_charset (c
, charset_list
, &code
);
8275 if (code
== CHARSET_INVALID_CODE (charset
))
8276 error ("Can't encode by Big5 encoding: %d", c
);
8278 return make_number (code
);
8282 DEFUN ("set-terminal-coding-system-internal",
8283 Fset_terminal_coding_system_internal
,
8284 Sset_terminal_coding_system_internal
, 1, 1, 0,
8285 doc
: /* Internal use only. */)
8287 Lisp_Object coding_system
;
8289 CHECK_SYMBOL (coding_system
);
8290 setup_coding_system (Fcheck_coding_system (coding_system
),
8293 /* We had better not send unsafe characters to terminal. */
8294 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
8295 /* Characer composition should be disabled. */
8296 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
8297 terminal_coding
.src_multibyte
= 1;
8298 terminal_coding
.dst_multibyte
= 0;
8302 DEFUN ("set-safe-terminal-coding-system-internal",
8303 Fset_safe_terminal_coding_system_internal
,
8304 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
8305 doc
: /* Internal use only. */)
8307 Lisp_Object coding_system
;
8309 CHECK_SYMBOL (coding_system
);
8310 setup_coding_system (Fcheck_coding_system (coding_system
),
8311 &safe_terminal_coding
);
8312 /* Characer composition should be disabled. */
8313 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
8314 safe_terminal_coding
.src_multibyte
= 1;
8315 safe_terminal_coding
.dst_multibyte
= 0;
8319 DEFUN ("terminal-coding-system",
8320 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
8321 doc
: /* Return coding system specified for terminal output. */)
8324 Lisp_Object coding_system
;
8326 coding_system
= CODING_ID_NAME (terminal_coding
.id
);
8327 /* For backward compatibility, return nil if it is `undecided'. */
8328 return (! EQ (coding_system
, Qundecided
) ? coding_system
: Qnil
);
8331 DEFUN ("set-keyboard-coding-system-internal",
8332 Fset_keyboard_coding_system_internal
,
8333 Sset_keyboard_coding_system_internal
, 1, 1, 0,
8334 doc
: /* Internal use only. */)
8336 Lisp_Object coding_system
;
8338 CHECK_SYMBOL (coding_system
);
8339 setup_coding_system (Fcheck_coding_system (coding_system
),
8341 /* Characer composition should be disabled. */
8342 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
8346 DEFUN ("keyboard-coding-system",
8347 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
8348 doc
: /* Return coding system specified for decoding keyboard input. */)
8351 return CODING_ID_NAME (keyboard_coding
.id
);
8355 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
8356 Sfind_operation_coding_system
, 1, MANY
, 0,
8357 doc
: /* Choose a coding system for an operation based on the target name.
8358 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8359 DECODING-SYSTEM is the coding system to use for decoding
8360 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8361 for encoding (in case OPERATION does encoding).
8363 The first argument OPERATION specifies an I/O primitive:
8364 For file I/O, `insert-file-contents' or `write-region'.
8365 For process I/O, `call-process', `call-process-region', or `start-process'.
8366 For network I/O, `open-network-stream'.
8368 The remaining arguments should be the same arguments that were passed
8369 to the primitive. Depending on which primitive, one of those arguments
8370 is selected as the TARGET. For example, if OPERATION does file I/O,
8371 whichever argument specifies the file name is TARGET.
8373 TARGET has a meaning which depends on OPERATION:
8374 For file I/O, TARGET is a file name (except for the special case below).
8375 For process I/O, TARGET is a process name.
8376 For network I/O, TARGET is a service name or a port number
8378 This function looks up what specified for TARGET in,
8379 `file-coding-system-alist', `process-coding-system-alist',
8380 or `network-coding-system-alist' depending on OPERATION.
8381 They may specify a coding system, a cons of coding systems,
8382 or a function symbol to call.
8383 In the last case, we call the function with one argument,
8384 which is a list of all the arguments given to this function.
8386 If OPERATION is `insert-file-contents', the argument corresponding to
8387 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8388 file name to look up, and BUFFER is a buffer that contains the file's
8389 contents (not yet decoded). If `file-coding-system-alist' specifies a
8390 function to call for FILENAME, that function should examine the
8391 contents of BUFFER instead of reading the file.
8393 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
8398 Lisp_Object operation
, target_idx
, target
, val
;
8399 register Lisp_Object chain
;
8402 error ("Too few arguments");
8403 operation
= args
[0];
8404 if (!SYMBOLP (operation
)
8405 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
8406 error ("Invalid first arguement");
8407 if (nargs
< 1 + XINT (target_idx
))
8408 error ("Too few arguments for operation: %s",
8409 SDATA (SYMBOL_NAME (operation
)));
8410 target
= args
[XINT (target_idx
) + 1];
8411 if (!(STRINGP (target
)
8412 || (EQ (operation
, Qinsert_file_contents
) && CONSP (target
)
8413 && STRINGP (XCAR (target
)) && BUFFERP (XCDR (target
)))
8414 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
8415 error ("Invalid %dth argument", XINT (target_idx
) + 1);
8417 target
= XCAR (target
);
8419 chain
= ((EQ (operation
, Qinsert_file_contents
)
8420 || EQ (operation
, Qwrite_region
))
8421 ? Vfile_coding_system_alist
8422 : (EQ (operation
, Qopen_network_stream
)
8423 ? Vnetwork_coding_system_alist
8424 : Vprocess_coding_system_alist
));
8428 for (; CONSP (chain
); chain
= XCDR (chain
))
8434 && ((STRINGP (target
)
8435 && STRINGP (XCAR (elt
))
8436 && fast_string_match (XCAR (elt
), target
) >= 0)
8437 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
8440 /* Here, if VAL is both a valid coding system and a valid
8441 function symbol, we return VAL as a coding system. */
8444 if (! SYMBOLP (val
))
8446 if (! NILP (Fcoding_system_p (val
)))
8447 return Fcons (val
, val
);
8448 if (! NILP (Ffboundp (val
)))
8450 /* We use call1 rather than safe_call1
8451 so as to get bug reports about functions called here
8452 which don't handle the current interface. */
8453 val
= call1 (val
, Flist (nargs
, args
));
8456 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
8457 return Fcons (val
, val
);
/* Lisp primitive `set-coding-system-priority'.  Each argument, in
   order, claims the next-highest priority slot for its coding
   category; the slots left over are filled from the previous
   priority order, and `coding-category-list' is rebuilt to match.  */
8465 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
8466 Sset_coding_system_priority
, 0, MANY
, 0,
8467 doc
: /* Assign higher priority to the coding systems given as arguments.
8468 If multiple coding systems belong to the same category,
8469 all but the first one are ignored.
8471 usage: (set-coding-system-priority ...) */)
/* changed[C] becomes nonzero once category C has been stored in
   priorities[].  */
8477 int changed
[coding_category_max
];
8478 enum coding_category priorities
[coding_category_max
];
/* Start with no category marked as re-prioritized.  */
8480 bzero (changed
, sizeof changed
);
8482 for (i
= j
= 0; i
< nargs
; i
++)
8484 enum coding_category category
;
8485 Lisp_Object spec
, attrs
;
8487 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
8488 attrs
= AREF (spec
, 0);
8489 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8490 if (changed
[category
])
8491 /* Ignore this coding system because a coding system of the
8492 same category already had a higher priority. */
8494 changed
[category
] = 1;
8495 priorities
[j
++] = category
;
/* When the category already has a coding system set up and it is a
   different one, set the category up again for this argument.  */
8496 if (coding_categories
[category
].id
>= 0
8497 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
8498 setup_coding_system (args
[i
], &coding_categories
[category
]);
/* Set the category's symbol, taken from Vcoding_category_table, to
   this coding system.  */
8499 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
8502 /* Now we have decided top J priorities. Reflect the order of the
8503 original priorities to the remaining priorities. */
8505 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
8507 while (j
< coding_category_max
8508 && changed
[coding_priorities
[j
]])
8510 if (j
== coding_category_max
)
8512 priorities
[i
] = coding_priorities
[j
];
/* Install the new order as the global priority table.  */
8515 bcopy (priorities
, coding_priorities
, sizeof priorities
);
8517 /* Update `coding-category-list'. */
8518 Vcoding_category_list
= Qnil
;
/* Cons from the lowest priority upward so that the finished list
   begins with the highest-priority category.  */
8519 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8520 Vcoding_category_list
8521 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
8522 Vcoding_category_list
);
/* Lisp primitive `coding-system-priority-list'.  Walks
   coding_priorities[] from the highest priority down and collects the
   base name of the coding system attached to each category.  */
8527 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
8528 Scoding_system_priority_list
, 0, 1, 0,
8529 doc
: /* Return a list of coding systems ordered by their priorities.
8530 HIGHESTP non-nil means just return the highest priority one. */)
8532 Lisp_Object highestp
;
8537 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
8539 enum coding_category category
= coding_priorities
[i
];
8540 int id
= coding_categories
[category
].id
;
8545 attrs
= CODING_ID_ATTRS (id
);
/* With HIGHESTP, the first coding system reached is the answer.  */
8546 if (! NILP (highestp
))
8547 return CODING_ATTR_BASE_NAME (attrs
);
8548 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
/* VAL was built by consing onto the front; reverse it so it runs
   from highest to lowest priority.  */
8550 return Fnreverse (val
);
8553 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
8556 make_subsidiaries (base
)
8559 Lisp_Object subsidiaries
;
8560 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
8561 char *buf
= (char *) alloca (base_name_len
+ 6);
8564 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
8565 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
8566 for (i
= 0; i
< 3; i
++)
8568 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
8569 ASET (subsidiaries
, i
, intern (buf
));
8571 return subsidiaries
;
/* Lisp primitive `define-coding-system-internal'.  Builds the
   attribute vector of a new coding system from ARGS (indexed by the
   coding_arg_* constants), validates the arguments specific to the
   coding type, determines the coding category, and registers the
   coding system -- plus three subsidiaries when the eol-type is nil --
   in Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  */
8575 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
8576 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
8577 doc
: /* For internal use only.
8578 usage: (define-coding-system-internal ...) */)
8584 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
8585 Lisp_Object attrs
; /* Vector of attributes. */
8586 Lisp_Object eol_type
;
8587 Lisp_Object aliases
;
8588 Lisp_Object coding_type
, charset_list
, safe_charsets
;
8589 enum coding_category category
;
8590 Lisp_Object tail
, val
;
8591 int max_charset_id
= 0;
8594 if (nargs
< coding_arg_max
)
8597 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
8599 name
= args
[coding_arg_name
];
8600 CHECK_SYMBOL (name
);
8601 CODING_ATTR_BASE_NAME (attrs
) = name
;
/* The mnemonic is accepted as a string; anything else must be a
   character.  */
8603 val
= args
[coding_arg_mnemonic
];
8604 if (! STRINGP (val
))
8605 CHECK_CHARACTER (val
);
8606 CODING_ATTR_MNEMONIC (attrs
) = val
;
8608 coding_type
= args
[coding_arg_coding_type
];
8609 CHECK_SYMBOL (coding_type
);
8610 CODING_ATTR_TYPE (attrs
) = coding_type
;
/* The charset list may be given as the symbol iso-2022 or emacs-mule,
   which selects the corresponding predefined list and is accepted
   only for a coding type of the same name.  */
8612 charset_list
= args
[coding_arg_charset_list
];
8613 if (SYMBOLP (charset_list
))
8615 if (EQ (charset_list
, Qiso_2022
))
8617 if (! EQ (coding_type
, Qiso_2022
))
8618 error ("Invalid charset-list");
8619 charset_list
= Viso_2022_charset_list
;
8621 else if (EQ (charset_list
, Qemacs_mule
))
8623 if (! EQ (coding_type
, Qemacs_mule
))
8624 error ("Invalid charset-list");
8625 charset_list
= Vemacs_mule_charset_list
;
8627 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8628 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8629 max_charset_id
= XFASTINT (XCAR (tail
));
/* Work on a copy: each element is replaced below by the numeric
   charset ID.  */
8633 charset_list
= Fcopy_sequence (charset_list
);
8634 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8636 struct charset
*charset
;
8639 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8640 if (EQ (coding_type
, Qiso_2022
)
8641 ? CHARSET_ISO_FINAL (charset
) < 0
8642 : EQ (coding_type
, Qemacs_mule
)
8643 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8645 error ("Can't handle charset `%s'",
8646 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8648 XSETCAR (tail
, make_number (charset
->id
));
8649 if (max_charset_id
< charset
->id
)
8650 max_charset_id
= charset
->id
;
8653 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
/* SAFE_CHARSETS is a string with one byte per charset ID up to the
   largest ID seen; the byte of every charset in the list is set
   to 0.  */
8655 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8657 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8658 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8659 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8661 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8663 val
= args
[coding_arg_decode_translation_table
];
8664 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
8666 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8668 val
= args
[coding_arg_encode_translation_table
];
8669 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
8671 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8673 val
= args
[coding_arg_post_read_conversion
];
8675 CODING_ATTR_POST_READ (attrs
) = val
;
8677 val
= args
[coding_arg_pre_write_conversion
];
8679 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8681 val
= args
[coding_arg_default_char
];
8683 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8686 CHECK_CHARACTER (val
);
8687 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
/* Normalize the for-unibyte flag to nil or t.  */
8690 val
= args
[coding_arg_for_unibyte
];
8691 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8693 val
= args
[coding_arg_plist
];
8695 CODING_ATTR_PLIST (attrs
) = val
;
8697 if (EQ (coding_type
, Qcharset
))
8699 /* Generate a lisp vector of 256 elements. Each element is nil,
8700 integer, or a list of charset IDs.
8702 If Nth element is nil, the byte code N is invalid in this
8705 If Nth element is a number NUM, N is the first byte of a
8706 charset whose ID is NUM.
8708 If Nth element is a list of charset IDs, N is the first byte
8709 of one of them. The list is sorted by dimensions of the
8710 charsets. A charset of smaller dimension comes first. */
8711 val
= Fmake_vector (make_number (256), Qnil
);
8713 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8715 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8716 int dim
= CHARSET_DIMENSION (charset
);
8717 int idx
= (dim
- 1) * 4;
8719 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8720 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8722 for (i
= charset
->code_space
[idx
];
8723 i
<= charset
->code_space
[idx
+ 1]; i
++)
8725 Lisp_Object tmp
, tmp2
;
8728 tmp
= AREF (val
, i
);
8731 else if (NUMBERP (tmp
))
8733 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8735 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8737 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8741 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8743 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8748 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8751 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8752 XSETCAR (tmp2
, XCAR (tail
));
8758 ASET (attrs
, coding_attr_charset_valids
, val
);
8759 category
= coding_category_charset
;
8761 else if (EQ (coding_type
, Qccl
))
8765 if (nargs
< coding_arg_ccl_max
)
8768 val
= args
[coding_arg_ccl_decoder
];
8769 CHECK_CCL_PROGRAM (val
);
8771 val
= Fcopy_sequence (val
);
8772 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8774 val
= args
[coding_arg_ccl_encoder
];
8775 CHECK_CCL_PROGRAM (val
);
8777 val
= Fcopy_sequence (val
);
8778 ASET (attrs
, coding_attr_ccl_encoder
, val
);
/* VALIDS is a 256-byte string, initially all zero; byte I is set to 1
   for every code I covered by an element of the valid-codes
   argument.  */
8780 val
= args
[coding_arg_ccl_valids
];
8781 valids
= Fmake_string (make_number (256), make_number (0));
8782 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8789 from
= to
= XINT (val
);
8790 if (from
< 0 || from
> 255)
8791 args_out_of_range_3 (val
, make_number (0), make_number (255));
8796 CHECK_NATNUM_CAR (val
);
8797 CHECK_NATNUM_CDR (val
);
8798 from
= XINT (XCAR (val
));
8800 args_out_of_range_3 (XCAR (val
),
8801 make_number (0), make_number (255));
8802 to
= XINT (XCDR (val
));
8803 if (to
< from
|| to
> 255)
8804 args_out_of_range_3 (XCDR (val
),
8805 XCAR (val
), make_number (255));
8807 for (i
= from
; i
<= to
; i
++)
8808 SSET (valids
, i
, 1);
8810 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8812 category
= coding_category_ccl
;
8814 else if (EQ (coding_type
, Qutf_16
))
8816 Lisp_Object bom
, endian
;
8818 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8820 if (nargs
< coding_arg_utf16_max
)
8823 bom
= args
[coding_arg_utf16_bom
];
8824 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8828 CHECK_CODING_SYSTEM (val
);
8830 CHECK_CODING_SYSTEM (val
);
8832 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8834 endian
= args
[coding_arg_utf16_endian
];
8835 CHECK_SYMBOL (endian
);
8838 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8839 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8840 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8842 category
= (CONSP (bom
)
8843 ? coding_category_utf_16_auto
8845 ? (EQ (endian
, Qbig
)
8846 ? coding_category_utf_16_be_nosig
8847 : coding_category_utf_16_le_nosig
)
8848 : (EQ (endian
, Qbig
)
8849 ? coding_category_utf_16_be
8850 : coding_category_utf_16_le
));
8852 else if (EQ (coding_type
, Qiso_2022
))
8854 Lisp_Object initial
, reg_usage
, request
, flags
;
8857 if (nargs
< coding_arg_iso2022_max
)
/* INITIAL is copied so that the charset in each of its first four
   slots can be replaced by a number.  */
8860 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8861 CHECK_VECTOR (initial
);
8862 for (i
= 0; i
< 4; i
++)
8864 val
= Faref (initial
, make_number (i
));
8867 struct charset
*charset
;
8869 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8870 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8871 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8872 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8875 ASET (initial
, i
, make_number (-1));
8878 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8879 CHECK_CONS (reg_usage
);
8880 CHECK_NUMBER_CAR (reg_usage
);
8881 CHECK_NUMBER_CDR (reg_usage
);
8883 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8884 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8892 CHECK_CHARSET_GET_ID (tmp
, id
);
8893 CHECK_NATNUM_CDR (val
);
8894 if (XINT (XCDR (val
)) >= 4)
8895 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8896 XSETCAR (val
, make_number (id
));
8899 flags
= args
[coding_arg_iso2022_flags
];
8900 CHECK_NATNUM (flags
);
8902 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8903 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8905 ASET (attrs
, coding_attr_iso_initial
, initial
);
8906 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8907 ASET (attrs
, coding_attr_iso_request
, request
);
8908 ASET (attrs
, coding_attr_iso_flags
, flags
);
8909 setup_iso_safe_charsets (attrs
);
8911 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8912 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8913 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8914 ? coding_category_iso_7_else
8915 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8916 ? coding_category_iso_7
8917 : coding_category_iso_7_tight
);
8920 int id
= XINT (AREF (initial
, 1));
8922 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8923 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8925 ? coding_category_iso_8_else
8926 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8927 ? coding_category_iso_8_1
8928 : coding_category_iso_8_2
);
8930 if (category
!= coding_category_iso_8_1
8931 && category
!= coding_category_iso_8_2
)
8932 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8934 else if (EQ (coding_type
, Qemacs_mule
))
8936 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8937 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8938 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8939 category
= coding_category_emacs_mule
;
8941 else if (EQ (coding_type
, Qshift_jis
))
8944 struct charset
*charset
;
8946 if (XINT (Flength (charset_list
)) != 3
8947 && XINT (Flength (charset_list
)) != 4)
8948 error ("There should be three or four charsets");
8950 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8951 if (CHARSET_DIMENSION (charset
) != 1)
8952 error ("Dimension of charset %s is not one",
8953 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8954 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8955 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8957 charset_list
= XCDR (charset_list
);
8958 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8959 if (CHARSET_DIMENSION (charset
) != 1)
8960 error ("Dimension of charset %s is not one",
8961 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8963 charset_list
= XCDR (charset_list
);
8964 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8965 if (CHARSET_DIMENSION (charset
) != 2)
8966 error ("Dimension of charset %s is not two",
8967 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8969 charset_list
= XCDR (charset_list
);
8970 if (! NILP (charset_list
))
8972 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8973 if (CHARSET_DIMENSION (charset
) != 2)
8974 error ("Dimension of charset %s is not two",
8975 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8978 category
= coding_category_sjis
;
8979 Vsjis_coding_system
= name
;
8981 else if (EQ (coding_type
, Qbig5
))
8983 struct charset
*charset
;
8985 if (XINT (Flength (charset_list
)) != 2)
8986 error ("There should be just two charsets");
8988 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8989 if (CHARSET_DIMENSION (charset
) != 1)
8990 error ("Dimension of charset %s is not one",
8991 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8992 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8993 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8995 charset_list
= XCDR (charset_list
);
8996 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8997 if (CHARSET_DIMENSION (charset
) != 2)
8998 error ("Dimension of charset %s is not two",
8999 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9001 category
= coding_category_big5
;
9002 Vbig5_coding_system
= name
;
9004 else if (EQ (coding_type
, Qraw_text
))
9006 category
= coding_category_raw_text
;
9007 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9009 else if (EQ (coding_type
, Qutf_8
))
9011 category
= coding_category_utf_8
;
9012 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9014 else if (EQ (coding_type
, Qundecided
))
9015 category
= coding_category_undecided
;
9017 error ("Invalid coding system type: %s",
9018 SDATA (SYMBOL_NAME (coding_type
)));
/* Record the category, and push :category and :ascii-compatible-p
   onto the front of the property list.  */
9020 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
9021 CODING_ATTR_PLIST (attrs
)
9022 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
9023 CODING_ATTR_PLIST (attrs
)));
9024 CODING_ATTR_PLIST (attrs
)
9025 = Fcons (QCascii_compatible_p
,
9026 Fcons (CODING_ATTR_ASCII_COMPAT (attrs
),
9027 CODING_ATTR_PLIST (attrs
)));
/* Only nil, unix, dos and mac are accepted as the eol-type.  */
9029 eol_type
= args
[coding_arg_eol_type
];
9030 if (! NILP (eol_type
)
9031 && ! EQ (eol_type
, Qunix
)
9032 && ! EQ (eol_type
, Qdos
)
9033 && ! EQ (eol_type
, Qmac
))
9034 error ("Invalid eol-type");
9036 aliases
= Fcons (name
, Qnil
);
/* A nil eol-type is replaced by a vector of three subsidiary coding
   system names; each subsidiary is registered with its own spec
   vector sharing ATTRS.  */
9038 if (NILP (eol_type
))
9040 eol_type
= make_subsidiaries (name
);
9041 for (i
= 0; i
< 3; i
++)
9043 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
9045 this_name
= AREF (eol_type
, i
);
9046 this_aliases
= Fcons (this_name
, Qnil
);
9047 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
9048 this_spec
= Fmake_vector (make_number (3), attrs
);
9049 ASET (this_spec
, 1, this_aliases
);
9050 ASET (this_spec
, 2, this_eol_type
);
9051 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
9052 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
9053 val
= Fassoc (Fsymbol_name (this_name
), Vcoding_system_alist
);
9055 Vcoding_system_alist
9056 = Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
9057 Vcoding_system_alist
);
/* Register the coding system itself: slot 0 holds ATTRS, slot 1 the
   alias list, slot 2 the eol-type.  */
9061 spec_vec
= Fmake_vector (make_number (3), attrs
);
9062 ASET (spec_vec
, 1, aliases
);
9063 ASET (spec_vec
, 2, eol_type
);
9065 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
9066 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
9067 val
= Fassoc (Fsymbol_name (name
), Vcoding_system_alist
);
9069 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
9070 Vcoding_system_alist
);
/* Set the category up for this coding system when the category has
   none yet, or when it already names this one.  */
9073 int id
= coding_categories
[category
].id
;
9075 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
9076 setup_coding_system (name
, &coding_categories
[category
]);
/* Signal wrong-number-of-arguments, reporting this function's name
   and the number of arguments received.  */
9082 return Fsignal (Qwrong_number_of_arguments
,
9083 Fcons (intern ("define-coding-system-internal"),
9084 make_number (nargs
)));
/* Lisp primitive `coding-system-put'.  For the properties that are
   also kept in the attribute vector, validate VAL and store it in the
   matching attribute slot; PROP/VAL is also recorded in the attribute
   property list with Fplist_put.  */
9088 DEFUN ("coding-system-put", Fcoding_system_put
, Scoding_system_put
,
9090 doc
: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9091 (coding_system
, prop
, val
)
9092 Lisp_Object coding_system
, prop
, val
;
9094 Lisp_Object spec
, attrs
;
9096 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
9097 attrs
= AREF (spec
, 0);
9098 if (EQ (prop
, QCmnemonic
))
/* A mnemonic is accepted as a string; anything else must be a
   character.  */
9100 if (! STRINGP (val
))
9101 CHECK_CHARACTER (val
);
9102 CODING_ATTR_MNEMONIC (attrs
) = val
;
/* `QCdefalut_char' (sic) is the C variable holding the symbol
   `:default-char'.  */
9104 else if (EQ (prop
, QCdefalut_char
))
9107 val
= make_number (' ');
9109 CHECK_CHARACTER (val
);
9110 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
9112 else if (EQ (prop
, QCdecode_translation_table
))
9114 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9116 CODING_ATTR_DECODE_TBL (attrs
) = val
;
9118 else if (EQ (prop
, QCencode_translation_table
))
9120 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9122 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
9124 else if (EQ (prop
, QCpost_read_conversion
))
9127 CODING_ATTR_POST_READ (attrs
) = val
;
9129 else if (EQ (prop
, QCpre_write_conversion
))
9132 CODING_ATTR_PRE_WRITE (attrs
) = val
;
9134 else if (EQ (prop
, QCascii_compatible_p
))
9136 CODING_ATTR_ASCII_COMPAT (attrs
) = val
;
/* Record PROP/VAL in the coding system's property list.  */
9139 CODING_ATTR_PLIST (attrs
)
9140 = Fplist_put (CODING_ATTR_PLIST (attrs
), prop
, val
);
9145 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
9146 Sdefine_coding_system_alias
, 2, 2, 0,
9147 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9148 (alias
, coding_system
)
9149 Lisp_Object alias
, coding_system
;
9151 Lisp_Object spec
, aliases
, eol_type
, val
;
9153 CHECK_SYMBOL (alias
);
9154 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
9155 aliases
= AREF (spec
, 1);
9156 /* ALISES should be a list of length more than zero, and the first
9157 element is a base coding system. Append ALIAS at the tail of the
9159 while (!NILP (XCDR (aliases
)))
9160 aliases
= XCDR (aliases
);
9161 XSETCDR (aliases
, Fcons (alias
, Qnil
));
9163 eol_type
= AREF (spec
, 2);
9164 if (VECTORP (eol_type
))
9166 Lisp_Object subsidiaries
;
9169 subsidiaries
= make_subsidiaries (alias
);
9170 for (i
= 0; i
< 3; i
++)
9171 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
9172 AREF (eol_type
, i
));
9175 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
9176 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
9177 val
= Fassoc (Fsymbol_name (alias
), Vcoding_system_alist
);
9179 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
9180 Vcoding_system_alist
);
9185 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
9187 doc
: /* Return the base of CODING-SYSTEM.
9188 Any alias or subsidiary coding system is not a base coding system. */)
9190 Lisp_Object coding_system
;
9192 Lisp_Object spec
, attrs
;
9194 if (NILP (coding_system
))
9195 return (Qno_conversion
);
9196 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
9197 attrs
= AREF (spec
, 0);
9198 return CODING_ATTR_BASE_NAME (attrs
);
9201 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
9203 doc
: "Return the property list of CODING-SYSTEM.")
9205 Lisp_Object coding_system
;
9207 Lisp_Object spec
, attrs
;
9209 if (NILP (coding_system
))
9210 coding_system
= Qno_conversion
;
9211 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
9212 attrs
= AREF (spec
, 0);
9213 return CODING_ATTR_PLIST (attrs
);
9217 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
9219 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
9221 Lisp_Object coding_system
;
9225 if (NILP (coding_system
))
9226 coding_system
= Qno_conversion
;
9227 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
9228 return AREF (spec
, 1);
/* Lisp primitive `coding-system-eol-type'.  nil is treated as
   `no-conversion'.  The eol-type is slot 2 of the coding system's
   spec: a vector is returned as a fresh copy, a symbol is converted
   to its integer code.  */
9231 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
9232 Scoding_system_eol_type
, 1, 1, 0,
9233 doc
: /* Return eol-type of CODING-SYSTEM.
9234 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9236 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9237 and CR respectively.
9239 A vector value indicates that a format of end-of-line should be
9240 detected automatically. Nth element of the vector is the subsidiary
9241 coding system whose eol-type is N. */)
9243 Lisp_Object coding_system
;
9245 Lisp_Object spec
, eol_type
;
9248 if (NILP (coding_system
))
9249 coding_system
= Qno_conversion
;
9250 if (! CODING_SYSTEM_P (coding_system
))
9252 spec
= CODING_SYSTEM_SPEC (coding_system
);
9253 eol_type
= AREF (spec
, 2);
/* Return a copy so that callers cannot modify the stored vector.  */
9254 if (VECTORP (eol_type
))
9255 return Fcopy_sequence (eol_type
);
/* Map the eol-type symbol to its integer code: unix is 0, dos is 1,
   anything else is 2.  */
9256 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
9257 return make_number (n
);
9263 /*** 9. Post-amble ***/
9270 for (i
= 0; i
< coding_category_max
; i
++)
9272 coding_categories
[i
].id
= -1;
9273 coding_priorities
[i
] = i
;
9276 /* ISO2022 specific initialize routine. */
9277 for (i
= 0; i
< 0x20; i
++)
9278 iso_code_class
[i
] = ISO_control_0
;
9279 for (i
= 0x21; i
< 0x7F; i
++)
9280 iso_code_class
[i
] = ISO_graphic_plane_0
;
9281 for (i
= 0x80; i
< 0xA0; i
++)
9282 iso_code_class
[i
] = ISO_control_1
;
9283 for (i
= 0xA1; i
< 0xFF; i
++)
9284 iso_code_class
[i
] = ISO_graphic_plane_1
;
9285 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
9286 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
9287 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
9288 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
9289 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
9290 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
9291 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
9292 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
9293 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
9295 for (i
= 0; i
< 256; i
++)
9297 emacs_mule_bytes
[i
] = 1;
9299 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
9300 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
9301 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
9302 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
9310 staticpro (&Vcoding_system_hash_table
);
9312 Lisp_Object args
[2];
9315 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
9318 staticpro (&Vsjis_coding_system
);
9319 Vsjis_coding_system
= Qnil
;
9321 staticpro (&Vbig5_coding_system
);
9322 Vbig5_coding_system
= Qnil
;
9324 staticpro (&Vcode_conversion_reused_workbuf
);
9325 Vcode_conversion_reused_workbuf
= Qnil
;
9327 staticpro (&Vcode_conversion_workbuf_name
);
9328 Vcode_conversion_workbuf_name
= build_string (" *code-conversion-work*");
9330 reused_workbuf_in_use
= 0;
9332 DEFSYM (Qcharset
, "charset");
9333 DEFSYM (Qtarget_idx
, "target-idx");
9334 DEFSYM (Qcoding_system_history
, "coding-system-history");
9335 Fset (Qcoding_system_history
, Qnil
);
9337 /* Target FILENAME is the first argument. */
9338 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
9339 /* Target FILENAME is the third argument. */
9340 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
9342 DEFSYM (Qcall_process
, "call-process");
9343 /* Target PROGRAM is the first argument. */
9344 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
9346 DEFSYM (Qcall_process_region
, "call-process-region");
9347 /* Target PROGRAM is the third argument. */
9348 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
9350 DEFSYM (Qstart_process
, "start-process");
9351 /* Target PROGRAM is the third argument. */
9352 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
9354 DEFSYM (Qopen_network_stream
, "open-network-stream");
9355 /* Target SERVICE is the fourth argument. */
9356 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
9358 DEFSYM (Qcoding_system
, "coding-system");
9359 DEFSYM (Qcoding_aliases
, "coding-aliases");
9361 DEFSYM (Qeol_type
, "eol-type");
9362 DEFSYM (Qunix
, "unix");
9363 DEFSYM (Qdos
, "dos");
9365 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
9366 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
9367 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
9368 DEFSYM (Qdefault_char
, "default-char");
9369 DEFSYM (Qundecided
, "undecided");
9370 DEFSYM (Qno_conversion
, "no-conversion");
9371 DEFSYM (Qraw_text
, "raw-text");
9373 DEFSYM (Qiso_2022
, "iso-2022");
9375 DEFSYM (Qutf_8
, "utf-8");
9376 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
9378 DEFSYM (Qutf_16
, "utf-16");
9379 DEFSYM (Qbig
, "big");
9380 DEFSYM (Qlittle
, "little");
9382 DEFSYM (Qshift_jis
, "shift-jis");
9383 DEFSYM (Qbig5
, "big5");
9385 DEFSYM (Qcoding_system_p
, "coding-system-p");
9387 DEFSYM (Qcoding_system_error
, "coding-system-error");
9388 Fput (Qcoding_system_error
, Qerror_conditions
,
9389 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
9390 Fput (Qcoding_system_error
, Qerror_message
,
9391 build_string ("Invalid coding system"));
9393 /* Intern this now in case it isn't already done.
9394 Setting this variable twice is harmless.
9395 But don't staticpro it here--that is done in alloc.c. */
9396 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
9398 DEFSYM (Qtranslation_table
, "translation-table");
9399 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (2));
9400 DEFSYM (Qtranslation_table_id
, "translation-table-id");
9401 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
9402 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
9404 DEFSYM (Qvalid_codes
, "valid-codes");
9406 DEFSYM (Qemacs_mule
, "emacs-mule");
9408 DEFSYM (QCcategory
, ":category");
9409 DEFSYM (QCmnemonic
, ":mnemonic");
9410 DEFSYM (QCdefalut_char
, ":default-char");
9411 DEFSYM (QCdecode_translation_table
, ":decode-translation-table");
9412 DEFSYM (QCencode_translation_table
, ":encode-translation-table");
9413 DEFSYM (QCpost_read_conversion
, ":post-read-conversion");
9414 DEFSYM (QCpre_write_conversion
, ":pre-write-conversion");
9415 DEFSYM (QCascii_compatible_p
, ":ascii-compatible-p");
9417 Vcoding_category_table
9418 = Fmake_vector (make_number (coding_category_max
), Qnil
);
9419 staticpro (&Vcoding_category_table
);
9420 /* The following are targets of code detection. */
9421 ASET (Vcoding_category_table
, coding_category_iso_7
,
9422 intern ("coding-category-iso-7"));
9423 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
9424 intern ("coding-category-iso-7-tight"));
9425 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
9426 intern ("coding-category-iso-8-1"));
9427 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
9428 intern ("coding-category-iso-8-2"));
9429 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
9430 intern ("coding-category-iso-7-else"));
9431 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
9432 intern ("coding-category-iso-8-else"));
9433 ASET (Vcoding_category_table
, coding_category_utf_8
,
9434 intern ("coding-category-utf-8"));
9435 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
9436 intern ("coding-category-utf-16-be"));
9437 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
9438 intern ("coding-category-utf-16-auto"));
9439 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
9440 intern ("coding-category-utf-16-le"));
9441 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
9442 intern ("coding-category-utf-16-be-nosig"));
9443 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
9444 intern ("coding-category-utf-16-le-nosig"));
9445 ASET (Vcoding_category_table
, coding_category_charset
,
9446 intern ("coding-category-charset"));
9447 ASET (Vcoding_category_table
, coding_category_sjis
,
9448 intern ("coding-category-sjis"));
9449 ASET (Vcoding_category_table
, coding_category_big5
,
9450 intern ("coding-category-big5"));
9451 ASET (Vcoding_category_table
, coding_category_ccl
,
9452 intern ("coding-category-ccl"));
9453 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
9454 intern ("coding-category-emacs-mule"));
9455 /* The following are NOT targets of code detection. */
9456 ASET (Vcoding_category_table
, coding_category_raw_text
,
9457 intern ("coding-category-raw-text"));
9458 ASET (Vcoding_category_table
, coding_category_undecided
,
9459 intern ("coding-category-undecided"));
9461 DEFSYM (Qinsufficient_source
, "insufficient-source");
9462 DEFSYM (Qinconsistent_eol
, "inconsistent-eol");
9463 DEFSYM (Qinvalid_source
, "invalid-source");
9464 DEFSYM (Qinterrupted
, "interrupted");
9465 DEFSYM (Qinsufficient_memory
, "insufficient-memory");
9466 DEFSYM (Qcoding_system_define_form
, "coding-system-define-form");
9468 defsubr (&Scoding_system_p
);
9469 defsubr (&Sread_coding_system
);
9470 defsubr (&Sread_non_nil_coding_system
);
9471 defsubr (&Scheck_coding_system
);
9472 defsubr (&Sdetect_coding_region
);
9473 defsubr (&Sdetect_coding_string
);
9474 defsubr (&Sfind_coding_systems_region_internal
);
9475 defsubr (&Sunencodable_char_position
);
9476 defsubr (&Scheck_coding_systems_region
);
9477 defsubr (&Sdecode_coding_region
);
9478 defsubr (&Sencode_coding_region
);
9479 defsubr (&Sdecode_coding_string
);
9480 defsubr (&Sencode_coding_string
);
9481 defsubr (&Sdecode_sjis_char
);
9482 defsubr (&Sencode_sjis_char
);
9483 defsubr (&Sdecode_big5_char
);
9484 defsubr (&Sencode_big5_char
);
9485 defsubr (&Sset_terminal_coding_system_internal
);
9486 defsubr (&Sset_safe_terminal_coding_system_internal
);
9487 defsubr (&Sterminal_coding_system
);
9488 defsubr (&Sset_keyboard_coding_system_internal
);
9489 defsubr (&Skeyboard_coding_system
);
9490 defsubr (&Sfind_operation_coding_system
);
9491 defsubr (&Sset_coding_system_priority
);
9492 defsubr (&Sdefine_coding_system_internal
);
9493 defsubr (&Sdefine_coding_system_alias
);
9494 defsubr (&Scoding_system_put
);
9495 defsubr (&Scoding_system_base
);
9496 defsubr (&Scoding_system_plist
);
9497 defsubr (&Scoding_system_aliases
);
9498 defsubr (&Scoding_system_eol_type
);
9499 defsubr (&Scoding_system_priority_list
);
9501 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
9502 doc
: /* List of coding systems.
9504 Do not alter the value of this variable manually. This variable should be
9505 updated by the functions `define-coding-system' and
9506 `define-coding-system-alias'. */);
9507 Vcoding_system_list
= Qnil
;
9509 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
9510 doc
: /* Alist of coding system names.
9511 Each element is a one-element list containing a coding system name.
9512 This variable is given to `completing-read' as TABLE argument.
9514 Do not alter the value of this variable manually. This variable should be
9515 updated by the functions `define-coding-system' and
9516 `define-coding-system-alias'. */);
9517 Vcoding_system_alist
= Qnil
;
9519 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
9520 doc
: /* List of coding-categories (symbols) ordered by priority.
9522 On detecting a coding system, Emacs tries code detection algorithms
9523 associated with each coding-category one by one in this order. When
9524 one algorithm agrees with a byte sequence of source text, the coding
9525 system bound to the corresponding coding-category is selected.
9527 Don't modify this variable directly, but use `set-coding-system-priority'. */);
9531 Vcoding_category_list
= Qnil
;
9532 for (i
= coding_category_max
- 1; i
>= 0; i
--)
9533 Vcoding_category_list
9534 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
9535 Vcoding_category_list
);
9538 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
9539 doc
: /* Specify the coding system for read operations.
9540 It is useful to bind this variable with `let', but do not set it globally.
9541 If the value is a coding system, it is used for decoding on read operation.
9542 If not, an appropriate element is used from one of the coding system alists:
9543 There are three such tables, `file-coding-system-alist',
9544 `process-coding-system-alist', and `network-coding-system-alist'. */);
9545 Vcoding_system_for_read
= Qnil
;
9547 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
9548 doc
: /* Specify the coding system for write operations.
9549 Programs bind this variable with `let', but you should not set it globally.
9550 If the value is a coding system, it is used for encoding of output,
9551 when writing it to a file and when sending it to a file or subprocess.
9553 If this does not specify a coding system, an appropriate element
9554 is used from one of the coding system alists:
9555 There are three such tables, `file-coding-system-alist',
9556 `process-coding-system-alist', and `network-coding-system-alist'.
9557 For output to files, if the above procedure does not specify a coding system,
9558 the value of `buffer-file-coding-system' is used. */);
9559 Vcoding_system_for_write
= Qnil
;
9561 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
9563 Coding system used in the latest file or process I/O. */);
9564 Vlast_coding_system_used
= Qnil
;
9566 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error
,
9568 Error status of the last code conversion.
9570 When an error was detected in the last code conversion, this variable
9571 is set to one of the following symbols.
9572 `insufficient-source'
9576 `insufficient-memory'
9577 When no error was detected, the value doesn't change. So, to check
9578 the error status of a code conversion by this variable, you must
9579 explicitly set this variable to nil before performing code
9581 Vlast_code_conversion_error
= Qnil
;
9583 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
9585 *Non-nil means always inhibit code conversion of end-of-line format.
9586 See info node `Coding Systems' and info node `Text and Binary' concerning
9587 such conversion. */);
9588 inhibit_eol_conversion
= 0;
9590 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
9592 Non-nil means process buffer inherits coding system of process output.
9593 Bind it to t if the process output is to be treated as if it were a file
9594 read from some filesystem. */);
9595 inherit_process_coding_system
= 0;
9597 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
9599 Alist to decide a coding system to use for a file I/O operation.
9600 The format is ((PATTERN . VAL) ...),
9601 where PATTERN is a regular expression matching a file name,
9602 VAL is a coding system, a cons of coding systems, or a function symbol.
9603 If VAL is a coding system, it is used for both decoding and encoding
9605 If VAL is a cons of coding systems, the car part is used for decoding,
9606 and the cdr part is used for encoding.
9607 If VAL is a function symbol, the function must return a coding system
9608 or a cons of coding systems which are used as above. The function gets
9609 the arguments with which `find-operation-coding-system' was called.
9611 See also the function `find-operation-coding-system'
9612 and the variable `auto-coding-alist'. */);
9613 Vfile_coding_system_alist
= Qnil
;
9615 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
9617 Alist to decide a coding system to use for a process I/O operation.
9618 The format is ((PATTERN . VAL) ...),
9619 where PATTERN is a regular expression matching a program name,
9620 VAL is a coding system, a cons of coding systems, or a function symbol.
9621 If VAL is a coding system, it is used for both decoding what is received
9622 from the program and encoding what is sent to the program.
9623 If VAL is a cons of coding systems, the car part is used for decoding,
9624 and the cdr part is used for encoding.
9625 If VAL is a function symbol, the function must return a coding system
9626 or a cons of coding systems which are used as above.
9628 See also the function `find-operation-coding-system'. */);
9629 Vprocess_coding_system_alist
= Qnil
;
9631 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
9633 Alist to decide a coding system to use for a network I/O operation.
9634 The format is ((PATTERN . VAL) ...),
9635 where PATTERN is a regular expression matching a network service name
9636 or is a port number to connect to,
9637 VAL is a coding system, a cons of coding systems, or a function symbol.
9638 If VAL is a coding system, it is used for both decoding what is received
9639 from the network stream and encoding what is sent to the network stream.
9640 If VAL is a cons of coding systems, the car part is used for decoding,
9641 and the cdr part is used for encoding.
9642 If VAL is a function symbol, the function must return a coding system
9643 or a cons of coding systems which are used as above.
9645 See also the function `find-operation-coding-system'. */);
9646 Vnetwork_coding_system_alist
= Qnil
;
9648 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
9649 doc
: /* Coding system to use with system messages.
9650 Also used for decoding keyboard input on X Window system. */);
9651 Vlocale_coding_system
= Qnil
;
9653 /* The eol mnemonics are reset in startup.el system-dependently. */
9654 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
9656 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9657 eol_mnemonic_unix
= build_string (":");
9659 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
9661 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9662 eol_mnemonic_dos
= build_string ("\\");
9664 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
9666 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9667 eol_mnemonic_mac
= build_string ("/");
9669 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
9671 *String displayed in mode line when end-of-line format is not yet determined. */);
9672 eol_mnemonic_undecided
= build_string (":");
9674 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
9676 *Non-nil enables character translation while encoding and decoding. */);
9677 Venable_character_translation
= Qt
;
9679 DEFVAR_LISP ("standard-translation-table-for-decode",
9680 &Vstandard_translation_table_for_decode
,
9681 doc
: /* Table for translating characters while decoding. */);
9682 Vstandard_translation_table_for_decode
= Qnil
;
9684 DEFVAR_LISP ("standard-translation-table-for-encode",
9685 &Vstandard_translation_table_for_encode
,
9686 doc
: /* Table for translating characters while encoding. */);
9687 Vstandard_translation_table_for_encode
= Qnil
;
9689 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
9690 doc
: /* Alist of charsets vs revision numbers.
9691 While encoding, if a charset (car part of an element) is found,
9692 designate it with the escape sequence identifying revision (cdr part
9693 of the element). */);
9694 Vcharset_revision_table
= Qnil
;
9696 DEFVAR_LISP ("default-process-coding-system",
9697 &Vdefault_process_coding_system
,
9698 doc
: /* Cons of coding systems used for process I/O by default.
9699 The car part is used for decoding a process output,
9700 the cdr part is used for encoding a text to be sent to a process. */);
9701 Vdefault_process_coding_system
= Qnil
;
9703 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
9705 Table of extra Latin codes in the range 128..159 (inclusive).
9706 This is a vector of length 256.
9707 If Nth element is non-nil, the existence of code N in a file
9708 \(or output of subprocess) doesn't prevent it from being detected as
9709 a coding system of ISO 2022 variant which has a flag
9710 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9711 or reading output of a subprocess.
9712 Only the 128th through 159th elements have a meaning. */);
9713 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
9715 DEFVAR_LISP ("select-safe-coding-system-function",
9716 &Vselect_safe_coding_system_function
,
9718 Function to call to select safe coding system for encoding a text.
9720 If set, this function is called to force a user to select a proper
9721 coding system which can encode the text in the case that a default
9722 coding system used in each operation can't encode the text.
9724 The default value is `select-safe-coding-system' (which see). */);
9725 Vselect_safe_coding_system_function
= Qnil
;
9727 DEFVAR_BOOL ("coding-system-require-warning",
9728 &coding_system_require_warning
,
9729 doc
: /* Internal use only.
9730 If non-nil, on writing a file, `select-safe-coding-system-function' is
9731 called even if `coding-system-for-write' is non-nil. The command
9732 `universal-coding-system-argument' binds this variable to t temporarily. */);
9733 coding_system_require_warning
= 0;
9736 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9737 &inhibit_iso_escape_detection
,
9739 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9741 By default, on reading a file, Emacs tries to detect how the text is
9742 encoded. This code detection is sensitive to escape sequences. If
9743 the sequence is valid as ISO2022, the code is determined as one of
9744 the ISO2022 encodings, and the file is decoded by the corresponding
9745 coding system (e.g. `iso-2022-7bit').
9747 However, there may be a case that you want to read escape sequences in
9748 a file as is. In such a case, you can set this variable to non-nil.
9749 Then, as the code detection ignores any escape sequences, no file is
9750 detected as encoded in some ISO2022 encoding. The result is that all
9751 escape sequences become visible in a buffer.
9753 The default value is nil, and it is strongly recommended not to change
9754 it. That is because many Emacs Lisp source files that contain
9755 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9756 in Emacs's distribution, and they won't be decoded correctly on
9757 reading if you suppress escape sequence detection.
9759 The other way to read escape sequences in a file without decoding is
9760 to explicitly specify some coding system that doesn't use ISO2022's
9761 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
9762 inhibit_iso_escape_detection
= 0;
9764 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9765 doc
: /* Char table for translating self-inserting characters.
9766 This is applied to the result of input methods, not their input. See also
9767 `keyboard-translate-table'. */);
9768 Vtranslation_table_for_input
= Qnil
;
9771 Lisp_Object args
[coding_arg_max
];
9772 Lisp_Object plist
[16];
9775 for (i
= 0; i
< coding_arg_max
; i
++)
9778 plist
[0] = intern (":name");
9779 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9780 plist
[2] = intern (":mnemonic");
9781 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9782 plist
[4] = intern (":coding-type");
9783 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9784 plist
[6] = intern (":ascii-compatible-p");
9785 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9786 plist
[8] = intern (":default-char");
9787 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9788 plist
[10] = intern (":for-unibyte");
9789 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9790 plist
[12] = intern (":docstring");
9791 plist
[13] = build_string ("Do no conversion.\n\
9793 When you visit a file with this coding, the file is read into a\n\
9794 unibyte buffer as is, thus each byte of a file is treated as a\n\
9796 plist
[14] = intern (":eol-type");
9797 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9798 args
[coding_arg_plist
] = Flist (16, plist
);
9799 Fdefine_coding_system_internal (coding_arg_max
, args
);
9801 plist
[1] = args
[coding_arg_name
] = Qundecided
;
9802 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('-');
9803 plist
[5] = args
[coding_arg_coding_type
] = Qundecided
;
9804 /* This is already set.
9805 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9806 plist
[8] = intern (":charset-list");
9807 plist
[9] = args
[coding_arg_charset_list
] = Fcons (Qascii
, Qnil
);
9808 plist
[11] = args
[coding_arg_for_unibyte
] = Qnil
;
9809 plist
[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9810 plist
[15] = args
[coding_arg_eol_type
] = Qnil
;
9811 args
[coding_arg_plist
] = Flist (16, plist
);
9812 Fdefine_coding_system_internal (coding_arg_max
, args
);
9815 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9816 setup_coding_system (Qundecided
, &terminal_coding
);
9817 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9822 for (i
= 0; i
< coding_category_max
; i
++)
9823 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9825 #if defined (MSDOS) || defined (WINDOWSNT)
9826 system_eol_type
= Qdos
;
9828 system_eol_type
= Qunix
;
9830 staticpro (&system_eol_type
);
/* Build the system error message text for ERROR_NUMBER.

   synchronize_system_messages_locale is called first, then the
   message is fetched with strerror.  When Vlocale_coding_system is
   non-nil, the message is wrapped in a Lisp string and passed through
   code_convert_string_norecord together with that coding system, and
   STR is repointed at the data of the resulting Lisp string.

   NOTE(review): in the converted case STR points into the data of the
   Lisp string DEC; presumably the caller must consume it before that
   string can be reclaimed or relocated -- confirm against callers.  */
9834 emacs_strerror (error_number
)
9839 synchronize_system_messages_locale ();
9840 str
= strerror (error_number
);
/* Only convert when a locale coding system has been configured.  */
9842 if (! NILP (Vlocale_coding_system
))
9844 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9845 Vlocale_coding_system
,
/* Hand back the converted bytes rather than the raw strerror text.  */
9847 str
= (char *) SDATA (dec
);
9855 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9856 (do not change this comment) */