1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
175 if (__C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
222 if (charbuf
>= charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce an encoded text until it
258 reaches the head of not-yet-encoded source text.
260 Below is a template of these functions. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
268 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 Lisp_Object Qinsufficient_source
, Qinconsistent_eol
, Qinvalid_source
;
322 Lisp_Object Qinterrupted
, Qinsufficient_memory
;
324 int coding_system_require_warning
;
326 Lisp_Object Vselect_safe_coding_system_function
;
328 /* Mnemonic string for each format of end-of-line. */
329 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
330 /* Mnemonic string to indicate format of end-of-line is not yet
332 Lisp_Object eol_mnemonic_undecided
;
336 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
338 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
340 /* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342 Lisp_Object Qemacs_mule
, Qraw_text
;
343 Lisp_Object Qutf_8_emacs
;
345 /* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
347 /* Coding-system for reading files and receiving data from process. */
348 Lisp_Object Vcoding_system_for_read
;
349 /* Coding-system for writing files and sending data to process. */
350 Lisp_Object Vcoding_system_for_write
;
351 /* Coding-system actually used in the latest I/O. */
352 Lisp_Object Vlast_coding_system_used
;
353 /* Set to non-nil when an error is detected during code conversion. */
354 Lisp_Object Vlast_code_conversion_error
;
355 /* A vector of length 256 which contains information about special
356 Latin codes (especially for dealing with Microsoft codes). */
357 Lisp_Object Vlatin_extra_code_table
;
359 /* Flag to inhibit code conversion of end-of-line format. */
360 int inhibit_eol_conversion
;
362 /* Flag to inhibit ISO2022 escape sequence detection. */
363 int inhibit_iso_escape_detection
;
365 /* Flag to make buffer-file-coding-system inherit from process-coding. */
366 int inherit_process_coding_system
;
368 /* Coding system to be used to encode text for terminal display. */
369 struct coding_system terminal_coding
;
371 /* Coding system to be used to encode text for terminal display when
372 terminal coding system is nil. */
373 struct coding_system safe_terminal_coding
;
375 /* Coding system of what is sent from terminal keyboard. */
376 struct coding_system keyboard_coding
;
378 Lisp_Object Vfile_coding_system_alist
;
379 Lisp_Object Vprocess_coding_system_alist
;
380 Lisp_Object Vnetwork_coding_system_alist
;
382 Lisp_Object Vlocale_coding_system
;
386 /* Flag to tell if we look up translation table on character code
388 Lisp_Object Venable_character_translation
;
389 /* Standard translation table to look up on decoding (reading). */
390 Lisp_Object Vstandard_translation_table_for_decode
;
391 /* Standard translation table to look up on encoding (writing). */
392 Lisp_Object Vstandard_translation_table_for_encode
;
394 Lisp_Object Qtranslation_table
;
395 Lisp_Object Qtranslation_table_id
;
396 Lisp_Object Qtranslation_table_for_decode
;
397 Lisp_Object Qtranslation_table_for_encode
;
399 /* Alist of charsets vs revision number. */
400 static Lisp_Object Vcharset_revision_table
;
402 /* Default coding systems used for process I/O. */
403 Lisp_Object Vdefault_process_coding_system
;
405 /* Char table for translating Quail and self-inserting input. */
406 Lisp_Object Vtranslation_table_for_input
;
408 /* Two special coding systems. */
409 Lisp_Object Vsjis_coding_system
;
410 Lisp_Object Vbig5_coding_system
;
412 static void record_conversion_result (struct coding_system
*coding
,
413 enum coding_result_code result
);
414 static int detect_coding_utf_8
P_ ((struct coding_system
*,
415 struct coding_detection_info
*info
));
416 static void decode_coding_utf_8
P_ ((struct coding_system
*));
417 static int encode_coding_utf_8
P_ ((struct coding_system
*));
419 static int detect_coding_utf_16
P_ ((struct coding_system
*,
420 struct coding_detection_info
*info
));
421 static void decode_coding_utf_16
P_ ((struct coding_system
*));
422 static int encode_coding_utf_16
P_ ((struct coding_system
*));
424 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
425 struct coding_detection_info
*info
));
426 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
427 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
429 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
430 struct coding_detection_info
*info
));
431 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
432 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
434 static int detect_coding_sjis
P_ ((struct coding_system
*,
435 struct coding_detection_info
*info
));
436 static void decode_coding_sjis
P_ ((struct coding_system
*));
437 static int encode_coding_sjis
P_ ((struct coding_system
*));
439 static int detect_coding_big5
P_ ((struct coding_system
*,
440 struct coding_detection_info
*info
));
441 static void decode_coding_big5
P_ ((struct coding_system
*));
442 static int encode_coding_big5
P_ ((struct coding_system
*));
444 static int detect_coding_ccl
P_ ((struct coding_system
*,
445 struct coding_detection_info
*info
));
446 static void decode_coding_ccl
P_ ((struct coding_system
*));
447 static int encode_coding_ccl
P_ ((struct coding_system
*));
449 static void decode_coding_raw_text
P_ ((struct coding_system
*));
450 static int encode_coding_raw_text
P_ ((struct coding_system
*));
453 /* ISO2022 section */
455 #define CODING_ISO_INITIAL(coding, reg) \
456 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
457 coding_attr_iso_initial), \
461 #define CODING_ISO_REQUEST(coding, charset_id) \
462 ((charset_id <= (coding)->max_charset_id \
463 ? (coding)->safe_charsets[charset_id] \
467 #define CODING_ISO_FLAGS(coding) \
468 ((coding)->spec.iso_2022.flags)
469 #define CODING_ISO_DESIGNATION(coding, reg) \
470 ((coding)->spec.iso_2022.current_designation[reg])
471 #define CODING_ISO_INVOCATION(coding, plane) \
472 ((coding)->spec.iso_2022.current_invocation[plane])
473 #define CODING_ISO_SINGLE_SHIFTING(coding) \
474 ((coding)->spec.iso_2022.single_shifting)
475 #define CODING_ISO_BOL(coding) \
476 ((coding)->spec.iso_2022.bol)
477 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
478 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480 /* Control characters of ISO2022. */
481 /* code */ /* function */
482 #define ISO_CODE_LF 0x0A /* line-feed */
483 #define ISO_CODE_CR 0x0D /* carriage-return */
484 #define ISO_CODE_SO 0x0E /* shift-out */
485 #define ISO_CODE_SI 0x0F /* shift-in */
486 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
487 #define ISO_CODE_ESC 0x1B /* escape */
488 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
489 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
490 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
492 /* All code (1-byte) of ISO2022 is classified into one of the
494 enum iso_code_class_type
496 ISO_control_0
, /* Control codes in the range
497 0x00..0x1F and 0x7F, except for the
498 following 4 codes. */
499 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
500 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
501 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
502 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
503 ISO_control_1
, /* Control codes in the range
504 0x80..0x9F, except for the
505 following 3 codes. */
506 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
507 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
508 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
509 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
510 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
511 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
512 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
515 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
516 `iso-flags' attribute of an iso2022 coding system. */
518 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
519 instead of the correct short-form sequence (e.g. ESC $ A). */
520 #define CODING_ISO_FLAG_LONG_FORM 0x0001
522 /* If set, reset graphic planes and registers at end-of-line to the
524 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
526 /* If set, reset graphic planes and registers before any control
527 characters to the initial state. */
528 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
530 /* If set, encode by 7-bit environment. */
531 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
533 /* If set, use locking-shift function. */
534 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
536 /* If set, use single-shift function. Overwrite
537 CODING_ISO_FLAG_LOCKING_SHIFT. */
538 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
540 /* If set, use designation escape sequence. */
541 #define CODING_ISO_FLAG_DESIGNATION 0x0040
543 /* If set, produce revision number sequence. */
544 #define CODING_ISO_FLAG_REVISION 0x0080
546 /* If set, produce ISO6429's direction specifying sequence. */
547 #define CODING_ISO_FLAG_DIRECTION 0x0100
549 /* If set, assume designation states are reset at beginning of line on
551 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
553 /* If set, designation sequence should be placed at beginning of line
555 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
557 /* If set, do not encode unsafe characters on output. */
558 #define CODING_ISO_FLAG_SAFE 0x0800
560 /* If set, extra latin codes (128..159) are accepted as a valid code
562 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
564 #define CODING_ISO_FLAG_COMPOSITION 0x2000
566 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
568 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
570 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
572 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
574 /* A character to be produced on output if encoding of the original
575 character is prohibited by CODING_ISO_FLAG_SAFE. */
576 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
580 #define CODING_UTF_16_BOM(coding) \
581 ((coding)->spec.utf_16.bom)
583 #define CODING_UTF_16_ENDIAN(coding) \
584 ((coding)->spec.utf_16.endian)
586 #define CODING_UTF_16_SURROGATE(coding) \
587 ((coding)->spec.utf_16.surrogate)
591 #define CODING_CCL_DECODER(coding) \
592 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
593 #define CODING_CCL_ENCODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
595 #define CODING_CCL_VALIDS(coding) \
596 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
598 /* Index for each coding category in `coding_categories' */
602 coding_category_iso_7
,
603 coding_category_iso_7_tight
,
604 coding_category_iso_8_1
,
605 coding_category_iso_8_2
,
606 coding_category_iso_7_else
,
607 coding_category_iso_8_else
,
608 coding_category_utf_8
,
609 coding_category_utf_16_auto
,
610 coding_category_utf_16_be
,
611 coding_category_utf_16_le
,
612 coding_category_utf_16_be_nosig
,
613 coding_category_utf_16_le_nosig
,
614 coding_category_charset
,
615 coding_category_sjis
,
616 coding_category_big5
,
618 coding_category_emacs_mule
,
619 /* All above are targets of code detection. */
620 coding_category_raw_text
,
621 coding_category_undecided
,
625 /* Definitions of flag bits used in detect_coding_XXXX. */
626 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
627 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
628 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
629 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
630 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
631 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
632 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
633 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
634 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
635 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
636 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
637 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
638 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
639 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
640 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
641 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
642 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
643 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
645 /* This value is returned if detect_coding_mask () finds nothing other
646 than ASCII characters. */
647 #define CATEGORY_MASK_ANY \
648 (CATEGORY_MASK_ISO_7 \
649 | CATEGORY_MASK_ISO_7_TIGHT \
650 | CATEGORY_MASK_ISO_8_1 \
651 | CATEGORY_MASK_ISO_8_2 \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE \
654 | CATEGORY_MASK_UTF_8 \
655 | CATEGORY_MASK_UTF_16_BE \
656 | CATEGORY_MASK_UTF_16_LE \
657 | CATEGORY_MASK_UTF_16_BE_NOSIG \
658 | CATEGORY_MASK_UTF_16_LE_NOSIG \
659 | CATEGORY_MASK_CHARSET \
660 | CATEGORY_MASK_SJIS \
661 | CATEGORY_MASK_BIG5 \
662 | CATEGORY_MASK_CCL \
663 | CATEGORY_MASK_EMACS_MULE)
666 #define CATEGORY_MASK_ISO_7BIT \
667 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
669 #define CATEGORY_MASK_ISO_8BIT \
670 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
672 #define CATEGORY_MASK_ISO_ELSE \
673 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
675 #define CATEGORY_MASK_ISO_ESCAPE \
676 (CATEGORY_MASK_ISO_7 \
677 | CATEGORY_MASK_ISO_7_TIGHT \
678 | CATEGORY_MASK_ISO_7_ELSE \
679 | CATEGORY_MASK_ISO_8_ELSE)
681 #define CATEGORY_MASK_ISO \
682 ( CATEGORY_MASK_ISO_7BIT \
683 | CATEGORY_MASK_ISO_8BIT \
684 | CATEGORY_MASK_ISO_ELSE)
686 #define CATEGORY_MASK_UTF_16 \
687 (CATEGORY_MASK_UTF_16_BE \
688 | CATEGORY_MASK_UTF_16_LE \
689 | CATEGORY_MASK_UTF_16_BE_NOSIG \
690 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693 /* List of symbols `coding-category-xxx' ordered by priority. This
694 variable is exposed to Emacs Lisp. */
695 static Lisp_Object Vcoding_category_list
;
697 /* Table of coding categories (Lisp symbols). This variable is for
699 static Lisp_Object Vcoding_category_table
;
701 /* Table of coding-categories ordered by priority. */
702 static enum coding_category coding_priorities
[coding_category_max
];
704 /* Nth element is a coding context for the coding system bound to the
705 Nth coding category. */
706 static struct coding_system coding_categories
[coding_category_max
];
708 /*** Commonly used macros and functions ***/
/* Yield the smaller of A and B.  The selected argument is evaluated
   twice, so do not pass expressions with side effects.  */
711 #define min(a, b) ((a) < (b) ? (a) : (b))
/* Yield the larger of A and B.  The selected argument is evaluated
   twice, so do not pass expressions with side effects.  */
714 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #define CODING_GET_INFO(coding, attrs, charset_list) \
719 (attrs) = CODING_ID_ATTRS ((coding)->id); \
720 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. If multibytep is
727 nonzero, and a multibyte character is found at SRC, set C to the
728 negative value of the character code. The caller should declare
729 and set these variables appropriately in advance:
730 src, src_base, src_end, multibytep, coding */
732 #define ONE_MORE_BYTE(c) \
734 if (src == src_end) \
736 if (src_base < src) \
737 record_conversion_result \
738 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
739 goto no_more_source; \
742 if (multibytep && (c & 0x80)) \
744 if ((c & 0xFE) == 0xC0) \
745 c = ((c & 1) << 6) | *src++; \
748 c = - string_char (--src, &src, NULL); \
749 record_conversion_result \
750 (coding, CODING_RESULT_INVALID_SRC); \
757 #define ONE_MORE_BYTE_NO_CHECK(c) \
760 if (multibytep && (c & 0x80)) \
762 if ((c & 0xFE) == 0xC0) \
763 c = ((c & 1) << 6) | *src++; \
766 c = - string_char (--src, &src, NULL); \
767 record_conversion_result \
768 (coding, CODING_RESULT_INVALID_SRC); \
775 /* Store a byte C in the place pointed by DST and increment DST to the
776 next free point, and increment PRODUCED_CHARS. The caller should
777 assure that C is 0..127, and declare and set the variable `dst'
778 appropriately in advance.
782 #define EMIT_ONE_ASCII_BYTE(c) \
789 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
791 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 produced_chars += 2; \
794 *dst++ = (c1), *dst++ = (c2); \
798 /* Store a byte C in the place pointed by DST and increment DST to the
799 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
800 nonzero, store in an appropriate multibyte form. The caller should
801 declare and set the variables `dst' and `multibytep' appropriately
804 #define EMIT_ONE_BYTE(c) \
811 ch = BYTE8_TO_CHAR (ch); \
812 CHAR_STRING_ADVANCE (ch, dst); \
819 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821 #define EMIT_TWO_BYTES(c1, c2) \
823 produced_chars += 2; \
830 ch = BYTE8_TO_CHAR (ch); \
831 CHAR_STRING_ADVANCE (ch, dst); \
834 ch = BYTE8_TO_CHAR (ch); \
835 CHAR_STRING_ADVANCE (ch, dst); \
845 #define EMIT_THREE_BYTES(c1, c2, c3) \
847 EMIT_ONE_BYTE (c1); \
848 EMIT_TWO_BYTES (c2, c3); \
852 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
854 EMIT_TWO_BYTES (c1, c2); \
855 EMIT_TWO_BYTES (c3, c4); \
/* Record RESULT as the outcome of the conversion described by CODING.
   RESULT is stored in CODING->result.  For each of the error codes
   listed below, Vlast_code_conversion_error is additionally set to the
   Lisp symbol that names that error.  */
860 record_conversion_result (struct coding_system
*coding
,
861 enum coding_result_code result
)
863 coding
->result
= result
;
/* Map each error code to its Lisp error symbol.  */
866 case CODING_RESULT_INSUFFICIENT_SRC
:
867 Vlast_code_conversion_error
= Qinsufficient_source
;
869 case CODING_RESULT_INCONSISTENT_EOL
:
870 Vlast_code_conversion_error
= Qinconsistent_eol
;
872 case CODING_RESULT_INVALID_SRC
:
873 Vlast_code_conversion_error
= Qinvalid_source
;
875 case CODING_RESULT_INTERRUPT
:
876 Vlast_code_conversion_error
= Qinterrupted
;
878 case CODING_RESULT_INSUFFICIENT_MEM
:
879 Vlast_code_conversion_error
= Qinsufficient_memory
;
884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
886 charset_map_loaded = 0; \
887 c = DECODE_CHAR (charset, code); \
888 if (charset_map_loaded) \
890 const unsigned char *orig = coding->source; \
893 coding_set_source (coding); \
894 offset = coding->source - orig; \
896 src_base += offset; \
902 #define ASSURE_DESTINATION(bytes) \
904 if (dst + (bytes) >= dst_end) \
906 int more_bytes = charbuf_end - charbuf + (bytes); \
908 dst = alloc_destination (coding, more_bytes, dst); \
909 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte.  A buffer or string source yields a fresh
   address; any other kind of source object leaves CODING->source
   untouched.  */
916 coding_set_source (coding
)
917 struct coding_system
*coding
;
919 if (BUFFERP (coding
->src_object
))
921 struct buffer
*buf
= XBUFFER (coding
->src_object
);
/* A negative src_pos means src_pos_byte is an offset added to the
   address of the end of BUF's gap; otherwise src_pos_byte is an
   ordinary byte position in BUF.  */
923 if (coding
->src_pos
< 0)
924 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
926 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
/* For a string source, src_pos_byte is an offset into the string data.  */
928 else if (STRINGP (coding
->src_object
))
930 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
933 /* Otherwise, the source is a C string and is never relocated
934 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when the
   destination object of CODING is a buffer.  Any other kind of
   destination object leaves both members untouched.  */
939 coding_set_destination (coding
)
940 struct coding_system
*coding
;
942 if (BUFFERP (coding
->dst_object
))
/* With a negative src_pos, the destination is addressed from BEG_ADDR
   and its size stops short of GAP_END_ADDR by the number of source
   bytes not yet consumed (src_bytes - consumed).  */
944 if (coding
->src_pos
< 0)
946 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
947 coding
->dst_bytes
= (GAP_END_ADDR
948 - (coding
->src_bytes
- coding
->consumed
)
949 - coding
->destination
);
953 /* We are sure that coding->dst_pos_byte is before the gap
955 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
956 + coding
->dst_pos_byte
- 1);
957 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
958 - coding
->destination
);
962 /* Otherwise, the destination is C string and is never relocated
963 automatically. Thus we don't have to update anything. */
/* Enlarge the destination area of CODING by BYTES bytes.
   CODING->destination is replaced by the pointer xrealloc returns for
   a size of CODING->dst_bytes + BYTES, and CODING->dst_bytes is then
   increased by BYTES.  */
969 coding_alloc_by_realloc (coding
, bytes
)
970 struct coding_system
*coding
;
973 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
974 coding
->dst_bytes
+ bytes
);
975 coding
->dst_bytes
+= bytes
;
/* Make room in a buffer destination of CODING.
   NOTE(review): the statement that actually enlarges the gap by BYTES
   is not visible in this excerpt; only the bookkeeping around it is.  */
979 coding_alloc_by_making_gap (coding
, bytes
)
980 struct coding_system
*coding
;
/* Source and destination are the same buffer.  */
983 if (BUFFERP (coding
->dst_object
)
984 && EQ (coding
->src_object
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed.  */
986 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily take ADD bytes out of the gap size and add them to the
   buffer end positions ...  */
988 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* ... and undo exactly that adjustment afterwards.  */
990 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
/* Otherwise remember the current buffer, switch to the destination
   buffer, and switch back when done.  */
994 Lisp_Object this_buffer
;
996 this_buffer
= Fcurrent_buffer ();
997 set_buffer_internal (XBUFFER (coding
->dst_object
));
999 set_buffer_internal (XBUFFER (this_buffer
));
/* Extend the destination area of CODING by NBYTES bytes.  DST points
   into the current destination area.  Its offset from
   CODING->destination is saved before the allocation and used to
   recompute DST afterwards, because the allocation step may change
   CODING->destination.  */
1004 static unsigned char *
1005 alloc_destination (coding
, nbytes
, dst
)
1006 struct coding_system
*coding
;
1010 EMACS_INT offset
= dst
- coding
->destination
;
/* A buffer destination grows by enlarging the gap; any other
   destination grows by reallocation.  */
1012 if (BUFFERP (coding
->dst_object
))
1013 coding_alloc_by_making_gap (coding
, nbytes
);
1015 coding_alloc_by_realloc (coding
, nbytes
);
1016 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
/* Refresh CODING->destination, then rebase DST onto it.  */
1017 coding_set_destination (coding
);
1018 dst
= coding
->destination
+ offset
;
1022 /** Macros for annotations. */
1024 /* Maximum length of annotation data (sum of annotations for
1025 composition and charset). */
1026 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1028 /* An annotation data is stored in the array coding->charbuf in this
1030 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1031 LENGTH is the number of elements in the annotation.
1032 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1033 FROM and TO specify the range of text annotated. They are relative
1034 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1036 The format of the following elements depends on ANNOTATION_MASK.
1038 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1040 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1041 METHOD is one of enum composition_method.
1042 Optional COMPOSITION-COMPONENTS are characters and composition
1045 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1050 *(buf)++ = -(len); \
1051 *(buf)++ = (mask); \
1052 *(buf)++ = (from); \
1054 coding->annotated = 1; \
1057 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1059 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1064 #define ADD_CHARSET_DATA(buf, from, to, id) \
1066 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1071 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1078 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1079 Check if a text is encoded in UTF-8. If it is, return 1, else
1082 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1083 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1084 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1085 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1086 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1087 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090 detect_coding_utf_8 (coding
, detect_info
)
1091 struct coding_system
*coding
;
1092 struct coding_detection_info
*detect_info
;
1094 const unsigned char *src
= coding
->source
, *src_base
;
1095 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1096 int multibytep
= coding
->src_multibyte
;
1097 int consumed_chars
= 0;
1100 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1101 /* A coding system of this category is always ASCII compatible. */
1102 src
+= coding
->head_ascii
;
1106 int c
, c1
, c2
, c3
, c4
;
1110 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1113 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1115 if (UTF_8_2_OCTET_LEADING_P (c
))
1117 found
= CATEGORY_MASK_UTF_8
;
1121 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1123 if (UTF_8_3_OCTET_LEADING_P (c
))
1125 found
= CATEGORY_MASK_UTF_8
;
1129 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1131 if (UTF_8_4_OCTET_LEADING_P (c
))
1133 found
= CATEGORY_MASK_UTF_8
;
1137 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1139 if (UTF_8_5_OCTET_LEADING_P (c
))
1141 found
= CATEGORY_MASK_UTF_8
;
1146 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1150 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1152 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1155 detect_info
->found
|= found
;
1161 decode_coding_utf_8 (coding
)
1162 struct coding_system
*coding
;
1164 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1165 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1166 const unsigned char *src_base
;
1167 int *charbuf
= coding
->charbuf
;
1168 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1169 int consumed_chars
= 0, consumed_chars_base
;
1170 int multibytep
= coding
->src_multibyte
;
1171 Lisp_Object attr
, charset_list
;
1173 CODING_GET_INFO (coding
, attr
, charset_list
);
1177 int c
, c1
, c2
, c3
, c4
, c5
;
1180 consumed_chars_base
= consumed_chars
;
1182 if (charbuf
>= charbuf_end
)
1190 else if (UTF_8_1_OCTET_P(c1
))
1197 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1199 if (UTF_8_2_OCTET_LEADING_P (c1
))
1201 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1202 /* Reject overlong sequences here and below. Encoders
1203 producing them are incorrect, they can be misleading,
1204 and they mess up read/write invariance. */
1211 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1213 if (UTF_8_3_OCTET_LEADING_P (c1
))
1215 c
= (((c1
& 0xF) << 12)
1216 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1218 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1224 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1226 if (UTF_8_4_OCTET_LEADING_P (c1
))
1228 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1229 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1236 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1238 if (UTF_8_5_OCTET_LEADING_P (c1
))
1240 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1241 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1243 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1258 consumed_chars
= consumed_chars_base
;
1260 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1265 coding
->consumed_char
+= consumed_chars_base
;
1266 coding
->consumed
= src_base
- coding
->source
;
1267 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1272 encode_coding_utf_8 (coding
)
1273 struct coding_system
*coding
;
1275 int multibytep
= coding
->dst_multibyte
;
1276 int *charbuf
= coding
->charbuf
;
1277 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1278 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1279 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1280 int produced_chars
= 0;
1285 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1287 while (charbuf
< charbuf_end
)
1289 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1291 ASSURE_DESTINATION (safe_room
);
1293 if (CHAR_BYTE8_P (c
))
1295 c
= CHAR_TO_BYTE8 (c
);
1300 CHAR_STRING_ADVANCE (c
, pend
);
1301 for (p
= str
; p
< pend
; p
++)
1308 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1310 while (charbuf
< charbuf_end
)
1312 ASSURE_DESTINATION (safe_room
);
1314 dst
+= CHAR_STRING (c
, dst
);
1318 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1319 coding
->produced_char
+= produced_chars
;
1320 coding
->produced
= dst
- coding
->destination
;
1325 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1326 Check if a text is encoded in one of the UTF-16 based coding systems.
1327 If it is, return 1, else return 0. */
1329 #define UTF_16_HIGH_SURROGATE_P(val) \
1330 (((val) & 0xFC00) == 0xD800)
1332 #define UTF_16_LOW_SURROGATE_P(val) \
1333 (((val) & 0xFC00) == 0xDC00)
1335 #define UTF_16_INVALID_P(val) \
1336 (((val) == 0xFFFE) \
1337 || ((val) == 0xFFFF) \
1338 || UTF_16_LOW_SURROGATE_P (val))
1342 detect_coding_utf_16 (coding
, detect_info
)
1343 struct coding_system
*coding
;
1344 struct coding_detection_info
*detect_info
;
1346 const unsigned char *src
= coding
->source
, *src_base
= src
;
1347 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1348 int multibytep
= coding
->src_multibyte
;
1349 int consumed_chars
= 0;
1352 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1353 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1354 && (coding
->src_chars
& 1))
1356 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1362 if ((c1
== 0xFF) && (c2
== 0xFE))
1364 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1365 | CATEGORY_MASK_UTF_16_AUTO
);
1366 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1367 | CATEGORY_MASK_UTF_16_BE_NOSIG
1368 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1370 else if ((c1
== 0xFE) && (c2
== 0xFF))
1372 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1373 | CATEGORY_MASK_UTF_16_AUTO
);
1374 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1375 | CATEGORY_MASK_UTF_16_BE_NOSIG
1376 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1378 else if (c1
>= 0 && c2
>= 0)
1380 unsigned char b1
[256], b2
[256];
1381 int b1_variants
= 1, b2_variants
= 1;
1384 bzero (b1
, 256), bzero (b2
, 256);
1386 for (n
= 0; n
< 256 && src
< src_end
; n
++)
1391 if (c1
< 0 || c2
< 0)
1393 if (! b1
[c1
++]) b1_variants
++;
1394 if (! b2
[c2
++]) b2_variants
++;
1396 if (b1_variants
< b2_variants
)
1397 detect_info
->found
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1399 detect_info
->found
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1400 detect_info
->rejected
1401 |= (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_LE
);
1408 decode_coding_utf_16 (coding
)
1409 struct coding_system
*coding
;
1411 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1412 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1413 const unsigned char *src_base
;
1414 int *charbuf
= coding
->charbuf
;
1415 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1416 int consumed_chars
= 0, consumed_chars_base
;
1417 int multibytep
= coding
->src_multibyte
;
1418 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1419 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1420 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1421 Lisp_Object attr
, charset_list
;
1423 CODING_GET_INFO (coding
, attr
, charset_list
);
1425 if (bom
== utf_16_with_bom
)
1434 if (endian
== utf_16_big_endian
1435 ? c
!= 0xFEFF : c
!= 0xFFFE)
1437 /* The first two bytes are not BOM. Treat them as bytes
1438 for a normal character. */
1442 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1444 else if (bom
== utf_16_detect_bom
)
1446 /* We have already tried to detect BOM and failed in
1448 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1456 consumed_chars_base
= consumed_chars
;
1458 if (charbuf
+ 2 >= charbuf_end
)
1470 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1474 c
= (endian
== utf_16_big_endian
1475 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1478 if (! UTF_16_LOW_SURROGATE_P (c
))
1480 if (endian
== utf_16_big_endian
)
1481 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1483 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1487 if (UTF_16_HIGH_SURROGATE_P (c
))
1488 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1494 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1495 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1501 if (UTF_16_HIGH_SURROGATE_P (c
))
1502 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1509 coding
->consumed_char
+= consumed_chars_base
;
1510 coding
->consumed
= src_base
- coding
->source
;
1511 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1515 encode_coding_utf_16 (coding
)
1516 struct coding_system
*coding
;
1518 int multibytep
= coding
->dst_multibyte
;
1519 int *charbuf
= coding
->charbuf
;
1520 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1521 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1522 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1524 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1525 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1526 int produced_chars
= 0;
1527 Lisp_Object attrs
, charset_list
;
1530 CODING_GET_INFO (coding
, attrs
, charset_list
);
1532 if (bom
!= utf_16_without_bom
)
1534 ASSURE_DESTINATION (safe_room
);
1536 EMIT_TWO_BYTES (0xFE, 0xFF);
1538 EMIT_TWO_BYTES (0xFF, 0xFE);
1539 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1542 while (charbuf
< charbuf_end
)
1544 ASSURE_DESTINATION (safe_room
);
1546 if (c
>= MAX_UNICODE_CHAR
)
1547 c
= coding
->default_char
;
1552 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1554 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1561 c1
= (c
>> 10) + 0xD800;
1562 c2
= (c
& 0x3FF) + 0xDC00;
1564 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1566 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1569 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1570 coding
->produced
= dst
- coding
->destination
;
1571 coding
->produced_char
+= produced_chars
;
1576 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1578 /* Emacs' internal format for representation of multiple character
1579 sets is a kind of multi-byte encoding, i.e. characters are
1580 represented by variable-length sequences of one-byte codes.
1582 ASCII characters and control characters (e.g. `tab', `newline') are
1583 represented by one-byte sequences which are their ASCII codes, in
1584 the range 0x00 through 0x7F.
1586 8-bit characters of the range 0x80..0x9F are represented by
1587 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590 8-bit characters of the range 0xA0..0xFF are represented by
1591 one-byte sequences which are their 8-bit code.
1593 The other characters are represented by a sequence of `base
1594 leading-code', optional `extended leading-code', and one or two
1595 `position-code's. The length of the sequence is determined by the
1596 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1597 whereas extended leading-code and position-code take the range 0xA0
1598 through 0xFF. See `charset.h' for more details about leading-code
1601 --- CODE RANGE of Emacs' internal format ---
1605 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1606 eight-bit-graphic 0xA0..0xFF
1607 ELSE 0x81..0x9D + [0xA0..0xFF]+
1608 ---------------------------------------------
1610 As this is the internal character representation, the format is
1611 usually not used externally (i.e. in a file or in a data sent to a
1612 process). But, it is possible to have a text externally in this
1613 format (i.e. by encoding by the coding system `emacs-mule').
1615 In that case, a sequence of one-byte codes has a slightly different
1618 At first, all characters in eight-bit-control are represented by
1619 one-byte sequences which are their 8-bit code.
1621 Next, character composition data are represented by the byte
1622 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1624 METHOD is 0xF2 plus one of the composition methods (enum
1625 composition_method),
1627 BYTES is 0xA0 plus a byte length of this composition data,
1629 CHARS is 0xA0 plus a number of characters composed by this
1632 COMPONENTs are characters in multibyte form or composition
1633 rules encoded as two bytes of ASCII codes.
1635 In addition, for backward compatibility, the following formats are
1636 also recognized as composition data on decoding.
1639 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642 MSEQ is a multibyte form, but in one of these special formats:
1643 ASCII: 0xA0 ASCII_CODE+0x80,
1644 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1645 RULE is a one byte code of the range 0xA0..0xF0 that
1646 represents a composition rule.
1649 char emacs_mule_bytes
[256];
1652 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1653 struct coding_system
*coding
;
1654 const unsigned char *src
;
1655 int *nbytes
, *nchars
, *id
;
1657 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1658 const unsigned char *src_base
= src
;
1659 int multibytep
= coding
->src_multibyte
;
1660 struct charset
*charset
;
1663 int consumed_chars
= 0;
1669 charset
= emacs_mule_charset
[0];
1673 switch (emacs_mule_bytes
[c
])
1676 if (! (charset
= emacs_mule_charset
[c
]))
1685 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1686 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1689 if (c
< 0 || ! (charset
= emacs_mule_charset
[c
]))
1698 if (! (charset
= emacs_mule_charset
[c
]))
1703 code
= (c
& 0x7F) << 8;
1713 if (c
< 0 || ! (charset
= emacs_mule_charset
[c
]))
1718 code
= (c
& 0x7F) << 8;
1727 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1728 ? charset_ascii
: charset_eight_bit
);
1734 c
= DECODE_CHAR (charset
, code
);
1738 *nbytes
= src
- src_base
;
1739 *nchars
= consumed_chars
;
1752 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1753 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1757 detect_coding_emacs_mule (coding
, detect_info
)
1758 struct coding_system
*coding
;
1759 struct coding_detection_info
*detect_info
;
1761 const unsigned char *src
= coding
->source
, *src_base
;
1762 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1763 int multibytep
= coding
->src_multibyte
;
1764 int consumed_chars
= 0;
1768 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1769 /* A coding system of this category is always ASCII compatible. */
1770 src
+= coding
->head_ascii
;
1780 /* Perhaps the start of a composite character. We simply skip
1781 it because analyzing it is too heavy for detecting. But,
1782 at least, we check that the composite character
1783 consists of more than 4 bytes. */
1784 const unsigned char *src_base
;
1794 if (src
- src_base
<= 4)
1796 found
= CATEGORY_MASK_EMACS_MULE
;
1804 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1809 const unsigned char *src_base
= src
- 1;
1816 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1818 found
= CATEGORY_MASK_EMACS_MULE
;
1821 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1825 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1827 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1830 detect_info
->found
|= found
;
1835 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1837 /* Decode a character represented as a component of composition
1838 sequence of Emacs 20/21 style at SRC. Set C to that character and
1839 update SRC to the head of next character (or an encoded composition
1840 rule). If SRC doesn't point to a composition component, set C to -1.
1841 If SRC points to an invalid byte sequence, global exit by a return
1844 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1848 int nbytes, nchars; \
1850 if (src == src_end) \
1852 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1857 goto invalid_code; \
1861 consumed_chars += nchars; \
1866 /* Decode a composition rule represented as a component of composition
1867 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1868 and increment BUF. If SRC points to an invalid byte sequence, set C
1871 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1873 int c, gref, nref; \
1875 if (src >= src_end) \
1876 goto invalid_code; \
1877 ONE_MORE_BYTE_NO_CHECK (c); \
1879 if (c < 0 || c >= 81) \
1880 goto invalid_code; \
1882 gref = c / 9, nref = c % 9; \
1883 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1887 /* Decode a composition rule represented as a component of composition
1888 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1889 and increment BUF. If SRC points to an invalid byte sequence, set C
1892 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1896 if (src + 1>= src_end) \
1897 goto invalid_code; \
1898 ONE_MORE_BYTE_NO_CHECK (gref); \
1900 ONE_MORE_BYTE_NO_CHECK (nref); \
1902 if (gref < 0 || gref >= 81 \
1903 || nref < 0 || nref >= 81) \
1904 goto invalid_code; \
1905 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1909 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1911 /* Emacs 21 style format. The first three bytes at SRC are \
1912 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1913 the byte length of this composition information, CHARS is the \
1914 number of characters composed by this composition. */ \
1915 enum composition_method method = c - 0xF2; \
1916 int *charbuf_base = charbuf; \
1918 int consumed_chars_limit; \
1919 int nbytes, nchars; \
1921 ONE_MORE_BYTE (c); \
1923 goto invalid_code; \
1924 nbytes = c - 0xA0; \
1926 goto invalid_code; \
1927 ONE_MORE_BYTE (c); \
1929 goto invalid_code; \
1930 nchars = c - 0xA0; \
1931 from = coding->produced + char_offset; \
1932 to = from + nchars; \
1933 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1934 consumed_chars_limit = consumed_chars_base + nbytes; \
1935 if (method != COMPOSITION_RELATIVE) \
1938 while (consumed_chars < consumed_chars_limit) \
1940 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1941 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1943 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1946 if (consumed_chars < consumed_chars_limit) \
1947 goto invalid_code; \
1948 charbuf_base[0] -= i; \
1953 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1955 /* Emacs 20 style format for relative composition. */ \
1956 /* Store multibyte form of characters to be composed. */ \
1957 enum composition_method method = COMPOSITION_RELATIVE; \
1958 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1959 int *buf = components; \
1964 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1965 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1966 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1968 goto invalid_code; \
1969 from = coding->produced_char + char_offset; \
1971 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1972 for (j = 0; j < i; j++) \
1973 *charbuf++ = components[j]; \
1977 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1979 /* Emacs 20 style format for rule-base composition. */ \
1980 /* Store multibyte form of characters to be composed. */ \
1981 enum composition_method method = COMPOSITION_WITH_RULE; \
1982 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1983 int *buf = components; \
1987 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1988 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1990 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1991 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1993 if (i < 1 || (buf - components) % 2 == 0) \
1994 goto invalid_code; \
1995 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1996 goto no_more_source; \
1997 from = coding->produced_char + char_offset; \
1999 ADD_COMPOSITION_DATA (buf, from, to, method); \
2000 for (j = 0; j < i; j++) \
2001 *charbuf++ = components[j]; \
2002 for (j = 0; j < i; j += 2) \
2003 *charbuf++ = components[j]; \
2008 decode_coding_emacs_mule (coding
)
2009 struct coding_system
*coding
;
2011 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2012 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2013 const unsigned char *src_base
;
2014 int *charbuf
= coding
->charbuf
;
2015 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
2016 int consumed_chars
= 0, consumed_chars_base
;
2017 int multibytep
= coding
->src_multibyte
;
2018 Lisp_Object attrs
, charset_list
;
2019 int char_offset
= coding
->produced_char
;
2020 int last_offset
= char_offset
;
2021 int last_id
= charset_ascii
;
2023 CODING_GET_INFO (coding
, attrs
, charset_list
);
2030 consumed_chars_base
= consumed_chars
;
2032 if (charbuf
>= charbuf_end
)
2051 if (c
- 0xF2 >= COMPOSITION_RELATIVE
2052 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
2053 DECODE_EMACS_MULE_21_COMPOSITION (c
);
2055 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
2057 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
2061 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
2067 consumed_chars
= consumed_chars_base
;
2068 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2077 if (last_id
!= charset_ascii
)
2078 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2080 last_offset
= char_offset
;
2084 consumed_chars
+= nchars
;
2091 consumed_chars
= consumed_chars_base
;
2093 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2099 if (last_id
!= charset_ascii
)
2100 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2101 coding
->consumed_char
+= consumed_chars_base
;
2102 coding
->consumed
= src_base
- coding
->source
;
2103 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2107 #define EMACS_MULE_LEADING_CODES(id, codes) \
2110 codes[0] = id, codes[1] = 0; \
2111 else if (id < 0xE0) \
2112 codes[0] = 0x9A, codes[1] = id; \
2113 else if (id < 0xF0) \
2114 codes[0] = 0x9B, codes[1] = id; \
2115 else if (id < 0xF5) \
2116 codes[0] = 0x9C, codes[1] = id; \
2118 codes[0] = 0x9D, codes[1] = id; \
2123 encode_coding_emacs_mule (coding
)
2124 struct coding_system
*coding
;
2126 int multibytep
= coding
->dst_multibyte
;
2127 int *charbuf
= coding
->charbuf
;
2128 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2129 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2130 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2132 int produced_chars
= 0;
2133 Lisp_Object attrs
, charset_list
;
2135 int preferred_charset_id
= -1;
2137 CODING_GET_INFO (coding
, attrs
, charset_list
);
2138 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2140 CODING_ATTR_CHARSET_LIST (attrs
)
2141 = charset_list
= Vemacs_mule_charset_list
;
2144 while (charbuf
< charbuf_end
)
2146 ASSURE_DESTINATION (safe_room
);
2151 /* Handle an annotation. */
2154 case CODING_ANNOTATE_COMPOSITION_MASK
:
2155 /* Not yet implemented. */
2157 case CODING_ANNOTATE_CHARSET_MASK
:
2158 preferred_charset_id
= charbuf
[3];
2159 if (preferred_charset_id
>= 0
2160 && NILP (Fmemq (make_number (preferred_charset_id
),
2162 preferred_charset_id
= -1;
2171 if (ASCII_CHAR_P (c
))
2172 EMIT_ONE_ASCII_BYTE (c
);
2173 else if (CHAR_BYTE8_P (c
))
2175 c
= CHAR_TO_BYTE8 (c
);
2180 struct charset
*charset
;
2184 unsigned char leading_codes
[2];
2186 if (preferred_charset_id
>= 0)
2188 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2189 if (! CHAR_CHARSET_P (c
, charset
))
2190 charset
= char_charset (c
, charset_list
, NULL
);
2193 charset
= char_charset (c
, charset_list
, &code
);
2196 c
= coding
->default_char
;
2197 if (ASCII_CHAR_P (c
))
2199 EMIT_ONE_ASCII_BYTE (c
);
2202 charset
= char_charset (c
, charset_list
, &code
);
2204 dimension
= CHARSET_DIMENSION (charset
);
2205 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2206 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2207 EMIT_ONE_BYTE (leading_codes
[0]);
2208 if (leading_codes
[1])
2209 EMIT_ONE_BYTE (leading_codes
[1]);
2211 EMIT_ONE_BYTE (code
| 0x80);
2215 EMIT_ONE_BYTE (code
>> 8);
2216 EMIT_ONE_BYTE (code
& 0xFF);
2220 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2221 coding
->produced_char
+= produced_chars
;
2222 coding
->produced
= dst
- coding
->destination
;
2227 /*** 7. ISO2022 handlers ***/
2229 /* The following note describes the coding system ISO2022 briefly.
2230 Since the intention of this note is to help understand the
2231 functions in this file, some parts are NOT ACCURATE or are OVERLY
2232 SIMPLIFIED. For thorough understanding, please refer to the
2233 original document of ISO2022. This is equivalent to the standard
2234 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2236 ISO2022 provides many mechanisms to encode several character sets
2237 in 7-bit and 8-bit environments. For 7-bit environments, all text
2238 is encoded using bytes less than 128. This may make the encoded
2239 text a little bit longer, but the text passes more easily through
2240 several types of gateway, some of which strip off the MSB (Most
2243 There are two kinds of character sets: control character sets and
2244 graphic character sets. The former contain control characters such
2245 as `newline' and `escape' to provide control functions (control
2246 functions are also provided by escape sequences). The latter
2247 contain graphic characters such as 'A' and '-'. Emacs recognizes
2248 two control character sets and many graphic character sets.
2250 Graphic character sets are classified into one of the following
2251 four classes, according to the number of bytes (DIMENSION) and
2252 number of characters in one dimension (CHARS) of the set:
2253 - DIMENSION1_CHARS94
2254 - DIMENSION1_CHARS96
2255 - DIMENSION2_CHARS94
2256 - DIMENSION2_CHARS96
2258 In addition, each character set is assigned an identification tag,
2259 unique for each set, called the "final character" (denoted as <F>
2260 hereafter). The <F> of each character set is decided by ECMA(*)
2261 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2262 (0x30..0x3F are for private use only).
2264 Note (*): ECMA = European Computer Manufacturers Association
2266 Here are examples of graphic character sets [NAME(<F>)]:
2267 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2268 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2269 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2270 o DIMENSION2_CHARS96 -- none for the moment
2272 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2273 C0 [0x00..0x1F] -- control character plane 0
2274 GL [0x20..0x7F] -- graphic character plane 0
2275 C1 [0x80..0x9F] -- control character plane 1
2276 GR [0xA0..0xFF] -- graphic character plane 1
2278 A control character set is directly designated and invoked to C0 or
2279 C1 by an escape sequence. The most common case is that:
2280 - ISO646's control character set is designated/invoked to C0, and
2281 - ISO6429's control character set is designated/invoked to C1,
2282 and usually these designations/invocations are omitted in encoded
2283 text. In a 7-bit environment, only C0 can be used, and a control
2284 character for C1 is encoded by an appropriate escape sequence to
2285 fit into the environment. All control characters for C1 are
2286 defined to have corresponding escape sequences.
2288 A graphic character set is at first designated to one of four
2289 graphic registers (G0 through G3), then these graphic registers are
2290 invoked to GL or GR. These designations and invocations can be
2291 done independently. The most common case is that G0 is invoked to
2292 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2293 these invocations and designations are omitted in encoded text.
2294 In a 7-bit environment, only GL can be used.
2296 When a graphic character set of CHARS94 is invoked to GL, codes
2297 0x20 and 0x7F of the GL area work as control characters SPACE and
2298 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2301 There are two ways of invocation: locking-shift and single-shift.
2302 With locking-shift, the invocation lasts until the next different
2303 invocation, whereas with single-shift, the invocation affects the
2304 following character only and doesn't affect the locking-shift
2305 state. Invocations are done by the following control characters or
2308 ----------------------------------------------------------------------
2309 abbrev function cntrl escape seq description
2310 ----------------------------------------------------------------------
2311 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2312 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2313 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2314 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2315 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2316 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2317 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2318 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2319 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2320 ----------------------------------------------------------------------
2321 (*) These are not used by any known coding system.
2323 Control characters for these functions are defined by macros
2324 ISO_CODE_XXX in `coding.h'.
2326 Designations are done by the following escape sequences:
2327 ----------------------------------------------------------------------
2328 escape sequence description
2329 ----------------------------------------------------------------------
2330 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2331 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2332 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2333 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2334 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2335 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2336 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2337 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2338 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2339 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2340 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2341 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2342 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2343 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2344 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2345 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2346 ----------------------------------------------------------------------
2348 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2349 of dimension 1, chars 94, and final character <F>, etc...
2351 Note (*): Although these designations are not allowed in ISO2022,
2352 Emacs accepts them on decoding, and produces them on encoding
2353 CHARS96 character sets in a coding system which is characterized as
2354 7-bit environment, non-locking-shift, and non-single-shift.
2356 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2357 '(' must be omitted. We refer to this as "short-form" hereafter.
2359 Now you may notice that there are a lot of ways of encoding the
2360 same multilingual text in ISO2022. Actually, there exist many
2361 coding systems such as Compound Text (used in X11's inter-client
2362 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2363 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2364 localized platforms), and all of these are variants of ISO2022.
2366 In addition to the above, Emacs handles two more kinds of escape
2367 sequences: ISO6429's direction specification and Emacs' private
2368 sequence for specifying character composition.
2370 ISO6429's direction specification takes the following form:
2371 o CSI ']' -- end of the current direction
2372 o CSI '0' ']' -- end of the current direction
2373 o CSI '1' ']' -- start of left-to-right text
2374 o CSI '2' ']' -- start of right-to-left text
2375 The control character CSI (0x9B: control sequence introducer) is
2376 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2378 Character composition specification takes the following form:
2379 o ESC '0' -- start relative composition
2380 o ESC '1' -- end composition
2381 o ESC '2' -- start rule-base composition (*)
2382 o ESC '3' -- start relative composition with alternate chars (**)
2383 o ESC '4' -- start rule-base composition with alternate chars (**)
2384 Since these are not standard escape sequences of any ISO standard,
2385 the use of them with these meanings is restricted to Emacs only.
2387 (*) This form is used only in Emacs 20.7 and older versions,
2388 but newer versions can safely decode it.
2389 (**) This form is used only in Emacs 21.1 and newer versions,
2390 and older versions can't decode it.
2392 Here's a list of example usages of these composition escape
2393 sequences (categorized by `enum composition_method').
2395 COMPOSITION_RELATIVE:
2396 ESC 0 CHAR [ CHAR ] ESC 1
2397 COMPOSITION_WITH_RULE:
2398 ESC 2 CHAR [ RULE CHAR ] ESC 1
2399 COMPOSITION_WITH_ALTCHARS:
2400 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2401 COMPOSITION_WITH_RULE_ALTCHARS:
2402 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table with one entry per byte value (256 entries) giving the ISO
   code class of that byte.  decode_coding_iso_2022 dispatches on
   iso_code_class[c1] to decide how to treat each source byte.  */
2404 enum iso_code_class_type iso_code_class
[256];
/* Return nonzero if the charset whose id is ID is safe for CODING:
   ID must not exceed CODING->max_charset_id, and the entry for ID in
   CODING->safe_charsets must not have its high bit set.

   `safe_charsets' is a plain `char' array, and the signedness of
   `char' is implementation-defined.  Testing the entry with `>= 0'
   is therefore always true on targets where `char' is unsigned, which
   would make every charset look safe.  Testing the high bit gives the
   same answer as `>= 0' does with a signed `char', on every target.  */
#define SAFE_CHARSET_P(coding, id)				\
  ((id) <= (coding)->max_charset_id				\
   && ((coding)->safe_charsets[(id)] & 0x80) == 0)
/* Nonzero if the coding system registered in coding_categories for
   CATEGORY has a non-negative initial designation for graphic
   register 1 (CODING_ISO_INITIAL (..., 1) >= 0).
   NOTE(review): presumably a negative value means "nothing designated
   to G1", so shift-out would have nothing to invoke -- confirm.  */
2411 #define SHIFT_OUT_OK(category) \
2412 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2415 setup_iso_safe_charsets (attrs
)
2418 Lisp_Object charset_list
, safe_charsets
;
2419 Lisp_Object request
;
2420 Lisp_Object reg_usage
;
2423 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2426 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2427 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2428 && ! EQ (charset_list
, Viso_2022_charset_list
))
2430 CODING_ATTR_CHARSET_LIST (attrs
)
2431 = charset_list
= Viso_2022_charset_list
;
2432 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2435 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2439 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2441 int id
= XINT (XCAR (tail
));
2442 if (max_charset_id
< id
)
2443 max_charset_id
= id
;
2446 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2448 request
= AREF (attrs
, coding_attr_iso_request
);
2449 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2450 reg94
= XINT (XCAR (reg_usage
));
2451 reg96
= XINT (XCDR (reg_usage
));
2453 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2457 struct charset
*charset
;
2460 charset
= CHARSET_FROM_ID (XINT (id
));
2461 reg
= Fcdr (Fassq (id
, request
));
2463 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2464 else if (charset
->iso_chars_96
)
2467 SSET (safe_charsets
, XINT (id
), reg96
);
2472 SSET (safe_charsets
, XINT (id
), reg94
);
2475 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2480 Check if a text is encoded in one of ISO-2022 based coding systems.
2481 If it is, return 1, else return 0. */
2484 detect_coding_iso_2022 (coding
, detect_info
)
2485 struct coding_system
*coding
;
2486 struct coding_detection_info
*detect_info
;
2488 const unsigned char *src
= coding
->source
, *src_base
= src
;
2489 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2490 int multibytep
= coding
->src_multibyte
;
2491 int single_shifting
= 0;
2494 int consumed_chars
= 0;
2499 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2501 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2503 struct coding_system
*this = &(coding_categories
[i
]);
2504 Lisp_Object attrs
, val
;
2506 attrs
= CODING_ID_ATTRS (this->id
);
2507 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2508 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2509 setup_iso_safe_charsets (attrs
);
2510 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2511 this->max_charset_id
= SCHARS (val
) - 1;
2512 this->safe_charsets
= (char *) SDATA (val
);
2515 /* A coding system of this category is always ASCII compatible. */
2516 src
+= coding
->head_ascii
;
2518 while (rejected
!= CATEGORY_MASK_ISO
)
2525 if (inhibit_iso_escape_detection
)
2527 single_shifting
= 0;
2529 if (c
>= '(' && c
<= '/')
2531 /* Designation sequence for a charset of dimension 1. */
2533 if (c1
< ' ' || c1
>= 0x80
2534 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2535 /* Invalid designation sequence. Just ignore. */
2540 /* Designation sequence for a charset of dimension 2. */
2542 if (c
>= '@' && c
<= 'B')
2543 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2544 id
= iso_charset_table
[1][0][c
];
2545 else if (c
>= '(' && c
<= '/')
2548 if (c1
< ' ' || c1
>= 0x80
2549 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2550 /* Invalid designation sequence. Just ignore. */
2554 /* Invalid designation sequence. Just ignore it. */
2557 else if (c
== 'N' || c
== 'O')
2559 /* ESC <Fe> for SS2 or SS3. */
2560 single_shifting
= 1;
2561 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2564 else if (c
>= '0' && c
<= '4')
2566 /* ESC <Fp> for start/end composition. */
2567 found
|= CATEGORY_MASK_ISO
;
2572 /* Invalid escape sequence. Just ignore it. */
2576 /* We found a valid designation sequence for CHARSET. */
2577 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2578 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2580 found
|= CATEGORY_MASK_ISO_7
;
2582 rejected
|= CATEGORY_MASK_ISO_7
;
2583 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2585 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2587 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2588 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2590 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2592 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2593 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2595 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2597 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2602 /* Locking shift out/in. */
2603 if (inhibit_iso_escape_detection
)
2605 single_shifting
= 0;
2606 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2607 found
|= CATEGORY_MASK_ISO_ELSE
;
2611 /* Control sequence introducer. */
2612 single_shifting
= 0;
2613 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2614 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2615 goto check_extra_latin
;
2621 if (inhibit_iso_escape_detection
)
2623 single_shifting
= 1;
2624 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2625 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2626 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2627 found
|= CATEGORY_MASK_ISO_8_1
;
2628 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2629 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2630 found
|= CATEGORY_MASK_ISO_8_2
;
2631 goto check_extra_latin
;
2638 single_shifting
= 0;
2643 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2644 found
|= CATEGORY_MASK_ISO_8_1
;
2645 /* Check the length of succeeding codes of the range
2646 0xA0..0xFF. If the byte length is even, we include
2647 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2648 only when we are not single shifting. */
2649 if (! single_shifting
2650 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2653 while (src
< src_end
)
2661 if (i
& 1 && src
< src_end
)
2662 rejected
|= CATEGORY_MASK_ISO_8_2
;
2664 found
|= CATEGORY_MASK_ISO_8_2
;
2669 single_shifting
= 0;
2670 if (! VECTORP (Vlatin_extra_code_table
)
2671 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2673 rejected
= CATEGORY_MASK_ISO
;
2676 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2677 & CODING_ISO_FLAG_LATIN_EXTRA
)
2678 found
|= CATEGORY_MASK_ISO_8_1
;
2680 rejected
|= CATEGORY_MASK_ISO_8_1
;
2681 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2682 & CODING_ISO_FLAG_LATIN_EXTRA
)
2683 found
|= CATEGORY_MASK_ISO_8_2
;
2685 rejected
|= CATEGORY_MASK_ISO_8_2
;
2688 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2692 detect_info
->rejected
|= rejected
;
2693 detect_info
->found
|= (found
& ~rejected
);
2698 /* Set designation state into CODING. */
2699 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2703 if (final < '0' || final >= 128 \
2704 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2705 || !SAFE_CHARSET_P (coding, id)) \
2707 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2708 goto invalid_code; \
2710 prev = CODING_ISO_DESIGNATION (coding, reg); \
2711 if (id == charset_jisx0201_roman) \
2713 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2714 id = charset_ascii; \
2716 else if (id == charset_jisx0208_1978) \
2718 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2719 id = charset_jisx0208; \
2721 CODING_ISO_DESIGNATION (coding, reg) = id; \
2722 /* If there was an invalid designation to REG previously, and this \
2723 designation is ASCII to REG, we should keep this designation \
2725 if (prev == -2 && id == charset_ascii) \
2726 goto invalid_code; \
/* Abandon a composition that is in progress and flush the characters
   collected so far as ordinary characters.

   Uses these variables of the expanding function: composition_state,
   method, components, component_idx, charbuf, charbuf_end and
   char_offset.  Jumps to the label no_more_source when CHARBUF cannot
   hold COMPONENT_IDX more elements.

   The state is reset to COMPOSING_NO and elements of `components' are
   copied to CHARBUF: every element in the loop guarded by the
   COMPOSITION_RELATIVE / COMPOSITION_WITH_ALTCHARS test, every second
   element (i += 2) in the other loop.  CHAR_OFFSET is advanced to
   match.
   NOTE(review): the statement controlled by the first test
   (composition_state == COMPOSING_NO) is not visible here; presumably
   it leaves the macro when no composition is active -- confirm.  */
2730 #define MAYBE_FINISH_COMPOSITION() \
2733 if (composition_state == COMPOSING_NO) \
2735 /* It is assured that we have enough room for producing \
2736 characters stored in the table `components'. */ \
2737 if (charbuf + component_idx > charbuf_end) \
2738 goto no_more_source; \
2739 composition_state = COMPOSING_NO; \
2740 if (method == COMPOSITION_RELATIVE \
2741 || method == COMPOSITION_WITH_ALTCHARS) \
2743 for (i = 0; i < component_idx; i++) \
2744 *charbuf++ = components[i]; \
2745 char_offset += component_idx; \
2749 for (i = 0; i < component_idx; i += 2) \
2750 *charbuf++ = components[i]; \
2751 char_offset += (component_idx / 2) + 1; \
2756 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2757 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2758 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2759 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2760 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2763 #define DECODE_COMPOSITION_START(c1) \
2766 && composition_state == COMPOSING_COMPONENT_RULE) \
2768 component_len = component_idx; \
2769 composition_state = COMPOSING_CHAR; \
2773 const unsigned char *p; \
2775 MAYBE_FINISH_COMPOSITION (); \
2776 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2777 goto no_more_source; \
2778 for (p = src; p < src_end - 1; p++) \
2779 if (*p == ISO_CODE_ESC && p[1] == '1') \
2781 if (p == src_end - 1) \
2783 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2784 goto invalid_code; \
2785 goto no_more_source; \
2788 /* This is surely the start of a composition. */ \
2789 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2790 : c1 == '2' ? COMPOSITION_WITH_RULE \
2791 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2792 : COMPOSITION_WITH_RULE_ALTCHARS); \
2793 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2794 : COMPOSING_COMPONENT_CHAR); \
2795 component_idx = component_len = 0; \
2800 /* Handle composition end sequence ESC 1. */
2802 #define DECODE_COMPOSITION_END() \
2804 int nchars = (component_len > 0 ? component_idx - component_len \
2805 : method == COMPOSITION_RELATIVE ? component_idx \
2806 : (component_idx + 1) / 2); \
2808 int *saved_charbuf = charbuf; \
2809 int from = char_offset; \
2810 int to = from + nchars; \
2812 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2813 if (method != COMPOSITION_RELATIVE) \
2815 if (component_len == 0) \
2816 for (i = 0; i < component_idx; i++) \
2817 *charbuf++ = components[i]; \
2819 for (i = 0; i < component_len; i++) \
2820 *charbuf++ = components[i]; \
2821 *saved_charbuf = saved_charbuf - charbuf; \
2823 if (method == COMPOSITION_WITH_RULE) \
2824 for (i = 0; i < component_idx; i += 2, char_offset++) \
2825 *charbuf++ = components[i]; \
2827 for (i = component_len; i < component_idx; i++, char_offset++) \
2828 *charbuf++ = components[i]; \
2829 coding->annotated = 1; \
2830 composition_state = COMPOSING_NO; \
2834 /* Decode a composition rule from the byte C1 (and maybe one more byte
2835 from SRC) and store the encoded composition rule back in C1; the
2836 caller then records it in the `components' table. */
2838 #define DECODE_COMPOSITION_RULE(c1) \
2841 if (c1 < 81) /* old format (before ver.21) */ \
2843 int gref = (c1) / 9; \
2844 int nref = (c1) % 9; \
2845 if (gref == 4) gref = 10; \
2846 if (nref == 4) nref = 10; \
2847 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2849 else if (c1 < 93) /* new format (after ver.21) */ \
2851 ONE_MORE_BYTE (c2); \
2852 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2859 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2862 decode_coding_iso_2022 (coding
)
2863 struct coding_system
*coding
;
2865 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2866 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2867 const unsigned char *src_base
;
2868 int *charbuf
= coding
->charbuf
;
2870 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2871 int consumed_chars
= 0, consumed_chars_base
;
2872 int multibytep
= coding
->src_multibyte
;
2873 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2874 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2875 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2876 struct charset
*charset
;
2878 /* For handling composition sequence. */
2879 #define COMPOSING_NO 0
2880 #define COMPOSING_CHAR 1
2881 #define COMPOSING_RULE 2
2882 #define COMPOSING_COMPONENT_CHAR 3
2883 #define COMPOSING_COMPONENT_RULE 4
2885 int composition_state
= COMPOSING_NO
;
2886 enum composition_method method
;
2887 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2890 Lisp_Object attrs
, charset_list
;
2891 int char_offset
= coding
->produced_char
;
2892 int last_offset
= char_offset
;
2893 int last_id
= charset_ascii
;
2895 CODING_GET_INFO (coding
, attrs
, charset_list
);
2896 setup_iso_safe_charsets (attrs
);
2903 consumed_chars_base
= consumed_chars
;
2905 if (charbuf
>= charbuf_end
)
2912 /* We produce at most one character. */
2913 switch (iso_code_class
[c1
])
2915 case ISO_0x20_or_0x7F
:
2916 if (composition_state
!= COMPOSING_NO
)
2918 if (composition_state
== COMPOSING_RULE
2919 || composition_state
== COMPOSING_COMPONENT_RULE
)
2921 DECODE_COMPOSITION_RULE (c1
);
2922 components
[component_idx
++] = c1
;
2923 composition_state
--;
2927 if (charset_id_0
< 0
2928 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2929 /* This is SPACE or DEL. */
2930 charset
= CHARSET_FROM_ID (charset_ascii
);
2932 charset
= CHARSET_FROM_ID (charset_id_0
);
2935 case ISO_graphic_plane_0
:
2936 if (composition_state
!= COMPOSING_NO
)
2938 if (composition_state
== COMPOSING_RULE
2939 || composition_state
== COMPOSING_COMPONENT_RULE
)
2941 DECODE_COMPOSITION_RULE (c1
);
2942 components
[component_idx
++] = c1
;
2943 composition_state
--;
2947 charset
= CHARSET_FROM_ID (charset_id_0
);
2950 case ISO_0xA0_or_0xFF
:
2951 if (charset_id_1
< 0
2952 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2953 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2955 /* This is a graphic character, we fall down ... */
2957 case ISO_graphic_plane_1
:
2958 if (charset_id_1
< 0)
2960 charset
= CHARSET_FROM_ID (charset_id_1
);
2964 MAYBE_FINISH_COMPOSITION ();
2965 charset
= CHARSET_FROM_ID (charset_ascii
);
2969 MAYBE_FINISH_COMPOSITION ();
2973 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2974 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2976 CODING_ISO_INVOCATION (coding
, 0) = 1;
2977 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2981 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2983 CODING_ISO_INVOCATION (coding
, 0) = 0;
2984 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2987 case ISO_single_shift_2_7
:
2988 case ISO_single_shift_2
:
2989 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2991 /* SS2 is handled as an escape sequence of ESC 'N' */
2993 goto label_escape_sequence
;
2995 case ISO_single_shift_3
:
2996 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2998 /* SS3 is handled as an escape sequence of ESC 'O' */
3000 goto label_escape_sequence
;
3002 case ISO_control_sequence_introducer
:
3003 /* CSI is handled as an escape sequence of ESC '[' ... */
3005 goto label_escape_sequence
;
3009 label_escape_sequence
:
3010 /* Escape sequences handled here are invocation,
3011 designation, direction specification, and character
3012 composition specification. */
3015 case '&': /* revision of following character set */
3017 if (!(c1
>= '@' && c1
<= '~'))
3020 if (c1
!= ISO_CODE_ESC
)
3023 goto label_escape_sequence
;
3025 case '$': /* designation of 2-byte character set */
3026 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3029 if (c1
>= '@' && c1
<= 'B')
3030 { /* designation of JISX0208.1978, GB2312.1980,
3032 DECODE_DESIGNATION (0, 2, 0, c1
);
3034 else if (c1
>= 0x28 && c1
<= 0x2B)
3035 { /* designation of DIMENSION2_CHARS94 character set */
3037 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
3039 else if (c1
>= 0x2C && c1
<= 0x2F)
3040 { /* designation of DIMENSION2_CHARS96 character set */
3042 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
3046 /* We must update these variables now. */
3047 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3048 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3051 case 'n': /* invocation of locking-shift-2 */
3052 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3053 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3055 CODING_ISO_INVOCATION (coding
, 0) = 2;
3056 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3059 case 'o': /* invocation of locking-shift-3 */
3060 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3061 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3063 CODING_ISO_INVOCATION (coding
, 0) = 3;
3064 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3067 case 'N': /* invocation of single-shift-2 */
3068 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3069 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3071 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3073 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3077 case 'O': /* invocation of single-shift-3 */
3078 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3079 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3081 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3083 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3087 case '0': case '2': case '3': case '4': /* start composition */
3088 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3090 DECODE_COMPOSITION_START (c1
);
3093 case '1': /* end composition */
3094 if (composition_state
== COMPOSING_NO
)
3096 DECODE_COMPOSITION_END ();
3099 case '[': /* specification of direction */
3100 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
3102 /* For the moment, nested direction is not supported.
3103 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3104 left-to-right, and nonzero means right-to-left. */
3108 case ']': /* end of the current direction */
3109 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3111 case '0': /* end of the current direction */
3112 case '1': /* start of left-to-right direction */
3115 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3120 case '2': /* start of right-to-left direction */
3123 coding
->mode
|= CODING_MODE_DIRECTION
;
3137 /* CTEXT extended segment:
3138 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3139 We keep these bytes as is for the moment.
3140 They may be decoded by post-read-conversion. */
3144 ONE_MORE_BYTE (dim
);
3147 size
= ((M
- 128) * 128) + (L
- 128);
3148 if (charbuf
+ 8 + size
> charbuf_end
)
3150 *charbuf
++ = ISO_CODE_ESC
;
3154 *charbuf
++ = BYTE8_TO_CHAR (M
);
3155 *charbuf
++ = BYTE8_TO_CHAR (L
);
3159 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3164 /* XFree86 extension for embedding UTF-8 in CTEXT:
3165 ESC % G --UTF-8-BYTES-- ESC % @
3166 We keep these bytes as is for the moment.
3167 They may be decoded by post-read-conversion. */
3170 if (p
+ 6 > charbuf_end
)
3172 *p
++ = ISO_CODE_ESC
;
3175 while (p
< charbuf_end
)
3178 if (c1
== ISO_CODE_ESC
3179 && src
+ 1 < src_end
3183 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3185 if (p
+ 3 > charbuf_end
)
3187 *p
++ = ISO_CODE_ESC
;
3198 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3200 if (c1
>= 0x28 && c1
<= 0x2B)
3201 { /* designation of DIMENSION1_CHARS94 character set */
3203 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3205 else if (c1
>= 0x2C && c1
<= 0x2F)
3206 { /* designation of DIMENSION1_CHARS96 character set */
3208 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3212 /* We must update these variables now. */
3213 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3214 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3219 if (charset
->id
!= charset_ascii
3220 && last_id
!= charset
->id
)
3222 if (last_id
!= charset_ascii
)
3223 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3224 last_id
= charset
->id
;
3225 last_offset
= char_offset
;
3228 /* Now we know CHARSET and 1st position code C1 of a character.
3229 Produce a decoded character while getting 2nd position code
3232 if (CHARSET_DIMENSION (charset
) > 1)
3235 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3236 /* C2 is not in a valid range. */
3238 c1
= (c1
<< 8) | (c2
& 0x7F);
3239 if (CHARSET_DIMENSION (charset
) > 2)
3242 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3243 /* C2 is not in a valid range. */
3245 c1
= (c1
<< 8) | (c2
& 0x7F);
3249 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3252 MAYBE_FINISH_COMPOSITION ();
3253 for (; src_base
< src
; src_base
++, char_offset
++)
3255 if (ASCII_BYTE_P (*src_base
))
3256 *charbuf
++ = *src_base
;
3258 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3261 else if (composition_state
== COMPOSING_NO
)
3268 components
[component_idx
++] = c
;
3269 if (method
== COMPOSITION_WITH_RULE
3270 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3271 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3272 composition_state
++;
3277 MAYBE_FINISH_COMPOSITION ();
3279 consumed_chars
= consumed_chars_base
;
3281 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3291 if (last_id
!= charset_ascii
)
3292 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3293 coding
->consumed_char
+= consumed_chars_base
;
3294 coding
->consumed
= src_base
- coding
->source
;
3295 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3299 /* ISO2022 encoding stuff. */
3302 It is not enough to say just "ISO2022" on encoding, we have to
3303 specify more details. In Emacs, each coding system of ISO2022
3304 variant has the following specifications:
3305 1. Initial designation to G0 thru G3.
3306 2. Allows short-form designation?
3307 3. ASCII should be designated to G0 before control characters?
3308 4. ASCII should be designated to G0 at end of line?
3309 5. 7-bit environment or 8-bit environment?
3310 6. Use locking-shift?
3311 7. Use Single-shift?
3312 And the following two are only for Japanese:
3313 8. Use ASCII in place of JIS0201-1976-Roman?
3314 9. Use JISX0208-1983 in place of JISX0208-1978?
3315 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3316 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3320 /* Produce codes (escape sequence) for designating CHARSET to graphic
3321 register REG at DST, and increment DST. If <final-char> of CHARSET is
3322 '@', 'A', or 'B' and the coding system CODING allows, produce
3323 designation sequence of short-form. */
3325 #define ENCODE_DESIGNATION(charset, reg, coding) \
3327 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3328 char *intermediate_char_94 = "()*+"; \
3329 char *intermediate_char_96 = ",-./"; \
3330 int revision = -1; \
3333 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3334 revision = CHARSET_ISO_REVISION (charset); \
3336 if (revision >= 0) \
3338 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3339 EMIT_ONE_BYTE ('@' + revision); \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3342 if (CHARSET_DIMENSION (charset) == 1) \
3344 if (! CHARSET_ISO_CHARS_96 (charset)) \
3345 c = intermediate_char_94[reg]; \
3347 c = intermediate_char_96[reg]; \
3348 EMIT_ONE_ASCII_BYTE (c); \
3352 EMIT_ONE_ASCII_BYTE ('$'); \
3353 if (! CHARSET_ISO_CHARS_96 (charset)) \
3355 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3357 || final_char < '@' || final_char > 'B') \
3358 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3361 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3363 EMIT_ONE_ASCII_BYTE (final_char); \
3365 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3369 /* The following two macros produce codes (control character or escape
3370 sequence) for ISO2022 single-shift functions (single-shift-2 and
3373 #define ENCODE_SINGLE_SHIFT_2 \
3375 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3376 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3378 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3379 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3383 #define ENCODE_SINGLE_SHIFT_3 \
3385 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3386 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3388 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3389 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3393 /* The following four macros produce codes (control character or
3394 escape sequence) for ISO2022 locking-shift functions (shift-in,
3395 shift-out, locking-shift-2, and locking-shift-3). */
3397 #define ENCODE_SHIFT_IN \
3399 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3400 CODING_ISO_INVOCATION (coding, 0) = 0; \
3404 #define ENCODE_SHIFT_OUT \
3406 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3407 CODING_ISO_INVOCATION (coding, 0) = 1; \
3411 #define ENCODE_LOCKING_SHIFT_2 \
3413 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3414 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit locking-shift-3 and record that graphic register 3 is now
   invoked to graphic plane 0.  The sequence is ESC 'o'; ESC 'n' is
   locking-shift-2, and the decoder in this file likewise treats
   ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
3425 /* Produce codes for a DIMENSION1 character whose character set is
3426 CHARSET and whose position-code is C1. Designation and invocation
3427 sequences are also produced in advance if necessary. */
3429 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3431 int id = CHARSET_ID (charset); \
3433 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3434 && id == charset_ascii) \
3436 id = charset_jisx0201_roman; \
3437 charset = CHARSET_FROM_ID (id); \
3440 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3442 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3443 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3445 EMIT_ONE_BYTE (c1 | 0x80); \
3446 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3449 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3451 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3454 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3456 EMIT_ONE_BYTE (c1 | 0x80); \
3460 /* Since CHARSET is not yet invoked to any graphic planes, we \
3461 must invoke it, or, at first, designate it to some graphic \
3462 register. Then repeat the loop to actually produce the \
3464 dst = encode_invocation_designation (charset, coding, dst, \
3469 /* Produce codes for a DIMENSION2 character whose character set is
3470 CHARSET and whose position-codes are C1 and C2. Designation and
3471 invocation codes are also produced in advance if necessary. */
3473 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3475 int id = CHARSET_ID (charset); \
3477 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3478 && id == charset_jisx0208) \
3480 id = charset_jisx0208_1978; \
3481 charset = CHARSET_FROM_ID (id); \
3484 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3486 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3487 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3489 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3490 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3493 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3495 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3498 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3500 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3504 /* Since CHARSET is not yet invoked to any graphic planes, we \
3505 must invoke it, or, at first, designate it to some graphic \
3506 register. Then repeat the loop to actually produce the \
3508 dst = encode_invocation_designation (charset, coding, dst, \
/* Produce codes for the character C of CHARSET.  The code of C in
   CHARSET is obtained with ENCODE_CHAR.  When CHARSET has dimension 1
   the code is passed to ENCODE_ISO_CHARACTER_DIMENSION1; the
   DIMENSION2 form receives the code split into its upper part
   (code >> 8) and its low byte (code & 0xFF).
   NOTE(review): the line joining the two calls is not visible here;
   presumably the DIMENSION2 call is the `else' branch -- confirm.  */
3513 #define ENCODE_ISO_CHARACTER(charset, c) \
3515 int code = ENCODE_CHAR ((charset),(c)); \
3517 if (CHARSET_DIMENSION (charset) == 1) \
3518 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3520 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3524 /* Produce designation and invocation codes at a place pointed by DST
3525 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3529 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3530 struct charset
*charset
;
3531 struct coding_system
*coding
;
3535 int multibytep
= coding
->dst_multibyte
;
3536 int produced_chars
= *p_nchars
;
3537 int reg
; /* graphic register number */
3538 int id
= CHARSET_ID (charset
);
3540 /* At first, check designations. */
3541 for (reg
= 0; reg
< 4; reg
++)
3542 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3547 /* CHARSET is not yet designated to any graphic registers. */
3548 /* At first check the requested designation. */
3549 reg
= CODING_ISO_REQUEST (coding
, id
);
3551 /* Since CHARSET requests no special designation, designate it
3552 to graphic register 0. */
3555 ENCODE_DESIGNATION (charset
, reg
, coding
);
3558 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3559 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3561 /* Since the graphic register REG is not invoked to any graphic
3562 planes, invoke it to graphic plane 0. */
3565 case 0: /* graphic register 0 */
3569 case 1: /* graphic register 1 */
3573 case 2: /* graphic register 2 */
3574 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3575 ENCODE_SINGLE_SHIFT_2
;
3577 ENCODE_LOCKING_SHIFT_2
;
3580 case 3: /* graphic register 3 */
3581 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3582 ENCODE_SINGLE_SHIFT_3
;
3584 ENCODE_LOCKING_SHIFT_3
;
3589 *p_nchars
= produced_chars
;
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit the control sequence introducer: the escape sequence ESC '['
   when CODING is a 7-bit environment, the single byte CSI otherwise.
   CODING_ISO_FLAGS is a set of flag bits, so the 7-bit flag is tested
   with `&'.  Comparing the whole flag word with `==' matched only
   when no other flag happened to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Emit CSI '2' ']', the ISO 6429 sequence that starts right-to-left
   text.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro,
   so it is used without an argument list; a trailing `(dst)' would
   be left in the expansion and could not compile.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Emit CSI '0' ']', the ISO 6429 sequence that ends the current
   direction, which is how this macro returns to left-to-right text.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   in the expansion and could not compile.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3618 /* Produce codes for designation and invocation to reset the graphic
3619 planes and registers to initial state. */
3620 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3623 struct charset *charset; \
3625 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3627 for (reg = 0; reg < 4; reg++) \
3628 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3629 && (CODING_ISO_DESIGNATION (coding, reg) \
3630 != CODING_ISO_INITIAL (coding, reg))) \
3632 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3633 ENCODE_DESIGNATION (charset, reg, coding); \
3638 /* Produce designation sequences of charsets in the line started from
3639 SRC to a place pointed by DST, and return updated DST.
3641 If the current block ends before any end-of-line, we may fail to
3642 find all the necessary designations. */
3644 static unsigned char *
3645 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3646 struct coding_system
*coding
;
3647 int *charbuf
, *charbuf_end
;
3650 struct charset
*charset
;
3651 /* Table of charsets to be designated to each graphic register. */
3653 int c
, found
= 0, reg
;
3654 int produced_chars
= 0;
3655 int multibytep
= coding
->dst_multibyte
;
3657 Lisp_Object charset_list
;
3659 attrs
= CODING_ID_ATTRS (coding
->id
);
3660 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3661 if (EQ (charset_list
, Qiso_2022
))
3662 charset_list
= Viso_2022_charset_list
;
3664 for (reg
= 0; reg
< 4; reg
++)
3674 charset
= char_charset (c
, charset_list
, NULL
);
3675 id
= CHARSET_ID (charset
);
3676 reg
= CODING_ISO_REQUEST (coding
, id
);
3677 if (reg
>= 0 && r
[reg
] < 0)
3686 for (reg
= 0; reg
< 4; reg
++)
3688 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3689 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3695 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3698 encode_coding_iso_2022 (coding
)
3699 struct coding_system
*coding
;
3701 int multibytep
= coding
->dst_multibyte
;
3702 int *charbuf
= coding
->charbuf
;
3703 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3704 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3705 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3708 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3709 && CODING_ISO_BOL (coding
));
3710 int produced_chars
= 0;
3711 Lisp_Object attrs
, eol_type
, charset_list
;
3712 int ascii_compatible
;
3714 int preferred_charset_id
= -1;
3716 CODING_GET_INFO (coding
, attrs
, charset_list
);
3717 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
3718 if (VECTORP (eol_type
))
3721 setup_iso_safe_charsets (attrs
);
3722 /* Charset list may have been changed. */
3723 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3724 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3726 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3728 while (charbuf
< charbuf_end
)
3730 ASSURE_DESTINATION (safe_room
);
3732 if (bol_designation
)
3734 unsigned char *dst_prev
= dst
;
3736 /* We have to produce designation sequences if any now. */
3737 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3738 bol_designation
= 0;
3739 /* We are sure that designation sequences are all ASCII bytes. */
3740 produced_chars
+= dst
- dst_prev
;
3747 /* Handle an annotation. */
3750 case CODING_ANNOTATE_COMPOSITION_MASK
:
3751 /* Not yet implemented. */
3753 case CODING_ANNOTATE_CHARSET_MASK
:
3754 preferred_charset_id
= charbuf
[3];
3755 if (preferred_charset_id
>= 0
3756 && NILP (Fmemq (make_number (preferred_charset_id
),
3758 preferred_charset_id
= -1;
3767 /* Now encode the character C. */
3768 if (c
< 0x20 || c
== 0x7F)
3771 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3773 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3774 ENCODE_RESET_PLANE_AND_REGISTER ();
3775 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3779 for (i
= 0; i
< 4; i
++)
3780 CODING_ISO_DESIGNATION (coding
, i
)
3781 = CODING_ISO_INITIAL (coding
, i
);
3784 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3786 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3787 ENCODE_RESET_PLANE_AND_REGISTER ();
3788 EMIT_ONE_ASCII_BYTE (c
);
3790 else if (ASCII_CHAR_P (c
))
3792 if (ascii_compatible
)
3793 EMIT_ONE_ASCII_BYTE (c
);
3796 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3797 ENCODE_ISO_CHARACTER (charset
, c
);
3800 else if (CHAR_BYTE8_P (c
))
3802 c
= CHAR_TO_BYTE8 (c
);
3807 struct charset
*charset
;
3809 if (preferred_charset_id
>= 0)
3811 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3812 if (! CHAR_CHARSET_P (c
, charset
))
3813 charset
= char_charset (c
, charset_list
, NULL
);
3816 charset
= char_charset (c
, charset_list
, NULL
);
3819 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3821 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3822 charset
= CHARSET_FROM_ID (charset_ascii
);
3826 c
= coding
->default_char
;
3827 charset
= char_charset (c
, charset_list
, NULL
);
3830 ENCODE_ISO_CHARACTER (charset
, c
);
3834 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3835 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3837 ASSURE_DESTINATION (safe_room
);
3838 ENCODE_RESET_PLANE_AND_REGISTER ();
3840 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
3841 CODING_ISO_BOL (coding
) = bol_designation
;
3842 coding
->produced_char
+= produced_chars
;
3843 coding
->produced
= dst
- coding
->destination
;
3848 /*** 8. SJIS and BIG5 handlers ***/
3850 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3851 quite widely. So, for the moment, Emacs supports them directly in
3852 C code. But, in the future, they may be supported only by CCL. */
3854 /* SJIS is a coding system encoding three character sets: ASCII, the
3855 right half of JISX0201-Kana, and JISX0208. An ASCII character is
3856 encoded as is. A character of charset katakana-jisx0201 is encoded as
3857 "position-code + 0x80". A character of charset japanese-jisx0208
3858 is encoded in two bytes, with its two position-codes divided and
3859 shifted so that they fit in the ranges below.
3861 --- CODE RANGE of SJIS ---
3862 (character set) (range)
3863 ASCII 0x00 .. 0x7F
3864 KATAKANA-JISX0201 0xA0 .. 0xDF
3865 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3866 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3867 ------------------------------- */
3871 /* BIG5 is a coding system encoding two character sets: ASCII and
3872 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3873 character set and is encoded in two bytes.
3875 --- CODE RANGE of BIG5 ---
3876 (character set) (range)
3877 ASCII 0x00 .. 0x7F
3878 Big5 (1st byte) 0xA1 .. 0xFE
3879 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3880 -------------------------- */
3884 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3885 Check if a text is encoded in SJIS. If it is, return
3886 CATEGORY_MASK_SJIS, else return 0. */
3889 detect_coding_sjis (coding
, detect_info
)
3890 struct coding_system
*coding
;
3891 struct coding_detection_info
*detect_info
;
3893 const unsigned char *src
= coding
->source
, *src_base
;
3894 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3895 int multibytep
= coding
->src_multibyte
;
3896 int consumed_chars
= 0;
3900 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3901 /* A coding system of this category is always ASCII compatible. */
3902 src
+= coding
->head_ascii
;
3910 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3913 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3915 found
= CATEGORY_MASK_SJIS
;
3917 else if (c
>= 0xA0 && c
< 0xE0)
3918 found
= CATEGORY_MASK_SJIS
;
3922 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3926 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3928 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3931 detect_info
->found
|= found
;
3935 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3936 Check if a text is encoded in BIG5. If it is, return
3937 CATEGORY_MASK_BIG5, else return 0. */
3940 detect_coding_big5 (coding
, detect_info
)
3941 struct coding_system
*coding
;
3942 struct coding_detection_info
*detect_info
;
3944 const unsigned char *src
= coding
->source
, *src_base
;
3945 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3946 int multibytep
= coding
->src_multibyte
;
3947 int consumed_chars
= 0;
3951 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3952 /* A coding system of this category is always ASCII compatible. */
3953 src
+= coding
->head_ascii
;
3964 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3966 found
= CATEGORY_MASK_BIG5
;
3971 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3975 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3977 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3980 detect_info
->found
|= found
;
3984 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3985 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3988 decode_coding_sjis (coding
)
3989 struct coding_system
*coding
;
3991 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3992 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3993 const unsigned char *src_base
;
3994 int *charbuf
= coding
->charbuf
;
3995 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3996 int consumed_chars
= 0, consumed_chars_base
;
3997 int multibytep
= coding
->src_multibyte
;
3998 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3999 Lisp_Object attrs
, charset_list
, val
;
4000 int char_offset
= coding
->produced_char
;
4001 int last_offset
= char_offset
;
4002 int last_id
= charset_ascii
;
4004 CODING_GET_INFO (coding
, attrs
, charset_list
);
4007 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4008 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4009 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4014 struct charset
*charset
;
4017 consumed_chars_base
= consumed_chars
;
4019 if (charbuf
>= charbuf_end
)
4026 charset
= charset_roman
;
4033 if (c
< 0xA0 || c
>= 0xE0)
4035 /* SJIS -> JISX0208 */
4037 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4041 charset
= charset_kanji
;
4045 /* SJIS -> JISX0201-Kana */
4047 charset
= charset_kana
;
4052 if (charset
->id
!= charset_ascii
4053 && last_id
!= charset
->id
)
4055 if (last_id
!= charset_ascii
)
4056 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4057 last_id
= charset
->id
;
4058 last_offset
= char_offset
;
4060 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4067 consumed_chars
= consumed_chars_base
;
4069 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4075 if (last_id
!= charset_ascii
)
4076 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4077 coding
->consumed_char
+= consumed_chars_base
;
4078 coding
->consumed
= src_base
- coding
->source
;
4079 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4083 decode_coding_big5 (coding
)
4084 struct coding_system
*coding
;
4086 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4087 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4088 const unsigned char *src_base
;
4089 int *charbuf
= coding
->charbuf
;
4090 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4091 int consumed_chars
= 0, consumed_chars_base
;
4092 int multibytep
= coding
->src_multibyte
;
4093 struct charset
*charset_roman
, *charset_big5
;
4094 Lisp_Object attrs
, charset_list
, val
;
4095 int char_offset
= coding
->produced_char
;
4096 int last_offset
= char_offset
;
4097 int last_id
= charset_ascii
;
4099 CODING_GET_INFO (coding
, attrs
, charset_list
);
4101 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4102 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4107 struct charset
*charset
;
4110 consumed_chars_base
= consumed_chars
;
4112 if (charbuf
>= charbuf_end
)
4120 charset
= charset_roman
;
4124 if (c
< 0xA1 || c
> 0xFE)
4127 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4130 charset
= charset_big5
;
4132 if (charset
->id
!= charset_ascii
4133 && last_id
!= charset
->id
)
4135 if (last_id
!= charset_ascii
)
4136 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4137 last_id
= charset
->id
;
4138 last_offset
= char_offset
;
4140 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4147 consumed_chars
= consumed_chars_base
;
4149 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4155 if (last_id
!= charset_ascii
)
4156 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4157 coding
->consumed_char
+= consumed_chars_base
;
4158 coding
->consumed
= src_base
- coding
->source
;
4159 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4162 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4163 This function can encode charsets `ascii', `katakana-jisx0201',
4164 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4165 are sure that all these charsets are registered as official charset
4166 (i.e. do not have extended leading-codes). Characters of other
4167 charsets are produced without any encoding. If SJIS_P is 1, encode
4168 SJIS text, else encode BIG5 text. */
4171 encode_coding_sjis (coding
)
4172 struct coding_system
*coding
;
4174 int multibytep
= coding
->dst_multibyte
;
4175 int *charbuf
= coding
->charbuf
;
4176 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4177 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4178 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4180 int produced_chars
= 0;
4181 Lisp_Object attrs
, charset_list
, val
;
4182 int ascii_compatible
;
4183 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4186 CODING_GET_INFO (coding
, attrs
, charset_list
);
4188 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4189 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4190 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4192 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4194 while (charbuf
< charbuf_end
)
4196 ASSURE_DESTINATION (safe_room
);
4198 /* Now encode the character C. */
4199 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4200 EMIT_ONE_ASCII_BYTE (c
);
4201 else if (CHAR_BYTE8_P (c
))
4203 c
= CHAR_TO_BYTE8 (c
);
4209 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4213 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4215 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4216 charset
= CHARSET_FROM_ID (charset_ascii
);
4220 c
= coding
->default_char
;
4221 charset
= char_charset (c
, charset_list
, &code
);
4224 if (code
== CHARSET_INVALID_CODE (charset
))
4226 if (charset
== charset_kanji
)
4230 c1
= code
>> 8, c2
= code
& 0xFF;
4231 EMIT_TWO_BYTES (c1
, c2
);
4233 else if (charset
== charset_kana
)
4234 EMIT_ONE_BYTE (code
| 0x80);
4236 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4239 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4240 coding
->produced_char
+= produced_chars
;
4241 coding
->produced
= dst
- coding
->destination
;
4246 encode_coding_big5 (coding
)
4247 struct coding_system
*coding
;
4249 int multibytep
= coding
->dst_multibyte
;
4250 int *charbuf
= coding
->charbuf
;
4251 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4252 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4253 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4255 int produced_chars
= 0;
4256 Lisp_Object attrs
, charset_list
, val
;
4257 int ascii_compatible
;
4258 struct charset
*charset_roman
, *charset_big5
;
4261 CODING_GET_INFO (coding
, attrs
, charset_list
);
4263 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4264 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4265 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4267 while (charbuf
< charbuf_end
)
4269 ASSURE_DESTINATION (safe_room
);
4271 /* Now encode the character C. */
4272 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4273 EMIT_ONE_ASCII_BYTE (c
);
4274 else if (CHAR_BYTE8_P (c
))
4276 c
= CHAR_TO_BYTE8 (c
);
4282 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4286 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4288 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4289 charset
= CHARSET_FROM_ID (charset_ascii
);
4293 c
= coding
->default_char
;
4294 charset
= char_charset (c
, charset_list
, &code
);
4297 if (code
== CHARSET_INVALID_CODE (charset
))
4299 if (charset
== charset_big5
)
4303 c1
= code
>> 8, c2
= code
& 0xFF;
4304 EMIT_TWO_BYTES (c1
, c2
);
4307 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4310 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4311 coding
->produced_char
+= produced_chars
;
4312 coding
->produced
= dst
- coding
->destination
;
4317 /*** 10. CCL handlers ***/
4319 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4320 Check if a text is encoded in a coding system whose
4321 encoder/decoder are written as CCL programs. If it is, return
4322 CATEGORY_MASK_CCL, else return 0. */
4325 detect_coding_ccl (coding
, detect_info
)
4326 struct coding_system
*coding
;
4327 struct coding_detection_info
*detect_info
;
4329 const unsigned char *src
= coding
->source
, *src_base
;
4330 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4331 int multibytep
= coding
->src_multibyte
;
4332 int consumed_chars
= 0;
4334 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4335 int head_ascii
= coding
->head_ascii
;
4338 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4340 coding
= &coding_categories
[coding_category_ccl
];
4341 attrs
= CODING_ID_ATTRS (coding
->id
);
4342 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4351 if (c
< 0 || ! valids
[c
])
4353 if ((valids
[c
] > 1))
4354 found
= CATEGORY_MASK_CCL
;
4356 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4360 detect_info
->found
|= found
;
4365 decode_coding_ccl (coding
)
4366 struct coding_system
*coding
;
4368 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4369 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4370 int *charbuf
= coding
->charbuf
;
4371 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4372 int consumed_chars
= 0;
4373 int multibytep
= coding
->src_multibyte
;
4374 struct ccl_program ccl
;
4375 int source_charbuf
[1024];
4376 int source_byteidx
[1024];
4377 Lisp_Object attrs
, charset_list
;
4379 CODING_GET_INFO (coding
, attrs
, charset_list
);
4380 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4382 while (src
< src_end
)
4384 const unsigned char *p
= src
;
4385 int *source
, *source_end
;
4389 while (i
< 1024 && p
< src_end
)
4391 source_byteidx
[i
] = p
- src
;
4392 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4395 while (i
< 1024 && p
< src_end
)
4396 source_charbuf
[i
++] = *p
++;
4398 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4401 source
= source_charbuf
;
4402 source_end
= source
+ i
;
4403 while (source
< source_end
)
4405 ccl_driver (&ccl
, source
, charbuf
,
4406 source_end
- source
, charbuf_end
- charbuf
,
4408 source
+= ccl
.consumed
;
4409 charbuf
+= ccl
.produced
;
4410 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4413 if (source
< source_end
)
4414 src
+= source_byteidx
[source
- source_charbuf
];
4417 consumed_chars
+= source
- source_charbuf
;
4419 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4420 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4426 case CCL_STAT_SUSPEND_BY_SRC
:
4427 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4429 case CCL_STAT_SUSPEND_BY_DST
:
4432 case CCL_STAT_INVALID_CMD
:
4433 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4436 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4439 coding
->consumed_char
+= consumed_chars
;
4440 coding
->consumed
= src
- coding
->source
;
4441 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4445 encode_coding_ccl (coding
)
4446 struct coding_system
*coding
;
4448 struct ccl_program ccl
;
4449 int multibytep
= coding
->dst_multibyte
;
4450 int *charbuf
= coding
->charbuf
;
4451 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4452 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4453 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4454 unsigned char *adjusted_dst_end
= dst_end
- 1;
4455 int destination_charbuf
[1024];
4456 int i
, produced_chars
= 0;
4457 Lisp_Object attrs
, charset_list
;
4459 CODING_GET_INFO (coding
, attrs
, charset_list
);
4460 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4462 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4463 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4465 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4467 int dst_bytes
= dst_end
- dst
;
4468 if (dst_bytes
> 1024)
4471 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4472 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4473 charbuf
+= ccl
.consumed
;
4475 for (i
= 0; i
< ccl
.produced
; i
++)
4476 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4479 for (i
= 0; i
< ccl
.produced
; i
++)
4480 *dst
++ = destination_charbuf
[i
] & 0xFF;
4481 produced_chars
+= ccl
.produced
;
4487 case CCL_STAT_SUSPEND_BY_SRC
:
4488 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
4490 case CCL_STAT_SUSPEND_BY_DST
:
4491 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
4494 case CCL_STAT_INVALID_CMD
:
4495 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
4498 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4502 coding
->produced_char
+= produced_chars
;
4503 coding
->produced
= dst
- coding
->destination
;
4509 /*** 10, 11. no-conversion handlers ***/
4511 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode handler that setup_coding_system installs as CODING->decoder
   for the `undecided' type and for the fall-through (raw-text) case.
   It produces nothing itself: it only updates bookkeeping fields of
   CODING and records a successful result.

   CODING is the coding system context to update.  */
4514 decode_coding_raw_text (coding
)
4515 struct coding_system
*coding
;
/* NOTE(review): presumably this flag tells the caller that the
   characters are to be taken from the source as they are -- confirm
   against the code that reads chars_at_source.  */
4517 coding
->chars_at_source
= 1;
/* Both consumption counters (characters and bytes) are reset to
   zero rather than advanced.  */
4518 coding
->consumed_char
= 0;
4519 coding
->consumed
= 0;
/* This handler always reports success.  */
4520 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4524 encode_coding_raw_text (coding
)
4525 struct coding_system
*coding
;
4527 int multibytep
= coding
->dst_multibyte
;
4528 int *charbuf
= coding
->charbuf
;
4529 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4530 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4531 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4532 int produced_chars
= 0;
4537 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4539 if (coding
->src_multibyte
)
4540 while (charbuf
< charbuf_end
)
4542 ASSURE_DESTINATION (safe_room
);
4544 if (ASCII_CHAR_P (c
))
4545 EMIT_ONE_ASCII_BYTE (c
);
4546 else if (CHAR_BYTE8_P (c
))
4548 c
= CHAR_TO_BYTE8 (c
);
4553 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4555 CHAR_STRING_ADVANCE (c
, p1
);
4558 EMIT_ONE_BYTE (*p0
);
4564 while (charbuf
< charbuf_end
)
4566 ASSURE_DESTINATION (safe_room
);
4573 if (coding
->src_multibyte
)
4575 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4577 while (charbuf
< charbuf_end
)
4579 ASSURE_DESTINATION (safe_room
);
4581 if (ASCII_CHAR_P (c
))
4583 else if (CHAR_BYTE8_P (c
))
4584 *dst
++ = CHAR_TO_BYTE8 (c
);
4586 CHAR_STRING_ADVANCE (c
, dst
);
4592 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4593 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4594 *dst
++ = *charbuf
++;
4595 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4598 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4599 coding
->produced_char
+= produced_chars
;
4600 coding
->produced
= dst
- coding
->destination
;
4604 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4605 Check if a text is encoded in a charset-based coding system. If it
4606 is, return 1, else return 0. */
4609 detect_coding_charset (coding
, detect_info
)
4610 struct coding_system
*coding
;
4611 struct coding_detection_info
*detect_info
;
4613 const unsigned char *src
= coding
->source
, *src_base
;
4614 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4615 int multibytep
= coding
->src_multibyte
;
4616 int consumed_chars
= 0;
4617 Lisp_Object attrs
, valids
;
4620 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4622 coding
= &coding_categories
[coding_category_charset
];
4623 attrs
= CODING_ID_ATTRS (coding
->id
);
4624 valids
= AREF (attrs
, coding_attr_charset_valids
);
4626 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4627 src
+= coding
->head_ascii
;
4637 if (NILP (AREF (valids
, c
)))
4640 found
= CATEGORY_MASK_CHARSET
;
4642 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4646 detect_info
->found
|= found
;
4651 decode_coding_charset (coding
)
4652 struct coding_system
*coding
;
4654 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4655 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4656 const unsigned char *src_base
;
4657 int *charbuf
= coding
->charbuf
;
4658 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4659 int consumed_chars
= 0, consumed_chars_base
;
4660 int multibytep
= coding
->src_multibyte
;
4661 Lisp_Object attrs
, charset_list
, valids
;
4662 int char_offset
= coding
->produced_char
;
4663 int last_offset
= char_offset
;
4664 int last_id
= charset_ascii
;
4666 CODING_GET_INFO (coding
, attrs
, charset_list
);
4667 valids
= AREF (attrs
, coding_attr_charset_valids
);
4673 struct charset
*charset
;
4679 consumed_chars_base
= consumed_chars
;
4681 if (charbuf
>= charbuf_end
)
4689 val
= AREF (valids
, c
);
4694 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4695 dim
= CHARSET_DIMENSION (charset
);
4699 code
= (code
<< 8) | c
;
4702 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4707 /* VAL is a list of charset IDs. It is assured that the
4708 list is sorted by charset dimensions (smaller one
4712 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4713 dim
= CHARSET_DIMENSION (charset
);
4717 code
= (code
<< 8) | c
;
4720 CODING_DECODE_CHAR (coding
, src
, src_base
,
4721 src_end
, charset
, code
, c
);
4729 if (charset
->id
!= charset_ascii
4730 && last_id
!= charset
->id
)
4732 if (last_id
!= charset_ascii
)
4733 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4734 last_id
= charset
->id
;
4735 last_offset
= char_offset
;
4744 consumed_chars
= consumed_chars_base
;
4746 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4752 if (last_id
!= charset_ascii
)
4753 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4754 coding
->consumed_char
+= consumed_chars_base
;
4755 coding
->consumed
= src_base
- coding
->source
;
4756 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4760 encode_coding_charset (coding
)
4761 struct coding_system
*coding
;
4763 int multibytep
= coding
->dst_multibyte
;
4764 int *charbuf
= coding
->charbuf
;
4765 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4766 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4767 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4768 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4769 int produced_chars
= 0;
4770 Lisp_Object attrs
, charset_list
;
4771 int ascii_compatible
;
4774 CODING_GET_INFO (coding
, attrs
, charset_list
);
4775 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4777 while (charbuf
< charbuf_end
)
4779 struct charset
*charset
;
4782 ASSURE_DESTINATION (safe_room
);
4784 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4785 EMIT_ONE_ASCII_BYTE (c
);
4786 else if (CHAR_BYTE8_P (c
))
4788 c
= CHAR_TO_BYTE8 (c
);
4793 charset
= char_charset (c
, charset_list
, &code
);
4796 if (CHARSET_DIMENSION (charset
) == 1)
4797 EMIT_ONE_BYTE (code
);
4798 else if (CHARSET_DIMENSION (charset
) == 2)
4799 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4800 else if (CHARSET_DIMENSION (charset
) == 3)
4801 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4803 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4804 (code
>> 8) & 0xFF, code
& 0xFF);
4808 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4809 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4811 c
= coding
->default_char
;
4817 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4818 coding
->produced_char
+= produced_chars
;
4819 coding
->produced
= dst
- coding
->destination
;
4824 /*** 10. C library functions ***/
4826 /* Setup coding context CODING from information about CODING_SYSTEM.
4827 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4828 CODING_SYSTEM is invalid, signal an error. */
4831 setup_coding_system (coding_system
, coding
)
4832 Lisp_Object coding_system
;
4833 struct coding_system
*coding
;
4836 Lisp_Object eol_type
;
4837 Lisp_Object coding_type
;
4840 if (NILP (coding_system
))
4841 coding_system
= Qno_conversion
;
4843 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4845 attrs
= CODING_ID_ATTRS (coding
->id
);
4846 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4849 coding
->head_ascii
= -1;
4850 coding
->common_flags
4851 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4852 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4853 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4854 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4855 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4856 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
4857 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4859 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4860 coding
->max_charset_id
= SCHARS (val
) - 1;
4861 coding
->safe_charsets
= (char *) SDATA (val
);
4862 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4864 coding_type
= CODING_ATTR_TYPE (attrs
);
4865 if (EQ (coding_type
, Qundecided
))
4867 coding
->detector
= NULL
;
4868 coding
->decoder
= decode_coding_raw_text
;
4869 coding
->encoder
= encode_coding_raw_text
;
4870 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4872 else if (EQ (coding_type
, Qiso_2022
))
4875 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4877 /* Invoke graphic register 0 to plane 0. */
4878 CODING_ISO_INVOCATION (coding
, 0) = 0;
4879 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4880 CODING_ISO_INVOCATION (coding
, 1)
4881 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4882 /* Setup the initial status of designation. */
4883 for (i
= 0; i
< 4; i
++)
4884 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4885 /* Not single shifting initially. */
4886 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4887 /* Beginning of buffer should also be regarded as bol. */
4888 CODING_ISO_BOL (coding
) = 1;
4889 coding
->detector
= detect_coding_iso_2022
;
4890 coding
->decoder
= decode_coding_iso_2022
;
4891 coding
->encoder
= encode_coding_iso_2022
;
4892 if (flags
& CODING_ISO_FLAG_SAFE
)
4893 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4894 coding
->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4896 | CODING_REQUIRE_FLUSHING_MASK
);
4897 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4898 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4899 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4900 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4901 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4903 setup_iso_safe_charsets (attrs
);
4904 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4905 coding
->max_charset_id
= SCHARS (val
) - 1;
4906 coding
->safe_charsets
= (char *) SDATA (val
);
4908 CODING_ISO_FLAGS (coding
) = flags
;
4910 else if (EQ (coding_type
, Qcharset
))
4912 coding
->detector
= detect_coding_charset
;
4913 coding
->decoder
= decode_coding_charset
;
4914 coding
->encoder
= encode_coding_charset
;
4915 coding
->common_flags
4916 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4918 else if (EQ (coding_type
, Qutf_8
))
4920 coding
->detector
= detect_coding_utf_8
;
4921 coding
->decoder
= decode_coding_utf_8
;
4922 coding
->encoder
= encode_coding_utf_8
;
4923 coding
->common_flags
4924 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4926 else if (EQ (coding_type
, Qutf_16
))
4928 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4929 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4930 : EQ (val
, Qt
) ? utf_16_with_bom
4931 : utf_16_without_bom
);
4932 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4933 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4934 : utf_16_little_endian
);
4935 CODING_UTF_16_SURROGATE (coding
) = 0;
4936 coding
->detector
= detect_coding_utf_16
;
4937 coding
->decoder
= decode_coding_utf_16
;
4938 coding
->encoder
= encode_coding_utf_16
;
4939 coding
->common_flags
4940 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4941 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4942 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4944 else if (EQ (coding_type
, Qccl
))
4946 coding
->detector
= detect_coding_ccl
;
4947 coding
->decoder
= decode_coding_ccl
;
4948 coding
->encoder
= encode_coding_ccl
;
4949 coding
->common_flags
4950 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4951 | CODING_REQUIRE_FLUSHING_MASK
);
4953 else if (EQ (coding_type
, Qemacs_mule
))
4955 coding
->detector
= detect_coding_emacs_mule
;
4956 coding
->decoder
= decode_coding_emacs_mule
;
4957 coding
->encoder
= encode_coding_emacs_mule
;
4958 coding
->common_flags
4959 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4960 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4961 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4963 Lisp_Object tail
, safe_charsets
;
4964 int max_charset_id
= 0;
4966 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4968 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4969 max_charset_id
= XFASTINT (XCAR (tail
));
4970 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4972 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4974 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
4975 coding
->max_charset_id
= max_charset_id
;
4976 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
4979 else if (EQ (coding_type
, Qshift_jis
))
4981 coding
->detector
= detect_coding_sjis
;
4982 coding
->decoder
= decode_coding_sjis
;
4983 coding
->encoder
= encode_coding_sjis
;
4984 coding
->common_flags
4985 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4987 else if (EQ (coding_type
, Qbig5
))
4989 coding
->detector
= detect_coding_big5
;
4990 coding
->decoder
= decode_coding_big5
;
4991 coding
->encoder
= encode_coding_big5
;
4992 coding
->common_flags
4993 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4995 else /* EQ (coding_type, Qraw_text) */
4997 coding
->detector
= NULL
;
4998 coding
->decoder
= decode_coding_raw_text
;
4999 coding
->encoder
= encode_coding_raw_text
;
5005 /* Return raw-text or one of its subsidiaries that has the same
5006 eol_type as CODING-SYSTEM. */
/* NOTE(review): the statements guarded by the NILP test and by the
   VECTORP test below are not visible in this copy of the file; only
   the tests themselves remain.  */
5009 raw_text_coding_system (coding_system
)
5010 Lisp_Object coding_system
;
5012 Lisp_Object spec
, attrs
;
5013 Lisp_Object eol_type
, raw_text_eol_type
;
5015 if (NILP (coding_system
))
/* Element 0 of the coding-system spec is used as the attribute
   vector.  */
5017 spec
= CODING_SYSTEM_SPEC (coding_system
);
5018 attrs
= AREF (spec
, 0);
/* CODING-SYSTEM is already of type raw-text: return it unchanged.  */
5020 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5021 return coding_system
;
/* Element 2 of the spec is used as the eol type.  */
5023 eol_type
= AREF (spec
, 2);
5024 if (VECTORP (eol_type
))
/* Choose the subsidiary of raw-text by the eol type of
   CODING-SYSTEM: index 0 for unix, 1 for dos, 2 for anything else.  */
5026 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5027 raw_text_eol_type
= AREF (spec
, 2);
5028 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5029 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5030 : AREF (raw_text_eol_type
, 2));
5034 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5035 does, return one of the subsidiaries that has the same eol-spec as
5036 PARENT. Otherwise, return CODING_SYSTEM. */
5039 coding_inherit_eol_type (coding_system
, parent
)
5040 Lisp_Object coding_system
, parent
;
5042 Lisp_Object spec
, eol_type
;
/* A nil CODING_SYSTEM is treated as raw-text.  */
5044 if (NILP (coding_system
))
5045 coding_system
= Qraw_text
;
5046 spec
= CODING_SYSTEM_SPEC (coding_system
);
5047 eol_type
= AREF (spec
, 2);
/* NOTE(review): the rest of this condition is on a line that is not
   visible in this copy.  */
5048 if (VECTORP (eol_type
)
5051 Lisp_Object parent_spec
;
5052 Lisp_Object parent_eol_type
;
/* NOTE(review): the spec consulted here is that of
   buffer_defaults.buffer_file_coding_system, not that of PARENT,
   which disagrees with the header comment.  Confirm whether
   CODING_SYSTEM_SPEC (parent) was intended.  */
5055 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5056 parent_eol_type
= AREF (parent_spec
, 2);
/* Pick the subsidiary from the eol-type vector of CODING_SYSTEM:
   index 0 for unix, 1 for dos, 2 for mac.  For any other value
   CODING_SYSTEM is left as it is.  */
5057 if (EQ (parent_eol_type
, Qunix
))
5058 coding_system
= AREF (eol_type
, 0);
5059 else if (EQ (parent_eol_type
, Qdos
))
5060 coding_system
= AREF (eol_type
, 1);
5061 else if (EQ (parent_eol_type
, Qmac
))
5062 coding_system
= AREF (eol_type
, 2);
5064 return coding_system
;
5067 /* Emacs has a mechanism to automatically detect a coding system if it
5068 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5069 it's impossible to distinguish some coding systems accurately
5070 because they use the same range of codes. So, at first, coding
5071 systems are categorized as follows:
5073 o coding-category-emacs-mule
5075 The category for a coding system which has the same code range
5076 as Emacs' internal format. Assigned the coding-system (Lisp
5077 symbol) `emacs-mule' by default.
5079 o coding-category-sjis
5081 The category for a coding system which has the same code range
5082 as SJIS. Assigned the coding-system (Lisp
5083 symbol) `japanese-shift-jis' by default.
5085 o coding-category-iso-7
5087 The category for a coding system which has the same code range
5088 as ISO2022 of 7-bit environment. This doesn't use any locking
5089 shift and single shift functions. This can encode/decode all
5090 charsets. Assigned the coding-system (Lisp symbol)
5091 `iso-2022-7bit' by default.
5093 o coding-category-iso-7-tight
5095 Same as coding-category-iso-7 except that this can
5096 encode/decode only the specified charsets.
5098 o coding-category-iso-8-1
5100 The category for a coding system which has the same code range
5101 as ISO2022 of 8-bit environment and graphic plane 1 used only
5102 for DIMENSION1 charset. This doesn't use any locking shift
5103 and single shift functions. Assigned the coding-system (Lisp
5104 symbol) `iso-latin-1' by default.
5106 o coding-category-iso-8-2
5108 The category for a coding system which has the same code range
5109 as ISO2022 of 8-bit environment and graphic plane 1 used only
5110 for DIMENSION2 charset. This doesn't use any locking shift
5111 and single shift functions. Assigned the coding-system (Lisp
5112 symbol) `japanese-iso-8bit' by default.
5114 o coding-category-iso-7-else
5116 The category for a coding system which has the same code range
5117 as ISO2022 of 7-bit environment but uses locking shift or
5118 single shift functions. Assigned the coding-system (Lisp
5119 symbol) `iso-2022-7bit-lock' by default.
5121 o coding-category-iso-8-else
5123 The category for a coding system which has the same code range
5124 as ISO2022 of 8-bit environment but uses locking shift or
5125 single shift functions. Assigned the coding-system (Lisp
5126 symbol) `iso-2022-8bit-ss2' by default.
5128 o coding-category-big5
5130 The category for a coding system which has the same code range
5131 as BIG5. Assigned the coding-system (Lisp symbol)
5132 `cn-big5' by default.
5134 o coding-category-utf-8
5136 The category for a coding system which has the same code range
5137 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5138 symbol) `utf-8' by default.
5140 o coding-category-utf-16-be
5142 The category for a coding system in which a text has an
5143 Unicode signature (cf. Unicode Standard) in the order of BIG
5144 endian at the head. Assigned the coding-system (Lisp symbol)
5145 `utf-16-be' by default.
5147 o coding-category-utf-16-le
5149 The category for a coding system in which a text has an
5150 Unicode signature (cf. Unicode Standard) in the order of
5151 LITTLE endian at the head. Assigned the coding-system (Lisp
5152 symbol) `utf-16-le' by default.
5154 o coding-category-ccl
5156 The category for a coding system of which encoder/decoder is
5157 written in CCL programs. The default value is nil, i.e., no
5158 coding system is assigned.
5160 o coding-category-binary
5162 The category for a coding system not categorized in any of the
5163 above. Assigned the coding-system (Lisp symbol)
5164 `no-conversion' by default.
5166 Each of them is a Lisp symbol whose value is an actual
5167 `coding-system' (also a Lisp symbol) assigned by a user.
5168 What Emacs does actually is to detect a category of coding system.
5169 Then, it uses a `coding-system' assigned to it. If Emacs can't
5170 decide only one possible category, it selects a category of the
5171 highest priority. Priorities of categories are also specified by a
5172 user in a Lisp variable `coding-category-list'.
/* End-of-line kinds reported by detect_eol.  The non-zero values are
   distinct bits: other code in this file combines them with `|=' and
   tests them with `&' as well as comparing them for equality.  */
5176 #define EOL_SEEN_NONE 0
5177 #define EOL_SEEN_LF 1
5178 #define EOL_SEEN_CR 2
5179 #define EOL_SEEN_CRLF 4
5181 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5182 SOURCE is encoded. If CATEGORY is one of
5183 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5184 two-byte, else they are encoded by one-byte.
5186 Return one of EOL_SEEN_XXX. */
/* Compared against the running count of line ends found (`total'
   below).  */
5188 #define MAX_EOL_CHECK_COUNT 3
/* NOTE(review): the declarations of `c', `total', `this_eol', `msb'
   and `lsb', the loop exits and the return statement are not visible
   in this copy.  */
5191 detect_eol (source
, src_bytes
, category
)
5192 unsigned char *source
;
5193 EMACS_INT src_bytes
;
5194 enum coding_category category
;
5196 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5199 int eol_seen
= EOL_SEEN_NONE
;
/* UTF-16 categories: the loop below needs at least two bytes left
   and indexes them with `msb' and `lsb'.  */
5201 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
/* NOTE(review): CATEGORY is compared with the bitwise OR of two enum
   constants rather than tested against each of them.  This looks
   unintended -- confirm.  */
5205 msb
= category
== (coding_category_utf_16_le
5206 | coding_category_utf_16_le_nosig
);
5209 while (src
+ 1 < src_end
)
5212 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5217 this_eol
= EOL_SEEN_LF
;
/* A CR counts as CRLF only when a complete following two-byte unit
   exists and encodes '\n'.  */
5218 else if (src
+ 3 >= src_end
5219 || src
[msb
+ 2] != 0
5220 || src
[lsb
+ 2] != '\n')
5221 this_eol
= EOL_SEEN_CR
;
5223 this_eol
= EOL_SEEN_CRLF
;
5225 if (eol_seen
== EOL_SEEN_NONE
)
5226 /* This is the first end-of-line. */
5227 eol_seen
= this_eol
;
5228 else if (eol_seen
!= this_eol
)
5230 /* The found type is different from what was found before. */
5231 eol_seen
= EOL_SEEN_LF
;
/* Presumably stops the scan once MAX_EOL_CHECK_COUNT line ends have
   been examined; the guarded statement is not visible here.  */
5234 if (++total
== MAX_EOL_CHECK_COUNT
)
/* All other categories: one byte per character.  */
5242 while (src
< src_end
)
5245 if (c
== '\n' || c
== '\r')
5250 this_eol
= EOL_SEEN_LF
;
5251 else if (src
>= src_end
|| *src
!= '\n')
5252 this_eol
= EOL_SEEN_CR
;
/* CRLF: also step over the '\n'.  */
5254 this_eol
= EOL_SEEN_CRLF
, src
++;
5256 if (eol_seen
== EOL_SEEN_NONE
)
5257 /* This is the first end-of-line. */
5258 eol_seen
= this_eol
;
5259 else if (eol_seen
!= this_eol
)
5261 /* The found type is different from what was found before. */
5262 eol_seen
= EOL_SEEN_LF
;
5265 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Switch CODING to the subsidiary coding system that matches
   EOL_SEEN.  The eol type of CODING->id is indexed as a vector:
   element 0 for EOL_SEEN_LF, 1 for EOL_SEEN_CRLF, 2 for EOL_SEEN_CR.
   The tests are made in that order, so LF wins if several bits are
   set.

   NOTE(review): a caller in this file assigns the result of this
   function to an eol-type variable, but the return statements are
   not visible in this copy.  */
5275 adjust_coding_eol_type (coding
, eol_seen
)
5276 struct coding_system
*coding
;
5279 Lisp_Object eol_type
;
5281 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5282 if (eol_seen
& EOL_SEEN_LF
)
5284 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5287 else if (eol_seen
& EOL_SEEN_CRLF
)
5289 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5292 else if (eol_seen
& EOL_SEEN_CR
)
5294 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5300 /* Detect how a text specified in CODING is encoded. If a coding
5301 system is detected, update fields of CODING by the detected coding
/* Visible outline of detect_coding:

   - The consumed/produced counters of CODING are reset and the
     source pointer is refreshed with coding_set_source.

   - If the type of the coding system is Qundecided, the source is
     scanned for the first byte that has the 0x80 bit set or is one
     of a few control codes (NUL, ISO_CODE_ESC and ISO_CODE_SO are
     visible).  Its offset is stored in CODING->head_ascii.  If such
     a byte exists (head_ascii < src_bytes), the detectors of the
     categories are tried in the order given by coding_priorities.

   - Otherwise, if the category of the coding system is
     coding_category_utf_16_auto, only detect_coding_utf_16 is run.

   A detected coding system is installed with setup_coding_system.

   NOTE(review): braces, local declarations and loop exits of this
   function are missing from this copy, so the exact control flow
   cannot be confirmed here.  */
5305 detect_coding (coding
)
5306 struct coding_system
*coding
;
5308 const unsigned char *src
, *src_end
;
5309 Lisp_Object attrs
, coding_type
;
5311 coding
->consumed
= coding
->consumed_char
= 0;
5312 coding
->produced
= coding
->produced_char
= 0;
5313 coding_set_source (coding
);
5315 src_end
= coding
->source
+ coding
->src_bytes
;
5317 /* If we have not yet decided the text encoding type, detect it
5319 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5323 for (i
= 0, src
= coding
->source
; src
< src_end
; i
++, src
++)
5326 if (c
& 0x80 || (c
< 0x20 && (c
== 0
5327 || c
== ISO_CODE_ESC
5329 || c
== ISO_CODE_SO
)))
5332 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5334 if (coding
->head_ascii
< coding
->src_bytes
)
5336 struct coding_detection_info detect_info
;
5337 enum coding_category category
;
5338 struct coding_system
*this;
5340 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5341 for (i
= 0; i
< coding_category_raw_text
; i
++)
5343 category
= coding_priorities
[i
];
5344 this = coding_categories
+ category
;
5347 /* No coding system of this category is defined. */
5348 detect_info
.rejected
|= (1 << category
);
5350 else if (category
>= coding_category_raw_text
)
/* This category was already examined by an earlier detector call.  */
5352 else if (detect_info
.checked
& (1 << category
))
5354 if (detect_info
.found
& (1 << category
))
5357 else if ((*(this->detector
)) (coding
, &detect_info
)
5358 && detect_info
.found
& (1 << category
))
/* A utf-16-auto match is narrowed to the little- or big-endian
   category according to what the detector left in
   detect_info.found.  */
5360 if (category
== coding_category_utf_16_auto
)
5362 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5363 category
= coding_category_utf_16_le
;
5365 category
= coding_category_utf_16_be
;
/* I below coding_category_raw_text presumably means that the
   priority loop ended early on a match (the loop exit is not visible
   here).  Otherwise raw-text is used when every category was
   rejected, or else a category in priority order that was not
   rejected.  */
5370 if (i
< coding_category_raw_text
)
5371 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5372 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5373 setup_coding_system (Qraw_text
, coding
);
5374 else if (detect_info
.rejected
)
5375 for (i
= 0; i
< coding_category_raw_text
; i
++)
5376 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5378 this = coding_categories
+ coding_priorities
[i
];
5379 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
/* The coding system is already of the utf-16-auto category, so only
   the byte order has to be decided.  When the coding_attr_utf_16_bom
   attribute is a cons, its car is used for little endian and its cdr
   for big endian.  */
5384 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
5385 == coding_category_utf_16_auto
)
5387 Lisp_Object coding_systems
;
5388 struct coding_detection_info detect_info
;
5391 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5392 detect_info
.found
= detect_info
.rejected
= 0;
5393 if (CONSP (coding_systems
)
5394 && detect_coding_utf_16 (coding
, &detect_info
))
5396 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5397 setup_coding_system (XCAR (coding_systems
), coding
);
5398 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
5399 setup_coding_system (XCDR (coding_systems
), coding
);
/* NOTE(review): the line carrying the name of this function is
   missing from this copy.  It is presumably decode_eol, which
   decode_coding calls when the eol type is not Qunix -- confirm.

   Visible outline: the bytes examined are the CODING->produced bytes
   starting at CODING->destination (when CODING->dst_object is nil)
   or at BYTE_POS_ADDR (CODING->dst_pos_byte).  If the eol type is
   still a vector, the bytes are scanned to find out which line ends
   occur and CODING is adjusted with adjust_coding_eol_type.  Then
   the Qmac and Qdos cases are handled, and for Qdos the produced
   counters are reduced by `n'.  */
5407 struct coding_system
*coding
;
5409 Lisp_Object eol_type
;
5410 unsigned char *p
, *pbeg
, *pend
;
5412 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* The statement guarded by this test is not visible here.  */
5413 if (EQ (eol_type
, Qunix
))
5416 if (NILP (coding
->dst_object
))
5417 pbeg
= coding
->destination
;
5419 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
5420 pend
= pbeg
+ coding
->produced
;
/* Eol type not decided yet: collect the kinds of line end present
   as a bit set.  */
5422 if (VECTORP (eol_type
))
5424 int eol_seen
= EOL_SEEN_NONE
;
5426 for (p
= pbeg
; p
< pend
; p
++)
5429 eol_seen
|= EOL_SEEN_LF
;
5430 else if (*p
== '\r')
5432 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5434 eol_seen
|= EOL_SEEN_CRLF
;
5438 eol_seen
|= EOL_SEEN_CR
;
/* More than one kind of line end was seen: treat the text as LF.  */
5441 if (eol_seen
!= EOL_SEEN_NONE
5442 && eol_seen
!= EOL_SEEN_LF
5443 && eol_seen
!= EOL_SEEN_CRLF
5444 && eol_seen
!= EOL_SEEN_CR
)
5445 eol_seen
= EOL_SEEN_LF
;
5446 if (eol_seen
!= EOL_SEEN_NONE
)
5447 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
/* The body of the loop for Qmac is not visible here.  */
5450 if (EQ (eol_type
, Qmac
))
5452 for (p
= pbeg
; p
< pend
; p
++)
5456 else if (EQ (eol_type
, Qdos
))
/* Plain memory destination: walk backwards and close up one byte by
   shifting the tail down with safe_bcopy, shrinking PEND each time.
   The test that selects which bytes to drop is not visible here.  */
5460 if (NILP (coding
->dst_object
))
5462 for (p
= pend
- 2; p
>= pbeg
; p
--)
5465 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
-- - p
- 1);
/* Other destinations: delete one character at the corresponding
   position with del_range_2.  */
5471 for (p
= pend
- 2; p
>= pbeg
; p
--)
5474 int pos_byte
= coding
->dst_pos_byte
+ (p
- pbeg
);
5475 int pos
= BYTE_TO_CHAR (pos_byte
);
5477 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
/* N is presumably the number of bytes removed above; its declaration
   and updates are not visible here.  */
5481 coding
->produced
-= n
;
5482 coding
->produced_char
-= n
;
5487 /* Return a translation table from coding system attribute vector ATTRS
5488 for encoding (ENCODEP is nonzero) or decoding (ENCODEP is zero). */
/* NOTE(review): the test on ENCODEP that selects between the two
   pairs of assignments below is not visible in this copy.  */
5491 get_translation_table (attrs
, encodep
)
5493 Lisp_Object standard
, translation_table
;
5496 translation_table
= CODING_ATTR_ENCODE_TBL (attrs
),
5497 standard
= Vstandard_translation_table_for_encode
;
5499 translation_table
= CODING_ATTR_DECODE_TBL (attrs
),
5500 standard
= Vstandard_translation_table_for_decode
;
/* A non-nil symbol stands for the value of its Qtranslation_table
   property.  */
5501 if (! NILP (translation_table
) && SYMBOLP (translation_table
))
5502 translation_table
= Fget (translation_table
, Qtranslation_table
);
/* No table from the attributes: fall back to the standard one.  */
5503 if (NILP (translation_table
))
5504 translation_table
= standard
;
/* Anything that is not a char-table is reported as Qnil.  */
5505 if (! CHAR_TABLE_P (translation_table
))
5506 translation_table
= Qnil
;
5507 return translation_table
;
/* Replace, in place, the elements of CODING->charbuf (the first
   CODING->charbuf_used of them) by the result of translate_char with
   TABLE.

   NOTE(review): the statement guarded by the chars_at_source test,
   the declaration of `c' and any test applied to `c' before it is
   translated are not visible in this copy.  */
5512 translate_chars (coding
, table
)
5513 struct coding_system
*coding
;
5516 int *charbuf
= coding
->charbuf
;
5517 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5520 if (coding
->chars_at_source
)
5523 while (charbuf
< charbuf_end
)
5529 *charbuf
++ = translate_char (table
, c
);
/* Store text into the destination of CODING and return the number of
   characters produced.

   Visible outline: output starts at CODING->destination +
   CODING->produced.  When CODING->chars_at_source is zero the
   characters come from CODING->charbuf; otherwise the bytes are read
   from CODING->source.  At the end CODING->produced and
   CODING->produced_char are advanced, and insert_from_gap is called
   when CODING->dst_object is a buffer.

   NOTE(review): many lines of this function (braces, declarations,
   else lines, the closing of one comment) are missing from this
   copy, so the control flow cannot be confirmed here.  */
5534 produce_chars (coding
)
5535 struct coding_system
*coding
;
5537 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5538 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5540 int produced_chars
= 0;
5542 if (! coding
->chars_at_source
)
5544 /* Characters are in coding->charbuf. */
5545 int *buf
= coding
->charbuf
;
5546 int *buf_end
= buf
+ coding
->charbuf_used
;
5547 unsigned char *adjusted_dst_end
;
/* Source and destination are the same buffer: the end of the
   writable area is set to the end of the consumed source bytes.  */
5549 if (BUFFERP (coding
->src_object
)
5550 && EQ (coding
->src_object
, coding
->dst_object
))
5551 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
/* Keep MAX_MULTIBYTE_LENGTH bytes in reserve below DST_END.  */
5552 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5554 while (buf
< buf_end
)
/* Reserve exhausted: ask alloc_destination for more room, then
   recompute both end pointers.  */
5558 if (dst
>= adjusted_dst_end
)
5560 dst
= alloc_destination (coding
,
5561 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5563 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5564 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
/* With a multibyte destination, or for a character that is not a
   byte8 character, store the character with CHAR_STRING_ADVANCE.
   The other statement below stores CHAR_TO_BYTE8 (c) as one byte.  */
5568 if (coding
->dst_multibyte
5569 || ! CHAR_BYTE8_P (c
))
5570 CHAR_STRING_ADVANCE (c
, dst
);
5572 *dst
++ = CHAR_TO_BYTE8 (c
);
5576 /* This is an annotation datum. (-C) is the length of
5583 const unsigned char *src
= coding
->source
;
5584 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5585 Lisp_Object eol_type
;
5587 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5589 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5591 if (coding
->src_multibyte
)
5598 const unsigned char *src_base
= src
;
5604 if (EQ (eol_type
, Qdos
))
5608 record_conversion_result
5609 (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5610 goto no_more_source
;
5615 else if (EQ (eol_type
, Qmac
))
5620 coding
->consumed
= src
- coding
->source
;
5622 if (EQ (coding
->src_object
, coding
->dst_object
))
5623 dst_end
= (unsigned char *) src
;
5626 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5628 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5629 coding_set_source (coding
);
5630 src
= coding
->source
+ coding
->consumed
;
5631 src_end
= coding
->source
+ coding
->src_bytes
;
5641 while (src
< src_end
)
5648 if (EQ (eol_type
, Qdos
))
5654 else if (EQ (eol_type
, Qmac
))
5657 if (dst
>= dst_end
- 1)
5659 coding
->consumed
= src
- coding
->source
;
5661 if (EQ (coding
->src_object
, coding
->dst_object
))
5662 dst_end
= (unsigned char *) src
;
5663 if (dst
>= dst_end
- 1)
5665 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5667 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5668 coding_set_source (coding
);
5669 src
= coding
->source
+ coding
->consumed
;
5670 src_end
= coding
->source
+ coding
->src_bytes
;
5678 if (!EQ (coding
->src_object
, coding
->dst_object
))
5680 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5684 EMACS_INT offset
= src
- coding
->source
;
5686 dst
= alloc_destination (coding
, require
, dst
);
5687 coding_set_source (coding
);
5688 src
= coding
->source
+ offset
;
5689 src_end
= coding
->source
+ coding
->src_bytes
;
5692 produced_chars
= coding
->src_chars
;
5693 while (src
< src_end
)
5699 if (EQ (eol_type
, Qdos
))
5706 else if (EQ (eol_type
, Qmac
))
5712 coding
->consumed
= coding
->src_bytes
;
5713 coding
->consumed_char
= coding
->src_chars
;
5716 produced
= dst
- (coding
->destination
+ coding
->produced
);
5717 if (BUFFERP (coding
->dst_object
))
5718 insert_from_gap (produced_chars
, produced
);
5719 coding
->produced
+= produced
;
5720 coding
->produced_char
+= produced_chars
;
5721 return produced_chars
;
5724 /* Compose text in CODING->object according to the annotation data at
5725 CHARBUF. CHARBUF is an array:
5726 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
/* Visible outline of produce_composition: CHARBUF[2] and CHARBUF[3]
   are added to CODING->dst_pos to give the range FROM..TO, and
   CHARBUF[4] is the composition method.  Unless the method is
   COMPOSITION_RELATIVE, a component object is built from LEN
   elements of CHARBUF: a string for COMPOSITION_WITH_ALTCHARS, a
   vector otherwise.  The text is then composed with compose_text in
   CODING->dst_object.

   NOTE(review): the statement guarded by the COMPOSITION_RELATIVE
   test, the declarations of FROM, TO, LEN and I, and any adjustment
   of CHARBUF before the loop are not visible in this copy.  */
5730 produce_composition (coding
, charbuf
)
5731 struct coding_system
*coding
;
5736 enum composition_method method
;
5737 Lisp_Object components
;
5740 from
= coding
->dst_pos
+ charbuf
[2];
5741 to
= coding
->dst_pos
+ charbuf
[3];
5742 method
= (enum composition_method
) (charbuf
[4]);
5744 if (method
== COMPOSITION_RELATIVE
)
5748 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5753 for (i
= 0; i
< len
; i
++)
5754 args
[i
] = make_number (charbuf
[i
]);
5755 components
= (method
== COMPOSITION_WITH_ALTCHARS
5756 ? Fstring (len
, args
) : Fvector (len
, args
));
5758 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5762 /* Put `charset' property on text in CODING->object according to
5763 the annotation data at CHARBUF. CHARBUF is an array:
5764 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
/* Visible outline of produce_charset: put the text property Qcharset
   on the range CODING->dst_pos + CHARBUF[2] .. CODING->dst_pos +
   CHARBUF[3] of CODING->dst_object.  The property value is the name
   of the charset whose ID is CHARBUF[4].  */
5768 produce_charset (coding
, charbuf
)
5769 struct coding_system
*coding
;
5772 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5773 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5774 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5776 Fput_text_property (make_number (from
), make_number (to
),
5777 Qcharset
, CHARSET_NAME (charset
),
5778 coding
->dst_object
);
/* CHARBUF_SIZE is the initial number of `int' elements requested for
   CODING->charbuf by ALLOC_CONVERSION_WORK_AREA.

   ALLOC_CONVERSION_WORK_AREA obtains CODING->charbuf with alloca and
   stores the element count in CODING->charbuf_size.  If
   CODING->charbuf is still null afterwards, it records
   CODING_RESULT_INSUFFICIENT_MEM and executes `return
   coding->result', i.e. it returns from the function that uses the
   macro.

   NOTE(review): there is a stray second semicolon after
   CHARBUF_SIZE.  Also, the result of alloca is tested against NULL;
   alloca does not normally report failure that way, so confirm that
   the retry and failure paths are reachable.  Some lines of the
   macro are not visible in this copy.  */
5782 #define CHARBUF_SIZE 0x4000
5784 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5786 int size = CHARBUF_SIZE;; \
5788 coding->charbuf = NULL; \
5789 while (size > 1024) \
5791 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5792 if (coding->charbuf) \
5796 if (! coding->charbuf) \
5798 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5799 return coding->result; \
5801 coding->charbuf_size = size; \
/* Walk the first CODING->charbuf_used elements of CODING->charbuf
   and hand annotation records to produce_composition or
   produce_charset according to their annotation mask.

   NOTE(review): the statement guarded by the NILP test, the test
   that recognizes an annotation record, the switch expression and
   the code that advances CHARBUF are not visible in this copy.  */
5806 produce_annotation (coding
)
5807 struct coding_system
*coding
;
5809 int *charbuf
= coding
->charbuf
;
5810 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5812 if (NILP (coding
->dst_object
))
5815 while (charbuf
< charbuf_end
)
/* The first element of a record holds its length, negated.  */
5821 int len
= -*charbuf
;
5824 case CODING_ANNOTATE_COMPOSITION_MASK
:
5825 produce_composition (coding
, charbuf
);
5827 case CODING_ANNOTATE_CHARSET_MASK
:
5828 produce_charset (coding
, charbuf
);
5838 /* Decode the data at CODING->src_object into CODING->dst_object.
5839 CODING->src_object is a buffer, a string, or nil.
5840 CODING->dst_object is a buffer.
5842 If CODING->src_object is a buffer, it must be the current buffer.
5843 In this case, if CODING->src_pos is positive, it is a position of
5844 the source text in the buffer, otherwise, the source text is in the
5845 gap area of the buffer, and CODING->src_pos specifies the offset of
5846 the text from GPT (which must be the same as PT). If this is the
5847 same buffer as CODING->dst_object, CODING->src_pos must be
5850 If CODING->src_object is a string, CODING->src_pos is an index to
5853 If CODING->src_object is nil, CODING->source must already point to
5854 the non-relocatable memory area. In this case, CODING->src_pos is
5855 an offset from CODING->source.
5857 The decoded data is inserted at the current point of the buffer
/* Visible outline of decode_coding:

   1. If the source is a buffer range that straddles the gap, the gap
      is moved to CODING->src_pos.
   2. If the destination is a buffer, it is made current, the gap is
      moved to point, and the undo list is saved and replaced by Qt.
   3. The counters of CODING are reset and the work area is allocated
      with ALLOC_CONVERSION_WORK_AREA, which may return from this
      function.
   4. In a loop: the decoder is called, the characters are translated
      if a translation table is in effect, and produce_chars and (if
      needed) produce_annotation are called.  The loop continues
      while source bytes remain and CODING->result is zero.
   5. Bytes left unconsumed are either flushed through the work area
      (when CODING_MODE_LAST_BLOCK is set; a byte with the 0x80 bit
      set is stored negated) or recorded in CODING->carryover.
   6. The undo list is restored, the insertion is recorded,
      decode_eol is called unless the eol type is Qunix, and
      CODING->result is returned.

   NOTE(review): braces, the `do' line, some declarations and the
   byte-copying statements are missing from this copy.  */
5862 decode_coding (coding
)
5863 struct coding_system
*coding
;
5866 Lisp_Object undo_list
;
5867 Lisp_Object translation_table
;
5869 if (BUFFERP (coding
->src_object
)
5870 && coding
->src_pos
> 0
5871 && coding
->src_pos
< GPT
5872 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5873 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5876 if (BUFFERP (coding
->dst_object
))
5878 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5879 set_buffer_internal (XBUFFER (coding
->dst_object
));
5881 move_gap_both (PT
, PT_BYTE
);
/* The undo list is set aside here and put back near the end of the
   function.  */
5882 undo_list
= current_buffer
->undo_list
;
5883 current_buffer
->undo_list
= Qt
;
5886 coding
->consumed
= coding
->consumed_char
= 0;
5887 coding
->produced
= coding
->produced_char
= 0;
5888 coding
->chars_at_source
= 0;
5889 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5892 ALLOC_CONVERSION_WORK_AREA (coding
);
5894 attrs
= CODING_ID_ATTRS (coding
->id
);
/* NOTE(review): this is the decoding path, yet ENCODEP is passed as
   1, the same value encode_coding passes.  The comment on
   get_translation_table says a nonzero ENCODEP selects the table for
   encoding.  Confirm whether 0 was intended.  */
5895 translation_table
= get_translation_table (attrs
, 1);
5899 coding_set_source (coding
);
5900 coding
->annotated
= 0;
5901 (*(coding
->decoder
)) (coding
);
5902 if (!NILP (translation_table
))
5903 translate_chars (coding
, translation_table
);
5904 coding_set_destination (coding
);
5905 produce_chars (coding
);
5906 if (coding
->annotated
)
5907 produce_annotation (coding
);
5909 while (coding
->consumed
< coding
->src_bytes
5910 && ! coding
->result
);
/* Deal with source bytes the decoder left unconsumed.  */
5912 coding
->carryover_bytes
= 0;
5913 if (coding
->consumed
< coding
->src_bytes
)
5915 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5916 const unsigned char *src
;
5918 coding_set_source (coding
);
5919 coding_set_destination (coding
);
5920 src
= coding
->source
+ coding
->consumed
;
5922 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5924 /* Flush out unprocessed data as binary chars. We are sure
5925 that the number of data is less than the size of
5927 coding
->charbuf_used
= 0;
5928 while (nbytes
-- > 0)
5932 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5934 produce_chars (coding
);
5938 /* Record unprocessed bytes in coding->carryover. We are
5939 sure that the number of data is less than the size of
5940 coding->carryover. */
5941 unsigned char *p
= coding
->carryover
;
5943 coding
->carryover_bytes
= nbytes
;
5944 while (nbytes
-- > 0)
5947 coding
->consumed
= coding
->src_bytes
;
/* Put the undo list back and record the whole insertion.  */
5950 if (BUFFERP (coding
->dst_object
))
5952 current_buffer
->undo_list
= undo_list
;
5953 record_insert (coding
->dst_pos
, coding
->produced_char
);
5955 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5956 decode_eol (coding
);
5957 return coding
->result
;
5961 /* Extract an annotation datum from a composition starting at POS and
5962 ending before LIMIT of CODING->src_object (buffer or string), store
5963 the data in BUF, set *STOP to a starting position of the next
5964 composition (if any) or to LIMIT, and return the address of the
5965 next element of BUF.
5967 If such an annotation is not found, set *STOP to a starting
5968 position of a composition after POS (if any) or to LIMIT, and
/* Visible outline of handle_composition_annotation: find_composition
   is asked for a composition between POS and LIMIT in
   CODING->src_object.  When one is found, ADD_COMPOSITION_DATA
   writes a record to BUF.  For methods other than
   COMPOSITION_RELATIVE the components are appended to BUF; they may
   be a vector, a string, an integer or a list.  Finally
   find_composition is called again from END.

   NOTE(review): the declarations of PROP, LEN, I and I_BYTE, the
   statements that set *STOP, and the return statements are not
   visible in this copy.  */
5972 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5973 EMACS_INT pos
, limit
;
5974 struct coding_system
*coding
;
5978 EMACS_INT start
, end
;
5981 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5984 else if (start
> pos
)
5990 /* We found a composition. Store the corresponding
5991 annotation data in BUF. */
5993 enum composition_method method
= COMPOSITION_METHOD (prop
);
5994 int nchars
= COMPOSITION_LENGTH (prop
);
5996 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5997 if (method
!= COMPOSITION_RELATIVE
)
5999 Lisp_Object components
;
6002 components
= COMPOSITION_COMPONENTS (prop
);
/* Vector: one element of BUF per vector element.  */
6003 if (VECTORP (components
))
6005 len
= XVECTOR (components
)->size
;
6006 for (i
= 0; i
< len
; i
++)
6007 *buf
++ = XINT (AREF (components
, i
));
/* String: the characters are fetched with
   FETCH_STRING_CHAR_ADVANCE.  */
6009 else if (STRINGP (components
))
6011 len
= SCHARS (components
);
6015 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
/* Integer: a single component.  */
6019 else if (INTEGERP (components
))
6022 *buf
++ = XINT (components
);
/* List: one element of BUF per list element; LEN counts them.  */
6024 else if (CONSP (components
))
6026 for (len
= 0; CONSP (components
);
6027 len
++, components
= XCDR (components
))
6028 *buf
++ = XINT (XCAR (components
));
6036 if (find_composition (end
, limit
, &start
, &end
, &prop
,
6047 /* Extract an annotation datum from a text property `charset' at POS of
6048 CODING->src_object (buffer or string), store the data in BUF, set
6049 *STOP to the position where the value of `charset' property changes
6050 (limiting by LIMIT), and return the address of the next element of
6053 If the property value is nil, set *STOP to the position where the
6054 property value is non-nil (limiting by LIMIT), and return BUF. */
/* NOTE(review): the declaration of ID, the line that handles a
   property value that is not a charset, and the return statement are
   not visible in this copy.  */
6057 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
6058 EMACS_INT pos
, limit
;
6059 struct coding_system
*coding
;
6063 Lisp_Object val
, next
;
6066 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
/* Only a non-nil value that is a charset supplies an ID here.  */
6067 if (! NILP (val
) && CHARSETP (val
))
6068 id
= XINT (CHARSET_SYMBOL_ID (val
));
6071 ADD_CHARSET_DATA (buf
, 0, 0, id
);
/* *STOP becomes the next position, no later than LIMIT, at which the
   `charset' property changes.  */
6072 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
6074 make_number (limit
));
6075 *stop
= XINT (next
);
/* Visible outline of consume_chars: characters are read from the
   unconsumed part of the source of CODING into CODING->charbuf.
   Annotation records are inserted at the positions where a
   composition or a `charset' property starts.  At the end
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated and CODING->chars_at_source is cleared.

   NOTE(review): braces, else lines, the declarations of C and BYTES
   and the end-of-line handling statements are missing from this
   copy.  */
6081 consume_chars (coding
)
6082 struct coding_system
*coding
;
6084 int *buf
= coding
->charbuf
;
6085 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6086 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6087 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6088 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6089 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6090 int multibytep
= coding
->src_multibyte
;
6091 Lisp_Object eol_type
;
6093 EMACS_INT stop
, stop_composition
, stop_charset
;
6095 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6096 if (VECTORP (eol_type
))
6099 /* Note: composition handling is not yet implemented. */
6100 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Without a source object there is nothing to annotate: every stop
   position is the end of the text.  */
6102 if (NILP (coding
->src_object
))
6103 stop
= stop_composition
= stop_charset
= end_pos
;
/* An annotation kind that is enabled in common_flags is checked
   starting at POS; a disabled one gets END_POS as its stop
   position.  */
6106 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6107 stop
= stop_composition
= pos
;
6109 stop
= stop_composition
= end_pos
;
6110 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6111 stop
= stop_charset
= pos
;
6113 stop_charset
= end_pos
;
6116 /* Compensate for CRLF and conversion. */
6117 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6118 while (buf
< buf_end
)
/* At a stop position, let the handlers emit their annotation and
   move their stop position forward; STOP is the nearer of the
   two.  */
6124 if (pos
== stop_composition
)
6125 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6126 buf
, &stop_composition
);
6127 if (pos
== stop_charset
)
6128 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6129 buf
, &stop_charset
);
6130 stop
= (stop_composition
< stop_charset
6131 ? stop_composition
: stop_charset
);
6138 if (! CODING_FOR_UNIBYTE (coding
)
6139 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6140 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6145 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6146 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6148 if (! EQ (eol_type
, Qunix
))
6152 if (EQ (eol_type
, Qdos
))
6161 coding
->consumed
= src
- coding
->source
;
6162 coding
->consumed_char
= pos
- coding
->src_pos
;
6163 coding
->charbuf_used
= buf
- coding
->charbuf
;
6164 coding
->chars_at_source
= 0;
6168 /* Encode the text at CODING->src_object into CODING->dst_object.
6169 CODING->src_object is a buffer or a string.
6170 CODING->dst_object is a buffer or nil.
6172 If CODING->src_object is a buffer, it must be the current buffer.
6173 In this case, if CODING->src_pos is positive, it is a position of
6174 the source text in the buffer, otherwise, the source text is in the
6175 gap area of the buffer, and coding->src_pos specifies the offset of
6176 the text from GPT (which must be the same as PT). If this is the
6177 same buffer as CODING->dst_object, CODING->src_pos must be
6178 negative and CODING should not have `pre-write-conversion'.
6180 If CODING->src_object is a string, CODING should not have
6181 `pre-write-conversion'.
6183 If CODING->dst_object is a buffer, the encoded data is inserted at
6184 the current point of that buffer.
6186 If CODING->dst_object is nil, the encoded data is placed at the
6187 memory area specified by CODING->destination. */
/* NOTE(review): braces, the `do' line and the declaration of ATTRS
   are not visible in this copy.  */
6190 encode_coding (coding
)
6191 struct coding_system
*coding
;
6194 Lisp_Object translation_table
;
6196 attrs
= CODING_ID_ATTRS (coding
->id
);
/* ENCODEP is 1: the translation table for encoding is requested.  */
6197 translation_table
= get_translation_table (attrs
, 1);
/* A buffer destination is made current, and its multibyteness
   decides CODING->dst_multibyte.  */
6199 if (BUFFERP (coding
->dst_object
))
6201 set_buffer_internal (XBUFFER (coding
->dst_object
));
6202 coding
->dst_multibyte
6203 = ! NILP (current_buffer
->enable_multibyte_characters
);
6206 coding
->consumed
= coding
->consumed_char
= 0;
6207 coding
->produced
= coding
->produced_char
= 0;
6208 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
/* May return from this function; see the macro definition.  */
6211 ALLOC_CONVERSION_WORK_AREA (coding
);
/* Each round: read characters into the work area, translate them if
   a table is in effect, then call the encoder.  This repeats until
   all source characters are consumed.  */
6214 coding_set_source (coding
);
6215 consume_chars (coding
);
6217 if (!NILP (translation_table
))
6218 translate_chars (coding
, translation_table
);
6220 coding_set_destination (coding
);
6221 (*(coding
->encoder
)) (coding
);
6222 } while (coding
->consumed_char
< coding
->src_chars
);
6224 if (BUFFERP (coding
->dst_object
))
6225 insert_from_gap (coding
->produced_char
, coding
->produced
);
6227 return (coding
->result
);
6231 /* Name (or base name) of work buffer for code conversion. */
6232 static Lisp_Object Vcode_conversion_workbuf_name
;
6234 /* A working buffer used by the top level conversion. Once it is
6235 created, it is never destroyed. It has the name
6236 Vcode_conversion_workbuf_name. The other working buffers are
6237 destroyed after the use is finished, and their names are modified
6238 versions of Vcode_conversion_workbuf_name. */
6239 static Lisp_Object Vcode_conversion_reused_workbuf
;
6241 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
/* NOTE(review): make_conversion_work_buffer post-increments this
   variable on every call, so it can exceed 1; code_conversion_restore
   resets it to 0.  */
6242 static int reused_workbuf_in_use
;
6245 /* Return a working buffer of code conversion. MULTIBYTE specifies the
6246 multibyteness of returning buffer. */
/* NOTE(review): the else line between the two ways of obtaining the
   buffer and the return statement are not visible in this copy.  */
6249 make_conversion_work_buffer (multibyte
)
6251 Lisp_Object name
, workbuf
;
6252 struct buffer
*current
;
/* The flag was already non-zero: use a buffer with a freshly
   generated name.  */
6254 if (reused_workbuf_in_use
++)
6256 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
6257 workbuf
= Fget_buffer_create (name
);
/* Use the buffer with the base name, and remember it as the reusable
   work buffer if none has been remembered yet.  */
6261 name
= Vcode_conversion_workbuf_name
;
6262 workbuf
= Fget_buffer_create (name
);
6263 if (NILP (Vcode_conversion_reused_workbuf
))
6264 Vcode_conversion_reused_workbuf
= workbuf
;
/* Switch to the work buffer only long enough to set its undo list to
   Qt and to set its multibyteness, then switch back.  */
6266 current
= current_buffer
;
6267 set_buffer_internal (XBUFFER (workbuf
));
6269 current_buffer
->undo_list
= Qt
;
6270 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
6271 set_buffer_internal (current
);
/* Unwind handler registered by code_conversion_save.
   ARG is a cons whose car is the buffer to make current again and
   whose cdr is the work buffer to release, or nil if there is none.
   The reused work buffer is only marked as free; any other work
   buffer is killed if it is still live.  */
6277 code_conversion_restore (arg
)
6280 Lisp_Object current
, workbuf
;
6282 current
= XCAR (arg
);
6283 workbuf
= XCDR (arg
);
6284 if (! NILP (workbuf
))
/* Releasing the reused buffer just clears the in-use counter.  */
6286 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
6287 reused_workbuf_in_use
= 0;
6288 else if (! NILP (Fbuffer_live_p (workbuf
)))
6289 Fkill_buffer (workbuf
);
6291 set_buffer_internal (XBUFFER (current
));
/* Record an unwind-protect entry that calls code_conversion_restore
   with a cons of the current buffer and a work buffer (or nil).
   MULTIBYTE is passed on to make_conversion_work_buffer.
   NOTE(review): the statement guarding the work buffer creation and
   the return statement are not visible in this listing; presumably the
   buffer is created only when WITH_WORK_BUF is nonzero and the work
   buffer is returned, since callers assign the result to a buffer
   field -- confirm against the full source.  */
6296 code_conversion_save (with_work_buf
, multibyte
)
6297 int with_work_buf
, multibyte
;
6299 Lisp_Object workbuf
= Qnil
;
6302 workbuf
= make_conversion_work_buffer (multibyte
);
6303 record_unwind_protect (code_conversion_restore
,
6304 Fcons (Fcurrent_buffer (), workbuf
));
/* Decode CHARS characters occupying BYTES bytes with the current
   buffer as both source and destination: dst_object is copied from
   src_object and the destination position is PT / PT_BYTE.
   Detection runs first when CODING_REQUIRE_DETECTION says so.  If the
   coding system has a post-read function, it is called with the
   number of produced characters, and the produced counts are then
   adjusted by the change in Z and Z_BYTE.  Returns CODING->result.  */
6309 decode_coding_gap (coding
, chars
, bytes
)
6310 struct coding_system
*coding
;
6311 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so that the unwind-protect recorded by
   code_conversion_save can be undone by unbind_to at the end.  */
6313 int count
= specpdl_ptr
- specpdl
;
6316 code_conversion_save (0, 0);
6318 coding
->src_object
= Fcurrent_buffer ();
6319 coding
->src_chars
= chars
;
6320 coding
->src_bytes
= bytes
;
/* NOTE(review): the source position is set to the negated counts;
   presumably this marks the source text as lying in the buffer gap --
   confirm against coding_set_source.  */
6321 coding
->src_pos
= -chars
;
6322 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source is multibyte.  */
6323 coding
->src_multibyte
= chars
< bytes
;
6324 coding
->dst_object
= coding
->src_object
;
6325 coding
->dst_pos
= PT
;
6326 coding
->dst_pos_byte
= PT_BYTE
;
6327 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6328 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6330 if (CODING_REQUIRE_DETECTION (coding
))
6331 detect_coding (coding
);
6333 decode_coding (coding
);
/* The attributes are fetched after decoding, so they belong to the
   coding system id in effect after any detection above.  */
6335 attrs
= CODING_ID_ATTRS (coding
->id
);
6336 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6338 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
/* Call the post-read function with point at the start of the decoded
   text, then account for any size change it made to the buffer.  */
6341 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6342 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6343 make_number (coding
->produced_char
));
6345 coding
->produced_char
+= Z
- prev_Z
;
6346 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6349 unbind_to (count
, Qnil
);
6350 return coding
->result
;
/* Encode CHARS characters occupying BYTES bytes with the current
   buffer as both source and destination: dst_object is copied from
   src_object and the destination position is PT / PT_BYTE.
   Returns CODING->result.  */
6354 encode_coding_gap (coding
, chars
, bytes
)
6355 struct coding_system
*coding
;
6356 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so that the unwind-protect recorded by
   code_conversion_save can be undone by unbind_to at the end.  */
6358 int count
= specpdl_ptr
- specpdl
;
6360 code_conversion_save (0, 0);
6362 coding
->src_object
= Fcurrent_buffer ();
6363 coding
->src_chars
= chars
;
6364 coding
->src_bytes
= bytes
;
/* NOTE(review): negated counts as source position, as in
   decode_coding_gap; presumably this marks the source text as lying
   in the buffer gap -- confirm against coding_set_source.  */
6365 coding
->src_pos
= -chars
;
6366 coding
->src_pos_byte
= -bytes
;
6367 coding
->src_multibyte
= chars
< bytes
;
6368 coding
->dst_object
= coding
->src_object
;
6369 coding
->dst_pos
= PT
;
6370 coding
->dst_pos_byte
= PT_BYTE
;
6372 encode_coding (coding
);
6374 unbind_to (count
, Qnil
);
6375 return coding
->result
;
6379 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6380 SRC_OBJECT into DST_OBJECT by coding context CODING.
6382 SRC_OBJECT is a buffer, a string, or Qnil.
6384 If it is a buffer, the text is at point of the buffer. FROM and TO
6385 are positions in the buffer.
6387 If it is a string, the text is at the beginning of the string.
6388 FROM and TO are indices to the string.
6390 If it is nil, the text is at coding->source. FROM and TO are
6391 indices to coding->source.
6393 DST_OBJECT is a buffer, Qt, or Qnil.
6395 If it is a buffer, the decoded text is inserted at point of the
6396 buffer. If the buffer is the same as SRC_OBJECT, the source text is deleted.
6399 If it is Qt, a string is made from the decoded text, and
6400 set in CODING->dst_object.
6402 If it is Qnil, the decoded text is stored at CODING->destination.
6403 The caller must allocate CODING->dst_bytes bytes at
6404 CODING->destination by xmalloc. If the decoded text is longer than
6405 CODING->dst_bytes, CODING->destination is relocated by xrealloc. */
/* Summary of the visible control flow: fill in the source fields of
   CODING from SRC_OBJECT and the FROM/TO range, run detection when
   required, choose the destination according to DST_OBJECT, call
   decode_coding, run the post-read function if the coding system has
   one, hand the result back (string, caller memory or buffer), and
   finally restore point when source and destination were the same
   buffer.
   NOTE(review): braces, else lines and a few statements are missing
   from this listing, so nesting is described only where the visible
   conditions make it evident.  */
6409 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6411 struct coding_system
*coding
;
6412 Lisp_Object src_object
;
6413 EMACS_INT from
, from_byte
, to
, to_byte
;
6414 Lisp_Object dst_object
;
6416 int count
= specpdl_ptr
- specpdl
;
6417 unsigned char *destination
;
6418 EMACS_INT dst_bytes
;
6419 EMACS_INT chars
= to
- from
;
6420 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays -1 unless point is saved below for the case where
   the source buffer is also the destination.  */
6423 int saved_pt
= -1, saved_pt_byte
;
6425 buffer
= Fcurrent_buffer ();
/* With a nil DST_OBJECT the caller supplies the output area; keep its
   address and size so that it can be grown after decoding.  */
6427 if (NILP (dst_object
))
6429 destination
= coding
->destination
;
6430 dst_bytes
= coding
->dst_bytes
;
6433 coding
->src_object
= src_object
;
6434 coding
->src_chars
= chars
;
6435 coding
->src_bytes
= bytes
;
/* More bytes than characters means the source is multibyte.  */
6436 coding
->src_multibyte
= chars
< bytes
;
6438 if (STRINGP (src_object
))
6440 coding
->src_pos
= from
;
6441 coding
->src_pos_byte
= from_byte
;
6443 else if (BUFFERP (src_object
))
6445 set_buffer_internal (XBUFFER (src_object
));
6447 move_gap_both (from
, from_byte
);
/* Decoding a buffer into itself: save point, delete the source range,
   and address the source with negated counts (presumably meaning that
   the deleted text is read from the gap -- confirm).  */
6448 if (EQ (src_object
, dst_object
))
6450 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6451 TEMP_SET_PT_BOTH (from
, from_byte
);
6452 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6453 coding
->src_pos
= -chars
;
6454 coding
->src_pos_byte
= -bytes
;
6458 coding
->src_pos
= from
;
6459 coding
->src_pos_byte
= from_byte
;
6463 if (CODING_REQUIRE_DETECTION (coding
))
6464 detect_coding (coding
);
6465 attrs
= CODING_ID_ATTRS (coding
->id
);
/* A work buffer becomes the destination when a string result is
   requested (DST_OBJECT is Qt), or when a post-read function has to
   run although DST_OBJECT is nil.  */
6467 if (EQ (dst_object
, Qt
)
6468 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6469 && NILP (dst_object
)))
6471 coding
->dst_object
= code_conversion_save (1, 1);
6472 coding
->dst_pos
= BEG
;
6473 coding
->dst_pos_byte
= BEG_BYTE
;
6474 coding
->dst_multibyte
= 1;
/* Buffer destination: decode at point of that buffer, using its
   multibyteness.  */
6476 else if (BUFFERP (dst_object
))
6478 code_conversion_save (0, 0);
6479 coding
->dst_object
= dst_object
;
6480 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6481 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6482 coding
->dst_multibyte
6483 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
/* Remaining case: no destination object; the decoded text is
   multibyte.  */
6487 code_conversion_save (0, 0);
6488 coding
->dst_object
= Qnil
;
6489 coding
->dst_multibyte
= 1;
6492 decode_coding (coding
);
6494 if (BUFFERP (coding
->dst_object
))
6495 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* Run the post-read function with point at the start of the decoded
   text, protecting the source and destination objects from GC, and
   account for any size change it made (difference in Z and Z_BYTE).  */
6497 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6499 struct gcpro gcpro1
, gcpro2
;
6500 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6503 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6504 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6505 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6506 make_number (coding
->produced_char
));
6509 coding
->produced_char
+= Z
- prev_Z
;
6510 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
/* Qt destination: the result is the text of the current buffer as a
   string.  */
6513 if (EQ (dst_object
, Qt
))
6515 coding
->dst_object
= Fbuffer_string ();
/* Nil destination although a work buffer was used.  The visible
   statements compare dst_bytes with the produced size, call xrealloc,
   record CODING_RESULT_INSUFFICIENT_DST (presumably on the allocation
   failure path), and bcopy the produced bytes starting at BEGV_ADDR
   into the destination area.  */
6517 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6519 set_buffer_internal (XBUFFER (coding
->dst_object
));
6520 if (dst_bytes
< coding
->produced
)
6523 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6526 record_conversion_result (coding
,
6527 CODING_RESULT_INSUFFICIENT_DST
);
6528 unbind_to (count
, Qnil
);
/* Move the gap to BEGV when it falls inside the produced text;
   presumably so that the bytes to copy are contiguous.  */
6531 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6532 move_gap_both (BEGV
, BEGV_BYTE
);
6533 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6534 coding
->destination
= destination
;
6540 /* This is the case of:
6541 (BUFFERP (src_object) && EQ (src_object, dst_object))
6542 As we have moved PT while replacing the original buffer
6543 contents, we must recover it now. */
6544 set_buffer_internal (XBUFFER (src_object
));
/* Point before the range is unchanged; point inside the range goes to
   FROM; point after the range is shifted by the size difference
   between decoded and original text.  */
6545 if (saved_pt
< from
)
6546 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6547 else if (saved_pt
< from
+ chars
)
6548 TEMP_SET_PT_BOTH (from
, from_byte
);
6549 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6550 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6551 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Presumably the unibyte case: both coordinates move by the byte
   difference.  */
6553 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6554 saved_pt_byte
+ (coding
->produced
- bytes
));
6557 unbind_to (count
, coding
->dst_object
);
/* Encode the text in the range FROM/FROM_BYTE to TO/TO_BYTE of
   SRC_OBJECT into DST_OBJECT by coding context CODING.
   Summary of the visible control flow: fill in the source fields of
   CODING; if the coding system has a pre-write function, copy the
   source text into a work buffer and call the function on it; set up
   the destination according to DST_OBJECT (a buffer, Qt, or anything
   else); call encode_coding; produce a string result when DST_OBJECT
   is Qt; finally restore point when source and destination were the
   same buffer.
   NOTE(review): braces, else lines and a few statements are missing
   from this listing, so nesting is described only where the visible
   conditions make it evident.  */
6562 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6564 struct coding_system
*coding
;
6565 Lisp_Object src_object
;
6566 EMACS_INT from
, from_byte
, to
, to_byte
;
6567 Lisp_Object dst_object
;
6569 int count
= specpdl_ptr
- specpdl
;
6570 EMACS_INT chars
= to
- from
;
6571 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays -1 unless point is saved below for the case where
   the source buffer is also the destination.  */
6574 int saved_pt
= -1, saved_pt_byte
;
6576 buffer
= Fcurrent_buffer ();
6578 coding
->src_object
= src_object
;
6579 coding
->src_chars
= chars
;
6580 coding
->src_bytes
= bytes
;
6581 coding
->src_multibyte
= chars
< bytes
;
6583 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Pre-write conversion: the source text is first copied into a work
   buffer, from a string, from a buffer, or from coding->source.  */
6585 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6587 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
6588 set_buffer_internal (XBUFFER (coding
->src_object
));
6589 if (STRINGP (src_object
))
6590 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6591 else if (BUFFERP (src_object
))
6592 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6594 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
/* When encoding a buffer into itself, save point and delete the
   original range from the source buffer, then return to the work
   buffer.  */
6596 if (EQ (src_object
, dst_object
))
6598 set_buffer_internal (XBUFFER (src_object
));
6599 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6600 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6601 set_buffer_internal (XBUFFER (coding
->src_object
));
/* Call the pre-write function on the whole work buffer.  The source
   becomes whatever buffer is current afterwards, and the source
   fields are recomputed from that buffer.  */
6604 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6605 make_number (BEG
), make_number (Z
));
6606 coding
->src_object
= Fcurrent_buffer ();
6608 move_gap_both (BEG
, BEG_BYTE
);
6609 coding
->src_chars
= Z
- BEG
;
6610 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6611 coding
->src_pos
= BEG
;
6612 coding
->src_pos_byte
= BEG_BYTE
;
6613 coding
->src_multibyte
= Z
< Z_BYTE
;
/* No pre-write function, string source.  */
6615 else if (STRINGP (src_object
))
6617 code_conversion_save (0, 0);
6618 coding
->src_pos
= from
;
6619 coding
->src_pos_byte
= from_byte
;
/* No pre-write function, buffer source.  */
6621 else if (BUFFERP (src_object
))
6623 code_conversion_save (0, 0);
6624 set_buffer_internal (XBUFFER (src_object
));
/* Encoding a buffer into itself: save point and take the value of
   del_range_1 as the new source object, read from position 0.  */
6625 if (EQ (src_object
, dst_object
))
6627 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6628 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6629 coding
->src_pos
= 0;
6630 coding
->src_pos_byte
= 0;
/* Move the gap to FROM when it lies inside the range; presumably so
   that the source bytes are contiguous.  */
6634 if (from
< GPT
&& to
>= GPT
)
6635 move_gap_both (from
, from_byte
);
6636 coding
->src_pos
= from
;
6637 coding
->src_pos_byte
= from_byte
;
6641 code_conversion_save (0, 0);
/* Destination setup.  A buffer destination receives the text at FROM
   when it is also the source, otherwise at its own point.  */
6643 if (BUFFERP (dst_object
))
6645 coding
->dst_object
= dst_object
;
6646 if (EQ (src_object
, dst_object
))
6648 coding
->dst_pos
= from
;
6649 coding
->dst_pos_byte
= from_byte
;
6653 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6654 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6656 coding
->dst_multibyte
6657 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
/* Qt destination: encode into freshly allocated memory whose initial
   size is the number of source characters, but at least one byte.  */
6659 else if (EQ (dst_object
, Qt
))
6661 coding
->dst_object
= Qnil
;
6662 coding
->dst_bytes
= coding
->src_chars
;
6663 if (coding
->dst_bytes
== 0)
6664 coding
->dst_bytes
= 1;
6665 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6666 coding
->dst_multibyte
= 0;
/* Remaining case: no destination object, unibyte output.  */
6670 coding
->dst_object
= Qnil
;
6671 coding
->dst_multibyte
= 0;
6674 encode_coding (coding
);
/* For a Qt destination the result becomes a string: the text of the
   current buffer if the destination ended up being a buffer,
   otherwise a unibyte string made from coding->destination, which is
   then freed.  */
6676 if (EQ (dst_object
, Qt
))
6678 if (BUFFERP (coding
->dst_object
))
6679 coding
->dst_object
= Fbuffer_string ();
6683 = make_unibyte_string ((char *) coding
->destination
,
6685 xfree (coding
->destination
);
6691 /* This is the case of:
6692 (BUFFERP (src_object) && EQ (src_object, dst_object))
6693 As we have moved PT while replacing the original buffer
6694 contents, we must recover it now. */
6695 set_buffer_internal (XBUFFER (src_object
));
/* Point before the range is unchanged; point inside the range goes to
   FROM; point after the range is shifted by the size difference
   between encoded and original text.  */
6696 if (saved_pt
< from
)
6697 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6698 else if (saved_pt
< from
+ chars
)
6699 TEMP_SET_PT_BOTH (from
, from_byte
);
6700 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6701 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6702 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Presumably the unibyte case: both coordinates move by the byte
   difference.  */
6704 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6705 saved_pt_byte
+ (coding
->produced
- bytes
));
6708 unbind_to (count
, Qnil
);
/* Return the name of the coding system registered for the category
   that comes first in coding_priorities.  */
6713 preferred_coding_system ()
6715 int id
= coding_categories
[coding_priorities
[0]].id
;
6717 return CODING_ID_NAME (id
);
6722 /*** 11. Emacs Lisp library functions ***/
6724 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6725 doc
: /* Return t if OBJECT is nil or a coding-system.
6726 See the documentation of `define-coding-system' for information
6727 about coding-system objects. */)
/* nil is accepted in addition to anything CODING_SYSTEM_P accepts.  */
6731 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6734 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6735 Sread_non_nil_coding_system
, 1, 1, 0,
6736 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
/* Read with completion over Vcoding_system_alist, requiring a match
   and using Qcoding_system_history as history.  The loop condition
   below repeats the read while the answer is the empty string; the
   answer is then interned and returned.  */
6743 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6744 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6746 while (SCHARS (val
) == 0);
6747 return (Fintern (val
, Qnil
));
6750 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6751 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6752 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6753 (prompt
, default_coding_system
)
6754 Lisp_Object prompt
, default_coding_system
;
/* A symbol default is replaced by its name string before it is handed
   to Fcompleting_read as the default.  */
6757 if (SYMBOLP (default_coding_system
))
6758 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
6759 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6760 Qt
, Qnil
, Qcoding_system_history
,
6761 default_coding_system
, Qnil
);
/* An empty answer yields nil here.
   NOTE(review): returning the default on null input, as the doc says,
   presumably relies on Fcompleting_read substituting the default --
   confirm.  */
6762 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
6765 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6767 doc
: /* Check validity of CODING-SYSTEM.
6768 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6770 Lisp_Object coding_system
;
/* The argument must be a symbol.  Validity is delegated to
   Fcoding_system_p, which also accepts nil.  */
6772 CHECK_SYMBOL (coding_system
);
6773 if (!NILP (Fcoding_system_p (coding_system
)))
6774 return coding_system
;
/* Not a coding system: signal the error with the offending symbol as
   its data.  */
6776 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6780 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6781 HIGHEST is nonzero, return the coding system of the highest
6782 priority among the detected coding systems. Otherwise return a
6783 list of detected coding systems sorted by their priorities. If
6784 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6785 multibyte form but contains only ASCII and eight-bit chars.
6786 Otherwise, the bytes are raw bytes.
6788 CODING-SYSTEM controls the detection as below:
6790 If it is nil, detect both text-format and eol-format. If the
6791 text-format part of CODING-SYSTEM is already specified
6792 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6793 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6794 detect only text-format. */
6797 detect_coding_system (src
, src_chars
, src_bytes
, highest
, multibytep
,
6799 const unsigned char *src
;
6800 int src_chars
, src_bytes
, highest
;
6802 Lisp_Object coding_system
;
6804 const unsigned char *src_end
= src
+ src_bytes
;
6805 Lisp_Object attrs
, eol_type
;
6807 struct coding_system coding
;
6809 struct coding_detection_info detect_info
;
6810 enum coding_category base_category
;
/* A nil CODING-SYSTEM means Qundecided.  After setup, CODING-SYSTEM is
   replaced by the base name taken from the attributes.  */
6812 if (NILP (coding_system
))
6813 coding_system
= Qundecided
;
6814 setup_coding_system (coding_system
, &coding
);
6815 attrs
= CODING_ID_ATTRS (coding
.id
);
6816 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6817 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6819 coding
.source
= src
;
6820 coding
.src_chars
= src_chars
;
6821 coding
.src_bytes
= src_bytes
;
6822 coding
.src_multibyte
= multibytep
;
6823 coding
.consumed
= 0;
6824 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
/* Start with no category checked, found or rejected.  */
6826 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6828 /* At first, detect text-format if necessary. */
6829 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6830 if (base_category
== coding_category_undecided
)
6832 enum coding_category category
;
6833 struct coding_system
*this;
6836 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6837 for (i
= 0; src
< src_end
; i
++, src
++)
6840 if (c
& 0x80 || (c
< 0x20 && (c
== 0
6841 || c
== ISO_CODE_ESC
6843 || c
== ISO_CODE_SO
)))
/* head_ascii is the number of leading bytes skipped by the scan
   above.  */
6846 coding
.head_ascii
= src
- coding
.source
;
/* Try the categories in priority order.  Categories with no coding
   system defined are marked rejected; categories already checked are
   not run again; otherwise the detector of the category is called.  */
6849 for (i
= 0; i
< coding_category_raw_text
; i
++)
6851 category
= coding_priorities
[i
];
6852 this = coding_categories
+ category
;
6856 /* No coding system of this category is defined. */
6857 detect_info
.rejected
|= (1 << category
);
6859 else if (category
>= coding_category_raw_text
)
6861 else if (detect_info
.checked
& (1 << category
))
6864 && (detect_info
.found
& (1 << category
)))
6869 if ((*(this->detector
)) (&coding
, &detect_info
)
6871 && (detect_info
.found
& (1 << category
)))
/* A hit for the automatic UTF-16 category is narrowed to the little
   or big endian category according to the found mask.  */
6873 if (category
== coding_category_utf_16_auto
)
6875 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6876 category
= coding_category_utf_16_le
;
6878 category
= coding_category_utf_16_be
;
/* Every category rejected: fall back to raw text.  */
6885 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6887 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6888 id
= coding_categories
[coding_category_raw_text
].id
;
6889 val
= Fcons (make_number (id
), Qnil
);
/* Nothing rejected and nothing found: the result stays undecided.  */
6891 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6893 detect_info
.found
= CATEGORY_MASK_ANY
;
6894 id
= coding_categories
[coding_category_undecided
].id
;
6895 val
= Fcons (make_number (id
), Qnil
);
/* Single-result path: use the category that was found, or else the
   first category in priority order that was not rejected.  */
6899 if (detect_info
.found
)
6901 detect_info
.found
= 1 << category
;
6902 val
= Fcons (make_number (this->id
), Qnil
);
6905 for (i
= 0; i
< coding_category_raw_text
; i
++)
6906 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6908 detect_info
.found
= 1 << coding_priorities
[i
];
6909 id
= coding_categories
[coding_priorities
[i
]].id
;
6910 val
= Fcons (make_number (id
), Qnil
);
/* List path: VAL is built by consing while walking the priorities
   from lowest to highest, first for categories that are neither
   rejected nor found, then for the found ones, so found categories
   end up in front.  */
6916 int mask
= detect_info
.rejected
| detect_info
.found
;
6920 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6922 category
= coding_priorities
[i
];
6923 if (! (mask
& (1 << category
)))
6925 found
|= 1 << category
;
6926 id
= coding_categories
[category
].id
;
6927 val
= Fcons (make_number (id
), val
);
6930 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6932 category
= coding_priorities
[i
];
6933 if (detect_info
.found
& (1 << category
))
6935 id
= coding_categories
[category
].id
;
6936 val
= Fcons (make_number (id
), val
);
6939 detect_info
.found
|= found
;
/* The text format is already known to be UTF-16 with automatic
   endianness: only the byte order and signature remain to be
   decided.  */
6942 else if (base_category
== coding_category_utf_16_auto
)
6944 if (detect_coding_utf_16 (&coding
, &detect_info
))
6946 enum coding_category category
;
6947 struct coding_system
*this;
6949 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6950 this = coding_categories
+ coding_category_utf_16_le
;
6951 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6952 this = coding_categories
+ coding_category_utf_16_be
;
6953 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
6954 this = coding_categories
+ coding_category_utf_16_be_nosig
;
6956 this = coding_categories
+ coding_category_utf_16_le_nosig
;
6957 val
= Fcons (make_number (this->id
), Qnil
);
/* Text format fully specified by the caller: report exactly that
   coding system.  */
6962 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6963 val
= Fcons (make_number (coding
.id
), Qnil
);
6966 /* Then, detect eol-format if necessary. */
/* NOTE(review): utf_16_le_eol has no initializer here although its
   two siblings start at -1.  It is assigned below only when a little
   endian UTF-16 category was found or when eol_type is not a vector;
   confirm that it can never be read before that.  */
6968 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
/* A vector eol_type means the eol format is still open, so it is
   detected separately for each family that was found.  */
6971 if (VECTORP (eol_type
))
6973 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6974 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6975 coding_category_raw_text
);
6976 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6977 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6978 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6979 coding_category_utf_16_be
);
6980 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6981 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6982 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6983 coding_category_utf_16_le
);
/* Otherwise the eol format is fixed by the coding system: Qunix maps
   to LF, Qdos to CRLF, and the remaining case to CR.  */
6987 if (EQ (eol_type
, Qunix
))
6988 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6989 else if (EQ (eol_type
, Qdos
))
6990 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6992 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
/* Replace each coding system id in VAL by a name.  When the eol_type
   of that coding system is a vector, element 0, 1 or 2 is chosen for
   LF, CRLF or CR; in all other cases the name of the coding system
   itself is used.  */
6995 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6997 enum coding_category category
;
7000 id
= XINT (XCAR (tail
));
7001 attrs
= CODING_ID_ATTRS (id
);
7002 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7003 eol_type
= CODING_ID_EOL_TYPE (id
);
7004 if (VECTORP (eol_type
))
7006 if (category
== coding_category_utf_16_be
7007 || category
== coding_category_utf_16_be_nosig
)
7008 this_eol
= utf_16_be_eol
;
7009 else if (category
== coding_category_utf_16_le
7010 || category
== coding_category_utf_16_le_nosig
)
7011 this_eol
= utf_16_le_eol
;
7013 this_eol
= normal_eol
;
7015 if (this_eol
== EOL_SEEN_LF
)
7016 XSETCAR (tail
, AREF (eol_type
, 0));
7017 else if (this_eol
== EOL_SEEN_CRLF
)
7018 XSETCAR (tail
, AREF (eol_type
, 1));
7019 else if (this_eol
== EOL_SEEN_CR
)
7020 XSETCAR (tail
, AREF (eol_type
, 2));
7022 XSETCAR (tail
, CODING_ID_NAME (id
));
7025 XSETCAR (tail
, CODING_ID_NAME (id
));
/* With HIGHEST only the first element is returned.  */
7029 return (highest
? XCAR (val
) : val
);
7033 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
7035 doc
: /* Detect coding system of the text in the region between START and END.
7036 Return a list of possible coding systems ordered by priority.
7038 If only ASCII characters are found, it returns a list of single element
7039 `undecided' or its subsidiary coding system according to a detected end-of-line format.
7042 If optional argument HIGHEST is non-nil, return the coding system of
7043 highest priority. */)
7044 (start
, end
, highest
)
7045 Lisp_Object start
, end
, highest
;
7048 int from_byte
, to_byte
;
/* Accept markers as well as numbers, normalize the region, and
   convert the character positions to byte positions.  */
7050 CHECK_NUMBER_COERCE_MARKER (start
);
7051 CHECK_NUMBER_COERCE_MARKER (end
);
7053 validate_region (&start
, &end
);
7054 from
= XINT (start
), to
= XINT (end
);
7055 from_byte
= CHAR_TO_BYTE (from
);
7056 to_byte
= CHAR_TO_BYTE (to
);
/* When the gap lies inside the region it is moved to the end of the
   region; presumably so that the bytes handed to detect_coding_system
   are contiguous.  */
7058 if (from
< GPT
&& to
>= GPT
)
7059 move_gap_both (to
, to_byte
);
/* The multibyteness of the current buffer is passed as the MULTIBYTEP
   argument.  */
7061 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
7062 to
- from
, to_byte
- from_byte
,
7064 !NILP (current_buffer
7065 ->enable_multibyte_characters
),
7069 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
7071 doc
: /* Detect coding system of the text in STRING.
7072 Return a list of possible coding systems ordered by priority.
7074 If only ASCII characters are found, it returns a list of single element
7075 `undecided' or its subsidiary coding system according to a detected end-of-line format.
7078 If optional argument HIGHEST is non-nil, return the coding system of
7079 highest priority. */)
7081 Lisp_Object string
, highest
;
7083 CHECK_STRING (string
);
/* The whole string is examined: its data pointer, character count and
   byte count are passed on, together with HIGHEST as a C boolean and
   the multibyteness of the string.  */
7085 return detect_coding_system (SDATA (string
),
7086 SCHARS (string
), SBYTES (string
),
7087 !NILP (highest
), STRING_MULTIBYTE (string
),
/* Decide whether character C belongs to one of the charsets listed in
   the coding system attributes ATTRS.  C is first mapped through the
   translation table stored in ATTRS when that entry is a char table.
   The value is nonzero iff the charset list was not exhausted.
   NOTE(review): the statement that leaves the loop on a match is not
   visible in this listing.  */
7093 char_encodable_p (c
, attrs
)
7098 struct charset
*charset
;
7099 Lisp_Object translation_table
;
7101 translation_table
= CODING_ATTR_TRANS_TBL (attrs
);
7102 if (CHAR_TABLE_P (translation_table
))
7103 c
= translate_char (translation_table
, c
);
/* Walk the charset ids of the coding system.  */
7104 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7105 CONSP (tail
); tail
= XCDR (tail
))
7107 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7108 if (CHAR_CHARSET_P (c
, charset
))
/* TAIL is nil only when no charset in the list matched.  */
7111 return (! NILP (tail
));
7115 /* Return a list of coding systems that safely encode the text between
7116 START and END. If EXCLUDE is non-nil, it is a list of coding
7117 systems not to check. The returned list doesn't contain any such
7118 coding systems. In any case, if the text contains only ASCII or is
7119 unibyte, return t. */
7121 DEFUN ("find-coding-systems-region-internal",
7122 Ffind_coding_systems_region_internal
,
7123 Sfind_coding_systems_region_internal
, 2, 3, 0,
7124 doc
: /* Internal use only. */)
7125 (start
, end
, exclude
)
7126 Lisp_Object start
, end
, exclude
;
7128 Lisp_Object coding_attrs_list
, safe_codings
;
7129 EMACS_INT start_byte
, end_byte
;
7130 const unsigned char *p
, *pbeg
, *pend
;
7132 Lisp_Object tail
, elt
;
/* START may be a string instead of a buffer position.  A unibyte
   string, or one whose character and byte counts agree, is the
   trivial case mentioned in the header comment (the statement taken in
   that case is not visible in this listing).  */
7134 if (STRINGP (start
))
7136 if (!STRING_MULTIBYTE (start
)
7137 || SCHARS (start
) == SBYTES (start
))
7140 end_byte
= SBYTES (start
);
/* Buffer case: validate the positions, then apply the same trivial
   tests to the buffer (unibyte buffer, or as many bytes as
   characters).  */
7144 CHECK_NUMBER_COERCE_MARKER (start
);
7145 CHECK_NUMBER_COERCE_MARKER (end
);
7146 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7147 args_out_of_range (start
, end
);
7148 if (NILP (current_buffer
->enable_multibyte_characters
))
7150 start_byte
= CHAR_TO_BYTE (XINT (start
));
7151 end_byte
= CHAR_TO_BYTE (XINT (end
));
7152 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* When the gap lies strictly inside the region, move it to whichever
   end of the region is nearer.  */
7155 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7157 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7158 move_gap_both (XINT (start
), start_byte
);
7160 move_gap_both (XINT (end
), end_byte
);
/* Collect the attribute vectors of the candidate coding systems: only
   base coding systems (name equal to the base name) whose type is not
   undecided, with EXCLUDE consulted through Fmemq.  The translation
   table of each candidate is stored into its attributes before it is
   pushed.  */
7164 coding_attrs_list
= Qnil
;
7165 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7167 || NILP (Fmemq (XCAR (tail
), exclude
)))
7171 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7172 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7173 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7175 ASET (attrs
, coding_attr_trans_tbl
,
7176 get_translation_table (attrs
, 1));
7177 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7181 if (STRINGP (start
))
7182 p
= pbeg
= SDATA (start
);
7184 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7185 pend
= p
+ (end_byte
- start_byte
);
/* Trim ASCII bytes from both ends of the text.  */
7187 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7188 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7192 if (ASCII_BYTE_P (*p
))
7196 c
= STRING_CHAR_ADVANCE (p
);
/* Test character C against every remaining candidate.  A candidate
   that fails is unlinked by copying the next cell over the current
   one, or replaced by nil when it is the last cell.  */
7198 charset_map_loaded
= 0;
7199 for (tail
= coding_attrs_list
; CONSP (tail
);)
7204 else if (char_encodable_p (c
, elt
))
7206 else if (CONSP (XCDR (tail
)))
7208 XSETCAR (tail
, XCAR (XCDR (tail
)));
7209 XSETCDR (tail
, XCDR (XCDR (tail
)));
7213 XSETCAR (tail
, Qnil
);
/* charset_map_loaded was cleared before the checks.  If it is set
   now, the text pointers are recomputed from their saved offsets;
   presumably because loading a charset map may relocate the text --
   confirm.  */
7217 if (charset_map_loaded
)
7219 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7221 if (STRINGP (start
))
7222 pbeg
= SDATA (start
);
7224 pbeg
= BYTE_POS_ADDR (start_byte
);
7225 p
= pbeg
+ p_offset
;
7226 pend
= pbeg
+ pend_offset
;
/* Return the base names of the candidates that survived.  */
7231 safe_codings
= Qnil
;
7232 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7233 if (! NILP (XCAR (tail
)))
7234 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7236 return safe_codings
;
/* Lisp primitive unencodable-char-position.
   NOTE(review): the opening and closing lines of the doc comment and
   many statements of the body are missing from this listing; the
   comments added below describe only the visible statements.  */
7240 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7241 Sunencodable_char_position
, 3, 5, 0,
7243 Return position of first un-encodable character in a region.
7244 START and END specfiy the region and CODING-SYSTEM specifies the
7245 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7247 If optional 4th argument COUNT is non-nil, it specifies at most how
7248 many un-encodable characters to search. In this case, the value is a
7251 If optional 5th argument STRING is non-nil, it is a string to search
7252 for un-encodable characters. In that case, START and END are indexes
7254 (start
, end
, coding_system
, count
, string
)
7255 Lisp_Object start
, end
, coding_system
, count
, string
;
7258 struct coding_system coding
;
7259 Lisp_Object attrs
, charset_list
, translation_table
;
7260 Lisp_Object positions
;
7262 const unsigned char *p
, *stop
, *pend
;
7263 int ascii_compatible
;
/* Validate CODING-SYSTEM and fetch what the scan needs from its
   attributes: the ASCII compatibility flag, the charset list and the
   translation table.  A raw-text coding system is special-cased (the
   statement taken is not visible in this listing).  */
7265 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7266 attrs
= CODING_ID_ATTRS (coding
.id
);
7267 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7269 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7270 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7271 translation_table
= get_translation_table (attrs
, 1);
/* Buffer case.  A unibyte buffer, or an ASCII compatible coding
   system with as many bytes as characters in the region, is tested
   first.  */
7275 validate_region (&start
, &end
);
7276 from
= XINT (start
);
7278 if (NILP (current_buffer
->enable_multibyte_characters
)
7279 || (ascii_compatible
7280 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7282 p
= CHAR_POS_ADDR (from
);
7283 pend
= CHAR_POS_ADDR (to
);
7284 if (from
< GPT
&& to
>= GPT
)
/* String case: START and END index into STRING and are range
   checked against its length.  */
7291 CHECK_STRING (string
);
7292 CHECK_NATNUM (start
);
7294 from
= XINT (start
);
7297 || to
> SCHARS (string
))
7298 args_out_of_range_3 (string
, start
, end
);
7299 if (! STRING_MULTIBYTE (string
))
7301 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7302 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7303 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7311 CHECK_NATNUM (count
);
/* With an ASCII compatible coding system, ASCII bytes are stepped
   over without decoding.  */
7320 if (ascii_compatible
)
7321 while (p
< stop
&& ASCII_BYTE_P (*p
))
/* A character is unencodable when it is not an ASCII character under
   an ASCII compatible coding system and char_charset finds no charset
   for its translation in the charset list.  Its position is pushed
   onto POSITIONS.  */
7331 c
= STRING_CHAR_ADVANCE (p
);
7332 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7333 && ! char_charset (translate_char (translation_table
, c
),
7334 charset_list
, NULL
))
7336 positions
= Fcons (make_number (from
), positions
);
/* Without COUNT the value is the car of POSITIONS; with COUNT it is
   the list put back into ascending order.  */
7345 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
7349 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7350 Scheck_coding_systems_region
, 3, 3, 0,
7351 doc
: /* Check if the region is encodable by coding systems.
7353 START and END are buffer positions specifying the region.
7354 CODING-SYSTEM-LIST is a list of coding systems to check.
7356 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7357 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7358 whole region, POS0, POS1, ... are buffer positions where non-encodable
7359 characters are found.
7361 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7364 START may be a string. In that case, check if the string is
7365 encodable, and the value contains indices to the string instead of
7366 buffer positions. END is ignored. */)
7367 (start
, end
, coding_system_list
)
7368 Lisp_Object start
, end
, coding_system_list
;
7371 EMACS_INT start_byte
, end_byte
;
7373 const unsigned char *p
, *pbeg
, *pend
;
7375 Lisp_Object tail
, elt
, attrs
;
7377 if (STRINGP (start
))
/* NOTE(review): this guard looks unsatisfiable.  It requires a
   unibyte string whose character count differs from its byte count,
   whereas the corresponding test in
   Ffind_coding_systems_region_internal is written with || and ==
   (unibyte, or as many bytes as characters).  Presumably the same
   trivial-case test was intended here -- confirm and fix in the full
   source.  */
7379 if (!STRING_MULTIBYTE (start
)
7380 && SCHARS (start
) != SBYTES (start
))
7383 end_byte
= SBYTES (start
);
/* Buffer case: validate the positions, then test for a unibyte buffer
   or a region with as many bytes as characters.  */
7388 CHECK_NUMBER_COERCE_MARKER (start
);
7389 CHECK_NUMBER_COERCE_MARKER (end
);
7390 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7391 args_out_of_range (start
, end
);
7392 if (NILP (current_buffer
->enable_multibyte_characters
))
7394 start_byte
= CHAR_TO_BYTE (XINT (start
));
7395 end_byte
= CHAR_TO_BYTE (XINT (end
));
7396 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* When the gap lies strictly inside the region, move it to whichever
   end of the region is nearer.  */
7399 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7401 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7402 move_gap_both (XINT (start
), start_byte
);
7404 move_gap_both (XINT (end
), end_byte
);
/* Build the working list.  Each entry has the form
   (CODING-SYSTEM ATTRS), with the translation table stored into ATTRS
   first.  Positions of unencodable characters are later pushed after
   ATTRS.  */
7410 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7413 attrs
= AREF (CODING_SYSTEM_SPEC (elt
), 0);
7414 ASET (attrs
, coding_attr_trans_tbl
, get_translation_table (attrs
, 1));
7415 list
= Fcons (Fcons (elt
, Fcons (attrs
, Qnil
)), list
);
7418 if (STRINGP (start
))
7419 p
= pbeg
= SDATA (start
);
7421 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7422 pend
= p
+ (end_byte
- start_byte
);
/* Trim ASCII bytes from both ends of the text; POS advances together
   with the leading bytes.  */
7424 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7425 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7429 if (ASCII_BYTE_P (*p
))
7433 c
= STRING_CHAR_ADVANCE (p
);
/* Record POS for every coding system that cannot encode C.  */
7435 charset_map_loaded
= 0;
7436 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7438 elt
= XCDR (XCAR (tail
));
7439 if (! char_encodable_p (c
, XCAR (elt
)))
7440 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* charset_map_loaded was cleared before the checks.  If it is set
   now, the text pointers are recomputed from their saved offsets;
   presumably because loading a charset map may relocate the text --
   confirm.  */
7442 if (charset_map_loaded
)
7444 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7446 if (STRINGP (start
))
7447 pbeg
= SDATA (start
);
7449 pbeg
= BYTE_POS_ADDR (start_byte
);
7450 p
= pbeg
+ p_offset
;
7451 pend
= pbeg
+ pend_offset
;
/* Build the result from the entries that collected at least one
   position, dropping ATTRS and putting the positions back into
   ascending order.  */
7459 for (; CONSP (tail
); tail
= XCDR (tail
))
7462 if (CONSP (XCDR (XCDR (elt
))))
7463 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7472 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7473 Lisp_Object start
, end
, coding_system
, dst_object
;
7474 int encodep
, norecord
;
7476 struct coding_system coding
;
7477 EMACS_INT from
, from_byte
, to
, to_byte
;
7478 Lisp_Object src_object
;
7480 CHECK_NUMBER_COERCE_MARKER (start
);
7481 CHECK_NUMBER_COERCE_MARKER (end
);
7482 if (NILP (coding_system
))
7483 coding_system
= Qno_conversion
;
7485 CHECK_CODING_SYSTEM (coding_system
);
7486 src_object
= Fcurrent_buffer ();
7487 if (NILP (dst_object
))
7488 dst_object
= src_object
;
7489 else if (! EQ (dst_object
, Qt
))
7490 CHECK_BUFFER (dst_object
);
7492 validate_region (&start
, &end
);
7493 from
= XFASTINT (start
);
7494 from_byte
= CHAR_TO_BYTE (from
);
7495 to
= XFASTINT (end
);
7496 to_byte
= CHAR_TO_BYTE (to
);
7498 setup_coding_system (coding_system
, &coding
);
7499 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7502 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7505 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7508 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7510 return (BUFFERP (dst_object
)
7511 ? make_number (coding
.produced_char
)
7512 : coding
.dst_object
);
7516 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7517 3, 4, "r\nzCoding system: ",
7518 doc
: /* Decode the current region from the specified coding system.
7519 When called from a program, takes four arguments:
7520 START, END, CODING-SYSTEM, and DESTINATION.
7521 START and END are buffer positions.
7523 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7524 If nil, the region between START and END is replace by the decoded text.
7525 If buffer, the decoded text is inserted in the buffer.
7526 If t, the decoded text is returned.
7528 This function sets `last-coding-system-used' to the precise coding system
7529 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7530 not fully specified.)
7531 It returns the length of the decoded text. */)
7532 (start
, end
, coding_system
, destination
)
7533 Lisp_Object start
, end
, coding_system
, destination
;
7535 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7538 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7539 3, 4, "r\nzCoding system: ",
7540 doc
: /* Encode the current region by specified coding system.
7541 When called from a program, takes three arguments:
7542 START, END, and CODING-SYSTEM. START and END are buffer positions.
7544 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7545 If nil, the region between START and END is replace by the encoded text.
7546 If buffer, the encoded text is inserted in the buffer.
7547 If t, the encoded text is returned.
7549 This function sets `last-coding-system-used' to the precise coding system
7550 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7551 not fully specified.)
7552 It returns the length of the encoded text. */)
7553 (start
, end
, coding_system
, destination
)
7554 Lisp_Object start
, end
, coding_system
, destination
;
7556 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7560 code_convert_string (string
, coding_system
, dst_object
,
7561 encodep
, nocopy
, norecord
)
7562 Lisp_Object string
, coding_system
, dst_object
;
7563 int encodep
, nocopy
, norecord
;
7565 struct coding_system coding
;
7566 EMACS_INT chars
, bytes
;
7568 CHECK_STRING (string
);
7569 if (NILP (coding_system
))
7572 Vlast_coding_system_used
= Qno_conversion
;
7573 if (NILP (dst_object
))
7574 return (nocopy
? Fcopy_sequence (string
) : string
);
7577 if (NILP (coding_system
))
7578 coding_system
= Qno_conversion
;
7580 CHECK_CODING_SYSTEM (coding_system
);
7581 if (NILP (dst_object
))
7583 else if (! EQ (dst_object
, Qt
))
7584 CHECK_BUFFER (dst_object
);
7586 setup_coding_system (coding_system
, &coding
);
7587 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7588 chars
= SCHARS (string
);
7589 bytes
= SBYTES (string
);
7591 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7593 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7595 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7597 return (BUFFERP (dst_object
)
7598 ? make_number (coding
.produced_char
)
7599 : coding
.dst_object
);
7603 /* Encode or decode STRING according to CODING_SYSTEM.
7604 Do not set Vlast_coding_system_used.
7606 This function is called only from macros DECODE_FILE and
7607 ENCODE_FILE, thus we ignore character composition. */
7610 code_convert_string_norecord (string
, coding_system
, encodep
)
7611 Lisp_Object string
, coding_system
;
7614 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7618 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7620 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7622 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7623 if the decoding operation is trivial.
7625 Optional fourth arg BUFFER non-nil meant that the decoded text is
7626 inserted in BUFFER instead of returned as a string. In this case,
7627 the return value is BUFFER.
7629 This function sets `last-coding-system-used' to the precise coding system
7630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7631 not fully specified. */)
7632 (string
, coding_system
, nocopy
, buffer
)
7633 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7635 return code_convert_string (string
, coding_system
, buffer
,
7636 0, ! NILP (nocopy
), 0);
7639 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7641 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7643 Optional third arg NOCOPY non-nil means it is OK to return STRING
7644 itself if the encoding operation is trivial.
7646 Optional fourth arg BUFFER non-nil meant that the encoded text is
7647 inserted in BUFFER instead of returned as a string. In this case,
7648 the return value is BUFFER.
7650 This function sets `last-coding-system-used' to the precise coding system
7651 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7652 not fully specified.) */)
7653 (string
, coding_system
, nocopy
, buffer
)
7654 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7656 return code_convert_string (string
, coding_system
, buffer
,
7657 1, ! NILP (nocopy
), 1);
7661 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7662 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7663 Return the corresponding character. */)
7667 Lisp_Object spec
, attrs
, val
;
7668 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7671 CHECK_NATNUM (code
);
7672 c
= XFASTINT (code
);
7673 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7674 attrs
= AREF (spec
, 0);
7676 if (ASCII_BYTE_P (c
)
7677 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7680 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7681 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7682 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7683 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7686 charset
= charset_roman
;
7687 else if (c
>= 0xA0 && c
< 0xDF)
7689 charset
= charset_kana
;
7694 int s1
= c
>> 8, s2
= c
& 0xFF;
7696 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7697 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7698 error ("Invalid code: %d", code
);
7700 charset
= charset_kanji
;
7702 c
= DECODE_CHAR (charset
, c
);
7704 error ("Invalid code: %d", code
);
7705 return make_number (c
);
7709 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7710 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7711 Return the corresponding code in SJIS. */)
7715 Lisp_Object spec
, attrs
, charset_list
;
7717 struct charset
*charset
;
7720 CHECK_CHARACTER (ch
);
7722 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7723 attrs
= AREF (spec
, 0);
7725 if (ASCII_CHAR_P (c
)
7726 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7729 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7730 charset
= char_charset (c
, charset_list
, &code
);
7731 if (code
== CHARSET_INVALID_CODE (charset
))
7732 error ("Can't encode by shift_jis encoding: %d", c
);
7735 return make_number (code
);
7738 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7739 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7740 Return the corresponding character. */)
7744 Lisp_Object spec
, attrs
, val
;
7745 struct charset
*charset_roman
, *charset_big5
, *charset
;
7748 CHECK_NATNUM (code
);
7749 c
= XFASTINT (code
);
7750 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7751 attrs
= AREF (spec
, 0);
7753 if (ASCII_BYTE_P (c
)
7754 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7757 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7758 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7759 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7762 charset
= charset_roman
;
7765 int b1
= c
>> 8, b2
= c
& 0x7F;
7766 if (b1
< 0xA1 || b1
> 0xFE
7767 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7768 error ("Invalid code: %d", code
);
7769 charset
= charset_big5
;
7771 c
= DECODE_CHAR (charset
, (unsigned )c
);
7773 error ("Invalid code: %d", code
);
7774 return make_number (c
);
7777 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7778 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7779 Return the corresponding character code in Big5. */)
7783 Lisp_Object spec
, attrs
, charset_list
;
7784 struct charset
*charset
;
7788 CHECK_CHARACTER (ch
);
7790 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7791 attrs
= AREF (spec
, 0);
7792 if (ASCII_CHAR_P (c
)
7793 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7796 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7797 charset
= char_charset (c
, charset_list
, &code
);
7798 if (code
== CHARSET_INVALID_CODE (charset
))
7799 error ("Can't encode by Big5 encoding: %d", c
);
7801 return make_number (code
);
7805 DEFUN ("set-terminal-coding-system-internal",
7806 Fset_terminal_coding_system_internal
,
7807 Sset_terminal_coding_system_internal
, 1, 1, 0,
7808 doc
: /* Internal use only. */)
7810 Lisp_Object coding_system
;
7812 CHECK_SYMBOL (coding_system
);
7813 setup_coding_system (Fcheck_coding_system (coding_system
),
7816 /* We had better not send unsafe characters to terminal. */
7817 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7818 /* Characer composition should be disabled. */
7819 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7820 terminal_coding
.src_multibyte
= 1;
7821 terminal_coding
.dst_multibyte
= 0;
7825 DEFUN ("set-safe-terminal-coding-system-internal",
7826 Fset_safe_terminal_coding_system_internal
,
7827 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7828 doc
: /* Internal use only. */)
7830 Lisp_Object coding_system
;
7832 CHECK_SYMBOL (coding_system
);
7833 setup_coding_system (Fcheck_coding_system (coding_system
),
7834 &safe_terminal_coding
);
7835 /* Characer composition should be disabled. */
7836 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7837 safe_terminal_coding
.src_multibyte
= 1;
7838 safe_terminal_coding
.dst_multibyte
= 0;
7842 DEFUN ("terminal-coding-system",
7843 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7844 doc
: /* Return coding system specified for terminal output. */)
7847 return CODING_ID_NAME (terminal_coding
.id
);
7850 DEFUN ("set-keyboard-coding-system-internal",
7851 Fset_keyboard_coding_system_internal
,
7852 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7853 doc
: /* Internal use only. */)
7855 Lisp_Object coding_system
;
7857 CHECK_SYMBOL (coding_system
);
7858 setup_coding_system (Fcheck_coding_system (coding_system
),
7860 /* Characer composition should be disabled. */
7861 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7865 DEFUN ("keyboard-coding-system",
7866 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7867 doc
: /* Return coding system specified for decoding keyboard input. */)
7870 return CODING_ID_NAME (keyboard_coding
.id
);
7874 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7875 Sfind_operation_coding_system
, 1, MANY
, 0,
7876 doc
: /* Choose a coding system for an operation based on the target name.
7877 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7878 DECODING-SYSTEM is the coding system to use for decoding
7879 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7880 for encoding (in case OPERATION does encoding).
7882 The first argument OPERATION specifies an I/O primitive:
7883 For file I/O, `insert-file-contents' or `write-region'.
7884 For process I/O, `call-process', `call-process-region', or `start-process'.
7885 For network I/O, `open-network-stream'.
7887 The remaining arguments should be the same arguments that were passed
7888 to the primitive. Depending on which primitive, one of those arguments
7889 is selected as the TARGET. For example, if OPERATION does file I/O,
7890 whichever argument specifies the file name is TARGET.
7892 TARGET has a meaning which depends on OPERATION:
7893 For file I/O, TARGET is a file name.
7894 For process I/O, TARGET is a process name.
7895 For network I/O, TARGET is a service name or a port number
7897 This function looks up what specified for TARGET in,
7898 `file-coding-system-alist', `process-coding-system-alist',
7899 or `network-coding-system-alist' depending on OPERATION.
7900 They may specify a coding system, a cons of coding systems,
7901 or a function symbol to call.
7902 In the last case, we call the function with one argument,
7903 which is a list of all the arguments given to this function.
7905 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7910 Lisp_Object operation
, target_idx
, target
, val
;
7911 register Lisp_Object chain
;
7914 error ("Too few arguments");
7915 operation
= args
[0];
7916 if (!SYMBOLP (operation
)
7917 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7918 error ("Invalid first arguement");
7919 if (nargs
< 1 + XINT (target_idx
))
7920 error ("Too few arguments for operation: %s",
7921 SDATA (SYMBOL_NAME (operation
)));
7922 target
= args
[XINT (target_idx
) + 1];
7923 if (!(STRINGP (target
)
7924 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7925 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7927 chain
= ((EQ (operation
, Qinsert_file_contents
)
7928 || EQ (operation
, Qwrite_region
))
7929 ? Vfile_coding_system_alist
7930 : (EQ (operation
, Qopen_network_stream
)
7931 ? Vnetwork_coding_system_alist
7932 : Vprocess_coding_system_alist
));
7936 for (; CONSP (chain
); chain
= XCDR (chain
))
7942 && ((STRINGP (target
)
7943 && STRINGP (XCAR (elt
))
7944 && fast_string_match (XCAR (elt
), target
) >= 0)
7945 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7948 /* Here, if VAL is both a valid coding system and a valid
7949 function symbol, we return VAL as a coding system. */
7952 if (! SYMBOLP (val
))
7954 if (! NILP (Fcoding_system_p (val
)))
7955 return Fcons (val
, val
);
7956 if (! NILP (Ffboundp (val
)))
7958 val
= call1 (val
, Flist (nargs
, args
));
7961 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7962 return Fcons (val
, val
);
7970 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7971 Sset_coding_system_priority
, 0, MANY
, 0,
7972 doc
: /* Assign higher priority to the coding systems given as arguments.
7973 If multiple coding systems belongs to the same category,
7974 all but the first one are ignored.
7976 usage: (set-coding-system-priority ...) */)
7982 int changed
[coding_category_max
];
7983 enum coding_category priorities
[coding_category_max
];
7985 bzero (changed
, sizeof changed
);
7987 for (i
= j
= 0; i
< nargs
; i
++)
7989 enum coding_category category
;
7990 Lisp_Object spec
, attrs
;
7992 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7993 attrs
= AREF (spec
, 0);
7994 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7995 if (changed
[category
])
7996 /* Ignore this coding system because a coding system of the
7997 same category already had a higher priority. */
7999 changed
[category
] = 1;
8000 priorities
[j
++] = category
;
8001 if (coding_categories
[category
].id
>= 0
8002 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
8003 setup_coding_system (args
[i
], &coding_categories
[category
]);
8004 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
8007 /* Now we have decided top J priorities. Reflect the order of the
8008 original priorities to the remaining priorities. */
8010 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
8012 while (j
< coding_category_max
8013 && changed
[coding_priorities
[j
]])
8015 if (j
== coding_category_max
)
8017 priorities
[i
] = coding_priorities
[j
];
8020 bcopy (priorities
, coding_priorities
, sizeof priorities
);
8022 /* Update `coding-category-list'. */
8023 Vcoding_category_list
= Qnil
;
8024 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8025 Vcoding_category_list
8026 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
8027 Vcoding_category_list
);
8032 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
8033 Scoding_system_priority_list
, 0, 1, 0,
8034 doc
: /* Return a list of coding systems ordered by their priorities.
8035 HIGHESTP non-nil means just return the highest priority one. */)
8037 Lisp_Object highestp
;
8042 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
8044 enum coding_category category
= coding_priorities
[i
];
8045 int id
= coding_categories
[category
].id
;
8050 attrs
= CODING_ID_ATTRS (id
);
8051 if (! NILP (highestp
))
8052 return CODING_ATTR_BASE_NAME (attrs
);
8053 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
8055 return Fnreverse (val
);
8058 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
8061 make_subsidiaries (base
)
8064 Lisp_Object subsidiaries
;
8065 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
8066 char *buf
= (char *) alloca (base_name_len
+ 6);
8069 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
8070 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
8071 for (i
= 0; i
< 3; i
++)
8073 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
8074 ASET (subsidiaries
, i
, intern (buf
));
8076 return subsidiaries
;
/* Define a coding system from ARGS.  The code below builds the
   attribute vector ATTRS from the arguments, computes the detection
   category from the coding type, and registers the name in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  When the eol-type argument is nil, three
   subsidiary coding systems made by make_subsidiaries are registered
   as well.  The statement at the end signals
   `wrong-number-of-arguments'.
   NOTE(review): several lines of this definition (braces, `else'
   keywords, jump targets) are not visible in this listing.  */
8080 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
8081 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
8082 doc
: /* For internal use only.
8083 usage: (define-coding-system-internal ...) */)
8089 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
8090 Lisp_Object attrs
; /* Vector of attributes. */
8091 Lisp_Object eol_type
;
8092 Lisp_Object aliases
;
8093 Lisp_Object coding_type
, charset_list
, safe_charsets
;
8094 enum coding_category category
;
8095 Lisp_Object tail
, val
;
8096 int max_charset_id
= 0;
/* Too few arguments; presumably this leads to the
   wrong-number-of-arguments signal at the end (the statement
   controlled by this test is not visible here).  */
8099 if (nargs
< coding_arg_max
)
/* ATTRS collects every attribute of the new coding system.  */
8102 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
8104 name
= args
[coding_arg_name
];
8105 CHECK_SYMBOL (name
);
8106 CODING_ATTR_BASE_NAME (attrs
) = name
;
/* The mnemonic is either a string or a character.  */
8108 val
= args
[coding_arg_mnemonic
];
8109 if (! STRINGP (val
))
8110 CHECK_CHARACTER (val
);
8111 CODING_ATTR_MNEMONIC (attrs
) = val
;
8113 coding_type
= args
[coding_arg_coding_type
];
8114 CHECK_SYMBOL (coding_type
);
8115 CODING_ATTR_TYPE (attrs
) = coding_type
;
/* The charset list is either a symbol standing for a predefined list
   or an explicit list of charsets.  In both cases MAX_CHARSET_ID
   tracks the largest charset ID seen.  */
8117 charset_list
= args
[coding_arg_charset_list
];
8118 if (SYMBOLP (charset_list
))
8120 if (EQ (charset_list
, Qiso_2022
))
8122 if (! EQ (coding_type
, Qiso_2022
))
8123 error ("Invalid charset-list");
8124 charset_list
= Viso_2022_charset_list
;
8126 else if (EQ (charset_list
, Qemacs_mule
))
8128 if (! EQ (coding_type
, Qemacs_mule
))
8129 error ("Invalid charset-list");
8130 charset_list
= Vemacs_mule_charset_list
;
8132 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8133 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8134 max_charset_id
= XFASTINT (XCAR (tail
));
/* Work on a copy of the list; each element is replaced by the
   charset's ID.  */
8138 charset_list
= Fcopy_sequence (charset_list
);
8139 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8141 struct charset
*charset
;
8144 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8145 if (EQ (coding_type
, Qiso_2022
)
8146 ? CHARSET_ISO_FINAL (charset
) < 0
8147 : EQ (coding_type
, Qemacs_mule
)
8148 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8150 error ("Can't handle charset `%s'",
8151 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8153 XSETCAR (tail
, make_number (charset
->id
));
8154 if (max_charset_id
< charset
->id
)
8155 max_charset_id
= charset
->id
;
8158 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
/* SAFE_CHARSETS is a string with one element per charset ID up to
   MAX_CHARSET_ID; the elements of the listed charsets are set to 0.  */
8160 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8162 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8163 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8164 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8166 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8168 val
= args
[coding_arg_decode_translation_table
];
8169 if (! CHAR_TABLE_P (val
))
8171 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8173 val
= args
[coding_arg_encode_translation_table
];
8174 if (! CHAR_TABLE_P (val
))
8176 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8178 val
= args
[coding_arg_post_read_conversion
];
8180 CODING_ATTR_POST_READ (attrs
) = val
;
8182 val
= args
[coding_arg_pre_write_conversion
];
8184 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8186 val
= args
[coding_arg_default_char
];
8188 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8191 CHECK_CHARACTER (val
);
8192 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
8195 val
= args
[coding_arg_for_unibyte
];
8196 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8198 val
= args
[coding_arg_plist
];
8200 CODING_ATTR_PLIST (attrs
) = val
;
/* From here on, the attributes that depend on the coding type are set
   and CATEGORY is decided.  */
8202 if (EQ (coding_type
, Qcharset
))
8204 /* Generate a lisp vector of 256 elements. Each element is nil,
8205 integer, or a list of charset IDs.
8207 If Nth element is nil, the byte code N is invalid in this
coding system.
8210 If Nth element is a number NUM, N is the first byte of a
8211 charset whose ID is NUM.
8213 If Nth element is a list of charset IDs, N is the first byte
8214 of one of them. The list is sorted by dimensions of the
8215 charsets. A charset of smaller dimension comes first. */
8216 val
= Fmake_vector (make_number (256), Qnil
);
8218 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8220 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8221 int dim
= CHARSET_DIMENSION (charset
);
8222 int idx
= (dim
- 1) * 4;
8224 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8225 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8227 for (i
= charset
->code_space
[idx
];
8228 i
<= charset
->code_space
[idx
+ 1]; i
++)
8230 Lisp_Object tmp
, tmp2
;
8233 tmp
= AREF (val
, i
);
8236 else if (NUMBERP (tmp
))
8238 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8240 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8242 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8246 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8248 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8253 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8256 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8257 XSETCAR (tmp2
, XCAR (tail
));
8263 ASET (attrs
, coding_attr_charset_valids
, val
);
8264 category
= coding_category_charset
;
/* CCL: copies of the decoder and the encoder programs are stored,
   and a 256-element string VALIDS marks the listed byte values (or
   byte ranges) with 1.  */
8266 else if (EQ (coding_type
, Qccl
))
8270 if (nargs
< coding_arg_ccl_max
)
8273 val
= args
[coding_arg_ccl_decoder
];
8274 CHECK_CCL_PROGRAM (val
);
8276 val
= Fcopy_sequence (val
);
8277 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8279 val
= args
[coding_arg_ccl_encoder
];
8280 CHECK_CCL_PROGRAM (val
);
8282 val
= Fcopy_sequence (val
);
8283 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8285 val
= args
[coding_arg_ccl_valids
];
8286 valids
= Fmake_string (make_number (256), make_number (0));
8287 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8294 from
= to
= XINT (val
);
8295 if (from
< 0 || from
> 255)
8296 args_out_of_range_3 (val
, make_number (0), make_number (255));
8301 CHECK_NATNUM_CAR (val
);
8302 CHECK_NATNUM_CDR (val
);
8303 from
= XINT (XCAR (val
));
8305 args_out_of_range_3 (XCAR (val
),
8306 make_number (0), make_number (255));
8307 to
= XINT (XCDR (val
));
8308 if (to
< from
|| to
> 255)
8309 args_out_of_range_3 (XCDR (val
),
8310 XCAR (val
), make_number (255));
8312 for (i
= from
; i
<= to
; i
++)
8313 SSET (valids
, i
, 1);
8315 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8317 category
= coding_category_ccl
;
/* UTF-16: the BOM and endian arguments are stored, and they select
   one of the utf-16 categories.  */
8319 else if (EQ (coding_type
, Qutf_16
))
8321 Lisp_Object bom
, endian
;
8323 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8325 if (nargs
< coding_arg_utf16_max
)
8328 bom
= args
[coding_arg_utf16_bom
];
8329 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8333 CHECK_CODING_SYSTEM (val
);
8335 CHECK_CODING_SYSTEM (val
);
8337 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8339 endian
= args
[coding_arg_utf16_endian
];
8340 CHECK_SYMBOL (endian
);
8343 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8344 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8345 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8347 category
= (CONSP (bom
)
8348 ? coding_category_utf_16_auto
8350 ? (EQ (endian
, Qbig
)
8351 ? coding_category_utf_16_be_nosig
8352 : coding_category_utf_16_le_nosig
)
8353 : (EQ (endian
, Qbig
)
8354 ? coding_category_utf_16_be
8355 : coding_category_utf_16_le
));
/* ISO-2022: the initial designations, register usage, designation
   requests and flags are validated and stored; the flags then select
   one of the iso-7 / iso-8 categories.  */
8357 else if (EQ (coding_type
, Qiso_2022
))
8359 Lisp_Object initial
, reg_usage
, request
, flags
;
8362 if (nargs
< coding_arg_iso2022_max
)
8365 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8366 CHECK_VECTOR (initial
);
8367 for (i
= 0; i
< 4; i
++)
8369 val
= Faref (initial
, make_number (i
));
8372 struct charset
*charset
;
8374 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8375 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8376 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8377 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8380 ASET (initial
, i
, make_number (-1));
8383 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8384 CHECK_CONS (reg_usage
);
8385 CHECK_NUMBER_CAR (reg_usage
);
8386 CHECK_NUMBER_CDR (reg_usage
);
8388 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8389 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8397 CHECK_CHARSET_GET_ID (tmp
, id
);
8398 CHECK_NATNUM_CDR (val
);
8399 if (XINT (XCDR (val
)) >= 4)
8400 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8401 XSETCAR (val
, make_number (id
));
8404 flags
= args
[coding_arg_iso2022_flags
];
8405 CHECK_NATNUM (flags
);
8407 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8408 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8410 ASET (attrs
, coding_attr_iso_initial
, initial
);
8411 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8412 ASET (attrs
, coding_attr_iso_request
, request
);
8413 ASET (attrs
, coding_attr_iso_flags
, flags
);
8414 setup_iso_safe_charsets (attrs
);
8416 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8417 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8418 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8419 ? coding_category_iso_7_else
8420 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8421 ? coding_category_iso_7
8422 : coding_category_iso_7_tight
);
8425 int id
= XINT (AREF (initial
, 1));
8427 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8428 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8430 ? coding_category_iso_8_else
8431 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8432 ? coding_category_iso_8_1
8433 : coding_category_iso_8_2
);
8435 if (category
!= coding_category_iso_8_1
8436 && category
!= coding_category_iso_8_2
)
8437 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8439 else if (EQ (coding_type
, Qemacs_mule
))
8441 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8442 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8443 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8444 category
= coding_category_emacs_mule
;
/* Shift-JIS: the charset list must hold three or four charsets; the
   dimensions checked below are one, one, two and two.  The name is
   also recorded in Vsjis_coding_system.  */
8446 else if (EQ (coding_type
, Qshift_jis
))
8449 struct charset
*charset
;
8451 if (XINT (Flength (charset_list
)) != 3
8452 && XINT (Flength (charset_list
)) != 4)
8453 error ("There should be three or four charsets");
8455 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8456 if (CHARSET_DIMENSION (charset
) != 1)
8457 error ("Dimension of charset %s is not one",
8458 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8459 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8460 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8462 charset_list
= XCDR (charset_list
);
8463 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8464 if (CHARSET_DIMENSION (charset
) != 1)
8465 error ("Dimension of charset %s is not one",
8466 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8468 charset_list
= XCDR (charset_list
);
8469 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8470 if (CHARSET_DIMENSION (charset
) != 2)
8471 error ("Dimension of charset %s is not two",
8472 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8474 charset_list
= XCDR (charset_list
);
8475 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8476 if (CHARSET_DIMENSION (charset
) != 2)
8477 error ("Dimension of charset %s is not two",
8478 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8480 category
= coding_category_sjis
;
8481 Vsjis_coding_system
= name
;
/* Big5: exactly two charsets, of dimension one and two.  The name is
   also recorded in Vbig5_coding_system.  */
8483 else if (EQ (coding_type
, Qbig5
))
8485 struct charset
*charset
;
8487 if (XINT (Flength (charset_list
)) != 2)
8488 error ("There should be just two charsets");
8490 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8491 if (CHARSET_DIMENSION (charset
) != 1)
8492 error ("Dimension of charset %s is not one",
8493 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8494 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8495 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8497 charset_list
= XCDR (charset_list
);
8498 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8499 if (CHARSET_DIMENSION (charset
) != 2)
8500 error ("Dimension of charset %s is not two",
8501 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8503 category
= coding_category_big5
;
8504 Vbig5_coding_system
= name
;
8506 else if (EQ (coding_type
, Qraw_text
))
8508 category
= coding_category_raw_text
;
8509 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8511 else if (EQ (coding_type
, Qutf_8
))
8513 category
= coding_category_utf_8
;
8514 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8516 else if (EQ (coding_type
, Qundecided
))
8517 category
= coding_category_undecided
;
8519 error ("Invalid coding system type: %s",
8520 SDATA (SYMBOL_NAME (coding_type
)));
/* Record the category as an attribute, and put the corresponding
   element of Vcoding_category_table on the plist under QCcategory.  */
8522 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8523 CODING_ATTR_PLIST (attrs
)
8524 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8525 CODING_ATTR_PLIST (attrs
)));
/* The eol-type must be nil, unix, dos, or mac.  */
8527 eol_type
= args
[coding_arg_eol_type
];
8528 if (! NILP (eol_type
)
8529 && ! EQ (eol_type
, Qunix
)
8530 && ! EQ (eol_type
, Qdos
)
8531 && ! EQ (eol_type
, Qmac
))
8532 error ("Invalid eol-type");
8534 aliases
= Fcons (name
, Qnil
);
/* With a nil eol-type, EOL_TYPE becomes the vector of subsidiary
   names, and each subsidiary is registered with the same ATTRS and
   its own eol-type (unix, dos, mac, in vector order).  */
8536 if (NILP (eol_type
))
8538 eol_type
= make_subsidiaries (name
);
8539 for (i
= 0; i
< 3; i
++)
8541 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8543 this_name
= AREF (eol_type
, i
);
8544 this_aliases
= Fcons (this_name
, Qnil
);
8545 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8546 this_spec
= Fmake_vector (make_number (3), attrs
);
8547 ASET (this_spec
, 1, this_aliases
);
8548 ASET (this_spec
, 2, this_eol_type
);
8549 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8550 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8551 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8552 Vcoding_system_alist
);
/* Register NAME itself; its spec vector is [ATTRS ALIASES EOL_TYPE].  */
8556 spec_vec
= Fmake_vector (make_number (3), attrs
);
8557 ASET (spec_vec
, 1, aliases
);
8558 ASET (spec_vec
, 2, eol_type
);
8560 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8561 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8562 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8563 Vcoding_system_alist
);
/* Set up the category's coding structure from NAME when the category
   has no coding system yet (negative ID) or is already represented
   by NAME.  */
8566 int id
= coding_categories
[category
].id
;
8568 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8569 setup_coding_system (name
, &coding_categories
[category
]);
/* Signal `wrong-number-of-arguments' with the function name and
   NARGS.  Presumably reached from the argument-count tests above;
   the jumps themselves are not visible here.  */
8575 return Fsignal (Qwrong_number_of_arguments
,
8576 Fcons (intern ("define-coding-system-internal"),
8577 make_number (nargs
)));
8581 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8582 Sdefine_coding_system_alias
, 2, 2, 0,
8583 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8584 (alias
, coding_system
)
8585 Lisp_Object alias
, coding_system
;
8587 Lisp_Object spec
, aliases
, eol_type
;
8589 CHECK_SYMBOL (alias
);
8590 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8591 aliases
= AREF (spec
, 1);
8592 /* ALISES should be a list of length more than zero, and the first
8593 element is a base coding system. Append ALIAS at the tail of the
8595 while (!NILP (XCDR (aliases
)))
8596 aliases
= XCDR (aliases
);
8597 XSETCDR (aliases
, Fcons (alias
, Qnil
));
8599 eol_type
= AREF (spec
, 2);
8600 if (VECTORP (eol_type
))
8602 Lisp_Object subsidiaries
;
8605 subsidiaries
= make_subsidiaries (alias
);
8606 for (i
= 0; i
< 3; i
++)
8607 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8608 AREF (eol_type
, i
));
8611 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8612 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
8613 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8614 Vcoding_system_alist
);
8619 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8621 doc
: /* Return the base of CODING-SYSTEM.
8622 Any alias or subsidiary coding system is not a base coding system. */)
8624 Lisp_Object coding_system
;
8626 Lisp_Object spec
, attrs
;
8628 if (NILP (coding_system
))
8629 return (Qno_conversion
);
8630 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8631 attrs
= AREF (spec
, 0);
8632 return CODING_ATTR_BASE_NAME (attrs
);
8635 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8637 doc
: "Return the property list of CODING-SYSTEM.")
8639 Lisp_Object coding_system
;
8641 Lisp_Object spec
, attrs
;
8643 if (NILP (coding_system
))
8644 coding_system
= Qno_conversion
;
8645 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8646 attrs
= AREF (spec
, 0);
8647 return CODING_ATTR_PLIST (attrs
);
8651 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8653 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8655 Lisp_Object coding_system
;
8659 if (NILP (coding_system
))
8660 coding_system
= Qno_conversion
;
8661 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8662 return AREF (spec
, 1);
8665 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8666 Scoding_system_eol_type
, 1, 1, 0,
8667 doc
: /* Return eol-type of CODING-SYSTEM.
8668 An eol-type is an integer 0, 1, or 2, or a vector of coding systems.
8670 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8671 and CR respectively.
8673 A vector value indicates that a format of end-of-line should be
8674 detected automatically. Nth element of the vector is the subsidiary
8675 coding system whose eol-type is N. */)
8677 Lisp_Object coding_system
;
8679 Lisp_Object spec
, eol_type
;
8682 if (NILP (coding_system
))
8683 coding_system
= Qno_conversion
;
8684 if (! CODING_SYSTEM_P (coding_system
))
8686 spec
= CODING_SYSTEM_SPEC (coding_system
);
8687 eol_type
= AREF (spec
, 2);
8688 if (VECTORP (eol_type
))
8689 return Fcopy_sequence (eol_type
);
8690 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8691 return make_number (n
);
8697 /*** 9. Post-amble ***/
8704 for (i
= 0; i
< coding_category_max
; i
++)
8706 coding_categories
[i
].id
= -1;
8707 coding_priorities
[i
] = i
;
8710 /* ISO2022 specific initialize routine. */
8711 for (i
= 0; i
< 0x20; i
++)
8712 iso_code_class
[i
] = ISO_control_0
;
8713 for (i
= 0x21; i
< 0x7F; i
++)
8714 iso_code_class
[i
] = ISO_graphic_plane_0
;
8715 for (i
= 0x80; i
< 0xA0; i
++)
8716 iso_code_class
[i
] = ISO_control_1
;
8717 for (i
= 0xA1; i
< 0xFF; i
++)
8718 iso_code_class
[i
] = ISO_graphic_plane_1
;
8719 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8720 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8721 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8722 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8723 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8724 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8725 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8726 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8727 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8729 for (i
= 0; i
< 256; i
++)
8731 emacs_mule_bytes
[i
] = 1;
8733 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8734 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8735 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8736 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8744 staticpro (&Vcoding_system_hash_table
);
8746 Lisp_Object args
[2];
8749 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
8752 staticpro (&Vsjis_coding_system
);
8753 Vsjis_coding_system
= Qnil
;
8755 staticpro (&Vbig5_coding_system
);
8756 Vbig5_coding_system
= Qnil
;
8758 staticpro (&Vcode_conversion_reused_workbuf
);
8759 Vcode_conversion_reused_workbuf
= Qnil
;
8761 staticpro (&Vcode_conversion_workbuf_name
);
8762 Vcode_conversion_workbuf_name
= build_string (" *code-conversion-work*");
8764 reused_workbuf_in_use
= 0;
8766 DEFSYM (Qcharset
, "charset");
8767 DEFSYM (Qtarget_idx
, "target-idx");
8768 DEFSYM (Qcoding_system_history
, "coding-system-history");
8769 Fset (Qcoding_system_history
, Qnil
);
8771 /* Target FILENAME is the first argument. */
8772 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8773 /* Target FILENAME is the third argument. */
8774 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8776 DEFSYM (Qcall_process
, "call-process");
8777 /* Target PROGRAM is the first argument. */
8778 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8780 DEFSYM (Qcall_process_region
, "call-process-region");
8781 /* Target PROGRAM is the third argument. */
8782 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8784 DEFSYM (Qstart_process
, "start-process");
8785 /* Target PROGRAM is the third argument. */
8786 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8788 DEFSYM (Qopen_network_stream
, "open-network-stream");
8789 /* Target SERVICE is the fourth argument. */
8790 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8792 DEFSYM (Qcoding_system
, "coding-system");
8793 DEFSYM (Qcoding_aliases
, "coding-aliases");
8795 DEFSYM (Qeol_type
, "eol-type");
8796 DEFSYM (Qunix
, "unix");
8797 DEFSYM (Qdos
, "dos");
8799 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8800 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8801 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8802 DEFSYM (Qdefault_char
, "default-char");
8803 DEFSYM (Qundecided
, "undecided");
8804 DEFSYM (Qno_conversion
, "no-conversion");
8805 DEFSYM (Qraw_text
, "raw-text");
8807 DEFSYM (Qiso_2022
, "iso-2022");
8809 DEFSYM (Qutf_8
, "utf-8");
8810 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
8812 DEFSYM (Qutf_16
, "utf-16");
8813 DEFSYM (Qbig
, "big");
8814 DEFSYM (Qlittle
, "little");
8816 DEFSYM (Qshift_jis
, "shift-jis");
8817 DEFSYM (Qbig5
, "big5");
8819 DEFSYM (Qcoding_system_p
, "coding-system-p");
8821 DEFSYM (Qcoding_system_error
, "coding-system-error");
8822 Fput (Qcoding_system_error
, Qerror_conditions
,
8823 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8824 Fput (Qcoding_system_error
, Qerror_message
,
8825 build_string ("Invalid coding system"));
8827 /* Intern this now in case it isn't already done.
8828 Setting this variable twice is harmless.
8829 But don't staticpro it here--that is done in alloc.c. */
8830 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8832 DEFSYM (Qtranslation_table
, "translation-table");
8833 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8834 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8835 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8836 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8838 DEFSYM (Qvalid_codes
, "valid-codes");
8840 DEFSYM (Qemacs_mule
, "emacs-mule");
8842 DEFSYM (QCcategory
, ":category");
8844 Vcoding_category_table
8845 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8846 staticpro (&Vcoding_category_table
);
8847 /* The following are targets of code detection. */
8848 ASET (Vcoding_category_table
, coding_category_iso_7
,
8849 intern ("coding-category-iso-7"));
8850 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8851 intern ("coding-category-iso-7-tight"));
8852 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8853 intern ("coding-category-iso-8-1"));
8854 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8855 intern ("coding-category-iso-8-2"));
8856 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8857 intern ("coding-category-iso-7-else"));
8858 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8859 intern ("coding-category-iso-8-else"));
8860 ASET (Vcoding_category_table
, coding_category_utf_8
,
8861 intern ("coding-category-utf-8"));
8862 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8863 intern ("coding-category-utf-16-be"));
8864 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8865 intern ("coding-category-utf-16-auto"));
8866 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8867 intern ("coding-category-utf-16-le"));
8868 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8869 intern ("coding-category-utf-16-be-nosig"));
8870 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8871 intern ("coding-category-utf-16-le-nosig"));
8872 ASET (Vcoding_category_table
, coding_category_charset
,
8873 intern ("coding-category-charset"));
8874 ASET (Vcoding_category_table
, coding_category_sjis
,
8875 intern ("coding-category-sjis"));
8876 ASET (Vcoding_category_table
, coding_category_big5
,
8877 intern ("coding-category-big5"));
8878 ASET (Vcoding_category_table
, coding_category_ccl
,
8879 intern ("coding-category-ccl"));
8880 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8881 intern ("coding-category-emacs-mule"));
8882 /* The following are NOT targets of code detection. */
8883 ASET (Vcoding_category_table
, coding_category_raw_text
,
8884 intern ("coding-category-raw-text"));
8885 ASET (Vcoding_category_table
, coding_category_undecided
,
8886 intern ("coding-category-undecided"));
8888 DEFSYM (Qinsufficient_source
, "insufficient-source");
8889 DEFSYM (Qinconsistent_eol
, "inconsistent-eol");
8890 DEFSYM (Qinvalid_source
, "invalid-source");
8891 DEFSYM (Qinterrupted
, "interrupted");
8892 DEFSYM (Qinsufficient_memory
, "insufficient-memory");
8894 defsubr (&Scoding_system_p
);
8895 defsubr (&Sread_coding_system
);
8896 defsubr (&Sread_non_nil_coding_system
);
8897 defsubr (&Scheck_coding_system
);
8898 defsubr (&Sdetect_coding_region
);
8899 defsubr (&Sdetect_coding_string
);
8900 defsubr (&Sfind_coding_systems_region_internal
);
8901 defsubr (&Sunencodable_char_position
);
8902 defsubr (&Scheck_coding_systems_region
);
8903 defsubr (&Sdecode_coding_region
);
8904 defsubr (&Sencode_coding_region
);
8905 defsubr (&Sdecode_coding_string
);
8906 defsubr (&Sencode_coding_string
);
8907 defsubr (&Sdecode_sjis_char
);
8908 defsubr (&Sencode_sjis_char
);
8909 defsubr (&Sdecode_big5_char
);
8910 defsubr (&Sencode_big5_char
);
8911 defsubr (&Sset_terminal_coding_system_internal
);
8912 defsubr (&Sset_safe_terminal_coding_system_internal
);
8913 defsubr (&Sterminal_coding_system
);
8914 defsubr (&Sset_keyboard_coding_system_internal
);
8915 defsubr (&Skeyboard_coding_system
);
8916 defsubr (&Sfind_operation_coding_system
);
8917 defsubr (&Sset_coding_system_priority
);
8918 defsubr (&Sdefine_coding_system_internal
);
8919 defsubr (&Sdefine_coding_system_alias
);
8920 defsubr (&Scoding_system_base
);
8921 defsubr (&Scoding_system_plist
);
8922 defsubr (&Scoding_system_aliases
);
8923 defsubr (&Scoding_system_eol_type
);
8924 defsubr (&Scoding_system_priority_list
);
8926 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8927 doc
: /* List of coding systems.
8929 Do not alter the value of this variable manually. This variable should be
8930 updated by the functions `define-coding-system' and
8931 `define-coding-system-alias'. */);
8932 Vcoding_system_list
= Qnil
;
8934 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8935 doc
: /* Alist of coding system names.
8936 Each element is a one-element list containing a coding system name.
8937 This variable is given to `completing-read' as TABLE argument.
8939 Do not alter the value of this variable manually. This variable should be
8940 updated by the functions `define-coding-system' and
8941 `define-coding-system-alias'. */);
8942 Vcoding_system_alist
= Qnil
;
8944 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8945 doc
: /* List of coding-categories (symbols) ordered by priority.
8947 On detecting a coding system, Emacs tries code detection algorithms
8948 associated with each coding-category one by one in this order. When
8949 one algorithm agrees with a byte sequence of source text, the coding
8950 system bound to the corresponding coding-category is selected. */);
8954 Vcoding_category_list
= Qnil
;
8955 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8956 Vcoding_category_list
8957 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8958 Vcoding_category_list
);
8961 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8962 doc
: /* Specify the coding system for read operations.
8963 It is useful to bind this variable with `let', but do not set it globally.
8964 If the value is a coding system, it is used for decoding on read operation.
8965 If not, an appropriate element is used from one of the coding system alists:
8966 There are three such tables, `file-coding-system-alist',
8967 `process-coding-system-alist', and `network-coding-system-alist'. */);
8968 Vcoding_system_for_read
= Qnil
;
8970 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8971 doc
: /* Specify the coding system for write operations.
8972 Programs bind this variable with `let', but you should not set it globally.
8973 If the value is a coding system, it is used for encoding of output,
8974 when writing it to a file and when sending it to a file or subprocess.
8976 If this does not specify a coding system, an appropriate element
8977 is used from one of the coding system alists:
8978 There are three such tables, `file-coding-system-alist',
8979 `process-coding-system-alist', and `network-coding-system-alist'.
8980 For output to files, if the above procedure does not specify a coding system,
8981 the value of `buffer-file-coding-system' is used. */);
8982 Vcoding_system_for_write
= Qnil
;
8984 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8986 Coding system used in the latest file or process I/O. */);
8987 Vlast_coding_system_used
= Qnil
;
8989 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error
,
8991 Error status of the last code conversion.
8993 When an error was detected in the last code conversion, this variable
8994 is set to one of the following symbols.
8995 `insufficient-source'
8999 `insufficient-memory'
9000 When no error was detected, the value doesn't change. So, to check
9001 the error status of a code conversion by this variable, you must
9002 explicitly set this variable to nil before performing code
9004 Vlast_code_conversion_error
= Qnil
;
9006 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
9008 *Non-nil means always inhibit code conversion of end-of-line format.
9009 See info node `Coding Systems' and info node `Text and Binary' concerning
9010 such conversion. */);
9011 inhibit_eol_conversion
= 0;
9013 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
9015 Non-nil means process buffer inherits coding system of process output.
9016 Bind it to t if the process output is to be treated as if it were a file
9017 read from some filesystem. */);
9018 inherit_process_coding_system
= 0;
9020 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
9022 Alist to decide a coding system to use for a file I/O operation.
9023 The format is ((PATTERN . VAL) ...),
9024 where PATTERN is a regular expression matching a file name,
9025 VAL is a coding system, a cons of coding systems, or a function symbol.
9026 If VAL is a coding system, it is used for both decoding and encoding
9028 If VAL is a cons of coding systems, the car part is used for decoding,
9029 and the cdr part is used for encoding.
9030 If VAL is a function symbol, the function must return a coding system
9031 or a cons of coding systems which are used as above. The function gets
9032 the arguments with which `find-operation-coding-system' was called.
9034 See also the function `find-operation-coding-system'
9035 and the variable `auto-coding-alist'. */);
9036 Vfile_coding_system_alist
= Qnil
;
9038 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
9040 Alist to decide a coding system to use for a process I/O operation.
9041 The format is ((PATTERN . VAL) ...),
9042 where PATTERN is a regular expression matching a program name,
9043 VAL is a coding system, a cons of coding systems, or a function symbol.
9044 If VAL is a coding system, it is used for both decoding what is received
9045 from the program and encoding what is sent to the program.
9046 If VAL is a cons of coding systems, the car part is used for decoding,
9047 and the cdr part is used for encoding.
9048 If VAL is a function symbol, the function must return a coding system
9049 or a cons of coding systems which are used as above.
9051 See also the function `find-operation-coding-system'. */);
9052 Vprocess_coding_system_alist
= Qnil
;
9054 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
9056 Alist to decide a coding system to use for a network I/O operation.
9057 The format is ((PATTERN . VAL) ...),
9058 where PATTERN is a regular expression matching a network service name
9059 or is a port number to connect to,
9060 VAL is a coding system, a cons of coding systems, or a function symbol.
9061 If VAL is a coding system, it is used for both decoding what is received
9062 from the network stream and encoding what is sent to the network stream.
9063 If VAL is a cons of coding systems, the car part is used for decoding,
9064 and the cdr part is used for encoding.
9065 If VAL is a function symbol, the function must return a coding system
9066 or a cons of coding systems which are used as above.
9068 See also the function `find-operation-coding-system'. */);
9069 Vnetwork_coding_system_alist
= Qnil
;
9071 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
9072 doc
: /* Coding system to use with system messages.
9073 Also used for decoding keyboard input on X Window system. */);
9074 Vlocale_coding_system
= Qnil
;
9076 /* The eol mnemonics are reset in startup.el system-dependently. */
9077 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
9079 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9080 eol_mnemonic_unix
= build_string (":");
9082 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
9084 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9085 eol_mnemonic_dos
= build_string ("\\");
9087 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
9089 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9090 eol_mnemonic_mac
= build_string ("/");
9092 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
9094 *String displayed in mode line when end-of-line format is not yet determined. */);
9095 eol_mnemonic_undecided
= build_string (":");
9097 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
9099 *Non-nil enables character translation while encoding and decoding. */);
9100 Venable_character_translation
= Qt
;
9102 DEFVAR_LISP ("standard-translation-table-for-decode",
9103 &Vstandard_translation_table_for_decode
,
9104 doc
: /* Table for translating characters while decoding. */);
9105 Vstandard_translation_table_for_decode
= Qnil
;
9107 DEFVAR_LISP ("standard-translation-table-for-encode",
9108 &Vstandard_translation_table_for_encode
,
9109 doc
: /* Table for translating characters while encoding. */);
9110 Vstandard_translation_table_for_encode
= Qnil
;
9112 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
9113 doc
: /* Alist of charsets vs revision numbers.
9114 While encoding, if a charset (car part of an element) is found,
9115 designate it with the escape sequence identifying revision (cdr part
9116 of the element). */);
9117 Vcharset_revision_table
= Qnil
;
9119 DEFVAR_LISP ("default-process-coding-system",
9120 &Vdefault_process_coding_system
,
9121 doc
: /* Cons of coding systems used for process I/O by default.
9122 The car part is used for decoding a process output,
9123 the cdr part is used for encoding a text to be sent to a process. */);
9124 Vdefault_process_coding_system
= Qnil
;
9126 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
9128 Table of extra Latin codes in the range 128..159 (inclusive).
9129 This is a vector of length 256.
9130 If Nth element is non-nil, the existence of code N in a file
9131 \(or output of subprocess) doesn't prevent it from being detected as
9132 a coding system of ISO 2022 variant which has a flag
9133 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9134 or reading output of a subprocess.
9135 Only the 128th through 159th elements have a meaning. */);
9136 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
9138 DEFVAR_LISP ("select-safe-coding-system-function",
9139 &Vselect_safe_coding_system_function
,
9141 Function to call to select safe coding system for encoding a text.
9143 If set, this function is called to force a user to select a proper
9144 coding system which can encode the text in the case that a default
9145 coding system used in each operation can't encode the text.
9147 The default value is `select-safe-coding-system' (which see). */);
9148 Vselect_safe_coding_system_function
= Qnil
;
9150 DEFVAR_BOOL ("coding-system-require-warning",
9151 &coding_system_require_warning
,
9152 doc
: /* Internal use only.
9153 If non-nil, on writing a file, `select-safe-coding-system-function' is
9154 called even if `coding-system-for-write' is non-nil. The command
9155 `universal-coding-system-argument' binds this variable to t temporarily. */);
9156 coding_system_require_warning
= 0;
9159 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9160 &inhibit_iso_escape_detection
,
9162 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9164 By default, on reading a file, Emacs tries to detect how the text is
9165 encoded. This code detection is sensitive to escape sequences. If
9166 the sequence is valid as ISO2022, the code is determined as one of
9167 the ISO2022 encodings, and the file is decoded by the corresponding
9168 coding system (e.g. `iso-2022-7bit').
9170 However, there may be a case that you want to read escape sequences in
9171 a file as is. In such a case, you can set this variable to non-nil.
9172 Then, as the code detection ignores any escape sequences, no file is
9173 detected as encoded in some ISO2022 encoding. The result is that all
9174 escape sequences become visible in a buffer.
9176 The default value is nil, and it is strongly recommended not to change
9177 it. That is because many Emacs Lisp source files that contain
9178 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9179 in Emacs's distribution, and they won't be decoded correctly on
9180 reading if you suppress escape sequence detection.
9182 The other way to read escape sequences in a file without decoding is
9183 to explicitly specify some coding system that doesn't use ISO2022's
9184 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
9185 inhibit_iso_escape_detection
= 0;
9187 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9188 doc
: /* Char table for translating self-inserting characters.
9189 This is applied to the result of input methods, not their input. See also
9190 `keyboard-translate-table'. */);
9191 Vtranslation_table_for_input
= Qnil
;
9194 Lisp_Object args
[coding_arg_max
];
9195 Lisp_Object plist
[16];
9198 for (i
= 0; i
< coding_arg_max
; i
++)
9201 plist
[0] = intern (":name");
9202 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9203 plist
[2] = intern (":mnemonic");
9204 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9205 plist
[4] = intern (":coding-type");
9206 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9207 plist
[6] = intern (":ascii-compatible-p");
9208 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9209 plist
[8] = intern (":default-char");
9210 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9211 plist
[10] = intern (":for-unibyte");
9212 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9213 plist
[12] = intern (":docstring");
9214 plist
[13] = build_string ("Do no conversion.\n\
9216 When you visit a file with this coding, the file is read into a\n\
9217 unibyte buffer as is, thus each byte of a file is treated as a\n\
9219 plist
[14] = intern (":eol-type");
9220 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9221 args
[coding_arg_plist
] = Flist (16, plist
);
9222 Fdefine_coding_system_internal (coding_arg_max
, args
);
9225 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9226 setup_coding_system (Qno_conversion
, &terminal_coding
);
9227 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9232 for (i
= 0; i
< coding_category_max
; i
++)
9233 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9238 emacs_strerror (error_number
)
9243 synchronize_system_messages_locale ();
9244 str
= strerror (error_number
);
9246 if (! NILP (Vlocale_coding_system
))
9248 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9249 Vlocale_coding_system
,
9251 str
= (char *) SDATA (dec
);