1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source exausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
222 if (charbuf
< charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
260 Below is a template of these functions. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
268 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 int coding_system_require_warning
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
340 Lisp_Object Qutf_8_emacs
;
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read
;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write
;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used
;
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table
;
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion
;
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection
;
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system
;
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding
;
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding
;
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding
;
374 Lisp_Object Vfile_coding_system_alist
;
375 Lisp_Object Vprocess_coding_system_alist
;
376 Lisp_Object Vnetwork_coding_system_alist
;
378 Lisp_Object Vlocale_coding_system
;
382 /* Flag to tell if we look up translation table on character code
384 Lisp_Object Venable_character_translation
;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode
;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode
;
390 Lisp_Object Qtranslation_table
;
391 Lisp_Object Qtranslation_table_id
;
392 Lisp_Object Qtranslation_table_for_decode
;
393 Lisp_Object Qtranslation_table_for_encode
;
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table
;
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system
;
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input
;
404 /* Global flag to tell that we can't call post-read-conversion and
405 pre-write-conversion functions. Usually the value is zero, but it
406 is set to 1 temporarily while such functions are running. This is
407 to avoid infinite recursive call. */
408 static int inhibit_pre_post_conversion
;
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system
;
412 Lisp_Object Vbig5_coding_system
;
415 static int detect_coding_utf_8
P_ ((struct coding_system
*,
416 struct coding_detection_info
*info
));
417 static void decode_coding_utf_8
P_ ((struct coding_system
*));
418 static int encode_coding_utf_8
P_ ((struct coding_system
*));
420 static int detect_coding_utf_16
P_ ((struct coding_system
*,
421 struct coding_detection_info
*info
));
422 static void decode_coding_utf_16
P_ ((struct coding_system
*));
423 static int encode_coding_utf_16
P_ ((struct coding_system
*));
425 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
426 struct coding_detection_info
*info
));
427 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
428 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
430 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
431 struct coding_detection_info
*info
));
432 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
433 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
435 static int detect_coding_sjis
P_ ((struct coding_system
*,
436 struct coding_detection_info
*info
));
437 static void decode_coding_sjis
P_ ((struct coding_system
*));
438 static int encode_coding_sjis
P_ ((struct coding_system
*));
440 static int detect_coding_big5
P_ ((struct coding_system
*,
441 struct coding_detection_info
*info
));
442 static void decode_coding_big5
P_ ((struct coding_system
*));
443 static int encode_coding_big5
P_ ((struct coding_system
*));
445 static int detect_coding_ccl
P_ ((struct coding_system
*,
446 struct coding_detection_info
*info
));
447 static void decode_coding_ccl
P_ ((struct coding_system
*));
448 static int encode_coding_ccl
P_ ((struct coding_system
*));
450 static void decode_coding_raw_text
P_ ((struct coding_system
*));
451 static int encode_coding_raw_text
P_ ((struct coding_system
*));
454 /* ISO2022 section */
456 #define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
462 #define CODING_ISO_REQUEST(coding, charset_id) \
463 ((charset_id <= (coding)->max_charset_id \
464 ? (coding)->safe_charsets[charset_id] \
468 #define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470 #define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472 #define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474 #define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476 #define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481 /* Control characters of ISO2022. */
482 /* code */ /* function */
483 #define ISO_CODE_LF 0x0A /* line-feed */
484 #define ISO_CODE_CR 0x0D /* carriage-return */
485 #define ISO_CODE_SO 0x0E /* shift-out */
486 #define ISO_CODE_SI 0x0F /* shift-in */
487 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488 #define ISO_CODE_ESC 0x1B /* escape */
489 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
490 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
491 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493 /* All code (1-byte) of ISO2022 is classified into one of the
495 enum iso_code_class_type
497 ISO_control_0
, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 5 codes. */
500 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
501 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape
, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1
, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
524 /* If set, reset graphic planes and registers at end-of-line to the
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
551 /* If set, assume designation states are reset at beginning of line on
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
555 /* If set, designation sequence should be placed at beginning of line
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
562 /* If set, extra latin codes (128..159) are accepted as a valid code
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
600 /* Index for each coding category in `coding_categories' */
604 coding_category_iso_7
,
605 coding_category_iso_7_tight
,
606 coding_category_iso_8_1
,
607 coding_category_iso_8_2
,
608 coding_category_iso_7_else
,
609 coding_category_iso_8_else
,
610 coding_category_utf_8
,
611 coding_category_utf_16_auto
,
612 coding_category_utf_16_be
,
613 coding_category_utf_16_le
,
614 coding_category_utf_16_be_nosig
,
615 coding_category_utf_16_le_nosig
,
616 coding_category_charset
,
617 coding_category_sjis
,
618 coding_category_big5
,
620 coding_category_emacs_mule
,
621 /* All above are targets of code detection. */
622 coding_category_raw_text
,
623 coding_category_undecided
,
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list
;
699 /* Table of coding categories (Lisp symbols). This variable is for
701 static Lisp_Object Vcoding_category_table
;
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities
[coding_category_max
];
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories
[coding_category_max
];
710 /*** Commonly used macros and functions ***/
713 #define min(a, b) ((a) < (b) ? (a) : (b))
716 #define max(a, b) ((a) > (b) ? (a) : (b))
719 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
721 attrs = CODING_ID_ATTRS (coding->id); \
722 eol_type = CODING_ID_EOL_TYPE (coding->id); \
723 if (VECTORP (eol_type)) \
725 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
729 /* Safely get one byte from the source text pointed by SRC which ends
730 at SRC_END, and set C to that byte. If there are not enough bytes
731 in the source, it jumps to `no_more_source'. The caller
732 should declare and set these variables appropriately in advance:
733 src, src_end, multibytep
736 #define ONE_MORE_BYTE(c) \
738 if (src == src_end) \
740 if (src_base < src) \
741 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
742 goto no_more_source; \
745 if (multibytep && (c & 0x80)) \
747 if ((c & 0xFE) != 0xC0) \
748 error ("Undecodable char found"); \
749 c = ((c & 1) << 6) | *src++; \
755 #define ONE_MORE_BYTE_NO_CHECK(c) \
758 if (multibytep && (c & 0x80)) \
760 if ((c & 0xFE) != 0xC0) \
761 error ("Undecodable char found"); \
762 c = ((c & 1) << 6) | *src++; \
768 /* Store a byte C in the place pointed by DST and increment DST to the
769 next free point, and increment PRODUCED_CHARS. The caller should
770 assure that C is 0..127, and declare and set the variable `dst'
771 appropriately in advance.
775 #define EMIT_ONE_ASCII_BYTE(c) \
782 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
784 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
786 produced_chars += 2; \
787 *dst++ = (c1), *dst++ = (c2); \
791 /* Store a byte C in the place pointed by DST and increment DST to the
792 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
793 nonzero, store in an appropriate multibyte from. The caller should
794 declare and set the variables `dst' and `multibytep' appropriately
797 #define EMIT_ONE_BYTE(c) \
804 ch = BYTE8_TO_CHAR (ch); \
805 CHAR_STRING_ADVANCE (ch, dst); \
812 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
814 #define EMIT_TWO_BYTES(c1, c2) \
816 produced_chars += 2; \
823 ch = BYTE8_TO_CHAR (ch); \
824 CHAR_STRING_ADVANCE (ch, dst); \
827 ch = BYTE8_TO_CHAR (ch); \
828 CHAR_STRING_ADVANCE (ch, dst); \
838 #define EMIT_THREE_BYTES(c1, c2, c3) \
840 EMIT_ONE_BYTE (c1); \
841 EMIT_TWO_BYTES (c2, c3); \
845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
847 EMIT_TWO_BYTES (c1, c2); \
848 EMIT_TWO_BYTES (c3, c4); \
852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
854 charset_map_loaded = 0; \
855 c = DECODE_CHAR (charset, code); \
856 if (charset_map_loaded) \
858 const unsigned char *orig = coding->source; \
861 coding_set_source (coding); \
862 offset = coding->source - orig; \
864 src_base += offset; \
870 #define ASSURE_DESTINATION(bytes) \
872 if (dst + (bytes) >= dst_end) \
874 int more_bytes = charbuf_end - charbuf + (bytes); \
876 dst = alloc_destination (coding, more_bytes, dst); \
877 dst_end = coding->destination + coding->dst_bytes; \
884 coding_set_source (coding
)
885 struct coding_system
*coding
;
887 if (BUFFERP (coding
->src_object
))
889 struct buffer
*buf
= XBUFFER (coding
->src_object
);
891 if (coding
->src_pos
< 0)
892 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
894 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
896 else if (STRINGP (coding
->src_object
))
898 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
901 /* Otherwise, the source is C string and is never relocated
902 automatically. Thus we don't have to update anything. */
907 coding_set_destination (coding
)
908 struct coding_system
*coding
;
910 if (BUFFERP (coding
->dst_object
))
912 if (coding
->src_pos
< 0)
914 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
915 coding
->dst_bytes
= (GAP_END_ADDR
916 - (coding
->src_bytes
- coding
->consumed
)
917 - coding
->destination
);
921 /* We are sure that coding->dst_pos_byte is before the gap
923 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
924 + coding
->dst_pos_byte
- 1);
925 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
926 - coding
->destination
);
930 /* Otherwise, the destination is C string and is never relocated
931 automatically. Thus we don't have to update anything. */
937 coding_alloc_by_realloc (coding
, bytes
)
938 struct coding_system
*coding
;
941 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
942 coding
->dst_bytes
+ bytes
);
943 coding
->dst_bytes
+= bytes
;
947 coding_alloc_by_making_gap (coding
, bytes
)
948 struct coding_system
*coding
;
951 if (BUFFERP (coding
->dst_object
)
952 && EQ (coding
->src_object
, coding
->dst_object
))
954 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
956 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
958 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
962 Lisp_Object this_buffer
;
964 this_buffer
= Fcurrent_buffer ();
965 set_buffer_internal (XBUFFER (coding
->dst_object
));
967 set_buffer_internal (XBUFFER (this_buffer
));
972 static unsigned char *
973 alloc_destination (coding
, nbytes
, dst
)
974 struct coding_system
*coding
;
978 EMACS_INT offset
= dst
- coding
->destination
;
980 if (BUFFERP (coding
->dst_object
))
981 coding_alloc_by_making_gap (coding
, nbytes
);
983 coding_alloc_by_realloc (coding
, nbytes
);
984 coding
->result
= CODING_RESULT_SUCCESS
;
985 coding_set_destination (coding
);
986 dst
= coding
->destination
+ offset
;
990 /** Macros for annotations. */
992 /* Maximum length of annotation data (sum of annotations for
993 composition and charset). */
994 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
996 /* An annotation data is stored in the array coding->charbuf in this
998 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
999 LENGTH is the number of elements in the annotation.
1000 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1001 FROM and TO specify the range of text annotated. They are relative
1002 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1004 The format of the following elements depend on ANNOTATION_MASK.
1006 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1008 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1009 METHOD is one of enum composition_method.
1010 Optionnal COMPOSITION-COMPONENTS are characters and composition
1013 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1016 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1018 *(buf)++ = -(len); \
1019 *(buf)++ = (mask); \
1020 *(buf)++ = (from); \
1022 coding->annotated = 1; \
1025 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1027 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1032 #define ADD_CHARSET_DATA(buf, from, to, id) \
1034 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1039 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1047 Check if a text is encoded in UTF-8. If it is, return 1, else
1050 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1051 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1052 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1053 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1054 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1055 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1058 detect_coding_utf_8 (coding
, detect_info
)
1059 struct coding_system
*coding
;
1060 struct coding_detection_info
*detect_info
;
1062 const unsigned char *src
= coding
->source
, *src_base
= src
;
1063 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1064 int multibytep
= coding
->src_multibyte
;
1065 int consumed_chars
= 0;
1069 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1070 /* A coding system of this category is always ASCII compatible. */
1071 src
+= coding
->head_ascii
;
1075 int c
, c1
, c2
, c3
, c4
;
1079 if (UTF_8_1_OCTET_P (c
))
1083 if (! UTF_8_EXTRA_OCTET_P (c1
))
1085 if (UTF_8_2_OCTET_LEADING_P (c
))
1087 found
= CATEGORY_MASK_UTF_8
;
1091 if (! UTF_8_EXTRA_OCTET_P (c2
))
1093 if (UTF_8_3_OCTET_LEADING_P (c
))
1095 found
= CATEGORY_MASK_UTF_8
;
1099 if (! UTF_8_EXTRA_OCTET_P (c3
))
1101 if (UTF_8_4_OCTET_LEADING_P (c
))
1103 found
= CATEGORY_MASK_UTF_8
;
1107 if (! UTF_8_EXTRA_OCTET_P (c4
))
1109 if (UTF_8_5_OCTET_LEADING_P (c
))
1111 found
= CATEGORY_MASK_UTF_8
;
1116 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1120 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1122 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1125 detect_info
->found
|= found
;
1131 decode_coding_utf_8 (coding
)
1132 struct coding_system
*coding
;
1134 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1135 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1136 const unsigned char *src_base
;
1137 int *charbuf
= coding
->charbuf
;
1138 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1139 int consumed_chars
= 0, consumed_chars_base
;
1140 int multibytep
= coding
->src_multibyte
;
1141 Lisp_Object attr
, eol_type
, charset_list
;
1143 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1147 int c
, c1
, c2
, c3
, c4
, c5
;
1150 consumed_chars_base
= consumed_chars
;
1152 if (charbuf
>= charbuf_end
)
1156 if (UTF_8_1_OCTET_P(c1
))
1161 if (EQ (eol_type
, Qdos
))
1165 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1166 goto no_more_source
;
1171 else if (EQ (eol_type
, Qmac
))
1178 if (! UTF_8_EXTRA_OCTET_P (c2
))
1180 if (UTF_8_2_OCTET_LEADING_P (c1
))
1182 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1183 /* Reject overlong sequences here and below. Encoders
1184 producing them are incorrect, they can be misleading,
1185 and they mess up read/write invariance. */
1192 if (! UTF_8_EXTRA_OCTET_P (c3
))
1194 if (UTF_8_3_OCTET_LEADING_P (c1
))
1196 c
= (((c1
& 0xF) << 12)
1197 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1199 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1205 if (! UTF_8_EXTRA_OCTET_P (c4
))
1207 if (UTF_8_4_OCTET_LEADING_P (c1
))
1209 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1210 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1217 if (! UTF_8_EXTRA_OCTET_P (c5
))
1219 if (UTF_8_5_OCTET_LEADING_P (c1
))
1221 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1222 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1224 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1239 consumed_chars
= consumed_chars_base
;
1241 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1246 coding
->consumed_char
+= consumed_chars_base
;
1247 coding
->consumed
= src_base
- coding
->source
;
1248 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1253 encode_coding_utf_8 (coding
)
1254 struct coding_system
*coding
;
1256 int multibytep
= coding
->dst_multibyte
;
1257 int *charbuf
= coding
->charbuf
;
1258 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1259 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1260 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1261 int produced_chars
= 0;
1266 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1268 while (charbuf
< charbuf_end
)
1270 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1272 ASSURE_DESTINATION (safe_room
);
1274 if (CHAR_BYTE8_P (c
))
1276 c
= CHAR_TO_BYTE8 (c
);
1281 CHAR_STRING_ADVANCE (c
, pend
);
1282 for (p
= str
; p
< pend
; p
++)
1289 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1291 while (charbuf
< charbuf_end
)
1293 ASSURE_DESTINATION (safe_room
);
1295 dst
+= CHAR_STRING (c
, dst
);
1299 coding
->result
= CODING_RESULT_SUCCESS
;
1300 coding
->produced_char
+= produced_chars
;
1301 coding
->produced
= dst
- coding
->destination
;
1306 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1307 Check if a text is encoded in one of UTF-16 based coding systems.
1308 If it is, return 1, else return 0. */
1310 #define UTF_16_HIGH_SURROGATE_P(val) \
1311 (((val) & 0xFC00) == 0xD800)
1313 #define UTF_16_LOW_SURROGATE_P(val) \
1314 (((val) & 0xFC00) == 0xDC00)
1316 #define UTF_16_INVALID_P(val) \
1317 (((val) == 0xFFFE) \
1318 || ((val) == 0xFFFF) \
1319 || UTF_16_LOW_SURROGATE_P (val))
1323 detect_coding_utf_16 (coding
, detect_info
)
1324 struct coding_system
*coding
;
1325 struct coding_detection_info
*detect_info
;
1327 const unsigned char *src
= coding
->source
, *src_base
= src
;
1328 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1329 int multibytep
= coding
->src_multibyte
;
1330 int consumed_chars
= 0;
1333 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1335 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1336 && (coding
->src_bytes
& 1))
1338 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1344 if ((c1
== 0xFF) && (c2
== 0xFE))
1346 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1347 | CATEGORY_MASK_UTF_16_AUTO
);
1348 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE
;
1350 else if ((c1
== 0xFE) && (c2
== 0xFF))
1352 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1353 | CATEGORY_MASK_UTF_16_AUTO
);
1354 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE
;
1361 decode_coding_utf_16 (coding
)
1362 struct coding_system
*coding
;
1364 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1365 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1366 const unsigned char *src_base
;
1367 int *charbuf
= coding
->charbuf
;
1368 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1369 int consumed_chars
= 0, consumed_chars_base
;
1370 int multibytep
= coding
->src_multibyte
;
1371 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1372 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1373 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1374 Lisp_Object attr
, eol_type
, charset_list
;
1376 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1378 if (bom
== utf_16_with_bom
)
1387 if (endian
== utf_16_big_endian
1388 ? c
!= 0xFEFF : c
!= 0xFFFE)
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1395 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1397 else if (bom
== utf_16_detect_bom
)
1399 /* We have already tried to detect BOM and failed in
1401 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1409 consumed_chars_base
= consumed_chars
;
1411 if (charbuf
+ 2 >= charbuf_end
)
1416 c
= (endian
== utf_16_big_endian
1417 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1420 if (! UTF_16_LOW_SURROGATE_P (c
))
1422 if (endian
== utf_16_big_endian
)
1423 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1425 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1429 if (UTF_16_HIGH_SURROGATE_P (c
))
1430 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1436 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1443 if (UTF_16_HIGH_SURROGATE_P (c
))
1444 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1451 coding
->consumed_char
+= consumed_chars_base
;
1452 coding
->consumed
= src_base
- coding
->source
;
1453 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1457 encode_coding_utf_16 (coding
)
1458 struct coding_system
*coding
;
1460 int multibytep
= coding
->dst_multibyte
;
1461 int *charbuf
= coding
->charbuf
;
1462 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1463 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1464 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1466 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1467 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1468 int produced_chars
= 0;
1469 Lisp_Object attrs
, eol_type
, charset_list
;
1472 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1474 if (bom
!= utf_16_without_bom
)
1476 ASSURE_DESTINATION (safe_room
);
1478 EMIT_TWO_BYTES (0xFE, 0xFF);
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
1481 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1484 while (charbuf
< charbuf_end
)
1486 ASSURE_DESTINATION (safe_room
);
1488 if (c
>= MAX_UNICODE_CHAR
)
1489 c
= coding
->default_char
;
1494 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1496 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1503 c1
= (c
>> 10) + 0xD800;
1504 c2
= (c
& 0x3FF) + 0xDC00;
1506 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1508 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1511 coding
->result
= CODING_RESULT_SUCCESS
;
1512 coding
->produced
= dst
- coding
->destination
;
1513 coding
->produced_char
+= produced_chars
;
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1520 /* Emacs' internal format for representation of multiple character
1521 sets is a kind of multi-byte encoding, i.e. characters are
1522 represented by variable-length sequences of one-byte codes.
1524 ASCII characters and control characters (e.g. `tab', `newline') are
1525 represented by one-byte sequences which are their ASCII codes, in
1526 the range 0x00 through 0x7F.
1528 8-bit characters of the range 0x80..0x9F are represented by
1529 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1532 8-bit characters of the range 0xA0..0xFF are represented by
1533 one-byte sequences which are their 8-bit code.
1535 The other characters are represented by a sequence of `base
1536 leading-code', optional `extended leading-code', and one or two
1537 `position-code's. The length of the sequence is determined by the
1538 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1539 whereas extended leading-code and position-code take the range 0xA0
1540 through 0xFF. See `charset.h' for more details about leading-code
1543 --- CODE RANGE of Emacs' internal format ---
1547 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548 eight-bit-graphic 0xA0..0xBF
1549 ELSE 0x81..0x9D + [0xA0..0xFF]+
1550 ---------------------------------------------
1552 As this is the internal character representation, the format is
1553 usually not used externally (i.e. in a file or in a data sent to a
1554 process). But, it is possible to have a text externally in this
1555 format (i.e. by encoding by the coding system `emacs-mule').
1557 In that case, a sequence of one-byte codes has a slightly different
1560 At first, all characters in eight-bit-control are represented by
1561 one-byte sequences which are their 8-bit code.
1563 Next, character composition data are represented by the byte
1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1566 METHOD is 0xF0 plus one of composition method (enum
1567 composition_method),
1569 BYTES is 0xA0 plus a byte length of this composition data,
1571 CHARS is 0x20 plus a number of characters composed by this
1574 COMPONENTs are characters of multibye form or composition
1575 rules encoded by two-byte of ASCII codes.
1577 In addition, for backward compatibility, the following formats are
1578 also recognized as composition data on decoding.
1581 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1584 MSEQ is a multibyte form but in these special format:
1585 ASCII: 0xA0 ASCII_CODE+0x80,
1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587 RULE is a one byte code of the range 0xA0..0xF0 that
1588 represents a composition rule.
1591 char emacs_mule_bytes
[256];
1594 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1595 struct coding_system
*coding
;
1597 int *nbytes
, *nchars
, *id
;
1599 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1600 const unsigned char *src_base
= src
;
1601 int multibytep
= coding
->src_multibyte
;
1602 struct charset
*charset
;
1605 int consumed_chars
= 0;
1608 switch (emacs_mule_bytes
[c
])
1611 if (! (charset
= emacs_mule_charset
[c
]))
1618 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1622 if (! (charset
= emacs_mule_charset
[c
]))
1629 if (! (charset
= emacs_mule_charset
[c
]))
1632 code
= (c
& 0x7F) << 8;
1640 if (! (charset
= emacs_mule_charset
[c
]))
1643 code
= (c
& 0x7F) << 8;
1650 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1651 ? charset_ascii
: charset_eight_bit
);
1657 c
= DECODE_CHAR (charset
, code
);
1660 *nbytes
= src
- src_base
;
1661 *nchars
= consumed_chars
;
1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1679 detect_coding_emacs_mule (coding
, detect_info
)
1680 struct coding_system
*coding
;
1681 struct coding_detection_info
*detect_info
;
1683 const unsigned char *src
= coding
->source
, *src_base
= src
;
1684 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1685 int multibytep
= coding
->src_multibyte
;
1686 int consumed_chars
= 0;
1691 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1692 /* A coding system of this category is always ASCII compatible. */
1693 src
+= coding
->head_ascii
;
1703 /* Perhaps the start of composite character. We simple skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 constitues of more than 4 bytes. */
1707 const unsigned char *src_base
;
1717 if (src
- src_base
<= 4)
1719 found
= CATEGORY_MASK_EMACS_MULE
;
1727 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1732 const unsigned char *src_base
= src
- 1;
1739 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1741 found
= CATEGORY_MASK_EMACS_MULE
;
1744 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1748 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1750 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1753 detect_info
->found
|= found
;
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1760 /* Decode a character represented as a component of composition
1761 sequence of Emacs 20/21 style at SRC. Set C to that character and
1762 update SRC to the head of next character (or an encoded composition
1763 rule). If SRC doesn't points a composition component, set C to -1.
1764 If SRC points an invalid byte sequence, global exit by a return
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1771 int nbytes, nchars; \
1773 if (src == src_end) \
1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1780 goto invalid_code; \
1784 consumed_chars += nchars; \
1789 /* Decode a composition rule represented as a component of composition
1790 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1791 and increment BUF. If SRC points an invalid byte sequence, set C
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1796 int c, gref, nref; \
1798 if (src >= src_end) \
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1810 /* Decode a composition rule represented as a component of composition
1811 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1812 and increment BUF. If SRC points an invalid byte sequence, set C
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1832 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1834 /* Emacs 21 style format. The first three bytes at SRC are \
1835 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1836 the byte length of this composition information, CHARS is the \
1837 number of characters composed by this composition. */ \
1838 enum composition_method method = c - 0xF2; \
1839 int *charbuf_base = charbuf; \
1841 int consumed_chars_limit; \
1842 int nbytes, nchars; \
1844 ONE_MORE_BYTE (c); \
1845 nbytes = c - 0xA0; \
1847 goto invalid_code; \
1848 ONE_MORE_BYTE (c); \
1849 nchars = c - 0xA0; \
1850 from = coding->produced + char_offset; \
1851 to = from + nchars; \
1852 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1853 consumed_chars_limit = consumed_chars_base + nbytes; \
1854 if (method != COMPOSITION_RELATIVE) \
1857 while (consumed_chars < consumed_chars_limit) \
1859 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1860 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1862 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1865 if (consumed_chars < consumed_chars_limit) \
1866 goto invalid_code; \
1867 charbuf_base[0] -= i; \
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
1876 enum composition_method method = COMPOSITION_RELATIVE; \
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1887 goto invalid_code; \
1888 from = coding->produced_char + char_offset; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1896 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1898 /* Emacs 20 style format for rule-base composition. */ \
1899 /* Store multibyte form of characters to be composed. */ \
1900 enum composition_method method = COMPOSITION_WITH_RULE; \
1901 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1902 int *buf = components; \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1909 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1910 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1912 if (i < 1 || (buf - components) % 2 == 0) \
1913 goto invalid_code; \
1914 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1915 goto no_more_source; \
1916 from = coding->produced_char + char_offset; \
1918 ADD_COMPOSITION_DATA (buf, from, to, method); \
1919 for (j = 0; j < i; j++) \
1920 *charbuf++ = components[j]; \
1921 for (j = 0; j < i; j += 2) \
1922 *charbuf++ = components[j]; \
1927 decode_coding_emacs_mule (coding
)
1928 struct coding_system
*coding
;
1930 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1931 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1932 const unsigned char *src_base
;
1933 int *charbuf
= coding
->charbuf
;
1934 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1935 int consumed_chars
= 0, consumed_chars_base
;
1936 int multibytep
= coding
->src_multibyte
;
1937 Lisp_Object attrs
, eol_type
, charset_list
;
1938 int char_offset
= coding
->produced_char
;
1939 int last_offset
= char_offset
;
1940 int last_id
= charset_ascii
;
1942 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1949 consumed_chars_base
= consumed_chars
;
1951 if (charbuf
>= charbuf_end
)
1960 if (EQ (eol_type
, Qdos
))
1964 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1965 goto no_more_source
;
1970 else if (EQ (eol_type
, Qmac
))
1979 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1980 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1981 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1983 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1985 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1989 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1995 consumed_chars
= consumed_chars_base
;
1996 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2005 if (last_id
!= charset_ascii
)
2006 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2008 last_offset
= char_offset
;
2012 consumed_chars
+= nchars
;
2019 consumed_chars
= consumed_chars_base
;
2021 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2027 if (last_id
!= charset_ascii
)
2028 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2029 coding
->consumed_char
+= consumed_chars_base
;
2030 coding
->consumed
= src_base
- coding
->source
;
2031 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2035 #define EMACS_MULE_LEADING_CODES(id, codes) \
2038 codes[0] = id, codes[1] = 0; \
2039 else if (id < 0xE0) \
2040 codes[0] = 0x9A, codes[1] = id; \
2041 else if (id < 0xF0) \
2042 codes[0] = 0x9B, codes[1] = id; \
2043 else if (id < 0xF5) \
2044 codes[0] = 0x9C, codes[1] = id; \
2046 codes[0] = 0x9D, codes[1] = id; \
2051 encode_coding_emacs_mule (coding
)
2052 struct coding_system
*coding
;
2054 int multibytep
= coding
->dst_multibyte
;
2055 int *charbuf
= coding
->charbuf
;
2056 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2057 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2058 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2060 int produced_chars
= 0;
2061 Lisp_Object attrs
, eol_type
, charset_list
;
2063 int preferred_charset_id
= -1;
2065 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2067 while (charbuf
< charbuf_end
)
2069 ASSURE_DESTINATION (safe_room
);
2074 /* Handle an annotation. */
2077 case CODING_ANNOTATE_COMPOSITION_MASK
:
2078 /* Not yet implemented. */
2080 case CODING_ANNOTATE_CHARSET_MASK
:
2081 preferred_charset_id
= charbuf
[3];
2082 if (preferred_charset_id
>= 0
2083 && NILP (Fmemq (make_number (preferred_charset_id
),
2085 preferred_charset_id
= -1;
2094 if (ASCII_CHAR_P (c
))
2095 EMIT_ONE_ASCII_BYTE (c
);
2096 else if (CHAR_BYTE8_P (c
))
2098 c
= CHAR_TO_BYTE8 (c
);
2103 struct charset
*charset
;
2107 unsigned char leading_codes
[2];
2109 if (preferred_charset_id
>= 0)
2111 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2112 if (! CHAR_CHARSET_P (c
, charset
))
2113 charset
= char_charset (c
, charset_list
, NULL
);
2116 charset
= char_charset (c
, charset_list
, &code
);
2119 c
= coding
->default_char
;
2120 if (ASCII_CHAR_P (c
))
2122 EMIT_ONE_ASCII_BYTE (c
);
2125 charset
= char_charset (c
, charset_list
, &code
);
2127 dimension
= CHARSET_DIMENSION (charset
);
2128 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2129 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2130 EMIT_ONE_BYTE (leading_codes
[0]);
2131 if (leading_codes
[1])
2132 EMIT_ONE_BYTE (leading_codes
[1]);
2134 EMIT_ONE_BYTE (code
);
2137 EMIT_ONE_BYTE (code
>> 8);
2138 EMIT_ONE_BYTE (code
& 0xFF);
2142 coding
->result
= CODING_RESULT_SUCCESS
;
2143 coding
->produced_char
+= produced_chars
;
2144 coding
->produced
= dst
- coding
->destination
;
2149 /*** 7. ISO2022 handlers ***/
2151 /* The following note describes the coding system ISO2022 briefly.
2152 Since the intention of this note is to help understand the
2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
2154 SIMPLIFIED. For thorough understanding, please refer to the
2155 original document of ISO2022. This is equivalent to the standard
2156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2158 ISO2022 provides many mechanisms to encode several character sets
2159 in 7-bit and 8-bit environments. For 7-bit environments, all text
2160 is encoded using bytes less than 128. This may make the encoded
2161 text a little bit longer, but the text passes more easily through
2162 several types of gateway, some of which strip off the MSB (Most
2165 There are two kinds of character sets: control character sets and
2166 graphic character sets. The former contain control characters such
2167 as `newline' and `escape' to provide control functions (control
2168 functions are also provided by escape sequences). The latter
2169 contain graphic characters such as 'A' and '-'. Emacs recognizes
2170 two control character sets and many graphic character sets.
2172 Graphic character sets are classified into one of the following
2173 four classes, according to the number of bytes (DIMENSION) and
2174 number of characters in one dimension (CHARS) of the set:
2175 - DIMENSION1_CHARS94
2176 - DIMENSION1_CHARS96
2177 - DIMENSION2_CHARS94
2178 - DIMENSION2_CHARS96
2180 In addition, each character set is assigned an identification tag,
2181 unique for each set, called the "final character" (denoted as <F>
2182 hereafter). The <F> of each character set is decided by ECMA(*)
2183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2184 (0x30..0x3F are for private use only).
2186 Note (*): ECMA = European Computer Manufacturers Association
2188 Here are examples of graphic character sets [NAME(<F>)]:
2189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192 o DIMENSION2_CHARS96 -- none for the moment
2194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2195 C0 [0x00..0x1F] -- control character plane 0
2196 GL [0x20..0x7F] -- graphic character plane 0
2197 C1 [0x80..0x9F] -- control character plane 1
2198 GR [0xA0..0xFF] -- graphic character plane 1
2200 A control character set is directly designated and invoked to C0 or
2201 C1 by an escape sequence. The most common case is that:
2202 - ISO646's control character set is designated/invoked to C0, and
2203 - ISO6429's control character set is designated/invoked to C1,
2204 and usually these designations/invocations are omitted in encoded
2205 text. In a 7-bit environment, only C0 can be used, and a control
2206 character for C1 is encoded by an appropriate escape sequence to
2207 fit into the environment. All control characters for C1 are
2208 defined to have corresponding escape sequences.
2210 A graphic character set is at first designated to one of four
2211 graphic registers (G0 through G3), then these graphic registers are
2212 invoked to GL or GR. These designations and invocations can be
2213 done independently. The most common case is that G0 is invoked to
2214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2215 these invocations and designations are omitted in encoded text.
2216 In a 7-bit environment, only GL can be used.
2218 When a graphic character set of CHARS94 is invoked to GL, codes
2219 0x20 and 0x7F of the GL area work as control characters SPACE and
2220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2223 There are two ways of invocation: locking-shift and single-shift.
2224 With locking-shift, the invocation lasts until the next different
2225 invocation, whereas with single-shift, the invocation affects the
2226 following character only and doesn't affect the locking-shift
2227 state. Invocations are done by the following control characters or
2230 ----------------------------------------------------------------------
2231 abbrev function cntrl escape seq description
2232 ----------------------------------------------------------------------
2233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2239 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2242 ----------------------------------------------------------------------
2243 (*) These are not used by any known coding system.
2245 Control characters for these functions are defined by macros
2246 ISO_CODE_XXX in `coding.h'.
2248 Designations are done by the following escape sequences:
2249 ----------------------------------------------------------------------
2250 escape sequence description
2251 ----------------------------------------------------------------------
2252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2268 ----------------------------------------------------------------------
2270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2271 of dimension 1, chars 94, and final character <F>, etc...
2273 Note (*): Although these designations are not allowed in ISO2022,
2274 Emacs accepts them on decoding, and produces them on encoding
2275 CHARS96 character sets in a coding system which is characterized as
2276 7-bit environment, non-locking-shift, and non-single-shift.
2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2279 '(' must be omitted. We refer to this as "short-form" hereafter.
2281 Now you may notice that there are a lot of ways of encoding the
2282 same multilingual text in ISO2022. Actually, there exist many
2283 coding systems such as Compound Text (used in X11's inter client
2284 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2286 localized platforms), and all of these are variants of ISO2022.
2288 In addition to the above, Emacs handles two more kinds of escape
2289 sequences: ISO6429's direction specification and Emacs' private
2290 sequence for specifying character composition.
2292 ISO6429's direction specification takes the following form:
2293 o CSI ']' -- end of the current direction
2294 o CSI '0' ']' -- end of the current direction
2295 o CSI '1' ']' -- start of left-to-right text
2296 o CSI '2' ']' -- start of right-to-left text
2297 The control character CSI (0x9B: control sequence introducer) is
2298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2300 Character composition specification takes the following form:
2301 o ESC '0' -- start relative composition
2302 o ESC '1' -- end composition
2303 o ESC '2' -- start rule-base composition (*)
2304 o ESC '3' -- start relative composition with alternate chars (**)
2305 o ESC '4' -- start rule-base composition with alternate chars (**)
2306 Since these are not standard escape sequences of any ISO standard,
2307 the use of them with these meanings is restricted to Emacs only.
2309 (*) This form is used only in Emacs 20.7 and older versions,
2310 but newer versions can safely decode it.
2311 (**) This form is used only in Emacs 21.1 and newer versions,
2312 and older versions can't decode it.
2314 Here's a list of example usages of these composition escape
2315 sequences (categorized by `enum composition_method').
2317 COMPOSITION_RELATIVE:
2318 ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE:
2320 ESC 2 CHAR [ RULE CHAR ] ESC 1
2321 COMPOSITION_WITH_ALTCHARS:
2322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2323 COMPOSITION_WITH_RULE_ALTCHARS:
2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2326 enum iso_code_class_type iso_code_class
[256];
2328 #define SAFE_CHARSET_P(coding, id) \
2329 ((id) <= (coding)->max_charset_id \
2330 && (coding)->safe_charsets[id] >= 0)
2333 #define SHIFT_OUT_OK(category) \
2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2337 setup_iso_safe_charsets (attrs
)
2340 Lisp_Object charset_list
, safe_charsets
;
2341 Lisp_Object request
;
2342 Lisp_Object reg_usage
;
2345 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2348 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2349 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2350 && ! EQ (charset_list
, Viso_2022_charset_list
))
2352 CODING_ATTR_CHARSET_LIST (attrs
)
2353 = charset_list
= Viso_2022_charset_list
;
2354 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2357 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2361 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2363 int id
= XINT (XCAR (tail
));
2364 if (max_charset_id
< id
)
2365 max_charset_id
= id
;
2368 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2370 request
= AREF (attrs
, coding_attr_iso_request
);
2371 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2372 reg94
= XINT (XCAR (reg_usage
));
2373 reg96
= XINT (XCDR (reg_usage
));
2375 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2379 struct charset
*charset
;
2382 charset
= CHARSET_FROM_ID (XINT (id
));
2383 reg
= Fcdr (Fassq (id
, request
));
2385 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2386 else if (charset
->iso_chars_96
)
2389 SSET (safe_charsets
, XINT (id
), reg96
);
2394 SSET (safe_charsets
, XINT (id
), reg94
);
2397 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2402 Check if a text is encoded in one of ISO-2022 based codig systems.
2403 If it is, return 1, else return 0. */
2406 detect_coding_iso_2022 (coding
, detect_info
)
2407 struct coding_system
*coding
;
2408 struct coding_detection_info
*detect_info
;
2410 const unsigned char *src
= coding
->source
, *src_base
= src
;
2411 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2412 int multibytep
= coding
->src_multibyte
;
2413 int single_shifting
= 0;
2416 int consumed_chars
= 0;
2421 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2423 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2425 struct coding_system
*this = &(coding_categories
[i
]);
2426 Lisp_Object attrs
, val
;
2428 attrs
= CODING_ID_ATTRS (this->id
);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2431 setup_iso_safe_charsets (attrs
);
2432 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2433 this->max_charset_id
= SCHARS (val
) - 1;
2434 this->safe_charsets
= (char *) SDATA (val
);
2437 /* A coding system of this category is always ASCII compatible. */
2438 src
+= coding
->head_ascii
;
2440 while (rejected
!= CATEGORY_MASK_ISO
)
2446 if (inhibit_iso_escape_detection
)
2448 single_shifting
= 0;
2450 if (c
>= '(' && c
<= '/')
2452 /* Designation sequence for a charset of dimension 1. */
2454 if (c1
< ' ' || c1
>= 0x80
2455 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2456 /* Invalid designation sequence. Just ignore. */
2461 /* Designation sequence for a charset of dimension 2. */
2463 if (c
>= '@' && c
<= 'B')
2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2465 id
= iso_charset_table
[1][0][c
];
2466 else if (c
>= '(' && c
<= '/')
2469 if (c1
< ' ' || c1
>= 0x80
2470 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2471 /* Invalid designation sequence. Just ignore. */
2475 /* Invalid designation sequence. Just ignore it. */
2478 else if (c
== 'N' || c
== 'O')
2480 /* ESC <Fe> for SS2 or SS3. */
2481 single_shifting
= 1;
2482 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2485 else if (c
>= '0' && c
<= '4')
2487 /* ESC <Fp> for start/end composition. */
2488 found
|= CATEGORY_MASK_ISO
;
2493 /* Invalid escape sequence. Just ignore it. */
2497 /* We found a valid designation sequence for CHARSET. */
2498 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2499 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2501 found
|= CATEGORY_MASK_ISO_7
;
2503 rejected
|= CATEGORY_MASK_ISO_7
;
2504 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2506 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2508 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2509 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2511 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2513 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2514 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2516 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2518 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2523 /* Locking shift out/in. */
2524 if (inhibit_iso_escape_detection
)
2526 single_shifting
= 0;
2527 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2528 found
|= CATEGORY_MASK_ISO_ELSE
;
2532 /* Control sequence introducer. */
2533 single_shifting
= 0;
2534 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2535 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2536 goto check_extra_latin
;
2542 if (inhibit_iso_escape_detection
)
2544 single_shifting
= 1;
2545 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2546 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2547 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2548 found
|= CATEGORY_MASK_ISO_8_1
;
2549 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2550 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2551 found
|= CATEGORY_MASK_ISO_8_2
;
2552 goto check_extra_latin
;
2557 single_shifting
= 0;
2562 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2563 found
|= CATEGORY_MASK_ISO_8_1
;
2564 /* Check the length of succeeding codes of the range
2565 0xA0..0FF. If the byte length is even, we include
2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2567 only when we are not single shifting. */
2568 if (! single_shifting
2569 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2572 while (src
< src_end
)
2580 if (i
& 1 && src
< src_end
)
2581 rejected
|= CATEGORY_MASK_ISO_8_2
;
2583 found
|= CATEGORY_MASK_ISO_8_2
;
2588 single_shifting
= 0;
2589 if (! VECTORP (Vlatin_extra_code_table
)
2590 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2592 rejected
= CATEGORY_MASK_ISO
;
2595 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2596 & CODING_ISO_FLAG_LATIN_EXTRA
)
2597 found
|= CATEGORY_MASK_ISO_8_1
;
2599 rejected
|= CATEGORY_MASK_ISO_8_1
;
2600 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2601 & CODING_ISO_FLAG_LATIN_EXTRA
)
2602 found
|= CATEGORY_MASK_ISO_8_2
;
2604 rejected
|= CATEGORY_MASK_ISO_8_2
;
2607 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2611 detect_info
->rejected
|= rejected
;
2612 detect_info
->found
|= (found
& ~rejected
);
2617 /* Set designation state into CODING. */
2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2622 if (final < '0' || final >= 128 \
2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2624 || !SAFE_CHARSET_P (coding, id)) \
2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2627 goto invalid_code; \
2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
2630 if (id == charset_jisx0201_roman) \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2633 id = charset_ascii; \
2635 else if (id == charset_jisx0208_1978) \
2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2638 id = charset_jisx0208; \
2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
2641 /* If there was an invalid designation to REG previously, and this \
2642 designation is ASCII to REG, we should keep this designation \
2644 if (prev == -2 && id == charset_ascii) \
2645 goto invalid_code; \
2649 #define MAYBE_FINISH_COMPOSITION() \
2652 if (composition_state == COMPOSING_NO) \
2654 /* It is assured that we have enough room for producing \
2655 characters stored in the table `components'. */ \
2656 if (charbuf + component_idx > charbuf_end) \
2657 goto no_more_source; \
2658 composition_state = COMPOSING_NO; \
2659 if (method == COMPOSITION_RELATIVE \
2660 || method == COMPOSITION_WITH_ALTCHARS) \
2662 for (i = 0; i < component_idx; i++) \
2663 *charbuf++ = components[i]; \
2664 char_offset += component_idx; \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2682 #define DECODE_COMPOSITION_START(c1) \
2685 && composition_state == COMPOSING_COMPONENT_RULE) \
2687 component_len = component_idx; \
2688 composition_state = COMPOSING_CHAR; \
2692 const unsigned char *p; \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2700 if (p == src_end - 1) \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
2719 /* Handle compositoin end sequence ESC 1. */
2721 #define DECODE_COMPOSITION_END() \
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2727 int *saved_charbuf = charbuf; \
2728 int from = char_offset; \
2729 int to = from + nchars; \
2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2732 if (method != COMPOSITION_RELATIVE) \
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
2754 from SRC) and store one encoded composition rule in
2755 coding->cmp_data. */
2757 #define DECODE_COMPOSITION_RULE(c1) \
2760 if (c1 < 81) /* old format (before ver.21) */ \
2762 int gref = (c1) / 9; \
2763 int nref = (c1) % 9; \
2764 if (gref == 4) gref = 10; \
2765 if (nref == 4) nref = 10; \
2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2768 else if (c1 < 93) /* new format (after ver.21) */ \
2770 ONE_MORE_BYTE (c2); \
2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2781 decode_coding_iso_2022 (coding
)
2782 struct coding_system
*coding
;
2784 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2785 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2786 const unsigned char *src_base
;
2787 int *charbuf
= coding
->charbuf
;
2789 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2790 int consumed_chars
= 0, consumed_chars_base
;
2791 int multibytep
= coding
->src_multibyte
;
2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2793 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2794 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2795 struct charset
*charset
;
2797 /* For handling composition sequence. */
2798 #define COMPOSING_NO 0
2799 #define COMPOSING_CHAR 1
2800 #define COMPOSING_RULE 2
2801 #define COMPOSING_COMPONENT_CHAR 3
2802 #define COMPOSING_COMPONENT_RULE 4
2804 int composition_state
= COMPOSING_NO
;
2805 enum composition_method method
;
2806 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2809 Lisp_Object attrs
, eol_type
, charset_list
;
2810 int char_offset
= coding
->produced_char
;
2811 int last_offset
= char_offset
;
2812 int last_id
= charset_ascii
;
2814 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2815 setup_iso_safe_charsets (attrs
);
2822 consumed_chars_base
= consumed_chars
;
2824 if (charbuf
>= charbuf_end
)
2829 /* We produce at most one character. */
2830 switch (iso_code_class
[c1
])
2832 case ISO_0x20_or_0x7F
:
2833 if (composition_state
!= COMPOSING_NO
)
2835 if (composition_state
== COMPOSING_RULE
2836 || composition_state
== COMPOSING_COMPONENT_RULE
)
2838 DECODE_COMPOSITION_RULE (c1
);
2839 components
[component_idx
++] = c1
;
2840 composition_state
--;
2844 if (charset_id_0
< 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2846 /* This is SPACE or DEL. */
2847 charset
= CHARSET_FROM_ID (charset_ascii
);
2849 charset
= CHARSET_FROM_ID (charset_id_0
);
2852 case ISO_graphic_plane_0
:
2853 if (composition_state
!= COMPOSING_NO
)
2855 if (composition_state
== COMPOSING_RULE
2856 || composition_state
== COMPOSING_COMPONENT_RULE
)
2858 DECODE_COMPOSITION_RULE (c1
);
2859 components
[component_idx
++] = c1
;
2860 composition_state
--;
2864 charset
= CHARSET_FROM_ID (charset_id_0
);
2867 case ISO_0xA0_or_0xFF
:
2868 if (charset_id_1
< 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2870 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2872 /* This is a graphic character, we fall down ... */
2874 case ISO_graphic_plane_1
:
2875 if (charset_id_1
< 0)
2877 charset
= CHARSET_FROM_ID (charset_id_1
);
2880 case ISO_carriage_return
:
2883 if (EQ (eol_type
, Qdos
))
2887 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
2888 goto no_more_source
;
2893 else if (EQ (eol_type
, Qmac
))
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset
= CHARSET_FROM_ID (charset_ascii
);
2904 MAYBE_FINISH_COMPOSITION ();
2908 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2909 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2911 CODING_ISO_INVOCATION (coding
, 0) = 1;
2912 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2916 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2918 CODING_ISO_INVOCATION (coding
, 0) = 0;
2919 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2922 case ISO_single_shift_2_7
:
2923 case ISO_single_shift_2
:
2924 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2926 /* SS2 is handled as an escape sequence of ESC 'N' */
2928 goto label_escape_sequence
;
2930 case ISO_single_shift_3
:
2931 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2933 /* SS2 is handled as an escape sequence of ESC 'O' */
2935 goto label_escape_sequence
;
2937 case ISO_control_sequence_introducer
:
2938 /* CSI is handled as an escape sequence of ESC '[' ... */
2940 goto label_escape_sequence
;
2944 label_escape_sequence
:
2945 /* Escape sequences handled here are invocation,
2946 designation, direction specification, and character
2947 composition specification. */
2950 case '&': /* revision of following character set */
2952 if (!(c1
>= '@' && c1
<= '~'))
2955 if (c1
!= ISO_CODE_ESC
)
2958 goto label_escape_sequence
;
2960 case '$': /* designation of 2-byte character set */
2961 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2964 if (c1
>= '@' && c1
<= 'B')
2965 { /* designation of JISX0208.1978, GB2312.1980,
2967 DECODE_DESIGNATION (0, 2, 0, c1
);
2969 else if (c1
>= 0x28 && c1
<= 0x2B)
2970 { /* designation of DIMENSION2_CHARS94 character set */
2972 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2974 else if (c1
>= 0x2C && c1
<= 0x2F)
2975 { /* designation of DIMENSION2_CHARS96 character set */
2977 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2981 /* We must update these variables now. */
2982 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2983 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2986 case 'n': /* invocation of locking-shift-2 */
2987 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2988 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2990 CODING_ISO_INVOCATION (coding
, 0) = 2;
2991 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2994 case 'o': /* invocation of locking-shift-3 */
2995 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2996 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2998 CODING_ISO_INVOCATION (coding
, 0) = 3;
2999 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3002 case 'N': /* invocation of single-shift-2 */
3003 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3004 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3006 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3008 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3012 case 'O': /* invocation of single-shift-3 */
3013 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3014 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3016 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3018 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3022 case '0': case '2': case '3': case '4': /* start composition */
3023 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3025 DECODE_COMPOSITION_START (c1
);
3028 case '1': /* end composition */
3029 if (composition_state
== COMPOSING_NO
)
3031 DECODE_COMPOSITION_END ();
3034 case '[': /* specification of direction */
3035 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
3037 /* For the moment, nested direction is not supported.
3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3039 left-to-right, and nozero means right-to-left. */
3043 case ']': /* end of the current direction */
3044 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3046 case '0': /* end of the current direction */
3047 case '1': /* start of left-to-right direction */
3050 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3055 case '2': /* start of right-to-left direction */
3058 coding
->mode
|= CODING_MODE_DIRECTION
;
3072 /* CTEXT extended segment:
3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3079 ONE_MORE_BYTE (dim
);
3082 size
= ((M
- 128) * 128) + (L
- 128);
3083 if (charbuf
+ 8 + size
> charbuf_end
)
3085 *charbuf
++ = ISO_CODE_ESC
;
3089 *charbuf
++ = BYTE8_TO_CHAR (M
);
3090 *charbuf
++ = BYTE8_TO_CHAR (L
);
3094 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
3100 ESC % G --UTF-8-BYTES-- ESC % @
3101 We keep these bytes as is for the moment.
3102 They may be decoded by post-read-conversion. */
3105 if (p
+ 6 > charbuf_end
)
3107 *p
++ = ISO_CODE_ESC
;
3110 while (p
< charbuf_end
)
3113 if (c1
== ISO_CODE_ESC
3114 && src
+ 1 < src_end
3118 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3120 if (p
+ 3 > charbuf_end
)
3122 *p
++ = ISO_CODE_ESC
;
3133 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3135 if (c1
>= 0x28 && c1
<= 0x2B)
3136 { /* designation of DIMENSION1_CHARS94 character set */
3138 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3140 else if (c1
>= 0x2C && c1
<= 0x2F)
3141 { /* designation of DIMENSION1_CHARS96 character set */
3143 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3147 /* We must update these variables now. */
3148 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3149 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3154 if (charset
->id
!= charset_ascii
3155 && last_id
!= charset
->id
)
3157 if (last_id
!= charset_ascii
)
3158 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3159 last_id
= charset
->id
;
3160 last_offset
= char_offset
;
3163 /* Now we know CHARSET and 1st position code C1 of a character.
3164 Produce a decoded character while getting 2nd position code
3167 if (CHARSET_DIMENSION (charset
) > 1)
3170 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3171 /* C2 is not in a valid range. */
3173 c1
= (c1
<< 8) | (c2
& 0x7F);
3174 if (CHARSET_DIMENSION (charset
) > 2)
3177 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3178 /* C2 is not in a valid range. */
3180 c1
= (c1
<< 8) | (c2
& 0x7F);
3184 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base
< src
; src_base
++, char_offset
++)
3190 if (ASCII_BYTE_P (*src_base
))
3191 *charbuf
++ = *src_base
;
3193 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3196 else if (composition_state
== COMPOSING_NO
)
3203 components
[component_idx
++] = c
;
3204 if (method
== COMPOSITION_WITH_RULE
3205 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3207 composition_state
++;
3212 MAYBE_FINISH_COMPOSITION ();
3214 consumed_chars
= consumed_chars_base
;
3216 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3226 if (last_id
!= charset_ascii
)
3227 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3228 coding
->consumed_char
+= consumed_chars_base
;
3229 coding
->consumed
= src_base
- coding
->source
;
3230 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3234 /* ISO2022 encoding stuff. */
3237 It is not enough to say just "ISO2022" on encoding, we have to
3238 specify more details. In Emacs, each coding system of ISO2022
3239 variant has the following specifications:
3240 1. Initial designation to G0 thru G3.
3241 2. Allows short-form designation?
3242 3. ASCII should be designated to G0 before control characters?
3243 4. ASCII should be designated to G0 at end of line?
3244 5. 7-bit environment or 8-bit environment?
3245 6. Use locking-shift?
3246 7. Use Single-shift?
3247 And the following two are only for Japanese:
3248 8. Use ASCII in place of JIS0201-1976-Roman?
3249 9. Use JISX0208-1983 in place of JISX0208-1978?
3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
3257 '@', 'A', or 'B' and the coding system CODING allows, produce
3258 designation sequence of short-form. */
3260 #define ENCODE_DESIGNATION(charset, reg, coding) \
3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3263 char *intermediate_char_94 = "()*+"; \
3264 char *intermediate_char_96 = ",-./"; \
3265 int revision = -1; \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3269 revision = CHARSET_ISO_REVISION (charset); \
3271 if (revision >= 0) \
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3274 EMIT_ONE_BYTE ('@' + revision); \
3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3277 if (CHARSET_DIMENSION (charset) == 1) \
3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
3280 c = intermediate_char_94[reg]; \
3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
3287 EMIT_ONE_ASCII_BYTE ('$'); \
3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3292 || final_char < '@' || final_char > 'B') \
3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3298 EMIT_ONE_ASCII_BYTE (final_char); \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3304 /* The following two macros produce codes (control character or escape
3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
3308 #define ENCODE_SINGLE_SHIFT_2 \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3318 #define ENCODE_SINGLE_SHIFT_3 \
3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3328 /* The following four macros produce codes (control character or
3329 escape sequence) for ISO2022 locking-shift functions (shift-in,
3330 shift-out, locking-shift-2, and locking-shift-3). */
3332 #define ENCODE_SHIFT_IN \
3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3335 CODING_ISO_INVOCATION (coding, 0) = 0; \
3339 #define ENCODE_SHIFT_OUT \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3342 CODING_ISO_INVOCATION (coding, 0) = 1; \
3346 #define ENCODE_LOCKING_SHIFT_2 \
3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3349 CODING_ISO_INVOCATION (coding, 0) = 2; \
3353 #define ENCODE_LOCKING_SHIFT_3 \
3355 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3356 CODING_ISO_INVOCATION (coding, 0) = 3; \
3360 /* Produce codes for a DIMENSION1 character whose character set is
3361 CHARSET and whose position-code is C1. Designation and invocation
3362 sequences are also produced in advance if necessary. */
3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3366 int id = CHARSET_ID (charset); \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3369 && id == charset_ascii) \
3371 id = charset_jisx0201_roman; \
3372 charset = CHARSET_FROM_ID (id); \
3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3380 EMIT_ONE_BYTE (c1 | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3391 EMIT_ONE_BYTE (c1 | 0x80); \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3399 dst = encode_invocation_designation (charset, coding, dst, \
3404 /* Produce codes for a DIMENSION2 character whose character set is
3405 CHARSET and whose position-codes are C1 and C2. Designation and
3406 invocation codes are also produced in advance if necessary. */
3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3410 int id = CHARSET_ID (charset); \
3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3413 && id == charset_jisx0208) \
3415 id = charset_jisx0208_1978; \
3416 charset = CHARSET_FROM_ID (id); \
3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3439 /* Since CHARSET is not yet invoked to any graphic planes, we \
3440 must invoke it, or, at first, designate it to some graphic \
3441 register. Then repeat the loop to actually produce the \
3443 dst = encode_invocation_designation (charset, coding, dst, \
3448 #define ENCODE_ISO_CHARACTER(charset, c) \
3450 int code = ENCODE_CHAR ((charset),(c)); \
3452 if (CHARSET_DIMENSION (charset) == 1) \
3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3459 /* Produce designation and invocation codes at a place pointed by DST
3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3464 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3465 struct charset
*charset
;
3466 struct coding_system
*coding
;
3470 int multibytep
= coding
->dst_multibyte
;
3471 int produced_chars
= *p_nchars
;
3472 int reg
; /* graphic register number */
3473 int id
= CHARSET_ID (charset
);
3475 /* At first, check designations. */
3476 for (reg
= 0; reg
< 4; reg
++)
3477 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3482 /* CHARSET is not yet designated to any graphic registers. */
3483 /* At first check the requested designation. */
3484 reg
= CODING_ISO_REQUEST (coding
, id
);
3486 /* Since CHARSET requests no special designation, designate it
3487 to graphic register 0. */
3490 ENCODE_DESIGNATION (charset
, reg
, coding
);
3493 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3494 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3496 /* Since the graphic register REG is not invoked to any graphic
3497 planes, invoke it to graphic plane 0. */
3500 case 0: /* graphic register 0 */
3504 case 1: /* graphic register 1 */
3508 case 2: /* graphic register 2 */
3509 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3510 ENCODE_SINGLE_SHIFT_2
;
3512 ENCODE_LOCKING_SHIFT_2
;
3515 case 3: /* graphic register 3 */
3516 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3517 ENCODE_SINGLE_SHIFT_3
;
3519 ENCODE_LOCKING_SHIFT_3
;
3524 *p_nchars
= produced_chars
;
3528 /* The following three macros produce codes for indicating direction
3530 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3532 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3533 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3535 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3539 #define ENCODE_DIRECTION_R2L() \
3541 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3542 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3546 #define ENCODE_DIRECTION_L2R() \
3548 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3549 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3553 /* Produce codes for designation and invocation to reset the graphic
3554 planes and registers to initial state. */
3555 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3558 struct charset *charset; \
3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3562 for (reg = 0; reg < 4; reg++) \
3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3564 && (CODING_ISO_DESIGNATION (coding, reg) \
3565 != CODING_ISO_INITIAL (coding, reg))) \
3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568 ENCODE_DESIGNATION (charset, reg, coding); \
3573 /* Produce designation sequences of charsets in the line started from
3574 SRC to a place pointed by DST, and return updated DST.
3576 If the current block ends before any end-of-line, we may fail to
3577 find all the necessary designations. */
3579 static unsigned char *
3580 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3581 struct coding_system
*coding
;
3582 int *charbuf
, *charbuf_end
;
3585 struct charset
*charset
;
3586 /* Table of charsets to be designated to each graphic register. */
3588 int c
, found
= 0, reg
;
3589 int produced_chars
= 0;
3590 int multibytep
= coding
->dst_multibyte
;
3592 Lisp_Object charset_list
;
3594 attrs
= CODING_ID_ATTRS (coding
->id
);
3595 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3596 if (EQ (charset_list
, Qiso_2022
))
3597 charset_list
= Viso_2022_charset_list
;
3599 for (reg
= 0; reg
< 4; reg
++)
3609 charset
= char_charset (c
, charset_list
, NULL
);
3610 id
= CHARSET_ID (charset
);
3611 reg
= CODING_ISO_REQUEST (coding
, id
);
3612 if (reg
>= 0 && r
[reg
] < 0)
3621 for (reg
= 0; reg
< 4; reg
++)
3623 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3633 encode_coding_iso_2022 (coding
)
3634 struct coding_system
*coding
;
3636 int multibytep
= coding
->dst_multibyte
;
3637 int *charbuf
= coding
->charbuf
;
3638 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3639 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3640 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3643 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644 && CODING_ISO_BOL (coding
));
3645 int produced_chars
= 0;
3646 Lisp_Object attrs
, eol_type
, charset_list
;
3647 int ascii_compatible
;
3649 int preferred_charset_id
= -1;
3651 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3652 setup_iso_safe_charsets (attrs
);
3653 /* Charset list may have been changed. */
3654 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3655 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3657 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3659 while (charbuf
< charbuf_end
)
3661 ASSURE_DESTINATION (safe_room
);
3663 if (bol_designation
)
3665 unsigned char *dst_prev
= dst
;
3667 /* We have to produce designation sequences if any now. */
3668 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3669 bol_designation
= 0;
3670 /* We are sure that designation sequences are all ASCII bytes. */
3671 produced_chars
+= dst
- dst_prev
;
3678 /* Handle an annotation. */
3681 case CODING_ANNOTATE_COMPOSITION_MASK
:
3682 /* Not yet implemented. */
3684 case CODING_ANNOTATE_CHARSET_MASK
:
3685 preferred_charset_id
= charbuf
[3];
3686 if (preferred_charset_id
>= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id
),
3689 preferred_charset_id
= -1;
3698 /* Now encode the character C. */
3699 if (c
< 0x20 || c
== 0x7F)
3702 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3704 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3710 for (i
= 0; i
< 4; i
++)
3711 CODING_ISO_DESIGNATION (coding
, i
)
3712 = CODING_ISO_INITIAL (coding
, i
);
3715 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3717 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c
);
3721 else if (ASCII_CHAR_P (c
))
3723 if (ascii_compatible
)
3724 EMIT_ONE_ASCII_BYTE (c
);
3727 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3728 ENCODE_ISO_CHARACTER (charset
, c
);
3731 else if (CHAR_BYTE8_P (c
))
3733 c
= CHAR_TO_BYTE8 (c
);
3738 struct charset
*charset
;
3740 if (preferred_charset_id
>= 0)
3742 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3743 if (! CHAR_CHARSET_P (c
, charset
))
3744 charset
= char_charset (c
, charset_list
, NULL
);
3747 charset
= char_charset (c
, charset_list
, NULL
);
3750 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3752 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3753 charset
= CHARSET_FROM_ID (charset_ascii
);
3757 c
= coding
->default_char
;
3758 charset
= char_charset (c
, charset_list
, NULL
);
3761 ENCODE_ISO_CHARACTER (charset
, c
);
3765 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3766 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3768 ASSURE_DESTINATION (safe_room
);
3769 ENCODE_RESET_PLANE_AND_REGISTER ();
3771 coding
->result
= CODING_RESULT_SUCCESS
;
3772 CODING_ISO_BOL (coding
) = bol_designation
;
3773 coding
->produced_char
+= produced_chars
;
3774 coding
->produced
= dst
- coding
->destination
;
3779 /*** 8,9. SJIS and BIG5 handlers ***/
3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3782 quite widely. So, for the moment, Emacs supports them in the bare
3783 C code. But, in the future, they may be supported only by CCL. */
3785 /* SJIS is a coding system encoding three character sets: ASCII, right
3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3787 as is. A character of charset katakana-jisx0201 is encoded by
3788 "position-code + 0x80". A character of charset japanese-jisx0208
3789 is encoded in 2-byte but two position-codes are divided and shifted
3790 so that it fit in the range below.
3792 --- CODE RANGE of SJIS ---
3793 (character set) (range)
3795 KATAKANA-JISX0201 0xA0 .. 0xDF
3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3798 -------------------------------
3802 /* BIG5 is a coding system encoding two character sets: ASCII and
3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3804 character set and is encoded in two-byte.
3806 --- CODE RANGE of BIG5 ---
3807 (character set) (range)
3809 Big5 (1st byte) 0xA1 .. 0xFE
3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3811 --------------------------
3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816 Check if a text is encoded in SJIS. If it is, return
3817 CATEGORY_MASK_SJIS, else return 0. */
3820 detect_coding_sjis (coding
, detect_info
)
3821 struct coding_system
*coding
;
3822 struct coding_detection_info
*detect_info
;
3824 const unsigned char *src
= coding
->source
, *src_base
= src
;
3825 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3826 int multibytep
= coding
->src_multibyte
;
3827 int consumed_chars
= 0;
3832 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3833 /* A coding system of this category is always ASCII compatible. */
3834 src
+= coding
->head_ascii
;
3843 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3846 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3848 found
= CATEGORY_MASK_SJIS
;
3850 else if (c
>= 0xA0 && c
< 0xE0)
3851 found
= CATEGORY_MASK_SJIS
;
3855 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3859 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3861 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3864 detect_info
->found
|= found
;
3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869 Check if a text is encoded in BIG5. If it is, return
3870 CATEGORY_MASK_BIG5, else return 0. */
3873 detect_coding_big5 (coding
, detect_info
)
3874 struct coding_system
*coding
;
3875 struct coding_detection_info
*detect_info
;
3877 const unsigned char *src
= coding
->source
, *src_base
= src
;
3878 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3879 int multibytep
= coding
->src_multibyte
;
3880 int consumed_chars
= 0;
3885 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3886 /* A coding system of this category is always ASCII compatible. */
3887 src
+= coding
->head_ascii
;
3899 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3901 found
= CATEGORY_MASK_BIG5
;
3906 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3910 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3912 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3915 detect_info
->found
|= found
;
3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3923 decode_coding_sjis (coding
)
3924 struct coding_system
*coding
;
3926 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3927 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3928 const unsigned char *src_base
;
3929 int *charbuf
= coding
->charbuf
;
3930 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3931 int consumed_chars
= 0, consumed_chars_base
;
3932 int multibytep
= coding
->src_multibyte
;
3933 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3934 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3935 int char_offset
= coding
->produced_char
;
3936 int last_offset
= char_offset
;
3937 int last_id
= charset_ascii
;
3939 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3942 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3943 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3944 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3951 consumed_chars_base
= consumed_chars
;
3953 if (charbuf
>= charbuf_end
)
3960 if (EQ (eol_type
, Qdos
))
3964 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
3965 goto no_more_source
;
3970 else if (EQ (eol_type
, Qmac
))
3975 struct charset
*charset
;
3978 charset
= charset_roman
;
3983 if (c
< 0xA0 || c
>= 0xE0)
3985 /* SJIS -> JISX0208 */
3987 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3991 charset
= charset_kanji
;
3995 /* SJIS -> JISX0201-Kana */
3997 charset
= charset_kana
;
4002 if (charset
->id
!= charset_ascii
4003 && last_id
!= charset
->id
)
4005 if (last_id
!= charset_ascii
)
4006 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4007 last_id
= charset
->id
;
4008 last_offset
= char_offset
;
4010 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4018 consumed_chars
= consumed_chars_base
;
4020 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4026 if (last_id
!= charset_ascii
)
4027 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4028 coding
->consumed_char
+= consumed_chars_base
;
4029 coding
->consumed
= src_base
- coding
->source
;
4030 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4034 decode_coding_big5 (coding
)
4035 struct coding_system
*coding
;
4037 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4038 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4039 const unsigned char *src_base
;
4040 int *charbuf
= coding
->charbuf
;
4041 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4042 int consumed_chars
= 0, consumed_chars_base
;
4043 int multibytep
= coding
->src_multibyte
;
4044 struct charset
*charset_roman
, *charset_big5
;
4045 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4046 int char_offset
= coding
->produced_char
;
4047 int last_offset
= char_offset
;
4048 int last_id
= charset_ascii
;
4050 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4052 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4053 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4060 consumed_chars_base
= consumed_chars
;
4062 if (charbuf
>= charbuf_end
)
4069 if (EQ (eol_type
, Qdos
))
4073 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4074 goto no_more_source
;
4079 else if (EQ (eol_type
, Qmac
))
4084 struct charset
*charset
;
4086 charset
= charset_roman
;
4090 if (c
< 0xA1 || c
> 0xFE)
4093 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4096 charset
= charset_big5
;
4098 if (charset
->id
!= charset_ascii
4099 && last_id
!= charset
->id
)
4101 if (last_id
!= charset_ascii
)
4102 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4103 last_id
= charset
->id
;
4104 last_offset
= char_offset
;
4106 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4115 consumed_chars
= consumed_chars_base
;
4117 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4123 if (last_id
!= charset_ascii
)
4124 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4125 coding
->consumed_char
+= consumed_chars_base
;
4126 coding
->consumed
= src_base
- coding
->source
;
4127 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4131 This function can encode charsets `ascii', `katakana-jisx0201',
4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4133 are sure that all these charsets are registered as official charset
4134 (i.e. do not have extended leading-codes). Characters of other
4135 charsets are produced without any encoding. If SJIS_P is 1, encode
4136 SJIS text, else encode BIG5 text. */
4139 encode_coding_sjis (coding
)
4140 struct coding_system
*coding
;
4142 int multibytep
= coding
->dst_multibyte
;
4143 int *charbuf
= coding
->charbuf
;
4144 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4145 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4146 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4148 int produced_chars
= 0;
4149 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4150 int ascii_compatible
;
4151 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4154 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4156 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4157 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4158 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4160 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4162 while (charbuf
< charbuf_end
)
4164 ASSURE_DESTINATION (safe_room
);
4166 /* Now encode the character C. */
4167 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4168 EMIT_ONE_ASCII_BYTE (c
);
4169 else if (CHAR_BYTE8_P (c
))
4171 c
= CHAR_TO_BYTE8 (c
);
4177 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4181 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4183 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4184 charset
= CHARSET_FROM_ID (charset_ascii
);
4188 c
= coding
->default_char
;
4189 charset
= char_charset (c
, charset_list
, &code
);
4192 if (code
== CHARSET_INVALID_CODE (charset
))
4194 if (charset
== charset_kanji
)
4198 c1
= code
>> 8, c2
= code
& 0xFF;
4199 EMIT_TWO_BYTES (c1
, c2
);
4201 else if (charset
== charset_kana
)
4202 EMIT_ONE_BYTE (code
| 0x80);
4204 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4207 coding
->result
= CODING_RESULT_SUCCESS
;
4208 coding
->produced_char
+= produced_chars
;
4209 coding
->produced
= dst
- coding
->destination
;
4214 encode_coding_big5 (coding
)
4215 struct coding_system
*coding
;
4217 int multibytep
= coding
->dst_multibyte
;
4218 int *charbuf
= coding
->charbuf
;
4219 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4220 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4221 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4223 int produced_chars
= 0;
4224 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4225 int ascii_compatible
;
4226 struct charset
*charset_roman
, *charset_big5
;
4229 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4231 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4232 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4233 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4235 while (charbuf
< charbuf_end
)
4237 ASSURE_DESTINATION (safe_room
);
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4241 EMIT_ONE_ASCII_BYTE (c
);
4242 else if (CHAR_BYTE8_P (c
))
4244 c
= CHAR_TO_BYTE8 (c
);
4250 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4254 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4256 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4257 charset
= CHARSET_FROM_ID (charset_ascii
);
4261 c
= coding
->default_char
;
4262 charset
= char_charset (c
, charset_list
, &code
);
4265 if (code
== CHARSET_INVALID_CODE (charset
))
4267 if (charset
== charset_big5
)
4271 c1
= code
>> 8, c2
= code
& 0xFF;
4272 EMIT_TWO_BYTES (c1
, c2
);
4275 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4278 coding
->result
= CODING_RESULT_SUCCESS
;
4279 coding
->produced_char
+= produced_chars
;
4280 coding
->produced
= dst
- coding
->destination
;
4285 /*** 10. CCL handlers ***/
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. If it is, return
4290 CATEGORY_MASK_CCL, else return 0. */
4293 detect_coding_ccl (coding
, detect_info
)
4294 struct coding_system
*coding
;
4295 struct coding_detection_info
*detect_info
;
4297 const unsigned char *src
= coding
->source
, *src_base
= src
;
4298 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4299 int multibytep
= coding
->src_multibyte
;
4300 int consumed_chars
= 0;
4302 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4303 int head_ascii
= coding
->head_ascii
;
4306 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4308 coding
= &coding_categories
[coding_category_ccl
];
4309 attrs
= CODING_ID_ATTRS (coding
->id
);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4319 if ((valids
[c
] > 1))
4320 found
= CATEGORY_MASK_CCL
;
4322 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4326 detect_info
->found
|= found
;
4331 decode_coding_ccl (coding
)
4332 struct coding_system
*coding
;
4334 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4335 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4336 int *charbuf
= coding
->charbuf
;
4337 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4338 int consumed_chars
= 0;
4339 int multibytep
= coding
->src_multibyte
;
4340 struct ccl_program ccl
;
4341 int source_charbuf
[1024];
4342 int source_byteidx
[1024];
4343 Lisp_Object attrs
, eol_type
, charset_list
;
4345 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4346 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4348 while (src
< src_end
)
4350 const unsigned char *p
= src
;
4351 int *source
, *source_end
;
4355 while (i
< 1024 && p
< src_end
)
4357 source_byteidx
[i
] = p
- src
;
4358 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4361 while (i
< 1024 && p
< src_end
)
4362 source_charbuf
[i
++] = *p
++;
4364 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4367 source
= source_charbuf
;
4368 source_end
= source
+ i
;
4369 while (source
< source_end
)
4371 ccl_driver (&ccl
, source
, charbuf
,
4372 source_end
- source
, charbuf_end
- charbuf
,
4374 source
+= ccl
.consumed
;
4375 charbuf
+= ccl
.produced
;
4376 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4379 if (source
< source_end
)
4380 src
+= source_byteidx
[source
- source_charbuf
];
4383 consumed_chars
+= source
- source_charbuf
;
4385 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4386 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4392 case CCL_STAT_SUSPEND_BY_SRC
:
4393 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4395 case CCL_STAT_SUSPEND_BY_DST
:
4398 case CCL_STAT_INVALID_CMD
:
4399 coding
->result
= CODING_RESULT_INTERRUPT
;
4402 coding
->result
= CODING_RESULT_SUCCESS
;
4405 coding
->consumed_char
+= consumed_chars
;
4406 coding
->consumed
= src
- coding
->source
;
4407 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4411 encode_coding_ccl (coding
)
4412 struct coding_system
*coding
;
4414 struct ccl_program ccl
;
4415 int multibytep
= coding
->dst_multibyte
;
4416 int *charbuf
= coding
->charbuf
;
4417 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4418 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4419 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4420 unsigned char *adjusted_dst_end
= dst_end
- 1;
4421 int destination_charbuf
[1024];
4422 int i
, produced_chars
= 0;
4423 Lisp_Object attrs
, eol_type
, charset_list
;
4425 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4426 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4428 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4429 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4431 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4433 int dst_bytes
= dst_end
- dst
;
4434 if (dst_bytes
> 1024)
4437 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4438 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4439 charbuf
+= ccl
.consumed
;
4441 for (i
= 0; i
< ccl
.produced
; i
++)
4442 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4445 for (i
= 0; i
< ccl
.produced
; i
++)
4446 *dst
++ = destination_charbuf
[i
] & 0xFF;
4447 produced_chars
+= ccl
.produced
;
4453 case CCL_STAT_SUSPEND_BY_SRC
:
4454 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4456 case CCL_STAT_SUSPEND_BY_DST
:
4457 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4460 case CCL_STAT_INVALID_CMD
:
4461 coding
->result
= CODING_RESULT_INTERRUPT
;
4464 coding
->result
= CODING_RESULT_SUCCESS
;
4468 coding
->produced_char
+= produced_chars
;
4469 coding
->produced
= dst
- coding
->destination
;
4475 /*** 10, 11. no-conversion handlers ***/
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4480 decode_coding_raw_text (coding
)
4481 struct coding_system
*coding
;
4483 coding
->chars_at_source
= 1;
4484 coding
->consumed_char
= 0;
4485 coding
->consumed
= 0;
4486 coding
->result
= CODING_RESULT_SUCCESS
;
4490 encode_coding_raw_text (coding
)
4491 struct coding_system
*coding
;
4493 int multibytep
= coding
->dst_multibyte
;
4494 int *charbuf
= coding
->charbuf
;
4495 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4496 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4497 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4498 int produced_chars
= 0;
4503 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4505 if (coding
->src_multibyte
)
4506 while (charbuf
< charbuf_end
)
4508 ASSURE_DESTINATION (safe_room
);
4510 if (ASCII_CHAR_P (c
))
4511 EMIT_ONE_ASCII_BYTE (c
);
4512 else if (CHAR_BYTE8_P (c
))
4514 c
= CHAR_TO_BYTE8 (c
);
4519 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4521 CHAR_STRING_ADVANCE (c
, p1
);
4524 EMIT_ONE_BYTE (*p0
);
4530 while (charbuf
< charbuf_end
)
4532 ASSURE_DESTINATION (safe_room
);
4539 if (coding
->src_multibyte
)
4541 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4543 while (charbuf
< charbuf_end
)
4545 ASSURE_DESTINATION (safe_room
);
4547 if (ASCII_CHAR_P (c
))
4549 else if (CHAR_BYTE8_P (c
))
4550 *dst
++ = CHAR_TO_BYTE8 (c
);
4552 CHAR_STRING_ADVANCE (c
, dst
);
4558 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4559 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4560 *dst
++ = *charbuf
++;
4561 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4564 coding
->result
= CODING_RESULT_SUCCESS
;
4565 coding
->produced_char
+= produced_chars
;
4566 coding
->produced
= dst
- coding
->destination
;
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0. */
4575 detect_coding_charset (coding
, detect_info
)
4576 struct coding_system
*coding
;
4577 struct coding_detection_info
*detect_info
;
4579 const unsigned char *src
= coding
->source
, *src_base
= src
;
4580 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4581 int multibytep
= coding
->src_multibyte
;
4582 int consumed_chars
= 0;
4583 Lisp_Object attrs
, valids
;
4586 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4588 coding
= &coding_categories
[coding_category_charset
];
4589 attrs
= CODING_ID_ATTRS (coding
->id
);
4590 valids
= AREF (attrs
, coding_attr_charset_valids
);
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4593 src
+= coding
->head_ascii
;
4600 if (NILP (AREF (valids
, c
)))
4603 found
= CATEGORY_MASK_CHARSET
;
4605 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4609 detect_info
->found
|= found
;
4614 decode_coding_charset (coding
)
4615 struct coding_system
*coding
;
4617 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4618 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4619 const unsigned char *src_base
;
4620 int *charbuf
= coding
->charbuf
;
4621 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4622 int consumed_chars
= 0, consumed_chars_base
;
4623 int multibytep
= coding
->src_multibyte
;
4624 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4625 int char_offset
= coding
->produced_char
;
4626 int last_offset
= char_offset
;
4627 int last_id
= charset_ascii
;
4629 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4630 valids
= AREF (attrs
, coding_attr_charset_valids
);
4637 consumed_chars_base
= consumed_chars
;
4639 if (charbuf
>= charbuf_end
)
4645 /* Here we assume that no charset maps '\r' to something
4647 if (EQ (eol_type
, Qdos
))
4651 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4652 goto no_more_source
;
4657 else if (EQ (eol_type
, Qmac
))
4663 struct charset
*charset
;
4668 val
= AREF (valids
, c
);
4673 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4674 dim
= CHARSET_DIMENSION (charset
);
4678 code
= (code
<< 8) | c
;
4681 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4691 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4692 dim
= CHARSET_DIMENSION (charset
);
4696 code
= (code
<< 8) | c
;
4699 CODING_DECODE_CHAR (coding
, src
, src_base
,
4700 src_end
, charset
, code
, c
);
4708 if (charset
->id
!= charset_ascii
4709 && last_id
!= charset
->id
)
4711 if (last_id
!= charset_ascii
)
4712 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4713 last_id
= charset
->id
;
4714 last_offset
= char_offset
;
4723 consumed_chars
= consumed_chars_base
;
4725 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4731 if (last_id
!= charset_ascii
)
4732 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4733 coding
->consumed_char
+= consumed_chars_base
;
4734 coding
->consumed
= src_base
- coding
->source
;
4735 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4739 encode_coding_charset (coding
)
4740 struct coding_system
*coding
;
4742 int multibytep
= coding
->dst_multibyte
;
4743 int *charbuf
= coding
->charbuf
;
4744 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4745 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4746 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4747 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4748 int produced_chars
= 0;
4749 Lisp_Object attrs
, eol_type
, charset_list
;
4750 int ascii_compatible
;
4753 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4754 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4756 while (charbuf
< charbuf_end
)
4758 struct charset
*charset
;
4761 ASSURE_DESTINATION (safe_room
);
4763 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4764 EMIT_ONE_ASCII_BYTE (c
);
4765 else if (CHAR_BYTE8_P (c
))
4767 c
= CHAR_TO_BYTE8 (c
);
4772 charset
= char_charset (c
, charset_list
, &code
);
4775 if (CHARSET_DIMENSION (charset
) == 1)
4776 EMIT_ONE_BYTE (code
);
4777 else if (CHARSET_DIMENSION (charset
) == 2)
4778 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4779 else if (CHARSET_DIMENSION (charset
) == 3)
4780 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4782 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4783 (code
>> 8) & 0xFF, code
& 0xFF);
4787 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4788 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4790 c
= coding
->default_char
;
4796 coding
->result
= CODING_RESULT_SUCCESS
;
4797 coding
->produced_char
+= produced_chars
;
4798 coding
->produced
= dst
- coding
->destination
;
4803 /*** 7. C library functions ***/
4805 /* Setup coding context CODING from information about CODING_SYSTEM.
4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4807 CODING_SYSTEM is invalid, signal an error. */
4810 setup_coding_system (coding_system
, coding
)
4811 Lisp_Object coding_system
;
4812 struct coding_system
*coding
;
4815 Lisp_Object eol_type
;
4816 Lisp_Object coding_type
;
4819 if (NILP (coding_system
))
4820 coding_system
= Qno_conversion
;
4822 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4824 attrs
= CODING_ID_ATTRS (coding
->id
);
4825 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4828 coding
->head_ascii
= -1;
4829 coding
->common_flags
4830 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4831 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4832 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4834 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
4836 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4838 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4839 coding
->max_charset_id
= SCHARS (val
) - 1;
4840 coding
->safe_charsets
= (char *) SDATA (val
);
4841 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4843 coding_type
= CODING_ATTR_TYPE (attrs
);
4844 if (EQ (coding_type
, Qundecided
))
4846 coding
->detector
= NULL
;
4847 coding
->decoder
= decode_coding_raw_text
;
4848 coding
->encoder
= encode_coding_raw_text
;
4849 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4851 else if (EQ (coding_type
, Qiso_2022
))
4854 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding
, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding
, 1)
4860 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i
= 0; i
< 4; i
++)
4863 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding
) = 1;
4868 coding
->detector
= detect_coding_iso_2022
;
4869 coding
->decoder
= decode_coding_iso_2022
;
4870 coding
->encoder
= encode_coding_iso_2022
;
4871 if (flags
& CODING_ISO_FLAG_SAFE
)
4872 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4873 coding
->common_flags
4874 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK
);
4876 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4877 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4878 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4879 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4880 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4882 setup_iso_safe_charsets (attrs
);
4883 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4884 coding
->max_charset_id
= SCHARS (val
) - 1;
4885 coding
->safe_charsets
= (char *) SDATA (val
);
4887 CODING_ISO_FLAGS (coding
) = flags
;
4889 else if (EQ (coding_type
, Qcharset
))
4891 coding
->detector
= detect_coding_charset
;
4892 coding
->decoder
= decode_coding_charset
;
4893 coding
->encoder
= encode_coding_charset
;
4894 coding
->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4897 else if (EQ (coding_type
, Qutf_8
))
4899 coding
->detector
= detect_coding_utf_8
;
4900 coding
->decoder
= decode_coding_utf_8
;
4901 coding
->encoder
= encode_coding_utf_8
;
4902 coding
->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4905 else if (EQ (coding_type
, Qutf_16
))
4907 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4908 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4909 : EQ (val
, Qt
) ? utf_16_with_bom
4910 : utf_16_without_bom
);
4911 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4912 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4913 : utf_16_little_endian
);
4914 CODING_UTF_16_SURROGATE (coding
) = 0;
4915 coding
->detector
= detect_coding_utf_16
;
4916 coding
->decoder
= decode_coding_utf_16
;
4917 coding
->encoder
= encode_coding_utf_16
;
4918 coding
->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4920 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4921 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4923 else if (EQ (coding_type
, Qccl
))
4925 coding
->detector
= detect_coding_ccl
;
4926 coding
->decoder
= decode_coding_ccl
;
4927 coding
->encoder
= encode_coding_ccl
;
4928 coding
->common_flags
4929 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK
);
4932 else if (EQ (coding_type
, Qemacs_mule
))
4934 coding
->detector
= detect_coding_emacs_mule
;
4935 coding
->decoder
= decode_coding_emacs_mule
;
4936 coding
->encoder
= encode_coding_emacs_mule
;
4937 coding
->common_flags
4938 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4939 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4942 Lisp_Object tail
, safe_charsets
;
4943 int max_charset_id
= 0;
4945 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4947 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4948 max_charset_id
= XFASTINT (XCAR (tail
));
4949 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4951 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4953 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
4954 coding
->max_charset_id
= max_charset_id
;
4955 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
4958 else if (EQ (coding_type
, Qshift_jis
))
4960 coding
->detector
= detect_coding_sjis
;
4961 coding
->decoder
= decode_coding_sjis
;
4962 coding
->encoder
= encode_coding_sjis
;
4963 coding
->common_flags
4964 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4966 else if (EQ (coding_type
, Qbig5
))
4968 coding
->detector
= detect_coding_big5
;
4969 coding
->decoder
= decode_coding_big5
;
4970 coding
->encoder
= encode_coding_big5
;
4971 coding
->common_flags
4972 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4974 else /* EQ (coding_type, Qraw_text) */
4976 coding
->detector
= NULL
;
4977 coding
->decoder
= decode_coding_raw_text
;
4978 coding
->encoder
= encode_coding_raw_text
;
4984 /* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM. */
4988 raw_text_coding_system (coding_system
)
4989 Lisp_Object coding_system
;
4991 Lisp_Object spec
, attrs
;
4992 Lisp_Object eol_type
, raw_text_eol_type
;
4994 if (NILP (coding_system
))
4996 spec
= CODING_SYSTEM_SPEC (coding_system
);
4997 attrs
= AREF (spec
, 0);
4999 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5000 return coding_system
;
5002 eol_type
= AREF (spec
, 2);
5003 if (VECTORP (eol_type
))
5005 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5006 raw_text_eol_type
= AREF (spec
, 2);
5007 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5008 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5009 : AREF (raw_text_eol_type
, 2));
5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014 does, return one of the subsidiary that has the same eol-spec as
5015 PARENT. Otherwise, return CODING_SYSTEM. */
5018 coding_inherit_eol_type (coding_system
, parent
)
5019 Lisp_Object coding_system
, parent
;
5021 Lisp_Object spec
, attrs
, eol_type
;
5023 if (NILP (coding_system
))
5024 coding_system
= Qraw_text
;
5025 spec
= CODING_SYSTEM_SPEC (coding_system
);
5026 attrs
= AREF (spec
, 0);
5027 eol_type
= AREF (spec
, 2);
5028 if (VECTORP (eol_type
)
5031 Lisp_Object parent_spec
;
5032 Lisp_Object parent_eol_type
;
5035 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5036 parent_eol_type
= AREF (parent_spec
, 2);
5037 if (EQ (parent_eol_type
, Qunix
))
5038 coding_system
= AREF (eol_type
, 0);
5039 else if (EQ (parent_eol_type
, Qdos
))
5040 coding_system
= AREF (eol_type
, 1);
5041 else if (EQ (parent_eol_type
, Qmac
))
5042 coding_system
= AREF (eol_type
, 2);
5044 return coding_system
;
5047 /* Emacs has a mechanism to automatically detect a coding system if it
5048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5049 it's impossible to distinguish some coding systems accurately
5050 because they use the same range of codes. So, at first, coding
5051 systems are categorized into 7, those are:
5053 o coding-category-emacs-mule
5055 The category for a coding system which has the same code range
5056 as Emacs' internal format. Assigned the coding-system (Lisp
5057 symbol) `emacs-mule' by default.
5059 o coding-category-sjis
5061 The category for a coding system which has the same code range
5062 as SJIS. Assigned the coding-system (Lisp
5063 symbol) `japanese-shift-jis' by default.
5065 o coding-category-iso-7
5067 The category for a coding system which has the same code range
5068 as ISO2022 of 7-bit environment. This doesn't use any locking
5069 shift and single shift functions. This can encode/decode all
5070 charsets. Assigned the coding-system (Lisp symbol)
5071 `iso-2022-7bit' by default.
5073 o coding-category-iso-7-tight
5075 Same as coding-category-iso-7 except that this can
5076 encode/decode only the specified charsets.
5078 o coding-category-iso-8-1
5080 The category for a coding system which has the same code range
5081 as ISO2022 of 8-bit environment and graphic plane 1 used only
5082 for DIMENSION1 charset. This doesn't use any locking shift
5083 and single shift functions. Assigned the coding-system (Lisp
5084 symbol) `iso-latin-1' by default.
5086 o coding-category-iso-8-2
5088 The category for a coding system which has the same code range
5089 as ISO2022 of 8-bit environment and graphic plane 1 used only
5090 for DIMENSION2 charset. This doesn't use any locking shift
5091 and single shift functions. Assigned the coding-system (Lisp
5092 symbol) `japanese-iso-8bit' by default.
5094 o coding-category-iso-7-else
5096 The category for a coding system which has the same code range
5097 as ISO2022 of 7-bit environemnt but uses locking shift or
5098 single shift functions. Assigned the coding-system (Lisp
5099 symbol) `iso-2022-7bit-lock' by default.
5101 o coding-category-iso-8-else
5103 The category for a coding system which has the same code range
5104 as ISO2022 of 8-bit environemnt but uses locking shift or
5105 single shift functions. Assigned the coding-system (Lisp
5106 symbol) `iso-2022-8bit-ss2' by default.
5108 o coding-category-big5
5110 The category for a coding system which has the same code range
5111 as BIG5. Assigned the coding-system (Lisp symbol)
5112 `cn-big5' by default.
5114 o coding-category-utf-8
5116 The category for a coding system which has the same code range
5117 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5118 symbol) `utf-8' by default.
5120 o coding-category-utf-16-be
5122 The category for a coding system in which a text has an
5123 Unicode signature (cf. Unicode Standard) in the order of BIG
5124 endian at the head. Assigned the coding-system (Lisp symbol)
5125 `utf-16-be' by default.
5127 o coding-category-utf-16-le
5129 The category for a coding system in which a text has an
5130 Unicode signature (cf. Unicode Standard) in the order of
5131 LITTLE endian at the head. Assigned the coding-system (Lisp
5132 symbol) `utf-16-le' by default.
5134 o coding-category-ccl
5136 The category for a coding system of which encoder/decoder is
5137 written in CCL programs. The default value is nil, i.e., no
5138 coding system is assigned.
5140 o coding-category-binary
5142 The category for a coding system not categorized in any of the
5143 above. Assigned the coding-system (Lisp symbol)
5144 `no-conversion' by default.
5146 Each of them is a Lisp symbol and the value is an actual
5147 `coding-system's (this is also a Lisp symbol) assigned by a user.
5148 What Emacs does actually is to detect a category of coding system.
5149 Then, it uses a `coding-system' assigned to it. If Emacs can't
5150 decide only one possible category, it selects a category of the
5151 highest priority. Priorities of categories are also specified by a
5152 user in a Lisp variable `coding-category-list'.
5156 #define EOL_SEEN_NONE 0
5157 #define EOL_SEEN_LF 1
5158 #define EOL_SEEN_CR 2
5159 #define EOL_SEEN_CRLF 4
5161 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5162 SOURCE is encoded. If CATEGORY is one of
5163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5164 two-byte, else they are encoded by one-byte.
5166 Return one of EOL_SEEN_XXX. */
5168 #define MAX_EOL_CHECK_COUNT 3
5171 detect_eol (source
, src_bytes
, category
)
5172 unsigned char *source
;
5173 EMACS_INT src_bytes
;
5174 enum coding_category category
;
5176 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5179 int eol_seen
= EOL_SEEN_NONE
;
5181 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5185 msb
= category
== (coding_category_utf_16_le
5186 | coding_category_utf_16_le_nosig
);
5189 while (src
+ 1 < src_end
)
5192 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5197 this_eol
= EOL_SEEN_LF
;
5198 else if (src
+ 3 >= src_end
5199 || src
[msb
+ 2] != 0
5200 || src
[lsb
+ 2] != '\n')
5201 this_eol
= EOL_SEEN_CR
;
5203 this_eol
= EOL_SEEN_CRLF
;
5205 if (eol_seen
== EOL_SEEN_NONE
)
5206 /* This is the first end-of-line. */
5207 eol_seen
= this_eol
;
5208 else if (eol_seen
!= this_eol
)
5210 /* The found type is different from what found before. */
5211 eol_seen
= EOL_SEEN_LF
;
5214 if (++total
== MAX_EOL_CHECK_COUNT
)
5222 while (src
< src_end
)
5225 if (c
== '\n' || c
== '\r')
5230 this_eol
= EOL_SEEN_LF
;
5231 else if (src
>= src_end
|| *src
!= '\n')
5232 this_eol
= EOL_SEEN_CR
;
5234 this_eol
= EOL_SEEN_CRLF
, src
++;
5236 if (eol_seen
== EOL_SEEN_NONE
)
5237 /* This is the first end-of-line. */
5238 eol_seen
= this_eol
;
5239 else if (eol_seen
!= this_eol
)
5241 /* The found type is different from what found before. */
5242 eol_seen
= EOL_SEEN_LF
;
5245 if (++total
== MAX_EOL_CHECK_COUNT
)
5255 adjust_coding_eol_type (coding
, eol_seen
)
5256 struct coding_system
*coding
;
5259 Lisp_Object eol_type
;
5261 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5262 if (eol_seen
& EOL_SEEN_LF
)
5263 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5264 else if (eol_seen
& EOL_SEEN_CRLF
)
5265 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5266 else if (eol_seen
& EOL_SEEN_CR
)
5267 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5270 /* Detect how a text specified in CODING is encoded. If a coding
5271 system is detected, update fields of CODING by the detected coding
5275 detect_coding (coding
)
5276 struct coding_system
*coding
;
5278 const unsigned char *src
, *src_end
;
5279 Lisp_Object attrs
, coding_type
;
5281 coding
->consumed
= coding
->consumed_char
= 0;
5282 coding
->produced
= coding
->produced_char
= 0;
5283 coding_set_source (coding
);
5285 src_end
= coding
->source
+ coding
->src_bytes
;
5287 /* If we have not yet decided the text encoding type, detect it
5289 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5293 for (src
= coding
->source
; src
< src_end
; src
++)
5296 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5298 || c
== ISO_CODE_SO
)))
5301 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5303 if (coding
->head_ascii
< coding
->src_bytes
)
5305 struct coding_detection_info detect_info
;
5306 enum coding_category category
;
5307 struct coding_system
*this;
5309 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5310 for (i
= 0; i
< coding_category_raw_text
; i
++)
5312 category
= coding_priorities
[i
];
5313 this = coding_categories
+ category
;
5316 /* No coding system of this category is defined. */
5317 detect_info
.rejected
|= (1 << category
);
5319 else if (category
>= coding_category_raw_text
)
5321 else if (detect_info
.checked
& (1 << category
))
5323 if (detect_info
.found
& (1 << category
))
5326 else if ((*(this->detector
)) (coding
, &detect_info
)
5327 && detect_info
.found
& (1 << category
))
5330 if (i
< coding_category_raw_text
)
5331 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5332 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5333 setup_coding_system (Qraw_text
, coding
);
5334 else if (detect_info
.rejected
)
5335 for (i
= 0; i
< coding_category_raw_text
; i
++)
5336 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5338 this = coding_categories
+ coding_priorities
[i
];
5339 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5344 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qutf_16
))
5346 Lisp_Object coding_systems
;
5347 struct coding_detection_info detect_info
;
5350 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5351 detect_info
.found
= detect_info
.rejected
= 0;
5352 if (CONSP (coding_systems
)
5353 && detect_coding_utf_16 (coding
, &detect_info
)
5354 && (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
5355 | CATEGORY_MASK_UTF_16_BE
)))
5357 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5358 setup_coding_system (XCAR (coding_systems
), coding
);
5360 setup_coding_system (XCDR (coding_systems
), coding
);
5364 attrs
= CODING_ID_ATTRS (coding
->id
);
5365 coding_type
= CODING_ATTR_TYPE (attrs
);
5367 /* If we have not yet decided the EOL type, detect it now. But, the
5368 detection is impossible for a CCL based coding system, in which
5369 case, we detct the EOL type after decoding. */
5370 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5371 && ! EQ (coding_type
, Qccl
))
5373 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5374 XINT (CODING_ATTR_CATEGORY (attrs
)));
5376 if (eol_seen
!= EOL_SEEN_NONE
)
5377 adjust_coding_eol_type (coding
, eol_seen
);
5384 struct coding_system
*coding
;
5386 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5388 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5389 unsigned char *pend
= p
+ coding
->produced
;
5390 int eol_seen
= EOL_SEEN_NONE
;
5392 for (; p
< pend
; p
++)
5395 eol_seen
|= EOL_SEEN_LF
;
5396 else if (*p
== '\r')
5398 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5400 eol_seen
|= EOL_SEEN_CRLF
;
5404 eol_seen
|= EOL_SEEN_CR
;
5407 if (eol_seen
!= EOL_SEEN_NONE
)
5408 adjust_coding_eol_type (coding
, eol_seen
);
5411 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5413 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5414 unsigned char *pend
= p
+ coding
->produced
;
5416 for (; p
< pend
; p
++)
5420 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5422 unsigned char *p
, *pbeg
, *pend
;
5423 Lisp_Object undo_list
;
5425 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5426 coding
->dst_pos_byte
+ coding
->produced
);
5427 undo_list
= current_buffer
->undo_list
;
5428 current_buffer
->undo_list
= Qt
;
5429 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5430 current_buffer
->undo_list
= undo_list
;
5432 pend
= pbeg
+ coding
->produced
;
5434 for (p
= pend
- 1; p
>= pbeg
; p
--)
5437 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5440 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5441 coding
->produced
= pend
- pbeg
;
5442 insert_from_gap (coding
->produced_char
, coding
->produced
);
5447 translate_chars (coding
, table
)
5448 struct coding_system
*coding
;
5451 int *charbuf
= coding
->charbuf
;
5452 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5455 if (coding
->chars_at_source
)
5458 while (charbuf
< charbuf_end
)
5464 *charbuf
++ = translate_char (table
, c
);
5469 produce_chars (coding
)
5470 struct coding_system
*coding
;
5472 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5473 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5475 int produced_chars
= 0;
5477 if (! coding
->chars_at_source
)
5479 /* Characters are in coding->charbuf. */
5480 int *buf
= coding
->charbuf
;
5481 int *buf_end
= buf
+ coding
->charbuf_used
;
5482 unsigned char *adjusted_dst_end
;
5484 if (BUFFERP (coding
->src_object
)
5485 && EQ (coding
->src_object
, coding
->dst_object
))
5486 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
5487 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5489 while (buf
< buf_end
)
5493 if (dst
>= adjusted_dst_end
)
5495 dst
= alloc_destination (coding
,
5496 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5498 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5499 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5503 if (coding
->dst_multibyte
5504 || ! CHAR_BYTE8_P (c
))
5505 CHAR_STRING_ADVANCE (c
, dst
);
5507 *dst
++ = CHAR_TO_BYTE8 (c
);
5511 /* This is an annotation datum. (-C) is the length of
5518 const unsigned char *src
= coding
->source
;
5519 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5520 Lisp_Object eol_type
;
5522 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5524 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5526 if (coding
->src_multibyte
)
5533 const unsigned char *src_base
= src
;
5539 if (EQ (eol_type
, Qdos
))
5543 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5544 goto no_more_source
;
5549 else if (EQ (eol_type
, Qmac
))
5554 coding
->consumed
= src
- coding
->source
;
5556 if (EQ (coding
->src_object
, coding
->dst_object
))
5557 dst_end
= (unsigned char *) src
;
5560 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5562 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5563 coding_set_source (coding
);
5564 src
= coding
->source
+ coding
->consumed
;
5565 src_end
= coding
->source
+ coding
->src_bytes
;
5575 while (src
< src_end
)
5582 if (EQ (eol_type
, Qdos
))
5588 else if (EQ (eol_type
, Qmac
))
5591 if (dst
>= dst_end
- 1)
5593 coding
->consumed
= src
- coding
->source
;
5595 if (EQ (coding
->src_object
, coding
->dst_object
))
5596 dst_end
= (unsigned char *) src
;
5597 if (dst
>= dst_end
- 1)
5599 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5601 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5602 coding_set_source (coding
);
5603 src
= coding
->source
+ coding
->consumed
;
5604 src_end
= coding
->source
+ coding
->src_bytes
;
5612 if (!EQ (coding
->src_object
, coding
->dst_object
))
5614 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5618 EMACS_INT offset
= src
- coding
->source
;
5620 dst
= alloc_destination (coding
, require
, dst
);
5621 coding_set_source (coding
);
5622 src
= coding
->source
+ offset
;
5623 src_end
= coding
->source
+ coding
->src_bytes
;
5626 produced_chars
= coding
->src_chars
;
5627 while (src
< src_end
)
5633 if (EQ (eol_type
, Qdos
))
5640 else if (EQ (eol_type
, Qmac
))
5646 coding
->consumed
= coding
->src_bytes
;
5647 coding
->consumed_char
= coding
->src_chars
;
5650 produced
= dst
- (coding
->destination
+ coding
->produced
);
5651 if (BUFFERP (coding
->dst_object
))
5652 insert_from_gap (produced_chars
, produced
);
5653 coding
->produced
+= produced
;
5654 coding
->produced_char
+= produced_chars
;
5655 return produced_chars
;
5658 /* Compose text in CODING->object according to the annotation data at
5659 CHARBUF. CHARBUF is an array:
5660 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5664 produce_composition (coding
, charbuf
)
5665 struct coding_system
*coding
;
5670 enum composition_method method
;
5671 Lisp_Object components
;
5674 from
= coding
->dst_pos
+ charbuf
[2];
5675 to
= coding
->dst_pos
+ charbuf
[3];
5676 method
= (enum composition_method
) (charbuf
[4]);
5678 if (method
== COMPOSITION_RELATIVE
)
5682 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5687 for (i
= 0; i
< len
; i
++)
5688 args
[i
] = make_number (charbuf
[i
]);
5689 components
= (method
== COMPOSITION_WITH_ALTCHARS
5690 ? Fstring (len
, args
) : Fvector (len
, args
));
5692 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5696 /* Put `charset' property on text in CODING->object according to
5697 the annotation data at CHARBUF. CHARBUF is an array:
5698 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5702 produce_charset (coding
, charbuf
)
5703 struct coding_system
*coding
;
5706 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5707 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5708 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5710 Fput_text_property (make_number (from
), make_number (to
),
5711 Qcharset
, CHARSET_NAME (charset
),
5712 coding
->dst_object
);
5716 #define CHARBUF_SIZE 0x4000
5718 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5720 int size = CHARBUF_SIZE;; \
5722 coding->charbuf = NULL; \
5723 while (size > 1024) \
5725 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5726 if (coding->charbuf) \
5730 if (! coding->charbuf) \
5732 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5733 return coding->result; \
5735 coding->charbuf_size = size; \
5740 produce_annotation (coding
)
5741 struct coding_system
*coding
;
5743 int *charbuf
= coding
->charbuf
;
5744 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5746 if (NILP (coding
->dst_object
))
5749 while (charbuf
< charbuf_end
)
5755 int len
= -*charbuf
;
5758 case CODING_ANNOTATE_COMPOSITION_MASK
:
5759 produce_composition (coding
, charbuf
);
5761 case CODING_ANNOTATE_CHARSET_MASK
:
5762 produce_charset (coding
, charbuf
);
5772 /* Decode the data at CODING->src_object into CODING->dst_object.
5773 CODING->src_object is a buffer, a string, or nil.
5774 CODING->dst_object is a buffer.
5776 If CODING->src_object is a buffer, it must be the current buffer.
5777 In this case, if CODING->src_pos is positive, it is a position of
5778 the source text in the buffer, otherwise, the source text is in the
5779 gap area of the buffer, and CODING->src_pos specifies the offset of
5780 the text from GPT (which must be the same as PT). If this is the
5781 same buffer as CODING->dst_object, CODING->src_pos must be
5784 If CODING->src_object is a string, CODING->src_pos in an index to
5787 If CODING->src_object is nil, CODING->source must already point to
5788 the non-relocatable memory area. In this case, CODING->src_pos is
5789 an offset from CODING->source.
5791 The decoded data is inserted at the current point of the buffer
5796 decode_coding (coding
)
5797 struct coding_system
*coding
;
5801 if (BUFFERP (coding
->src_object
)
5802 && coding
->src_pos
> 0
5803 && coding
->src_pos
< GPT
5804 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5805 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5807 if (BUFFERP (coding
->dst_object
))
5809 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5810 set_buffer_internal (XBUFFER (coding
->dst_object
));
5812 move_gap_both (PT
, PT_BYTE
);
5815 coding
->consumed
= coding
->consumed_char
= 0;
5816 coding
->produced
= coding
->produced_char
= 0;
5817 coding
->chars_at_source
= 0;
5818 coding
->result
= CODING_RESULT_SUCCESS
;
5821 ALLOC_CONVERSION_WORK_AREA (coding
);
5823 attrs
= CODING_ID_ATTRS (coding
->id
);
5827 coding_set_source (coding
);
5828 coding
->annotated
= 0;
5829 (*(coding
->decoder
)) (coding
);
5830 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5831 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5832 else if (!NILP (Vstandard_translation_table_for_decode
))
5833 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5834 coding_set_destination (coding
);
5835 produce_chars (coding
);
5836 if (coding
->annotated
)
5837 produce_annotation (coding
);
5839 while (coding
->consumed
< coding
->src_bytes
5840 && ! coding
->result
);
5842 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5843 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5844 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5845 decode_eol (coding
);
5847 coding
->carryover_bytes
= 0;
5848 if (coding
->consumed
< coding
->src_bytes
)
5850 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5851 const unsigned char *src
;
5853 coding_set_source (coding
);
5854 coding_set_destination (coding
);
5855 src
= coding
->source
+ coding
->consumed
;
5857 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5859 /* Flush out unprocessed data as binary chars. We are sure
5860 that the number of data is less than the size of
5862 while (nbytes
-- > 0)
5866 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5868 produce_chars (coding
);
5872 /* Record unprocessed bytes in coding->carryover. We are
5873 sure that the number of data is less than the size of
5874 coding->carryover. */
5875 unsigned char *p
= coding
->carryover
;
5877 coding
->carryover_bytes
= nbytes
;
5878 while (nbytes
-- > 0)
5881 coding
->consumed
= coding
->src_bytes
;
5884 return coding
->result
;
5888 /* Extract an annotation datum from a composition starting at POS and
5889 ending before LIMIT of CODING->src_object (buffer or string), store
5890 the data in BUF, set *STOP to a starting position of the next
5891 composition (if any) or to LIMIT, and return the address of the
5892 next element of BUF.
5894 If such an annotation is not found, set *STOP to a starting
5895 position of a composition after POS (if any) or to LIMIT, and
5899 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5900 EMACS_INT pos
, limit
;
5901 struct coding_system
*coding
;
5905 EMACS_INT start
, end
;
5908 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5911 else if (start
> pos
)
5917 /* We found a composition. Store the corresponding
5918 annotation data in BUF. */
5920 enum composition_method method
= COMPOSITION_METHOD (prop
);
5921 int nchars
= COMPOSITION_LENGTH (prop
);
5923 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5924 if (method
!= COMPOSITION_RELATIVE
)
5926 Lisp_Object components
;
5929 components
= COMPOSITION_COMPONENTS (prop
);
5930 if (VECTORP (components
))
5932 len
= XVECTOR (components
)->size
;
5933 for (i
= 0; i
< len
; i
++)
5934 *buf
++ = XINT (AREF (components
, i
));
5936 else if (STRINGP (components
))
5938 len
= SCHARS (components
);
5942 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5946 else if (INTEGERP (components
))
5949 *buf
++ = XINT (components
);
5951 else if (CONSP (components
))
5953 for (len
= 0; CONSP (components
);
5954 len
++, components
= XCDR (components
))
5955 *buf
++ = XINT (XCAR (components
));
5963 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5974 /* Extract an annotation datum from a text property `charset' at POS of
5975 CODING->src_object (buffer of string), store the data in BUF, set
5976 *STOP to the position where the value of `charset' property changes
5977 (limiting by LIMIT), and return the address of the next element of
5980 If the property value is nil, set *STOP to the position where the
5981 property value is non-nil (limiting by LIMIT), and return BUF. */
5984 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5985 EMACS_INT pos
, limit
;
5986 struct coding_system
*coding
;
5990 Lisp_Object val
, next
;
5993 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
5994 if (! NILP (val
) && CHARSETP (val
))
5995 id
= XINT (CHARSET_SYMBOL_ID (val
));
5998 ADD_CHARSET_DATA (buf
, 0, 0, id
);
5999 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
6001 make_number (limit
));
6002 *stop
= XINT (next
);
6008 consume_chars (coding
)
6009 struct coding_system
*coding
;
6011 int *buf
= coding
->charbuf
;
6012 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6013 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6014 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6015 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6016 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6017 int multibytep
= coding
->src_multibyte
;
6018 Lisp_Object eol_type
;
6020 EMACS_INT stop
, stop_composition
, stop_charset
;
6022 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6023 if (VECTORP (eol_type
))
6026 /* Note: composition handling is not yet implemented. */
6027 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6029 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6030 stop
= stop_composition
= pos
;
6032 stop
= stop_composition
= end_pos
;
6033 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6034 stop
= stop_charset
= pos
;
6036 stop_charset
= end_pos
;
6038 /* Compensate for CRLF and annotation. */
6039 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6040 while (buf
< buf_end
)
6046 if (pos
== stop_composition
)
6047 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6048 buf
, &stop_composition
);
6049 if (pos
== stop_charset
)
6050 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6051 buf
, &stop_charset
);
6052 stop
= (stop_composition
< stop_charset
6053 ? stop_composition
: stop_charset
);
6060 if (! CODING_FOR_UNIBYTE (coding
)
6061 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6062 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6067 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6068 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6070 if (! EQ (eol_type
, Qunix
))
6074 if (EQ (eol_type
, Qdos
))
6083 coding
->consumed
= src
- coding
->source
;
6084 coding
->consumed_char
= pos
- coding
->src_pos
;
6085 coding
->charbuf_used
= buf
- coding
->charbuf
;
6086 coding
->chars_at_source
= 0;
6090 /* Encode the text at CODING->src_object into CODING->dst_object.
6091 CODING->src_object is a buffer or a string.
6092 CODING->dst_object is a buffer or nil.
6094 If CODING->src_object is a buffer, it must be the current buffer.
6095 In this case, if CODING->src_pos is positive, it is a position of
6096 the source text in the buffer, otherwise. the source text is in the
6097 gap area of the buffer, and coding->src_pos specifies the offset of
6098 the text from GPT (which must be the same as PT). If this is the
6099 same buffer as CODING->dst_object, CODING->src_pos must be
6100 negative and CODING should not have `pre-write-conversion'.
6102 If CODING->src_object is a string, CODING should not have
6103 `pre-write-conversion'.
6105 If CODING->dst_object is a buffer, the encoded data is inserted at
6106 the current point of that buffer.
6108 If CODING->dst_object is nil, the encoded data is placed at the
6109 memory area specified by CODING->destination. */
6112 encode_coding (coding
)
6113 struct coding_system
*coding
;
6117 attrs
= CODING_ID_ATTRS (coding
->id
);
6119 if (BUFFERP (coding
->dst_object
))
6121 set_buffer_internal (XBUFFER (coding
->dst_object
));
6122 coding
->dst_multibyte
6123 = ! NILP (current_buffer
->enable_multibyte_characters
);
6126 coding
->consumed
= coding
->consumed_char
= 0;
6127 coding
->produced
= coding
->produced_char
= 0;
6128 coding
->result
= CODING_RESULT_SUCCESS
;
6131 ALLOC_CONVERSION_WORK_AREA (coding
);
6134 coding_set_source (coding
);
6135 consume_chars (coding
);
6137 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6138 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6139 else if (!NILP (Vstandard_translation_table_for_encode
))
6140 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6142 coding_set_destination (coding
);
6143 (*(coding
->encoder
)) (coding
);
6144 } while (coding
->consumed_char
< coding
->src_chars
);
6146 if (BUFFERP (coding
->dst_object
))
6147 insert_from_gap (coding
->produced_char
, coding
->produced
);
6149 return (coding
->result
);
6153 /* Stack of working buffers used in code conversion. An nil element
6154 means that the code conversion of that level is not using a working
6156 Lisp_Object Vcode_conversion_work_buf_list
;
6158 /* A working buffer used by the top level conversion. */
6159 Lisp_Object Vcode_conversion_reused_work_buf
;
6162 /* Return a working buffer that can be freely used by the following
6163 code conversion. MULTIBYTEP specifies the multibyteness of the
6167 make_conversion_work_buffer (multibytep
, depth
)
6168 int multibytep
, depth
;
6170 struct buffer
*current
= current_buffer
;
6171 Lisp_Object buf
, name
;
6175 if (NILP (Vcode_conversion_reused_work_buf
))
6176 Vcode_conversion_reused_work_buf
6177 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6178 buf
= Vcode_conversion_reused_work_buf
;
6184 name
= build_string (" *code-converting-work*");
6185 name
= Fgenerate_new_buffer_name (name
, Qnil
);
6191 sprintf (str
, " *code-converting-work*<%d>", depth
);
6192 name
= build_string (str
);
6194 buf
= Fget_buffer_create (name
);
6196 set_buffer_internal (XBUFFER (buf
));
6197 current_buffer
->undo_list
= Qt
;
6199 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
6200 set_buffer_internal (current
);
6205 code_conversion_restore (buffer
)
6208 Lisp_Object workbuf
;
6210 workbuf
= XCAR (Vcode_conversion_work_buf_list
);
6211 if (! NILP (workbuf
)
6212 && ! EQ (workbuf
, Vcode_conversion_reused_work_buf
)
6213 && ! NILP (Fbuffer_live_p (workbuf
)))
6214 Fkill_buffer (workbuf
);
6215 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
6216 set_buffer_internal (XBUFFER (buffer
));
6221 code_conversion_save (buffer
, with_work_buf
, multibyte
)
6223 int with_work_buf
, multibyte
;
6225 Lisp_Object workbuf
;
6229 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
6231 workbuf
= make_conversion_work_buffer (multibyte
, depth
);
6235 Vcode_conversion_work_buf_list
6236 = Fcons (workbuf
, Vcode_conversion_work_buf_list
);
6237 record_unwind_protect (code_conversion_restore
, buffer
);
6242 decode_coding_gap (coding
, chars
, bytes
)
6243 struct coding_system
*coding
;
6244 EMACS_INT chars
, bytes
;
6246 int count
= specpdl_ptr
- specpdl
;
6250 buffer
= Fcurrent_buffer ();
6251 code_conversion_save (buffer
, 0, 0);
6253 coding
->src_object
= buffer
;
6254 coding
->src_chars
= chars
;
6255 coding
->src_bytes
= bytes
;
6256 coding
->src_pos
= -chars
;
6257 coding
->src_pos_byte
= -bytes
;
6258 coding
->src_multibyte
= chars
< bytes
;
6259 coding
->dst_object
= buffer
;
6260 coding
->dst_pos
= PT
;
6261 coding
->dst_pos_byte
= PT_BYTE
;
6262 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6263 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6265 if (CODING_REQUIRE_DETECTION (coding
))
6266 detect_coding (coding
);
6268 decode_coding (coding
);
6270 attrs
= CODING_ID_ATTRS (coding
->id
);
6271 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6273 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6276 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6277 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6278 make_number (coding
->produced_char
));
6280 coding
->produced_char
+= Z
- prev_Z
;
6281 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6284 unbind_to (count
, Qnil
);
6285 return coding
->result
;
6289 encode_coding_gap (coding
, chars
, bytes
)
6290 struct coding_system
*coding
;
6291 EMACS_INT chars
, bytes
;
6293 int count
= specpdl_ptr
- specpdl
;
6296 buffer
= Fcurrent_buffer ();
6297 code_conversion_save (buffer
, 0, 0);
6299 coding
->src_object
= buffer
;
6300 coding
->src_chars
= chars
;
6301 coding
->src_bytes
= bytes
;
6302 coding
->src_pos
= -chars
;
6303 coding
->src_pos_byte
= -bytes
;
6304 coding
->src_multibyte
= chars
< bytes
;
6305 coding
->dst_object
= coding
->src_object
;
6306 coding
->dst_pos
= PT
;
6307 coding
->dst_pos_byte
= PT_BYTE
;
6309 encode_coding (coding
);
6311 unbind_to (count
, Qnil
);
6312 return coding
->result
;
6316 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6317 SRC_OBJECT into DST_OBJECT by coding context CODING.
6319 SRC_OBJECT is a buffer, a string, or Qnil.
6321 If it is a buffer, the text is at point of the buffer. FROM and TO
6322 are positions in the buffer.
6324 If it is a string, the text is at the beginning of the string.
6325 FROM and TO are indices to the string.
6327 If it is nil, the text is at coding->source. FROM and TO are
6328 indices to coding->source.
6330 DST_OBJECT is a buffer, Qt, or Qnil.
6332 If it is a buffer, the decoded text is inserted at point of the
6333 buffer. If the buffer is the same as SRC_OBJECT, the source text
6336 If it is Qt, a string is made from the decoded text, and
6337 set in CODING->dst_object.
6339 If it is Qnil, the decoded text is stored at CODING->destination.
6340 The caller must allocate CODING->dst_bytes bytes at
6341 CODING->destination by xmalloc. If the decoded text is longer than
6342 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6346 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6348 struct coding_system
*coding
;
6349 Lisp_Object src_object
;
6350 EMACS_INT from
, from_byte
, to
, to_byte
;
6351 Lisp_Object dst_object
;
6353 int count
= specpdl_ptr
- specpdl
;
6354 unsigned char *destination
;
6355 EMACS_INT dst_bytes
;
6356 EMACS_INT chars
= to
- from
;
6357 EMACS_INT bytes
= to_byte
- from_byte
;
6360 int saved_pt
= -1, saved_pt_byte
;
6362 buffer
= Fcurrent_buffer ();
6364 if (NILP (dst_object
))
6366 destination
= coding
->destination
;
6367 dst_bytes
= coding
->dst_bytes
;
6370 coding
->src_object
= src_object
;
6371 coding
->src_chars
= chars
;
6372 coding
->src_bytes
= bytes
;
6373 coding
->src_multibyte
= chars
< bytes
;
6375 if (STRINGP (src_object
))
6377 coding
->src_pos
= from
;
6378 coding
->src_pos_byte
= from_byte
;
6380 else if (BUFFERP (src_object
))
6382 set_buffer_internal (XBUFFER (src_object
));
6384 move_gap_both (from
, from_byte
);
6385 if (EQ (src_object
, dst_object
))
6387 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6388 TEMP_SET_PT_BOTH (from
, from_byte
);
6389 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6390 coding
->src_pos
= -chars
;
6391 coding
->src_pos_byte
= -bytes
;
6395 coding
->src_pos
= from
;
6396 coding
->src_pos_byte
= from_byte
;
6400 if (CODING_REQUIRE_DETECTION (coding
))
6401 detect_coding (coding
);
6402 attrs
= CODING_ID_ATTRS (coding
->id
);
6404 if (EQ (dst_object
, Qt
)
6405 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6406 && NILP (dst_object
)))
6408 coding
->dst_object
= code_conversion_save (buffer
, 1, 1);
6409 coding
->dst_pos
= BEG
;
6410 coding
->dst_pos_byte
= BEG_BYTE
;
6411 coding
->dst_multibyte
= 1;
6413 else if (BUFFERP (dst_object
))
6415 code_conversion_save (buffer
, 0, 0);
6416 coding
->dst_object
= dst_object
;
6417 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6418 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6419 coding
->dst_multibyte
6420 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6424 code_conversion_save (buffer
, 0, 0);
6425 coding
->dst_object
= Qnil
;
6426 coding
->dst_multibyte
= 1;
6429 decode_coding (coding
);
6431 if (BUFFERP (coding
->dst_object
))
6432 set_buffer_internal (XBUFFER (coding
->dst_object
));
6434 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6436 struct gcpro gcpro1
, gcpro2
;
6437 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6440 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6441 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6442 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6443 make_number (coding
->produced_char
));
6446 coding
->produced_char
+= Z
- prev_Z
;
6447 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6450 if (EQ (dst_object
, Qt
))
6452 coding
->dst_object
= Fbuffer_string ();
6454 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6456 set_buffer_internal (XBUFFER (coding
->dst_object
));
6457 if (dst_bytes
< coding
->produced
)
6460 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6463 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6464 unbind_to (count
, Qnil
);
6467 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6468 move_gap_both (BEGV
, BEGV_BYTE
);
6469 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6470 coding
->destination
= destination
;
6476 /* This is the case of:
6477 (BUFFERP (src_object) && EQ (src_object, dst_object))
6478 As we have moved PT while replacing the original buffer
6479 contents, we must recover it now. */
6480 set_buffer_internal (XBUFFER (src_object
));
6481 if (saved_pt
< from
)
6482 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6483 else if (saved_pt
< from
+ chars
)
6484 TEMP_SET_PT_BOTH (from
, from_byte
);
6485 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6486 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6487 saved_pt_byte
+ (coding
->produced
- bytes
));
6489 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6490 saved_pt_byte
+ (coding
->produced
- bytes
));
6493 unbind_to (count
, Qnil
);
6498 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6500 struct coding_system
*coding
;
6501 Lisp_Object src_object
;
6502 EMACS_INT from
, from_byte
, to
, to_byte
;
6503 Lisp_Object dst_object
;
6505 int count
= specpdl_ptr
- specpdl
;
6506 EMACS_INT chars
= to
- from
;
6507 EMACS_INT bytes
= to_byte
- from_byte
;
6510 int saved_pt
= -1, saved_pt_byte
;
6512 buffer
= Fcurrent_buffer ();
6514 coding
->src_object
= src_object
;
6515 coding
->src_chars
= chars
;
6516 coding
->src_bytes
= bytes
;
6517 coding
->src_multibyte
= chars
< bytes
;
6519 attrs
= CODING_ID_ATTRS (coding
->id
);
6521 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6523 coding
->src_object
= code_conversion_save (buffer
, 1,
6524 coding
->src_multibyte
);
6525 set_buffer_internal (XBUFFER (coding
->src_object
));
6526 if (STRINGP (src_object
))
6527 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6528 else if (BUFFERP (src_object
))
6529 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6531 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6533 if (EQ (src_object
, dst_object
))
6535 set_buffer_internal (XBUFFER (src_object
));
6536 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6537 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6538 set_buffer_internal (XBUFFER (coding
->src_object
));
6541 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6542 make_number (BEG
), make_number (Z
));
6543 coding
->src_object
= Fcurrent_buffer ();
6545 move_gap_both (BEG
, BEG_BYTE
);
6546 coding
->src_chars
= Z
- BEG
;
6547 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6548 coding
->src_pos
= BEG
;
6549 coding
->src_pos_byte
= BEG_BYTE
;
6550 coding
->src_multibyte
= Z
< Z_BYTE
;
6552 else if (STRINGP (src_object
))
6554 code_conversion_save (buffer
, 0, 0);
6555 coding
->src_pos
= from
;
6556 coding
->src_pos_byte
= from_byte
;
6558 else if (BUFFERP (src_object
))
6560 code_conversion_save (buffer
, 0, 0);
6561 set_buffer_internal (XBUFFER (src_object
));
6562 if (EQ (src_object
, dst_object
))
6564 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6565 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6566 coding
->src_pos
= 0;
6567 coding
->src_pos_byte
= 0;
6571 if (from
< GPT
&& to
>= GPT
)
6572 move_gap_both (from
, from_byte
);
6573 coding
->src_pos
= from
;
6574 coding
->src_pos_byte
= from_byte
;
6578 code_conversion_save (buffer
, 0, 0);
6580 if (BUFFERP (dst_object
))
6582 coding
->dst_object
= dst_object
;
6583 if (EQ (src_object
, dst_object
))
6585 coding
->dst_pos
= from
;
6586 coding
->dst_pos_byte
= from_byte
;
6590 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6591 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6593 coding
->dst_multibyte
6594 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6596 else if (EQ (dst_object
, Qt
))
6598 coding
->dst_object
= Qnil
;
6599 coding
->dst_bytes
= coding
->src_chars
;
6600 if (coding
->dst_bytes
== 0)
6601 coding
->dst_bytes
= 1;
6602 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6603 coding
->dst_multibyte
= 0;
6607 coding
->dst_object
= Qnil
;
6608 coding
->dst_multibyte
= 0;
6611 encode_coding (coding
);
6613 if (EQ (dst_object
, Qt
))
6615 if (BUFFERP (coding
->dst_object
))
6616 coding
->dst_object
= Fbuffer_string ();
6620 = make_unibyte_string ((char *) coding
->destination
,
6622 xfree (coding
->destination
);
6628 /* This is the case of:
6629 (BUFFERP (src_object) && EQ (src_object, dst_object))
6630 As we have moved PT while replacing the original buffer
6631 contents, we must recover it now. */
6632 set_buffer_internal (XBUFFER (src_object
));
6633 if (saved_pt
< from
)
6634 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6635 else if (saved_pt
< from
+ chars
)
6636 TEMP_SET_PT_BOTH (from
, from_byte
);
6637 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6638 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6639 saved_pt_byte
+ (coding
->produced
- bytes
));
6641 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6642 saved_pt_byte
+ (coding
->produced
- bytes
));
6645 unbind_to (count
, Qnil
);
6650 preferred_coding_system ()
6652 int id
= coding_categories
[coding_priorities
[0]].id
;
6654 return CODING_ID_NAME (id
);
6659 /*** 8. Emacs Lisp library functions ***/
6661 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6662 doc
: /* Return t if OBJECT is nil or a coding-system.
6663 See the documentation of `define-coding-system' for information
6664 about coding-system objects. */)
6668 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6671 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6672 Sread_non_nil_coding_system
, 1, 1, 0,
6673 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6680 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6681 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6683 while (SCHARS (val
) == 0);
6684 return (Fintern (val
, Qnil
));
6687 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6688 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6689 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6690 (prompt
, default_coding_system
)
6691 Lisp_Object prompt
, default_coding_system
;
6694 if (SYMBOLP (default_coding_system
))
6695 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
6696 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6697 Qt
, Qnil
, Qcoding_system_history
,
6698 default_coding_system
, Qnil
);
6699 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
6702 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6704 doc
: /* Check validity of CODING-SYSTEM.
6705 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6707 Lisp_Object coding_system
;
6709 CHECK_SYMBOL (coding_system
);
6710 if (!NILP (Fcoding_system_p (coding_system
)))
6711 return coding_system
;
6713 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6717 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6718 HIGHEST is nonzero, return the coding system of the highest
6719 priority among the detected coding systems. Otherwize return a
6720 list of detected coding systems sorted by their priorities. If
6721 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6722 multibyte form but contains only ASCII and eight-bit chars.
6723 Otherwise, the bytes are raw bytes.
6725 CODING-SYSTEM controls the detection as below:
6727 If it is nil, detect both text-format and eol-format. If the
6728 text-format part of CODING-SYSTEM is already specified
6729 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6730 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6731 detect only text-format. */
6734 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6735 const unsigned char *src
;
6736 int src_bytes
, highest
;
6738 Lisp_Object coding_system
;
6740 const unsigned char *src_end
= src
+ src_bytes
;
6741 Lisp_Object attrs
, eol_type
;
6743 struct coding_system coding
;
6745 struct coding_detection_info detect_info
;
6747 if (NILP (coding_system
))
6748 coding_system
= Qundecided
;
6749 setup_coding_system (coding_system
, &coding
);
6750 attrs
= CODING_ID_ATTRS (coding
.id
);
6751 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6752 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6754 coding
.source
= src
;
6755 coding
.src_bytes
= src_bytes
;
6756 coding
.src_multibyte
= multibytep
;
6757 coding
.consumed
= 0;
6758 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6760 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6762 /* At first, detect text-format if necessary. */
6763 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6765 enum coding_category category
;
6766 struct coding_system
*this;
6769 for (; src
< src_end
; src
++)
6773 || (c
< 0x20 && (c
== ISO_CODE_ESC
6775 || c
== ISO_CODE_SO
)))
6778 coding
.head_ascii
= src
- coding
.source
;
6781 for (i
= 0; i
< coding_category_raw_text
; i
++)
6783 category
= coding_priorities
[i
];
6784 this = coding_categories
+ category
;
6788 /* No coding system of this category is defined. */
6789 detect_info
.rejected
|= (1 << category
);
6791 else if (category
>= coding_category_raw_text
)
6793 else if (detect_info
.checked
& (1 << category
))
6796 && (detect_info
.found
& (1 << category
)))
6801 if ((*(this->detector
)) (&coding
, &detect_info
)
6803 && (detect_info
.found
& (1 << category
)))
6809 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6811 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6812 id
= coding_categories
[coding_category_raw_text
].id
;
6813 val
= Fcons (make_number (id
), Qnil
);
6815 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6817 detect_info
.found
= CATEGORY_MASK_ANY
;
6818 id
= coding_categories
[coding_category_undecided
].id
;
6819 val
= Fcons (make_number (id
), Qnil
);
6823 if (detect_info
.found
)
6825 detect_info
.found
= 1 << category
;
6826 val
= Fcons (make_number (this->id
), Qnil
);
6829 for (i
= 0; i
< coding_category_raw_text
; i
++)
6830 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6832 detect_info
.found
= 1 << coding_priorities
[i
];
6833 id
= coding_categories
[coding_priorities
[i
]].id
;
6834 val
= Fcons (make_number (id
), Qnil
);
6840 int mask
= detect_info
.rejected
| detect_info
.found
;
6844 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6846 category
= coding_priorities
[i
];
6847 if (! (mask
& (1 << category
)))
6849 found
|= 1 << category
;
6850 id
= coding_categories
[category
].id
;
6851 val
= Fcons (make_number (id
), val
);
6854 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6856 category
= coding_priorities
[i
];
6857 if (detect_info
.found
& (1 << category
))
6859 id
= coding_categories
[category
].id
;
6860 val
= Fcons (make_number (id
), val
);
6863 detect_info
.found
|= found
;
6868 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6869 val
= Fcons (make_number (coding
.id
), Qnil
);
6872 /* Then, detect eol-format if necessary. */
6874 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6877 if (VECTORP (eol_type
))
6879 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6880 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6881 coding_category_raw_text
);
6882 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6883 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6884 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6885 coding_category_utf_16_be
);
6886 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6887 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6888 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6889 coding_category_utf_16_le
);
6893 if (EQ (eol_type
, Qunix
))
6894 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6895 else if (EQ (eol_type
, Qdos
))
6896 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6898 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6901 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6903 enum coding_category category
;
6906 id
= XINT (XCAR (tail
));
6907 attrs
= CODING_ID_ATTRS (id
);
6908 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6909 eol_type
= CODING_ID_EOL_TYPE (id
);
6910 if (VECTORP (eol_type
))
6912 if (category
== coding_category_utf_16_be
6913 || category
== coding_category_utf_16_be_nosig
)
6914 this_eol
= utf_16_be_eol
;
6915 else if (category
== coding_category_utf_16_le
6916 || category
== coding_category_utf_16_le_nosig
)
6917 this_eol
= utf_16_le_eol
;
6919 this_eol
= normal_eol
;
6921 if (this_eol
== EOL_SEEN_LF
)
6922 XSETCAR (tail
, AREF (eol_type
, 0));
6923 else if (this_eol
== EOL_SEEN_CRLF
)
6924 XSETCAR (tail
, AREF (eol_type
, 1));
6925 else if (this_eol
== EOL_SEEN_CR
)
6926 XSETCAR (tail
, AREF (eol_type
, 2));
6928 XSETCAR (tail
, CODING_ID_NAME (id
));
6931 XSETCAR (tail
, CODING_ID_NAME (id
));
6935 return (highest
? XCAR (val
) : val
);
6939 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6941 doc
: /* Detect coding system of the text in the region between START and END.
6942 Return a list of possible coding systems ordered by priority.
6944 If only ASCII characters are found, it returns a list of single element
6945 `undecided' or its subsidiary coding system according to a detected
6948 If optional argument HIGHEST is non-nil, return the coding system of
6949 highest priority. */)
6950 (start
, end
, highest
)
6951 Lisp_Object start
, end
, highest
;
6954 int from_byte
, to_byte
;
6956 CHECK_NUMBER_COERCE_MARKER (start
);
6957 CHECK_NUMBER_COERCE_MARKER (end
);
6959 validate_region (&start
, &end
);
6960 from
= XINT (start
), to
= XINT (end
);
6961 from_byte
= CHAR_TO_BYTE (from
);
6962 to_byte
= CHAR_TO_BYTE (to
);
6964 if (from
< GPT
&& to
>= GPT
)
6965 move_gap_both (to
, to_byte
);
6967 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6968 to_byte
- from_byte
,
6970 !NILP (current_buffer
6971 ->enable_multibyte_characters
),
6975 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6977 doc
: /* Detect coding system of the text in STRING.
6978 Return a list of possible coding systems ordered by priority.
6980 If only ASCII characters are found, it returns a list of single element
6981 `undecided' or its subsidiary coding system according to a detected
6984 If optional argument HIGHEST is non-nil, return the coding system of
6985 highest priority. */)
6987 Lisp_Object string
, highest
;
6989 CHECK_STRING (string
);
6991 return detect_coding_system (SDATA (string
), SBYTES (string
),
6992 !NILP (highest
), STRING_MULTIBYTE (string
),
6998 char_encodable_p (c
, attrs
)
7003 struct charset
*charset
;
7005 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7006 CONSP (tail
); tail
= XCDR (tail
))
7008 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7009 if (CHAR_CHARSET_P (c
, charset
))
7012 return (! NILP (tail
));
7016 /* Return a list of coding systems that safely encode the text between
7017 START and END. If EXCLUDE is non-nil, it is a list of coding
7018 systems not to check. The returned list doesn't contain any such
7019 coding systems. In any case, if the text contains only ASCII or is
7020 unibyte, return t. */
7022 DEFUN ("find-coding-systems-region-internal",
7023 Ffind_coding_systems_region_internal
,
7024 Sfind_coding_systems_region_internal
, 2, 3, 0,
7025 doc
: /* Internal use only. */)
7026 (start
, end
, exclude
)
7027 Lisp_Object start
, end
, exclude
;
7029 Lisp_Object coding_attrs_list
, safe_codings
;
7030 EMACS_INT start_byte
, end_byte
;
7031 const unsigned char *p
, *pbeg
, *pend
;
7033 Lisp_Object tail
, elt
;
7035 if (STRINGP (start
))
7037 if (!STRING_MULTIBYTE (start
)
7038 || SCHARS (start
) == SBYTES (start
))
7041 end_byte
= SBYTES (start
);
7045 CHECK_NUMBER_COERCE_MARKER (start
);
7046 CHECK_NUMBER_COERCE_MARKER (end
);
7047 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7048 args_out_of_range (start
, end
);
7049 if (NILP (current_buffer
->enable_multibyte_characters
))
7051 start_byte
= CHAR_TO_BYTE (XINT (start
));
7052 end_byte
= CHAR_TO_BYTE (XINT (end
));
7053 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7056 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7058 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7059 move_gap_both (XINT (start
), start_byte
);
7061 move_gap_both (XINT (end
), end_byte
);
7065 coding_attrs_list
= Qnil
;
7066 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7068 || NILP (Fmemq (XCAR (tail
), exclude
)))
7072 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7073 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7074 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7075 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7078 if (STRINGP (start
))
7079 p
= pbeg
= SDATA (start
);
7081 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7082 pend
= p
+ (end_byte
- start_byte
);
7084 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7085 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7089 if (ASCII_BYTE_P (*p
))
7093 c
= STRING_CHAR_ADVANCE (p
);
7095 charset_map_loaded
= 0;
7096 for (tail
= coding_attrs_list
; CONSP (tail
);)
7101 else if (char_encodable_p (c
, elt
))
7103 else if (CONSP (XCDR (tail
)))
7105 XSETCAR (tail
, XCAR (XCDR (tail
)));
7106 XSETCDR (tail
, XCDR (XCDR (tail
)));
7110 XSETCAR (tail
, Qnil
);
7114 if (charset_map_loaded
)
7116 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7118 if (STRINGP (start
))
7119 pbeg
= SDATA (start
);
7121 pbeg
= BYTE_POS_ADDR (start_byte
);
7122 p
= pbeg
+ p_offset
;
7123 pend
= pbeg
+ pend_offset
;
7128 safe_codings
= Qnil
;
7129 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7130 if (! NILP (XCAR (tail
)))
7131 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7133 return safe_codings
;
7137 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7138 Sunencodable_char_position
, 3, 5, 0,
7140 Return position of first un-encodable character in a region.
7141 START and END specfiy the region and CODING-SYSTEM specifies the
7142 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7144 If optional 4th argument COUNT is non-nil, it specifies at most how
7145 many un-encodable characters to search. In this case, the value is a
7148 If optional 5th argument STRING is non-nil, it is a string to search
7149 for un-encodable characters. In that case, START and END are indexes
7151 (start
, end
, coding_system
, count
, string
)
7152 Lisp_Object start
, end
, coding_system
, count
, string
;
7155 struct coding_system coding
;
7156 Lisp_Object attrs
, charset_list
;
7157 Lisp_Object positions
;
7159 const unsigned char *p
, *stop
, *pend
;
7160 int ascii_compatible
;
7162 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7163 attrs
= CODING_ID_ATTRS (coding
.id
);
7164 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7166 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7167 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7171 validate_region (&start
, &end
);
7172 from
= XINT (start
);
7174 if (NILP (current_buffer
->enable_multibyte_characters
)
7175 || (ascii_compatible
7176 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7178 p
= CHAR_POS_ADDR (from
);
7179 pend
= CHAR_POS_ADDR (to
);
7180 if (from
< GPT
&& to
>= GPT
)
7187 CHECK_STRING (string
);
7188 CHECK_NATNUM (start
);
7190 from
= XINT (start
);
7193 || to
> SCHARS (string
))
7194 args_out_of_range_3 (string
, start
, end
);
7195 if (! STRING_MULTIBYTE (string
))
7197 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7198 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7199 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7207 CHECK_NATNUM (count
);
7216 if (ascii_compatible
)
7217 while (p
< stop
&& ASCII_BYTE_P (*p
))
7227 c
= STRING_CHAR_ADVANCE (p
);
7228 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7229 && ! char_charset (c
, charset_list
, NULL
))
7231 positions
= Fcons (make_number (from
), positions
);
7240 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
7244 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7245 Scheck_coding_systems_region
, 3, 3, 0,
7246 doc
: /* Check if the region is encodable by coding systems.
7248 START and END are buffer positions specifying the region.
7249 CODING-SYSTEM-LIST is a list of coding systems to check.
7251 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7252 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7253 whole region, POS0, POS1, ... are buffer positions where non-encodable
7254 characters are found.
7256 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7259 START may be a string. In that case, check if the string is
7260 encodable, and the value contains indices to the string instead of
7261 buffer positions. END is ignored. */)
7262 (start
, end
, coding_system_list
)
7263 Lisp_Object start
, end
, coding_system_list
;
7266 EMACS_INT start_byte
, end_byte
;
7268 const unsigned char *p
, *pbeg
, *pend
;
7270 Lisp_Object tail
, elt
;
7272 if (STRINGP (start
))
7274 if (!STRING_MULTIBYTE (start
)
7275 && SCHARS (start
) != SBYTES (start
))
7278 end_byte
= SBYTES (start
);
7283 CHECK_NUMBER_COERCE_MARKER (start
);
7284 CHECK_NUMBER_COERCE_MARKER (end
);
7285 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7286 args_out_of_range (start
, end
);
7287 if (NILP (current_buffer
->enable_multibyte_characters
))
7289 start_byte
= CHAR_TO_BYTE (XINT (start
));
7290 end_byte
= CHAR_TO_BYTE (XINT (end
));
7291 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7294 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7296 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7297 move_gap_both (XINT (start
), start_byte
);
7299 move_gap_both (XINT (end
), end_byte
);
7305 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7308 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7313 if (STRINGP (start
))
7314 p
= pbeg
= SDATA (start
);
7316 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7317 pend
= p
+ (end_byte
- start_byte
);
7319 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7320 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7324 if (ASCII_BYTE_P (*p
))
7328 c
= STRING_CHAR_ADVANCE (p
);
7330 charset_map_loaded
= 0;
7331 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7333 elt
= XCDR (XCAR (tail
));
7334 if (! char_encodable_p (c
, XCAR (elt
)))
7335 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7337 if (charset_map_loaded
)
7339 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7341 if (STRINGP (start
))
7342 pbeg
= SDATA (start
);
7344 pbeg
= BYTE_POS_ADDR (start_byte
);
7345 p
= pbeg
+ p_offset
;
7346 pend
= pbeg
+ pend_offset
;
7354 for (; CONSP (tail
); tail
= XCDR (tail
))
7357 if (CONSP (XCDR (XCDR (elt
))))
7358 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7368 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7369 Lisp_Object start
, end
, coding_system
, dst_object
;
7370 int encodep
, norecord
;
7372 struct coding_system coding
;
7373 EMACS_INT from
, from_byte
, to
, to_byte
;
7374 Lisp_Object src_object
;
7376 CHECK_NUMBER_COERCE_MARKER (start
);
7377 CHECK_NUMBER_COERCE_MARKER (end
);
7378 if (NILP (coding_system
))
7379 coding_system
= Qno_conversion
;
7381 CHECK_CODING_SYSTEM (coding_system
);
7382 src_object
= Fcurrent_buffer ();
7383 if (NILP (dst_object
))
7384 dst_object
= src_object
;
7385 else if (! EQ (dst_object
, Qt
))
7386 CHECK_BUFFER (dst_object
);
7388 validate_region (&start
, &end
);
7389 from
= XFASTINT (start
);
7390 from_byte
= CHAR_TO_BYTE (from
);
7391 to
= XFASTINT (end
);
7392 to_byte
= CHAR_TO_BYTE (to
);
7394 setup_coding_system (coding_system
, &coding
);
7395 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7398 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7401 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7404 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7406 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7407 error ("Code conversion error: %d", coding
.result
);
7409 return (BUFFERP (dst_object
)
7410 ? make_number (coding
.produced_char
)
7411 : coding
.dst_object
);
7415 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7416 3, 4, "r\nzCoding system: ",
7417 doc
: /* Decode the current region from the specified coding system.
7418 When called from a program, takes four arguments:
7419 START, END, CODING-SYSTEM, and DESTINATION.
7420 START and END are buffer positions.
7422 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7423 If nil, the region between START and END is replace by the decoded text.
7424 If buffer, the decoded text is inserted in the buffer.
7425 If t, the decoded text is returned.
7427 This function sets `last-coding-system-used' to the precise coding system
7428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7429 not fully specified.)
7430 It returns the length of the decoded text. */)
7431 (start
, end
, coding_system
, destination
)
7432 Lisp_Object start
, end
, coding_system
, destination
;
7434 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7437 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7438 3, 4, "r\nzCoding system: ",
7439 doc
: /* Encode the current region by specified coding system.
7440 When called from a program, takes three arguments:
7441 START, END, and CODING-SYSTEM. START and END are buffer positions.
7443 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7444 If nil, the region between START and END is replace by the encoded text.
7445 If buffer, the encoded text is inserted in the buffer.
7446 If t, the encoded text is returned.
7448 This function sets `last-coding-system-used' to the precise coding system
7449 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7450 not fully specified.)
7451 It returns the length of the encoded text. */)
7452 (start
, end
, coding_system
, destination
)
7453 Lisp_Object start
, end
, coding_system
, destination
;
7455 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7459 code_convert_string (string
, coding_system
, dst_object
,
7460 encodep
, nocopy
, norecord
)
7461 Lisp_Object string
, coding_system
, dst_object
;
7462 int encodep
, nocopy
, norecord
;
7464 struct coding_system coding
;
7465 EMACS_INT chars
, bytes
;
7467 CHECK_STRING (string
);
7468 if (NILP (coding_system
))
7471 Vlast_coding_system_used
= Qno_conversion
;
7472 if (NILP (dst_object
))
7473 return (nocopy
? Fcopy_sequence (string
) : string
);
7476 if (NILP (coding_system
))
7477 coding_system
= Qno_conversion
;
7479 CHECK_CODING_SYSTEM (coding_system
);
7480 if (NILP (dst_object
))
7482 else if (! EQ (dst_object
, Qt
))
7483 CHECK_BUFFER (dst_object
);
7485 setup_coding_system (coding_system
, &coding
);
7486 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7487 chars
= SCHARS (string
);
7488 bytes
= SBYTES (string
);
7490 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7492 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7494 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7496 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7497 error ("Code conversion error: %d", coding
.result
);
7499 return (BUFFERP (dst_object
)
7500 ? make_number (coding
.produced_char
)
7501 : coding
.dst_object
);
7505 /* Encode or decode STRING according to CODING_SYSTEM.
7506 Do not set Vlast_coding_system_used.
7508 This function is called only from macros DECODE_FILE and
7509 ENCODE_FILE, thus we ignore character composition. */
7512 code_convert_string_norecord (string
, coding_system
, encodep
)
7513 Lisp_Object string
, coding_system
;
7516 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7520 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7522 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7524 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7525 if the decoding operation is trivial.
7527 Optional fourth arg BUFFER non-nil meant that the decoded text is
7528 inserted in BUFFER instead of returned as a string. In this case,
7529 the return value is BUFFER.
7531 This function sets `last-coding-system-used' to the precise coding system
7532 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7533 not fully specified. */)
7534 (string
, coding_system
, nocopy
, buffer
)
7535 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7537 return code_convert_string (string
, coding_system
, buffer
,
7538 0, ! NILP (nocopy
), 0);
7541 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7543 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7545 Optional third arg NOCOPY non-nil means it is OK to return STRING
7546 itself if the encoding operation is trivial.
7548 Optional fourth arg BUFFER non-nil meant that the encoded text is
7549 inserted in BUFFER instead of returned as a string. In this case,
7550 the return value is BUFFER.
7552 This function sets `last-coding-system-used' to the precise coding system
7553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7554 not fully specified.) */)
7555 (string
, coding_system
, nocopy
, buffer
)
7556 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7558 return code_convert_string (string
, coding_system
, buffer
,
7559 1, ! NILP (nocopy
), 1);
7563 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7564 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7565 Return the corresponding character. */)
7569 Lisp_Object spec
, attrs
, val
;
7570 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7573 CHECK_NATNUM (code
);
7574 c
= XFASTINT (code
);
7575 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7576 attrs
= AREF (spec
, 0);
7578 if (ASCII_BYTE_P (c
)
7579 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7582 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7583 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7584 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7585 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7588 charset
= charset_roman
;
7589 else if (c
>= 0xA0 && c
< 0xDF)
7591 charset
= charset_kana
;
7596 int s1
= c
>> 8, s2
= c
& 0xFF;
7598 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7599 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7600 error ("Invalid code: %d", code
);
7602 charset
= charset_kanji
;
7604 c
= DECODE_CHAR (charset
, c
);
7606 error ("Invalid code: %d", code
);
7607 return make_number (c
);
7611 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7612 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7613 Return the corresponding code in SJIS. */)
7617 Lisp_Object spec
, attrs
, charset_list
;
7619 struct charset
*charset
;
7622 CHECK_CHARACTER (ch
);
7624 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7625 attrs
= AREF (spec
, 0);
7627 if (ASCII_CHAR_P (c
)
7628 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7631 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7632 charset
= char_charset (c
, charset_list
, &code
);
7633 if (code
== CHARSET_INVALID_CODE (charset
))
7634 error ("Can't encode by shift_jis encoding: %d", c
);
7637 return make_number (code
);
7640 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7641 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7642 Return the corresponding character. */)
7646 Lisp_Object spec
, attrs
, val
;
7647 struct charset
*charset_roman
, *charset_big5
, *charset
;
7650 CHECK_NATNUM (code
);
7651 c
= XFASTINT (code
);
7652 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7653 attrs
= AREF (spec
, 0);
7655 if (ASCII_BYTE_P (c
)
7656 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7659 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7660 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7661 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7664 charset
= charset_roman
;
7667 int b1
= c
>> 8, b2
= c
& 0x7F;
7668 if (b1
< 0xA1 || b1
> 0xFE
7669 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7670 error ("Invalid code: %d", code
);
7671 charset
= charset_big5
;
7673 c
= DECODE_CHAR (charset
, (unsigned )c
);
7675 error ("Invalid code: %d", code
);
7676 return make_number (c
);
7679 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7680 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7681 Return the corresponding character code in Big5. */)
7685 Lisp_Object spec
, attrs
, charset_list
;
7686 struct charset
*charset
;
7690 CHECK_CHARACTER (ch
);
7692 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7693 attrs
= AREF (spec
, 0);
7694 if (ASCII_CHAR_P (c
)
7695 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7698 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7699 charset
= char_charset (c
, charset_list
, &code
);
7700 if (code
== CHARSET_INVALID_CODE (charset
))
7701 error ("Can't encode by Big5 encoding: %d", c
);
7703 return make_number (code
);
7707 DEFUN ("set-terminal-coding-system-internal",
7708 Fset_terminal_coding_system_internal
,
7709 Sset_terminal_coding_system_internal
, 1, 1, 0,
7710 doc
: /* Internal use only. */)
7712 Lisp_Object coding_system
;
7714 CHECK_SYMBOL (coding_system
);
7715 setup_coding_system (Fcheck_coding_system (coding_system
),
7718 /* We had better not send unsafe characters to terminal. */
7719 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7720 /* Characer composition should be disabled. */
7721 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7722 terminal_coding
.src_multibyte
= 1;
7723 terminal_coding
.dst_multibyte
= 0;
7727 DEFUN ("set-safe-terminal-coding-system-internal",
7728 Fset_safe_terminal_coding_system_internal
,
7729 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7730 doc
: /* Internal use only. */)
7732 Lisp_Object coding_system
;
7734 CHECK_SYMBOL (coding_system
);
7735 setup_coding_system (Fcheck_coding_system (coding_system
),
7736 &safe_terminal_coding
);
7737 /* Characer composition should be disabled. */
7738 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7739 safe_terminal_coding
.src_multibyte
= 1;
7740 safe_terminal_coding
.dst_multibyte
= 0;
7744 DEFUN ("terminal-coding-system",
7745 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7746 doc
: /* Return coding system specified for terminal output. */)
7749 return CODING_ID_NAME (terminal_coding
.id
);
7752 DEFUN ("set-keyboard-coding-system-internal",
7753 Fset_keyboard_coding_system_internal
,
7754 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7755 doc
: /* Internal use only. */)
7757 Lisp_Object coding_system
;
7759 CHECK_SYMBOL (coding_system
);
7760 setup_coding_system (Fcheck_coding_system (coding_system
),
7762 /* Characer composition should be disabled. */
7763 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7767 DEFUN ("keyboard-coding-system",
7768 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7769 doc
: /* Return coding system specified for decoding keyboard input. */)
7772 return CODING_ID_NAME (keyboard_coding
.id
);
7776 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7777 Sfind_operation_coding_system
, 1, MANY
, 0,
7778 doc
: /* Choose a coding system for an operation based on the target name.
7779 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7780 DECODING-SYSTEM is the coding system to use for decoding
7781 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7782 for encoding (in case OPERATION does encoding).
7784 The first argument OPERATION specifies an I/O primitive:
7785 For file I/O, `insert-file-contents' or `write-region'.
7786 For process I/O, `call-process', `call-process-region', or `start-process'.
7787 For network I/O, `open-network-stream'.
7789 The remaining arguments should be the same arguments that were passed
7790 to the primitive. Depending on which primitive, one of those arguments
7791 is selected as the TARGET. For example, if OPERATION does file I/O,
7792 whichever argument specifies the file name is TARGET.
7794 TARGET has a meaning which depends on OPERATION:
7795 For file I/O, TARGET is a file name.
7796 For process I/O, TARGET is a process name.
7797 For network I/O, TARGET is a service name or a port number
7799 This function looks up what specified for TARGET in,
7800 `file-coding-system-alist', `process-coding-system-alist',
7801 or `network-coding-system-alist' depending on OPERATION.
7802 They may specify a coding system, a cons of coding systems,
7803 or a function symbol to call.
7804 In the last case, we call the function with one argument,
7805 which is a list of all the arguments given to this function.
7807 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7812 Lisp_Object operation
, target_idx
, target
, val
;
7813 register Lisp_Object chain
;
7816 error ("Too few arguments");
7817 operation
= args
[0];
7818 if (!SYMBOLP (operation
)
7819 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7820 error ("Invalid first arguement");
7821 if (nargs
< 1 + XINT (target_idx
))
7822 error ("Too few arguments for operation: %s",
7823 SDATA (SYMBOL_NAME (operation
)));
7824 target
= args
[XINT (target_idx
) + 1];
7825 if (!(STRINGP (target
)
7826 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7827 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7829 chain
= ((EQ (operation
, Qinsert_file_contents
)
7830 || EQ (operation
, Qwrite_region
))
7831 ? Vfile_coding_system_alist
7832 : (EQ (operation
, Qopen_network_stream
)
7833 ? Vnetwork_coding_system_alist
7834 : Vprocess_coding_system_alist
));
7838 for (; CONSP (chain
); chain
= XCDR (chain
))
7844 && ((STRINGP (target
)
7845 && STRINGP (XCAR (elt
))
7846 && fast_string_match (XCAR (elt
), target
) >= 0)
7847 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7850 /* Here, if VAL is both a valid coding system and a valid
7851 function symbol, we return VAL as a coding system. */
7854 if (! SYMBOLP (val
))
7856 if (! NILP (Fcoding_system_p (val
)))
7857 return Fcons (val
, val
);
7858 if (! NILP (Ffboundp (val
)))
7860 val
= call1 (val
, Flist (nargs
, args
));
7863 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7864 return Fcons (val
, val
);
7872 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7873 Sset_coding_system_priority
, 0, MANY
, 0,
7874 doc
: /* Assign higher priority to the coding systems given as arguments.
7875 If multiple coding systems belongs to the same category,
7876 all but the first one are ignored.
7878 usage: (set-coding-system-priority ...) */)
7884 int changed
[coding_category_max
];
7885 enum coding_category priorities
[coding_category_max
];
7887 bzero (changed
, sizeof changed
);
7889 for (i
= j
= 0; i
< nargs
; i
++)
7891 enum coding_category category
;
7892 Lisp_Object spec
, attrs
;
7894 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7895 attrs
= AREF (spec
, 0);
7896 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7897 if (changed
[category
])
7898 /* Ignore this coding system because a coding system of the
7899 same category already had a higher priority. */
7901 changed
[category
] = 1;
7902 priorities
[j
++] = category
;
7903 if (coding_categories
[category
].id
>= 0
7904 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7905 setup_coding_system (args
[i
], &coding_categories
[category
]);
7906 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7909 /* Now we have decided top J priorities. Reflect the order of the
7910 original priorities to the remaining priorities. */
7912 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7914 while (j
< coding_category_max
7915 && changed
[coding_priorities
[j
]])
7917 if (j
== coding_category_max
)
7919 priorities
[i
] = coding_priorities
[j
];
7922 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7924 /* Update `coding-category-list'. */
7925 Vcoding_category_list
= Qnil
;
7926 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7927 Vcoding_category_list
7928 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7929 Vcoding_category_list
);
7934 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7935 Scoding_system_priority_list
, 0, 1, 0,
7936 doc
: /* Return a list of coding systems ordered by their priorities.
7937 HIGHESTP non-nil means just return the highest priority one. */)
7939 Lisp_Object highestp
;
7944 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7946 enum coding_category category
= coding_priorities
[i
];
7947 int id
= coding_categories
[category
].id
;
7952 attrs
= CODING_ID_ATTRS (id
);
7953 if (! NILP (highestp
))
7954 return CODING_ATTR_BASE_NAME (attrs
);
7955 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7957 return Fnreverse (val
);
7960 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7963 make_subsidiaries (base
)
7966 Lisp_Object subsidiaries
;
7967 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
7968 char *buf
= (char *) alloca (base_name_len
+ 6);
7971 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
7972 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7973 for (i
= 0; i
< 3; i
++)
7975 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7976 ASET (subsidiaries
, i
, intern (buf
));
7978 return subsidiaries
;
7982 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7983 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7984 doc
: /* For internal use only.
7985 usage: (define-coding-system-internal ...) */)
7991 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7992 Lisp_Object attrs
; /* Vector of attributes. */
7993 Lisp_Object eol_type
;
7994 Lisp_Object aliases
;
7995 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7996 enum coding_category category
;
7997 Lisp_Object tail
, val
;
7998 int max_charset_id
= 0;
8001 if (nargs
< coding_arg_max
)
8004 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
8006 name
= args
[coding_arg_name
];
8007 CHECK_SYMBOL (name
);
8008 CODING_ATTR_BASE_NAME (attrs
) = name
;
8010 val
= args
[coding_arg_mnemonic
];
8011 if (! STRINGP (val
))
8012 CHECK_CHARACTER (val
);
8013 CODING_ATTR_MNEMONIC (attrs
) = val
;
8015 coding_type
= args
[coding_arg_coding_type
];
8016 CHECK_SYMBOL (coding_type
);
8017 CODING_ATTR_TYPE (attrs
) = coding_type
;
8019 charset_list
= args
[coding_arg_charset_list
];
8020 if (SYMBOLP (charset_list
))
8022 if (EQ (charset_list
, Qiso_2022
))
8024 if (! EQ (coding_type
, Qiso_2022
))
8025 error ("Invalid charset-list");
8026 charset_list
= Viso_2022_charset_list
;
8028 else if (EQ (charset_list
, Qemacs_mule
))
8030 if (! EQ (coding_type
, Qemacs_mule
))
8031 error ("Invalid charset-list");
8032 charset_list
= Vemacs_mule_charset_list
;
8034 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8035 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8036 max_charset_id
= XFASTINT (XCAR (tail
));
8040 charset_list
= Fcopy_sequence (charset_list
);
8041 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8043 struct charset
*charset
;
8046 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8047 if (EQ (coding_type
, Qiso_2022
)
8048 ? CHARSET_ISO_FINAL (charset
) < 0
8049 : EQ (coding_type
, Qemacs_mule
)
8050 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8052 error ("Can't handle charset `%s'",
8053 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8055 XSETCAR (tail
, make_number (charset
->id
));
8056 if (max_charset_id
< charset
->id
)
8057 max_charset_id
= charset
->id
;
8060 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
8062 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8064 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8065 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8066 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8068 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8070 val
= args
[coding_arg_decode_translation_table
];
8072 CHECK_CHAR_TABLE (val
);
8073 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8075 val
= args
[coding_arg_encode_translation_table
];
8077 CHECK_CHAR_TABLE (val
);
8078 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8080 val
= args
[coding_arg_post_read_conversion
];
8082 CODING_ATTR_POST_READ (attrs
) = val
;
8084 val
= args
[coding_arg_pre_write_conversion
];
8086 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8088 val
= args
[coding_arg_default_char
];
8090 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8093 CHECK_CHARACTER (val
);
8094 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
8097 val
= args
[coding_arg_for_unibyte
];
8098 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8100 val
= args
[coding_arg_plist
];
8102 CODING_ATTR_PLIST (attrs
) = val
;
8104 if (EQ (coding_type
, Qcharset
))
8107 /* Generate a lisp vector of 256 elements. Each element is nil,
8108 integer, or a list of charset IDs.
8110 If Nth element is nil, the byte code N is invalid in this
8113 If Nth element is a number NUM, N is the first byte of a
8114 charset whose ID is NUM.
8116 If Nth element is a list of charset IDs, N is the first byte
8117 of one of them. The list is sorted by dimensions of the
8118 charsets. A charset of smaller dimension comes firtst.
8120 for (list
= Qnil
, tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8122 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8124 if (charset
->method
== CHARSET_METHOD_SUPERSET
)
8126 val
= CHARSET_SUPERSET (charset
);
8127 for (; CONSP (val
); val
= XCDR (val
))
8128 list
= Fcons (XCAR (XCAR (val
)), list
);
8131 list
= Fcons (XCAR (tail
), list
);
8134 val
= Fmake_vector (make_number (256), Qnil
);
8136 for (tail
= Fnreverse (list
); CONSP (tail
); tail
= XCDR (tail
))
8138 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8139 int dim
= CHARSET_DIMENSION (charset
);
8140 int idx
= (dim
- 1) * 4;
8142 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8143 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8145 for (i
= charset
->code_space
[idx
];
8146 i
<= charset
->code_space
[idx
+ 1]; i
++)
8148 Lisp_Object tmp
, tmp2
;
8151 tmp
= AREF (val
, i
);
8154 else if (NUMBERP (tmp
))
8156 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8158 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8160 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8164 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8166 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8171 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8174 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8175 XSETCAR (tmp2
, XCAR (tail
));
8181 ASET (attrs
, coding_attr_charset_valids
, val
);
8182 category
= coding_category_charset
;
8184 else if (EQ (coding_type
, Qccl
))
8188 if (nargs
< coding_arg_ccl_max
)
8191 val
= args
[coding_arg_ccl_decoder
];
8192 CHECK_CCL_PROGRAM (val
);
8194 val
= Fcopy_sequence (val
);
8195 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8197 val
= args
[coding_arg_ccl_encoder
];
8198 CHECK_CCL_PROGRAM (val
);
8200 val
= Fcopy_sequence (val
);
8201 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8203 val
= args
[coding_arg_ccl_valids
];
8204 valids
= Fmake_string (make_number (256), make_number (0));
8205 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8212 from
= to
= XINT (val
);
8213 if (from
< 0 || from
> 255)
8214 args_out_of_range_3 (val
, make_number (0), make_number (255));
8219 CHECK_NATNUM_CAR (val
);
8220 CHECK_NATNUM_CDR (val
);
8221 from
= XINT (XCAR (val
));
8223 args_out_of_range_3 (XCAR (val
),
8224 make_number (0), make_number (255));
8225 to
= XINT (XCDR (val
));
8226 if (to
< from
|| to
> 255)
8227 args_out_of_range_3 (XCDR (val
),
8228 XCAR (val
), make_number (255));
8230 for (i
= from
; i
<= to
; i
++)
8231 SSET (valids
, i
, 1);
8233 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8235 category
= coding_category_ccl
;
8237 else if (EQ (coding_type
, Qutf_16
))
8239 Lisp_Object bom
, endian
;
8241 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8243 if (nargs
< coding_arg_utf16_max
)
8246 bom
= args
[coding_arg_utf16_bom
];
8247 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8251 CHECK_CODING_SYSTEM (val
);
8253 CHECK_CODING_SYSTEM (val
);
8255 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8257 endian
= args
[coding_arg_utf16_endian
];
8258 CHECK_SYMBOL (endian
);
8261 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8262 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8263 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8265 category
= (CONSP (bom
)
8266 ? coding_category_utf_16_auto
8268 ? (EQ (endian
, Qbig
)
8269 ? coding_category_utf_16_be_nosig
8270 : coding_category_utf_16_le_nosig
)
8271 : (EQ (endian
, Qbig
)
8272 ? coding_category_utf_16_be
8273 : coding_category_utf_16_le
));
8275 else if (EQ (coding_type
, Qiso_2022
))
8277 Lisp_Object initial
, reg_usage
, request
, flags
;
8280 if (nargs
< coding_arg_iso2022_max
)
8283 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8284 CHECK_VECTOR (initial
);
8285 for (i
= 0; i
< 4; i
++)
8287 val
= Faref (initial
, make_number (i
));
8290 struct charset
*charset
;
8292 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8293 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8294 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8295 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8298 ASET (initial
, i
, make_number (-1));
8301 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8302 CHECK_CONS (reg_usage
);
8303 CHECK_NUMBER_CAR (reg_usage
);
8304 CHECK_NUMBER_CDR (reg_usage
);
8306 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8307 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8315 CHECK_CHARSET_GET_ID (tmp
, id
);
8316 CHECK_NATNUM_CDR (val
);
8317 if (XINT (XCDR (val
)) >= 4)
8318 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8319 XSETCAR (val
, make_number (id
));
8322 flags
= args
[coding_arg_iso2022_flags
];
8323 CHECK_NATNUM (flags
);
8325 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8326 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8328 ASET (attrs
, coding_attr_iso_initial
, initial
);
8329 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8330 ASET (attrs
, coding_attr_iso_request
, request
);
8331 ASET (attrs
, coding_attr_iso_flags
, flags
);
8332 setup_iso_safe_charsets (attrs
);
8334 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8335 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8336 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8337 ? coding_category_iso_7_else
8338 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8339 ? coding_category_iso_7
8340 : coding_category_iso_7_tight
);
8343 int id
= XINT (AREF (initial
, 1));
8345 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8346 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8348 ? coding_category_iso_8_else
8349 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8350 ? coding_category_iso_8_1
8351 : coding_category_iso_8_2
);
8353 if (category
!= coding_category_iso_8_1
8354 && category
!= coding_category_iso_8_2
)
8355 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8357 else if (EQ (coding_type
, Qemacs_mule
))
8359 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8360 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8361 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8362 category
= coding_category_emacs_mule
;
8364 else if (EQ (coding_type
, Qshift_jis
))
8367 struct charset
*charset
;
8369 if (XINT (Flength (charset_list
)) != 3)
8370 error ("There should be just three charsets");
8372 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8373 if (CHARSET_DIMENSION (charset
) != 1)
8374 error ("Dimension of charset %s is not one",
8375 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8376 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8377 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8379 charset_list
= XCDR (charset_list
);
8380 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8381 if (CHARSET_DIMENSION (charset
) != 1)
8382 error ("Dimension of charset %s is not one",
8383 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8385 charset_list
= XCDR (charset_list
);
8386 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8387 if (CHARSET_DIMENSION (charset
) != 2)
8388 error ("Dimension of charset %s is not two",
8389 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8391 category
= coding_category_sjis
;
8392 Vsjis_coding_system
= name
;
8394 else if (EQ (coding_type
, Qbig5
))
8396 struct charset
*charset
;
8398 if (XINT (Flength (charset_list
)) != 2)
8399 error ("There should be just two charsets");
8401 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8402 if (CHARSET_DIMENSION (charset
) != 1)
8403 error ("Dimension of charset %s is not one",
8404 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8405 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8406 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8408 charset_list
= XCDR (charset_list
);
8409 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8410 if (CHARSET_DIMENSION (charset
) != 2)
8411 error ("Dimension of charset %s is not two",
8412 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8414 category
= coding_category_big5
;
8415 Vbig5_coding_system
= name
;
8417 else if (EQ (coding_type
, Qraw_text
))
8419 category
= coding_category_raw_text
;
8420 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8422 else if (EQ (coding_type
, Qutf_8
))
8424 category
= coding_category_utf_8
;
8425 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8427 else if (EQ (coding_type
, Qundecided
))
8428 category
= coding_category_undecided
;
8430 error ("Invalid coding system type: %s",
8431 SDATA (SYMBOL_NAME (coding_type
)));
8433 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8434 CODING_ATTR_PLIST (attrs
)
8435 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8436 CODING_ATTR_PLIST (attrs
)));
8438 eol_type
= args
[coding_arg_eol_type
];
8439 if (! NILP (eol_type
)
8440 && ! EQ (eol_type
, Qunix
)
8441 && ! EQ (eol_type
, Qdos
)
8442 && ! EQ (eol_type
, Qmac
))
8443 error ("Invalid eol-type");
8445 aliases
= Fcons (name
, Qnil
);
8447 if (NILP (eol_type
))
8449 eol_type
= make_subsidiaries (name
);
8450 for (i
= 0; i
< 3; i
++)
8452 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8454 this_name
= AREF (eol_type
, i
);
8455 this_aliases
= Fcons (this_name
, Qnil
);
8456 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8457 this_spec
= Fmake_vector (make_number (3), attrs
);
8458 ASET (this_spec
, 1, this_aliases
);
8459 ASET (this_spec
, 2, this_eol_type
);
8460 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8461 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8462 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8463 Vcoding_system_alist
);
8467 spec_vec
= Fmake_vector (make_number (3), attrs
);
8468 ASET (spec_vec
, 1, aliases
);
8469 ASET (spec_vec
, 2, eol_type
);
8471 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8472 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8473 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8474 Vcoding_system_alist
);
8477 int id
= coding_categories
[category
].id
;
8479 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8480 setup_coding_system (name
, &coding_categories
[category
]);
8486 return Fsignal (Qwrong_number_of_arguments
,
8487 Fcons (intern ("define-coding-system-internal"),
8488 make_number (nargs
)));
8491 /* Fixme: should this record the alias relationships for
8492 diagnostics? Should it update coding-system-list? */
8493 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8494 Sdefine_coding_system_alias
, 2, 2, 0,
8495 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8496 (alias
, coding_system
)
8497 Lisp_Object alias
, coding_system
;
8499 Lisp_Object spec
, aliases
, eol_type
;
8501 CHECK_SYMBOL (alias
);
8502 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8503 aliases
= AREF (spec
, 1);
8504 while (!NILP (XCDR (aliases
)))
8505 aliases
= XCDR (aliases
);
8506 XSETCDR (aliases
, Fcons (alias
, Qnil
));
8508 eol_type
= AREF (spec
, 2);
8509 if (VECTORP (eol_type
))
8511 Lisp_Object subsidiaries
;
8514 subsidiaries
= make_subsidiaries (alias
);
8515 for (i
= 0; i
< 3; i
++)
8516 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8517 AREF (eol_type
, i
));
8519 ASET (spec
, 2, subsidiaries
);
8522 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8523 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8524 Vcoding_system_alist
);
8529 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8531 doc
: /* Return the base of CODING-SYSTEM.
8532 Any alias or subsidiary coding system is not a base coding system. */)
8534 Lisp_Object coding_system
;
8536 Lisp_Object spec
, attrs
;
8538 if (NILP (coding_system
))
8539 return (Qno_conversion
);
8540 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8541 attrs
= AREF (spec
, 0);
8542 return CODING_ATTR_BASE_NAME (attrs
);
8545 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8547 doc
: "Return the property list of CODING-SYSTEM.")
8549 Lisp_Object coding_system
;
8551 Lisp_Object spec
, attrs
;
8553 if (NILP (coding_system
))
8554 coding_system
= Qno_conversion
;
8555 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8556 attrs
= AREF (spec
, 0);
8557 return CODING_ATTR_PLIST (attrs
);
8561 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8563 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8565 Lisp_Object coding_system
;
8569 if (NILP (coding_system
))
8570 coding_system
= Qno_conversion
;
8571 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8572 return AREF (spec
, 1);
8575 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8576 Scoding_system_eol_type
, 1, 1, 0,
8577 doc
: /* Return eol-type of CODING-SYSTEM.
8578 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8580 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8581 and CR respectively.
8583 A vector value indicates that a format of end-of-line should be
8584 detected automatically. Nth element of the vector is the subsidiary
8585 coding system whose eol-type is N. */)
8587 Lisp_Object coding_system
;
8589 Lisp_Object spec
, eol_type
;
8592 if (NILP (coding_system
))
8593 coding_system
= Qno_conversion
;
8594 if (! CODING_SYSTEM_P (coding_system
))
8596 spec
= CODING_SYSTEM_SPEC (coding_system
);
8597 eol_type
= AREF (spec
, 2);
8598 if (VECTORP (eol_type
))
8599 return Fcopy_sequence (eol_type
);
8600 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8601 return make_number (n
);
8607 /*** 9. Post-amble ***/
8614 for (i
= 0; i
< coding_category_max
; i
++)
8616 coding_categories
[i
].id
= -1;
8617 coding_priorities
[i
] = i
;
8620 /* ISO2022 specific initialize routine. */
8621 for (i
= 0; i
< 0x20; i
++)
8622 iso_code_class
[i
] = ISO_control_0
;
8623 for (i
= 0x21; i
< 0x7F; i
++)
8624 iso_code_class
[i
] = ISO_graphic_plane_0
;
8625 for (i
= 0x80; i
< 0xA0; i
++)
8626 iso_code_class
[i
] = ISO_control_1
;
8627 for (i
= 0xA1; i
< 0xFF; i
++)
8628 iso_code_class
[i
] = ISO_graphic_plane_1
;
8629 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8630 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8631 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
8632 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8633 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8634 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8635 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8636 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8637 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8638 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8640 inhibit_pre_post_conversion
= 0;
8642 for (i
= 0; i
< 256; i
++)
8644 emacs_mule_bytes
[i
] = 1;
8646 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8647 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8648 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8649 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8657 staticpro (&Vcoding_system_hash_table
);
8659 Lisp_Object args
[2];
8662 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
8665 staticpro (&Vsjis_coding_system
);
8666 Vsjis_coding_system
= Qnil
;
8668 staticpro (&Vbig5_coding_system
);
8669 Vbig5_coding_system
= Qnil
;
8671 staticpro (&Vcode_conversion_work_buf_list
);
8672 Vcode_conversion_work_buf_list
= Qnil
;
8674 staticpro (&Vcode_conversion_reused_work_buf
);
8675 Vcode_conversion_reused_work_buf
= Qnil
;
8677 DEFSYM (Qcharset
, "charset");
8678 DEFSYM (Qtarget_idx
, "target-idx");
8679 DEFSYM (Qcoding_system_history
, "coding-system-history");
8680 Fset (Qcoding_system_history
, Qnil
);
8682 /* Target FILENAME is the first argument. */
8683 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8684 /* Target FILENAME is the third argument. */
8685 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8687 DEFSYM (Qcall_process
, "call-process");
8688 /* Target PROGRAM is the first argument. */
8689 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8691 DEFSYM (Qcall_process_region
, "call-process-region");
8692 /* Target PROGRAM is the third argument. */
8693 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8695 DEFSYM (Qstart_process
, "start-process");
8696 /* Target PROGRAM is the third argument. */
8697 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8699 DEFSYM (Qopen_network_stream
, "open-network-stream");
8700 /* Target SERVICE is the fourth argument. */
8701 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8703 DEFSYM (Qcoding_system
, "coding-system");
8704 DEFSYM (Qcoding_aliases
, "coding-aliases");
8706 DEFSYM (Qeol_type
, "eol-type");
8707 DEFSYM (Qunix
, "unix");
8708 DEFSYM (Qdos
, "dos");
8710 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8711 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8712 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8713 DEFSYM (Qdefault_char
, "default-char");
8714 DEFSYM (Qundecided
, "undecided");
8715 DEFSYM (Qno_conversion
, "no-conversion");
8716 DEFSYM (Qraw_text
, "raw-text");
8718 DEFSYM (Qiso_2022
, "iso-2022");
8720 DEFSYM (Qutf_8
, "utf-8");
8721 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
8723 DEFSYM (Qutf_16
, "utf-16");
8724 DEFSYM (Qbig
, "big");
8725 DEFSYM (Qlittle
, "little");
8727 DEFSYM (Qshift_jis
, "shift-jis");
8728 DEFSYM (Qbig5
, "big5");
8730 DEFSYM (Qcoding_system_p
, "coding-system-p");
8732 DEFSYM (Qcoding_system_error
, "coding-system-error");
8733 Fput (Qcoding_system_error
, Qerror_conditions
,
8734 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8735 Fput (Qcoding_system_error
, Qerror_message
,
8736 build_string ("Invalid coding system"));
8738 /* Intern this now in case it isn't already done.
8739 Setting this variable twice is harmless.
8740 But don't staticpro it here--that is done in alloc.c. */
8741 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8743 DEFSYM (Qtranslation_table
, "translation-table");
8744 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8745 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8746 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8747 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8749 DEFSYM (Qvalid_codes
, "valid-codes");
8751 DEFSYM (Qemacs_mule
, "emacs-mule");
8753 DEFSYM (QCcategory
, ":category");
8755 Vcoding_category_table
8756 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8757 staticpro (&Vcoding_category_table
);
8758 /* Followings are target of code detection. */
8759 ASET (Vcoding_category_table
, coding_category_iso_7
,
8760 intern ("coding-category-iso-7"));
8761 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8762 intern ("coding-category-iso-7-tight"));
8763 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8764 intern ("coding-category-iso-8-1"));
8765 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8766 intern ("coding-category-iso-8-2"));
8767 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8768 intern ("coding-category-iso-7-else"));
8769 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8770 intern ("coding-category-iso-8-else"));
8771 ASET (Vcoding_category_table
, coding_category_utf_8
,
8772 intern ("coding-category-utf-8"));
8773 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8774 intern ("coding-category-utf-16-be"));
8775 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8776 intern ("coding-category-utf-16-auto"));
8777 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8778 intern ("coding-category-utf-16-le"));
8779 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8780 intern ("coding-category-utf-16-be-nosig"));
8781 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8782 intern ("coding-category-utf-16-le-nosig"));
8783 ASET (Vcoding_category_table
, coding_category_charset
,
8784 intern ("coding-category-charset"));
8785 ASET (Vcoding_category_table
, coding_category_sjis
,
8786 intern ("coding-category-sjis"));
8787 ASET (Vcoding_category_table
, coding_category_big5
,
8788 intern ("coding-category-big5"));
8789 ASET (Vcoding_category_table
, coding_category_ccl
,
8790 intern ("coding-category-ccl"));
8791 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8792 intern ("coding-category-emacs-mule"));
8793 /* Followings are NOT target of code detection. */
8794 ASET (Vcoding_category_table
, coding_category_raw_text
,
8795 intern ("coding-category-raw-text"));
8796 ASET (Vcoding_category_table
, coding_category_undecided
,
8797 intern ("coding-category-undecided"));
8799 defsubr (&Scoding_system_p
);
8800 defsubr (&Sread_coding_system
);
8801 defsubr (&Sread_non_nil_coding_system
);
8802 defsubr (&Scheck_coding_system
);
8803 defsubr (&Sdetect_coding_region
);
8804 defsubr (&Sdetect_coding_string
);
8805 defsubr (&Sfind_coding_systems_region_internal
);
8806 defsubr (&Sunencodable_char_position
);
8807 defsubr (&Scheck_coding_systems_region
);
8808 defsubr (&Sdecode_coding_region
);
8809 defsubr (&Sencode_coding_region
);
8810 defsubr (&Sdecode_coding_string
);
8811 defsubr (&Sencode_coding_string
);
8812 defsubr (&Sdecode_sjis_char
);
8813 defsubr (&Sencode_sjis_char
);
8814 defsubr (&Sdecode_big5_char
);
8815 defsubr (&Sencode_big5_char
);
8816 defsubr (&Sset_terminal_coding_system_internal
);
8817 defsubr (&Sset_safe_terminal_coding_system_internal
);
8818 defsubr (&Sterminal_coding_system
);
8819 defsubr (&Sset_keyboard_coding_system_internal
);
8820 defsubr (&Skeyboard_coding_system
);
8821 defsubr (&Sfind_operation_coding_system
);
8822 defsubr (&Sset_coding_system_priority
);
8823 defsubr (&Sdefine_coding_system_internal
);
8824 defsubr (&Sdefine_coding_system_alias
);
8825 defsubr (&Scoding_system_base
);
8826 defsubr (&Scoding_system_plist
);
8827 defsubr (&Scoding_system_aliases
);
8828 defsubr (&Scoding_system_eol_type
);
8829 defsubr (&Scoding_system_priority_list
);
8831 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8832 doc
: /* List of coding systems.
8834 Do not alter the value of this variable manually. This variable should be
8835 updated by the functions `define-coding-system' and
8836 `define-coding-system-alias'. */);
8837 Vcoding_system_list
= Qnil
;
8839 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8840 doc
: /* Alist of coding system names.
8841 Each element is one element list of coding system name.
8842 This variable is given to `completing-read' as TABLE argument.
8844 Do not alter the value of this variable manually. This variable should be
8845 updated by the functions `make-coding-system' and
8846 `define-coding-system-alias'. */);
8847 Vcoding_system_alist
= Qnil
;
8849 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8850 doc
: /* List of coding-categories (symbols) ordered by priority.
8852 On detecting a coding system, Emacs tries code detection algorithms
8853 associated with each coding-category one by one in this order. When
8854 one algorithm agrees with a byte sequence of source text, the coding
8855 system bound to the corresponding coding-category is selected. */);
8859 Vcoding_category_list
= Qnil
;
8860 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8861 Vcoding_category_list
8862 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8863 Vcoding_category_list
);
8866 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8867 doc
: /* Specify the coding system for read operations.
8868 It is useful to bind this variable with `let', but do not set it globally.
8869 If the value is a coding system, it is used for decoding on read operation.
8870 If not, an appropriate element is used from one of the coding system alists:
8871 There are three such tables, `file-coding-system-alist',
8872 `process-coding-system-alist', and `network-coding-system-alist'. */);
8873 Vcoding_system_for_read
= Qnil
;
8875 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8876 doc
: /* Specify the coding system for write operations.
8877 Programs bind this variable with `let', but you should not set it globally.
8878 If the value is a coding system, it is used for encoding of output,
8879 when writing it to a file and when sending it to a file or subprocess.
8881 If this does not specify a coding system, an appropriate element
8882 is used from one of the coding system alists:
8883 There are three such tables, `file-coding-system-alist',
8884 `process-coding-system-alist', and `network-coding-system-alist'.
8885 For output to files, if the above procedure does not specify a coding system,
8886 the value of `buffer-file-coding-system' is used. */);
8887 Vcoding_system_for_write
= Qnil
;
8889 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8891 Coding system used in the latest file or process I/O. */);
8892 Vlast_coding_system_used
= Qnil
;
8894 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8896 *Non-nil means always inhibit code conversion of end-of-line format.
8897 See info node `Coding Systems' and info node `Text and Binary' concerning
8898 such conversion. */);
8899 inhibit_eol_conversion
= 0;
8901 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8903 Non-nil means process buffer inherits coding system of process output.
8904 Bind it to t if the process output is to be treated as if it were a file
8905 read from some filesystem. */);
8906 inherit_process_coding_system
= 0;
8908 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8910 Alist to decide a coding system to use for a file I/O operation.
8911 The format is ((PATTERN . VAL) ...),
8912 where PATTERN is a regular expression matching a file name,
8913 VAL is a coding system, a cons of coding systems, or a function symbol.
8914 If VAL is a coding system, it is used for both decoding and encoding
8916 If VAL is a cons of coding systems, the car part is used for decoding,
8917 and the cdr part is used for encoding.
8918 If VAL is a function symbol, the function must return a coding system
8919 or a cons of coding systems which are used as above. The function gets
8920 the arguments with which `find-operation-coding-systems' was called.
8922 See also the function `find-operation-coding-system'
8923 and the variable `auto-coding-alist'. */);
8924 Vfile_coding_system_alist
= Qnil
;
8926 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8928 Alist to decide a coding system to use for a process I/O operation.
8929 The format is ((PATTERN . VAL) ...),
8930 where PATTERN is a regular expression matching a program name,
8931 VAL is a coding system, a cons of coding systems, or a function symbol.
8932 If VAL is a coding system, it is used for both decoding what received
8933 from the program and encoding what sent to the program.
8934 If VAL is a cons of coding systems, the car part is used for decoding,
8935 and the cdr part is used for encoding.
8936 If VAL is a function symbol, the function must return a coding system
8937 or a cons of coding systems which are used as above.
8939 See also the function `find-operation-coding-system'. */);
8940 Vprocess_coding_system_alist
= Qnil
;
8942 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8944 Alist to decide a coding system to use for a network I/O operation.
8945 The format is ((PATTERN . VAL) ...),
8946 where PATTERN is a regular expression matching a network service name
8947 or is a port number to connect to,
8948 VAL is a coding system, a cons of coding systems, or a function symbol.
8949 If VAL is a coding system, it is used for both decoding what received
8950 from the network stream and encoding what sent to the network stream.
8951 If VAL is a cons of coding systems, the car part is used for decoding,
8952 and the cdr part is used for encoding.
8953 If VAL is a function symbol, the function must return a coding system
8954 or a cons of coding systems which are used as above.
8956 See also the function `find-operation-coding-system'. */);
8957 Vnetwork_coding_system_alist
= Qnil
;
8959 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8960 doc
: /* Coding system to use with system messages.
8961 Also used for decoding keyboard input on X Window system. */);
8962 Vlocale_coding_system
= Qnil
;
8964 /* The eol mnemonics are reset in startup.el system-dependently. */
8965 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8967 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8968 eol_mnemonic_unix
= build_string (":");
8970 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8972 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8973 eol_mnemonic_dos
= build_string ("\\");
8975 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8977 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8978 eol_mnemonic_mac
= build_string ("/");
8980 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8982 *String displayed in mode line when end-of-line format is not yet determined. */);
8983 eol_mnemonic_undecided
= build_string (":");
8985 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8987 *Non-nil enables character translation while encoding and decoding. */);
8988 Venable_character_translation
= Qt
;
8990 DEFVAR_LISP ("standard-translation-table-for-decode",
8991 &Vstandard_translation_table_for_decode
,
8992 doc
: /* Table for translating characters while decoding. */);
8993 Vstandard_translation_table_for_decode
= Qnil
;
8995 DEFVAR_LISP ("standard-translation-table-for-encode",
8996 &Vstandard_translation_table_for_encode
,
8997 doc
: /* Table for translating characters while encoding. */);
8998 Vstandard_translation_table_for_encode
= Qnil
;
9000 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
9001 doc
: /* Alist of charsets vs revision numbers.
9002 While encoding, if a charset (car part of an element) is found,
9003 designate it with the escape sequence identifying revision (cdr part
9004 of the element). */);
9005 Vcharset_revision_table
= Qnil
;
9007 DEFVAR_LISP ("default-process-coding-system",
9008 &Vdefault_process_coding_system
,
9009 doc
: /* Cons of coding systems used for process I/O by default.
9010 The car part is used for decoding a process output,
9011 the cdr part is used for encoding a text to be sent to a process. */);
9012 Vdefault_process_coding_system
= Qnil
;
9014 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
9016 Table of extra Latin codes in the range 128..159 (inclusive).
9017 This is a vector of length 256.
9018 If Nth element is non-nil, the existence of code N in a file
9019 \(or output of subprocess) doesn't prevent it to be detected as
9020 a coding system of ISO 2022 variant which has a flag
9021 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9022 or reading output of a subprocess.
9023 Only 128th through 159th elements has a meaning. */);
9024 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
9026 DEFVAR_LISP ("select-safe-coding-system-function",
9027 &Vselect_safe_coding_system_function
,
9029 Function to call to select safe coding system for encoding a text.
9031 If set, this function is called to force a user to select a proper
9032 coding system which can encode the text in the case that a default
9033 coding system used in each operation can't encode the text.
9035 The default value is `select-safe-coding-system' (which see). */);
9036 Vselect_safe_coding_system_function
= Qnil
;
9038 DEFVAR_BOOL ("coding-system-require-warning",
9039 &coding_system_require_warning
,
9040 doc
: /* Internal use only.
9041 If non-nil, on writing a file, `select-safe-coding-system-function' is
9042 called even if `coding-system-for-write' is non-nil. The command
9043 `universal-coding-system-argument' binds this variable to t temporarily. */);
9044 coding_system_require_warning
= 0;
9047 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9048 &inhibit_iso_escape_detection
,
9050 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9052 By default, on reading a file, Emacs tries to detect how the text is
9053 encoded. This code detection is sensitive to escape sequences. If
9054 the sequence is valid as ISO2022, the code is determined as one of
9055 the ISO2022 encodings, and the file is decoded by the corresponding
9056 coding system (e.g. `iso-2022-7bit').
9058 However, there may be a case that you want to read escape sequences in
9059 a file as is. In such a case, you can set this variable to non-nil.
9060 Then, as the code detection ignores any escape sequences, no file is
9061 detected as encoded in some ISO2022 encoding. The result is that all
9062 escape sequences become visible in a buffer.
9064 The default value is nil, and it is strongly recommended not to change
9065 it. That is because many Emacs Lisp source files that contain
9066 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9067 in Emacs's distribution, and they won't be decoded correctly on
9068 reading if you suppress escape sequence detection.
9070 The other way to read escape sequences in a file without decoding is
9071 to explicitly specify some coding system that doesn't use ISO2022's
9072 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9073 inhibit_iso_escape_detection
= 0;
9075 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9076 doc
: /* Char table for translating self-inserting characters.
9077 This is applied to the result of input methods, not their input. See also
9078 `keyboard-translate-table'. */);
9079 Vtranslation_table_for_input
= Qnil
;
9082 Lisp_Object args
[coding_arg_max
];
9083 Lisp_Object plist
[16];
9086 for (i
= 0; i
< coding_arg_max
; i
++)
9089 plist
[0] = intern (":name");
9090 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9091 plist
[2] = intern (":mnemonic");
9092 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9093 plist
[4] = intern (":coding-type");
9094 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9095 plist
[6] = intern (":ascii-compatible-p");
9096 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9097 plist
[8] = intern (":default-char");
9098 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9099 plist
[10] = intern (":for-unibyte");
9100 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9101 plist
[12] = intern (":docstring");
9102 plist
[13] = build_string ("Do no conversion.\n\
9104 When you visit a file with this coding, the file is read into a\n\
9105 unibyte buffer as is, thus each byte of a file is treated as a\n\
9107 plist
[14] = intern (":eol-type");
9108 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9109 args
[coding_arg_plist
] = Flist (16, plist
);
9110 Fdefine_coding_system_internal (coding_arg_max
, args
);
9113 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9114 setup_coding_system (Qno_conversion
, &terminal_coding
);
9115 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9120 for (i
= 0; i
< coding_category_max
; i
++)
9121 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9126 emacs_strerror (error_number
)
9131 synchronize_system_messages_locale ();
9132 str
= strerror (error_number
);
9134 if (! NILP (Vlocale_coding_system
))
9136 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9137 Vlocale_coding_system
,
9139 str
= (char *) SDATA (dec
);