1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
222 if (charbuf
>= charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce an encoded text until it
258 reaches the head of the not-yet-encoded source text.
260 Below is a template of these functions. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
268 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 int coding_system_require_warning
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding systems emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
340 Lisp_Object Qutf_8_emacs
;
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read
;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write
;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used
;
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table
;
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion
;
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection
;
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system
;
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding
;
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding
;
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding
;
374 Lisp_Object Vfile_coding_system_alist
;
375 Lisp_Object Vprocess_coding_system_alist
;
376 Lisp_Object Vnetwork_coding_system_alist
;
378 Lisp_Object Vlocale_coding_system
;
382 /* Flag to tell if we look up translation table on character code
384 Lisp_Object Venable_character_translation
;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode
;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode
;
390 Lisp_Object Qtranslation_table
;
391 Lisp_Object Qtranslation_table_id
;
392 Lisp_Object Qtranslation_table_for_decode
;
393 Lisp_Object Qtranslation_table_for_encode
;
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table
;
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system
;
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input
;
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system
;
406 Lisp_Object Vbig5_coding_system
;
409 static int detect_coding_utf_8
P_ ((struct coding_system
*,
410 struct coding_detection_info
*info
));
411 static void decode_coding_utf_8
P_ ((struct coding_system
*));
412 static int encode_coding_utf_8
P_ ((struct coding_system
*));
414 static int detect_coding_utf_16
P_ ((struct coding_system
*,
415 struct coding_detection_info
*info
));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
420 struct coding_detection_info
*info
));
421 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
422 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
424 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
425 struct coding_detection_info
*info
));
426 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
429 static int detect_coding_sjis
P_ ((struct coding_system
*,
430 struct coding_detection_info
*info
));
431 static void decode_coding_sjis
P_ ((struct coding_system
*));
432 static int encode_coding_sjis
P_ ((struct coding_system
*));
434 static int detect_coding_big5
P_ ((struct coding_system
*,
435 struct coding_detection_info
*info
));
436 static void decode_coding_big5
P_ ((struct coding_system
*));
437 static int encode_coding_big5
P_ ((struct coding_system
*));
439 static int detect_coding_ccl
P_ ((struct coding_system
*,
440 struct coding_detection_info
*info
));
441 static void decode_coding_ccl
P_ ((struct coding_system
*));
442 static int encode_coding_ccl
P_ ((struct coding_system
*));
444 static void decode_coding_raw_text
P_ ((struct coding_system
*));
445 static int encode_coding_raw_text
P_ ((struct coding_system
*));
448 /* ISO2022 section */
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
/* Accessors for the ISO-2022 specific state kept in
   CODING->spec.iso_2022.  Each expands to an lvalue-style member
   access, so CODING must be a pointer to a struct coding_system.  */

/* The `flags' member of the ISO-2022 state (see the
   CODING_ISO_FLAG_XXX bits).  */
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
/* Element REG of the `current_designation' array.  */
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
/* Element PLANE of the `current_invocation' array.  */
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
/* The `single_shifting' member of the ISO-2022 state.  */
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
/* The `bol' member of the ISO-2022 state.
   NOTE(review): presumably a beginning-of-line flag -- confirm against
   the struct definition.  */
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
/* The designation of whichever register is currently invoked to PLANE:
   CODING_ISO_INVOCATION selects the register, CODING_ISO_DESIGNATION
   then looks up what is designated to it.  */
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
487 /* All code (1-byte) of ISO2022 is classified into one of the
489 enum iso_code_class_type
491 ISO_control_0
, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 4 codes. */
494 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
498 ISO_control_1
, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
510 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517 /* If set, reset graphic planes and registers at end-of-line to the
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
544 /* If set, assume designation states are reset at beginning of line on
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548 /* If set, designation sequence should be placed at beginning of line
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552 /* If set, do not encode unsafe characters on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
555 /* If set, extra latin codes (128..159) are accepted as a valid code
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
/* Accessors for the UTF-16 specific state kept in
   CODING->spec.utf_16.  CODING must be a pointer to a struct
   coding_system.  */

/* The `bom' member of the UTF-16 state.  */
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
/* The `endian' member of the UTF-16 state.  */
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
/* The `surrogate' member of the UTF-16 state.  */
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
/* Accessors for the CCL related attributes of a coding system.  Each
   one fetches the attribute vector for CODING->id with
   CODING_ID_ATTRS and indexes it with AREF.  */

/* The element at index coding_attr_ccl_decoder.  */
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
/* The element at index coding_attr_ccl_encoder.  */
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
/* SDATA applied to the element at index coding_attr_ccl_valids.
   NOTE(review): this assumes that attribute is a Lisp string --
   confirm where the attribute is set.  */
590 #define CODING_CCL_VALIDS(coding) \
591 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
593 /* Index for each coding category in `coding_categories' */
597 coding_category_iso_7
,
598 coding_category_iso_7_tight
,
599 coding_category_iso_8_1
,
600 coding_category_iso_8_2
,
601 coding_category_iso_7_else
,
602 coding_category_iso_8_else
,
603 coding_category_utf_8
,
604 coding_category_utf_16_auto
,
605 coding_category_utf_16_be
,
606 coding_category_utf_16_le
,
607 coding_category_utf_16_be_nosig
,
608 coding_category_utf_16_le_nosig
,
609 coding_category_charset
,
610 coding_category_sjis
,
611 coding_category_big5
,
613 coding_category_emacs_mule
,
614 /* All above are targets of code detection. */
615 coding_category_raw_text
,
616 coding_category_undecided
,
620 /* Definitions of flag bits used in detect_coding_XXXX. */
621 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
622 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
623 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
624 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
625 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
626 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
627 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
628 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
634 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
635 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
636 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
637 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
638 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
640 /* This value is returned if detect_coding_mask () finds nothing other
641 than ASCII characters.  It is the bitwise OR of the category masks
listed in the expansion below.
NOTE(review): CATEGORY_MASK_UTF_16_AUTO is defined above but is not
part of this union, although the other UTF-16 masks are -- confirm
whether the omission is intentional.  */
642 #define CATEGORY_MASK_ANY \
643 (CATEGORY_MASK_ISO_7 \
644 | CATEGORY_MASK_ISO_7_TIGHT \
645 | CATEGORY_MASK_ISO_8_1 \
646 | CATEGORY_MASK_ISO_8_2 \
647 | CATEGORY_MASK_ISO_7_ELSE \
648 | CATEGORY_MASK_ISO_8_ELSE \
649 | CATEGORY_MASK_UTF_8 \
650 | CATEGORY_MASK_UTF_16_BE \
651 | CATEGORY_MASK_UTF_16_LE \
652 | CATEGORY_MASK_UTF_16_BE_NOSIG \
653 | CATEGORY_MASK_UTF_16_LE_NOSIG \
654 | CATEGORY_MASK_CHARSET \
655 | CATEGORY_MASK_SJIS \
656 | CATEGORY_MASK_BIG5 \
657 | CATEGORY_MASK_CCL \
658 | CATEGORY_MASK_EMACS_MULE)
/* Convenience unions of the single-category masks defined above.  */

/* The two 7-bit ISO categories: iso_7 and iso_7_tight.  */
661 #define CATEGORY_MASK_ISO_7BIT \
662 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
/* The two 8-bit ISO categories: iso_8_1 and iso_8_2.  */
664 #define CATEGORY_MASK_ISO_8BIT \
665 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
/* The two "else" ISO categories: iso_7_else and iso_8_else.  */
667 #define CATEGORY_MASK_ISO_ELSE \
668 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
/* Every ISO category except iso_8_1 and iso_8_2.
   NOTE(review): presumably the categories that can be recognized by
   an escape sequence -- confirm against detect_coding_iso_2022.  */
670 #define CATEGORY_MASK_ISO_ESCAPE \
671 (CATEGORY_MASK_ISO_7 \
672 | CATEGORY_MASK_ISO_7_TIGHT \
673 | CATEGORY_MASK_ISO_7_ELSE \
674 | CATEGORY_MASK_ISO_8_ELSE)
/* All six ISO categories.  */
676 #define CATEGORY_MASK_ISO \
677 ( CATEGORY_MASK_ISO_7BIT \
678 | CATEGORY_MASK_ISO_8BIT \
679 | CATEGORY_MASK_ISO_ELSE)
/* The big-endian and little-endian UTF-16 categories, with and without
   signature.
   NOTE(review): CATEGORY_MASK_UTF_16_AUTO is not included here --
   confirm whether that is intentional.  */
681 #define CATEGORY_MASK_UTF_16 \
682 (CATEGORY_MASK_UTF_16_BE \
683 | CATEGORY_MASK_UTF_16_LE \
684 | CATEGORY_MASK_UTF_16_BE_NOSIG \
685 | CATEGORY_MASK_UTF_16_LE_NOSIG)
688 /* List of symbols `coding-category-xxx' ordered by priority. This
689 variable is exposed to Emacs Lisp. */
690 static Lisp_Object Vcoding_category_list
;
692 /* Table of coding categories (Lisp symbols). This variable is for
694 static Lisp_Object Vcoding_category_table
;
696 /* Table of coding-categories ordered by priority. */
697 static enum coding_category coding_priorities
[coding_category_max
];
699 /* Nth element is a coding context for the coding system bound to the
700 Nth coding category. */
701 static struct coding_system coding_categories
[coding_category_max
];
703 /*** Commonly used macros and functions ***/
706 #define min(a, b) ((a) < (b) ? (a) : (b))
709 #define max(a, b) ((a) > (b) ? (a) : (b))
712 #define CODING_GET_INFO(coding, attrs, charset_list) \
714 (attrs) = CODING_ID_ATTRS ((coding)->id); \
715 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
719 /* Safely get one byte from the source text pointed by SRC which ends
720 at SRC_END, and set C to that byte. If there are not enough bytes
721 in the source, it jumps to `no_more_source'. The caller
722 should declare and set these variables appropriately in advance:
723 src, src_end, multibytep
726 #define ONE_MORE_BYTE(c) \
728 if (src == src_end) \
730 if (src_base < src) \
731 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
732 goto no_more_source; \
735 if (multibytep && (c & 0x80)) \
737 if ((c & 0xFE) != 0xC0) \
738 error ("Undecodable char found"); \
739 c = ((c & 1) << 6) | *src++; \
745 #define ONE_MORE_BYTE_NO_CHECK(c) \
748 if (multibytep && (c & 0x80)) \
750 if ((c & 0xFE) != 0xC0) \
751 error ("Undecodable char found"); \
752 c = ((c & 1) << 6) | *src++; \
758 /* Store a byte C in the place pointed by DST and increment DST to the
759 next free point, and increment PRODUCED_CHARS. The caller should
760 assure that C is 0..127, and declare and set the variable `dst'
761 appropriately in advance.
765 #define EMIT_ONE_ASCII_BYTE(c) \
772 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
774 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
776 produced_chars += 2; \
777 *dst++ = (c1), *dst++ = (c2); \
781 /* Store a byte C in the place pointed by DST and increment DST to the
782 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
783 nonzero, store in an appropriate multibyte form. The caller should
784 declare and set the variables `dst' and `multibytep' appropriately
787 #define EMIT_ONE_BYTE(c) \
794 ch = BYTE8_TO_CHAR (ch); \
795 CHAR_STRING_ADVANCE (ch, dst); \
802 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
804 #define EMIT_TWO_BYTES(c1, c2) \
806 produced_chars += 2; \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
828 #define EMIT_THREE_BYTES(c1, c2, c3) \
830 EMIT_ONE_BYTE (c1); \
831 EMIT_TWO_BYTES (c2, c3); \
835 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
837 EMIT_TWO_BYTES (c1, c2); \
838 EMIT_TWO_BYTES (c3, c4); \
842 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
844 charset_map_loaded = 0; \
845 c = DECODE_CHAR (charset, code); \
846 if (charset_map_loaded) \
848 const unsigned char *orig = coding->source; \
851 coding_set_source (coding); \
852 offset = coding->source - orig; \
854 src_base += offset; \
860 #define ASSURE_DESTINATION(bytes) \
862 if (dst + (bytes) >= dst_end) \
864 int more_bytes = charbuf_end - charbuf + (bytes); \
866 dst = alloc_destination (coding, more_bytes, dst); \
867 dst_end = coding->destination + coding->dst_bytes; \
874 coding_set_source (coding
)
875 struct coding_system
*coding
;
877 if (BUFFERP (coding
->src_object
))
879 struct buffer
*buf
= XBUFFER (coding
->src_object
);
881 if (coding
->src_pos
< 0)
882 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
884 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
886 else if (STRINGP (coding
->src_object
))
888 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
891 /* Otherwise, the source is C string and is never relocated
892 automatically. Thus we don't have to update anything. */
897 coding_set_destination (coding
)
898 struct coding_system
*coding
;
900 if (BUFFERP (coding
->dst_object
))
902 if (coding
->src_pos
< 0)
904 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
905 coding
->dst_bytes
= (GAP_END_ADDR
906 - (coding
->src_bytes
- coding
->consumed
)
907 - coding
->destination
);
911 /* We are sure that coding->dst_pos_byte is before the gap
913 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
914 + coding
->dst_pos_byte
- 1);
915 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
916 - coding
->destination
);
920 /* Otherwise, the destination is C string and is never relocated
921 automatically. Thus we don't have to update anything. */
927 coding_alloc_by_realloc (coding
, bytes
)
928 struct coding_system
*coding
;
931 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
932 coding
->dst_bytes
+ bytes
);
933 coding
->dst_bytes
+= bytes
;
937 coding_alloc_by_making_gap (coding
, bytes
)
938 struct coding_system
*coding
;
941 if (BUFFERP (coding
->dst_object
)
942 && EQ (coding
->src_object
, coding
->dst_object
))
944 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
946 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
948 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
952 Lisp_Object this_buffer
;
954 this_buffer
= Fcurrent_buffer ();
955 set_buffer_internal (XBUFFER (coding
->dst_object
));
957 set_buffer_internal (XBUFFER (this_buffer
));
962 static unsigned char *
963 alloc_destination (coding
, nbytes
, dst
)
964 struct coding_system
*coding
;
968 EMACS_INT offset
= dst
- coding
->destination
;
970 if (BUFFERP (coding
->dst_object
))
971 coding_alloc_by_making_gap (coding
, nbytes
);
973 coding_alloc_by_realloc (coding
, nbytes
);
974 coding
->result
= CODING_RESULT_SUCCESS
;
975 coding_set_destination (coding
);
976 dst
= coding
->destination
+ offset
;
980 /** Macros for annotations. */
982 /* Maximum length of annotation data (sum of annotations for
983 composition and charset). */
984 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
986 /* An annotation data is stored in the array coding->charbuf in this
988 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
989 LENGTH is the number of elements in the annotation.
990 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
991 FROM and TO specify the range of text annotated. They are relative
992 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
994 The format of the following elements depend on ANNOTATION_MASK.
996 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
998 ... METHOD [ COMPOSITION-COMPONENTS ... ]
999 METHOD is one of enum composition_method.
1000 Optional COMPOSITION-COMPONENTS are characters and composition
1003 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1006 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1008 *(buf)++ = -(len); \
1009 *(buf)++ = (mask); \
1010 *(buf)++ = (from); \
1012 coding->annotated = 1; \
1015 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1017 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1022 #define ADD_CHARSET_DATA(buf, from, to, id) \
1024 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1029 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1037 Check if a text is encoded in UTF-8. If it is, return 1, else
/* Predicates classifying a single octet C of a UTF-8 style byte
   sequence by its high-order bits.  Each evaluates C exactly once and
   yields nonzero when the octet matches.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
1040 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* The top two bits of C are 10: a non-leading (extra) octet.  */
1041 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* The top three bits of C are 110: leading octet of a 2-octet sequence.  */
1042 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* The top four bits of C are 1110: leading octet of a 3-octet sequence.  */
1043 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* The top five bits of C are 11110: leading octet of a 4-octet sequence.  */
1044 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* The top six bits of C are 111110: leading octet of a 5-octet
   sequence.  */
1045 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1048 detect_coding_utf_8 (coding
, detect_info
)
1049 struct coding_system
*coding
;
1050 struct coding_detection_info
*detect_info
;
1052 const unsigned char *src
= coding
->source
, *src_base
= src
;
1053 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1054 int multibytep
= coding
->src_multibyte
;
1055 int consumed_chars
= 0;
1059 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1060 /* A coding system of this category is always ASCII compatible. */
1061 src
+= coding
->head_ascii
;
1065 int c
, c1
, c2
, c3
, c4
;
1069 if (UTF_8_1_OCTET_P (c
))
1073 if (! UTF_8_EXTRA_OCTET_P (c1
))
1075 if (UTF_8_2_OCTET_LEADING_P (c
))
1077 found
= CATEGORY_MASK_UTF_8
;
1081 if (! UTF_8_EXTRA_OCTET_P (c2
))
1083 if (UTF_8_3_OCTET_LEADING_P (c
))
1085 found
= CATEGORY_MASK_UTF_8
;
1089 if (! UTF_8_EXTRA_OCTET_P (c3
))
1091 if (UTF_8_4_OCTET_LEADING_P (c
))
1093 found
= CATEGORY_MASK_UTF_8
;
1097 if (! UTF_8_EXTRA_OCTET_P (c4
))
1099 if (UTF_8_5_OCTET_LEADING_P (c
))
1101 found
= CATEGORY_MASK_UTF_8
;
1106 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1110 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1112 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1115 detect_info
->found
|= found
;
1121 decode_coding_utf_8 (coding
)
1122 struct coding_system
*coding
;
1124 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1125 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1126 const unsigned char *src_base
;
1127 int *charbuf
= coding
->charbuf
;
1128 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1129 int consumed_chars
= 0, consumed_chars_base
;
1130 int multibytep
= coding
->src_multibyte
;
1131 Lisp_Object attr
, charset_list
;
1133 CODING_GET_INFO (coding
, attr
, charset_list
);
1137 int c
, c1
, c2
, c3
, c4
, c5
;
1140 consumed_chars_base
= consumed_chars
;
1142 if (charbuf
>= charbuf_end
)
1146 if (UTF_8_1_OCTET_P(c1
))
1153 if (! UTF_8_EXTRA_OCTET_P (c2
))
1155 if (UTF_8_2_OCTET_LEADING_P (c1
))
1157 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1158 /* Reject overlong sequences here and below. Encoders
1159 producing them are incorrect, they can be misleading,
1160 and they mess up read/write invariance. */
1167 if (! UTF_8_EXTRA_OCTET_P (c3
))
1169 if (UTF_8_3_OCTET_LEADING_P (c1
))
1171 c
= (((c1
& 0xF) << 12)
1172 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1174 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1180 if (! UTF_8_EXTRA_OCTET_P (c4
))
1182 if (UTF_8_4_OCTET_LEADING_P (c1
))
1184 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1185 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1192 if (! UTF_8_EXTRA_OCTET_P (c5
))
1194 if (UTF_8_5_OCTET_LEADING_P (c1
))
1196 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1197 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1199 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1214 consumed_chars
= consumed_chars_base
;
1216 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1221 coding
->consumed_char
+= consumed_chars_base
;
1222 coding
->consumed
= src_base
- coding
->source
;
1223 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1228 encode_coding_utf_8 (coding
)
1229 struct coding_system
*coding
;
1231 int multibytep
= coding
->dst_multibyte
;
1232 int *charbuf
= coding
->charbuf
;
1233 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1234 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1235 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1236 int produced_chars
= 0;
1241 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1243 while (charbuf
< charbuf_end
)
1245 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1247 ASSURE_DESTINATION (safe_room
);
1249 if (CHAR_BYTE8_P (c
))
1251 c
= CHAR_TO_BYTE8 (c
);
1256 CHAR_STRING_ADVANCE (c
, pend
);
1257 for (p
= str
; p
< pend
; p
++)
1264 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1266 while (charbuf
< charbuf_end
)
1268 ASSURE_DESTINATION (safe_room
);
1270 dst
+= CHAR_STRING (c
, dst
);
1274 coding
->result
= CODING_RESULT_SUCCESS
;
1275 coding
->produced_char
+= produced_chars
;
1276 coding
->produced
= dst
- coding
->destination
;
1281 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1282 Check if a text is encoded in one of UTF-16 based coding systems.
1283 If it is, return 1, else return 0. */
/* Nonzero if bits 10..15 of VAL match those of 0xD800, i.e. VAL is in
   0xD800..0xDBFF when VAL is a 16-bit code unit (a high surrogate).  */
1285 #define UTF_16_HIGH_SURROGATE_P(val) \
1286 (((val) & 0xFC00) == 0xD800)
/* Nonzero if bits 10..15 of VAL match those of 0xDC00, i.e. VAL is in
   0xDC00..0xDFFF when VAL is a 16-bit code unit (a low surrogate).  */
1288 #define UTF_16_LOW_SURROGATE_P(val) \
1289 (((val) & 0xFC00) == 0xDC00)
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  Note that VAL is evaluated up to three
   times, so it must not have side effects.  */
1291 #define UTF_16_INVALID_P(val) \
1292 (((val) == 0xFFFE) \
1293 || ((val) == 0xFFFF) \
1294 || UTF_16_LOW_SURROGATE_P (val))
1298 detect_coding_utf_16 (coding
, detect_info
)
1299 struct coding_system
*coding
;
1300 struct coding_detection_info
*detect_info
;
1302 const unsigned char *src
= coding
->source
, *src_base
= src
;
1303 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1304 int multibytep
= coding
->src_multibyte
;
1305 int consumed_chars
= 0;
1308 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1309 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1310 && (coding
->src_chars
& 1))
1312 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1318 if ((c1
== 0xFF) && (c2
== 0xFE))
1320 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1321 | CATEGORY_MASK_UTF_16_AUTO
);
1322 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1323 | CATEGORY_MASK_UTF_16_BE_NOSIG
1324 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1326 else if ((c1
== 0xFE) && (c2
== 0xFF))
1328 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1329 | CATEGORY_MASK_UTF_16_AUTO
);
1330 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1331 | CATEGORY_MASK_UTF_16_BE_NOSIG
1332 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1336 unsigned char b1
[256], b2
[256];
1337 int b1_variants
= 1, b2_variants
= 1;
1340 bzero (b1
, 256), bzero (b2
, 256);
1342 for (n
= 0; n
< 256 && src
< src_end
; n
++)
1346 if (! b1
[c1
++]) b1_variants
++;
1347 if (! b2
[c2
++]) b2_variants
++;
1349 if (b1_variants
< b2_variants
)
1350 detect_info
->found
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1352 detect_info
->found
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1353 detect_info
->rejected
1354 |= (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_LE
);
1361 decode_coding_utf_16 (coding
)
1362 struct coding_system
*coding
;
1364 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1365 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1366 const unsigned char *src_base
;
1367 int *charbuf
= coding
->charbuf
;
1368 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1369 int consumed_chars
= 0, consumed_chars_base
;
1370 int multibytep
= coding
->src_multibyte
;
1371 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1372 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1373 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1374 Lisp_Object attr
, charset_list
;
1376 CODING_GET_INFO (coding
, attr
, charset_list
);
1378 if (bom
== utf_16_with_bom
)
1387 if (endian
== utf_16_big_endian
1388 ? c
!= 0xFEFF : c
!= 0xFFFE)
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1395 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1397 else if (bom
== utf_16_detect_bom
)
1399 /* We have already tried to detect BOM and failed in
1401 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1409 consumed_chars_base
= consumed_chars
;
1411 if (charbuf
+ 2 >= charbuf_end
)
1416 c
= (endian
== utf_16_big_endian
1417 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1420 if (! UTF_16_LOW_SURROGATE_P (c
))
1422 if (endian
== utf_16_big_endian
)
1423 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1425 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1429 if (UTF_16_HIGH_SURROGATE_P (c
))
1430 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1436 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1443 if (UTF_16_HIGH_SURROGATE_P (c
))
1444 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1451 coding
->consumed_char
+= consumed_chars_base
;
1452 coding
->consumed
= src_base
- coding
->source
;
1453 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1457 encode_coding_utf_16 (coding
)
1458 struct coding_system
*coding
;
1460 int multibytep
= coding
->dst_multibyte
;
1461 int *charbuf
= coding
->charbuf
;
1462 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1463 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1464 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1466 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1467 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1468 int produced_chars
= 0;
1469 Lisp_Object attrs
, charset_list
;
1472 CODING_GET_INFO (coding
, attrs
, charset_list
);
1474 if (bom
!= utf_16_without_bom
)
1476 ASSURE_DESTINATION (safe_room
);
1478 EMIT_TWO_BYTES (0xFE, 0xFF);
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
1481 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1484 while (charbuf
< charbuf_end
)
1486 ASSURE_DESTINATION (safe_room
);
1488 if (c
>= MAX_UNICODE_CHAR
)
1489 c
= coding
->default_char
;
1494 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1496 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1503 c1
= (c
>> 10) + 0xD800;
1504 c2
= (c
& 0x3FF) + 0xDC00;
1506 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1508 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1511 coding
->result
= CODING_RESULT_SUCCESS
;
1512 coding
->produced
= dst
- coding
->destination
;
1513 coding
->produced_char
+= produced_chars
;
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1520 /* Emacs' internal format for representation of multiple character
1521 sets is a kind of multi-byte encoding, i.e. characters are
1522 represented by variable-length sequences of one-byte codes.
1524 ASCII characters and control characters (e.g. `tab', `newline') are
1525 represented by one-byte sequences which are their ASCII codes, in
1526 the range 0x00 through 0x7F.
1528 8-bit characters of the range 0x80..0x9F are represented by
1529 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1532 8-bit characters of the range 0xA0..0xFF are represented by
1533 one-byte sequences which are their 8-bit code.
1535 The other characters are represented by a sequence of `base
1536 leading-code', optional `extended leading-code', and one or two
1537 `position-code's. The length of the sequence is determined by the
1538 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1539 whereas extended leading-code and position-code take the range 0xA0
1540 through 0xFF. See `charset.h' for more details about leading-code
1543 --- CODE RANGE of Emacs' internal format ---
1547 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548 eight-bit-graphic 0xA0..0xFF
1549 ELSE 0x81..0x9D + [0xA0..0xFF]+
1550 ---------------------------------------------
1552 As this is the internal character representation, the format is
1553 usually not used externally (i.e. in a file or in a data sent to a
1554 process). But, it is possible to have a text externally in this
1555 format (i.e. by encoding by the coding system `emacs-mule').
1557 In that case, a sequence of one-byte codes has a slightly different
1560 At first, all characters in eight-bit-control are represented by
1561 one-byte sequences which are their 8-bit code.
1563 Next, character composition data are represented by the byte
1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1566 METHOD is 0xF0 plus one of composition method (enum
1567 composition_method),
1569 BYTES is 0xA0 plus a byte length of this composition data,
1571 CHARS is 0x20 plus a number of characters composed by this
1574 COMPONENTs are characters of multibyte form or composition
1575 rules encoded by two bytes of ASCII codes.
1577 In addition, for backward compatibility, the following formats are
1578 also recognized as composition data on decoding.
1581 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1584 MSEQ is a multibyte form but in these special format:
1585 ASCII: 0xA0 ASCII_CODE+0x80,
1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587 RULE is a one byte code of the range 0xA0..0xF0 that
1588 represents a composition rule.
/* Table of 256 entries indexed by a byte value.  emacs_mule_char
   switches on the entry for the byte it has read, and
   detect_coding_emacs_mule compares the entry for the first byte of a
   sequence with the number of bytes it has scanned.
   NOTE(review): presumably each entry is the byte length of the
   emacs-mule sequence introduced by that byte -- confirm against the
   code that fills this table.  */
1591 char emacs_mule_bytes
[256];
1594 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1595 struct coding_system
*coding
;
1597 int *nbytes
, *nchars
, *id
;
1599 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1600 const unsigned char *src_base
= src
;
1601 int multibytep
= coding
->src_multibyte
;
1602 struct charset
*charset
;
1605 int consumed_chars
= 0;
1608 switch (emacs_mule_bytes
[c
])
1611 if (! (charset
= emacs_mule_charset
[c
]))
1618 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1622 if (! (charset
= emacs_mule_charset
[c
]))
1629 if (! (charset
= emacs_mule_charset
[c
]))
1632 code
= (c
& 0x7F) << 8;
1640 if (! (charset
= emacs_mule_charset
[c
]))
1643 code
= (c
& 0x7F) << 8;
1650 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1651 ? charset_ascii
: charset_eight_bit
);
1657 c
= DECODE_CHAR (charset
, code
);
1660 *nbytes
= src
- src_base
;
1661 *nchars
= consumed_chars
;
1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675 Check if a text is encoded in `emacs-mule'. If it is, return 1, else return 0. */
1679 detect_coding_emacs_mule (coding
, detect_info
)
1680 struct coding_system
*coding
;
1681 struct coding_detection_info
*detect_info
;
1683 const unsigned char *src
= coding
->source
, *src_base
= src
;
1684 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1685 int multibytep
= coding
->src_multibyte
;
1686 int consumed_chars
= 0;
1691 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1692 /* A coding system of this category is always ASCII compatible. */
1693 src
+= coding
->head_ascii
;
1703 /* Perhaps the start of a composite character. We simply skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 consists of more than 4 bytes. */
1707 const unsigned char *src_base
;
1717 if (src
- src_base
<= 4)
1719 found
= CATEGORY_MASK_EMACS_MULE
;
1727 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1732 const unsigned char *src_base
= src
- 1;
1739 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1741 found
= CATEGORY_MASK_EMACS_MULE
;
1744 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1748 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1750 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1753 detect_info
->found
|= found
;
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1760 /* Decode a character represented as a component of composition
1761 sequence of Emacs 20/21 style at SRC. Set C to that character and
1762 update SRC to the head of next character (or an encoded composition
1763 rule). If SRC doesn't point to a composition component, set C to -1.
1764 If SRC points to an invalid byte sequence, global exit by a return
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1771 int nbytes, nchars; \
1773 if (src == src_end) \
1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1780 goto invalid_code; \
1784 consumed_chars += nchars; \
1789 /* Decode a composition rule represented as a component of composition
1790 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1791 and increment BUF. If SRC points to an invalid byte sequence, set C
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1796 int c, gref, nref; \
1798 if (src >= src_end) \
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1810 /* Decode a composition rule represented as a component of composition
1811 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1812 and increment BUF. If SRC points to an invalid byte sequence, set C
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1832 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1834 /* Emacs 21 style format. The first three bytes at SRC are \
1835 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1836 the byte length of this composition information, CHARS is the \
1837 number of characters composed by this composition. */ \
1838 enum composition_method method = c - 0xF2; \
1839 int *charbuf_base = charbuf; \
1841 int consumed_chars_limit; \
1842 int nbytes, nchars; \
1844 ONE_MORE_BYTE (c); \
1845 nbytes = c - 0xA0; \
1847 goto invalid_code; \
1848 ONE_MORE_BYTE (c); \
1849 nchars = c - 0xA0; \
1850 from = coding->produced + char_offset; \
1851 to = from + nchars; \
1852 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1853 consumed_chars_limit = consumed_chars_base + nbytes; \
1854 if (method != COMPOSITION_RELATIVE) \
1857 while (consumed_chars < consumed_chars_limit) \
1859 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1860 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1862 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1865 if (consumed_chars < consumed_chars_limit) \
1866 goto invalid_code; \
1867 charbuf_base[0] -= i; \
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
1876 enum composition_method method = COMPOSITION_RELATIVE; \
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1887 goto invalid_code; \
1888 from = coding->produced_char + char_offset; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1896 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1898 /* Emacs 20 style format for rule-base composition. */ \
1899 /* Store multibyte form of characters to be composed. */ \
1900 enum composition_method method = COMPOSITION_WITH_RULE; \
1901 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1902 int *buf = components; \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1909 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1910 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1912 if (i < 1 || (buf - components) % 2 == 0) \
1913 goto invalid_code; \
1914 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1915 goto no_more_source; \
1916 from = coding->produced_char + char_offset; \
1918 ADD_COMPOSITION_DATA (buf, from, to, method); \
1919 for (j = 0; j < i; j++) \
1920 *charbuf++ = components[j]; \
1921 for (j = 0; j < i; j += 2) \
1922 *charbuf++ = components[j]; \
1927 decode_coding_emacs_mule (coding
)
1928 struct coding_system
*coding
;
1930 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1931 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1932 const unsigned char *src_base
;
1933 int *charbuf
= coding
->charbuf
;
1934 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1935 int consumed_chars
= 0, consumed_chars_base
;
1936 int multibytep
= coding
->src_multibyte
;
1937 Lisp_Object attrs
, charset_list
;
1938 int char_offset
= coding
->produced_char
;
1939 int last_offset
= char_offset
;
1940 int last_id
= charset_ascii
;
1942 CODING_GET_INFO (coding
, attrs
, charset_list
);
1949 consumed_chars_base
= consumed_chars
;
1951 if (charbuf
>= charbuf_end
)
1964 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1965 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1966 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1968 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1970 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1974 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1980 consumed_chars
= consumed_chars_base
;
1981 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
1990 if (last_id
!= charset_ascii
)
1991 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
1993 last_offset
= char_offset
;
1997 consumed_chars
+= nchars
;
2004 consumed_chars
= consumed_chars_base
;
2006 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2012 if (last_id
!= charset_ascii
)
2013 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2014 coding
->consumed_char
+= consumed_chars_base
;
2015 coding
->consumed
= src_base
- coding
->source
;
2016 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2020 #define EMACS_MULE_LEADING_CODES(id, codes) \
2023 codes[0] = id, codes[1] = 0; \
2024 else if (id < 0xE0) \
2025 codes[0] = 0x9A, codes[1] = id; \
2026 else if (id < 0xF0) \
2027 codes[0] = 0x9B, codes[1] = id; \
2028 else if (id < 0xF5) \
2029 codes[0] = 0x9C, codes[1] = id; \
2031 codes[0] = 0x9D, codes[1] = id; \
2036 encode_coding_emacs_mule (coding
)
2037 struct coding_system
*coding
;
2039 int multibytep
= coding
->dst_multibyte
;
2040 int *charbuf
= coding
->charbuf
;
2041 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2042 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2043 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2045 int produced_chars
= 0;
2046 Lisp_Object attrs
, charset_list
;
2048 int preferred_charset_id
= -1;
2050 CODING_GET_INFO (coding
, attrs
, charset_list
);
2051 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2053 CODING_ATTR_CHARSET_LIST (attrs
)
2054 = charset_list
= Vemacs_mule_charset_list
;
2057 while (charbuf
< charbuf_end
)
2059 ASSURE_DESTINATION (safe_room
);
2064 /* Handle an annotation. */
2067 case CODING_ANNOTATE_COMPOSITION_MASK
:
2068 /* Not yet implemented. */
2070 case CODING_ANNOTATE_CHARSET_MASK
:
2071 preferred_charset_id
= charbuf
[3];
2072 if (preferred_charset_id
>= 0
2073 && NILP (Fmemq (make_number (preferred_charset_id
),
2075 preferred_charset_id
= -1;
2084 if (ASCII_CHAR_P (c
))
2085 EMIT_ONE_ASCII_BYTE (c
);
2086 else if (CHAR_BYTE8_P (c
))
2088 c
= CHAR_TO_BYTE8 (c
);
2093 struct charset
*charset
;
2097 unsigned char leading_codes
[2];
2099 if (preferred_charset_id
>= 0)
2101 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2102 if (! CHAR_CHARSET_P (c
, charset
))
2103 charset
= char_charset (c
, charset_list
, NULL
);
2106 charset
= char_charset (c
, charset_list
, &code
);
2109 c
= coding
->default_char
;
2110 if (ASCII_CHAR_P (c
))
2112 EMIT_ONE_ASCII_BYTE (c
);
2115 charset
= char_charset (c
, charset_list
, &code
);
2117 dimension
= CHARSET_DIMENSION (charset
);
2118 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2119 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2120 EMIT_ONE_BYTE (leading_codes
[0]);
2121 if (leading_codes
[1])
2122 EMIT_ONE_BYTE (leading_codes
[1]);
2124 EMIT_ONE_BYTE (code
| 0x80);
2128 EMIT_ONE_BYTE (code
>> 8);
2129 EMIT_ONE_BYTE (code
& 0xFF);
2133 coding
->result
= CODING_RESULT_SUCCESS
;
2134 coding
->produced_char
+= produced_chars
;
2135 coding
->produced
= dst
- coding
->destination
;
2140 /*** 7. ISO2022 handlers ***/
2142 /* The following note describes the coding system ISO2022 briefly.
2143 Since the intention of this note is to help understand the
2144 functions in this file, some parts are NOT ACCURATE or are OVERLY
2145 SIMPLIFIED. For thorough understanding, please refer to the
2146 original document of ISO2022. This is equivalent to the standard
2147 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2149 ISO2022 provides many mechanisms to encode several character sets
2150 in 7-bit and 8-bit environments. For 7-bit environments, all text
2151 is encoded using bytes less than 128. This may make the encoded
2152 text a little bit longer, but the text passes more easily through
2153 several types of gateway, some of which strip off the MSB (Most
2156 There are two kinds of character sets: control character sets and
2157 graphic character sets. The former contain control characters such
2158 as `newline' and `escape' to provide control functions (control
2159 functions are also provided by escape sequences). The latter
2160 contain graphic characters such as 'A' and '-'. Emacs recognizes
2161 two control character sets and many graphic character sets.
2163 Graphic character sets are classified into one of the following
2164 four classes, according to the number of bytes (DIMENSION) and
2165 number of characters in one dimension (CHARS) of the set:
2166 - DIMENSION1_CHARS94
2167 - DIMENSION1_CHARS96
2168 - DIMENSION2_CHARS94
2169 - DIMENSION2_CHARS96
2171 In addition, each character set is assigned an identification tag,
2172 unique for each set, called the "final character" (denoted as <F>
2173 hereafter). The <F> of each character set is decided by ECMA(*)
2174 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2175 (0x30..0x3F are for private use only).
2177 Note (*): ECMA = European Computer Manufacturers Association
2179 Here are examples of graphic character sets [NAME(<F>)]:
2180 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2181 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2182 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2183 o DIMENSION2_CHARS96 -- none for the moment
2185 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2186 C0 [0x00..0x1F] -- control character plane 0
2187 GL [0x20..0x7F] -- graphic character plane 0
2188 C1 [0x80..0x9F] -- control character plane 1
2189 GR [0xA0..0xFF] -- graphic character plane 1
2191 A control character set is directly designated and invoked to C0 or
2192 C1 by an escape sequence. The most common case is that:
2193 - ISO646's control character set is designated/invoked to C0, and
2194 - ISO6429's control character set is designated/invoked to C1,
2195 and usually these designations/invocations are omitted in encoded
2196 text. In a 7-bit environment, only C0 can be used, and a control
2197 character for C1 is encoded by an appropriate escape sequence to
2198 fit into the environment. All control characters for C1 are
2199 defined to have corresponding escape sequences.
2201 A graphic character set is at first designated to one of four
2202 graphic registers (G0 through G3), then these graphic registers are
2203 invoked to GL or GR. These designations and invocations can be
2204 done independently. The most common case is that G0 is invoked to
2205 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2206 these invocations and designations are omitted in encoded text.
2207 In a 7-bit environment, only GL can be used.
2209 When a graphic character set of CHARS94 is invoked to GL, codes
2210 0x20 and 0x7F of the GL area work as control characters SPACE and
2211 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2214 There are two ways of invocation: locking-shift and single-shift.
2215 With locking-shift, the invocation lasts until the next different
2216 invocation, whereas with single-shift, the invocation affects the
2217 following character only and doesn't affect the locking-shift
2218 state. Invocations are done by the following control characters or
2221 ----------------------------------------------------------------------
2222 abbrev function cntrl escape seq description
2223 ----------------------------------------------------------------------
2224 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2225 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2226 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2227 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2228 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2229 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2230 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2231 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2232 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2233 ----------------------------------------------------------------------
2234 (*) These are not used by any known coding system.
2236 Control characters for these functions are defined by macros
2237 ISO_CODE_XXX in `coding.h'.
2239 Designations are done by the following escape sequences:
2240 ----------------------------------------------------------------------
2241 escape sequence description
2242 ----------------------------------------------------------------------
2243 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2244 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2245 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2246 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2247 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2248 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2249 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2250 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2251 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2252 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2253 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2254 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2255 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2256 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2257 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2258 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2259 ----------------------------------------------------------------------
2261 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2262 of dimension 1, chars 94, and final character <F>, etc...
2264 Note (*): Although these designations are not allowed in ISO2022,
2265 Emacs accepts them on decoding, and produces them on encoding
2266 CHARS96 character sets in a coding system which is characterized as
2267 7-bit environment, non-locking-shift, and non-single-shift.
2269 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2270 '(' must be omitted. We refer to this as "short-form" hereafter.
2272 Now you may notice that there are a lot of ways of encoding the
2273 same multilingual text in ISO2022. Actually, there exist many
2274 coding systems such as Compound Text (used in X11's inter-client
2275 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2276 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2277 localized platforms), and all of these are variants of ISO2022.
2279 In addition to the above, Emacs handles two more kinds of escape
2280 sequences: ISO6429's direction specification and Emacs' private
2281 sequence for specifying character composition.
2283 ISO6429's direction specification takes the following form:
2284 o CSI ']' -- end of the current direction
2285 o CSI '0' ']' -- end of the current direction
2286 o CSI '1' ']' -- start of left-to-right text
2287 o CSI '2' ']' -- start of right-to-left text
2288 The control character CSI (0x9B: control sequence introducer) is
2289 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2291 Character composition specification takes the following form:
2292 o ESC '0' -- start relative composition
2293 o ESC '1' -- end composition
2294 o ESC '2' -- start rule-base composition (*)
2295 o ESC '3' -- start relative composition with alternate chars (**)
2296 o ESC '4' -- start rule-base composition with alternate chars (**)
2297 Since these are not standard escape sequences of any ISO standard,
2298 the use of them with these meanings is restricted to Emacs only.
2300 (*) This form is used only in Emacs 20.7 and older versions,
2301 but newer versions can safely decode it.
2302 (**) This form is used only in Emacs 21.1 and newer versions,
2303 and older versions can't decode it.
2305 Here's a list of example usages of these composition escape
2306 sequences (categorized by `enum composition_method').
2308 COMPOSITION_RELATIVE:
2309 ESC 0 CHAR [ CHAR ] ESC 1
2310 COMPOSITION_WITH_RULE:
2311 ESC 2 CHAR [ RULE CHAR ] ESC 1
2312 COMPOSITION_WITH_ALTCHARS:
2313 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2314 COMPOSITION_WITH_RULE_ALTCHARS:
2315 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table of 256 entries of enum iso_code_class_type.
   NOTE(review): presumably indexed by a byte value and giving the
   ISO 2022 class of that byte -- confirm against the code that
   initializes and reads this table.  */
2317 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID does not exceed CODING's max_charset_id and
   the entry for ID in CODING's safe_charsets array is >= 0.  The
   bound is tested first, so the array is not read for a larger ID.
   ID is evaluated twice and CODING up to twice; pass expressions
   without side effects.
   NOTE(review): if the elements of safe_charsets are of an unsigned
   type, the `>= 0' test is always true -- confirm the element type
   and the value used to mark an unsafe charset.  */
2319 #define SAFE_CHARSET_P(coding, id) \
2320 ((id) <= (coding)->max_charset_id \
2321 && (coding)->safe_charsets[id] >= 0)
/* Nonzero if CODING_ISO_INITIAL, applied to the coding system stored
   in coding_categories[CATEGORY] with index 1, is non-negative.
   NOTE(review): presumably index 1 is graphic register G1 (the one
   shift-out invokes into GL) and a negative value means nothing is
   initially designated there -- confirm against CODING_ISO_INITIAL.  */
2324 #define SHIFT_OUT_OK(category) \
2325 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2328 setup_iso_safe_charsets (attrs
)
2331 Lisp_Object charset_list
, safe_charsets
;
2332 Lisp_Object request
;
2333 Lisp_Object reg_usage
;
2336 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2339 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2340 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2341 && ! EQ (charset_list
, Viso_2022_charset_list
))
2343 CODING_ATTR_CHARSET_LIST (attrs
)
2344 = charset_list
= Viso_2022_charset_list
;
2345 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2348 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2352 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2354 int id
= XINT (XCAR (tail
));
2355 if (max_charset_id
< id
)
2356 max_charset_id
= id
;
2359 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2361 request
= AREF (attrs
, coding_attr_iso_request
);
2362 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2363 reg94
= XINT (XCAR (reg_usage
));
2364 reg96
= XINT (XCDR (reg_usage
));
2366 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2370 struct charset
*charset
;
2373 charset
= CHARSET_FROM_ID (XINT (id
));
2374 reg
= Fcdr (Fassq (id
, request
));
2376 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2377 else if (charset
->iso_chars_96
)
2380 SSET (safe_charsets
, XINT (id
), reg96
);
2385 SSET (safe_charsets
, XINT (id
), reg94
);
2388 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2392 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2393 Check if a text is encoded in one of ISO-2022 based coding systems.
2394 If it is, return 1, else return 0. */
2397 detect_coding_iso_2022 (coding
, detect_info
)
2398 struct coding_system
*coding
;
2399 struct coding_detection_info
*detect_info
;
2401 const unsigned char *src
= coding
->source
, *src_base
= src
;
2402 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2403 int multibytep
= coding
->src_multibyte
;
2404 int single_shifting
= 0;
2407 int consumed_chars
= 0;
2412 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2414 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2416 struct coding_system
*this = &(coding_categories
[i
]);
2417 Lisp_Object attrs
, val
;
2419 attrs
= CODING_ID_ATTRS (this->id
);
2420 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2421 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2422 setup_iso_safe_charsets (attrs
);
2423 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2424 this->max_charset_id
= SCHARS (val
) - 1;
2425 this->safe_charsets
= (char *) SDATA (val
);
2428 /* A coding system of this category is always ASCII compatible. */
2429 src
+= coding
->head_ascii
;
2431 while (rejected
!= CATEGORY_MASK_ISO
)
2437 if (inhibit_iso_escape_detection
)
2439 single_shifting
= 0;
2441 if (c
>= '(' && c
<= '/')
2443 /* Designation sequence for a charset of dimension 1. */
2445 if (c1
< ' ' || c1
>= 0x80
2446 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2447 /* Invalid designation sequence. Just ignore. */
2452 /* Designation sequence for a charset of dimension 2. */
2454 if (c
>= '@' && c
<= 'B')
2455 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2456 id
= iso_charset_table
[1][0][c
];
2457 else if (c
>= '(' && c
<= '/')
2460 if (c1
< ' ' || c1
>= 0x80
2461 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2462 /* Invalid designation sequence. Just ignore. */
2466 /* Invalid designation sequence. Just ignore it. */
2469 else if (c
== 'N' || c
== 'O')
2471 /* ESC <Fe> for SS2 or SS3. */
2472 single_shifting
= 1;
2473 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2476 else if (c
>= '0' && c
<= '4')
2478 /* ESC <Fp> for start/end composition. */
2479 found
|= CATEGORY_MASK_ISO
;
2484 /* Invalid escape sequence. Just ignore it. */
2488 /* We found a valid designation sequence for CHARSET. */
2489 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2490 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2492 found
|= CATEGORY_MASK_ISO_7
;
2494 rejected
|= CATEGORY_MASK_ISO_7
;
2495 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2497 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2499 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2500 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2502 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2504 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2505 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2507 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2509 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2514 /* Locking shift out/in. */
2515 if (inhibit_iso_escape_detection
)
2517 single_shifting
= 0;
2518 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2519 found
|= CATEGORY_MASK_ISO_ELSE
;
2523 /* Control sequence introducer. */
2524 single_shifting
= 0;
2525 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2526 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2527 goto check_extra_latin
;
2533 if (inhibit_iso_escape_detection
)
2535 single_shifting
= 1;
2536 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2537 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2538 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2539 found
|= CATEGORY_MASK_ISO_8_1
;
2540 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2541 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2542 found
|= CATEGORY_MASK_ISO_8_2
;
2543 goto check_extra_latin
;
2548 single_shifting
= 0;
2553 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2554 found
|= CATEGORY_MASK_ISO_8_1
;
2555 /* Check the length of succeeding codes of the range
2556 0xA0..0xFF. If the byte length is even, we include
2557 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2558 only when we are not single shifting. */
2559 if (! single_shifting
2560 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2563 while (src
< src_end
)
2571 if (i
& 1 && src
< src_end
)
2572 rejected
|= CATEGORY_MASK_ISO_8_2
;
2574 found
|= CATEGORY_MASK_ISO_8_2
;
2579 single_shifting
= 0;
2580 if (! VECTORP (Vlatin_extra_code_table
)
2581 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2583 rejected
= CATEGORY_MASK_ISO
;
2586 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2587 & CODING_ISO_FLAG_LATIN_EXTRA
)
2588 found
|= CATEGORY_MASK_ISO_8_1
;
2590 rejected
|= CATEGORY_MASK_ISO_8_1
;
2591 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2592 & CODING_ISO_FLAG_LATIN_EXTRA
)
2593 found
|= CATEGORY_MASK_ISO_8_2
;
2595 rejected
|= CATEGORY_MASK_ISO_8_2
;
2598 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2602 detect_info
->rejected
|= rejected
;
2603 detect_info
->found
|= (found
& ~rejected
);
2608 /* Set designation state into CODING. */
2609 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2613 if (final < '0' || final >= 128 \
2614 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2615 || !SAFE_CHARSET_P (coding, id)) \
2617 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2618 goto invalid_code; \
2620 prev = CODING_ISO_DESIGNATION (coding, reg); \
2621 if (id == charset_jisx0201_roman) \
2623 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2624 id = charset_ascii; \
2626 else if (id == charset_jisx0208_1978) \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2629 id = charset_jisx0208; \
2631 CODING_ISO_DESIGNATION (coding, reg) = id; \
2632 /* If there was an invalid designation to REG previously, and this \
2633 designation is ASCII to REG, we should keep this designation \
2635 if (prev == -2 && id == charset_ascii) \
2636 goto invalid_code; \
2640 #define MAYBE_FINISH_COMPOSITION() \
2643 if (composition_state == COMPOSING_NO) \
2645 /* It is assured that we have enough room for producing \
2646 characters stored in the table `components'. */ \
2647 if (charbuf + component_idx > charbuf_end) \
2648 goto no_more_source; \
2649 composition_state = COMPOSING_NO; \
2650 if (method == COMPOSITION_RELATIVE \
2651 || method == COMPOSITION_WITH_ALTCHARS) \
2653 for (i = 0; i < component_idx; i++) \
2654 *charbuf++ = components[i]; \
2655 char_offset += component_idx; \
2659 for (i = 0; i < component_idx; i += 2) \
2660 *charbuf++ = components[i]; \
2661 char_offset += (component_idx / 2) + 1; \
2666 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2667 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2668 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2669 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2670 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2673 #define DECODE_COMPOSITION_START(c1) \
2676 && composition_state == COMPOSING_COMPONENT_RULE) \
2678 component_len = component_idx; \
2679 composition_state = COMPOSING_CHAR; \
2683 const unsigned char *p; \
2685 MAYBE_FINISH_COMPOSITION (); \
2686 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2687 goto no_more_source; \
2688 for (p = src; p < src_end - 1; p++) \
2689 if (*p == ISO_CODE_ESC && p[1] == '1') \
2691 if (p == src_end - 1) \
2693 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2694 goto invalid_code; \
2695 goto no_more_source; \
2698 /* This is surely the start of a composition. */ \
2699 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2700 : c1 == '2' ? COMPOSITION_WITH_RULE \
2701 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2702 : COMPOSITION_WITH_RULE_ALTCHARS); \
2703 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2704 : COMPOSING_COMPONENT_CHAR); \
2705 component_idx = component_len = 0; \
2710 /* Handle composition end sequence ESC 1. */
2712 #define DECODE_COMPOSITION_END() \
2714 int nchars = (component_len > 0 ? component_idx - component_len \
2715 : method == COMPOSITION_RELATIVE ? component_idx \
2716 : (component_idx + 1) / 2); \
2718 int *saved_charbuf = charbuf; \
2719 int from = char_offset; \
2720 int to = from + nchars; \
2722 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2723 if (method != COMPOSITION_RELATIVE) \
2725 if (component_len == 0) \
2726 for (i = 0; i < component_idx; i++) \
2727 *charbuf++ = components[i]; \
2729 for (i = 0; i < component_len; i++) \
2730 *charbuf++ = components[i]; \
2731 *saved_charbuf = saved_charbuf - charbuf; \
2733 if (method == COMPOSITION_WITH_RULE) \
2734 for (i = 0; i < component_idx; i += 2, char_offset++) \
2735 *charbuf++ = components[i]; \
2737 for (i = component_len; i < component_idx; i++, char_offset++) \
2738 *charbuf++ = components[i]; \
2739 coding->annotated = 1; \
2740 composition_state = COMPOSING_NO; \
2744 /* Decode a composition rule from the byte C1 (and maybe one more byte
2745 from SRC) and store one encoded composition rule in
2746 coding->cmp_data. */
2748 #define DECODE_COMPOSITION_RULE(c1) \
2751 if (c1 < 81) /* old format (before ver.21) */ \
2753 int gref = (c1) / 9; \
2754 int nref = (c1) % 9; \
2755 if (gref == 4) gref = 10; \
2756 if (nref == 4) nref = 10; \
2757 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2759 else if (c1 < 93) /* new format (after ver.21) */ \
2761 ONE_MORE_BYTE (c2); \
2762 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2769 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2772 decode_coding_iso_2022 (coding
)
2773 struct coding_system
*coding
;
2775 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2776 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2777 const unsigned char *src_base
;
2778 int *charbuf
= coding
->charbuf
;
2780 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2781 int consumed_chars
= 0, consumed_chars_base
;
2782 int multibytep
= coding
->src_multibyte
;
2783 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2784 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2785 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2786 struct charset
*charset
;
2788 /* For handling composition sequence. */
2789 #define COMPOSING_NO 0
2790 #define COMPOSING_CHAR 1
2791 #define COMPOSING_RULE 2
2792 #define COMPOSING_COMPONENT_CHAR 3
2793 #define COMPOSING_COMPONENT_RULE 4
2795 int composition_state
= COMPOSING_NO
;
2796 enum composition_method method
;
2797 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2800 Lisp_Object attrs
, charset_list
;
2801 int char_offset
= coding
->produced_char
;
2802 int last_offset
= char_offset
;
2803 int last_id
= charset_ascii
;
2805 CODING_GET_INFO (coding
, attrs
, charset_list
);
2806 setup_iso_safe_charsets (attrs
);
2813 consumed_chars_base
= consumed_chars
;
2815 if (charbuf
>= charbuf_end
)
2820 /* We produce at most one character. */
2821 switch (iso_code_class
[c1
])
2823 case ISO_0x20_or_0x7F
:
2824 if (composition_state
!= COMPOSING_NO
)
2826 if (composition_state
== COMPOSING_RULE
2827 || composition_state
== COMPOSING_COMPONENT_RULE
)
2829 DECODE_COMPOSITION_RULE (c1
);
2830 components
[component_idx
++] = c1
;
2831 composition_state
--;
2835 if (charset_id_0
< 0
2836 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2837 /* This is SPACE or DEL. */
2838 charset
= CHARSET_FROM_ID (charset_ascii
);
2840 charset
= CHARSET_FROM_ID (charset_id_0
);
2843 case ISO_graphic_plane_0
:
2844 if (composition_state
!= COMPOSING_NO
)
2846 if (composition_state
== COMPOSING_RULE
2847 || composition_state
== COMPOSING_COMPONENT_RULE
)
2849 DECODE_COMPOSITION_RULE (c1
);
2850 components
[component_idx
++] = c1
;
2851 composition_state
--;
2855 charset
= CHARSET_FROM_ID (charset_id_0
);
2858 case ISO_0xA0_or_0xFF
:
2859 if (charset_id_1
< 0
2860 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2861 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2863 /* This is a graphic character, we fall down ... */
2865 case ISO_graphic_plane_1
:
2866 if (charset_id_1
< 0)
2868 charset
= CHARSET_FROM_ID (charset_id_1
);
2872 MAYBE_FINISH_COMPOSITION ();
2873 charset
= CHARSET_FROM_ID (charset_ascii
);
2877 MAYBE_FINISH_COMPOSITION ();
2881 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2882 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2884 CODING_ISO_INVOCATION (coding
, 0) = 1;
2885 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2889 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2891 CODING_ISO_INVOCATION (coding
, 0) = 0;
2892 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2895 case ISO_single_shift_2_7
:
2896 case ISO_single_shift_2
:
2897 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2899 /* SS2 is handled as an escape sequence of ESC 'N' */
2901 goto label_escape_sequence
;
2903 case ISO_single_shift_3
:
2904 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2906 /* SS3 is handled as an escape sequence of ESC 'O' */
2908 goto label_escape_sequence
;
2910 case ISO_control_sequence_introducer
:
2911 /* CSI is handled as an escape sequence of ESC '[' ... */
2913 goto label_escape_sequence
;
2917 label_escape_sequence
:
2918 /* Escape sequences handled here are invocation,
2919 designation, direction specification, and character
2920 composition specification. */
2923 case '&': /* revision of following character set */
2925 if (!(c1
>= '@' && c1
<= '~'))
2928 if (c1
!= ISO_CODE_ESC
)
2931 goto label_escape_sequence
;
2933 case '$': /* designation of 2-byte character set */
2934 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2937 if (c1
>= '@' && c1
<= 'B')
2938 { /* designation of JISX0208.1978, GB2312.1980,
2940 DECODE_DESIGNATION (0, 2, 0, c1
);
2942 else if (c1
>= 0x28 && c1
<= 0x2B)
2943 { /* designation of DIMENSION2_CHARS94 character set */
2945 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2947 else if (c1
>= 0x2C && c1
<= 0x2F)
2948 { /* designation of DIMENSION2_CHARS96 character set */
2950 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2954 /* We must update these variables now. */
2955 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2956 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2959 case 'n': /* invocation of locking-shift-2 */
2960 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2961 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2963 CODING_ISO_INVOCATION (coding
, 0) = 2;
2964 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2967 case 'o': /* invocation of locking-shift-3 */
2968 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2969 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2971 CODING_ISO_INVOCATION (coding
, 0) = 3;
2972 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2975 case 'N': /* invocation of single-shift-2 */
2976 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2977 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2979 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2981 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2985 case 'O': /* invocation of single-shift-3 */
2986 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2987 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2989 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2991 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2995 case '0': case '2': case '3': case '4': /* start composition */
2996 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2998 DECODE_COMPOSITION_START (c1
);
3001 case '1': /* end composition */
3002 if (composition_state
== COMPOSING_NO
)
3004 DECODE_COMPOSITION_END ();
3007 case '[': /* specification of direction */
3008 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
3010 /* For the moment, nested direction is not supported.
3011 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3012 left-to-right, and nonzero means right-to-left. */
3016 case ']': /* end of the current direction */
3017 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3019 case '0': /* end of the current direction */
3020 case '1': /* start of left-to-right direction */
3023 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3028 case '2': /* start of right-to-left direction */
3031 coding
->mode
|= CODING_MODE_DIRECTION
;
3045 /* CTEXT extended segment:
3046 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3047 We keep these bytes as is for the moment.
3048 They may be decoded by post-read-conversion. */
3052 ONE_MORE_BYTE (dim
);
3055 size
= ((M
- 128) * 128) + (L
- 128);
3056 if (charbuf
+ 8 + size
> charbuf_end
)
3058 *charbuf
++ = ISO_CODE_ESC
;
3062 *charbuf
++ = BYTE8_TO_CHAR (M
);
3063 *charbuf
++ = BYTE8_TO_CHAR (L
);
3067 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3072 /* XFree86 extension for embedding UTF-8 in CTEXT:
3073 ESC % G --UTF-8-BYTES-- ESC % @
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3078 if (p
+ 6 > charbuf_end
)
3080 *p
++ = ISO_CODE_ESC
;
3083 while (p
< charbuf_end
)
3086 if (c1
== ISO_CODE_ESC
3087 && src
+ 1 < src_end
3091 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3093 if (p
+ 3 > charbuf_end
)
3095 *p
++ = ISO_CODE_ESC
;
3106 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3108 if (c1
>= 0x28 && c1
<= 0x2B)
3109 { /* designation of DIMENSION1_CHARS94 character set */
3111 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3113 else if (c1
>= 0x2C && c1
<= 0x2F)
3114 { /* designation of DIMENSION1_CHARS96 character set */
3116 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3120 /* We must update these variables now. */
3121 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3122 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3127 if (charset
->id
!= charset_ascii
3128 && last_id
!= charset
->id
)
3130 if (last_id
!= charset_ascii
)
3131 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3132 last_id
= charset
->id
;
3133 last_offset
= char_offset
;
3136 /* Now we know CHARSET and 1st position code C1 of a character.
3137 Produce a decoded character while getting 2nd position code
3140 if (CHARSET_DIMENSION (charset
) > 1)
3143 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3144 /* C2 is not in a valid range. */
3146 c1
= (c1
<< 8) | (c2
& 0x7F);
3147 if (CHARSET_DIMENSION (charset
) > 2)
3150 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3151 /* C2 is not in a valid range. */
3153 c1
= (c1
<< 8) | (c2
& 0x7F);
3157 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3160 MAYBE_FINISH_COMPOSITION ();
3161 for (; src_base
< src
; src_base
++, char_offset
++)
3163 if (ASCII_BYTE_P (*src_base
))
3164 *charbuf
++ = *src_base
;
3166 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3169 else if (composition_state
== COMPOSING_NO
)
3176 components
[component_idx
++] = c
;
3177 if (method
== COMPOSITION_WITH_RULE
3178 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3179 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3180 composition_state
++;
3185 MAYBE_FINISH_COMPOSITION ();
3187 consumed_chars
= consumed_chars_base
;
3189 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3199 if (last_id
!= charset_ascii
)
3200 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3201 coding
->consumed_char
+= consumed_chars_base
;
3202 coding
->consumed
= src_base
- coding
->source
;
3203 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3207 /* ISO2022 encoding stuff. */
3210 It is not enough to say just "ISO2022" on encoding, we have to
3211 specify more details. In Emacs, each coding system of ISO2022
3212 variant has the following specifications:
3213 1. Initial designation to G0 thru G3.
3214 2. Allows short-form designation?
3215 3. ASCII should be designated to G0 before control characters?
3216 4. ASCII should be designated to G0 at end of line?
3217 5. 7-bit environment or 8-bit environment?
3218 6. Use locking-shift?
3219 7. Use Single-shift?
3220 And the following two are only for Japanese:
3221 8. Use ASCII in place of JIS0201-1976-Roman?
3222 9. Use JISX0208-1983 in place of JISX0208-1978?
3223 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3224 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3228 /* Produce codes (escape sequence) for designating CHARSET to graphic
3229 register REG at DST, and increment DST. If <final-char> of CHARSET is
3230 '@', 'A', or 'B' and the coding system CODING allows, produce
3231 designation sequence of short-form. */
3233 #define ENCODE_DESIGNATION(charset, reg, coding) \
3235 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3236 char *intermediate_char_94 = "()*+"; \
3237 char *intermediate_char_96 = ",-./"; \
3238 int revision = -1; \
3241 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3242 revision = CHARSET_ISO_REVISION (charset); \
3244 if (revision >= 0) \
3246 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3247 EMIT_ONE_BYTE ('@' + revision); \
3249 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3250 if (CHARSET_DIMENSION (charset) == 1) \
3252 if (! CHARSET_ISO_CHARS_96 (charset)) \
3253 c = intermediate_char_94[reg]; \
3255 c = intermediate_char_96[reg]; \
3256 EMIT_ONE_ASCII_BYTE (c); \
3260 EMIT_ONE_ASCII_BYTE ('$'); \
3261 if (! CHARSET_ISO_CHARS_96 (charset)) \
3263 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3265 || final_char < '@' || final_char > 'B') \
3266 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3269 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3271 EMIT_ONE_ASCII_BYTE (final_char); \
3273 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3277 /* The following two macros produce codes (control character or escape
3278 sequence) for ISO2022 single-shift functions (single-shift-2 and
3281 #define ENCODE_SINGLE_SHIFT_2 \
3283 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3284 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3286 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3287 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3291 #define ENCODE_SINGLE_SHIFT_3 \
3293 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3294 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3296 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3297 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3301 /* The following four macros produce codes (control character or
3302 escape sequence) for ISO2022 locking-shift functions (shift-in,
3303 shift-out, locking-shift-2, and locking-shift-3). */
3305 #define ENCODE_SHIFT_IN \
3307 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3308 CODING_ISO_INVOCATION (coding, 0) = 0; \
3312 #define ENCODE_SHIFT_OUT \
3314 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3315 CODING_ISO_INVOCATION (coding, 0) = 1; \
3319 #define ENCODE_LOCKING_SHIFT_2 \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3322 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit the locking-shift-3 sequence ESC 'o' and record that graphic
   register 3 is now invoked to graphic plane 0.  The final byte must
   be 'o'; ESC 'n' is locking-shift-2, and the decoder in this file
   treats ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
3333 /* Produce codes for a DIMENSION1 character whose character set is
3334 CHARSET and whose position-code is C1. Designation and invocation
3335 sequences are also produced in advance if necessary. */
3337 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3339 int id = CHARSET_ID (charset); \
3341 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3342 && id == charset_ascii) \
3344 id = charset_jisx0201_roman; \
3345 charset = CHARSET_FROM_ID (id); \
3348 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3350 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3351 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3353 EMIT_ONE_BYTE (c1 | 0x80); \
3354 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3357 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3359 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3362 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3364 EMIT_ONE_BYTE (c1 | 0x80); \
3368 /* Since CHARSET is not yet invoked to any graphic planes, we \
3369 must invoke it, or, at first, designate it to some graphic \
3370 register. Then repeat the loop to actually produce the \
3372 dst = encode_invocation_designation (charset, coding, dst, \
3377 /* Produce codes for a DIMENSION2 character whose character set is
3378 CHARSET and whose position-codes are C1 and C2. Designation and
3379 invocation codes are also produced in advance if necessary. */
3381 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3383 int id = CHARSET_ID (charset); \
3385 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3386 && id == charset_jisx0208) \
3388 id = charset_jisx0208_1978; \
3389 charset = CHARSET_FROM_ID (id); \
3392 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3394 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3395 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3397 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3398 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3401 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3403 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3406 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3408 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3412 /* Since CHARSET is not yet invoked to any graphic planes, we \
3413 must invoke it, or, at first, designate it to some graphic \
3414 register. Then repeat the loop to actually produce the \
3416 dst = encode_invocation_designation (charset, coding, dst, \
3421 #define ENCODE_ISO_CHARACTER(charset, c) \
3423 int code = ENCODE_CHAR ((charset),(c)); \
3425 if (CHARSET_DIMENSION (charset) == 1) \
3426 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3428 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3432 /* Produce designation and invocation codes at a place pointed by DST
3433 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3437 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3438 struct charset
*charset
;
3439 struct coding_system
*coding
;
3443 int multibytep
= coding
->dst_multibyte
;
3444 int produced_chars
= *p_nchars
;
3445 int reg
; /* graphic register number */
3446 int id
= CHARSET_ID (charset
);
3448 /* At first, check designations. */
3449 for (reg
= 0; reg
< 4; reg
++)
3450 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3455 /* CHARSET is not yet designated to any graphic registers. */
3456 /* At first check the requested designation. */
3457 reg
= CODING_ISO_REQUEST (coding
, id
);
3459 /* Since CHARSET requests no special designation, designate it
3460 to graphic register 0. */
3463 ENCODE_DESIGNATION (charset
, reg
, coding
);
3466 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3467 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3469 /* Since the graphic register REG is not invoked to any graphic
3470 planes, invoke it to graphic plane 0. */
3473 case 0: /* graphic register 0 */
3477 case 1: /* graphic register 1 */
3481 case 2: /* graphic register 2 */
3482 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3483 ENCODE_SINGLE_SHIFT_2
;
3485 ENCODE_LOCKING_SHIFT_2
;
3488 case 3: /* graphic register 3 */
3489 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3490 ENCODE_SINGLE_SHIFT_3
;
3492 ENCODE_LOCKING_SHIFT_3
;
3497 *p_nchars
= produced_chars
;
3501 /* The following three macros produce codes for indicating direction
/* Emit a control sequence introducer: the two-byte form ESC '[' when
   the seven-bits flag of CODING is set, otherwise the single byte
   ISO_CODE_CSI.  The flag word is a bit mask, so the flag is tested
   with `&'; comparing the whole word with `==' would select the
   seven-bit form only when no other flag happened to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)		\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   after its `while (0)' and break the expansion.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Emit the sequence that ends the current direction: a control
   sequence introducer followed by '0' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   after its `while (0)' and break the expansion.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3526 /* Produce codes for designation and invocation to reset the graphic
3527 planes and registers to initial state. */
3528 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3531 struct charset *charset; \
3533 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3535 for (reg = 0; reg < 4; reg++) \
3536 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3537 && (CODING_ISO_DESIGNATION (coding, reg) \
3538 != CODING_ISO_INITIAL (coding, reg))) \
3540 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3541 ENCODE_DESIGNATION (charset, reg, coding); \
3546 /* Produce designation sequences of charsets in the line started from
3547 SRC to a place pointed by DST, and return updated DST.
3549 If the current block ends before any end-of-line, we may fail to
3550 find all the necessary designations. */
3552 static unsigned char *
3553 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3554 struct coding_system
*coding
;
3555 int *charbuf
, *charbuf_end
;
3558 struct charset
*charset
;
3559 /* Table of charsets to be designated to each graphic register. */
3561 int c
, found
= 0, reg
;
3562 int produced_chars
= 0;
3563 int multibytep
= coding
->dst_multibyte
;
3565 Lisp_Object charset_list
;
3567 attrs
= CODING_ID_ATTRS (coding
->id
);
3568 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3569 if (EQ (charset_list
, Qiso_2022
))
3570 charset_list
= Viso_2022_charset_list
;
3572 for (reg
= 0; reg
< 4; reg
++)
3582 charset
= char_charset (c
, charset_list
, NULL
);
3583 id
= CHARSET_ID (charset
);
3584 reg
= CODING_ISO_REQUEST (coding
, id
);
3585 if (reg
>= 0 && r
[reg
] < 0)
3594 for (reg
= 0; reg
< 4; reg
++)
3596 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3597 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3603 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3606 encode_coding_iso_2022 (coding
)
3607 struct coding_system
*coding
;
3609 int multibytep
= coding
->dst_multibyte
;
3610 int *charbuf
= coding
->charbuf
;
3611 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3612 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3613 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3616 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3617 && CODING_ISO_BOL (coding
));
3618 int produced_chars
= 0;
3619 Lisp_Object attrs
, eol_type
, charset_list
;
3620 int ascii_compatible
;
3622 int preferred_charset_id
= -1;
3624 CODING_GET_INFO (coding
, attrs
, charset_list
);
3625 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
3626 if (VECTORP (eol_type
))
3629 setup_iso_safe_charsets (attrs
);
3630 /* Charset list may have been changed. */
3631 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3632 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3634 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3636 while (charbuf
< charbuf_end
)
3638 ASSURE_DESTINATION (safe_room
);
3640 if (bol_designation
)
3642 unsigned char *dst_prev
= dst
;
3644 /* We have to produce designation sequences if any now. */
3645 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3646 bol_designation
= 0;
3647 /* We are sure that designation sequences are all ASCII bytes. */
3648 produced_chars
+= dst
- dst_prev
;
3655 /* Handle an annotation. */
3658 case CODING_ANNOTATE_COMPOSITION_MASK
:
3659 /* Not yet implemented. */
3661 case CODING_ANNOTATE_CHARSET_MASK
:
3662 preferred_charset_id
= charbuf
[3];
3663 if (preferred_charset_id
>= 0
3664 && NILP (Fmemq (make_number (preferred_charset_id
),
3666 preferred_charset_id
= -1;
3675 /* Now encode the character C. */
3676 if (c
< 0x20 || c
== 0x7F)
3679 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3681 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3682 ENCODE_RESET_PLANE_AND_REGISTER ();
3683 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3687 for (i
= 0; i
< 4; i
++)
3688 CODING_ISO_DESIGNATION (coding
, i
)
3689 = CODING_ISO_INITIAL (coding
, i
);
3692 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3694 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3695 ENCODE_RESET_PLANE_AND_REGISTER ();
3696 EMIT_ONE_ASCII_BYTE (c
);
3698 else if (ASCII_CHAR_P (c
))
3700 if (ascii_compatible
)
3701 EMIT_ONE_ASCII_BYTE (c
);
3704 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3705 ENCODE_ISO_CHARACTER (charset
, c
);
3708 else if (CHAR_BYTE8_P (c
))
3710 c
= CHAR_TO_BYTE8 (c
);
3715 struct charset
*charset
;
3717 if (preferred_charset_id
>= 0)
3719 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3720 if (! CHAR_CHARSET_P (c
, charset
))
3721 charset
= char_charset (c
, charset_list
, NULL
);
3724 charset
= char_charset (c
, charset_list
, NULL
);
3727 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3729 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3730 charset
= CHARSET_FROM_ID (charset_ascii
);
3734 c
= coding
->default_char
;
3735 charset
= char_charset (c
, charset_list
, NULL
);
3738 ENCODE_ISO_CHARACTER (charset
, c
);
3742 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3743 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3745 ASSURE_DESTINATION (safe_room
);
3746 ENCODE_RESET_PLANE_AND_REGISTER ();
3748 coding
->result
= CODING_RESULT_SUCCESS
;
3749 CODING_ISO_BOL (coding
) = bol_designation
;
3750 coding
->produced_char
+= produced_chars
;
3751 coding
->produced
= dst
- coding
->destination
;
3756 /*** 8. SJIS and BIG5 handlers ***/
3758 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3759 quite widely. So, for the moment, Emacs supports them in the bare
3760 C code. But, in the future, they may be supported only by CCL. */
3762 /* SJIS is a coding system encoding three character sets: ASCII, right
3763 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3764 as is. A character of charset katakana-jisx0201 is encoded by
3765 "position-code + 0x80". A character of charset japanese-jisx0208
3766 is encoded in 2 bytes, but the two position-codes are divided and shifted
3767 so that they fit in the range below.
3769 --- CODE RANGE of SJIS ---
3770 (character set) (range)
3772 KATAKANA-JISX0201 0xA0 .. 0xDF
3773 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3774 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3775 -------------------------------
3779 /* BIG5 is a coding system encoding two character sets: ASCII and
3780 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3781 character set and is encoded in two bytes.
3783 --- CODE RANGE of BIG5 ---
3784 (character set) (range)
3786 Big5 (1st byte) 0xA1 .. 0xFE
3787 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3788 --------------------------
3792 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3793 Check if a text is encoded in SJIS. If it is, return
3794 CATEGORY_MASK_SJIS, else return 0. */
3797 detect_coding_sjis (coding
, detect_info
)
3798 struct coding_system
*coding
;
3799 struct coding_detection_info
*detect_info
;
3801 const unsigned char *src
= coding
->source
, *src_base
= src
;
3802 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3803 int multibytep
= coding
->src_multibyte
;
3804 int consumed_chars
= 0;
3809 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3810 /* A coding system of this category is always ASCII compatible. */
3811 src
+= coding
->head_ascii
;
3820 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3823 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3825 found
= CATEGORY_MASK_SJIS
;
3827 else if (c
>= 0xA0 && c
< 0xE0)
3828 found
= CATEGORY_MASK_SJIS
;
3832 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3836 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3838 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3841 detect_info
->found
|= found
;
3845 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3846 Check if a text is encoded in BIG5. If it is, return
3847 CATEGORY_MASK_BIG5, else return 0. */
3850 detect_coding_big5 (coding
, detect_info
)
3851 struct coding_system
*coding
;
3852 struct coding_detection_info
*detect_info
;
3854 const unsigned char *src
= coding
->source
, *src_base
= src
;
3855 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3856 int multibytep
= coding
->src_multibyte
;
3857 int consumed_chars
= 0;
3862 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3863 /* A coding system of this category is always ASCII compatible. */
3864 src
+= coding
->head_ascii
;
3876 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3878 found
= CATEGORY_MASK_BIG5
;
3883 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3887 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3889 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3892 detect_info
->found
|= found
;
3896 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3897 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3900 decode_coding_sjis (coding
)
3901 struct coding_system
*coding
;
3903 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3904 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3905 const unsigned char *src_base
;
3906 int *charbuf
= coding
->charbuf
;
3907 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3908 int consumed_chars
= 0, consumed_chars_base
;
3909 int multibytep
= coding
->src_multibyte
;
3910 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3911 Lisp_Object attrs
, charset_list
, val
;
3912 int char_offset
= coding
->produced_char
;
3913 int last_offset
= char_offset
;
3914 int last_id
= charset_ascii
;
3916 CODING_GET_INFO (coding
, attrs
, charset_list
);
3919 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3920 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3921 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3926 struct charset
*charset
;
3929 consumed_chars_base
= consumed_chars
;
3931 if (charbuf
>= charbuf_end
)
3937 charset
= charset_roman
;
3942 if (c
< 0xA0 || c
>= 0xE0)
3944 /* SJIS -> JISX0208 */
3946 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3950 charset
= charset_kanji
;
3954 /* SJIS -> JISX0201-Kana */
3956 charset
= charset_kana
;
3961 if (charset
->id
!= charset_ascii
3962 && last_id
!= charset
->id
)
3964 if (last_id
!= charset_ascii
)
3965 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3966 last_id
= charset
->id
;
3967 last_offset
= char_offset
;
3969 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3976 consumed_chars
= consumed_chars_base
;
3978 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3984 if (last_id
!= charset_ascii
)
3985 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3986 coding
->consumed_char
+= consumed_chars_base
;
3987 coding
->consumed
= src_base
- coding
->source
;
3988 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3992 decode_coding_big5 (coding
)
3993 struct coding_system
*coding
;
3995 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3996 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3997 const unsigned char *src_base
;
3998 int *charbuf
= coding
->charbuf
;
3999 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4000 int consumed_chars
= 0, consumed_chars_base
;
4001 int multibytep
= coding
->src_multibyte
;
4002 struct charset
*charset_roman
, *charset_big5
;
4003 Lisp_Object attrs
, charset_list
, val
;
4004 int char_offset
= coding
->produced_char
;
4005 int last_offset
= char_offset
;
4006 int last_id
= charset_ascii
;
4008 CODING_GET_INFO (coding
, attrs
, charset_list
);
4010 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4011 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4016 struct charset
*charset
;
4019 consumed_chars_base
= consumed_chars
;
4021 if (charbuf
>= charbuf_end
)
4027 charset
= charset_roman
;
4031 if (c
< 0xA1 || c
> 0xFE)
4034 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4037 charset
= charset_big5
;
4039 if (charset
->id
!= charset_ascii
4040 && last_id
!= charset
->id
)
4042 if (last_id
!= charset_ascii
)
4043 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4044 last_id
= charset
->id
;
4045 last_offset
= char_offset
;
4047 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4054 consumed_chars
= consumed_chars_base
;
4056 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4062 if (last_id
!= charset_ascii
)
4063 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4064 coding
->consumed_char
+= consumed_chars_base
;
4065 coding
->consumed
= src_base
- coding
->source
;
4066 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4069 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4070 This function can encode charsets `ascii', `katakana-jisx0201',
4071 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4072 are sure that all these charsets are registered as official charsets
4073 (i.e. do not have extended leading-codes). Characters of other
4074 charsets are produced without any encoding. If SJIS_P is 1, encode
4075 SJIS text, else encode BIG5 text. */
4078 encode_coding_sjis (coding
)
4079 struct coding_system
*coding
;
4081 int multibytep
= coding
->dst_multibyte
;
4082 int *charbuf
= coding
->charbuf
;
4083 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4084 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4085 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4087 int produced_chars
= 0;
4088 Lisp_Object attrs
, charset_list
, val
;
4089 int ascii_compatible
;
4090 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4093 CODING_GET_INFO (coding
, attrs
, charset_list
);
4095 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4096 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4097 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4099 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4101 while (charbuf
< charbuf_end
)
4103 ASSURE_DESTINATION (safe_room
);
4105 /* Now encode the character C. */
4106 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4107 EMIT_ONE_ASCII_BYTE (c
);
4108 else if (CHAR_BYTE8_P (c
))
4110 c
= CHAR_TO_BYTE8 (c
);
4116 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4120 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4122 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4123 charset
= CHARSET_FROM_ID (charset_ascii
);
4127 c
= coding
->default_char
;
4128 charset
= char_charset (c
, charset_list
, &code
);
4131 if (code
== CHARSET_INVALID_CODE (charset
))
4133 if (charset
== charset_kanji
)
4137 c1
= code
>> 8, c2
= code
& 0xFF;
4138 EMIT_TWO_BYTES (c1
, c2
);
4140 else if (charset
== charset_kana
)
4141 EMIT_ONE_BYTE (code
| 0x80);
4143 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4146 coding
->result
= CODING_RESULT_SUCCESS
;
4147 coding
->produced_char
+= produced_chars
;
4148 coding
->produced
= dst
- coding
->destination
;
4153 encode_coding_big5 (coding
)
4154 struct coding_system
*coding
;
4156 int multibytep
= coding
->dst_multibyte
;
4157 int *charbuf
= coding
->charbuf
;
4158 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4159 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4160 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4162 int produced_chars
= 0;
4163 Lisp_Object attrs
, charset_list
, val
;
4164 int ascii_compatible
;
4165 struct charset
*charset_roman
, *charset_big5
;
4168 CODING_GET_INFO (coding
, attrs
, charset_list
);
4170 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4171 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4172 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4174 while (charbuf
< charbuf_end
)
4176 ASSURE_DESTINATION (safe_room
);
4178 /* Now encode the character C. */
4179 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4180 EMIT_ONE_ASCII_BYTE (c
);
4181 else if (CHAR_BYTE8_P (c
))
4183 c
= CHAR_TO_BYTE8 (c
);
4189 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4193 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4195 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4196 charset
= CHARSET_FROM_ID (charset_ascii
);
4200 c
= coding
->default_char
;
4201 charset
= char_charset (c
, charset_list
, &code
);
4204 if (code
== CHARSET_INVALID_CODE (charset
))
4206 if (charset
== charset_big5
)
4210 c1
= code
>> 8, c2
= code
& 0xFF;
4211 EMIT_TWO_BYTES (c1
, c2
);
4214 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4217 coding
->result
= CODING_RESULT_SUCCESS
;
4218 coding
->produced_char
+= produced_chars
;
4219 coding
->produced
= dst
- coding
->destination
;
4224 /*** 9. CCL handlers ***/
4226 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4227 Check if a text is encoded in a coding system whose
4228 encoder/decoder are written as CCL programs. If it is, return
4229 CATEGORY_MASK_CCL, else return 0. */
4232 detect_coding_ccl (coding
, detect_info
)
4233 struct coding_system
*coding
;
4234 struct coding_detection_info
*detect_info
;
4236 const unsigned char *src
= coding
->source
, *src_base
= src
;
4237 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4238 int multibytep
= coding
->src_multibyte
;
4239 int consumed_chars
= 0;
4241 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4242 int head_ascii
= coding
->head_ascii
;
4245 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4247 coding
= &coding_categories
[coding_category_ccl
];
4248 attrs
= CODING_ID_ATTRS (coding
->id
);
4249 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4258 if ((valids
[c
] > 1))
4259 found
= CATEGORY_MASK_CCL
;
4261 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4265 detect_info
->found
|= found
;
4270 decode_coding_ccl (coding
)
4271 struct coding_system
*coding
;
4273 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4274 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4275 int *charbuf
= coding
->charbuf
;
4276 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4277 int consumed_chars
= 0;
4278 int multibytep
= coding
->src_multibyte
;
4279 struct ccl_program ccl
;
4280 int source_charbuf
[1024];
4281 int source_byteidx
[1024];
4282 Lisp_Object attrs
, charset_list
;
4284 CODING_GET_INFO (coding
, attrs
, charset_list
);
4285 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4287 while (src
< src_end
)
4289 const unsigned char *p
= src
;
4290 int *source
, *source_end
;
4294 while (i
< 1024 && p
< src_end
)
4296 source_byteidx
[i
] = p
- src
;
4297 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4300 while (i
< 1024 && p
< src_end
)
4301 source_charbuf
[i
++] = *p
++;
4303 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4306 source
= source_charbuf
;
4307 source_end
= source
+ i
;
4308 while (source
< source_end
)
4310 ccl_driver (&ccl
, source
, charbuf
,
4311 source_end
- source
, charbuf_end
- charbuf
,
4313 source
+= ccl
.consumed
;
4314 charbuf
+= ccl
.produced
;
4315 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4318 if (source
< source_end
)
4319 src
+= source_byteidx
[source
- source_charbuf
];
4322 consumed_chars
+= source
- source_charbuf
;
4324 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4325 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4331 case CCL_STAT_SUSPEND_BY_SRC
:
4332 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4334 case CCL_STAT_SUSPEND_BY_DST
:
4337 case CCL_STAT_INVALID_CMD
:
4338 coding
->result
= CODING_RESULT_INTERRUPT
;
4341 coding
->result
= CODING_RESULT_SUCCESS
;
4344 coding
->consumed_char
+= consumed_chars
;
4345 coding
->consumed
= src
- coding
->source
;
4346 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4350 encode_coding_ccl (coding
)
4351 struct coding_system
*coding
;
4353 struct ccl_program ccl
;
4354 int multibytep
= coding
->dst_multibyte
;
4355 int *charbuf
= coding
->charbuf
;
4356 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4357 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4358 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4359 unsigned char *adjusted_dst_end
= dst_end
- 1;
4360 int destination_charbuf
[1024];
4361 int i
, produced_chars
= 0;
4362 Lisp_Object attrs
, charset_list
;
4364 CODING_GET_INFO (coding
, attrs
, charset_list
);
4365 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4367 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4368 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4370 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4372 int dst_bytes
= dst_end
- dst
;
4373 if (dst_bytes
> 1024)
4376 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4377 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4378 charbuf
+= ccl
.consumed
;
4380 for (i
= 0; i
< ccl
.produced
; i
++)
4381 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4384 for (i
= 0; i
< ccl
.produced
; i
++)
4385 *dst
++ = destination_charbuf
[i
] & 0xFF;
4386 produced_chars
+= ccl
.produced
;
4392 case CCL_STAT_SUSPEND_BY_SRC
:
4393 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4395 case CCL_STAT_SUSPEND_BY_DST
:
4396 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4399 case CCL_STAT_INVALID_CMD
:
4400 coding
->result
= CODING_RESULT_INTERRUPT
;
4403 coding
->result
= CODING_RESULT_SUCCESS
;
4407 coding
->produced_char
+= produced_chars
;
4408 coding
->produced
= dst
- coding
->destination
;
4414 /*** 10, 11. no-conversion handlers ***/
4416 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4419 decode_coding_raw_text (coding
)
4420 struct coding_system
*coding
;
4422 coding
->chars_at_source
= 1;
4423 coding
->consumed_char
= 0;
4424 coding
->consumed
= 0;
4425 coding
->result
= CODING_RESULT_SUCCESS
;
4429 encode_coding_raw_text (coding
)
4430 struct coding_system
*coding
;
4432 int multibytep
= coding
->dst_multibyte
;
4433 int *charbuf
= coding
->charbuf
;
4434 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4435 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4436 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4437 int produced_chars
= 0;
4442 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4444 if (coding
->src_multibyte
)
4445 while (charbuf
< charbuf_end
)
4447 ASSURE_DESTINATION (safe_room
);
4449 if (ASCII_CHAR_P (c
))
4450 EMIT_ONE_ASCII_BYTE (c
);
4451 else if (CHAR_BYTE8_P (c
))
4453 c
= CHAR_TO_BYTE8 (c
);
4458 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4460 CHAR_STRING_ADVANCE (c
, p1
);
4463 EMIT_ONE_BYTE (*p0
);
4469 while (charbuf
< charbuf_end
)
4471 ASSURE_DESTINATION (safe_room
);
4478 if (coding
->src_multibyte
)
4480 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4482 while (charbuf
< charbuf_end
)
4484 ASSURE_DESTINATION (safe_room
);
4486 if (ASCII_CHAR_P (c
))
4488 else if (CHAR_BYTE8_P (c
))
4489 *dst
++ = CHAR_TO_BYTE8 (c
);
4491 CHAR_STRING_ADVANCE (c
, dst
);
4497 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4498 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4499 *dst
++ = *charbuf
++;
4500 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4503 coding
->result
= CODING_RESULT_SUCCESS
;
4504 coding
->produced_char
+= produced_chars
;
4505 coding
->produced
= dst
- coding
->destination
;
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510 Check if a text is encoded in a charset-based coding system. If it
4511 is, return 1, else return 0. */
4514 detect_coding_charset (coding
, detect_info
)
4515 struct coding_system
*coding
;
4516 struct coding_detection_info
*detect_info
;
4518 const unsigned char *src
= coding
->source
, *src_base
= src
;
4519 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4520 int multibytep
= coding
->src_multibyte
;
4521 int consumed_chars
= 0;
4522 Lisp_Object attrs
, valids
;
4525 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4527 coding
= &coding_categories
[coding_category_charset
];
4528 attrs
= CODING_ID_ATTRS (coding
->id
);
4529 valids
= AREF (attrs
, coding_attr_charset_valids
);
4531 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4532 src
+= coding
->head_ascii
;
4539 if (NILP (AREF (valids
, c
)))
4542 found
= CATEGORY_MASK_CHARSET
;
4544 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4548 detect_info
->found
|= found
;
4553 decode_coding_charset (coding
)
4554 struct coding_system
*coding
;
4556 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4557 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4558 const unsigned char *src_base
;
4559 int *charbuf
= coding
->charbuf
;
4560 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4561 int consumed_chars
= 0, consumed_chars_base
;
4562 int multibytep
= coding
->src_multibyte
;
4563 Lisp_Object attrs
, charset_list
, valids
;
4564 int char_offset
= coding
->produced_char
;
4565 int last_offset
= char_offset
;
4566 int last_id
= charset_ascii
;
4568 CODING_GET_INFO (coding
, attrs
, charset_list
);
4569 valids
= AREF (attrs
, coding_attr_charset_valids
);
4575 struct charset
*charset
;
4581 consumed_chars_base
= consumed_chars
;
4583 if (charbuf
>= charbuf_end
)
4589 val
= AREF (valids
, c
);
4594 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4595 dim
= CHARSET_DIMENSION (charset
);
4599 code
= (code
<< 8) | c
;
4602 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4607 /* VAL is a list of charset IDs. It is assured that the
4608 list is sorted by charset dimensions (smaller one
4612 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4613 dim
= CHARSET_DIMENSION (charset
);
4617 code
= (code
<< 8) | c
;
4620 CODING_DECODE_CHAR (coding
, src
, src_base
,
4621 src_end
, charset
, code
, c
);
4629 if (charset
->id
!= charset_ascii
4630 && last_id
!= charset
->id
)
4632 if (last_id
!= charset_ascii
)
4633 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4634 last_id
= charset
->id
;
4635 last_offset
= char_offset
;
4644 consumed_chars
= consumed_chars_base
;
4646 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4652 if (last_id
!= charset_ascii
)
4653 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4654 coding
->consumed_char
+= consumed_chars_base
;
4655 coding
->consumed
= src_base
- coding
->source
;
4656 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4660 encode_coding_charset (coding
)
4661 struct coding_system
*coding
;
4663 int multibytep
= coding
->dst_multibyte
;
4664 int *charbuf
= coding
->charbuf
;
4665 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4666 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4667 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4668 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4669 int produced_chars
= 0;
4670 Lisp_Object attrs
, charset_list
;
4671 int ascii_compatible
;
4674 CODING_GET_INFO (coding
, attrs
, charset_list
);
4675 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4677 while (charbuf
< charbuf_end
)
4679 struct charset
*charset
;
4682 ASSURE_DESTINATION (safe_room
);
4684 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4685 EMIT_ONE_ASCII_BYTE (c
);
4686 else if (CHAR_BYTE8_P (c
))
4688 c
= CHAR_TO_BYTE8 (c
);
4693 charset
= char_charset (c
, charset_list
, &code
);
4696 if (CHARSET_DIMENSION (charset
) == 1)
4697 EMIT_ONE_BYTE (code
);
4698 else if (CHARSET_DIMENSION (charset
) == 2)
4699 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4700 else if (CHARSET_DIMENSION (charset
) == 3)
4701 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4703 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4704 (code
>> 8) & 0xFF, code
& 0xFF);
4708 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4709 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4711 c
= coding
->default_char
;
4717 coding
->result
= CODING_RESULT_SUCCESS
;
4718 coding
->produced_char
+= produced_chars
;
4719 coding
->produced
= dst
- coding
->destination
;
4724 /*** 10. C library functions ***/
4726 /* Setup coding context CODING from information about CODING_SYSTEM.
4727 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4728 CODING_SYSTEM is invalid, signal an error. */
4731 setup_coding_system (coding_system
, coding
)
4732 Lisp_Object coding_system
;
4733 struct coding_system
*coding
;
4736 Lisp_Object eol_type
;
4737 Lisp_Object coding_type
;
4740 if (NILP (coding_system
))
4741 coding_system
= Qno_conversion
;
4743 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4745 attrs
= CODING_ID_ATTRS (coding
->id
);
4746 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4749 coding
->head_ascii
= -1;
4750 coding
->common_flags
4751 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4752 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4753 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4754 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4755 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4756 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
4757 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4759 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4760 coding
->max_charset_id
= SCHARS (val
) - 1;
4761 coding
->safe_charsets
= (char *) SDATA (val
);
4762 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4764 coding_type
= CODING_ATTR_TYPE (attrs
);
4765 if (EQ (coding_type
, Qundecided
))
4767 coding
->detector
= NULL
;
4768 coding
->decoder
= decode_coding_raw_text
;
4769 coding
->encoder
= encode_coding_raw_text
;
4770 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4772 else if (EQ (coding_type
, Qiso_2022
))
4775 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4777 /* Invoke graphic register 0 to plane 0. */
4778 CODING_ISO_INVOCATION (coding
, 0) = 0;
4779 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4780 CODING_ISO_INVOCATION (coding
, 1)
4781 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4782 /* Setup the initial status of designation. */
4783 for (i
= 0; i
< 4; i
++)
4784 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4785 /* Not single shifting initially. */
4786 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4787 /* Beginning of buffer should also be regarded as bol. */
4788 CODING_ISO_BOL (coding
) = 1;
4789 coding
->detector
= detect_coding_iso_2022
;
4790 coding
->decoder
= decode_coding_iso_2022
;
4791 coding
->encoder
= encode_coding_iso_2022
;
4792 if (flags
& CODING_ISO_FLAG_SAFE
)
4793 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4794 coding
->common_flags
4795 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4796 | CODING_REQUIRE_FLUSHING_MASK
);
4797 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4798 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4799 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4800 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4801 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4803 setup_iso_safe_charsets (attrs
);
4804 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4805 coding
->max_charset_id
= SCHARS (val
) - 1;
4806 coding
->safe_charsets
= (char *) SDATA (val
);
4808 CODING_ISO_FLAGS (coding
) = flags
;
4810 else if (EQ (coding_type
, Qcharset
))
4812 coding
->detector
= detect_coding_charset
;
4813 coding
->decoder
= decode_coding_charset
;
4814 coding
->encoder
= encode_coding_charset
;
4815 coding
->common_flags
4816 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4818 else if (EQ (coding_type
, Qutf_8
))
4820 coding
->detector
= detect_coding_utf_8
;
4821 coding
->decoder
= decode_coding_utf_8
;
4822 coding
->encoder
= encode_coding_utf_8
;
4823 coding
->common_flags
4824 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4826 else if (EQ (coding_type
, Qutf_16
))
4828 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4829 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4830 : EQ (val
, Qt
) ? utf_16_with_bom
4831 : utf_16_without_bom
);
4832 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4833 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4834 : utf_16_little_endian
);
4835 CODING_UTF_16_SURROGATE (coding
) = 0;
4836 coding
->detector
= detect_coding_utf_16
;
4837 coding
->decoder
= decode_coding_utf_16
;
4838 coding
->encoder
= encode_coding_utf_16
;
4839 coding
->common_flags
4840 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4841 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4842 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4844 else if (EQ (coding_type
, Qccl
))
4846 coding
->detector
= detect_coding_ccl
;
4847 coding
->decoder
= decode_coding_ccl
;
4848 coding
->encoder
= encode_coding_ccl
;
4849 coding
->common_flags
4850 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4851 | CODING_REQUIRE_FLUSHING_MASK
);
4853 else if (EQ (coding_type
, Qemacs_mule
))
4855 coding
->detector
= detect_coding_emacs_mule
;
4856 coding
->decoder
= decode_coding_emacs_mule
;
4857 coding
->encoder
= encode_coding_emacs_mule
;
4858 coding
->common_flags
4859 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4860 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4861 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4863 Lisp_Object tail
, safe_charsets
;
4864 int max_charset_id
= 0;
4866 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4868 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4869 max_charset_id
= XFASTINT (XCAR (tail
));
4870 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4872 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4874 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
4875 coding
->max_charset_id
= max_charset_id
;
4876 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
4879 else if (EQ (coding_type
, Qshift_jis
))
4881 coding
->detector
= detect_coding_sjis
;
4882 coding
->decoder
= decode_coding_sjis
;
4883 coding
->encoder
= encode_coding_sjis
;
4884 coding
->common_flags
4885 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4887 else if (EQ (coding_type
, Qbig5
))
4889 coding
->detector
= detect_coding_big5
;
4890 coding
->decoder
= decode_coding_big5
;
4891 coding
->encoder
= encode_coding_big5
;
4892 coding
->common_flags
4893 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4895 else /* EQ (coding_type, Qraw_text) */
4897 coding
->detector
= NULL
;
4898 coding
->decoder
= decode_coding_raw_text
;
4899 coding
->encoder
= encode_coding_raw_text
;
4905 /* Return raw-text or one of its subsidiaries that has the same
4906 eol_type as CODING-SYSTEM. */
4909 raw_text_coding_system (coding_system
)
4910 Lisp_Object coding_system
;
4912 Lisp_Object spec
, attrs
;
4913 Lisp_Object eol_type
, raw_text_eol_type
;
4915 if (NILP (coding_system
))
4917 spec
= CODING_SYSTEM_SPEC (coding_system
);
4918 attrs
= AREF (spec
, 0);
4920 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4921 return coding_system
;
4923 eol_type
= AREF (spec
, 2);
4924 if (VECTORP (eol_type
))
4926 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4927 raw_text_eol_type
= AREF (spec
, 2);
4928 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4929 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4930 : AREF (raw_text_eol_type
, 2));
4934 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4935 does, return one of the subsidiary that has the same eol-spec as
4936 PARENT. Otherwise, return CODING_SYSTEM. */
4939 coding_inherit_eol_type (coding_system
, parent
)
4940 Lisp_Object coding_system
, parent
;
4942 Lisp_Object spec
, eol_type
;
4944 if (NILP (coding_system
))
4945 coding_system
= Qraw_text
;
4946 spec
= CODING_SYSTEM_SPEC (coding_system
);
4947 eol_type
= AREF (spec
, 2);
4948 if (VECTORP (eol_type
)
4951 Lisp_Object parent_spec
;
4952 Lisp_Object parent_eol_type
;
4955 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4956 parent_eol_type
= AREF (parent_spec
, 2);
4957 if (EQ (parent_eol_type
, Qunix
))
4958 coding_system
= AREF (eol_type
, 0);
4959 else if (EQ (parent_eol_type
, Qdos
))
4960 coding_system
= AREF (eol_type
, 1);
4961 else if (EQ (parent_eol_type
, Qmac
))
4962 coding_system
= AREF (eol_type
, 2);
4964 return coding_system
;
4967 /* Emacs has a mechanism to automatically detect a coding system if it
4968 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4969 it's impossible to distinguish some coding systems accurately
4970 because they use the same range of codes. So, at first, coding
4971 systems are categorized as follows:
4973 o coding-category-emacs-mule
4975 The category for a coding system which has the same code range
4976 as Emacs' internal format. Assigned the coding-system (Lisp
4977 symbol) `emacs-mule' by default.
4979 o coding-category-sjis
4981 The category for a coding system which has the same code range
4982 as SJIS. Assigned the coding-system (Lisp
4983 symbol) `japanese-shift-jis' by default.
4985 o coding-category-iso-7
4987 The category for a coding system which has the same code range
4988 as ISO2022 of 7-bit environment. This doesn't use any locking
4989 shift and single shift functions. This can encode/decode all
4990 charsets. Assigned the coding-system (Lisp symbol)
4991 `iso-2022-7bit' by default.
4993 o coding-category-iso-7-tight
4995 Same as coding-category-iso-7 except that this can
4996 encode/decode only the specified charsets.
4998 o coding-category-iso-8-1
5000 The category for a coding system which has the same code range
5001 as ISO2022 of 8-bit environment and graphic plane 1 used only
5002 for DIMENSION1 charset. This doesn't use any locking shift
5003 and single shift functions. Assigned the coding-system (Lisp
5004 symbol) `iso-latin-1' by default.
5006 o coding-category-iso-8-2
5008 The category for a coding system which has the same code range
5009 as ISO2022 of 8-bit environment and graphic plane 1 used only
5010 for DIMENSION2 charset. This doesn't use any locking shift
5011 and single shift functions. Assigned the coding-system (Lisp
5012 symbol) `japanese-iso-8bit' by default.
5014 o coding-category-iso-7-else
5016 The category for a coding system which has the same code range
5017 as ISO2022 of 7-bit environment but uses locking shift or
5018 single shift functions. Assigned the coding-system (Lisp
5019 symbol) `iso-2022-7bit-lock' by default.
5021 o coding-category-iso-8-else
5023 The category for a coding system which has the same code range
5024 as ISO2022 of 8-bit environment but uses locking shift or
5025 single shift functions. Assigned the coding-system (Lisp
5026 symbol) `iso-2022-8bit-ss2' by default.
5028 o coding-category-big5
5030 The category for a coding system which has the same code range
5031 as BIG5. Assigned the coding-system (Lisp symbol)
5032 `cn-big5' by default.
5034 o coding-category-utf-8
5036 The category for a coding system which has the same code range
5037 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5038 symbol) `utf-8' by default.
5040 o coding-category-utf-16-be
5042 The category for a coding system in which a text has a
5043 Unicode signature (cf. Unicode Standard) in the order of BIG
5044 endian at the head. Assigned the coding-system (Lisp symbol)
5045 `utf-16-be' by default.
5047 o coding-category-utf-16-le
5049 The category for a coding system in which a text has a
5050 Unicode signature (cf. Unicode Standard) in the order of
5051 LITTLE endian at the head. Assigned the coding-system (Lisp
5052 symbol) `utf-16-le' by default.
5054 o coding-category-ccl
5056 The category for a coding system of which encoder/decoder is
5057 written in CCL programs. The default value is nil, i.e., no
5058 coding system is assigned.
5060 o coding-category-binary
5062 The category for a coding system not categorized in any of the
5063 above. Assigned the coding-system (Lisp symbol)
5064 `no-conversion' by default.
5066 Each of them is a Lisp symbol and the value is an actual
5067 `coding-system' (this is also a Lisp symbol) assigned by a user.
5068 What Emacs does actually is to detect a category of coding system.
5069 Then, it uses a `coding-system' assigned to it. If Emacs can't
5070 decide only one possible category, it selects a category of the
5071 highest priority. Priorities of categories are also specified by a
5072 user in a Lisp variable `coding-category-list'.
5076 #define EOL_SEEN_NONE 0
5077 #define EOL_SEEN_LF 1
5078 #define EOL_SEEN_CR 2
5079 #define EOL_SEEN_CRLF 4
5081 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5082 SOURCE is encoded. If CATEGORY is one of
5083 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5084 two-byte, else they are encoded by one-byte.
5086 Return one of EOL_SEEN_XXX. */
5088 #define MAX_EOL_CHECK_COUNT 3
5091 detect_eol (source
, src_bytes
, category
)
5092 unsigned char *source
;
5093 EMACS_INT src_bytes
;
5094 enum coding_category category
;
5096 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5099 int eol_seen
= EOL_SEEN_NONE
;
5101 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5105 msb
= category
== (coding_category_utf_16_le
5106 | coding_category_utf_16_le_nosig
);
5109 while (src
+ 1 < src_end
)
5112 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5117 this_eol
= EOL_SEEN_LF
;
5118 else if (src
+ 3 >= src_end
5119 || src
[msb
+ 2] != 0
5120 || src
[lsb
+ 2] != '\n')
5121 this_eol
= EOL_SEEN_CR
;
5123 this_eol
= EOL_SEEN_CRLF
;
5125 if (eol_seen
== EOL_SEEN_NONE
)
5126 /* This is the first end-of-line. */
5127 eol_seen
= this_eol
;
5128 else if (eol_seen
!= this_eol
)
5130 /* The found type is different from what found before. */
5131 eol_seen
= EOL_SEEN_LF
;
5134 if (++total
== MAX_EOL_CHECK_COUNT
)
5142 while (src
< src_end
)
5145 if (c
== '\n' || c
== '\r')
5150 this_eol
= EOL_SEEN_LF
;
5151 else if (src
>= src_end
|| *src
!= '\n')
5152 this_eol
= EOL_SEEN_CR
;
5154 this_eol
= EOL_SEEN_CRLF
, src
++;
5156 if (eol_seen
== EOL_SEEN_NONE
)
5157 /* This is the first end-of-line. */
5158 eol_seen
= this_eol
;
5159 else if (eol_seen
!= this_eol
)
5161 /* The found type is different from what found before. */
5162 eol_seen
= EOL_SEEN_LF
;
5165 if (++total
== MAX_EOL_CHECK_COUNT
)
5175 adjust_coding_eol_type (coding
, eol_seen
)
5176 struct coding_system
*coding
;
5179 Lisp_Object eol_type
;
5181 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5182 if (eol_seen
& EOL_SEEN_LF
)
5184 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5187 else if (eol_seen
& EOL_SEEN_CRLF
)
5189 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5192 else if (eol_seen
& EOL_SEEN_CR
)
5194 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5200 /* Detect how a text specified in CODING is encoded. If a coding
5201 system is detected, update fields of CODING by the detected coding
5205 detect_coding (coding
)
5206 struct coding_system
*coding
;
5208 const unsigned char *src
, *src_end
;
5209 Lisp_Object attrs
, coding_type
;
5211 coding
->consumed
= coding
->consumed_char
= 0;
5212 coding
->produced
= coding
->produced_char
= 0;
5213 coding_set_source (coding
);
5215 src_end
= coding
->source
+ coding
->src_bytes
;
5217 /* If we have not yet decided the text encoding type, detect it
5219 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5223 for (i
= 0, src
= coding
->source
; src
< src_end
; i
++, src
++)
5226 if (c
& 0x80 || (c
< 0x20 && (c
== 0
5227 || c
== ISO_CODE_ESC
5229 || c
== ISO_CODE_SO
)))
5232 /* Skipped bytes must be even for utf-16 detector. */
5235 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5237 if (coding
->head_ascii
< coding
->src_bytes
)
5239 struct coding_detection_info detect_info
;
5240 enum coding_category category
;
5241 struct coding_system
*this;
5243 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5244 for (i
= 0; i
< coding_category_raw_text
; i
++)
5246 category
= coding_priorities
[i
];
5247 this = coding_categories
+ category
;
5250 /* No coding system of this category is defined. */
5251 detect_info
.rejected
|= (1 << category
);
5253 else if (category
>= coding_category_raw_text
)
5255 else if (detect_info
.checked
& (1 << category
))
5257 if (detect_info
.found
& (1 << category
))
5260 else if ((*(this->detector
)) (coding
, &detect_info
)
5261 && detect_info
.found
& (1 << category
))
5263 if (category
== coding_category_utf_16_auto
)
5265 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5266 category
= coding_category_utf_16_le
;
5268 category
= coding_category_utf_16_be
;
5273 if (i
< coding_category_raw_text
)
5274 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5275 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5276 setup_coding_system (Qraw_text
, coding
);
5277 else if (detect_info
.rejected
)
5278 for (i
= 0; i
< coding_category_raw_text
; i
++)
5279 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5281 this = coding_categories
+ coding_priorities
[i
];
5282 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5287 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
5288 == coding_category_utf_16_auto
)
5290 Lisp_Object coding_systems
;
5291 struct coding_detection_info detect_info
;
5294 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5295 detect_info
.found
= detect_info
.rejected
= 0;
5296 if (CONSP (coding_systems
)
5297 && detect_coding_utf_16 (coding
, &detect_info
))
5299 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5300 setup_coding_system (XCAR (coding_systems
), coding
);
5301 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
5302 setup_coding_system (XCDR (coding_systems
), coding
);
5310 struct coding_system
*coding
;
5312 Lisp_Object eol_type
;
5313 unsigned char *p
, *pbeg
, *pend
;
5315 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5316 if (EQ (eol_type
, Qunix
))
5319 if (NILP (coding
->dst_object
))
5320 pbeg
= coding
->destination
;
5322 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
5323 pend
= pbeg
+ coding
->produced
;
5325 if (VECTORP (eol_type
))
5327 int eol_seen
= EOL_SEEN_NONE
;
5329 for (p
= pbeg
; p
< pend
; p
++)
5332 eol_seen
|= EOL_SEEN_LF
;
5333 else if (*p
== '\r')
5335 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5337 eol_seen
|= EOL_SEEN_CRLF
;
5341 eol_seen
|= EOL_SEEN_CR
;
5344 if (eol_seen
!= EOL_SEEN_NONE
5345 && eol_seen
!= EOL_SEEN_LF
5346 && eol_seen
!= EOL_SEEN_CRLF
5347 && eol_seen
!= EOL_SEEN_CR
)
5348 eol_seen
= EOL_SEEN_LF
;
5349 if (eol_seen
!= EOL_SEEN_NONE
)
5350 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
5353 if (EQ (eol_type
, Qmac
))
5355 for (p
= pbeg
; p
< pend
; p
++)
5359 else if (EQ (eol_type
, Qdos
))
5363 if (NILP (coding
->dst_object
))
5365 for (p
= pend
- 2; p
>= pbeg
; p
--)
5368 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
-- - p
- 1);
5374 for (p
= pend
- 2; p
>= pbeg
; p
--)
5377 int pos_byte
= coding
->dst_pos_byte
+ (p
- pbeg
);
5378 int pos
= BYTE_TO_CHAR (pos_byte
);
5380 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
5384 coding
->produced
-= n
;
5385 coding
->produced_char
-= n
;
5390 translate_chars (coding
, table
)
5391 struct coding_system
*coding
;
5394 int *charbuf
= coding
->charbuf
;
5395 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5398 if (coding
->chars_at_source
)
5401 while (charbuf
< charbuf_end
)
5407 *charbuf
++ = translate_char (table
, c
);
5412 produce_chars (coding
)
5413 struct coding_system
*coding
;
5415 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5416 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5418 int produced_chars
= 0;
5420 if (! coding
->chars_at_source
)
5422 /* Characters are in coding->charbuf. */
5423 int *buf
= coding
->charbuf
;
5424 int *buf_end
= buf
+ coding
->charbuf_used
;
5425 unsigned char *adjusted_dst_end
;
5427 if (BUFFERP (coding
->src_object
)
5428 && EQ (coding
->src_object
, coding
->dst_object
))
5429 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
5430 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5432 while (buf
< buf_end
)
5436 if (dst
>= adjusted_dst_end
)
5438 dst
= alloc_destination (coding
,
5439 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5441 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5442 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5446 if (coding
->dst_multibyte
5447 || ! CHAR_BYTE8_P (c
))
5448 CHAR_STRING_ADVANCE (c
, dst
);
5450 *dst
++ = CHAR_TO_BYTE8 (c
);
5454 /* This is an annotation datum. (-C) is the length of
5461 const unsigned char *src
= coding
->source
;
5462 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5463 Lisp_Object eol_type
;
5465 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5467 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5469 if (coding
->src_multibyte
)
5476 const unsigned char *src_base
= src
;
5482 if (EQ (eol_type
, Qdos
))
5486 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5487 goto no_more_source
;
5492 else if (EQ (eol_type
, Qmac
))
5497 coding
->consumed
= src
- coding
->source
;
5499 if (EQ (coding
->src_object
, coding
->dst_object
))
5500 dst_end
= (unsigned char *) src
;
5503 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5505 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5506 coding_set_source (coding
);
5507 src
= coding
->source
+ coding
->consumed
;
5508 src_end
= coding
->source
+ coding
->src_bytes
;
5518 while (src
< src_end
)
5525 if (EQ (eol_type
, Qdos
))
5531 else if (EQ (eol_type
, Qmac
))
5534 if (dst
>= dst_end
- 1)
5536 coding
->consumed
= src
- coding
->source
;
5538 if (EQ (coding
->src_object
, coding
->dst_object
))
5539 dst_end
= (unsigned char *) src
;
5540 if (dst
>= dst_end
- 1)
5542 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5544 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5545 coding_set_source (coding
);
5546 src
= coding
->source
+ coding
->consumed
;
5547 src_end
= coding
->source
+ coding
->src_bytes
;
5555 if (!EQ (coding
->src_object
, coding
->dst_object
))
5557 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5561 EMACS_INT offset
= src
- coding
->source
;
5563 dst
= alloc_destination (coding
, require
, dst
);
5564 coding_set_source (coding
);
5565 src
= coding
->source
+ offset
;
5566 src_end
= coding
->source
+ coding
->src_bytes
;
5569 produced_chars
= coding
->src_chars
;
5570 while (src
< src_end
)
5576 if (EQ (eol_type
, Qdos
))
5583 else if (EQ (eol_type
, Qmac
))
5589 coding
->consumed
= coding
->src_bytes
;
5590 coding
->consumed_char
= coding
->src_chars
;
5593 produced
= dst
- (coding
->destination
+ coding
->produced
);
5594 if (BUFFERP (coding
->dst_object
))
5595 insert_from_gap (produced_chars
, produced
);
5596 coding
->produced
+= produced
;
5597 coding
->produced_char
+= produced_chars
;
5598 return produced_chars
;
5601 /* Compose text in CODING->object according to the annotation data at
5602 CHARBUF. CHARBUF is an array:
5603 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5607 produce_composition (coding
, charbuf
)
5608 struct coding_system
*coding
;
5613 enum composition_method method
;
5614 Lisp_Object components
;
5617 from
= coding
->dst_pos
+ charbuf
[2];
5618 to
= coding
->dst_pos
+ charbuf
[3];
5619 method
= (enum composition_method
) (charbuf
[4]);
5621 if (method
== COMPOSITION_RELATIVE
)
5625 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5630 for (i
= 0; i
< len
; i
++)
5631 args
[i
] = make_number (charbuf
[i
]);
5632 components
= (method
== COMPOSITION_WITH_ALTCHARS
5633 ? Fstring (len
, args
) : Fvector (len
, args
));
5635 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5639 /* Put `charset' property on text in CODING->object according to
5640 the annotation data at CHARBUF. CHARBUF is an array:
5641 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5645 produce_charset (coding
, charbuf
)
5646 struct coding_system
*coding
;
5649 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5650 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5651 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5653 Fput_text_property (make_number (from
), make_number (to
),
5654 Qcharset
, CHARSET_NAME (charset
),
5655 coding
->dst_object
);
/* Number of `int' elements first requested for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   element count in CODING->charbuf_size.

   The request starts at CHARBUF_SIZE elements and is halved after
   each failed attempt while the size stays above 1024.  If no
   attempt succeeds, CODING->result is set to
   CODING_RESULT_INSUFFICIENT_MEM and the macro executes `return
   CODING->result' in the calling function, so it may only be used
   inside a function whose return type accepts that value.

   The memory comes from alloca, so it belongs to the calling
   function's frame.  CODING is evaluated more than once; pass an
   expression without side effects.

   NOTE(review): common alloca implementations do not signal failure
   by returning NULL, so the retry and failure paths may never run;
   confirm before relying on them.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)				\
  do {									\
    int size = CHARBUF_SIZE;						\
									\
    (coding)->charbuf = NULL;						\
    while (size > 1024)							\
      {									\
	(coding)->charbuf = (int *) alloca (sizeof (int) * size);	\
	if ((coding)->charbuf)						\
	  break;							\
	size >>= 1;							\
      }									\
    if (! (coding)->charbuf)						\
      {									\
	(coding)->result = CODING_RESULT_INSUFFICIENT_MEM;		\
	return (coding)->result;					\
      }									\
    (coding)->charbuf_size = size;					\
  } while (0)
5683 produce_annotation (coding
)
5684 struct coding_system
*coding
;
5686 int *charbuf
= coding
->charbuf
;
5687 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5689 if (NILP (coding
->dst_object
))
5692 while (charbuf
< charbuf_end
)
5698 int len
= -*charbuf
;
5701 case CODING_ANNOTATE_COMPOSITION_MASK
:
5702 produce_composition (coding
, charbuf
);
5704 case CODING_ANNOTATE_CHARSET_MASK
:
5705 produce_charset (coding
, charbuf
);
5715 /* Decode the data at CODING->src_object into CODING->dst_object.
5716 CODING->src_object is a buffer, a string, or nil.
5717 CODING->dst_object is a buffer.
5719 If CODING->src_object is a buffer, it must be the current buffer.
5720 In this case, if CODING->src_pos is positive, it is a position of
5721 the source text in the buffer, otherwise, the source text is in the
5722 gap area of the buffer, and CODING->src_pos specifies the offset of
5723 the text from GPT (which must be the same as PT). If this is the
5724 same buffer as CODING->dst_object, CODING->src_pos must be
5727 If CODING->src_object is a string, CODING->src_pos is an index to
5730 If CODING->src_object is nil, CODING->source must already point to
5731 the non-relocatable memory area. In this case, CODING->src_pos is
5732 an offset from CODING->source.
5734 The decoded data is inserted at the current point of the buffer
5739 decode_coding (coding
)
5740 struct coding_system
*coding
;
5743 Lisp_Object undo_list
;
5745 if (BUFFERP (coding
->src_object
)
5746 && coding
->src_pos
> 0
5747 && coding
->src_pos
< GPT
5748 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5749 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5752 if (BUFFERP (coding
->dst_object
))
5754 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5755 set_buffer_internal (XBUFFER (coding
->dst_object
));
5757 move_gap_both (PT
, PT_BYTE
);
5758 undo_list
= current_buffer
->undo_list
;
5759 current_buffer
->undo_list
= Qt
;
5762 coding
->consumed
= coding
->consumed_char
= 0;
5763 coding
->produced
= coding
->produced_char
= 0;
5764 coding
->chars_at_source
= 0;
5765 coding
->result
= CODING_RESULT_SUCCESS
;
5768 ALLOC_CONVERSION_WORK_AREA (coding
);
5770 attrs
= CODING_ID_ATTRS (coding
->id
);
5774 coding_set_source (coding
);
5775 coding
->annotated
= 0;
5776 (*(coding
->decoder
)) (coding
);
5777 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5778 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5779 else if (!NILP (Vstandard_translation_table_for_decode
))
5780 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5781 coding_set_destination (coding
);
5782 produce_chars (coding
);
5783 if (coding
->annotated
)
5784 produce_annotation (coding
);
5786 while (coding
->consumed
< coding
->src_bytes
5787 && ! coding
->result
);
5789 coding
->carryover_bytes
= 0;
5790 if (coding
->consumed
< coding
->src_bytes
)
5792 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5793 const unsigned char *src
;
5795 coding_set_source (coding
);
5796 coding_set_destination (coding
);
5797 src
= coding
->source
+ coding
->consumed
;
5799 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5801 /* Flush out unprocessed data as binary chars. We are sure
5802 that the number of data is less than the size of
5804 while (nbytes
-- > 0)
5808 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5810 produce_chars (coding
);
5814 /* Record unprocessed bytes in coding->carryover. We are
5815 sure that the number of data is less than the size of
5816 coding->carryover. */
5817 unsigned char *p
= coding
->carryover
;
5819 coding
->carryover_bytes
= nbytes
;
5820 while (nbytes
-- > 0)
5823 coding
->consumed
= coding
->src_bytes
;
5826 if (BUFFERP (coding
->dst_object
))
5828 current_buffer
->undo_list
= undo_list
;
5829 record_insert (coding
->dst_pos
, coding
->produced_char
);
5831 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5832 decode_eol (coding
);
5833 return coding
->result
;
5837 /* Extract an annotation datum from a composition starting at POS and
5838 ending before LIMIT of CODING->src_object (buffer or string), store
5839 the data in BUF, set *STOP to a starting position of the next
5840 composition (if any) or to LIMIT, and return the address of the
5841 next element of BUF.
5843 If such an annotation is not found, set *STOP to a starting
5844 position of a composition after POS (if any) or to LIMIT, and
5848 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5849 EMACS_INT pos
, limit
;
5850 struct coding_system
*coding
;
5854 EMACS_INT start
, end
;
5857 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5860 else if (start
> pos
)
5866 /* We found a composition. Store the corresponding
5867 annotation data in BUF. */
5869 enum composition_method method
= COMPOSITION_METHOD (prop
);
5870 int nchars
= COMPOSITION_LENGTH (prop
);
5872 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5873 if (method
!= COMPOSITION_RELATIVE
)
5875 Lisp_Object components
;
5878 components
= COMPOSITION_COMPONENTS (prop
);
5879 if (VECTORP (components
))
5881 len
= XVECTOR (components
)->size
;
5882 for (i
= 0; i
< len
; i
++)
5883 *buf
++ = XINT (AREF (components
, i
));
5885 else if (STRINGP (components
))
5887 len
= SCHARS (components
);
5891 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5895 else if (INTEGERP (components
))
5898 *buf
++ = XINT (components
);
5900 else if (CONSP (components
))
5902 for (len
= 0; CONSP (components
);
5903 len
++, components
= XCDR (components
))
5904 *buf
++ = XINT (XCAR (components
));
5912 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5923 /* Extract an annotation datum from a text property `charset' at POS of
5924 CODING->src_object (buffer or string), store the data in BUF, set
5925 *STOP to the position where the value of `charset' property changes
5926 (limiting by LIMIT), and return the address of the next element of
5929 If the property value is nil, set *STOP to the position where the
5930 property value is non-nil (limiting by LIMIT), and return BUF. */
5933 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5934 EMACS_INT pos
, limit
;
5935 struct coding_system
*coding
;
5939 Lisp_Object val
, next
;
5942 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
5943 if (! NILP (val
) && CHARSETP (val
))
5944 id
= XINT (CHARSET_SYMBOL_ID (val
));
5947 ADD_CHARSET_DATA (buf
, 0, 0, id
);
5948 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
5950 make_number (limit
));
5951 *stop
= XINT (next
);
5957 consume_chars (coding
)
5958 struct coding_system
*coding
;
5960 int *buf
= coding
->charbuf
;
5961 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
5962 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5963 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5964 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
5965 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
5966 int multibytep
= coding
->src_multibyte
;
5967 Lisp_Object eol_type
;
5969 EMACS_INT stop
, stop_composition
, stop_charset
;
5971 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5972 if (VECTORP (eol_type
))
5975 /* Note: composition handling is not yet implemented. */
5976 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5978 if (NILP (coding
->src_object
))
5979 stop
= stop_composition
= stop_charset
= end_pos
;
5982 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
5983 stop
= stop_composition
= pos
;
5985 stop
= stop_composition
= end_pos
;
5986 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
5987 stop
= stop_charset
= pos
;
5989 stop_charset
= end_pos
;
5992 /* Compensate for CRLF and conversion. */
5993 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
5994 while (buf
< buf_end
)
6000 if (pos
== stop_composition
)
6001 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6002 buf
, &stop_composition
);
6003 if (pos
== stop_charset
)
6004 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6005 buf
, &stop_charset
);
6006 stop
= (stop_composition
< stop_charset
6007 ? stop_composition
: stop_charset
);
6014 if (! CODING_FOR_UNIBYTE (coding
)
6015 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6016 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6021 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6022 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6024 if (! EQ (eol_type
, Qunix
))
6028 if (EQ (eol_type
, Qdos
))
6037 coding
->consumed
= src
- coding
->source
;
6038 coding
->consumed_char
= pos
- coding
->src_pos
;
6039 coding
->charbuf_used
= buf
- coding
->charbuf
;
6040 coding
->chars_at_source
= 0;
6044 /* Encode the text at CODING->src_object into CODING->dst_object.
6045 CODING->src_object is a buffer or a string.
6046 CODING->dst_object is a buffer or nil.
6048 If CODING->src_object is a buffer, it must be the current buffer.
6049 In this case, if CODING->src_pos is positive, it is a position of
6050 the source text in the buffer, otherwise, the source text is in the
6051 gap area of the buffer, and coding->src_pos specifies the offset of
6052 the text from GPT (which must be the same as PT). If this is the
6053 same buffer as CODING->dst_object, CODING->src_pos must be
6054 negative and CODING should not have `pre-write-conversion'.
6056 If CODING->src_object is a string, CODING should not have
6057 `pre-write-conversion'.
6059 If CODING->dst_object is a buffer, the encoded data is inserted at
6060 the current point of that buffer.
6062 If CODING->dst_object is nil, the encoded data is placed at the
6063 memory area specified by CODING->destination. */
6066 encode_coding (coding
)
6067 struct coding_system
*coding
;
6071 attrs
= CODING_ID_ATTRS (coding
->id
);
6073 if (BUFFERP (coding
->dst_object
))
6075 set_buffer_internal (XBUFFER (coding
->dst_object
));
6076 coding
->dst_multibyte
6077 = ! NILP (current_buffer
->enable_multibyte_characters
);
6080 coding
->consumed
= coding
->consumed_char
= 0;
6081 coding
->produced
= coding
->produced_char
= 0;
6082 coding
->result
= CODING_RESULT_SUCCESS
;
6085 ALLOC_CONVERSION_WORK_AREA (coding
);
6088 coding_set_source (coding
);
6089 consume_chars (coding
);
6091 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6092 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6093 else if (!NILP (Vstandard_translation_table_for_encode
))
6094 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6096 coding_set_destination (coding
);
6097 (*(coding
->encoder
)) (coding
);
6098 } while (coding
->consumed_char
< coding
->src_chars
);
6100 if (BUFFERP (coding
->dst_object
))
6101 insert_from_gap (coding
->produced_char
, coding
->produced
);
6103 return (coding
->result
);
6107 /* Name (or base name) of work buffer for code conversion. */
6108 static Lisp_Object Vcode_conversion_workbuf_name
;
6110 /* A working buffer used by the top level conversion. Once it is
6111 created, it is never destroyed. It has the name
6112 Vcode_conversion_workbuf_name. The other working buffers are
6113 destroyed after the use is finished, and their names are modified
6114 versions of Vcode_conversion_workbuf_name. */
6115 static Lisp_Object Vcode_conversion_reused_workbuf
;
6117 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6118 static int reused_workbuf_in_use
;
6121 /* Return a working buffer for code conversion. MULTIBYTE specifies the
6122 multibyteness of the returned buffer. */
6125 make_conversion_work_buffer (multibyte
)
6127 Lisp_Object name
, workbuf
;
6128 struct buffer
*current
;
6130 if (reused_workbuf_in_use
++)
6131 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
6133 name
= Vcode_conversion_workbuf_name
;
6134 workbuf
= Fget_buffer_create (name
);
6135 current
= current_buffer
;
6136 set_buffer_internal (XBUFFER (workbuf
));
6138 current_buffer
->undo_list
= Qt
;
6139 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
6140 set_buffer_internal (current
);
6146 code_conversion_restore (arg
)
6149 Lisp_Object current
, workbuf
;
6151 current
= XCAR (arg
);
6152 workbuf
= XCDR (arg
);
6153 if (! NILP (workbuf
))
6155 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
6156 reused_workbuf_in_use
= 0;
6157 else if (! NILP (Fbuffer_live_p (workbuf
)))
6158 Fkill_buffer (workbuf
);
6160 set_buffer_internal (XBUFFER (current
));
6165 code_conversion_save (with_work_buf
, multibyte
)
6166 int with_work_buf
, multibyte
;
6168 Lisp_Object workbuf
= Qnil
;
6171 workbuf
= make_conversion_work_buffer (multibyte
);
6172 record_unwind_protect (code_conversion_restore
,
6173 Fcons (Fcurrent_buffer (), workbuf
));
6178 decode_coding_gap (coding
, chars
, bytes
)
6179 struct coding_system
*coding
;
6180 EMACS_INT chars
, bytes
;
6182 int count
= specpdl_ptr
- specpdl
;
6185 code_conversion_save (0, 0);
6187 coding
->src_object
= Fcurrent_buffer ();
6188 coding
->src_chars
= chars
;
6189 coding
->src_bytes
= bytes
;
6190 coding
->src_pos
= -chars
;
6191 coding
->src_pos_byte
= -bytes
;
6192 coding
->src_multibyte
= chars
< bytes
;
6193 coding
->dst_object
= coding
->src_object
;
6194 coding
->dst_pos
= PT
;
6195 coding
->dst_pos_byte
= PT_BYTE
;
6196 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6197 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6199 if (CODING_REQUIRE_DETECTION (coding
))
6200 detect_coding (coding
);
6202 decode_coding (coding
);
6204 attrs
= CODING_ID_ATTRS (coding
->id
);
6205 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6207 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6210 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6211 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6212 make_number (coding
->produced_char
));
6214 coding
->produced_char
+= Z
- prev_Z
;
6215 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6218 unbind_to (count
, Qnil
);
6219 return coding
->result
;
6223 encode_coding_gap (coding
, chars
, bytes
)
6224 struct coding_system
*coding
;
6225 EMACS_INT chars
, bytes
;
6227 int count
= specpdl_ptr
- specpdl
;
6229 code_conversion_save (0, 0);
6231 coding
->src_object
= Fcurrent_buffer ();
6232 coding
->src_chars
= chars
;
6233 coding
->src_bytes
= bytes
;
6234 coding
->src_pos
= -chars
;
6235 coding
->src_pos_byte
= -bytes
;
6236 coding
->src_multibyte
= chars
< bytes
;
6237 coding
->dst_object
= coding
->src_object
;
6238 coding
->dst_pos
= PT
;
6239 coding
->dst_pos_byte
= PT_BYTE
;
6241 encode_coding (coding
);
6243 unbind_to (count
, Qnil
);
6244 return coding
->result
;
6248 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6249 SRC_OBJECT into DST_OBJECT by coding context CODING.
6251 SRC_OBJECT is a buffer, a string, or Qnil.
6253 If it is a buffer, the text is at point of the buffer. FROM and TO
6254 are positions in the buffer.
6256 If it is a string, the text is at the beginning of the string.
6257 FROM and TO are indices to the string.
6259 If it is nil, the text is at coding->source. FROM and TO are
6260 indices to coding->source.
6262 DST_OBJECT is a buffer, Qt, or Qnil.
6264 If it is a buffer, the decoded text is inserted at point of the
6265 buffer. If the buffer is the same as SRC_OBJECT, the source text
6268 If it is Qt, a string is made from the decoded text, and
6269 set in CODING->dst_object.
6271 If it is Qnil, the decoded text is stored at CODING->destination.
6272 The caller must allocate CODING->dst_bytes bytes at
6273 CODING->destination by xmalloc. If the decoded text is longer than
6274 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6278 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6280 struct coding_system
*coding
;
6281 Lisp_Object src_object
;
6282 EMACS_INT from
, from_byte
, to
, to_byte
;
6283 Lisp_Object dst_object
;
6285 int count
= specpdl_ptr
- specpdl
;
6286 unsigned char *destination
;
6287 EMACS_INT dst_bytes
;
6288 EMACS_INT chars
= to
- from
;
6289 EMACS_INT bytes
= to_byte
- from_byte
;
6292 int saved_pt
= -1, saved_pt_byte
;
6294 buffer
= Fcurrent_buffer ();
6296 if (NILP (dst_object
))
6298 destination
= coding
->destination
;
6299 dst_bytes
= coding
->dst_bytes
;
6302 coding
->src_object
= src_object
;
6303 coding
->src_chars
= chars
;
6304 coding
->src_bytes
= bytes
;
6305 coding
->src_multibyte
= chars
< bytes
;
6307 if (STRINGP (src_object
))
6309 coding
->src_pos
= from
;
6310 coding
->src_pos_byte
= from_byte
;
6312 else if (BUFFERP (src_object
))
6314 set_buffer_internal (XBUFFER (src_object
));
6316 move_gap_both (from
, from_byte
);
6317 if (EQ (src_object
, dst_object
))
6319 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6320 TEMP_SET_PT_BOTH (from
, from_byte
);
6321 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6322 coding
->src_pos
= -chars
;
6323 coding
->src_pos_byte
= -bytes
;
6327 coding
->src_pos
= from
;
6328 coding
->src_pos_byte
= from_byte
;
6332 if (CODING_REQUIRE_DETECTION (coding
))
6333 detect_coding (coding
);
6334 attrs
= CODING_ID_ATTRS (coding
->id
);
6336 if (EQ (dst_object
, Qt
)
6337 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6338 && NILP (dst_object
)))
6340 coding
->dst_object
= code_conversion_save (1, 1);
6341 coding
->dst_pos
= BEG
;
6342 coding
->dst_pos_byte
= BEG_BYTE
;
6343 coding
->dst_multibyte
= 1;
6345 else if (BUFFERP (dst_object
))
6347 code_conversion_save (0, 0);
6348 coding
->dst_object
= dst_object
;
6349 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6350 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6351 coding
->dst_multibyte
6352 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6356 code_conversion_save (0, 0);
6357 coding
->dst_object
= Qnil
;
6358 coding
->dst_multibyte
= 1;
6361 decode_coding (coding
);
6363 if (BUFFERP (coding
->dst_object
))
6364 set_buffer_internal (XBUFFER (coding
->dst_object
));
6366 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6368 struct gcpro gcpro1
, gcpro2
;
6369 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6372 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6373 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6374 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6375 make_number (coding
->produced_char
));
6378 coding
->produced_char
+= Z
- prev_Z
;
6379 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6382 if (EQ (dst_object
, Qt
))
6384 coding
->dst_object
= Fbuffer_string ();
6386 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6388 set_buffer_internal (XBUFFER (coding
->dst_object
));
6389 if (dst_bytes
< coding
->produced
)
6392 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6395 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6396 unbind_to (count
, Qnil
);
6399 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6400 move_gap_both (BEGV
, BEGV_BYTE
);
6401 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6402 coding
->destination
= destination
;
6408 /* This is the case of:
6409 (BUFFERP (src_object) && EQ (src_object, dst_object))
6410 As we have moved PT while replacing the original buffer
6411 contents, we must recover it now. */
6412 set_buffer_internal (XBUFFER (src_object
));
6413 if (saved_pt
< from
)
6414 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6415 else if (saved_pt
< from
+ chars
)
6416 TEMP_SET_PT_BOTH (from
, from_byte
);
6417 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6418 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6419 saved_pt_byte
+ (coding
->produced
- bytes
));
6421 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6422 saved_pt_byte
+ (coding
->produced
- bytes
));
6425 unbind_to (count
, Qnil
);
6430 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6432 struct coding_system
*coding
;
6433 Lisp_Object src_object
;
6434 EMACS_INT from
, from_byte
, to
, to_byte
;
6435 Lisp_Object dst_object
;
6437 int count
= specpdl_ptr
- specpdl
;
6438 EMACS_INT chars
= to
- from
;
6439 EMACS_INT bytes
= to_byte
- from_byte
;
6442 int saved_pt
= -1, saved_pt_byte
;
6444 buffer
= Fcurrent_buffer ();
6446 coding
->src_object
= src_object
;
6447 coding
->src_chars
= chars
;
6448 coding
->src_bytes
= bytes
;
6449 coding
->src_multibyte
= chars
< bytes
;
6451 attrs
= CODING_ID_ATTRS (coding
->id
);
6453 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6455 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
6456 set_buffer_internal (XBUFFER (coding
->src_object
));
6457 if (STRINGP (src_object
))
6458 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6459 else if (BUFFERP (src_object
))
6460 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6462 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6464 if (EQ (src_object
, dst_object
))
6466 set_buffer_internal (XBUFFER (src_object
));
6467 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6468 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6469 set_buffer_internal (XBUFFER (coding
->src_object
));
6472 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6473 make_number (BEG
), make_number (Z
));
6474 coding
->src_object
= Fcurrent_buffer ();
6476 move_gap_both (BEG
, BEG_BYTE
);
6477 coding
->src_chars
= Z
- BEG
;
6478 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6479 coding
->src_pos
= BEG
;
6480 coding
->src_pos_byte
= BEG_BYTE
;
6481 coding
->src_multibyte
= Z
< Z_BYTE
;
6483 else if (STRINGP (src_object
))
6485 code_conversion_save (0, 0);
6486 coding
->src_pos
= from
;
6487 coding
->src_pos_byte
= from_byte
;
6489 else if (BUFFERP (src_object
))
6491 code_conversion_save (0, 0);
6492 set_buffer_internal (XBUFFER (src_object
));
6493 if (EQ (src_object
, dst_object
))
6495 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6496 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6497 coding
->src_pos
= 0;
6498 coding
->src_pos_byte
= 0;
6502 if (from
< GPT
&& to
>= GPT
)
6503 move_gap_both (from
, from_byte
);
6504 coding
->src_pos
= from
;
6505 coding
->src_pos_byte
= from_byte
;
6509 code_conversion_save (0, 0);
6511 if (BUFFERP (dst_object
))
6513 coding
->dst_object
= dst_object
;
6514 if (EQ (src_object
, dst_object
))
6516 coding
->dst_pos
= from
;
6517 coding
->dst_pos_byte
= from_byte
;
6521 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6522 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6524 coding
->dst_multibyte
6525 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6527 else if (EQ (dst_object
, Qt
))
6529 coding
->dst_object
= Qnil
;
6530 coding
->dst_bytes
= coding
->src_chars
;
6531 if (coding
->dst_bytes
== 0)
6532 coding
->dst_bytes
= 1;
6533 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6534 coding
->dst_multibyte
= 0;
6538 coding
->dst_object
= Qnil
;
6539 coding
->dst_multibyte
= 0;
6542 encode_coding (coding
);
6544 if (EQ (dst_object
, Qt
))
6546 if (BUFFERP (coding
->dst_object
))
6547 coding
->dst_object
= Fbuffer_string ();
6551 = make_unibyte_string ((char *) coding
->destination
,
6553 xfree (coding
->destination
);
6559 /* This is the case of:
6560 (BUFFERP (src_object) && EQ (src_object, dst_object))
6561 As we have moved PT while replacing the original buffer
6562 contents, we must recover it now. */
6563 set_buffer_internal (XBUFFER (src_object
));
6564 if (saved_pt
< from
)
6565 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6566 else if (saved_pt
< from
+ chars
)
6567 TEMP_SET_PT_BOTH (from
, from_byte
);
6568 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6569 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6570 saved_pt_byte
+ (coding
->produced
- bytes
));
6572 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6573 saved_pt_byte
+ (coding
->produced
- bytes
));
6576 unbind_to (count
, Qnil
);
6581 preferred_coding_system ()
6583 int id
= coding_categories
[coding_priorities
[0]].id
;
6585 return CODING_ID_NAME (id
);
/*** 11. Emacs Lisp library functions ***/
6592 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6593 doc
: /* Return t if OBJECT is nil or a coding-system.
6594 See the documentation of `define-coding-system' for information
6595 about coding-system objects. */)
6599 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6602 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6603 Sread_non_nil_coding_system
, 1, 1, 0,
6604 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6611 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6612 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6614 while (SCHARS (val
) == 0);
6615 return (Fintern (val
, Qnil
));
6618 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6619 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6620 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6621 (prompt
, default_coding_system
)
6622 Lisp_Object prompt
, default_coding_system
;
6625 if (SYMBOLP (default_coding_system
))
6626 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
6627 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6628 Qt
, Qnil
, Qcoding_system_history
,
6629 default_coding_system
, Qnil
);
6630 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
6633 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6635 doc
: /* Check validity of CODING-SYSTEM.
6636 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6638 Lisp_Object coding_system
;
6640 CHECK_SYMBOL (coding_system
);
6641 if (!NILP (Fcoding_system_p (coding_system
)))
6642 return coding_system
;
6644 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6648 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6649 HIGHEST is nonzero, return the coding system of the highest
6650 priority among the detected coding systems. Otherwize return a
6651 list of detected coding systems sorted by their priorities. If
6652 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6653 multibyte form but contains only ASCII and eight-bit chars.
6654 Otherwise, the bytes are raw bytes.
6656 CODING-SYSTEM controls the detection as below:
6658 If it is nil, detect both text-format and eol-format. If the
6659 text-format part of CODING-SYSTEM is already specified
6660 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6661 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6662 detect only text-format. */
6665 detect_coding_system (src
, src_chars
, src_bytes
, highest
, multibytep
,
6667 const unsigned char *src
;
6668 int src_chars
, src_bytes
, highest
;
6670 Lisp_Object coding_system
;
6672 const unsigned char *src_end
= src
+ src_bytes
;
6673 Lisp_Object attrs
, eol_type
;
6675 struct coding_system coding
;
6677 struct coding_detection_info detect_info
;
6678 enum coding_category base_category
;
6680 if (NILP (coding_system
))
6681 coding_system
= Qundecided
;
6682 setup_coding_system (coding_system
, &coding
);
6683 attrs
= CODING_ID_ATTRS (coding
.id
);
6684 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6685 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6687 coding
.source
= src
;
6688 coding
.src_chars
= src_chars
;
6689 coding
.src_bytes
= src_bytes
;
6690 coding
.src_multibyte
= multibytep
;
6691 coding
.consumed
= 0;
6692 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6694 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6696 /* At first, detect text-format if necessary. */
6697 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6698 if (base_category
== coding_category_undecided
)
6700 enum coding_category category
;
6701 struct coding_system
*this;
6704 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6705 for (i
= 0; src
< src_end
; i
++, src
++)
6708 if (c
& 0x80 || (c
< 0x20 && (c
== 0
6709 || c
== ISO_CODE_ESC
6711 || c
== ISO_CODE_SO
)))
6714 /* Skipped bytes must be even for utf-16 detecor. */
6717 coding
.head_ascii
= src
- coding
.source
;
6720 for (i
= 0; i
< coding_category_raw_text
; i
++)
6722 category
= coding_priorities
[i
];
6723 this = coding_categories
+ category
;
6727 /* No coding system of this category is defined. */
6728 detect_info
.rejected
|= (1 << category
);
6730 else if (category
>= coding_category_raw_text
)
6732 else if (detect_info
.checked
& (1 << category
))
6735 && (detect_info
.found
& (1 << category
)))
6740 if ((*(this->detector
)) (&coding
, &detect_info
)
6742 && (detect_info
.found
& (1 << category
)))
6744 if (category
== coding_category_utf_16_auto
)
6746 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6747 category
= coding_category_utf_16_le
;
6749 category
= coding_category_utf_16_be
;
6756 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6758 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6759 id
= coding_categories
[coding_category_raw_text
].id
;
6760 val
= Fcons (make_number (id
), Qnil
);
6762 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6764 detect_info
.found
= CATEGORY_MASK_ANY
;
6765 id
= coding_categories
[coding_category_undecided
].id
;
6766 val
= Fcons (make_number (id
), Qnil
);
6770 if (detect_info
.found
)
6772 detect_info
.found
= 1 << category
;
6773 val
= Fcons (make_number (this->id
), Qnil
);
6776 for (i
= 0; i
< coding_category_raw_text
; i
++)
6777 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6779 detect_info
.found
= 1 << coding_priorities
[i
];
6780 id
= coding_categories
[coding_priorities
[i
]].id
;
6781 val
= Fcons (make_number (id
), Qnil
);
6787 int mask
= detect_info
.rejected
| detect_info
.found
;
6791 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6793 category
= coding_priorities
[i
];
6794 if (! (mask
& (1 << category
)))
6796 found
|= 1 << category
;
6797 id
= coding_categories
[category
].id
;
6798 val
= Fcons (make_number (id
), val
);
6801 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6803 category
= coding_priorities
[i
];
6804 if (detect_info
.found
& (1 << category
))
6806 id
= coding_categories
[category
].id
;
6807 val
= Fcons (make_number (id
), val
);
6810 detect_info
.found
|= found
;
6813 else if (base_category
== coding_category_utf_16_auto
)
6815 if (detect_coding_utf_16 (&coding
, &detect_info
))
6817 enum coding_category category
;
6818 struct coding_system
*this;
6820 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6821 this = coding_categories
+ coding_category_utf_16_le
;
6822 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6823 this = coding_categories
+ coding_category_utf_16_be
;
6824 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
6825 this = coding_categories
+ coding_category_utf_16_be_nosig
;
6827 this = coding_categories
+ coding_category_utf_16_le_nosig
;
6828 val
= Fcons (make_number (this->id
), Qnil
);
6833 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6834 val
= Fcons (make_number (coding
.id
), Qnil
);
6837 /* Then, detect eol-format if necessary. */
6839 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6842 if (VECTORP (eol_type
))
6844 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6845 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6846 coding_category_raw_text
);
6847 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6848 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6849 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6850 coding_category_utf_16_be
);
6851 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6852 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6853 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6854 coding_category_utf_16_le
);
6858 if (EQ (eol_type
, Qunix
))
6859 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6860 else if (EQ (eol_type
, Qdos
))
6861 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6863 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6866 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6868 enum coding_category category
;
6871 id
= XINT (XCAR (tail
));
6872 attrs
= CODING_ID_ATTRS (id
);
6873 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6874 eol_type
= CODING_ID_EOL_TYPE (id
);
6875 if (VECTORP (eol_type
))
6877 if (category
== coding_category_utf_16_be
6878 || category
== coding_category_utf_16_be_nosig
)
6879 this_eol
= utf_16_be_eol
;
6880 else if (category
== coding_category_utf_16_le
6881 || category
== coding_category_utf_16_le_nosig
)
6882 this_eol
= utf_16_le_eol
;
6884 this_eol
= normal_eol
;
6886 if (this_eol
== EOL_SEEN_LF
)
6887 XSETCAR (tail
, AREF (eol_type
, 0));
6888 else if (this_eol
== EOL_SEEN_CRLF
)
6889 XSETCAR (tail
, AREF (eol_type
, 1));
6890 else if (this_eol
== EOL_SEEN_CR
)
6891 XSETCAR (tail
, AREF (eol_type
, 2));
6893 XSETCAR (tail
, CODING_ID_NAME (id
));
6896 XSETCAR (tail
, CODING_ID_NAME (id
));
6900 return (highest
? XCAR (val
) : val
);
6904 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6906 doc
: /* Detect coding system of the text in the region between START and END.
6907 Return a list of possible coding systems ordered by priority.
6909 If only ASCII characters are found, it returns a list of single element
6910 `undecided' or its subsidiary coding system according to a detected
6913 If optional argument HIGHEST is non-nil, return the coding system of
6914 highest priority. */)
6915 (start
, end
, highest
)
6916 Lisp_Object start
, end
, highest
;
6919 int from_byte
, to_byte
;
6921 CHECK_NUMBER_COERCE_MARKER (start
);
6922 CHECK_NUMBER_COERCE_MARKER (end
);
6924 validate_region (&start
, &end
);
6925 from
= XINT (start
), to
= XINT (end
);
6926 from_byte
= CHAR_TO_BYTE (from
);
6927 to_byte
= CHAR_TO_BYTE (to
);
6929 if (from
< GPT
&& to
>= GPT
)
6930 move_gap_both (to
, to_byte
);
6932 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6933 to
- from
, to_byte
- from_byte
,
6935 !NILP (current_buffer
6936 ->enable_multibyte_characters
),
6940 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6942 doc
: /* Detect coding system of the text in STRING.
6943 Return a list of possible coding systems ordered by priority.
6945 If only ASCII characters are found, it returns a list of single element
6946 `undecided' or its subsidiary coding system according to a detected
6949 If optional argument HIGHEST is non-nil, return the coding system of
6950 highest priority. */)
6952 Lisp_Object string
, highest
;
6954 CHECK_STRING (string
);
6956 return detect_coding_system (SDATA (string
),
6957 SCHARS (string
), SBYTES (string
),
6958 !NILP (highest
), STRING_MULTIBYTE (string
),
6964 char_encodable_p (c
, attrs
)
6969 struct charset
*charset
;
6971 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6972 CONSP (tail
); tail
= XCDR (tail
))
6974 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6975 if (CHAR_CHARSET_P (c
, charset
))
6978 return (! NILP (tail
));
6982 /* Return a list of coding systems that safely encode the text between
6983 START and END. If EXCLUDE is non-nil, it is a list of coding
6984 systems not to check. The returned list doesn't contain any such
6985 coding systems. In any case, if the text contains only ASCII or is
6986 unibyte, return t. */
6988 DEFUN ("find-coding-systems-region-internal",
6989 Ffind_coding_systems_region_internal
,
6990 Sfind_coding_systems_region_internal
, 2, 3, 0,
6991 doc
: /* Internal use only. */)
6992 (start
, end
, exclude
)
6993 Lisp_Object start
, end
, exclude
;
6995 Lisp_Object coding_attrs_list
, safe_codings
;
6996 EMACS_INT start_byte
, end_byte
;
6997 const unsigned char *p
, *pbeg
, *pend
;
6999 Lisp_Object tail
, elt
;
7001 if (STRINGP (start
))
7003 if (!STRING_MULTIBYTE (start
)
7004 || SCHARS (start
) == SBYTES (start
))
7007 end_byte
= SBYTES (start
);
7011 CHECK_NUMBER_COERCE_MARKER (start
);
7012 CHECK_NUMBER_COERCE_MARKER (end
);
7013 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7014 args_out_of_range (start
, end
);
7015 if (NILP (current_buffer
->enable_multibyte_characters
))
7017 start_byte
= CHAR_TO_BYTE (XINT (start
));
7018 end_byte
= CHAR_TO_BYTE (XINT (end
));
7019 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7022 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7024 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7025 move_gap_both (XINT (start
), start_byte
);
7027 move_gap_both (XINT (end
), end_byte
);
7031 coding_attrs_list
= Qnil
;
7032 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7034 || NILP (Fmemq (XCAR (tail
), exclude
)))
7038 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7039 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7040 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7041 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7044 if (STRINGP (start
))
7045 p
= pbeg
= SDATA (start
);
7047 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7048 pend
= p
+ (end_byte
- start_byte
);
7050 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7051 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7055 if (ASCII_BYTE_P (*p
))
7059 c
= STRING_CHAR_ADVANCE (p
);
7061 charset_map_loaded
= 0;
7062 for (tail
= coding_attrs_list
; CONSP (tail
);)
7067 else if (char_encodable_p (c
, elt
))
7069 else if (CONSP (XCDR (tail
)))
7071 XSETCAR (tail
, XCAR (XCDR (tail
)));
7072 XSETCDR (tail
, XCDR (XCDR (tail
)));
7076 XSETCAR (tail
, Qnil
);
7080 if (charset_map_loaded
)
7082 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7084 if (STRINGP (start
))
7085 pbeg
= SDATA (start
);
7087 pbeg
= BYTE_POS_ADDR (start_byte
);
7088 p
= pbeg
+ p_offset
;
7089 pend
= pbeg
+ pend_offset
;
7094 safe_codings
= Qnil
;
7095 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7096 if (! NILP (XCAR (tail
)))
7097 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7099 return safe_codings
;
7103 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7104 Sunencodable_char_position
, 3, 5, 0,
7106 Return position of first un-encodable character in a region.
7107 START and END specfiy the region and CODING-SYSTEM specifies the
7108 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7110 If optional 4th argument COUNT is non-nil, it specifies at most how
7111 many un-encodable characters to search. In this case, the value is a
7114 If optional 5th argument STRING is non-nil, it is a string to search
7115 for un-encodable characters. In that case, START and END are indexes
7117 (start
, end
, coding_system
, count
, string
)
7118 Lisp_Object start
, end
, coding_system
, count
, string
;
7121 struct coding_system coding
;
7122 Lisp_Object attrs
, charset_list
;
7123 Lisp_Object positions
;
7125 const unsigned char *p
, *stop
, *pend
;
7126 int ascii_compatible
;
7128 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7129 attrs
= CODING_ID_ATTRS (coding
.id
);
7130 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7132 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7133 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7137 validate_region (&start
, &end
);
7138 from
= XINT (start
);
7140 if (NILP (current_buffer
->enable_multibyte_characters
)
7141 || (ascii_compatible
7142 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7144 p
= CHAR_POS_ADDR (from
);
7145 pend
= CHAR_POS_ADDR (to
);
7146 if (from
< GPT
&& to
>= GPT
)
7153 CHECK_STRING (string
);
7154 CHECK_NATNUM (start
);
7156 from
= XINT (start
);
7159 || to
> SCHARS (string
))
7160 args_out_of_range_3 (string
, start
, end
);
7161 if (! STRING_MULTIBYTE (string
))
7163 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7164 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7165 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7173 CHECK_NATNUM (count
);
7182 if (ascii_compatible
)
7183 while (p
< stop
&& ASCII_BYTE_P (*p
))
7193 c
= STRING_CHAR_ADVANCE (p
);
7194 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7195 && ! char_charset (c
, charset_list
, NULL
))
7197 positions
= Fcons (make_number (from
), positions
);
7206 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
7210 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7211 Scheck_coding_systems_region
, 3, 3, 0,
7212 doc
: /* Check if the region is encodable by coding systems.
7214 START and END are buffer positions specifying the region.
7215 CODING-SYSTEM-LIST is a list of coding systems to check.
7217 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7218 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7219 whole region, POS0, POS1, ... are buffer positions where non-encodable
7220 characters are found.
7222 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7225 START may be a string. In that case, check if the string is
7226 encodable, and the value contains indices to the string instead of
7227 buffer positions. END is ignored. */)
7228 (start
, end
, coding_system_list
)
7229 Lisp_Object start
, end
, coding_system_list
;
7232 EMACS_INT start_byte
, end_byte
;
7234 const unsigned char *p
, *pbeg
, *pend
;
7236 Lisp_Object tail
, elt
;
7238 if (STRINGP (start
))
7240 if (!STRING_MULTIBYTE (start
)
7241 && SCHARS (start
) != SBYTES (start
))
7244 end_byte
= SBYTES (start
);
7249 CHECK_NUMBER_COERCE_MARKER (start
);
7250 CHECK_NUMBER_COERCE_MARKER (end
);
7251 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7252 args_out_of_range (start
, end
);
7253 if (NILP (current_buffer
->enable_multibyte_characters
))
7255 start_byte
= CHAR_TO_BYTE (XINT (start
));
7256 end_byte
= CHAR_TO_BYTE (XINT (end
));
7257 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7260 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7262 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7263 move_gap_both (XINT (start
), start_byte
);
7265 move_gap_both (XINT (end
), end_byte
);
7271 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7274 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7279 if (STRINGP (start
))
7280 p
= pbeg
= SDATA (start
);
7282 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7283 pend
= p
+ (end_byte
- start_byte
);
7285 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7286 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7290 if (ASCII_BYTE_P (*p
))
7294 c
= STRING_CHAR_ADVANCE (p
);
7296 charset_map_loaded
= 0;
7297 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7299 elt
= XCDR (XCAR (tail
));
7300 if (! char_encodable_p (c
, XCAR (elt
)))
7301 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7303 if (charset_map_loaded
)
7305 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7307 if (STRINGP (start
))
7308 pbeg
= SDATA (start
);
7310 pbeg
= BYTE_POS_ADDR (start_byte
);
7311 p
= pbeg
+ p_offset
;
7312 pend
= pbeg
+ pend_offset
;
7320 for (; CONSP (tail
); tail
= XCDR (tail
))
7323 if (CONSP (XCDR (XCDR (elt
))))
7324 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7334 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7335 Lisp_Object start
, end
, coding_system
, dst_object
;
7336 int encodep
, norecord
;
7338 struct coding_system coding
;
7339 EMACS_INT from
, from_byte
, to
, to_byte
;
7340 Lisp_Object src_object
;
7342 CHECK_NUMBER_COERCE_MARKER (start
);
7343 CHECK_NUMBER_COERCE_MARKER (end
);
7344 if (NILP (coding_system
))
7345 coding_system
= Qno_conversion
;
7347 CHECK_CODING_SYSTEM (coding_system
);
7348 src_object
= Fcurrent_buffer ();
7349 if (NILP (dst_object
))
7350 dst_object
= src_object
;
7351 else if (! EQ (dst_object
, Qt
))
7352 CHECK_BUFFER (dst_object
);
7354 validate_region (&start
, &end
);
7355 from
= XFASTINT (start
);
7356 from_byte
= CHAR_TO_BYTE (from
);
7357 to
= XFASTINT (end
);
7358 to_byte
= CHAR_TO_BYTE (to
);
7360 setup_coding_system (coding_system
, &coding
);
7361 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7364 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7367 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7370 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7372 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7373 error ("Code conversion error: %d", coding
.result
);
7375 return (BUFFERP (dst_object
)
7376 ? make_number (coding
.produced_char
)
7377 : coding
.dst_object
);
7381 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7382 3, 4, "r\nzCoding system: ",
7383 doc
: /* Decode the current region from the specified coding system.
7384 When called from a program, takes four arguments:
7385 START, END, CODING-SYSTEM, and DESTINATION.
7386 START and END are buffer positions.
7388 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7389 If nil, the region between START and END is replace by the decoded text.
7390 If buffer, the decoded text is inserted in the buffer.
7391 If t, the decoded text is returned.
7393 This function sets `last-coding-system-used' to the precise coding system
7394 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7395 not fully specified.)
7396 It returns the length of the decoded text. */)
7397 (start
, end
, coding_system
, destination
)
7398 Lisp_Object start
, end
, coding_system
, destination
;
7400 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7403 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7404 3, 4, "r\nzCoding system: ",
7405 doc
: /* Encode the current region by specified coding system.
7406 When called from a program, takes three arguments:
7407 START, END, and CODING-SYSTEM. START and END are buffer positions.
7409 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7410 If nil, the region between START and END is replace by the encoded text.
7411 If buffer, the encoded text is inserted in the buffer.
7412 If t, the encoded text is returned.
7414 This function sets `last-coding-system-used' to the precise coding system
7415 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7416 not fully specified.)
7417 It returns the length of the encoded text. */)
7418 (start
, end
, coding_system
, destination
)
7419 Lisp_Object start
, end
, coding_system
, destination
;
7421 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7425 code_convert_string (string
, coding_system
, dst_object
,
7426 encodep
, nocopy
, norecord
)
7427 Lisp_Object string
, coding_system
, dst_object
;
7428 int encodep
, nocopy
, norecord
;
7430 struct coding_system coding
;
7431 EMACS_INT chars
, bytes
;
7433 CHECK_STRING (string
);
7434 if (NILP (coding_system
))
7437 Vlast_coding_system_used
= Qno_conversion
;
7438 if (NILP (dst_object
))
7439 return (nocopy
? Fcopy_sequence (string
) : string
);
7442 if (NILP (coding_system
))
7443 coding_system
= Qno_conversion
;
7445 CHECK_CODING_SYSTEM (coding_system
);
7446 if (NILP (dst_object
))
7448 else if (! EQ (dst_object
, Qt
))
7449 CHECK_BUFFER (dst_object
);
7451 setup_coding_system (coding_system
, &coding
);
7452 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7453 chars
= SCHARS (string
);
7454 bytes
= SBYTES (string
);
7456 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7458 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7460 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7462 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7463 error ("Code conversion error: %d", coding
.result
);
7465 return (BUFFERP (dst_object
)
7466 ? make_number (coding
.produced_char
)
7467 : coding
.dst_object
);
7471 /* Encode or decode STRING according to CODING_SYSTEM.
7472 Do not set Vlast_coding_system_used.
7474 This function is called only from macros DECODE_FILE and
7475 ENCODE_FILE, thus we ignore character composition. */
7478 code_convert_string_norecord (string
, coding_system
, encodep
)
7479 Lisp_Object string
, coding_system
;
7482 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7486 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7488 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7490 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7491 if the decoding operation is trivial.
7493 Optional fourth arg BUFFER non-nil meant that the decoded text is
7494 inserted in BUFFER instead of returned as a string. In this case,
7495 the return value is BUFFER.
7497 This function sets `last-coding-system-used' to the precise coding system
7498 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7499 not fully specified. */)
7500 (string
, coding_system
, nocopy
, buffer
)
7501 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7503 return code_convert_string (string
, coding_system
, buffer
,
7504 0, ! NILP (nocopy
), 0);
7507 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7509 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7511 Optional third arg NOCOPY non-nil means it is OK to return STRING
7512 itself if the encoding operation is trivial.
7514 Optional fourth arg BUFFER non-nil meant that the encoded text is
7515 inserted in BUFFER instead of returned as a string. In this case,
7516 the return value is BUFFER.
7518 This function sets `last-coding-system-used' to the precise coding system
7519 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7520 not fully specified.) */)
7521 (string
, coding_system
, nocopy
, buffer
)
7522 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7524 return code_convert_string (string
, coding_system
, buffer
,
7525 1, ! NILP (nocopy
), 1);
7529 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7530 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7531 Return the corresponding character. */)
7535 Lisp_Object spec
, attrs
, val
;
7536 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7539 CHECK_NATNUM (code
);
7540 c
= XFASTINT (code
);
7541 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7542 attrs
= AREF (spec
, 0);
7544 if (ASCII_BYTE_P (c
)
7545 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7548 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7549 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7550 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7551 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7554 charset
= charset_roman
;
7555 else if (c
>= 0xA0 && c
< 0xDF)
7557 charset
= charset_kana
;
7562 int s1
= c
>> 8, s2
= c
& 0xFF;
7564 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7565 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7566 error ("Invalid code: %d", code
);
7568 charset
= charset_kanji
;
7570 c
= DECODE_CHAR (charset
, c
);
7572 error ("Invalid code: %d", code
);
7573 return make_number (c
);
7577 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7578 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7579 Return the corresponding code in SJIS. */)
7583 Lisp_Object spec
, attrs
, charset_list
;
7585 struct charset
*charset
;
7588 CHECK_CHARACTER (ch
);
7590 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7591 attrs
= AREF (spec
, 0);
7593 if (ASCII_CHAR_P (c
)
7594 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7597 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7598 charset
= char_charset (c
, charset_list
, &code
);
7599 if (code
== CHARSET_INVALID_CODE (charset
))
7600 error ("Can't encode by shift_jis encoding: %d", c
);
7603 return make_number (code
);
7606 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7607 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7608 Return the corresponding character. */)
7612 Lisp_Object spec
, attrs
, val
;
7613 struct charset
*charset_roman
, *charset_big5
, *charset
;
7616 CHECK_NATNUM (code
);
7617 c
= XFASTINT (code
);
7618 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7619 attrs
= AREF (spec
, 0);
7621 if (ASCII_BYTE_P (c
)
7622 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7625 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7626 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7627 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7630 charset
= charset_roman
;
7633 int b1
= c
>> 8, b2
= c
& 0x7F;
7634 if (b1
< 0xA1 || b1
> 0xFE
7635 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7636 error ("Invalid code: %d", code
);
7637 charset
= charset_big5
;
7639 c
= DECODE_CHAR (charset
, (unsigned )c
);
7641 error ("Invalid code: %d", code
);
7642 return make_number (c
);
7645 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7646 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7647 Return the corresponding character code in Big5. */)
7651 Lisp_Object spec
, attrs
, charset_list
;
7652 struct charset
*charset
;
7656 CHECK_CHARACTER (ch
);
7658 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7659 attrs
= AREF (spec
, 0);
7660 if (ASCII_CHAR_P (c
)
7661 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7664 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7665 charset
= char_charset (c
, charset_list
, &code
);
7666 if (code
== CHARSET_INVALID_CODE (charset
))
7667 error ("Can't encode by Big5 encoding: %d", c
);
7669 return make_number (code
);
7673 DEFUN ("set-terminal-coding-system-internal",
7674 Fset_terminal_coding_system_internal
,
7675 Sset_terminal_coding_system_internal
, 1, 1, 0,
7676 doc
: /* Internal use only. */)
7678 Lisp_Object coding_system
;
7680 CHECK_SYMBOL (coding_system
);
7681 setup_coding_system (Fcheck_coding_system (coding_system
),
7684 /* We had better not send unsafe characters to terminal. */
7685 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7686 /* Character composition should be disabled. */
7687 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7688 terminal_coding
.src_multibyte
= 1;
7689 terminal_coding
.dst_multibyte
= 0;
7693 DEFUN ("set-safe-terminal-coding-system-internal",
7694 Fset_safe_terminal_coding_system_internal
,
7695 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7696 doc
: /* Internal use only. */)
7698 Lisp_Object coding_system
;
7700 CHECK_SYMBOL (coding_system
);
7701 setup_coding_system (Fcheck_coding_system (coding_system
),
7702 &safe_terminal_coding
);
7703 /* Character composition should be disabled. */
7704 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7705 safe_terminal_coding
.src_multibyte
= 1;
7706 safe_terminal_coding
.dst_multibyte
= 0;
7710 DEFUN ("terminal-coding-system",
7711 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7712 doc
: /* Return coding system specified for terminal output. */)
7715 return CODING_ID_NAME (terminal_coding
.id
);
7718 DEFUN ("set-keyboard-coding-system-internal",
7719 Fset_keyboard_coding_system_internal
,
7720 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7721 doc
: /* Internal use only. */)
7723 Lisp_Object coding_system
;
7725 CHECK_SYMBOL (coding_system
);
7726 setup_coding_system (Fcheck_coding_system (coding_system
),
7728 /* Character composition should be disabled. */
7729 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7733 DEFUN ("keyboard-coding-system",
7734 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7735 doc
: /* Return coding system specified for decoding keyboard input. */)
7738 return CODING_ID_NAME (keyboard_coding
.id
);
7742 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7743 Sfind_operation_coding_system
, 1, MANY
, 0,
7744 doc
: /* Choose a coding system for an operation based on the target name.
7745 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7746 DECODING-SYSTEM is the coding system to use for decoding
7747 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7748 for encoding (in case OPERATION does encoding).
7750 The first argument OPERATION specifies an I/O primitive:
7751 For file I/O, `insert-file-contents' or `write-region'.
7752 For process I/O, `call-process', `call-process-region', or `start-process'.
7753 For network I/O, `open-network-stream'.
7755 The remaining arguments should be the same arguments that were passed
7756 to the primitive. Depending on which primitive, one of those arguments
7757 is selected as the TARGET. For example, if OPERATION does file I/O,
7758 whichever argument specifies the file name is TARGET.
7760 TARGET has a meaning which depends on OPERATION:
7761 For file I/O, TARGET is a file name.
7762 For process I/O, TARGET is a process name.
7763 For network I/O, TARGET is a service name or a port number
7765 This function looks up what specified for TARGET in,
7766 `file-coding-system-alist', `process-coding-system-alist',
7767 or `network-coding-system-alist' depending on OPERATION.
7768 They may specify a coding system, a cons of coding systems,
7769 or a function symbol to call.
7770 In the last case, we call the function with one argument,
7771 which is a list of all the arguments given to this function.
7773 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7778 Lisp_Object operation
, target_idx
, target
, val
;
7779 register Lisp_Object chain
;
7782 error ("Too few arguments");
7783 operation
= args
[0];
7784 if (!SYMBOLP (operation
)
7785 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7786 error ("Invalid first arguement");
7787 if (nargs
< 1 + XINT (target_idx
))
7788 error ("Too few arguments for operation: %s",
7789 SDATA (SYMBOL_NAME (operation
)));
7790 target
= args
[XINT (target_idx
) + 1];
7791 if (!(STRINGP (target
)
7792 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7793 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7795 chain
= ((EQ (operation
, Qinsert_file_contents
)
7796 || EQ (operation
, Qwrite_region
))
7797 ? Vfile_coding_system_alist
7798 : (EQ (operation
, Qopen_network_stream
)
7799 ? Vnetwork_coding_system_alist
7800 : Vprocess_coding_system_alist
));
7804 for (; CONSP (chain
); chain
= XCDR (chain
))
7810 && ((STRINGP (target
)
7811 && STRINGP (XCAR (elt
))
7812 && fast_string_match (XCAR (elt
), target
) >= 0)
7813 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7816 /* Here, if VAL is both a valid coding system and a valid
7817 function symbol, we return VAL as a coding system. */
7820 if (! SYMBOLP (val
))
7822 if (! NILP (Fcoding_system_p (val
)))
7823 return Fcons (val
, val
);
7824 if (! NILP (Ffboundp (val
)))
7826 val
= call1 (val
, Flist (nargs
, args
));
7829 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7830 return Fcons (val
, val
);
7838 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7839 Sset_coding_system_priority
, 0, MANY
, 0,
7840 doc
: /* Assign higher priority to the coding systems given as arguments.
7841 If multiple coding systems belong to the same category,
7842 all but the first one are ignored.
7844 usage: (set-coding-system-priority ...) */)
7850 int changed
[coding_category_max
];
7851 enum coding_category priorities
[coding_category_max
];
7853 bzero (changed
, sizeof changed
);
7855 for (i
= j
= 0; i
< nargs
; i
++)
7857 enum coding_category category
;
7858 Lisp_Object spec
, attrs
;
7860 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7861 attrs
= AREF (spec
, 0);
7862 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7863 if (changed
[category
])
7864 /* Ignore this coding system because a coding system of the
7865 same category already had a higher priority. */
7867 changed
[category
] = 1;
7868 priorities
[j
++] = category
;
7869 if (coding_categories
[category
].id
>= 0
7870 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7871 setup_coding_system (args
[i
], &coding_categories
[category
]);
7872 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7875 /* Now we have decided top J priorities. Reflect the order of the
7876 original priorities to the remaining priorities. */
7878 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7880 while (j
< coding_category_max
7881 && changed
[coding_priorities
[j
]])
7883 if (j
== coding_category_max
)
7885 priorities
[i
] = coding_priorities
[j
];
7888 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7890 /* Update `coding-category-list'. */
7891 Vcoding_category_list
= Qnil
;
7892 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7893 Vcoding_category_list
7894 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7895 Vcoding_category_list
);
7900 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7901 Scoding_system_priority_list
, 0, 1, 0,
7902 doc
: /* Return a list of coding systems ordered by their priorities.
7903 HIGHESTP non-nil means just return the highest priority one. */)
7905 Lisp_Object highestp
;
7910 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7912 enum coding_category category
= coding_priorities
[i
];
7913 int id
= coding_categories
[category
].id
;
7918 attrs
= CODING_ID_ATTRS (id
);
7919 if (! NILP (highestp
))
7920 return CODING_ATTR_BASE_NAME (attrs
);
7921 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7923 return Fnreverse (val
);
7926 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7929 make_subsidiaries (base
)
7932 Lisp_Object subsidiaries
;
7933 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
7934 char *buf
= (char *) alloca (base_name_len
+ 6);
7937 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
7938 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7939 for (i
= 0; i
< 3; i
++)
7941 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7942 ASET (subsidiaries
, i
, intern (buf
));
7944 return subsidiaries
;
7948 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7949 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7950 doc
: /* For internal use only.
7951 usage: (define-coding-system-internal ...) */)
7957 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
7958 Lisp_Object attrs
; /* Vector of attributes. */
7959 Lisp_Object eol_type
;
7960 Lisp_Object aliases
;
7961 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7962 enum coding_category category
;
7963 Lisp_Object tail
, val
;
7964 int max_charset_id
= 0;
7967 if (nargs
< coding_arg_max
)
7970 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7972 name
= args
[coding_arg_name
];
7973 CHECK_SYMBOL (name
);
7974 CODING_ATTR_BASE_NAME (attrs
) = name
;
7976 val
= args
[coding_arg_mnemonic
];
7977 if (! STRINGP (val
))
7978 CHECK_CHARACTER (val
);
7979 CODING_ATTR_MNEMONIC (attrs
) = val
;
7981 coding_type
= args
[coding_arg_coding_type
];
7982 CHECK_SYMBOL (coding_type
);
7983 CODING_ATTR_TYPE (attrs
) = coding_type
;
7985 charset_list
= args
[coding_arg_charset_list
];
7986 if (SYMBOLP (charset_list
))
7988 if (EQ (charset_list
, Qiso_2022
))
7990 if (! EQ (coding_type
, Qiso_2022
))
7991 error ("Invalid charset-list");
7992 charset_list
= Viso_2022_charset_list
;
7994 else if (EQ (charset_list
, Qemacs_mule
))
7996 if (! EQ (coding_type
, Qemacs_mule
))
7997 error ("Invalid charset-list");
7998 charset_list
= Vemacs_mule_charset_list
;
8000 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8001 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8002 max_charset_id
= XFASTINT (XCAR (tail
));
8006 charset_list
= Fcopy_sequence (charset_list
);
8007 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8009 struct charset
*charset
;
8012 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8013 if (EQ (coding_type
, Qiso_2022
)
8014 ? CHARSET_ISO_FINAL (charset
) < 0
8015 : EQ (coding_type
, Qemacs_mule
)
8016 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8018 error ("Can't handle charset `%s'",
8019 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8021 XSETCAR (tail
, make_number (charset
->id
));
8022 if (max_charset_id
< charset
->id
)
8023 max_charset_id
= charset
->id
;
8026 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
8028 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8030 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8031 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8032 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8034 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8036 val
= args
[coding_arg_decode_translation_table
];
8038 CHECK_CHAR_TABLE (val
);
8039 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8041 val
= args
[coding_arg_encode_translation_table
];
8043 CHECK_CHAR_TABLE (val
);
8044 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8046 val
= args
[coding_arg_post_read_conversion
];
8048 CODING_ATTR_POST_READ (attrs
) = val
;
8050 val
= args
[coding_arg_pre_write_conversion
];
8052 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8054 val
= args
[coding_arg_default_char
];
8056 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8059 CHECK_CHARACTER (val
);
8060 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
8063 val
= args
[coding_arg_for_unibyte
];
8064 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8066 val
= args
[coding_arg_plist
];
8068 CODING_ATTR_PLIST (attrs
) = val
;
8070 if (EQ (coding_type
, Qcharset
))
8072 /* Generate a lisp vector of 256 elements. Each element is nil,
8073 integer, or a list of charset IDs.
8075 If Nth element is nil, the byte code N is invalid in this
8078 If Nth element is a number NUM, N is the first byte of a
8079 charset whose ID is NUM.
8081 If Nth element is a list of charset IDs, N is the first byte
8082 of one of them. The list is sorted by dimensions of the
8083 charsets. A charset of smaller dimension comes first. */
8084 val
= Fmake_vector (make_number (256), Qnil
);
8086 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8088 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8089 int dim
= CHARSET_DIMENSION (charset
);
8090 int idx
= (dim
- 1) * 4;
8092 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8093 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8095 for (i
= charset
->code_space
[idx
];
8096 i
<= charset
->code_space
[idx
+ 1]; i
++)
8098 Lisp_Object tmp
, tmp2
;
8101 tmp
= AREF (val
, i
);
8104 else if (NUMBERP (tmp
))
8106 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8108 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8110 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8114 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8116 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8121 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8124 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8125 XSETCAR (tmp2
, XCAR (tail
));
8131 ASET (attrs
, coding_attr_charset_valids
, val
);
8132 category
= coding_category_charset
;
8134 else if (EQ (coding_type
, Qccl
))
8138 if (nargs
< coding_arg_ccl_max
)
8141 val
= args
[coding_arg_ccl_decoder
];
8142 CHECK_CCL_PROGRAM (val
);
8144 val
= Fcopy_sequence (val
);
8145 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8147 val
= args
[coding_arg_ccl_encoder
];
8148 CHECK_CCL_PROGRAM (val
);
8150 val
= Fcopy_sequence (val
);
8151 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8153 val
= args
[coding_arg_ccl_valids
];
8154 valids
= Fmake_string (make_number (256), make_number (0));
8155 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8162 from
= to
= XINT (val
);
8163 if (from
< 0 || from
> 255)
8164 args_out_of_range_3 (val
, make_number (0), make_number (255));
8169 CHECK_NATNUM_CAR (val
);
8170 CHECK_NATNUM_CDR (val
);
8171 from
= XINT (XCAR (val
));
8173 args_out_of_range_3 (XCAR (val
),
8174 make_number (0), make_number (255));
8175 to
= XINT (XCDR (val
));
8176 if (to
< from
|| to
> 255)
8177 args_out_of_range_3 (XCDR (val
),
8178 XCAR (val
), make_number (255));
8180 for (i
= from
; i
<= to
; i
++)
8181 SSET (valids
, i
, 1);
8183 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8185 category
= coding_category_ccl
;
8187 else if (EQ (coding_type
, Qutf_16
))
8189 Lisp_Object bom
, endian
;
8191 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8193 if (nargs
< coding_arg_utf16_max
)
8196 bom
= args
[coding_arg_utf16_bom
];
8197 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8201 CHECK_CODING_SYSTEM (val
);
8203 CHECK_CODING_SYSTEM (val
);
8205 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8207 endian
= args
[coding_arg_utf16_endian
];
8208 CHECK_SYMBOL (endian
);
8211 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8212 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8213 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8215 category
= (CONSP (bom
)
8216 ? coding_category_utf_16_auto
8218 ? (EQ (endian
, Qbig
)
8219 ? coding_category_utf_16_be_nosig
8220 : coding_category_utf_16_le_nosig
)
8221 : (EQ (endian
, Qbig
)
8222 ? coding_category_utf_16_be
8223 : coding_category_utf_16_le
));
8225 else if (EQ (coding_type
, Qiso_2022
))
8227 Lisp_Object initial
, reg_usage
, request
, flags
;
8230 if (nargs
< coding_arg_iso2022_max
)
8233 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8234 CHECK_VECTOR (initial
);
8235 for (i
= 0; i
< 4; i
++)
8237 val
= Faref (initial
, make_number (i
));
8240 struct charset
*charset
;
8242 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8243 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8244 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8245 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8248 ASET (initial
, i
, make_number (-1));
8251 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8252 CHECK_CONS (reg_usage
);
8253 CHECK_NUMBER_CAR (reg_usage
);
8254 CHECK_NUMBER_CDR (reg_usage
);
8256 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8257 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8265 CHECK_CHARSET_GET_ID (tmp
, id
);
8266 CHECK_NATNUM_CDR (val
);
8267 if (XINT (XCDR (val
)) >= 4)
8268 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8269 XSETCAR (val
, make_number (id
));
8272 flags
= args
[coding_arg_iso2022_flags
];
8273 CHECK_NATNUM (flags
);
8275 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8276 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8278 ASET (attrs
, coding_attr_iso_initial
, initial
);
8279 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8280 ASET (attrs
, coding_attr_iso_request
, request
);
8281 ASET (attrs
, coding_attr_iso_flags
, flags
);
8282 setup_iso_safe_charsets (attrs
);
8284 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8285 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8286 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8287 ? coding_category_iso_7_else
8288 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8289 ? coding_category_iso_7
8290 : coding_category_iso_7_tight
);
8293 int id
= XINT (AREF (initial
, 1));
8295 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8296 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8298 ? coding_category_iso_8_else
8299 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8300 ? coding_category_iso_8_1
8301 : coding_category_iso_8_2
);
8303 if (category
!= coding_category_iso_8_1
8304 && category
!= coding_category_iso_8_2
)
8305 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8307 else if (EQ (coding_type
, Qemacs_mule
))
8309 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8310 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8311 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8312 category
= coding_category_emacs_mule
;
8314 else if (EQ (coding_type
, Qshift_jis
))
8317 struct charset
*charset
;
8319 if (XINT (Flength (charset_list
)) != 3)
8320 error ("There should be just three charsets");
8322 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8323 if (CHARSET_DIMENSION (charset
) != 1)
8324 error ("Dimension of charset %s is not one",
8325 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8326 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8327 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8329 charset_list
= XCDR (charset_list
);
8330 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8331 if (CHARSET_DIMENSION (charset
) != 1)
8332 error ("Dimension of charset %s is not one",
8333 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8335 charset_list
= XCDR (charset_list
);
8336 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8337 if (CHARSET_DIMENSION (charset
) != 2)
8338 error ("Dimension of charset %s is not two",
8339 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8341 category
= coding_category_sjis
;
8342 Vsjis_coding_system
= name
;
8344 else if (EQ (coding_type
, Qbig5
))
8346 struct charset
*charset
;
8348 if (XINT (Flength (charset_list
)) != 2)
8349 error ("There should be just two charsets");
8351 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8352 if (CHARSET_DIMENSION (charset
) != 1)
8353 error ("Dimension of charset %s is not one",
8354 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8355 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8356 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8358 charset_list
= XCDR (charset_list
);
8359 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8360 if (CHARSET_DIMENSION (charset
) != 2)
8361 error ("Dimension of charset %s is not two",
8362 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8364 category
= coding_category_big5
;
8365 Vbig5_coding_system
= name
;
8367 else if (EQ (coding_type
, Qraw_text
))
8369 category
= coding_category_raw_text
;
8370 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8372 else if (EQ (coding_type
, Qutf_8
))
8374 category
= coding_category_utf_8
;
8375 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8377 else if (EQ (coding_type
, Qundecided
))
8378 category
= coding_category_undecided
;
8380 error ("Invalid coding system type: %s",
8381 SDATA (SYMBOL_NAME (coding_type
)));
8383 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8384 CODING_ATTR_PLIST (attrs
)
8385 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8386 CODING_ATTR_PLIST (attrs
)));
8388 eol_type
= args
[coding_arg_eol_type
];
8389 if (! NILP (eol_type
)
8390 && ! EQ (eol_type
, Qunix
)
8391 && ! EQ (eol_type
, Qdos
)
8392 && ! EQ (eol_type
, Qmac
))
8393 error ("Invalid eol-type");
8395 aliases
= Fcons (name
, Qnil
);
8397 if (NILP (eol_type
))
8399 eol_type
= make_subsidiaries (name
);
8400 for (i
= 0; i
< 3; i
++)
8402 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8404 this_name
= AREF (eol_type
, i
);
8405 this_aliases
= Fcons (this_name
, Qnil
);
8406 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8407 this_spec
= Fmake_vector (make_number (3), attrs
);
8408 ASET (this_spec
, 1, this_aliases
);
8409 ASET (this_spec
, 2, this_eol_type
);
8410 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8411 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8412 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8413 Vcoding_system_alist
);
8417 spec_vec
= Fmake_vector (make_number (3), attrs
);
8418 ASET (spec_vec
, 1, aliases
);
8419 ASET (spec_vec
, 2, eol_type
);
8421 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8422 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8423 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8424 Vcoding_system_alist
);
8427 int id
= coding_categories
[category
].id
;
8429 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8430 setup_coding_system (name
, &coding_categories
[category
]);
8436 return Fsignal (Qwrong_number_of_arguments
,
8437 Fcons (intern ("define-coding-system-internal"),
8438 make_number (nargs
)));
8442 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8443 Sdefine_coding_system_alias
, 2, 2, 0,
8444 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8445 (alias
, coding_system
)
8446 Lisp_Object alias
, coding_system
;
8448 Lisp_Object spec
, aliases
, eol_type
;
8450 CHECK_SYMBOL (alias
);
8451 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8452 aliases
= AREF (spec
, 1);
8453 /* ALIASES should be a list of length more than zero, and the first
8454 element is a base coding system. Append ALIAS at the tail of the
8456 while (!NILP (XCDR (aliases
)))
8457 aliases
= XCDR (aliases
);
8458 XSETCDR (aliases
, Fcons (alias
, Qnil
));
8460 eol_type
= AREF (spec
, 2);
8461 if (VECTORP (eol_type
))
8463 Lisp_Object subsidiaries
;
8466 subsidiaries
= make_subsidiaries (alias
);
8467 for (i
= 0; i
< 3; i
++)
8468 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8469 AREF (eol_type
, i
));
8472 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8473 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
8474 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8475 Vcoding_system_alist
);
8480 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8482 doc
: /* Return the base of CODING-SYSTEM.
8483 Any alias or subsidiary coding system is not a base coding system. */)
8485 Lisp_Object coding_system
;
8487 Lisp_Object spec
, attrs
;
8489 if (NILP (coding_system
))
8490 return (Qno_conversion
);
8491 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8492 attrs
= AREF (spec
, 0);
8493 return CODING_ATTR_BASE_NAME (attrs
);
8496 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8498 doc
: "Return the property list of CODING-SYSTEM.")
8500 Lisp_Object coding_system
;
8502 Lisp_Object spec
, attrs
;
8504 if (NILP (coding_system
))
8505 coding_system
= Qno_conversion
;
8506 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8507 attrs
= AREF (spec
, 0);
8508 return CODING_ATTR_PLIST (attrs
);
8512 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8514 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8516 Lisp_Object coding_system
;
8520 if (NILP (coding_system
))
8521 coding_system
= Qno_conversion
;
8522 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8523 return AREF (spec
, 1);
/* Lisp primitive `coding-system-eol-type': report the end-of-line format
   of CODING-SYSTEM, read from slot 2 of its spec vector.  */
8526 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8527 Scoding_system_eol_type
, 1, 1, 0,
8528 doc
: /* Return eol-type of CODING-SYSTEM.
8529 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8531 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8532 and CR respectively.
8534 A vector value indicates that a format of end-of-line should be
8535 detected automatically. Nth element of the vector is the subsidiary
8536 coding system whose eol-type is N. */)
8538 Lisp_Object coding_system
;
/* NOTE(review): `n' is assigned near the end of this function, but its
   declaration is not visible in this listing -- confirm against the
   full file.  */
8540 Lisp_Object spec
, eol_type
;
/* nil is shorthand for `no-conversion'.  */
8543 if (NILP (coding_system
))
8544 coding_system
= Qno_conversion
;
/* NOTE(review): the listing jumps from 8545 to 8547, so the statement
   guarded by this validity check is not shown.  As printed, the spec
   lookup on the next line would become the body of the `if'.  Confirm
   the intended early exit against the full file.  */
8545 if (! CODING_SYSTEM_P (coding_system
))
8547 spec
= CODING_SYSTEM_SPEC (coding_system
);
/* Slot 2 of the spec holds the eol-type.  */
8548 eol_type
= AREF (spec
, 2);
/* A vector eol-type is returned as a copy made by Fcopy_sequence, not as
   the stored vector itself.  */
8549 if (VECTORP (eol_type
))
8550 return Fcopy_sequence (eol_type
);
/* Otherwise map the eol-type symbol to its number: Qunix gives 0, Qdos
   gives 1, and any other value gives 2.  */
8551 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8552 return make_number (n
);
8558 /*** 9. Post-amble ***/
8565 for (i
= 0; i
< coding_category_max
; i
++)
8567 coding_categories
[i
].id
= -1;
8568 coding_priorities
[i
] = i
;
8571 /* ISO2022 specific initialize routine. */
8572 for (i
= 0; i
< 0x20; i
++)
8573 iso_code_class
[i
] = ISO_control_0
;
8574 for (i
= 0x21; i
< 0x7F; i
++)
8575 iso_code_class
[i
] = ISO_graphic_plane_0
;
8576 for (i
= 0x80; i
< 0xA0; i
++)
8577 iso_code_class
[i
] = ISO_control_1
;
8578 for (i
= 0xA1; i
< 0xFF; i
++)
8579 iso_code_class
[i
] = ISO_graphic_plane_1
;
8580 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8581 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8582 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8583 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8584 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8585 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8586 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8587 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8588 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8590 for (i
= 0; i
< 256; i
++)
8592 emacs_mule_bytes
[i
] = 1;
8594 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8595 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8596 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8597 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
/* Register the coding-system hash table with staticpro, then create it
   with Fmake_hash_table.  */
8605 staticpro (&Vcoding_system_hash_table
);
/* NOTE(review): the listing jumps from 8607 to 8610, so the statements
   that fill the two ARGS slots handed to Fmake_hash_table are not shown
   here -- confirm them against the full file.  */
8607 Lisp_Object args
[2];
8610 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
8613 staticpro (&Vsjis_coding_system
);
8614 Vsjis_coding_system
= Qnil
;
8616 staticpro (&Vbig5_coding_system
);
8617 Vbig5_coding_system
= Qnil
;
8619 staticpro (&Vcode_conversion_reused_workbuf
);
8620 Vcode_conversion_reused_workbuf
= Qnil
;
8622 staticpro (&Vcode_conversion_workbuf_name
);
8623 Vcode_conversion_workbuf_name
= build_string (" *code-conversion-work*");
8625 reused_workbuf_in_use
= 0;
8627 DEFSYM (Qcharset
, "charset");
8628 DEFSYM (Qtarget_idx
, "target-idx");
8629 DEFSYM (Qcoding_system_history
, "coding-system-history");
8630 Fset (Qcoding_system_history
, Qnil
);
8632 /* Target FILENAME is the first argument. */
8633 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8634 /* Target FILENAME is the third argument. */
8635 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8637 DEFSYM (Qcall_process
, "call-process");
8638 /* Target PROGRAM is the first argument. */
8639 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8641 DEFSYM (Qcall_process_region
, "call-process-region");
8642 /* Target PROGRAM is the third argument. */
8643 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8645 DEFSYM (Qstart_process
, "start-process");
8646 /* Target PROGRAM is the third argument. */
8647 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8649 DEFSYM (Qopen_network_stream
, "open-network-stream");
8650 /* Target SERVICE is the fourth argument. */
8651 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8653 DEFSYM (Qcoding_system
, "coding-system");
8654 DEFSYM (Qcoding_aliases
, "coding-aliases");
8656 DEFSYM (Qeol_type
, "eol-type");
8657 DEFSYM (Qunix
, "unix");
8658 DEFSYM (Qdos
, "dos");
8660 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8661 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8662 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8663 DEFSYM (Qdefault_char
, "default-char");
8664 DEFSYM (Qundecided
, "undecided");
8665 DEFSYM (Qno_conversion
, "no-conversion");
8666 DEFSYM (Qraw_text
, "raw-text");
8668 DEFSYM (Qiso_2022
, "iso-2022");
8670 DEFSYM (Qutf_8
, "utf-8");
8671 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
8673 DEFSYM (Qutf_16
, "utf-16");
8674 DEFSYM (Qbig
, "big");
8675 DEFSYM (Qlittle
, "little");
8677 DEFSYM (Qshift_jis
, "shift-jis");
8678 DEFSYM (Qbig5
, "big5");
8680 DEFSYM (Qcoding_system_p
, "coding-system-p");
8682 DEFSYM (Qcoding_system_error
, "coding-system-error");
8683 Fput (Qcoding_system_error
, Qerror_conditions
,
8684 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8685 Fput (Qcoding_system_error
, Qerror_message
,
8686 build_string ("Invalid coding system"));
8688 /* Intern this now in case it isn't already done.
8689 Setting this variable twice is harmless.
8690 But don't staticpro it here--that is done in alloc.c. */
8691 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8693 DEFSYM (Qtranslation_table
, "translation-table");
8694 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8695 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8696 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8697 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8699 DEFSYM (Qvalid_codes
, "valid-codes");
8701 DEFSYM (Qemacs_mule
, "emacs-mule");
8703 DEFSYM (QCcategory
, ":category");
8705 Vcoding_category_table
8706 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8707 staticpro (&Vcoding_category_table
);
8708 /* Followings are target of code detection. */
8709 ASET (Vcoding_category_table
, coding_category_iso_7
,
8710 intern ("coding-category-iso-7"));
8711 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8712 intern ("coding-category-iso-7-tight"));
8713 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8714 intern ("coding-category-iso-8-1"));
8715 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8716 intern ("coding-category-iso-8-2"));
8717 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8718 intern ("coding-category-iso-7-else"));
8719 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8720 intern ("coding-category-iso-8-else"));
8721 ASET (Vcoding_category_table
, coding_category_utf_8
,
8722 intern ("coding-category-utf-8"));
8723 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8724 intern ("coding-category-utf-16-be"));
8725 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8726 intern ("coding-category-utf-16-auto"));
8727 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8728 intern ("coding-category-utf-16-le"));
8729 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8730 intern ("coding-category-utf-16-be-nosig"));
8731 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8732 intern ("coding-category-utf-16-le-nosig"));
8733 ASET (Vcoding_category_table
, coding_category_charset
,
8734 intern ("coding-category-charset"));
8735 ASET (Vcoding_category_table
, coding_category_sjis
,
8736 intern ("coding-category-sjis"));
8737 ASET (Vcoding_category_table
, coding_category_big5
,
8738 intern ("coding-category-big5"));
8739 ASET (Vcoding_category_table
, coding_category_ccl
,
8740 intern ("coding-category-ccl"));
8741 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8742 intern ("coding-category-emacs-mule"));
8743 /* Followings are NOT target of code detection. */
8744 ASET (Vcoding_category_table
, coding_category_raw_text
,
8745 intern ("coding-category-raw-text"));
8746 ASET (Vcoding_category_table
, coding_category_undecided
,
8747 intern ("coding-category-undecided"));
8749 defsubr (&Scoding_system_p
);
8750 defsubr (&Sread_coding_system
);
8751 defsubr (&Sread_non_nil_coding_system
);
8752 defsubr (&Scheck_coding_system
);
8753 defsubr (&Sdetect_coding_region
);
8754 defsubr (&Sdetect_coding_string
);
8755 defsubr (&Sfind_coding_systems_region_internal
);
8756 defsubr (&Sunencodable_char_position
);
8757 defsubr (&Scheck_coding_systems_region
);
8758 defsubr (&Sdecode_coding_region
);
8759 defsubr (&Sencode_coding_region
);
8760 defsubr (&Sdecode_coding_string
);
8761 defsubr (&Sencode_coding_string
);
8762 defsubr (&Sdecode_sjis_char
);
8763 defsubr (&Sencode_sjis_char
);
8764 defsubr (&Sdecode_big5_char
);
8765 defsubr (&Sencode_big5_char
);
8766 defsubr (&Sset_terminal_coding_system_internal
);
8767 defsubr (&Sset_safe_terminal_coding_system_internal
);
8768 defsubr (&Sterminal_coding_system
);
8769 defsubr (&Sset_keyboard_coding_system_internal
);
8770 defsubr (&Skeyboard_coding_system
);
8771 defsubr (&Sfind_operation_coding_system
);
8772 defsubr (&Sset_coding_system_priority
);
8773 defsubr (&Sdefine_coding_system_internal
);
8774 defsubr (&Sdefine_coding_system_alias
);
8775 defsubr (&Scoding_system_base
);
8776 defsubr (&Scoding_system_plist
);
8777 defsubr (&Scoding_system_aliases
);
8778 defsubr (&Scoding_system_eol_type
);
8779 defsubr (&Scoding_system_priority_list
);
8781 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8782 doc
: /* List of coding systems.
8784 Do not alter the value of this variable manually. This variable should be
8785 updated by the functions `define-coding-system' and
8786 `define-coding-system-alias'. */);
8787 Vcoding_system_list
= Qnil
;
8789 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8790 doc
: /* Alist of coding system names.
8791 Each element is a one-element list containing a coding system name.
8792 This variable is given to `completing-read' as TABLE argument.
8794 Do not alter the value of this variable manually. This variable should be
8795 updated by the functions `define-coding-system' and
8796 `define-coding-system-alias'. */);
8797 Vcoding_system_alist
= Qnil
;
8799 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8800 doc
: /* List of coding-categories (symbols) ordered by priority.
8802 On detecting a coding system, Emacs tries code detection algorithms
8803 associated with each coding-category one by one in this order. When
8804 one algorithm agrees with a byte sequence of source text, the coding
8805 system bound to the corresponding coding-category is selected. */);
8809 Vcoding_category_list
= Qnil
;
8810 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8811 Vcoding_category_list
8812 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8813 Vcoding_category_list
);
8816 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8817 doc
: /* Specify the coding system for read operations.
8818 It is useful to bind this variable with `let', but do not set it globally.
8819 If the value is a coding system, it is used for decoding on read operation.
8820 If not, an appropriate element is used from one of the coding system alists:
8821 There are three such tables, `file-coding-system-alist',
8822 `process-coding-system-alist', and `network-coding-system-alist'. */);
8823 Vcoding_system_for_read
= Qnil
;
8825 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8826 doc
: /* Specify the coding system for write operations.
8827 Programs bind this variable with `let', but you should not set it globally.
8828 If the value is a coding system, it is used for encoding of output,
8829 when writing it to a file and when sending it to a subprocess.
8831 If this does not specify a coding system, an appropriate element
8832 is used from one of the coding system alists:
8833 There are three such tables, `file-coding-system-alist',
8834 `process-coding-system-alist', and `network-coding-system-alist'.
8835 For output to files, if the above procedure does not specify a coding system,
8836 the value of `buffer-file-coding-system' is used. */);
8837 Vcoding_system_for_write
= Qnil
;
8839 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8841 Coding system used in the latest file or process I/O. */);
8842 Vlast_coding_system_used
= Qnil
;
8844 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8846 *Non-nil means always inhibit code conversion of end-of-line format.
8847 See info node `Coding Systems' and info node `Text and Binary' concerning
8848 such conversion. */);
8849 inhibit_eol_conversion
= 0;
8851 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8853 Non-nil means process buffer inherits coding system of process output.
8854 Bind it to t if the process output is to be treated as if it were a file
8855 read from some filesystem. */);
8856 inherit_process_coding_system
= 0;
8858 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8860 Alist to decide a coding system to use for a file I/O operation.
8861 The format is ((PATTERN . VAL) ...),
8862 where PATTERN is a regular expression matching a file name,
8863 VAL is a coding system, a cons of coding systems, or a function symbol.
8864 If VAL is a coding system, it is used for both decoding and encoding
8866 If VAL is a cons of coding systems, the car part is used for decoding,
8867 and the cdr part is used for encoding.
8868 If VAL is a function symbol, the function must return a coding system
8869 or a cons of coding systems which are used as above. The function gets
8870 the arguments with which `find-operation-coding-system' was called.
8872 See also the function `find-operation-coding-system'
8873 and the variable `auto-coding-alist'. */);
8874 Vfile_coding_system_alist
= Qnil
;
8876 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8878 Alist to decide a coding system to use for a process I/O operation.
8879 The format is ((PATTERN . VAL) ...),
8880 where PATTERN is a regular expression matching a program name,
8881 VAL is a coding system, a cons of coding systems, or a function symbol.
8882 If VAL is a coding system, it is used both for decoding what is received
8883 from the program and for encoding what is sent to the program.
8884 If VAL is a cons of coding systems, the car part is used for decoding,
8885 and the cdr part is used for encoding.
8886 If VAL is a function symbol, the function must return a coding system
8887 or a cons of coding systems which are used as above.
8889 See also the function `find-operation-coding-system'. */);
8890 Vprocess_coding_system_alist
= Qnil
;
8892 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8894 Alist to decide a coding system to use for a network I/O operation.
8895 The format is ((PATTERN . VAL) ...),
8896 where PATTERN is a regular expression matching a network service name
8897 or is a port number to connect to,
8898 VAL is a coding system, a cons of coding systems, or a function symbol.
8899 If VAL is a coding system, it is used both for decoding what is received
8900 from the network stream and for encoding what is sent to the network stream.
8901 If VAL is a cons of coding systems, the car part is used for decoding,
8902 and the cdr part is used for encoding.
8903 If VAL is a function symbol, the function must return a coding system
8904 or a cons of coding systems which are used as above.
8906 See also the function `find-operation-coding-system'. */);
8907 Vnetwork_coding_system_alist
= Qnil
;
8909 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8910 doc
: /* Coding system to use with system messages.
8911 Also used for decoding keyboard input on X Window system. */);
8912 Vlocale_coding_system
= Qnil
;
8914 /* The eol mnemonics are reset in startup.el system-dependently. */
8915 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8917 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8918 eol_mnemonic_unix
= build_string (":");
8920 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8922 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8923 eol_mnemonic_dos
= build_string ("\\");
8925 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8927 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8928 eol_mnemonic_mac
= build_string ("/");
8930 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8932 *String displayed in mode line when end-of-line format is not yet determined. */);
8933 eol_mnemonic_undecided
= build_string (":");
8935 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8937 *Non-nil enables character translation while encoding and decoding. */);
8938 Venable_character_translation
= Qt
;
8940 DEFVAR_LISP ("standard-translation-table-for-decode",
8941 &Vstandard_translation_table_for_decode
,
8942 doc
: /* Table for translating characters while decoding. */);
8943 Vstandard_translation_table_for_decode
= Qnil
;
8945 DEFVAR_LISP ("standard-translation-table-for-encode",
8946 &Vstandard_translation_table_for_encode
,
8947 doc
: /* Table for translating characters while encoding. */);
8948 Vstandard_translation_table_for_encode
= Qnil
;
8950 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8951 doc
: /* Alist of charsets vs revision numbers.
8952 While encoding, if a charset (car part of an element) is found,
8953 designate it with the escape sequence identifying revision (cdr part
8954 of the element). */);
8955 Vcharset_revision_table
= Qnil
;
8957 DEFVAR_LISP ("default-process-coding-system",
8958 &Vdefault_process_coding_system
,
8959 doc
: /* Cons of coding systems used for process I/O by default.
8960 The car part is used for decoding a process output,
8961 the cdr part is used for encoding a text to be sent to a process. */);
8962 Vdefault_process_coding_system
= Qnil
;
8964 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8966 Table of extra Latin codes in the range 128..159 (inclusive).
8967 This is a vector of length 256.
8968 If Nth element is non-nil, the existence of code N in a file
8969 \(or output of subprocess) doesn't prevent it from being detected as
8970 a coding system of ISO 2022 variant which has a flag
8971 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8972 or reading output of a subprocess.
8973 Only the 128th through 159th elements have a meaning. */);
8974 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8976 DEFVAR_LISP ("select-safe-coding-system-function",
8977 &Vselect_safe_coding_system_function
,
8979 Function to call to select safe coding system for encoding a text.
8981 If set, this function is called to force a user to select a proper
8982 coding system which can encode the text in the case that a default
8983 coding system used in each operation can't encode the text.
8985 The default value is `select-safe-coding-system' (which see). */);
8986 Vselect_safe_coding_system_function
= Qnil
;
8988 DEFVAR_BOOL ("coding-system-require-warning",
8989 &coding_system_require_warning
,
8990 doc
: /* Internal use only.
8991 If non-nil, on writing a file, `select-safe-coding-system-function' is
8992 called even if `coding-system-for-write' is non-nil. The command
8993 `universal-coding-system-argument' binds this variable to t temporarily. */);
8994 coding_system_require_warning
= 0;
8997 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8998 &inhibit_iso_escape_detection
,
9000 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9002 By default, on reading a file, Emacs tries to detect how the text is
9003 encoded. This code detection is sensitive to escape sequences. If
9004 the sequence is valid as ISO2022, the code is determined as one of
9005 the ISO2022 encodings, and the file is decoded by the corresponding
9006 coding system (e.g. `iso-2022-7bit').
9008 However, there may be a case that you want to read escape sequences in
9009 a file as is. In such a case, you can set this variable to non-nil.
9010 Then, as the code detection ignores any escape sequences, no file is
9011 detected as encoded in some ISO2022 encoding. The result is that all
9012 escape sequences become visible in a buffer.
9014 The default value is nil, and it is strongly recommended not to change
9015 it. That is because many Emacs Lisp source files that contain
9016 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9017 in Emacs's distribution, and they won't be decoded correctly on
9018 reading if you suppress escape sequence detection.
9020 The other way to read escape sequences in a file without decoding is
9021 to explicitly specify some coding system that doesn't use ISO2022's
9022 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
9023 inhibit_iso_escape_detection
= 0;
9025 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9026 doc
: /* Char table for translating self-inserting characters.
9027 This is applied to the result of input methods, not their input. See also
9028 `keyboard-translate-table'. */);
9029 Vtranslation_table_for_input
= Qnil
;
/* Define the `no-conversion' coding system.  ARGS is the argument vector
   handed to Fdefine_coding_system_internal; PLIST collects the same
   settings as keyword/value pairs and is stored, as a list, in
   ARGS[coding_arg_plist].  */
9032 Lisp_Object args
[coding_arg_max
];
9033 Lisp_Object plist
[16];
/* NOTE(review): the listing jumps from 9036 to 9039, so the body of this
   loop over all ARGS slots is not shown; presumably it clears each slot
   before the assignments below -- confirm against the full file.  */
9036 for (i
= 0; i
< coding_arg_max
; i
++)
/* Each keyword goes into an even PLIST slot and its value into the odd
   slot after it; the same statement also stores the value in the matching
   ARGS slot.  PLIST[13], the docstring, is the exception: it is placed in
   PLIST only.
   NOTE(review): listing lines 9053 and 9056 are missing, so the docstring
   literal assigned to PLIST[13] is incomplete as shown here.  */
9039 plist
[0] = intern (":name");
9040 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9041 plist
[2] = intern (":mnemonic");
9042 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9043 plist
[4] = intern (":coding-type");
9044 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9045 plist
[6] = intern (":ascii-compatible-p");
9046 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9047 plist
[8] = intern (":default-char");
9048 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9049 plist
[10] = intern (":for-unibyte");
9050 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9051 plist
[12] = intern (":docstring");
9052 plist
[13] = build_string ("Do no conversion.\n\
9054 When you visit a file with this coding, the file is read into a\n\
9055 unibyte buffer as is, thus each byte of a file is treated as a\n\
9057 plist
[14] = intern (":eol-type");
9058 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9059 args
[coding_arg_plist
] = Flist (16, plist
);
9060 Fdefine_coding_system_internal (coding_arg_max
, args
);
9063 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9064 setup_coding_system (Qno_conversion
, &terminal_coding
);
9065 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9070 for (i
= 0; i
< coding_category_max
; i
++)
9071 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9076 emacs_strerror (error_number
)
9081 synchronize_system_messages_locale ();
9082 str
= strerror (error_number
);
9084 if (! NILP (Vlocale_coding_system
))
9086 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9087 Vlocale_coding_system
,
9089 str
= (char *) SDATA (dec
);