1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion
61 from coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
/* Template of a detector.  It walks the bytes between CODING->source
   and CODING->source + CODING->src_bytes and records the outcome in
   DETECT_INFO: CATEGORY_MASK_XXX is OR-ed into DETECT_INFO->rejected
   when a byte does not conform, and the accumulated FOUND mask is
   OR-ed into DETECT_INFO->found when the source runs out.  The
   identifiers containing XXX are placeholders, not real names.  */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
/* NOTE(review): the negation below looks inverted -- detect_coding_utf_8
   in this file sets FOUND after it has seen a valid multi-byte
   sequence, i.e. when the input does suggest the category.  Confirm
   before copying this template.  */
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
#if 0
/* Template of a decoder; it is never compiled.

   Decode the bytes from CODING->source + CODING->consumed up to
   CODING->source + CODING->src_bytes into the integer array
   CODING->charbuf, which has room for CODING->charbuf_size elements.

   On exit CODING->consumed and CODING->consumed_char hold the number
   of source bytes processed, and CODING->charbuf_used holds the
   number of elements stored in CODING->charbuf.  */
static void
decode_coding_XXXX (coding)
     struct coding_system *coding;
{
  unsigned char *src = coding->source + coding->consumed;
  unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_size;
  int multibytep = coding->src_multibyte;
  int c;

  while (1)
    {
      src_base = src;
      /* Stop when CHARBUF is full.  The test must be `>=', as in
	 decode_coding_utf_8; `<' would leave the loop on the very
	 first iteration whenever there is still room.  */
      if (charbuf >= charbuf_end)
	/* No more room to produce a decoded character.  */
	break;
      /* ONE_MORE_BYTE jumps to no_more_source when SRC reaches
	 SRC_END.  */
      ONE_MORE_BYTE (c);
      /* Decode C (reading more bytes if necessary) and store the
	 resulting character in *CHARBUF++.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that the source area and the destination area
257 overlap, which means that we can produce encoded text only until it
258 reaches the head of the not-yet-encoded source text.
260 Below is a template of these functions. */
#if 0
/* Template of an encoder; it is never compiled
   (_MAX_BYTES_PRODUCED_IN_LOOP_ is a placeholder).

   Encode the CODING->charbuf_used characters stored in
   CODING->charbuf, writing bytes at CODING->destination +
   CODING->produced.  The loop stops early when fewer than
   _MAX_BYTES_PRODUCED_IN_LOOP_ bytes remain before the end of the
   destination (CODING->destination + CODING->dst_bytes).

   On exit CODING->produced_char is increased by the number of
   characters produced and CODING->produced is set to the number of
   bytes now stored in the destination.  The encoders in this file
   are declared to return int.  */
static int
encode_coding_XXX (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* CHARBUF is a plain `int *', so the end is computed from the
     pointer itself (it has no `charbuf' member to dereference).  */
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  int produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
#endif
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 int coding_system_require_warning
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
340 Lisp_Object Qutf_8_emacs
;
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read
;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write
;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used
;
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table
;
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion
;
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection
;
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system
;
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding
;
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding
;
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding
;
374 Lisp_Object Vfile_coding_system_alist
;
375 Lisp_Object Vprocess_coding_system_alist
;
376 Lisp_Object Vnetwork_coding_system_alist
;
378 Lisp_Object Vlocale_coding_system
;
382 /* Flag to tell if we look up translation table on character code
384 Lisp_Object Venable_character_translation
;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode
;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode
;
390 Lisp_Object Qtranslation_table
;
391 Lisp_Object Qtranslation_table_id
;
392 Lisp_Object Qtranslation_table_for_decode
;
393 Lisp_Object Qtranslation_table_for_encode
;
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table
;
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system
;
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input
;
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system
;
406 Lisp_Object Vbig5_coding_system
;
409 static int detect_coding_utf_8
P_ ((struct coding_system
*,
410 struct coding_detection_info
*info
));
411 static void decode_coding_utf_8
P_ ((struct coding_system
*));
412 static int encode_coding_utf_8
P_ ((struct coding_system
*));
414 static int detect_coding_utf_16
P_ ((struct coding_system
*,
415 struct coding_detection_info
*info
));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
420 struct coding_detection_info
*info
));
421 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
422 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
424 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
425 struct coding_detection_info
*info
));
426 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
429 static int detect_coding_sjis
P_ ((struct coding_system
*,
430 struct coding_detection_info
*info
));
431 static void decode_coding_sjis
P_ ((struct coding_system
*));
432 static int encode_coding_sjis
P_ ((struct coding_system
*));
434 static int detect_coding_big5
P_ ((struct coding_system
*,
435 struct coding_detection_info
*info
));
436 static void decode_coding_big5
P_ ((struct coding_system
*));
437 static int encode_coding_big5
P_ ((struct coding_system
*));
439 static int detect_coding_ccl
P_ ((struct coding_system
*,
440 struct coding_detection_info
*info
));
441 static void decode_coding_ccl
P_ ((struct coding_system
*));
442 static int encode_coding_ccl
P_ ((struct coding_system
*));
444 static void decode_coding_raw_text
P_ ((struct coding_system
*));
445 static int encode_coding_raw_text
P_ ((struct coding_system
*));
448 /* ISO2022 section */
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
487 /* All code (1-byte) of ISO2022 is classified into one of the
/* Classes into which a single ISO2022 byte is sorted.  The control
   classes that name one specific byte correspond to the ISO_CODE_XXX
   macros defined above.  */
489 enum iso_code_class_type
491 ISO_control_0
, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
499 ISO_control_1
, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
511 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
518 /* If set, reset graphic planes and registers at end-of-line to the
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
545 /* If set, assume designation states are reset at beginning of line on
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
549 /* If set, designation sequence should be placed at beginning of line
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
553 /* If set, do not encode unsafe characters on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
556 /* If set, extra latin codes (128..159) are accepted as a valid code
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
594 /* Index for each coding category in `coding_categories' */
598 coding_category_iso_7
,
599 coding_category_iso_7_tight
,
600 coding_category_iso_8_1
,
601 coding_category_iso_8_2
,
602 coding_category_iso_7_else
,
603 coding_category_iso_8_else
,
604 coding_category_utf_8
,
605 coding_category_utf_16_auto
,
606 coding_category_utf_16_be
,
607 coding_category_utf_16_le
,
608 coding_category_utf_16_be_nosig
,
609 coding_category_utf_16_le_nosig
,
610 coding_category_charset
,
611 coding_category_sjis
,
612 coding_category_big5
,
614 coding_category_emacs_mule
,
615 /* All above are targets of code detection. */
616 coding_category_raw_text
,
617 coding_category_undecided
,
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
641 /* This value is returned if detect_coding_mask () finds nothing other
642 than ASCII characters. */
643 #define CATEGORY_MASK_ANY \
644 (CATEGORY_MASK_ISO_7 \
645 | CATEGORY_MASK_ISO_7_TIGHT \
646 | CATEGORY_MASK_ISO_8_1 \
647 | CATEGORY_MASK_ISO_8_2 \
648 | CATEGORY_MASK_ISO_7_ELSE \
649 | CATEGORY_MASK_ISO_8_ELSE \
650 | CATEGORY_MASK_UTF_8 \
651 | CATEGORY_MASK_UTF_16_BE \
652 | CATEGORY_MASK_UTF_16_LE \
653 | CATEGORY_MASK_UTF_16_BE_NOSIG \
654 | CATEGORY_MASK_UTF_16_LE_NOSIG \
655 | CATEGORY_MASK_CHARSET \
656 | CATEGORY_MASK_SJIS \
657 | CATEGORY_MASK_BIG5 \
658 | CATEGORY_MASK_CCL \
659 | CATEGORY_MASK_EMACS_MULE)
662 #define CATEGORY_MASK_ISO_7BIT \
663 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
665 #define CATEGORY_MASK_ISO_8BIT \
666 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
668 #define CATEGORY_MASK_ISO_ELSE \
669 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
671 #define CATEGORY_MASK_ISO_ESCAPE \
672 (CATEGORY_MASK_ISO_7 \
673 | CATEGORY_MASK_ISO_7_TIGHT \
674 | CATEGORY_MASK_ISO_7_ELSE \
675 | CATEGORY_MASK_ISO_8_ELSE)
677 #define CATEGORY_MASK_ISO \
678 ( CATEGORY_MASK_ISO_7BIT \
679 | CATEGORY_MASK_ISO_8BIT \
680 | CATEGORY_MASK_ISO_ELSE)
682 #define CATEGORY_MASK_UTF_16 \
683 (CATEGORY_MASK_UTF_16_BE \
684 | CATEGORY_MASK_UTF_16_LE \
685 | CATEGORY_MASK_UTF_16_BE_NOSIG \
686 | CATEGORY_MASK_UTF_16_LE_NOSIG)
689 /* List of symbols `coding-category-xxx' ordered by priority. This
690 variable is exposed to Emacs Lisp. */
691 static Lisp_Object Vcoding_category_list
;
693 /* Table of coding categories (Lisp symbols). This variable is for
695 static Lisp_Object Vcoding_category_table
;
697 /* Table of coding-categories ordered by priority. */
698 static enum coding_category coding_priorities
[coding_category_max
];
700 /* Nth element is a coding context for the coding system bound to the
701 Nth coding category. */
702 static struct coding_system coding_categories
[coding_category_max
];
704 /*** Commonly used macros and functions ***/
707 #define min(a, b) ((a) < (b) ? (a) : (b))
710 #define max(a, b) ((a) > (b) ? (a) : (b))
713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
715 attrs = CODING_ID_ATTRS (coding->id); \
716 eol_type = CODING_ID_EOL_TYPE (coding->id); \
717 if (VECTORP (eol_type)) \
719 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
723 /* Safely get one byte from the source text pointed by SRC which ends
724 at SRC_END, and set C to that byte. If there are not enough bytes
725 in the source, it jumps to `no_more_source'. The caller
726 should declare and set these variables appropriately in advance:
727 src, src_end, multibytep
730 #define ONE_MORE_BYTE(c) \
732 if (src == src_end) \
734 if (src_base < src) \
735 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
736 goto no_more_source; \
739 if (multibytep && (c & 0x80)) \
741 if ((c & 0xFE) != 0xC0) \
742 error ("Undecodable char found"); \
743 c = ((c & 1) << 6) | *src++; \
749 #define ONE_MORE_BYTE_NO_CHECK(c) \
752 if (multibytep && (c & 0x80)) \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
762 /* Store a byte C in the place pointed by DST and increment DST to the
763 next free point, and increment PRODUCED_CHARS. The caller should
764 assure that C is 0..127, and declare and set the variable `dst'
765 appropriately in advance.
769 #define EMIT_ONE_ASCII_BYTE(c) \
776 /* Like EMIT_ONE_ASCII_BYTE but store two bytes: C1 and C2. */
778 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 produced_chars += 2; \
781 *dst++ = (c1), *dst++ = (c2); \
785 /* Store a byte C in the place pointed by DST and increment DST to the
786 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
787 nonzero, store in an appropriate multibyte form. The caller should
788 declare and set the variables `dst' and `multibytep' appropriately
791 #define EMIT_ONE_BYTE(c) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
808 #define EMIT_TWO_BYTES(c1, c2) \
810 produced_chars += 2; \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
832 #define EMIT_THREE_BYTES(c1, c2, c3) \
834 EMIT_ONE_BYTE (c1); \
835 EMIT_TWO_BYTES (c2, c3); \
839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
841 EMIT_TWO_BYTES (c1, c2); \
842 EMIT_TWO_BYTES (c3, c4); \
846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
848 charset_map_loaded = 0; \
849 c = DECODE_CHAR (charset, code); \
850 if (charset_map_loaded) \
852 const unsigned char *orig = coding->source; \
855 coding_set_source (coding); \
856 offset = coding->source - orig; \
858 src_base += offset; \
864 #define ASSURE_DESTINATION(bytes) \
866 if (dst + (bytes) >= dst_end) \
868 int more_bytes = charbuf_end - charbuf + (bytes); \
870 dst = alloc_destination (coding, more_bytes, dst); \
871 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte.  A buffer source and a Lisp string source
   are handled here; for anything else CODING->source is left
   untouched.  */
878 coding_set_source (coding
)
879 struct coding_system
*coding
;
881 if (BUFFERP (coding
->src_object
))
883 struct buffer
*buf
= XBUFFER (coding
->src_object
);
/* A negative src_pos selects addressing relative to the end of the
   buffer's gap; otherwise src_pos_byte is resolved as an ordinary
   byte position in BUF.  */
885 if (coding
->src_pos
< 0)
886 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
888 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
890 else if (STRINGP (coding
->src_object
))
/* String source: the string's data plus the byte offset.  */
892 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
895 /* Otherwise, the source is C string and is never relocated
896 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when
   CODING->dst_object is a buffer.  For any other destination nothing
   is changed.  */
901 coding_set_destination (coding
)
902 struct coding_system
*coding
;
904 if (BUFFERP (coding
->dst_object
))
/* NOTE(review): this tests src_pos, not dst_pos.  Presumably a
   negative src_pos marks the case where the source sits at the end
   of the current buffer's gap, which would explain why the
   unconsumed source bytes (src_bytes - consumed) are subtracted from
   the room available below -- confirm against the callers.  */
906 if (coding
->src_pos
< 0)
908 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
909 coding
->dst_bytes
= (GAP_END_ADDR
910 - (coding
->src_bytes
- coding
->consumed
)
911 - coding
->destination
);
915 /* We are sure that coding->dst_pos_byte is before the gap
/* Here the destination buffer is addressed explicitly through
   dst_object; the room extends up to the end of that buffer's gap.  */
917 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
918 + coding
->dst_pos_byte
- 1);
919 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
920 - coding
->destination
);
924 /* Otherwise, the destination is C string and is never relocated
925 automatically. Thus we don't have to update anything. */
/* Enlarge the memory block at CODING->destination by BYTES bytes
   with xrealloc, and add BYTES to CODING->dst_bytes.  The block may
   move, so CODING->destination is overwritten with the value
   returned by xrealloc.  */
931 coding_alloc_by_realloc (coding
, bytes
)
932 struct coding_system
*coding
;
935 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
936 coding
->dst_bytes
+ bytes
);
937 coding
->dst_bytes
+= bytes
;
/* Make room for BYTES more bytes when the destination is a buffer.
   Two cases are distinguished: source and destination are the same
   buffer, or the destination is some other buffer.
   NOTE(review): the statements that actually enlarge the gap are not
   visible in this excerpt; only the bookkeeping around them is.  */
941 coding_alloc_by_making_gap (coding
, bytes
)
942 struct coding_system
*coding
;
945 if (BUFFERP (coding
->dst_object
)
946 && EQ (coding
->src_object
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed.  */
948 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily shrink the gap by ADD and extend ZV, Z, ZV_BYTE and
   Z_BYTE by the same amount ...  */
950 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* ... then undo exactly that adjustment.  */
952 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
/* Destination is a different buffer: remember the current buffer,
   switch to the destination buffer, and switch back afterwards.  */
956 Lisp_Object this_buffer
;
958 this_buffer
= Fcurrent_buffer ();
959 set_buffer_internal (XBUFFER (coding
->dst_object
));
961 set_buffer_internal (XBUFFER (this_buffer
));
/* Enlarge the destination of CODING by NBYTES bytes.  DST points
   into the current destination; because the destination may be
   relocated, DST is saved as an offset from CODING->destination and
   rebuilt from the new CODING->destination afterwards.
   CODING->result is reset to CODING_RESULT_SUCCESS.
   NOTE(review): the return statement is not visible in this excerpt;
   presumably the adjusted DST is returned -- confirm.  */
966 static unsigned char *
967 alloc_destination (coding
, nbytes
, dst
)
968 struct coding_system
*coding
;
972 EMACS_INT offset
= dst
- coding
->destination
;
/* Buffer destinations grow by making a gap; every other destination
   grows by realloc.  */
974 if (BUFFERP (coding
->dst_object
))
975 coding_alloc_by_making_gap (coding
, nbytes
);
977 coding_alloc_by_realloc (coding
, nbytes
);
978 coding
->result
= CODING_RESULT_SUCCESS
;
/* Refresh CODING->destination / CODING->dst_bytes, then translate
   DST to the possibly moved destination.  */
979 coding_set_destination (coding
);
980 dst
= coding
->destination
+ offset
;
984 /** Macros for annotations. */
986 /* Maximum length of annotation data (sum of annotations for
987 composition and charset). */
988 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
990 /* An annotation data is stored in the array coding->charbuf in this
992 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
993 LENGTH is the number of elements in the annotation.
994 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
995 FROM and TO specify the range of text annotated. They are relative
996 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
998 The format of the following elements depend on ANNOTATION_MASK.
1000 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1002 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1003 METHOD is one of enum composition_method.
1004 Optional COMPOSITION-COMPONENTS are characters and composition
1007 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1010 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1012 *(buf)++ = -(len); \
1013 *(buf)++ = (mask); \
1014 *(buf)++ = (from); \
1016 coding->annotated = 1; \
1019 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1021 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1026 #define ADD_CHARSET_DATA(buf, from, to, id) \
1028 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1033 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1040 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1041 Check if a text is encoded in UTF-8. If it is, return 1, else
1044 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1045 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1046 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1047 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1048 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1049 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* Detector for the UTF-8 category.  CATEGORY_MASK_UTF_8 is always
   OR-ed into DETECT_INFO->checked.  The source is then scanned: a
   lead byte is classified with the UTF_8_n_OCTET_LEADING_P macros
   and each following byte must satisfy UTF_8_EXTRA_OCTET_P.
   Depending on the outcome CATEGORY_MASK_UTF_8 is OR-ed into
   DETECT_INFO->rejected, or the accumulated FOUND mask is OR-ed into
   DETECT_INFO->found.  */
1052 detect_coding_utf_8 (coding
, detect_info
)
1053 struct coding_system
*coding
;
1054 struct coding_detection_info
*detect_info
;
1056 const unsigned char *src
= coding
->source
, *src_base
= src
;
1057 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1058 int multibytep
= coding
->src_multibyte
;
1059 int consumed_chars
= 0;
1063 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1064 /* A coding system of this category is always ASCII compatible. */
/* Start scanning CODING->head_ascii bytes into the source
   (presumably the length of the leading all-ASCII run -- confirm).  */
1065 src
+= coding
->head_ascii
;
1069 int c
, c1
, c2
, c3
, c4
;
/* C is the lead byte; C1..C4 are the bytes that follow it.  Each
   step below first requires one more continuation byte and then
   accepts the sequence if C is the lead byte of that length.  */
1073 if (UTF_8_1_OCTET_P (c
))
1077 if (! UTF_8_EXTRA_OCTET_P (c1
))
1079 if (UTF_8_2_OCTET_LEADING_P (c
))
1081 found
= CATEGORY_MASK_UTF_8
;
1085 if (! UTF_8_EXTRA_OCTET_P (c2
))
1087 if (UTF_8_3_OCTET_LEADING_P (c
))
1089 found
= CATEGORY_MASK_UTF_8
;
1093 if (! UTF_8_EXTRA_OCTET_P (c3
))
1095 if (UTF_8_4_OCTET_LEADING_P (c
))
1097 found
= CATEGORY_MASK_UTF_8
;
1101 if (! UTF_8_EXTRA_OCTET_P (c4
))
1103 if (UTF_8_5_OCTET_LEADING_P (c
))
1105 found
= CATEGORY_MASK_UTF_8
;
/* Reached when the bytes do not form a valid sequence.  */
1110 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
/* A sequence still incomplete in the last block is also a
   rejection.  */
1114 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1116 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1119 detect_info
->found
|= found
;
1125 decode_coding_utf_8 (coding
)
1126 struct coding_system
*coding
;
1128 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1129 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1130 const unsigned char *src_base
;
1131 int *charbuf
= coding
->charbuf
;
1132 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1133 int consumed_chars
= 0, consumed_chars_base
;
1134 int multibytep
= coding
->src_multibyte
;
1135 Lisp_Object attr
, eol_type
, charset_list
;
1137 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1141 int c
, c1
, c2
, c3
, c4
, c5
;
1144 consumed_chars_base
= consumed_chars
;
1146 if (charbuf
>= charbuf_end
)
1150 if (UTF_8_1_OCTET_P(c1
))
1155 if (EQ (eol_type
, Qdos
))
1159 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1160 goto no_more_source
;
1165 else if (EQ (eol_type
, Qmac
))
1172 if (! UTF_8_EXTRA_OCTET_P (c2
))
1174 if (UTF_8_2_OCTET_LEADING_P (c1
))
1176 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1177 /* Reject overlong sequences here and below. Encoders
1178 producing them are incorrect, they can be misleading,
1179 and they mess up read/write invariance. */
1186 if (! UTF_8_EXTRA_OCTET_P (c3
))
1188 if (UTF_8_3_OCTET_LEADING_P (c1
))
1190 c
= (((c1
& 0xF) << 12)
1191 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1193 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1199 if (! UTF_8_EXTRA_OCTET_P (c4
))
1201 if (UTF_8_4_OCTET_LEADING_P (c1
))
1203 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1204 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1211 if (! UTF_8_EXTRA_OCTET_P (c5
))
1213 if (UTF_8_5_OCTET_LEADING_P (c1
))
1215 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1216 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1218 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1233 consumed_chars
= consumed_chars_base
;
1235 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1240 coding
->consumed_char
+= consumed_chars_base
;
1241 coding
->consumed
= src_base
- coding
->source
;
1242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1247 encode_coding_utf_8 (coding
)
1248 struct coding_system
*coding
;
1250 int multibytep
= coding
->dst_multibyte
;
1251 int *charbuf
= coding
->charbuf
;
1252 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1253 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1254 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1255 int produced_chars
= 0;
1260 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1262 while (charbuf
< charbuf_end
)
1264 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1266 ASSURE_DESTINATION (safe_room
);
1268 if (CHAR_BYTE8_P (c
))
1270 c
= CHAR_TO_BYTE8 (c
);
1275 CHAR_STRING_ADVANCE (c
, pend
);
1276 for (p
= str
; p
< pend
; p
++)
1283 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1285 while (charbuf
< charbuf_end
)
1287 ASSURE_DESTINATION (safe_room
);
1289 dst
+= CHAR_STRING (c
, dst
);
1293 coding
->result
= CODING_RESULT_SUCCESS
;
1294 coding
->produced_char
+= produced_chars
;
1295 coding
->produced
= dst
- coding
->destination
;
1300 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1301 Check if a text is encoded in one of UTF-16 based coding systems.
1302 If it is, return 1, else return 0. */
1304 #define UTF_16_HIGH_SURROGATE_P(val) \
1305 (((val) & 0xFC00) == 0xD800)
1307 #define UTF_16_LOW_SURROGATE_P(val) \
1308 (((val) & 0xFC00) == 0xDC00)
1310 #define UTF_16_INVALID_P(val) \
1311 (((val) == 0xFFFE) \
1312 || ((val) == 0xFFFF) \
1313 || UTF_16_LOW_SURROGATE_P (val))
1317 detect_coding_utf_16 (coding
, detect_info
)
1318 struct coding_system
*coding
;
1319 struct coding_detection_info
*detect_info
;
1321 const unsigned char *src
= coding
->source
, *src_base
= src
;
1322 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1323 int multibytep
= coding
->src_multibyte
;
1324 int consumed_chars
= 0;
1327 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1329 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1330 && (coding
->src_bytes
& 1))
1332 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1338 if ((c1
== 0xFF) && (c2
== 0xFE))
1340 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1341 | CATEGORY_MASK_UTF_16_AUTO
);
1342 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE
;
1344 else if ((c1
== 0xFE) && (c2
== 0xFF))
1346 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1347 | CATEGORY_MASK_UTF_16_AUTO
);
1348 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE
;
1355 decode_coding_utf_16 (coding
)
1356 struct coding_system
*coding
;
1358 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1359 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1360 const unsigned char *src_base
;
1361 int *charbuf
= coding
->charbuf
;
1362 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1363 int consumed_chars
= 0, consumed_chars_base
;
1364 int multibytep
= coding
->src_multibyte
;
1365 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1366 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1367 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1368 Lisp_Object attr
, eol_type
, charset_list
;
1370 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1372 if (bom
== utf_16_with_bom
)
1381 if (endian
== utf_16_big_endian
1382 ? c
!= 0xFEFF : c
!= 0xFFFE)
1384 /* The first two bytes are not BOM. Treat them as bytes
1385 for a normal character. */
1389 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1391 else if (bom
== utf_16_detect_bom
)
1393 /* We have already tried to detect BOM and failed in
1395 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1403 consumed_chars_base
= consumed_chars
;
1405 if (charbuf
+ 2 >= charbuf_end
)
1410 c
= (endian
== utf_16_big_endian
1411 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1414 if (! UTF_16_LOW_SURROGATE_P (c
))
1416 if (endian
== utf_16_big_endian
)
1417 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1419 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1423 if (UTF_16_HIGH_SURROGATE_P (c
))
1424 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1430 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1431 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1437 if (UTF_16_HIGH_SURROGATE_P (c
))
1438 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1445 coding
->consumed_char
+= consumed_chars_base
;
1446 coding
->consumed
= src_base
- coding
->source
;
1447 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1451 encode_coding_utf_16 (coding
)
1452 struct coding_system
*coding
;
1454 int multibytep
= coding
->dst_multibyte
;
1455 int *charbuf
= coding
->charbuf
;
1456 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1457 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1458 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1460 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1461 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1462 int produced_chars
= 0;
1463 Lisp_Object attrs
, eol_type
, charset_list
;
1466 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1468 if (bom
!= utf_16_without_bom
)
1470 ASSURE_DESTINATION (safe_room
);
1472 EMIT_TWO_BYTES (0xFE, 0xFF);
1474 EMIT_TWO_BYTES (0xFF, 0xFE);
1475 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1478 while (charbuf
< charbuf_end
)
1480 ASSURE_DESTINATION (safe_room
);
1482 if (c
>= MAX_UNICODE_CHAR
)
1483 c
= coding
->default_char
;
1488 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1490 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1497 c1
= (c
>> 10) + 0xD800;
1498 c2
= (c
& 0x3FF) + 0xDC00;
1500 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1502 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1505 coding
->result
= CODING_RESULT_SUCCESS
;
1506 coding
->produced
= dst
- coding
->destination
;
1507 coding
->produced_char
+= produced_chars
;
1512 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1514 /* Emacs' internal format for representation of multiple character
1515 sets is a kind of multi-byte encoding, i.e. characters are
1516 represented by variable-length sequences of one-byte codes.
1518 ASCII characters and control characters (e.g. `tab', `newline') are
1519 represented by one-byte sequences which are their ASCII codes, in
1520 the range 0x00 through 0x7F.
1522 8-bit characters of the range 0x80..0x9F are represented by
1523 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1526 8-bit characters of the range 0xA0..0xFF are represented by
1527 one-byte sequences which are their 8-bit code.
1529 The other characters are represented by a sequence of `base
1530 leading-code', optional `extended leading-code', and one or two
1531 `position-code's. The length of the sequence is determined by the
1532 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1533 whereas extended leading-code and position-code take the range 0xA0
1534 through 0xFF. See `charset.h' for more details about leading-code
1537 --- CODE RANGE of Emacs' internal format ---
1541 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1542 eight-bit-graphic 0xA0..0xFF
1543 ELSE 0x81..0x9D + [0xA0..0xFF]+
1544 ---------------------------------------------
1546 As this is the internal character representation, the format is
1547 usually not used externally (i.e. in a file or in data sent to a
1548 process). But, it is possible to have a text externally in this
1549 format (i.e. by encoding by the coding system `emacs-mule').
1551 In that case, a sequence of one-byte codes has a slightly different
1554 At first, all characters in eight-bit-control are represented by
1555 one-byte sequences which are their 8-bit code.
1557 Next, character composition data are represented by the byte
1558 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1560 METHOD is 0xF2 plus one of the composition methods (enum
1561 composition_method),
1563 BYTES is 0xA0 plus the byte length of this composition data,
1565 CHARS is 0xA0 plus the number of characters composed by this
1568 COMPONENTs are characters in multibyte form or composition
1569 rules encoded as two bytes of ASCII codes.
1571 In addition, for backward compatibility, the following formats are
1572 also recognized as composition data on decoding.
1575 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1578 MSEQ is a multibyte form, but in one of these special formats:
1579 ASCII: 0xA0 ASCII_CODE+0x80,
1580 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1581 RULE is a one byte code of the range 0xA0..0xF0 that
1582 represents a composition rule.
1585 char emacs_mule_bytes
[256];
1588 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1589 struct coding_system
*coding
;
1591 int *nbytes
, *nchars
, *id
;
1593 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1594 const unsigned char *src_base
= src
;
1595 int multibytep
= coding
->src_multibyte
;
1596 struct charset
*charset
;
1599 int consumed_chars
= 0;
1602 switch (emacs_mule_bytes
[c
])
1605 if (! (charset
= emacs_mule_charset
[c
]))
1612 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1613 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1616 if (! (charset
= emacs_mule_charset
[c
]))
1623 if (! (charset
= emacs_mule_charset
[c
]))
1626 code
= (c
& 0x7F) << 8;
1634 if (! (charset
= emacs_mule_charset
[c
]))
1637 code
= (c
& 0x7F) << 8;
1644 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1645 ? charset_ascii
: charset_eight_bit
);
1651 c
= DECODE_CHAR (charset
, code
);
1654 *nbytes
= src
- src_base
;
1655 *nchars
= consumed_chars
;
1668 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1669 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1673 detect_coding_emacs_mule (coding
, detect_info
)
1674 struct coding_system
*coding
;
1675 struct coding_detection_info
*detect_info
;
1677 const unsigned char *src
= coding
->source
, *src_base
= src
;
1678 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1679 int multibytep
= coding
->src_multibyte
;
1680 int consumed_chars
= 0;
1685 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1686 /* A coding system of this category is always ASCII compatible. */
1687 src
+= coding
->head_ascii
;
1697 /* Perhaps the start of a composite character. We simply skip
1698 it because analyzing it is too heavy for detecting. But,
1699 at least, we check that the composite character
1700 consists of more than 4 bytes. */
1701 const unsigned char *src_base
;
1711 if (src
- src_base
<= 4)
1713 found
= CATEGORY_MASK_EMACS_MULE
;
1721 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1726 const unsigned char *src_base
= src
- 1;
1733 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1735 found
= CATEGORY_MASK_EMACS_MULE
;
1738 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1742 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1744 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1747 detect_info
->found
|= found
;
1752 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1754 /* Decode a character represented as a component of composition
1755 sequence of Emacs 20/21 style at SRC. Set C to that character and
1756 update SRC to the head of next character (or an encoded composition
1757 rule). If SRC doesn't point to a composition component, set C to -1.
1758 If SRC points to an invalid byte sequence, global exit by a return
1761 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1765 int nbytes, nchars; \
1767 if (src == src_end) \
1769 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1774 goto invalid_code; \
1778 consumed_chars += nchars; \
1783 /* Decode a composition rule represented as a component of composition
1784 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1785 and increment BUF. If SRC points to an invalid byte sequence, set C
1788 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1790 int c, gref, nref; \
1792 if (src >= src_end) \
1793 goto invalid_code; \
1794 ONE_MORE_BYTE_NO_CHECK (c); \
1796 if (c < 0 || c >= 81) \
1797 goto invalid_code; \
1799 gref = c / 9, nref = c % 9; \
1800 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1804 /* Decode a composition rule represented as a component of composition
1805 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1806 and increment BUF. If SRC points to an invalid byte sequence, set C
1809 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1813 if (src + 1>= src_end) \
1814 goto invalid_code; \
1815 ONE_MORE_BYTE_NO_CHECK (gref); \
1817 ONE_MORE_BYTE_NO_CHECK (nref); \
1819 if (gref < 0 || gref >= 81 \
1820 || nref < 0 || nref >= 81) \
1821 goto invalid_code; \
1822 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1826 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1828 /* Emacs 21 style format. The first three bytes at SRC are \
1829 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1830 the byte length of this composition information, CHARS is the \
1831 number of characters composed by this composition. */ \
1832 enum composition_method method = c - 0xF2; \
1833 int *charbuf_base = charbuf; \
1835 int consumed_chars_limit; \
1836 int nbytes, nchars; \
1838 ONE_MORE_BYTE (c); \
1839 nbytes = c - 0xA0; \
1841 goto invalid_code; \
1842 ONE_MORE_BYTE (c); \
1843 nchars = c - 0xA0; \
1844 from = coding->produced + char_offset; \
1845 to = from + nchars; \
1846 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1847 consumed_chars_limit = consumed_chars_base + nbytes; \
1848 if (method != COMPOSITION_RELATIVE) \
1851 while (consumed_chars < consumed_chars_limit) \
1853 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1854 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1859 if (consumed_chars < consumed_chars_limit) \
1860 goto invalid_code; \
1861 charbuf_base[0] -= i; \
1866 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1868 /* Emacs 20 style format for relative composition. */ \
1869 /* Store multibyte form of characters to be composed. */ \
1870 enum composition_method method = COMPOSITION_RELATIVE; \
1871 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1872 int *buf = components; \
1877 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1878 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1879 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1881 goto invalid_code; \
1882 from = coding->produced_char + char_offset; \
1884 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1885 for (j = 0; j < i; j++) \
1886 *charbuf++ = components[j]; \
1890 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1892 /* Emacs 20 style format for rule-base composition. */ \
1893 /* Store multibyte form of characters to be composed. */ \
1894 enum composition_method method = COMPOSITION_WITH_RULE; \
1895 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1896 int *buf = components; \
1900 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1901 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1903 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1904 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1906 if (i < 1 || (buf - components) % 2 == 0) \
1907 goto invalid_code; \
1908 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1909 goto no_more_source; \
1910 from = coding->produced_char + char_offset; \
1912 ADD_COMPOSITION_DATA (buf, from, to, method); \
1913 for (j = 0; j < i; j++) \
1914 *charbuf++ = components[j]; \
1915 for (j = 0; j < i; j += 2) \
1916 *charbuf++ = components[j]; \
1921 decode_coding_emacs_mule (coding
)
1922 struct coding_system
*coding
;
1924 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1925 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1926 const unsigned char *src_base
;
1927 int *charbuf
= coding
->charbuf
;
1928 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1929 int consumed_chars
= 0, consumed_chars_base
;
1930 int multibytep
= coding
->src_multibyte
;
1931 Lisp_Object attrs
, eol_type
, charset_list
;
1932 int char_offset
= coding
->produced_char
;
1933 int last_offset
= char_offset
;
1934 int last_id
= charset_ascii
;
1936 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1943 consumed_chars_base
= consumed_chars
;
1945 if (charbuf
>= charbuf_end
)
1954 if (EQ (eol_type
, Qdos
))
1958 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1959 goto no_more_source
;
1964 else if (EQ (eol_type
, Qmac
))
1973 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1974 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1975 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1977 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1979 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1983 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1989 consumed_chars
= consumed_chars_base
;
1990 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
1999 if (last_id
!= charset_ascii
)
2000 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2002 last_offset
= char_offset
;
2006 consumed_chars
+= nchars
;
2013 consumed_chars
= consumed_chars_base
;
2015 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2021 if (last_id
!= charset_ascii
)
2022 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2023 coding
->consumed_char
+= consumed_chars_base
;
2024 coding
->consumed
= src_base
- coding
->source
;
2025 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2029 #define EMACS_MULE_LEADING_CODES(id, codes) \
2032 codes[0] = id, codes[1] = 0; \
2033 else if (id < 0xE0) \
2034 codes[0] = 0x9A, codes[1] = id; \
2035 else if (id < 0xF0) \
2036 codes[0] = 0x9B, codes[1] = id; \
2037 else if (id < 0xF5) \
2038 codes[0] = 0x9C, codes[1] = id; \
2040 codes[0] = 0x9D, codes[1] = id; \
2045 encode_coding_emacs_mule (coding
)
2046 struct coding_system
*coding
;
2048 int multibytep
= coding
->dst_multibyte
;
2049 int *charbuf
= coding
->charbuf
;
2050 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2051 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2052 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2054 int produced_chars
= 0;
2055 Lisp_Object attrs
, eol_type
, charset_list
;
2057 int preferred_charset_id
= -1;
2059 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2060 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2062 CODING_ATTR_CHARSET_LIST (attrs
)
2063 = charset_list
= Vemacs_mule_charset_list
;
2066 while (charbuf
< charbuf_end
)
2068 ASSURE_DESTINATION (safe_room
);
2073 /* Handle an annotation. */
2076 case CODING_ANNOTATE_COMPOSITION_MASK
:
2077 /* Not yet implemented. */
2079 case CODING_ANNOTATE_CHARSET_MASK
:
2080 preferred_charset_id
= charbuf
[3];
2081 if (preferred_charset_id
>= 0
2082 && NILP (Fmemq (make_number (preferred_charset_id
),
2084 preferred_charset_id
= -1;
2093 if (ASCII_CHAR_P (c
))
2094 EMIT_ONE_ASCII_BYTE (c
);
2095 else if (CHAR_BYTE8_P (c
))
2097 c
= CHAR_TO_BYTE8 (c
);
2102 struct charset
*charset
;
2106 unsigned char leading_codes
[2];
2108 if (preferred_charset_id
>= 0)
2110 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2111 if (! CHAR_CHARSET_P (c
, charset
))
2112 charset
= char_charset (c
, charset_list
, NULL
);
2115 charset
= char_charset (c
, charset_list
, &code
);
2118 c
= coding
->default_char
;
2119 if (ASCII_CHAR_P (c
))
2121 EMIT_ONE_ASCII_BYTE (c
);
2124 charset
= char_charset (c
, charset_list
, &code
);
2126 dimension
= CHARSET_DIMENSION (charset
);
2127 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2128 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2129 EMIT_ONE_BYTE (leading_codes
[0]);
2130 if (leading_codes
[1])
2131 EMIT_ONE_BYTE (leading_codes
[1]);
2133 EMIT_ONE_BYTE (code
| 0x80);
2137 EMIT_ONE_BYTE (code
>> 8);
2138 EMIT_ONE_BYTE (code
& 0xFF);
2142 coding
->result
= CODING_RESULT_SUCCESS
;
2143 coding
->produced_char
+= produced_chars
;
2144 coding
->produced
= dst
- coding
->destination
;
2149 /*** 7. ISO2022 handlers ***/
2151 /* The following note describes the coding system ISO2022 briefly.
2152 Since the intention of this note is to help understand the
2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
2154 SIMPLIFIED. For thorough understanding, please refer to the
2155 original document of ISO2022. This is equivalent to the standard
2156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2158 ISO2022 provides many mechanisms to encode several character sets
2159 in 7-bit and 8-bit environments. For 7-bit environments, all text
2160 is encoded using bytes less than 128. This may make the encoded
2161 text a little bit longer, but the text passes more easily through
2162 several types of gateway, some of which strip off the MSB (Most
2165 There are two kinds of character sets: control character sets and
2166 graphic character sets. The former contain control characters such
2167 as `newline' and `escape' to provide control functions (control
2168 functions are also provided by escape sequences). The latter
2169 contain graphic characters such as 'A' and '-'. Emacs recognizes
2170 two control character sets and many graphic character sets.
2172 Graphic character sets are classified into one of the following
2173 four classes, according to the number of bytes (DIMENSION) and
2174 number of characters in one dimension (CHARS) of the set:
2175 - DIMENSION1_CHARS94
2176 - DIMENSION1_CHARS96
2177 - DIMENSION2_CHARS94
2178 - DIMENSION2_CHARS96
2180 In addition, each character set is assigned an identification tag,
2181 unique for each set, called the "final character" (denoted as <F>
2182 hereafter). The <F> of each character set is decided by ECMA(*)
2183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2184 (0x30..0x3F are for private use only).
2186 Note (*): ECMA = European Computer Manufacturers Association
2188 Here are examples of graphic character sets [NAME(<F>)]:
2189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192 o DIMENSION2_CHARS96 -- none for the moment
2194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2195 C0 [0x00..0x1F] -- control character plane 0
2196 GL [0x20..0x7F] -- graphic character plane 0
2197 C1 [0x80..0x9F] -- control character plane 1
2198 GR [0xA0..0xFF] -- graphic character plane 1
2200 A control character set is directly designated and invoked to C0 or
2201 C1 by an escape sequence. The most common case is that:
2202 - ISO646's control character set is designated/invoked to C0, and
2203 - ISO6429's control character set is designated/invoked to C1,
2204 and usually these designations/invocations are omitted in encoded
2205 text. In a 7-bit environment, only C0 can be used, and a control
2206 character for C1 is encoded by an appropriate escape sequence to
2207 fit into the environment. All control characters for C1 are
2208 defined to have corresponding escape sequences.
2210 A graphic character set is at first designated to one of four
2211 graphic registers (G0 through G3), then these graphic registers are
2212 invoked to GL or GR. These designations and invocations can be
2213 done independently. The most common case is that G0 is invoked to
2214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2215 these invocations and designations are omitted in encoded text.
2216 In a 7-bit environment, only GL can be used.
2218 When a graphic character set of CHARS94 is invoked to GL, codes
2219 0x20 and 0x7F of the GL area work as control characters SPACE and
2220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2223 There are two ways of invocation: locking-shift and single-shift.
2224 With locking-shift, the invocation lasts until the next different
2225 invocation, whereas with single-shift, the invocation affects the
2226 following character only and doesn't affect the locking-shift
2227 state. Invocations are done by the following control characters or
2230 ----------------------------------------------------------------------
2231 abbrev function cntrl escape seq description
2232 ----------------------------------------------------------------------
2233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2239 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2242 ----------------------------------------------------------------------
2243 (*) These are not used by any known coding system.
2245 Control characters for these functions are defined by macros
2246 ISO_CODE_XXX in `coding.h'.
2248 Designations are done by the following escape sequences:
2249 ----------------------------------------------------------------------
2250 escape sequence description
2251 ----------------------------------------------------------------------
2252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2268 ----------------------------------------------------------------------
2270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2271 of dimension 1, chars 94, and final character <F>, etc...
2273 Note (*): Although these designations are not allowed in ISO2022,
2274 Emacs accepts them on decoding, and produces them on encoding
2275 CHARS96 character sets in a coding system which is characterized as
2276 7-bit environment, non-locking-shift, and non-single-shift.
2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2279 '(' must be omitted. We refer to this as "short-form" hereafter.
2281 Now you may notice that there are a lot of ways of encoding the
2282 same multilingual text in ISO2022. Actually, there exist many
2283 coding systems such as Compound Text (used in X11's inter-client
2284 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2286 localized platforms), and all of these are variants of ISO2022.
2288 In addition to the above, Emacs handles two more kinds of escape
2289 sequences: ISO6429's direction specification and Emacs' private
2290 sequence for specifying character composition.
2292 ISO6429's direction specification takes the following form:
2293 o CSI ']' -- end of the current direction
2294 o CSI '0' ']' -- end of the current direction
2295 o CSI '1' ']' -- start of left-to-right text
2296 o CSI '2' ']' -- start of right-to-left text
2297 The control character CSI (0x9B: control sequence introducer) is
2298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2300 Character composition specification takes the following form:
2301 o ESC '0' -- start relative composition
2302 o ESC '1' -- end composition
2303 o ESC '2' -- start rule-base composition (*)
2304 o ESC '3' -- start relative composition with alternate chars (**)
2305 o ESC '4' -- start rule-base composition with alternate chars (**)
2306 Since these are not standard escape sequences of any ISO standard,
2307 the use of them with these meanings is restricted to Emacs only.
2309 (*) This form is used only in Emacs 20.7 and older versions,
2310 but newer versions can safely decode it.
2311 (**) This form is used only in Emacs 21.1 and newer versions,
2312 and older versions can't decode it.
2314 Here's a list of example usages of these composition escape
2315 sequences (categorized by `enum composition_method').
2317 COMPOSITION_RELATIVE:
2318 ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE:
2320 ESC 2 CHAR [ RULE CHAR ] ESC 1
2321 COMPOSITION_WITH_ALTCHARS:
2322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2323 COMPOSITION_WITH_RULE_ALTCHARS:
2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2326 enum iso_code_class_type iso_code_class
[256];
2328 #define SAFE_CHARSET_P(coding, id) \
2329 ((id) <= (coding)->max_charset_id \
2330 && (coding)->safe_charsets[id] >= 0)
2333 #define SHIFT_OUT_OK(category) \
2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2337 setup_iso_safe_charsets (attrs
)
2340 Lisp_Object charset_list
, safe_charsets
;
2341 Lisp_Object request
;
2342 Lisp_Object reg_usage
;
2345 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2348 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2349 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2350 && ! EQ (charset_list
, Viso_2022_charset_list
))
2352 CODING_ATTR_CHARSET_LIST (attrs
)
2353 = charset_list
= Viso_2022_charset_list
;
2354 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2357 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2361 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2363 int id
= XINT (XCAR (tail
));
2364 if (max_charset_id
< id
)
2365 max_charset_id
= id
;
2368 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2370 request
= AREF (attrs
, coding_attr_iso_request
);
2371 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2372 reg94
= XINT (XCAR (reg_usage
));
2373 reg96
= XINT (XCDR (reg_usage
));
2375 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2379 struct charset
*charset
;
2382 charset
= CHARSET_FROM_ID (XINT (id
));
2383 reg
= Fcdr (Fassq (id
, request
));
2385 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2386 else if (charset
->iso_chars_96
)
2389 SSET (safe_charsets
, XINT (id
), reg96
);
2394 SSET (safe_charsets
, XINT (id
), reg94
);
2397 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2402 Check if a text is encoded in one of ISO-2022 based coding systems.
2403 If it is, return 1, else return 0. */
2406 detect_coding_iso_2022 (coding
, detect_info
)
2407 struct coding_system
*coding
;
2408 struct coding_detection_info
*detect_info
;
2410 const unsigned char *src
= coding
->source
, *src_base
= src
;
2411 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2412 int multibytep
= coding
->src_multibyte
;
2413 int single_shifting
= 0;
2416 int consumed_chars
= 0;
2421 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2423 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2425 struct coding_system
*this = &(coding_categories
[i
]);
2426 Lisp_Object attrs
, val
;
2428 attrs
= CODING_ID_ATTRS (this->id
);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2431 setup_iso_safe_charsets (attrs
);
2432 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2433 this->max_charset_id
= SCHARS (val
) - 1;
2434 this->safe_charsets
= (char *) SDATA (val
);
2437 /* A coding system of this category is always ASCII compatible. */
2438 src
+= coding
->head_ascii
;
2440 while (rejected
!= CATEGORY_MASK_ISO
)
2446 if (inhibit_iso_escape_detection
)
2448 single_shifting
= 0;
2450 if (c
>= '(' && c
<= '/')
2452 /* Designation sequence for a charset of dimension 1. */
2454 if (c1
< ' ' || c1
>= 0x80
2455 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2456 /* Invalid designation sequence. Just ignore. */
2461 /* Designation sequence for a charset of dimension 2. */
2463 if (c
>= '@' && c
<= 'B')
2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2465 id
= iso_charset_table
[1][0][c
];
2466 else if (c
>= '(' && c
<= '/')
2469 if (c1
< ' ' || c1
>= 0x80
2470 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2471 /* Invalid designation sequence. Just ignore. */
2475 /* Invalid designation sequence. Just ignore it. */
2478 else if (c
== 'N' || c
== 'O')
2480 /* ESC <Fe> for SS2 or SS3. */
2481 single_shifting
= 1;
2482 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2485 else if (c
>= '0' && c
<= '4')
2487 /* ESC <Fp> for start/end composition. */
2488 found
|= CATEGORY_MASK_ISO
;
2493 /* Invalid escape sequence. Just ignore it. */
2497 /* We found a valid designation sequence for CHARSET. */
2498 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2499 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2501 found
|= CATEGORY_MASK_ISO_7
;
2503 rejected
|= CATEGORY_MASK_ISO_7
;
2504 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2506 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2508 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2509 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2511 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2513 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2514 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2516 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2518 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2523 /* Locking shift out/in. */
2524 if (inhibit_iso_escape_detection
)
2526 single_shifting
= 0;
2527 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2528 found
|= CATEGORY_MASK_ISO_ELSE
;
2532 /* Control sequence introducer. */
2533 single_shifting
= 0;
2534 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2535 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2536 goto check_extra_latin
;
2542 if (inhibit_iso_escape_detection
)
2544 single_shifting
= 1;
2545 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2546 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2547 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2548 found
|= CATEGORY_MASK_ISO_8_1
;
2549 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2550 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2551 found
|= CATEGORY_MASK_ISO_8_2
;
2552 goto check_extra_latin
;
2557 single_shifting
= 0;
2562 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2563 found
|= CATEGORY_MASK_ISO_8_1
;
2564 /* Check the length of succeeding codes of the range
2565 0xA0..0xFF. If the byte length is even, we include
2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2567 only when we are not single shifting. */
2568 if (! single_shifting
2569 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2572 while (src
< src_end
)
2580 if (i
& 1 && src
< src_end
)
2581 rejected
|= CATEGORY_MASK_ISO_8_2
;
2583 found
|= CATEGORY_MASK_ISO_8_2
;
2588 single_shifting
= 0;
2589 if (! VECTORP (Vlatin_extra_code_table
)
2590 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2592 rejected
= CATEGORY_MASK_ISO
;
2595 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2596 & CODING_ISO_FLAG_LATIN_EXTRA
)
2597 found
|= CATEGORY_MASK_ISO_8_1
;
2599 rejected
|= CATEGORY_MASK_ISO_8_1
;
2600 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2601 & CODING_ISO_FLAG_LATIN_EXTRA
)
2602 found
|= CATEGORY_MASK_ISO_8_2
;
2604 rejected
|= CATEGORY_MASK_ISO_8_2
;
2607 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2611 detect_info
->rejected
|= rejected
;
2612 detect_info
->found
|= (found
& ~rejected
);
2617 /* Set designation state into CODING. */
2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2622 if (final < '0' || final >= 128 \
2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2624 || !SAFE_CHARSET_P (coding, id)) \
2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2627 goto invalid_code; \
2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
2630 if (id == charset_jisx0201_roman) \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2633 id = charset_ascii; \
2635 else if (id == charset_jisx0208_1978) \
2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2638 id = charset_jisx0208; \
2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
2641 /* If there was an invalid designation to REG previously, and this \
2642 designation is ASCII to REG, we should keep this designation \
2644 if (prev == -2 && id == charset_ascii) \
2645 goto invalid_code; \
2649 #define MAYBE_FINISH_COMPOSITION() \
2652 if (composition_state == COMPOSING_NO) \
2654 /* It is assured that we have enough room for producing \
2655 characters stored in the table `components'. */ \
2656 if (charbuf + component_idx > charbuf_end) \
2657 goto no_more_source; \
2658 composition_state = COMPOSING_NO; \
2659 if (method == COMPOSITION_RELATIVE \
2660 || method == COMPOSITION_WITH_ALTCHARS) \
2662 for (i = 0; i < component_idx; i++) \
2663 *charbuf++ = components[i]; \
2664 char_offset += component_idx; \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2682 #define DECODE_COMPOSITION_START(c1) \
2685 && composition_state == COMPOSING_COMPONENT_RULE) \
2687 component_len = component_idx; \
2688 composition_state = COMPOSING_CHAR; \
2692 const unsigned char *p; \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2700 if (p == src_end - 1) \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
2719 /* Handle composition end sequence ESC 1. */
2721 #define DECODE_COMPOSITION_END() \
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2727 int *saved_charbuf = charbuf; \
2728 int from = char_offset; \
2729 int to = from + nchars; \
2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2732 if (method != COMPOSITION_RELATIVE) \
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
2754 from SRC) and store one encoded composition rule in
2755 coding->cmp_data. */
2757 #define DECODE_COMPOSITION_RULE(c1) \
2760 if (c1 < 81) /* old format (before ver.21) */ \
2762 int gref = (c1) / 9; \
2763 int nref = (c1) % 9; \
2764 if (gref == 4) gref = 10; \
2765 if (nref == 4) nref = 10; \
2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2768 else if (c1 < 93) /* new format (after ver.21) */ \
2770 ONE_MORE_BYTE (c2); \
2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2781 decode_coding_iso_2022 (coding
)
2782 struct coding_system
*coding
;
2784 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2785 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2786 const unsigned char *src_base
;
2787 int *charbuf
= coding
->charbuf
;
2789 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2790 int consumed_chars
= 0, consumed_chars_base
;
2791 int multibytep
= coding
->src_multibyte
;
2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2793 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2794 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2795 struct charset
*charset
;
2797 /* For handling composition sequence. */
2798 #define COMPOSING_NO 0
2799 #define COMPOSING_CHAR 1
2800 #define COMPOSING_RULE 2
2801 #define COMPOSING_COMPONENT_CHAR 3
2802 #define COMPOSING_COMPONENT_RULE 4
2804 int composition_state
= COMPOSING_NO
;
2805 enum composition_method method
;
2806 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2809 Lisp_Object attrs
, eol_type
, charset_list
;
2810 int char_offset
= coding
->produced_char
;
2811 int last_offset
= char_offset
;
2812 int last_id
= charset_ascii
;
2814 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2815 setup_iso_safe_charsets (attrs
);
2822 consumed_chars_base
= consumed_chars
;
2824 if (charbuf
>= charbuf_end
)
2829 /* We produce at most one character. */
2830 switch (iso_code_class
[c1
])
2832 case ISO_0x20_or_0x7F
:
2833 if (composition_state
!= COMPOSING_NO
)
2835 if (composition_state
== COMPOSING_RULE
2836 || composition_state
== COMPOSING_COMPONENT_RULE
)
2838 DECODE_COMPOSITION_RULE (c1
);
2839 components
[component_idx
++] = c1
;
2840 composition_state
--;
2844 if (charset_id_0
< 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2846 /* This is SPACE or DEL. */
2847 charset
= CHARSET_FROM_ID (charset_ascii
);
2849 charset
= CHARSET_FROM_ID (charset_id_0
);
2852 case ISO_graphic_plane_0
:
2853 if (composition_state
!= COMPOSING_NO
)
2855 if (composition_state
== COMPOSING_RULE
2856 || composition_state
== COMPOSING_COMPONENT_RULE
)
2858 DECODE_COMPOSITION_RULE (c1
);
2859 components
[component_idx
++] = c1
;
2860 composition_state
--;
2864 charset
= CHARSET_FROM_ID (charset_id_0
);
2867 case ISO_0xA0_or_0xFF
:
2868 if (charset_id_1
< 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2870 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2872 /* This is a graphic character, we fall down ... */
2874 case ISO_graphic_plane_1
:
2875 if (charset_id_1
< 0)
2877 charset
= CHARSET_FROM_ID (charset_id_1
);
2880 case ISO_carriage_return
:
2883 if (EQ (eol_type
, Qdos
))
2887 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
2888 goto no_more_source
;
2893 else if (EQ (eol_type
, Qmac
))
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset
= CHARSET_FROM_ID (charset_ascii
);
2904 MAYBE_FINISH_COMPOSITION ();
2908 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2909 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2911 CODING_ISO_INVOCATION (coding
, 0) = 1;
2912 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2916 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2918 CODING_ISO_INVOCATION (coding
, 0) = 0;
2919 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2922 case ISO_single_shift_2_7
:
2923 case ISO_single_shift_2
:
2924 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2926 /* SS2 is handled as an escape sequence of ESC 'N' */
2928 goto label_escape_sequence
;
2930 case ISO_single_shift_3
:
2931 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2933 /* SS3 is handled as an escape sequence of ESC 'O' */
2935 goto label_escape_sequence
;
2937 case ISO_control_sequence_introducer
:
2938 /* CSI is handled as an escape sequence of ESC '[' ... */
2940 goto label_escape_sequence
;
2944 label_escape_sequence
:
2945 /* Escape sequences handled here are invocation,
2946 designation, direction specification, and character
2947 composition specification. */
2950 case '&': /* revision of following character set */
2952 if (!(c1
>= '@' && c1
<= '~'))
2955 if (c1
!= ISO_CODE_ESC
)
2958 goto label_escape_sequence
;
2960 case '$': /* designation of 2-byte character set */
2961 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2964 if (c1
>= '@' && c1
<= 'B')
2965 { /* designation of JISX0208.1978, GB2312.1980,
2967 DECODE_DESIGNATION (0, 2, 0, c1
);
2969 else if (c1
>= 0x28 && c1
<= 0x2B)
2970 { /* designation of DIMENSION2_CHARS94 character set */
2972 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2974 else if (c1
>= 0x2C && c1
<= 0x2F)
2975 { /* designation of DIMENSION2_CHARS96 character set */
2977 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2981 /* We must update these variables now. */
2982 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2983 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2986 case 'n': /* invocation of locking-shift-2 */
2987 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2988 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2990 CODING_ISO_INVOCATION (coding
, 0) = 2;
2991 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2994 case 'o': /* invocation of locking-shift-3 */
2995 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2996 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2998 CODING_ISO_INVOCATION (coding
, 0) = 3;
2999 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3002 case 'N': /* invocation of single-shift-2 */
3003 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3004 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3006 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3008 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3012 case 'O': /* invocation of single-shift-3 */
3013 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3014 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3016 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3018 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3022 case '0': case '2': case '3': case '4': /* start composition */
3023 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3025 DECODE_COMPOSITION_START (c1
);
3028 case '1': /* end composition */
3029 if (composition_state
== COMPOSING_NO
)
3031 DECODE_COMPOSITION_END ();
3034 case '[': /* specification of direction */
3035 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
3037 /* For the moment, nested direction is not supported.
3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3039 left-to-right, and nonzero means right-to-left. */
3043 case ']': /* end of the current direction */
3044 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3046 case '0': /* end of the current direction */
3047 case '1': /* start of left-to-right direction */
3050 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3055 case '2': /* start of right-to-left direction */
3058 coding
->mode
|= CODING_MODE_DIRECTION
;
3072 /* CTEXT extended segment:
3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3079 ONE_MORE_BYTE (dim
);
3082 size
= ((M
- 128) * 128) + (L
- 128);
3083 if (charbuf
+ 8 + size
> charbuf_end
)
3085 *charbuf
++ = ISO_CODE_ESC
;
3089 *charbuf
++ = BYTE8_TO_CHAR (M
);
3090 *charbuf
++ = BYTE8_TO_CHAR (L
);
3094 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
3100 ESC % G --UTF-8-BYTES-- ESC % @
3101 We keep these bytes as is for the moment.
3102 They may be decoded by post-read-conversion. */
3105 if (p
+ 6 > charbuf_end
)
3107 *p
++ = ISO_CODE_ESC
;
3110 while (p
< charbuf_end
)
3113 if (c1
== ISO_CODE_ESC
3114 && src
+ 1 < src_end
3118 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3120 if (p
+ 3 > charbuf_end
)
3122 *p
++ = ISO_CODE_ESC
;
3133 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3135 if (c1
>= 0x28 && c1
<= 0x2B)
3136 { /* designation of DIMENSION1_CHARS94 character set */
3138 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3140 else if (c1
>= 0x2C && c1
<= 0x2F)
3141 { /* designation of DIMENSION1_CHARS96 character set */
3143 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3147 /* We must update these variables now. */
3148 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3149 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3154 if (charset
->id
!= charset_ascii
3155 && last_id
!= charset
->id
)
3157 if (last_id
!= charset_ascii
)
3158 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3159 last_id
= charset
->id
;
3160 last_offset
= char_offset
;
3163 /* Now we know CHARSET and 1st position code C1 of a character.
3164 Produce a decoded character while getting 2nd position code
3167 if (CHARSET_DIMENSION (charset
) > 1)
3170 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3171 /* C2 is not in a valid range. */
3173 c1
= (c1
<< 8) | (c2
& 0x7F);
3174 if (CHARSET_DIMENSION (charset
) > 2)
3177 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3178 /* C2 is not in a valid range. */
3180 c1
= (c1
<< 8) | (c2
& 0x7F);
3184 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base
< src
; src_base
++, char_offset
++)
3190 if (ASCII_BYTE_P (*src_base
))
3191 *charbuf
++ = *src_base
;
3193 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3196 else if (composition_state
== COMPOSING_NO
)
3203 components
[component_idx
++] = c
;
3204 if (method
== COMPOSITION_WITH_RULE
3205 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3207 composition_state
++;
3212 MAYBE_FINISH_COMPOSITION ();
3214 consumed_chars
= consumed_chars_base
;
3216 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3226 if (last_id
!= charset_ascii
)
3227 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3228 coding
->consumed_char
+= consumed_chars_base
;
3229 coding
->consumed
= src_base
- coding
->source
;
3230 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3234 /* ISO2022 encoding stuff. */
3237 It is not enough to say just "ISO2022" on encoding, we have to
3238 specify more details. In Emacs, each coding system of ISO2022
3239 variant has the following specifications:
3240 1. Initial designation to G0 thru G3.
3241 2. Allows short-form designation?
3242 3. ASCII should be designated to G0 before control characters?
3243 4. ASCII should be designated to G0 at end of line?
3244 5. 7-bit environment or 8-bit environment?
3245 6. Use locking-shift?
3246 7. Use Single-shift?
3247 And the following two are only for Japanese:
3248 8. Use ASCII in place of JIS0201-1976-Roman?
3249 9. Use JISX0208-1983 in place of JISX0208-1978?
3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
3257 '@', 'A', or 'B' and the coding system CODING allows, produce
3258 designation sequence of short-form. */
3260 #define ENCODE_DESIGNATION(charset, reg, coding) \
3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3263 char *intermediate_char_94 = "()*+"; \
3264 char *intermediate_char_96 = ",-./"; \
3265 int revision = -1; \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3269 revision = CHARSET_ISO_REVISION (charset); \
3271 if (revision >= 0) \
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3274 EMIT_ONE_BYTE ('@' + revision); \
3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3277 if (CHARSET_DIMENSION (charset) == 1) \
3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
3280 c = intermediate_char_94[reg]; \
3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
3287 EMIT_ONE_ASCII_BYTE ('$'); \
3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3292 || final_char < '@' || final_char > 'B') \
3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3298 EMIT_ONE_ASCII_BYTE (final_char); \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3304 /* The following two macros produce codes (control character or escape
3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
3308 #define ENCODE_SINGLE_SHIFT_2 \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3318 #define ENCODE_SINGLE_SHIFT_3 \
3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one emits
   the code and then records in CODING which graphic register is now
   invoked to graphic plane 0.  They expect `coding' and whatever the
   EMIT_* macros use to be in scope at the point of use.  */

/* Shift-in: emit SI and invoke graphic register 0.  */
#define ENCODE_SHIFT_IN				\
  do {						\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);		\
    CODING_ISO_INVOCATION (coding, 0) = 0;	\
  } while (0)


/* Shift-out: emit SO and invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT			\
  do {						\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);		\
    CODING_ISO_INVOCATION (coding, 0) = 1;	\
  } while (0)


/* Locking-shift-2: emit ESC 'n' and invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2			\
  do {						\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');	\
    CODING_ISO_INVOCATION (coding, 0) = 2;	\
  } while (0)


/* Locking-shift-3: emit ESC 'o' and invoke graphic register 3.  The
   final byte must be 'o'; ESC 'n' is locking-shift-2, and the decoder
   treats it as such, so emitting 'n' here would make the decoder
   invoke register 2 instead of register 3.  */
#define ENCODE_LOCKING_SHIFT_3			\
  do {						\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');	\
    CODING_ISO_INVOCATION (coding, 0) = 3;	\
  } while (0)
3360 /* Produce codes for a DIMENSION1 character whose character set is
3361 CHARSET and whose position-code is C1. Designation and invocation
3362 sequences are also produced in advance if necessary. */
3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3366 int id = CHARSET_ID (charset); \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3369 && id == charset_ascii) \
3371 id = charset_jisx0201_roman; \
3372 charset = CHARSET_FROM_ID (id); \
3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3380 EMIT_ONE_BYTE (c1 | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3391 EMIT_ONE_BYTE (c1 | 0x80); \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3399 dst = encode_invocation_designation (charset, coding, dst, \
3404 /* Produce codes for a DIMENSION2 character whose character set is
3405 CHARSET and whose position-codes are C1 and C2. Designation and
3406 invocation codes are also produced in advance if necessary. */
3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3410 int id = CHARSET_ID (charset); \
3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3413 && id == charset_jisx0208) \
3415 id = charset_jisx0208_1978; \
3416 charset = CHARSET_FROM_ID (id); \
3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3439 /* Since CHARSET is not yet invoked to any graphic planes, we \
3440 must invoke it, or, at first, designate it to some graphic \
3441 register. Then repeat the loop to actually produce the \
3443 dst = encode_invocation_designation (charset, coding, dst, \
3448 #define ENCODE_ISO_CHARACTER(charset, c) \
3450 int code = ENCODE_CHAR ((charset),(c)); \
3452 if (CHARSET_DIMENSION (charset) == 1) \
3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3459 /* Produce designation and invocation codes at a place pointed by DST
3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3464 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3465 struct charset
*charset
;
3466 struct coding_system
*coding
;
3470 int multibytep
= coding
->dst_multibyte
;
3471 int produced_chars
= *p_nchars
;
3472 int reg
; /* graphic register number */
3473 int id
= CHARSET_ID (charset
);
3475 /* At first, check designations. */
3476 for (reg
= 0; reg
< 4; reg
++)
3477 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3482 /* CHARSET is not yet designated to any graphic registers. */
3483 /* At first check the requested designation. */
3484 reg
= CODING_ISO_REQUEST (coding
, id
);
3486 /* Since CHARSET requests no special designation, designate it
3487 to graphic register 0. */
3490 ENCODE_DESIGNATION (charset
, reg
, coding
);
3493 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3494 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3496 /* Since the graphic register REG is not invoked to any graphic
3497 planes, invoke it to graphic plane 0. */
3500 case 0: /* graphic register 0 */
3504 case 1: /* graphic register 1 */
3508 case 2: /* graphic register 2 */
3509 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3510 ENCODE_SINGLE_SHIFT_2
;
3512 ENCODE_LOCKING_SHIFT_2
;
3515 case 3: /* graphic register 3 */
3516 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3517 ENCODE_SINGLE_SHIFT_3
;
3519 ENCODE_LOCKING_SHIFT_3
;
3524 *p_nchars
= produced_chars
;
/* The following three macros produce codes for indicating direction
   of text.  They expect `coding' and whatever the EMIT_* macros use
   to be in scope at the point of use.  */

/* Emit the control sequence introducer: the two bytes ESC '[' when
   CODING has the seven-bit flag set, the single byte CSI otherwise.
   CODING_ISO_FLAGS is a set of flag bits, so the seven-bit flag is
   tested with `&'; comparing the whole value with `==' would match
   only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)		\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)


/* Emit CSI '2' ']': start of right-to-left text.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)


/* Emit CSI '0' ']': end of the current direction.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3553 /* Produce codes for designation and invocation to reset the graphic
3554 planes and registers to initial state. */
3555 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3558 struct charset *charset; \
3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3562 for (reg = 0; reg < 4; reg++) \
3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3564 && (CODING_ISO_DESIGNATION (coding, reg) \
3565 != CODING_ISO_INITIAL (coding, reg))) \
3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568 ENCODE_DESIGNATION (charset, reg, coding); \
3573 /* Produce designation sequences of charsets in the line started from
3574 SRC to a place pointed by DST, and return updated DST.
3576 If the current block ends before any end-of-line, we may fail to
3577 find all the necessary designations. */
3579 static unsigned char *
3580 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3581 struct coding_system
*coding
;
3582 int *charbuf
, *charbuf_end
;
3585 struct charset
*charset
;
3586 /* Table of charsets to be designated to each graphic register. */
3588 int c
, found
= 0, reg
;
3589 int produced_chars
= 0;
3590 int multibytep
= coding
->dst_multibyte
;
3592 Lisp_Object charset_list
;
3594 attrs
= CODING_ID_ATTRS (coding
->id
);
3595 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3596 if (EQ (charset_list
, Qiso_2022
))
3597 charset_list
= Viso_2022_charset_list
;
3599 for (reg
= 0; reg
< 4; reg
++)
3609 charset
= char_charset (c
, charset_list
, NULL
);
3610 id
= CHARSET_ID (charset
);
3611 reg
= CODING_ISO_REQUEST (coding
, id
);
3612 if (reg
>= 0 && r
[reg
] < 0)
3621 for (reg
= 0; reg
< 4; reg
++)
3623 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
Encode the characters in CODING->charbuf into ISO-2022 bytes written
from CODING->destination + CODING->produced onward, then store the
outcome in CODING->result, CODING->produced_char and
CODING->produced. */
3633 encode_coding_iso_2022 (coding
)
3634 struct coding_system
*coding
;
3636 int multibytep
= coding
->dst_multibyte
;
3637 int *charbuf
= coding
->charbuf
;
3638 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3639 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3640 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3643 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644 && CODING_ISO_BOL (coding
));
3645 int produced_chars
= 0;
3646 Lisp_Object attrs
, eol_type
, charset_list
;
3647 int ascii_compatible
;
3649 int preferred_charset_id
= -1;
3651 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3652 setup_iso_safe_charsets (attrs
);
3653 /* Charset list may have been changed.
NOTE(review): the statement below ends with a stray line-continuation
backslash; it appears harmless but unintended -- confirm. */
3654 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3655 coding
->safe_charsets
= (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs
));
3657 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3659 while (charbuf
< charbuf_end
)
3661 ASSURE_DESTINATION (safe_room
);
3663 if (bol_designation
)
3665 unsigned char *dst_prev
= dst
;
3667 /* We have to produce designation sequences if any now. */
3668 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3669 bol_designation
= 0;
3670 /* We are sure that designation sequences are all ASCII bytes. */
3671 produced_chars
+= dst
- dst_prev
;
3678 /* Handle an annotation. */
3681 case CODING_ANNOTATE_COMPOSITION_MASK
:
3682 /* Not yet implemented. */
3684 case CODING_ANNOTATE_CHARSET_MASK
:
/* Take the preferred charset ID from the annotation; it is reset
to -1 when the Fmemq test below does not find it. */
3685 preferred_charset_id
= charbuf
[3];
3686 if (preferred_charset_id
>= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id
),
3689 preferred_charset_id
= -1;
3698 /* Now encode the character C. */
3699 if (c
< 0x20 || c
== 0x7F)
3702 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3704 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
/* Restore every graphic register to its initial designation. */
3710 for (i
= 0; i
< 4; i
++)
3711 CODING_ISO_DESIGNATION (coding
, i
)
3712 = CODING_ISO_INITIAL (coding
, i
);
3715 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3717 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c
);
3721 else if (ASCII_CHAR_P (c
))
3723 if (ascii_compatible
)
3724 EMIT_ONE_ASCII_BYTE (c
);
3727 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3728 ENCODE_ISO_CHARACTER (charset
, c
);
3731 else if (CHAR_BYTE8_P (c
))
3733 c
= CHAR_TO_BYTE8 (c
);
3738 struct charset
*charset
;
/* Try the preferred charset first; fall back to a lookup in
CHARSET_LIST when it does not contain C. */
3740 if (preferred_charset_id
>= 0)
3742 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3743 if (! CHAR_CHARSET_P (c
, charset
))
3744 charset
= char_charset (c
, charset_list
, NULL
);
3747 charset
= char_charset (c
, charset_list
, NULL
);
3750 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3752 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3753 charset
= CHARSET_FROM_ID (charset_ascii
);
3757 c
= coding
->default_char
;
3758 charset
= char_charset (c
, charset_list
, NULL
);
3761 ENCODE_ISO_CHARACTER (charset
, c
);
/* On the last block, reset the plane and register state when the
coding system has CODING_ISO_FLAG_RESET_AT_EOL set. */
3765 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3766 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3768 ASSURE_DESTINATION (safe_room
);
3769 ENCODE_RESET_PLANE_AND_REGISTER ();
/* Report the outcome back in CODING, including the pending
beginning-of-line designation state. */
3771 coding
->result
= CODING_RESULT_SUCCESS
;
3772 CODING_ISO_BOL (coding
) = bol_designation
;
3773 coding
->produced_char
+= produced_chars
;
3774 coding
->produced
= dst
- coding
->destination
;
3779 /*** 8. SJIS and BIG5 handlers ***/
3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3782 quite widely. So, for the moment, Emacs supports them in the bare
3783 C code. But, in the future, they may be supported only by CCL. */
3785 /* SJIS is a coding system encoding three character sets: ASCII, right
3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3787 as is. A character of charset katakana-jisx0201 is encoded by
3788 "position-code + 0x80". A character of charset japanese-jisx0208
3789 is encoded in two bytes, but the two position-codes are divided and
3790 shifted so that they fit in the ranges below.
3792 --- CODE RANGE of SJIS ---
3793 (character set) (range)
3795 KATAKANA-JISX0201 0xA0 .. 0xDF
3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3798 -------------------------------
3802 /* BIG5 is a coding system encoding two character sets: ASCII and
3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3804 character set and is encoded in two bytes.
3806 --- CODE RANGE of BIG5 ---
3807 (character set) (range)
3809 Big5 (1st byte) 0xA1 .. 0xFE
3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3811 --------------------------
3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816 Check if a text is encoded in SJIS. The outcome is recorded in
3817 DETECT_INFO: CATEGORY_MASK_SJIS is always added to `checked', and
is added to `rejected' or `found' depending on the bytes seen.
NOTE(review): the earlier wording said the function returns
CATEGORY_MASK_SJIS or 0; confirm the actual return values against
the return statements. */
3820 detect_coding_sjis (coding
, detect_info
)
3821 struct coding_system
*coding
;
3822 struct coding_detection_info
*detect_info
;
3824 const unsigned char *src
= coding
->source
, *src_base
= src
;
3825 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3826 int multibytep
= coding
->src_multibyte
;
3827 int consumed_chars
= 0;
3832 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3833 /* A coding system of this category is always ASCII compatible. */
3834 src
+= coding
->head_ascii
;
/* First-byte range of a two-byte sequence. */
3843 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
/* Test for a second byte outside 0x40..0x7E and 0x80..0xFC. */
3846 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3848 found
= CATEGORY_MASK_SJIS
;
/* Single-byte range 0xA0..0xDF. */
3850 else if (c
>= 0xA0 && c
< 0xE0)
3851 found
= CATEGORY_MASK_SJIS
;
3855 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
/* An incomplete sequence at the end of the last block rejects SJIS. */
3859 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3861 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3864 detect_info
->found
|= found
;
3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869 Check if a text is encoded in BIG5. The outcome is recorded in
3870 DETECT_INFO: CATEGORY_MASK_BIG5 is always added to `checked', and
is added to `rejected' or `found' depending on the bytes seen.
NOTE(review): the earlier wording said the function returns
CATEGORY_MASK_BIG5 or 0; confirm the actual return values against
the return statements. */
3873 detect_coding_big5 (coding
, detect_info
)
3874 struct coding_system
*coding
;
3875 struct coding_detection_info
*detect_info
;
3877 const unsigned char *src
= coding
->source
, *src_base
= src
;
3878 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3879 int multibytep
= coding
->src_multibyte
;
3880 int consumed_chars
= 0;
3885 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3886 /* A coding system of this category is always ASCII compatible. */
3887 src
+= coding
->head_ascii
;
/* Test for a byte below 0x40 or within 0x7F..0xA0. */
3899 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3901 found
= CATEGORY_MASK_BIG5
;
3906 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
/* An incomplete sequence at the end of the last block rejects BIG5. */
3910 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3912 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3915 detect_info
->found
|= found
;
3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920 This function decodes SJIS text only (it takes no SJIS_P argument);
BIG5 text is handled by decode_coding_big5. Decoded characters go
to CODING->charbuf, and progress is recorded in
CODING->consumed_char, CODING->consumed and CODING->charbuf_used. */
3923 decode_coding_sjis (coding
)
3924 struct coding_system
*coding
;
3926 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3927 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3928 const unsigned char *src_base
;
3929 int *charbuf
= coding
->charbuf
;
3930 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3931 int consumed_chars
= 0, consumed_chars_base
;
3932 int multibytep
= coding
->src_multibyte
;
3933 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3934 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3935 int char_offset
= coding
->produced_char
;
3936 int last_offset
= char_offset
;
3937 int last_id
= charset_ascii
;
3939 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* VAL walks a list whose successive elements give the IDs of the
roman, kana and kanji charsets, in that order. */
3942 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3943 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3944 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3951 consumed_chars_base
= consumed_chars
;
3953 if (charbuf
>= charbuf_end
)
3960 if (EQ (eol_type
, Qdos
))
3964 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
3965 goto no_more_source
;
3970 else if (EQ (eol_type
, Qmac
))
3975 struct charset
*charset
;
3978 charset
= charset_roman
;
3983 if (c
< 0xA0 || c
>= 0xE0)
3985 /* SJIS -> JISX0208 */
3987 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3991 charset
= charset_kanji
;
3995 /* SJIS -> JISX0201-Kana */
3997 charset
= charset_kana
;
/* When the charset changes to a different non-ASCII one, close the
previous non-ASCII run with ADD_CHARSET_DATA and start a new run
at the current character offset. */
4002 if (charset
->id
!= charset_ascii
4003 && last_id
!= charset
->id
)
4005 if (last_id
!= charset_ascii
)
4006 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4007 last_id
= charset
->id
;
4008 last_offset
= char_offset
;
4010 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4018 consumed_chars
= consumed_chars_base
;
/* Store C as is when it is an ASCII byte, otherwise convert it with
BYTE8_TO_CHAR. */
4020 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Flush the pending charset run and report progress in CODING. */
4026 if (last_id
!= charset_ascii
)
4027 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4028 coding
->consumed_char
+= consumed_chars_base
;
4029 coding
->consumed
= src_base
- coding
->source
;
4030 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Decode BIG5 text from CODING->source (starting at CODING->consumed)
into CODING->charbuf, and record progress in CODING->consumed_char,
CODING->consumed and CODING->charbuf_used. See also the "GENERAL
NOTES on `decode_coding_XXX ()' functions". */
4034 decode_coding_big5 (coding
)
4035 struct coding_system
*coding
;
4037 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4038 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4039 const unsigned char *src_base
;
4040 int *charbuf
= coding
->charbuf
;
4041 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4042 int consumed_chars
= 0, consumed_chars_base
;
4043 int multibytep
= coding
->src_multibyte
;
4044 struct charset
*charset_roman
, *charset_big5
;
4045 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4046 int char_offset
= coding
->produced_char
;
4047 int last_offset
= char_offset
;
4048 int last_id
= charset_ascii
;
4050 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* VAL walks a list whose successive elements give the IDs of the
roman and big5 charsets, in that order. */
4052 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4053 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4060 consumed_chars_base
= consumed_chars
;
4062 if (charbuf
>= charbuf_end
)
4069 if (EQ (eol_type
, Qdos
))
4073 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4074 goto no_more_source
;
4079 else if (EQ (eol_type
, Qmac
))
4084 struct charset
*charset
;
4086 charset
= charset_roman
;
/* Test the first byte against the Big5 range 0xA1..0xFE. */
4090 if (c
< 0xA1 || c
> 0xFE)
/* Test the second byte against 0x40..0x7E and 0xA1..0xFE. */
4093 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4096 charset
= charset_big5
;
/* When the charset changes to a different non-ASCII one, close the
previous non-ASCII run with ADD_CHARSET_DATA and start a new run
at the current character offset. */
4098 if (charset
->id
!= charset_ascii
4099 && last_id
!= charset
->id
)
4101 if (last_id
!= charset_ascii
)
4102 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4103 last_id
= charset
->id
;
4104 last_offset
= char_offset
;
4106 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4115 consumed_chars
= consumed_chars_base
;
/* Store C as is when it is an ASCII byte, otherwise convert it with
BYTE8_TO_CHAR. */
4117 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Flush the pending charset run and report progress in CODING. */
4123 if (last_id
!= charset_ascii
)
4124 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4125 coding
->consumed_char
+= consumed_chars_base
;
4126 coding
->consumed
= src_base
- coding
->source
;
4127 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4131 This function can encode charsets `ascii', `katakana-jisx0201',
4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4133 are sure that all these charsets are registered as official charset
4134 (i.e. do not have extended leading-codes). Characters of other
4135 charsets are produced without any encoding. This function encodes
4136 SJIS text only (it takes no SJIS_P argument); BIG5 text is handled
by encode_coding_big5.
NOTE(review): the charset list above still names the big5 charsets,
and the visible code substitutes CODING->default_char (or
CODING_INHIBIT_CHARACTER_SUBSTITUTION) in some branch, so parts of
this paragraph may be stale -- confirm. */
4139 encode_coding_sjis (coding
)
4140 struct coding_system
*coding
;
4142 int multibytep
= coding
->dst_multibyte
;
4143 int *charbuf
= coding
->charbuf
;
4144 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4145 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4146 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4148 int produced_chars
= 0;
4149 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4150 int ascii_compatible
;
4151 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4154 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* VAL walks a list whose successive elements give the IDs of the
roman, kana and kanji charsets, in that order. */
4156 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4157 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4158 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4160 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4162 while (charbuf
< charbuf_end
)
4164 ASSURE_DESTINATION (safe_room
);
4166 /* Now encode the character C. */
4167 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4168 EMIT_ONE_ASCII_BYTE (c
);
4169 else if (CHAR_BYTE8_P (c
))
4171 c
= CHAR_TO_BYTE8 (c
);
4177 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4181 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4183 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4184 charset
= CHARSET_FROM_ID (charset_ascii
);
4188 c
= coding
->default_char
;
4189 charset
= char_charset (c
, charset_list
, &code
);
4192 if (code
== CHARSET_INVALID_CODE (charset
))
4194 if (charset
== charset_kanji
)
/* Split the code into its high and low bytes and emit both. */
4198 c1
= code
>> 8, c2
= code
& 0xFF;
4199 EMIT_TWO_BYTES (c1
, c2
);
/* A kana code is emitted as one byte with the high bit set. */
4201 else if (charset
== charset_kana
)
4202 EMIT_ONE_BYTE (code
| 0x80);
4204 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
/* Report the outcome back in CODING. */
4207 coding
->result
= CODING_RESULT_SUCCESS
;
4208 coding
->produced_char
+= produced_chars
;
4209 coding
->produced
= dst
- coding
->destination
;
/* Encode the characters in CODING->charbuf as BIG5 text written from
CODING->destination + CODING->produced onward, then store the
outcome in CODING->result, CODING->produced_char and
CODING->produced. See also the "GENERAL NOTES on
`encode_coding_XXX ()' functions". */
4214 encode_coding_big5 (coding
)
4215 struct coding_system
*coding
;
4217 int multibytep
= coding
->dst_multibyte
;
4218 int *charbuf
= coding
->charbuf
;
4219 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4220 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4221 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4223 int produced_chars
= 0;
4224 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4225 int ascii_compatible
;
4226 struct charset
*charset_roman
, *charset_big5
;
4229 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* VAL walks a list whose successive elements give the IDs of the
roman and big5 charsets, in that order. */
4231 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4232 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4233 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4235 while (charbuf
< charbuf_end
)
4237 ASSURE_DESTINATION (safe_room
);
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4241 EMIT_ONE_ASCII_BYTE (c
);
4242 else if (CHAR_BYTE8_P (c
))
4244 c
= CHAR_TO_BYTE8 (c
);
4250 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4254 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4256 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4257 charset
= CHARSET_FROM_ID (charset_ascii
);
4261 c
= coding
->default_char
;
4262 charset
= char_charset (c
, charset_list
, &code
);
4265 if (code
== CHARSET_INVALID_CODE (charset
))
4267 if (charset
== charset_big5
)
/* Split the code into its high and low bytes and emit both. */
4271 c1
= code
>> 8, c2
= code
& 0xFF;
4272 EMIT_TWO_BYTES (c1
, c2
);
4275 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
/* Report the outcome back in CODING. */
4278 coding
->result
= CODING_RESULT_SUCCESS
;
4279 coding
->produced_char
+= produced_chars
;
4280 coding
->produced
= dst
- coding
->destination
;
4285 /*** 10. CCL handlers ***/
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. The outcome is
4290 recorded in DETECT_INFO: CATEGORY_MASK_CCL is always added to
`checked', and is added to `rejected' or `found' depending on the
bytes seen.
NOTE(review): the earlier wording said the function returns
CATEGORY_MASK_CCL or 0; confirm the actual return values against
the return statements. */
4293 detect_coding_ccl (coding
, detect_info
)
4294 struct coding_system
*coding
;
4295 struct coding_detection_info
*detect_info
;
4297 const unsigned char *src
= coding
->source
, *src_base
= src
;
4298 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4299 int multibytep
= coding
->src_multibyte
;
4300 int consumed_chars
= 0;
4302 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4303 int head_ascii
= coding
->head_ascii
;
4306 detect_info
->checked
|= CATEGORY_MASK_CCL
;
/* From here on CODING refers to the entry for the CCL category in
coding_categories, not to the argument that was passed in. */
4308 coding
= &coding_categories
[coding_category_ccl
];
4309 attrs
= CODING_ID_ATTRS (coding
->id
);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* A byte whose entry in VALIDS is greater than 1 marks the category
as found. */
4319 if ((valids
[c
] > 1))
4320 found
= CATEGORY_MASK_CCL
;
4322 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4326 detect_info
->found
|= found
;
/* Decode text by running the CCL decoder program of CODING
(CODING_CCL_DECODER) over the source, at most 1024 elements at a
time, writing the result to CODING->charbuf. Progress is recorded
in CODING->consumed_char, CODING->consumed and
CODING->charbuf_used, and the CCL status is mapped to
CODING->result. */
4331 decode_coding_ccl (coding
)
4332 struct coding_system
*coding
;
4334 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4335 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4336 int *charbuf
= coding
->charbuf
;
4337 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4338 int consumed_chars
= 0;
4339 int multibytep
= coding
->src_multibyte
;
4340 struct ccl_program ccl
;
4341 int source_charbuf
[1024];
4342 int source_byteidx
[1024];
4343 Lisp_Object attrs
, eol_type
, charset_list
;
4345 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4346 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4348 while (src
< src_end
)
4350 const unsigned char *p
= src
;
4351 int *source
, *source_end
;
/* Fill SOURCE_CHARBUF with characters read by STRING_CHAR_ADVANCE,
remembering in SOURCE_BYTEIDX the byte offset of each one from
SRC. */
4355 while (i
< 1024 && p
< src_end
)
4357 source_byteidx
[i
] = p
- src
;
4358 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
/* Alternative fill: copy the source one byte per element. */
4361 while (i
< 1024 && p
< src_end
)
4362 source_charbuf
[i
++] = *p
++;
4364 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4367 source
= source_charbuf
;
4368 source_end
= source
+ i
;
4369 while (source
< source_end
)
4371 ccl_driver (&ccl
, source
, charbuf
,
4372 source_end
- source
, charbuf_end
- charbuf
,
4374 source
+= ccl
.consumed
;
4375 charbuf
+= ccl
.produced
;
4376 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
/* Advance SRC by the byte offset recorded for the first element
that was not consumed. */
4379 if (source
< source_end
)
4380 src
+= source_byteidx
[source
- source_charbuf
];
4383 consumed_chars
+= source
- source_charbuf
;
/* NOTE(review): ccl.status is compared with a CODING_RESULT_*
constant in the second test below, while every other test in this
function uses CCL_STAT_* values; this looks like a mix-up --
confirm. */
4385 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4386 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4392 case CCL_STAT_SUSPEND_BY_SRC
:
4393 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4395 case CCL_STAT_SUSPEND_BY_DST
:
4398 case CCL_STAT_INVALID_CMD
:
4399 coding
->result
= CODING_RESULT_INTERRUPT
;
4402 coding
->result
= CODING_RESULT_SUCCESS
;
/* Report progress back in CODING. */
4405 coding
->consumed_char
+= consumed_chars
;
4406 coding
->consumed
= src
- coding
->source
;
4407 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encode the characters in CODING->charbuf by running the CCL encoder
program of CODING (CODING_CCL_ENCODER). Each run produces at most
1024 elements in DESTINATION_CHARBUF, whose low 8 bits are then
written to the destination. The CCL status is mapped to
CODING->result, and CODING->produced_char and CODING->produced are
updated. */
4411 encode_coding_ccl (coding
)
4412 struct coding_system
*coding
;
4414 struct ccl_program ccl
;
4415 int multibytep
= coding
->dst_multibyte
;
4416 int *charbuf
= coding
->charbuf
;
4417 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4418 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4419 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4420 unsigned char *adjusted_dst_end
= dst_end
- 1;
4421 int destination_charbuf
[1024];
4422 int i
, produced_chars
= 0;
4423 Lisp_Object attrs
, eol_type
, charset_list
;
4425 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4426 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4428 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4429 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4431 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
/* DST_BYTES is the room left in the destination; the test against
1024 matches the size of DESTINATION_CHARBUF. */
4433 int dst_bytes
= dst_end
- dst
;
4434 if (dst_bytes
> 1024)
4437 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4438 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4439 charbuf
+= ccl
.consumed
;
/* Two ways of copying the produced elements out: through
EMIT_ONE_BYTE, or by storing the low byte directly and adding
ccl.produced to PRODUCED_CHARS. */
4441 for (i
= 0; i
< ccl
.produced
; i
++)
4442 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4445 for (i
= 0; i
< ccl
.produced
; i
++)
4446 *dst
++ = destination_charbuf
[i
] & 0xFF;
4447 produced_chars
+= ccl
.produced
;
4453 case CCL_STAT_SUSPEND_BY_SRC
:
4454 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4456 case CCL_STAT_SUSPEND_BY_DST
:
4457 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4460 case CCL_STAT_INVALID_CMD
:
4461 coding
->result
= CODING_RESULT_INTERRUPT
;
4464 coding
->result
= CODING_RESULT_SUCCESS
;
/* Report progress back in CODING. */
4468 coding
->produced_char
+= produced_chars
;
4469 coding
->produced
= dst
- coding
->destination
;
4475 /*** 10, 11. no-conversion handlers ***/
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
No bytes are converted here: the function sets
CODING->chars_at_source, zeroes CODING->consumed_char and
CODING->consumed, and reports CODING_RESULT_SUCCESS. */
4480 decode_coding_raw_text (coding
)
4481 struct coding_system
*coding
;
4483 coding
->chars_at_source
= 1;
4484 coding
->consumed_char
= 0;
4485 coding
->consumed
= 0;
4486 coding
->result
= CODING_RESULT_SUCCESS
;
/* Copy the characters in CODING->charbuf to the destination without
charset conversion, starting at CODING->destination +
CODING->produced. Separate paths handle the combinations of
multibyte/unibyte source and destination. The outcome is stored in
CODING->result, CODING->produced_char and CODING->produced. */
4490 encode_coding_raw_text (coding
)
4491 struct coding_system
*coding
;
4493 int multibytep
= coding
->dst_multibyte
;
4494 int *charbuf
= coding
->charbuf
;
4495 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4496 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4497 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4498 int produced_chars
= 0;
4503 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4505 if (coding
->src_multibyte
)
4506 while (charbuf
< charbuf_end
)
4508 ASSURE_DESTINATION (safe_room
);
4510 if (ASCII_CHAR_P (c
))
4511 EMIT_ONE_ASCII_BYTE (c
);
4512 else if (CHAR_BYTE8_P (c
))
4514 c
= CHAR_TO_BYTE8 (c
);
/* Expand C into its multibyte form in STR, then emit the bytes
one at a time. */
4519 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4521 CHAR_STRING_ADVANCE (c
, p1
);
4524 EMIT_ONE_BYTE (*p0
);
4530 while (charbuf
< charbuf_end
)
4532 ASSURE_DESTINATION (safe_room
);
4539 if (coding
->src_multibyte
)
4541 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4543 while (charbuf
< charbuf_end
)
4545 ASSURE_DESTINATION (safe_room
);
4547 if (ASCII_CHAR_P (c
))
4549 else if (CHAR_BYTE8_P (c
))
4550 *dst
++ = CHAR_TO_BYTE8 (c
);
4552 CHAR_STRING_ADVANCE (c
, dst
);
/* Plain copy: one destination byte per source element. */
4558 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4559 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4560 *dst
++ = *charbuf
++;
/* NOTE(review): DST was initialised to coding->destination +
coding->produced, but the count below subtracts
coding->destination + coding->dst_bytes, which is DST_END. That
yields a non-positive value; it looks like coding->produced was
intended -- confirm. */
4561 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4564 coding
->result
= CODING_RESULT_SUCCESS
;
4565 coding
->produced_char
+= produced_chars
;
4566 coding
->produced
= dst
- coding
->destination
;
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0. The outcome is also recorded in
DETECT_INFO: CATEGORY_MASK_CHARSET is always added to `checked',
and is added to `rejected' or `found' depending on the bytes
seen. */
4575 detect_coding_charset (coding
, detect_info
)
4576 struct coding_system
*coding
;
4577 struct coding_detection_info
*detect_info
;
4579 const unsigned char *src
= coding
->source
, *src_base
= src
;
4580 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4581 int multibytep
= coding
->src_multibyte
;
4582 int consumed_chars
= 0;
4583 Lisp_Object attrs
, valids
;
4586 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
/* From here on CODING refers to the entry for the charset category
in coding_categories, not to the argument that was passed in. */
4588 coding
= &coding_categories
[coding_category_charset
];
4589 attrs
= CODING_ID_ATTRS (coding
->id
);
4590 valids
= AREF (attrs
, coding_attr_charset_valids
);
/* Skip the leading ASCII part when the coding system is ASCII
compatible. */
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4593 src
+= coding
->head_ascii
;
/* Test for a byte that has no entry in VALIDS. */
4600 if (NILP (AREF (valids
, c
)))
4603 found
= CATEGORY_MASK_CHARSET
;
4605 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4609 detect_info
->found
|= found
;
/* Decode text of a charset-based coding system from CODING->source
(starting at CODING->consumed) into CODING->charbuf. The vector
stored in the coding_attr_charset_valids attribute is indexed by a
source byte to find the charset (or list of charsets) to decode
with. Progress is recorded in CODING->consumed_char,
CODING->consumed and CODING->charbuf_used. */
4614 decode_coding_charset (coding
)
4615 struct coding_system
*coding
;
4617 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4618 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4619 const unsigned char *src_base
;
4620 int *charbuf
= coding
->charbuf
;
4621 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4622 int consumed_chars
= 0, consumed_chars_base
;
4623 int multibytep
= coding
->src_multibyte
;
4624 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4625 int char_offset
= coding
->produced_char
;
4626 int last_offset
= char_offset
;
4627 int last_id
= charset_ascii
;
4629 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4630 valids
= AREF (attrs
, coding_attr_charset_valids
);
4637 consumed_chars_base
= consumed_chars
;
4639 if (charbuf
>= charbuf_end
)
4645 /* Here we assume that no charset maps '\r' to something
4647 if (EQ (eol_type
, Qdos
))
4651 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4652 goto no_more_source
;
4657 else if (EQ (eol_type
, Qmac
))
4663 struct charset
*charset
;
4668 val
= AREF (valids
, c
);
4673 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4674 dim
= CHARSET_DIMENSION (charset
);
4678 code
= (code
<< 8) | c
;
4681 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4691 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4692 dim
= CHARSET_DIMENSION (charset
);
4696 code
= (code
<< 8) | c
;
4699 CODING_DECODE_CHAR (coding
, src
, src_base
,
4700 src_end
, charset
, code
, c
);
4708 if (charset
->id
!= charset_ascii
4709 && last_id
!= charset
->id
)
4711 if (last_id
!= charset_ascii
)
4712 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4713 last_id
= charset
->id
;
4714 last_offset
= char_offset
;
4723 consumed_chars
= consumed_chars_base
;
4725 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4731 if (last_id
!= charset_ascii
)
4732 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4733 coding
->consumed_char
+= consumed_chars_base
;
4734 coding
->consumed
= src_base
- coding
->source
;
4735 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encode the characters in CODING->charbuf for a charset-based coding
system, writing from CODING->destination + CODING->produced onward.
Each character is looked up with char_charset and its code is
emitted as one to four bytes according to the charset dimension.
The outcome is stored in CODING->result, CODING->produced_char and
CODING->produced. */
4739 encode_coding_charset (coding
)
4740 struct coding_system
*coding
;
4742 int multibytep
= coding
->dst_multibyte
;
4743 int *charbuf
= coding
->charbuf
;
4744 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4745 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4746 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4747 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4748 int produced_chars
= 0;
4749 Lisp_Object attrs
, eol_type
, charset_list
;
4750 int ascii_compatible
;
4753 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4754 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4756 while (charbuf
< charbuf_end
)
4758 struct charset
*charset
;
4761 ASSURE_DESTINATION (safe_room
);
4763 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4764 EMIT_ONE_ASCII_BYTE (c
);
4765 else if (CHAR_BYTE8_P (c
))
4767 c
= CHAR_TO_BYTE8 (c
);
4772 charset
= char_charset (c
, charset_list
, &code
);
/* Emit CODE most significant byte first, using as many bytes as
the charset dimension. */
4775 if (CHARSET_DIMENSION (charset
) == 1)
4776 EMIT_ONE_BYTE (code
);
4777 else if (CHARSET_DIMENSION (charset
) == 2)
4778 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4779 else if (CHARSET_DIMENSION (charset
) == 3)
4780 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4782 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4783 (code
>> 8) & 0xFF, code
& 0xFF);
/* Choose a substitute for C: CODING_INHIBIT_CHARACTER_SUBSTITUTION
in safe-encoding mode, the default character of CODING in the
other assignment below. */
4787 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4788 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4790 c
= coding
->default_char
;
/* Report the outcome back in CODING. */
4796 coding
->result
= CODING_RESULT_SUCCESS
;
4797 coding
->produced_char
+= produced_chars
;
4798 coding
->produced
= dst
- coding
->destination
;
4803 /*** 10. C library functions ***/
4805 /* Set up coding context CODING from information about CODING_SYSTEM.
4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4807 CODING_SYSTEM is invalid, signal an error.
The function fills in CODING->id, the common flags, the safe-charset
data and the default character, then installs the detector, decoder
and encoder that match the coding type. */
4810 setup_coding_system (coding_system
, coding
)
4811 Lisp_Object coding_system
;
4812 struct coding_system
*coding
;
4815 Lisp_Object eol_type
;
4816 Lisp_Object coding_type
;
4819 if (NILP (coding_system
))
4820 coding_system
= Qno_conversion
;
4822 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4824 attrs
= CODING_ID_ATTRS (coding
->id
);
4825 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4828 coding
->head_ascii
= -1;
/* Detection is required while the eol type is still a vector; the
post-read, pre-write and for-unibyte attributes each add their own
flag. */
4829 coding
->common_flags
4830 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4831 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4832 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4834 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
4836 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
/* The safe-charsets attribute is a string; its length minus one
becomes the maximum charset ID. */
4838 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4839 coding
->max_charset_id
= SCHARS (val
) - 1;
4840 coding
->safe_charsets
= (char *) SDATA (val
);
4841 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
/* Install detector, decoder and encoder according to the coding
type. */
4843 coding_type
= CODING_ATTR_TYPE (attrs
);
4844 if (EQ (coding_type
, Qundecided
))
4846 coding
->detector
= NULL
;
4847 coding
->decoder
= decode_coding_raw_text
;
4848 coding
->encoder
= encode_coding_raw_text
;
4849 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4851 else if (EQ (coding_type
, Qiso_2022
))
4854 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding
, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding
, 1)
4860 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i
= 0; i
< 4; i
++)
4863 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding
) = 1;
4868 coding
->detector
= detect_coding_iso_2022
;
4869 coding
->decoder
= decode_coding_iso_2022
;
4870 coding
->encoder
= encode_coding_iso_2022
;
4871 if (flags
& CODING_ISO_FLAG_SAFE
)
4872 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4873 coding
->common_flags
4874 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK
);
4876 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4877 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4878 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4879 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
/* With full support, recompute the safe charsets and refresh the
values cached in CODING. */
4880 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4882 setup_iso_safe_charsets (attrs
);
4883 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4884 coding
->max_charset_id
= SCHARS (val
) - 1;
4885 coding
->safe_charsets
= (char *) SDATA (val
);
4887 CODING_ISO_FLAGS (coding
) = flags
;
4889 else if (EQ (coding_type
, Qcharset
))
4891 coding
->detector
= detect_coding_charset
;
4892 coding
->decoder
= decode_coding_charset
;
4893 coding
->encoder
= encode_coding_charset
;
4894 coding
->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4897 else if (EQ (coding_type
, Qutf_8
))
4899 coding
->detector
= detect_coding_utf_8
;
4900 coding
->decoder
= decode_coding_utf_8
;
4901 coding
->encoder
= encode_coding_utf_8
;
4902 coding
->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4905 else if (EQ (coding_type
, Qutf_16
))
/* BOM handling: a cons means detect, t means with BOM, anything
else means without BOM. */
4907 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4908 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4909 : EQ (val
, Qt
) ? utf_16_with_bom
4910 : utf_16_without_bom
);
4911 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4912 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4913 : utf_16_little_endian
);
4914 CODING_UTF_16_SURROGATE (coding
) = 0;
4915 coding
->detector
= detect_coding_utf_16
;
4916 coding
->decoder
= decode_coding_utf_16
;
4917 coding
->encoder
= encode_coding_utf_16
;
4918 coding
->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4920 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4921 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4923 else if (EQ (coding_type
, Qccl
))
4925 coding
->detector
= detect_coding_ccl
;
4926 coding
->decoder
= decode_coding_ccl
;
4927 coding
->encoder
= encode_coding_ccl
;
4928 coding
->common_flags
4929 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK
);
4932 else if (EQ (coding_type
, Qemacs_mule
))
4934 coding
->detector
= detect_coding_emacs_mule
;
4935 coding
->decoder
= decode_coding_emacs_mule
;
4936 coding
->encoder
= encode_coding_emacs_mule
;
4937 coding
->common_flags
4938 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
/* With the emacs-mule-full attribute set and a charset list other
than Vemacs_mule_charset_list, build a fresh safe-charsets string
sized by the largest ID in Vemacs_mule_charset_list. */
4939 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4942 Lisp_Object tail
, safe_charsets
;
4943 int max_charset_id
= 0;
4945 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4947 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4948 max_charset_id
= XFASTINT (XCAR (tail
));
4949 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4951 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4953 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
4954 coding
->max_charset_id
= max_charset_id
;
4955 coding
->safe_charsets
= (char *) SDATA (safe_charsets
);
4958 else if (EQ (coding_type
, Qshift_jis
))
4960 coding
->detector
= detect_coding_sjis
;
4961 coding
->decoder
= decode_coding_sjis
;
4962 coding
->encoder
= encode_coding_sjis
;
4963 coding
->common_flags
4964 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4966 else if (EQ (coding_type
, Qbig5
))
4968 coding
->detector
= detect_coding_big5
;
4969 coding
->decoder
= decode_coding_big5
;
4970 coding
->encoder
= encode_coding_big5
;
4971 coding
->common_flags
4972 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4974 else /* EQ (coding_type, Qraw_text) */
4976 coding
->detector
= NULL
;
4977 coding
->decoder
= decode_coding_raw_text
;
4978 coding
->encoder
= encode_coding_raw_text
;
4984 /* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM. */
4988 raw_text_coding_system (coding_system
)
4989 Lisp_Object coding_system
;
4991 Lisp_Object spec
, attrs
;
4992 Lisp_Object eol_type
, raw_text_eol_type
;
4994 if (NILP (coding_system
))
4996 spec
= CODING_SYSTEM_SPEC (coding_system
);
4997 attrs
= AREF (spec
, 0);
4999 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5000 return coding_system
;
5002 eol_type
= AREF (spec
, 2);
5003 if (VECTORP (eol_type
))
5005 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5006 raw_text_eol_type
= AREF (spec
, 2);
5007 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5008 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5009 : AREF (raw_text_eol_type
, 2));
5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014 does, return the subsidiary of CODING_SYSTEM that has the same
5015 eol-spec as PARENT. Otherwise, return CODING_SYSTEM. */
5018 coding_inherit_eol_type (coding_system
, parent
)
5019 Lisp_Object coding_system
, parent
;
5021 Lisp_Object spec
, eol_type
;
5023 if (NILP (coding_system
))
5024 coding_system
= Qraw_text
;
5025 spec
= CODING_SYSTEM_SPEC (coding_system
);
5026 eol_type
= AREF (spec
, 2);
5027 if (VECTORP (eol_type
)
5030 Lisp_Object parent_spec
;
5031 Lisp_Object parent_eol_type
;
5034 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5035 parent_eol_type
= AREF (parent_spec
, 2);
5036 if (EQ (parent_eol_type
, Qunix
))
5037 coding_system
= AREF (eol_type
, 0);
5038 else if (EQ (parent_eol_type
, Qdos
))
5039 coding_system
= AREF (eol_type
, 1);
5040 else if (EQ (parent_eol_type
, Qmac
))
5041 coding_system
= AREF (eol_type
, 2);
5043 return coding_system
;
5046 /* Emacs has a mechanism to automatically detect the coding system of
5047 a text if it is in Emacs' internal format, ISO2022, SJIS, or BIG5.
5048 However, it's impossible to distinguish some coding systems accurately
5049 because they use the same range of codes. So, at first, coding
5050 systems are grouped into the following categories:
5052 o coding-category-emacs-mule
5054 The category for a coding system which has the same code range
5055 as Emacs' internal format. Assigned the coding-system (Lisp
5056 symbol) `emacs-mule' by default.
5058 o coding-category-sjis
5060 The category for a coding system which has the same code range
5061 as SJIS. Assigned the coding-system (Lisp
5062 symbol) `japanese-shift-jis' by default.
5064 o coding-category-iso-7
5066 The category for a coding system which has the same code range
5067 as ISO2022 of 7-bit environment. This doesn't use any locking
5068 shift and single shift functions. This can encode/decode all
5069 charsets. Assigned the coding-system (Lisp symbol)
5070 `iso-2022-7bit' by default.
5072 o coding-category-iso-7-tight
5074 Same as coding-category-iso-7 except that this can
5075 encode/decode only the specified charsets.
5077 o coding-category-iso-8-1
5079 The category for a coding system which has the same code range
5080 as ISO2022 of 8-bit environment and graphic plane 1 used only
5081 for DIMENSION1 charset. This doesn't use any locking shift
5082 and single shift functions. Assigned the coding-system (Lisp
5083 symbol) `iso-latin-1' by default.
5085 o coding-category-iso-8-2
5087 The category for a coding system which has the same code range
5088 as ISO2022 of 8-bit environment and graphic plane 1 used only
5089 for DIMENSION2 charset. This doesn't use any locking shift
5090 and single shift functions. Assigned the coding-system (Lisp
5091 symbol) `japanese-iso-8bit' by default.
5093 o coding-category-iso-7-else
5095 The category for a coding system which has the same code range
5096 as ISO2022 of 7-bit environment but uses locking shift or
5097 single shift functions. Assigned the coding-system (Lisp
5098 symbol) `iso-2022-7bit-lock' by default.
5100 o coding-category-iso-8-else
5102 The category for a coding system which has the same code range
5103 as ISO2022 of 8-bit environment but uses locking shift or
5104 single shift functions. Assigned the coding-system (Lisp
5105 symbol) `iso-2022-8bit-ss2' by default.
5107 o coding-category-big5
5109 The category for a coding system which has the same code range
5110 as BIG5. Assigned the coding-system (Lisp symbol)
5111 `cn-big5' by default.
5113 o coding-category-utf-8
5115 The category for a coding system which has the same code range
5116 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5117 symbol) `utf-8' by default.
5119 o coding-category-utf-16-be
5121 The category for a coding system in which a text has an
5122 Unicode signature (cf. Unicode Standard) in the order of BIG
5123 endian at the head. Assigned the coding-system (Lisp symbol)
5124 `utf-16-be' by default.
5126 o coding-category-utf-16-le
5128 The category for a coding system in which a text has an
5129 Unicode signature (cf. Unicode Standard) in the order of
5130 LITTLE endian at the head. Assigned the coding-system (Lisp
5131 symbol) `utf-16-le' by default.
5133 o coding-category-ccl
5135 The category for a coding system of which encoder/decoder is
5136 written in CCL programs. The default value is nil, i.e., no
5137 coding system is assigned.
5139 o coding-category-binary
5141 The category for a coding system not categorized in any of the
5142 above. Assigned the coding-system (Lisp symbol)
5143 `no-conversion' by default.
5145 Each of them is a Lisp symbol whose value is an actual
5146 `coding-system' (this is also a Lisp symbol) assigned by a user.
5147 What Emacs actually does is to detect a category of coding system.
5148 Then, it uses the `coding-system' assigned to that category. If Emacs
5149 can't decide on a single possible category, it selects the category of
5150 the highest priority. Priorities of categories are also specified by a
5151 user in a Lisp variable `coding-category-list'.
5155 #define EOL_SEEN_NONE 0
5156 #define EOL_SEEN_LF 1
5157 #define EOL_SEEN_CR 2
5158 #define EOL_SEEN_CRLF 4
5160 /* Detect how end-of-line is encoded in the text of length SRC_BYTES
5161 pointed to by SOURCE. If CATEGORY is one of
5162 coding_category_utf_16_XXXX, assume that CR and LF are each encoded
5163 in two bytes; otherwise, assume they are each encoded in one byte.
5165 Return one of EOL_SEEN_XXX. */
5167 #define MAX_EOL_CHECK_COUNT 3
5170 detect_eol (source
, src_bytes
, category
)
5171 unsigned char *source
;
5172 EMACS_INT src_bytes
;
5173 enum coding_category category
;
5175 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5178 int eol_seen
= EOL_SEEN_NONE
;
5180 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5184 msb
= category
== (coding_category_utf_16_le
5185 | coding_category_utf_16_le_nosig
);
5188 while (src
+ 1 < src_end
)
5191 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5196 this_eol
= EOL_SEEN_LF
;
5197 else if (src
+ 3 >= src_end
5198 || src
[msb
+ 2] != 0
5199 || src
[lsb
+ 2] != '\n')
5200 this_eol
= EOL_SEEN_CR
;
5202 this_eol
= EOL_SEEN_CRLF
;
5204 if (eol_seen
== EOL_SEEN_NONE
)
5205 /* This is the first end-of-line. */
5206 eol_seen
= this_eol
;
5207 else if (eol_seen
!= this_eol
)
5209 /* The found type is different from what was found before. */
5210 eol_seen
= EOL_SEEN_LF
;
5213 if (++total
== MAX_EOL_CHECK_COUNT
)
5221 while (src
< src_end
)
5224 if (c
== '\n' || c
== '\r')
5229 this_eol
= EOL_SEEN_LF
;
5230 else if (src
>= src_end
|| *src
!= '\n')
5231 this_eol
= EOL_SEEN_CR
;
5233 this_eol
= EOL_SEEN_CRLF
, src
++;
5235 if (eol_seen
== EOL_SEEN_NONE
)
5236 /* This is the first end-of-line. */
5237 eol_seen
= this_eol
;
5238 else if (eol_seen
!= this_eol
)
5240 /* The found type is different from what was found before. */
5241 eol_seen
= EOL_SEEN_LF
;
5244 if (++total
== MAX_EOL_CHECK_COUNT
)
5254 adjust_coding_eol_type (coding
, eol_seen
)
5255 struct coding_system
*coding
;
5258 Lisp_Object eol_type
;
5260 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5261 if (eol_seen
& EOL_SEEN_LF
)
5262 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5263 else if (eol_seen
& EOL_SEEN_CRLF
)
5264 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5265 else if (eol_seen
& EOL_SEEN_CR
)
5266 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5269 /* Detect how a text specified in CODING is encoded. If a coding
5270 system is detected, update fields of CODING by the detected coding
5274 detect_coding (coding
)
5275 struct coding_system
*coding
;
5277 const unsigned char *src
, *src_end
;
5278 Lisp_Object attrs
, coding_type
;
5280 coding
->consumed
= coding
->consumed_char
= 0;
5281 coding
->produced
= coding
->produced_char
= 0;
5282 coding_set_source (coding
);
5284 src_end
= coding
->source
+ coding
->src_bytes
;
5286 /* If we have not yet decided the text encoding type, detect it
5288 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5292 for (src
= coding
->source
; src
< src_end
; src
++)
5295 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5297 || c
== ISO_CODE_SO
)))
5300 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5302 if (coding
->head_ascii
< coding
->src_bytes
)
5304 struct coding_detection_info detect_info
;
5305 enum coding_category category
;
5306 struct coding_system
*this;
5308 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5309 for (i
= 0; i
< coding_category_raw_text
; i
++)
5311 category
= coding_priorities
[i
];
5312 this = coding_categories
+ category
;
5315 /* No coding system of this category is defined. */
5316 detect_info
.rejected
|= (1 << category
);
5318 else if (category
>= coding_category_raw_text
)
5320 else if (detect_info
.checked
& (1 << category
))
5322 if (detect_info
.found
& (1 << category
))
5325 else if ((*(this->detector
)) (coding
, &detect_info
)
5326 && detect_info
.found
& (1 << category
))
5329 if (i
< coding_category_raw_text
)
5330 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5331 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5332 setup_coding_system (Qraw_text
, coding
);
5333 else if (detect_info
.rejected
)
5334 for (i
= 0; i
< coding_category_raw_text
; i
++)
5335 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5337 this = coding_categories
+ coding_priorities
[i
];
5338 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5343 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qutf_16
))
5345 Lisp_Object coding_systems
;
5346 struct coding_detection_info detect_info
;
5349 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5350 detect_info
.found
= detect_info
.rejected
= 0;
5351 if (CONSP (coding_systems
)
5352 && detect_coding_utf_16 (coding
, &detect_info
)
5353 && (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
5354 | CATEGORY_MASK_UTF_16_BE
)))
5356 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5357 setup_coding_system (XCAR (coding_systems
), coding
);
5359 setup_coding_system (XCDR (coding_systems
), coding
);
5363 attrs
= CODING_ID_ATTRS (coding
->id
);
5364 coding_type
= CODING_ATTR_TYPE (attrs
);
5366 /* If we have not yet decided the EOL type, detect it now. But, the
5367 detection is impossible for a CCL based coding system, in which
5368 case, we detect the EOL type after decoding. */
5369 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5370 && ! EQ (coding_type
, Qccl
))
5372 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5373 (enum coding_category
) XINT (CODING_ATTR_CATEGORY (attrs
)));
5375 if (eol_seen
!= EOL_SEEN_NONE
)
5376 adjust_coding_eol_type (coding
, eol_seen
);
5383 struct coding_system
*coding
;
5385 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5387 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5388 unsigned char *pend
= p
+ coding
->produced
;
5389 int eol_seen
= EOL_SEEN_NONE
;
5391 for (; p
< pend
; p
++)
5394 eol_seen
|= EOL_SEEN_LF
;
5395 else if (*p
== '\r')
5397 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5399 eol_seen
|= EOL_SEEN_CRLF
;
5403 eol_seen
|= EOL_SEEN_CR
;
5406 if (eol_seen
!= EOL_SEEN_NONE
)
5407 adjust_coding_eol_type (coding
, eol_seen
);
5410 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5412 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5413 unsigned char *pend
= p
+ coding
->produced
;
5415 for (; p
< pend
; p
++)
5419 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5421 unsigned char *p
, *pbeg
, *pend
;
5422 Lisp_Object undo_list
;
5424 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5425 coding
->dst_pos_byte
+ coding
->produced
);
5426 undo_list
= current_buffer
->undo_list
;
5427 current_buffer
->undo_list
= Qt
;
5428 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5429 current_buffer
->undo_list
= undo_list
;
5431 pend
= pbeg
+ coding
->produced
;
5433 for (p
= pend
- 1; p
>= pbeg
; p
--)
5436 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5439 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5440 coding
->produced
= pend
- pbeg
;
5441 insert_from_gap (coding
->produced_char
, coding
->produced
);
5446 translate_chars (coding
, table
)
5447 struct coding_system
*coding
;
5450 int *charbuf
= coding
->charbuf
;
5451 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5454 if (coding
->chars_at_source
)
5457 while (charbuf
< charbuf_end
)
5463 *charbuf
++ = translate_char (table
, c
);
5468 produce_chars (coding
)
5469 struct coding_system
*coding
;
5471 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5472 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5474 int produced_chars
= 0;
5476 if (! coding
->chars_at_source
)
5478 /* Characters are in coding->charbuf. */
5479 int *buf
= coding
->charbuf
;
5480 int *buf_end
= buf
+ coding
->charbuf_used
;
5481 unsigned char *adjusted_dst_end
;
5483 if (BUFFERP (coding
->src_object
)
5484 && EQ (coding
->src_object
, coding
->dst_object
))
5485 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
5486 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5488 while (buf
< buf_end
)
5492 if (dst
>= adjusted_dst_end
)
5494 dst
= alloc_destination (coding
,
5495 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5497 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5498 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5502 if (coding
->dst_multibyte
5503 || ! CHAR_BYTE8_P (c
))
5504 CHAR_STRING_ADVANCE (c
, dst
);
5506 *dst
++ = CHAR_TO_BYTE8 (c
);
5510 /* This is an annotation datum. (-C) is the length of
5517 const unsigned char *src
= coding
->source
;
5518 const unsigned char *src_end
= src
+ coding
->src_bytes
;
5519 Lisp_Object eol_type
;
5521 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5523 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5525 if (coding
->src_multibyte
)
5532 const unsigned char *src_base
= src
;
5538 if (EQ (eol_type
, Qdos
))
5542 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5543 goto no_more_source
;
5548 else if (EQ (eol_type
, Qmac
))
5553 coding
->consumed
= src
- coding
->source
;
5555 if (EQ (coding
->src_object
, coding
->dst_object
))
5556 dst_end
= (unsigned char *) src
;
5559 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5561 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5562 coding_set_source (coding
);
5563 src
= coding
->source
+ coding
->consumed
;
5564 src_end
= coding
->source
+ coding
->src_bytes
;
5574 while (src
< src_end
)
5581 if (EQ (eol_type
, Qdos
))
5587 else if (EQ (eol_type
, Qmac
))
5590 if (dst
>= dst_end
- 1)
5592 coding
->consumed
= src
- coding
->source
;
5594 if (EQ (coding
->src_object
, coding
->dst_object
))
5595 dst_end
= (unsigned char *) src
;
5596 if (dst
>= dst_end
- 1)
5598 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5600 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5601 coding_set_source (coding
);
5602 src
= coding
->source
+ coding
->consumed
;
5603 src_end
= coding
->source
+ coding
->src_bytes
;
5611 if (!EQ (coding
->src_object
, coding
->dst_object
))
5613 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5617 EMACS_INT offset
= src
- coding
->source
;
5619 dst
= alloc_destination (coding
, require
, dst
);
5620 coding_set_source (coding
);
5621 src
= coding
->source
+ offset
;
5622 src_end
= coding
->source
+ coding
->src_bytes
;
5625 produced_chars
= coding
->src_chars
;
5626 while (src
< src_end
)
5632 if (EQ (eol_type
, Qdos
))
5639 else if (EQ (eol_type
, Qmac
))
5645 coding
->consumed
= coding
->src_bytes
;
5646 coding
->consumed_char
= coding
->src_chars
;
5649 produced
= dst
- (coding
->destination
+ coding
->produced
);
5650 if (BUFFERP (coding
->dst_object
))
5651 insert_from_gap (produced_chars
, produced
);
5652 coding
->produced
+= produced
;
5653 coding
->produced_char
+= produced_chars
;
5654 return produced_chars
;
5657 /* Compose text in CODING->dst_object according to the annotation data at
5658 CHARBUF. CHARBUF is an array:
5659 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5663 produce_composition (coding
, charbuf
)
5664 struct coding_system
*coding
;
5669 enum composition_method method
;
5670 Lisp_Object components
;
5673 from
= coding
->dst_pos
+ charbuf
[2];
5674 to
= coding
->dst_pos
+ charbuf
[3];
5675 method
= (enum composition_method
) (charbuf
[4]);
5677 if (method
== COMPOSITION_RELATIVE
)
5681 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5686 for (i
= 0; i
< len
; i
++)
5687 args
[i
] = make_number (charbuf
[i
]);
5688 components
= (method
== COMPOSITION_WITH_ALTCHARS
5689 ? Fstring (len
, args
) : Fvector (len
, args
));
5691 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5695 /* Put `charset' property on text in CODING->dst_object according to
5696 the annotation data at CHARBUF. CHARBUF is an array:
5697 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5701 produce_charset (coding
, charbuf
)
5702 struct coding_system
*coding
;
5705 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5706 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5707 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5709 Fput_text_property (make_number (from
), make_number (to
),
5710 Qcharset
, CHARSET_NAME (charset
),
5711 coding
->dst_object
);
5715 #define CHARBUF_SIZE 0x4000
5717 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5719 int size = CHARBUF_SIZE;; \
5721 coding->charbuf = NULL; \
5722 while (size > 1024) \
5724 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5725 if (coding->charbuf) \
5729 if (! coding->charbuf) \
5731 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5732 return coding->result; \
5734 coding->charbuf_size = size; \
5739 produce_annotation (coding
)
5740 struct coding_system
*coding
;
5742 int *charbuf
= coding
->charbuf
;
5743 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5745 if (NILP (coding
->dst_object
))
5748 while (charbuf
< charbuf_end
)
5754 int len
= -*charbuf
;
5757 case CODING_ANNOTATE_COMPOSITION_MASK
:
5758 produce_composition (coding
, charbuf
);
5760 case CODING_ANNOTATE_CHARSET_MASK
:
5761 produce_charset (coding
, charbuf
);
5771 /* Decode the data at CODING->src_object into CODING->dst_object.
5772 CODING->src_object is a buffer, a string, or nil.
5773 CODING->dst_object is a buffer.
5775 If CODING->src_object is a buffer, it must be the current buffer.
5776 In this case, if CODING->src_pos is positive, it is a position of
5777 the source text in the buffer, otherwise, the source text is in the
5778 gap area of the buffer, and CODING->src_pos specifies the offset of
5779 the text from GPT (which must be the same as PT). If this is the
5780 same buffer as CODING->dst_object, CODING->src_pos must be
5783 If CODING->src_object is a string, CODING->src_pos is an index to
5786 If CODING->src_object is nil, CODING->source must already point to
5787 the non-relocatable memory area. In this case, CODING->src_pos is
5788 an offset from CODING->source.
5790 The decoded data is inserted at the current point of the buffer
5795 decode_coding (coding
)
5796 struct coding_system
*coding
;
5800 if (BUFFERP (coding
->src_object
)
5801 && coding
->src_pos
> 0
5802 && coding
->src_pos
< GPT
5803 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5804 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5806 if (BUFFERP (coding
->dst_object
))
5808 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5809 set_buffer_internal (XBUFFER (coding
->dst_object
));
5811 move_gap_both (PT
, PT_BYTE
);
5814 coding
->consumed
= coding
->consumed_char
= 0;
5815 coding
->produced
= coding
->produced_char
= 0;
5816 coding
->chars_at_source
= 0;
5817 coding
->result
= CODING_RESULT_SUCCESS
;
5820 ALLOC_CONVERSION_WORK_AREA (coding
);
5822 attrs
= CODING_ID_ATTRS (coding
->id
);
5826 coding_set_source (coding
);
5827 coding
->annotated
= 0;
5828 (*(coding
->decoder
)) (coding
);
5829 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5830 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5831 else if (!NILP (Vstandard_translation_table_for_decode
))
5832 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5833 coding_set_destination (coding
);
5834 produce_chars (coding
);
5835 if (coding
->annotated
)
5836 produce_annotation (coding
);
5838 while (coding
->consumed
< coding
->src_bytes
5839 && ! coding
->result
);
5841 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5842 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5843 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5844 decode_eol (coding
);
5846 coding
->carryover_bytes
= 0;
5847 if (coding
->consumed
< coding
->src_bytes
)
5849 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5850 const unsigned char *src
;
5852 coding_set_source (coding
);
5853 coding_set_destination (coding
);
5854 src
= coding
->source
+ coding
->consumed
;
5856 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5858 /* Flush out unprocessed data as binary chars. We are sure
5859 that the number of data is less than the size of
5861 while (nbytes
-- > 0)
5865 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5867 produce_chars (coding
);
5871 /* Record unprocessed bytes in coding->carryover. We are
5872 sure that the number of data is less than the size of
5873 coding->carryover. */
5874 unsigned char *p
= coding
->carryover
;
5876 coding
->carryover_bytes
= nbytes
;
5877 while (nbytes
-- > 0)
5880 coding
->consumed
= coding
->src_bytes
;
5883 return coding
->result
;
5887 /* Extract an annotation datum from a composition starting at POS and
5888 ending before LIMIT of CODING->src_object (buffer or string), store
5889 the data in BUF, set *STOP to a starting position of the next
5890 composition (if any) or to LIMIT, and return the address of the
5891 next element of BUF.
5893 If such an annotation is not found, set *STOP to a starting
5894 position of a composition after POS (if any) or to LIMIT, and
5898 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5899 EMACS_INT pos
, limit
;
5900 struct coding_system
*coding
;
5904 EMACS_INT start
, end
;
5907 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5910 else if (start
> pos
)
5916 /* We found a composition. Store the corresponding
5917 annotation data in BUF. */
5919 enum composition_method method
= COMPOSITION_METHOD (prop
);
5920 int nchars
= COMPOSITION_LENGTH (prop
);
5922 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5923 if (method
!= COMPOSITION_RELATIVE
)
5925 Lisp_Object components
;
5928 components
= COMPOSITION_COMPONENTS (prop
);
5929 if (VECTORP (components
))
5931 len
= XVECTOR (components
)->size
;
5932 for (i
= 0; i
< len
; i
++)
5933 *buf
++ = XINT (AREF (components
, i
));
5935 else if (STRINGP (components
))
5937 len
= SCHARS (components
);
5941 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5945 else if (INTEGERP (components
))
5948 *buf
++ = XINT (components
);
5950 else if (CONSP (components
))
5952 for (len
= 0; CONSP (components
);
5953 len
++, components
= XCDR (components
))
5954 *buf
++ = XINT (XCAR (components
));
5962 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5973 /* Extract an annotation datum from a text property `charset' at POS of
5974 CODING->src_object (buffer or string), store the data in BUF, set
5975 *STOP to the position where the value of `charset' property changes
5976 (limited by LIMIT), and return the address of the next element of
5979 If the property value is nil, set *STOP to the position where the
5980 property value is non-nil (limited by LIMIT), and return BUF. */
5983 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5984 EMACS_INT pos
, limit
;
5985 struct coding_system
*coding
;
5989 Lisp_Object val
, next
;
5992 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
5993 if (! NILP (val
) && CHARSETP (val
))
5994 id
= XINT (CHARSET_SYMBOL_ID (val
));
5997 ADD_CHARSET_DATA (buf
, 0, 0, id
);
5998 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
6000 make_number (limit
));
6001 *stop
= XINT (next
);
6007 consume_chars (coding
)
6008 struct coding_system
*coding
;
6010 int *buf
= coding
->charbuf
;
6011 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6012 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6013 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6014 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6015 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6016 int multibytep
= coding
->src_multibyte
;
6017 Lisp_Object eol_type
;
6019 EMACS_INT stop
, stop_composition
, stop_charset
;
6021 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6022 if (VECTORP (eol_type
))
6025 /* Note: composition handling is not yet implemented. */
6026 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6028 if (NILP (coding
->src_object
))
6029 stop
= stop_composition
= stop_charset
= end_pos
;
6032 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6033 stop
= stop_composition
= pos
;
6035 stop
= stop_composition
= end_pos
;
6036 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6037 stop
= stop_charset
= pos
;
6039 stop_charset
= end_pos
;
6042 /* Compensate for CRLF and annotation. */
6043 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6044 while (buf
< buf_end
)
6050 if (pos
== stop_composition
)
6051 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6052 buf
, &stop_composition
);
6053 if (pos
== stop_charset
)
6054 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6055 buf
, &stop_charset
);
6056 stop
= (stop_composition
< stop_charset
6057 ? stop_composition
: stop_charset
);
6064 if (! CODING_FOR_UNIBYTE (coding
)
6065 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6066 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6071 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6072 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6074 if (! EQ (eol_type
, Qunix
))
6078 if (EQ (eol_type
, Qdos
))
6087 coding
->consumed
= src
- coding
->source
;
6088 coding
->consumed_char
= pos
- coding
->src_pos
;
6089 coding
->charbuf_used
= buf
- coding
->charbuf
;
6090 coding
->chars_at_source
= 0;
6094 /* Encode the text at CODING->src_object into CODING->dst_object.
6095 CODING->src_object is a buffer or a string.
6096 CODING->dst_object is a buffer or nil.
6098 If CODING->src_object is a buffer, it must be the current buffer.
6099 In this case, if CODING->src_pos is positive, it is a position of
6100 the source text in the buffer, otherwise, the source text is in the
6101 gap area of the buffer, and coding->src_pos specifies the offset of
6102 the text from GPT (which must be the same as PT). If this is the
6103 same buffer as CODING->dst_object, CODING->src_pos must be
6104 negative and CODING should not have `pre-write-conversion'.
6106 If CODING->src_object is a string, CODING should not have
6107 `pre-write-conversion'.
6109 If CODING->dst_object is a buffer, the encoded data is inserted at
6110 the current point of that buffer.
6112 If CODING->dst_object is nil, the encoded data is placed at the
6113 memory area specified by CODING->destination. */
6116 encode_coding (coding
)
6117 struct coding_system
*coding
;
6121 attrs
= CODING_ID_ATTRS (coding
->id
);
6123 if (BUFFERP (coding
->dst_object
))
6125 set_buffer_internal (XBUFFER (coding
->dst_object
));
6126 coding
->dst_multibyte
6127 = ! NILP (current_buffer
->enable_multibyte_characters
);
6130 coding
->consumed
= coding
->consumed_char
= 0;
6131 coding
->produced
= coding
->produced_char
= 0;
6132 coding
->result
= CODING_RESULT_SUCCESS
;
6135 ALLOC_CONVERSION_WORK_AREA (coding
);
6138 coding_set_source (coding
);
6139 consume_chars (coding
);
6141 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6142 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6143 else if (!NILP (Vstandard_translation_table_for_encode
))
6144 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6146 coding_set_destination (coding
);
6147 (*(coding
->encoder
)) (coding
);
6148 } while (coding
->consumed_char
< coding
->src_chars
);
6150 if (BUFFERP (coding
->dst_object
))
6151 insert_from_gap (coding
->produced_char
, coding
->produced
);
6153 return (coding
->result
);
6157 /* Stack of working buffers used in code conversion. A nil element
6158 means that the code conversion of that level is not using a working
6160 Lisp_Object Vcode_conversion_work_buf_list
;
6162 /* A working buffer used by the top level conversion. */
6163 Lisp_Object Vcode_conversion_reused_work_buf
;
6166 /* Return a working buffer that can be freely used by the following
6167 code conversion. MULTIBYTEP specifies the multibyteness of the
6171 make_conversion_work_buffer (multibytep
, depth
)
6172 int multibytep
, depth
;
6174 struct buffer
*current
= current_buffer
;
6175 Lisp_Object buf
, name
;
6179 if (NILP (Vcode_conversion_reused_work_buf
))
6180 Vcode_conversion_reused_work_buf
6181 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6182 buf
= Vcode_conversion_reused_work_buf
;
6188 name
= build_string (" *code-converting-work*");
6189 name
= Fgenerate_new_buffer_name (name
, Qnil
);
6195 sprintf (str
, " *code-converting-work*<%d>", depth
);
6196 name
= build_string (str
);
6198 buf
= Fget_buffer_create (name
);
6200 set_buffer_internal (XBUFFER (buf
));
6201 current_buffer
->undo_list
= Qt
;
6203 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
6204 set_buffer_internal (current
);
6209 code_conversion_restore (buffer
)
6212 Lisp_Object workbuf
;
6214 workbuf
= XCAR (Vcode_conversion_work_buf_list
);
6215 if (! NILP (workbuf
)
6216 && ! EQ (workbuf
, Vcode_conversion_reused_work_buf
)
6217 && ! NILP (Fbuffer_live_p (workbuf
)))
6218 Fkill_buffer (workbuf
);
6219 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
6220 set_buffer_internal (XBUFFER (buffer
));
6225 code_conversion_save (buffer
, with_work_buf
, multibyte
)
6227 int with_work_buf
, multibyte
;
6229 Lisp_Object workbuf
;
6233 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
6235 workbuf
= make_conversion_work_buffer (multibyte
, depth
);
6239 Vcode_conversion_work_buf_list
6240 = Fcons (workbuf
, Vcode_conversion_work_buf_list
);
6241 record_unwind_protect (code_conversion_restore
, buffer
);
6246 decode_coding_gap (coding
, chars
, bytes
)
6247 struct coding_system
*coding
;
6248 EMACS_INT chars
, bytes
;
6250 int count
= specpdl_ptr
- specpdl
;
6254 buffer
= Fcurrent_buffer ();
6255 code_conversion_save (buffer
, 0, 0);
6257 coding
->src_object
= buffer
;
6258 coding
->src_chars
= chars
;
6259 coding
->src_bytes
= bytes
;
6260 coding
->src_pos
= -chars
;
6261 coding
->src_pos_byte
= -bytes
;
6262 coding
->src_multibyte
= chars
< bytes
;
6263 coding
->dst_object
= buffer
;
6264 coding
->dst_pos
= PT
;
6265 coding
->dst_pos_byte
= PT_BYTE
;
6266 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6267 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6269 if (CODING_REQUIRE_DETECTION (coding
))
6270 detect_coding (coding
);
6272 decode_coding (coding
);
6274 attrs
= CODING_ID_ATTRS (coding
->id
);
6275 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6277 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6280 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6281 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6282 make_number (coding
->produced_char
));
6284 coding
->produced_char
+= Z
- prev_Z
;
6285 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6288 unbind_to (count
, Qnil
);
6289 return coding
->result
;
6293 encode_coding_gap (coding
, chars
, bytes
)
6294 struct coding_system
*coding
;
6295 EMACS_INT chars
, bytes
;
6297 int count
= specpdl_ptr
- specpdl
;
6300 buffer
= Fcurrent_buffer ();
6301 code_conversion_save (buffer
, 0, 0);
6303 coding
->src_object
= buffer
;
6304 coding
->src_chars
= chars
;
6305 coding
->src_bytes
= bytes
;
6306 coding
->src_pos
= -chars
;
6307 coding
->src_pos_byte
= -bytes
;
6308 coding
->src_multibyte
= chars
< bytes
;
6309 coding
->dst_object
= coding
->src_object
;
6310 coding
->dst_pos
= PT
;
6311 coding
->dst_pos_byte
= PT_BYTE
;
6313 encode_coding (coding
);
6315 unbind_to (count
, Qnil
);
6316 return coding
->result
;
6320 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6321 SRC_OBJECT into DST_OBJECT by coding context CODING.
6323 SRC_OBJECT is a buffer, a string, or Qnil.
6325 If it is a buffer, the text is at point of the buffer. FROM and TO
6326 are positions in the buffer.
6328 If it is a string, the text is at the beginning of the string.
6329 FROM and TO are indices to the string.
6331 If it is nil, the text is at coding->source. FROM and TO are
6332 indices to coding->source.
6334 DST_OBJECT is a buffer, Qt, or Qnil.
6336 If it is a buffer, the decoded text is inserted at point of the
6337 buffer. If the buffer is the same as SRC_OBJECT, the source text
is deleted.
6340 If it is Qt, a string is made from the decoded text, and
6341 set in CODING->dst_object.
6343 If it is Qnil, the decoded text is stored at CODING->destination.
6344 The caller must allocate CODING->dst_bytes bytes at
6345 CODING->destination by xmalloc. If the decoded text is longer than
6346 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
*/
/* Decode FROM..TO of SRC_OBJECT into DST_OBJECT using CODING.
   Sets up the source and destination fields of CODING, runs detection
   if CODING requires it, decodes, runs the post-read function of the
   resulting coding system if there is one, and finally delivers the
   result as a string, a byte array or buffer text depending on
   DST_OBJECT.  */
6350 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6352 struct coding_system
*coding
;
6353 Lisp_Object src_object
;
6354 EMACS_INT from
, from_byte
, to
, to_byte
;
6355 Lisp_Object dst_object
;
6357 int count
= specpdl_ptr
- specpdl
;
6358 unsigned char *destination
;
6359 EMACS_INT dst_bytes
;
6360 EMACS_INT chars
= to
- from
;
6361 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays -1 unless the conversion is done in place.  */
6364 int saved_pt
= -1, saved_pt_byte
;
6366 buffer
= Fcurrent_buffer ();
/* A nil DST_OBJECT means the caller supplied a byte array; remember
   it and its size.  */
6368 if (NILP (dst_object
))
6370 destination
= coding
->destination
;
6371 dst_bytes
= coding
->dst_bytes
;
6374 coding
->src_object
= src_object
;
6375 coding
->src_chars
= chars
;
6376 coding
->src_bytes
= bytes
;
6377 coding
->src_multibyte
= chars
< bytes
;
6379 if (STRINGP (src_object
))
6381 coding
->src_pos
= from
;
6382 coding
->src_pos_byte
= from_byte
;
6384 else if (BUFFERP (src_object
))
6386 set_buffer_internal (XBUFFER (src_object
));
6388 move_gap_both (from
, from_byte
);
/* In-place conversion: remember point, delete the source text and
   record the source position as the negated lengths.  */
6389 if (EQ (src_object
, dst_object
))
6391 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6392 TEMP_SET_PT_BOTH (from
, from_byte
);
6393 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6394 coding
->src_pos
= -chars
;
6395 coding
->src_pos_byte
= -bytes
;
6399 coding
->src_pos
= from
;
6400 coding
->src_pos_byte
= from_byte
;
6404 if (CODING_REQUIRE_DETECTION (coding
))
6405 detect_coding (coding
);
6406 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Decode into a work buffer when a string result is wanted, or when a
   post-read function exists but no destination object was given.  */
6408 if (EQ (dst_object
, Qt
)
6409 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6410 && NILP (dst_object
)))
6412 coding
->dst_object
= code_conversion_save (buffer
, 1, 1);
6413 coding
->dst_pos
= BEG
;
6414 coding
->dst_pos_byte
= BEG_BYTE
;
6415 coding
->dst_multibyte
= 1;
6417 else if (BUFFERP (dst_object
))
6419 code_conversion_save (buffer
, 0, 0);
6420 coding
->dst_object
= dst_object
;
6421 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6422 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6423 coding
->dst_multibyte
6424 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6428 code_conversion_save (buffer
, 0, 0);
6429 coding
->dst_object
= Qnil
;
6430 coding
->dst_multibyte
= 1;
6433 decode_coding (coding
);
6435 if (BUFFERP (coding
->dst_object
))
6436 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* Run the post-read function at the destination position and add any
   change it makes to the buffer size to the produced counts.  */
6438 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6440 struct gcpro gcpro1
, gcpro2
;
6441 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6444 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6445 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6446 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6447 make_number (coding
->produced_char
));
6450 coding
->produced_char
+= Z
- prev_Z
;
6451 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6454 if (EQ (dst_object
, Qt
))
6456 coding
->dst_object
= Fbuffer_string ();
6458 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6460 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* NOTE(review): the braces around the following statements are not
   visible in this excerpt; confirm whether the copy into DESTINATION
   also happens when DST_BYTES is already large enough.  */
6461 if (dst_bytes
< coding
->produced
)
6464 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6467 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6468 unbind_to (count
, Qnil
);
6471 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6472 move_gap_both (BEGV
, BEGV_BYTE
);
6473 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6474 coding
->destination
= destination
;
6480 /* This is the case of:
6481 (BUFFERP (src_object) && EQ (src_object, dst_object))
6482 As we have moved PT while replacing the original buffer
6483 contents, we must recover it now. */
6484 set_buffer_internal (XBUFFER (src_object
));
6485 if (saved_pt
< from
)
6486 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6487 else if (saved_pt
< from
+ chars
)
6488 TEMP_SET_PT_BOTH (from
, from_byte
);
6489 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6490 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6491 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Last alternative: both the character and the byte position are
   shifted by the byte delta.  */
6493 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6494 saved_pt_byte
+ (coding
->produced
- bytes
));
6497 unbind_to (count
, Qnil
);
/* Encode FROM..TO of SRC_OBJECT into DST_OBJECT using CODING.
   If the coding system has a pre-write function, the source text is
   first copied into a work buffer, the function is called on that
   buffer, and the whole resulting buffer is encoded instead.  The
   encoded bytes end up in a buffer, in a unibyte string stored in
   CODING->dst_object (DST_OBJECT is Qt), or at CODING->destination.  */
6502 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6504 struct coding_system
*coding
;
6505 Lisp_Object src_object
;
6506 EMACS_INT from
, from_byte
, to
, to_byte
;
6507 Lisp_Object dst_object
;
6509 int count
= specpdl_ptr
- specpdl
;
6510 EMACS_INT chars
= to
- from
;
6511 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays -1 unless the conversion is done in place.  */
6514 int saved_pt
= -1, saved_pt_byte
;
6516 buffer
= Fcurrent_buffer ();
6518 coding
->src_object
= src_object
;
6519 coding
->src_chars
= chars
;
6520 coding
->src_bytes
= bytes
;
6521 coding
->src_multibyte
= chars
< bytes
;
6523 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Pre-write path: stage the source text in a work buffer.  */
6525 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6527 coding
->src_object
= code_conversion_save (buffer
, 1,
6528 coding
->src_multibyte
);
6529 set_buffer_internal (XBUFFER (coding
->src_object
));
6530 if (STRINGP (src_object
))
6531 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6532 else if (BUFFERP (src_object
))
6533 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6535 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
/* In-place conversion: the original text is deleted from the source
   buffer once it has been copied to the work buffer.  */
6537 if (EQ (src_object
, dst_object
))
6539 set_buffer_internal (XBUFFER (src_object
));
6540 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6541 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6542 set_buffer_internal (XBUFFER (coding
->src_object
));
/* Call the pre-write function on the whole work buffer; whatever
   buffer is current afterwards becomes the source.  */
6545 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6546 make_number (BEG
), make_number (Z
));
6547 coding
->src_object
= Fcurrent_buffer ();
6549 move_gap_both (BEG
, BEG_BYTE
);
6550 coding
->src_chars
= Z
- BEG
;
6551 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6552 coding
->src_pos
= BEG
;
6553 coding
->src_pos_byte
= BEG_BYTE
;
6554 coding
->src_multibyte
= Z
< Z_BYTE
;
6556 else if (STRINGP (src_object
))
6558 code_conversion_save (buffer
, 0, 0);
6559 coding
->src_pos
= from
;
6560 coding
->src_pos_byte
= from_byte
;
6562 else if (BUFFERP (src_object
))
6564 code_conversion_save (buffer
, 0, 0);
6565 set_buffer_internal (XBUFFER (src_object
));
6566 if (EQ (src_object
, dst_object
))
6568 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
/* In place without a pre-write function: the value of del_range_1
   becomes the source and is read from position 0 (presumably it is the
   deleted text as a string -- confirm).  */
6569 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6570 coding
->src_pos
= 0;
6571 coding
->src_pos_byte
= 0;
6575 if (from
< GPT
&& to
>= GPT
)
6576 move_gap_both (from
, from_byte
);
6577 coding
->src_pos
= from
;
6578 coding
->src_pos_byte
= from_byte
;
6582 code_conversion_save (buffer
, 0, 0);
/* Destination setup.  */
6584 if (BUFFERP (dst_object
))
6586 coding
->dst_object
= dst_object
;
6587 if (EQ (src_object
, dst_object
))
6589 coding
->dst_pos
= from
;
6590 coding
->dst_pos_byte
= from_byte
;
6594 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6595 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6597 coding
->dst_multibyte
6598 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
/* String result: start with a byte array of src_chars bytes, but never
   an empty one.  */
6600 else if (EQ (dst_object
, Qt
))
6602 coding
->dst_object
= Qnil
;
6603 coding
->dst_bytes
= coding
->src_chars
;
6604 if (coding
->dst_bytes
== 0)
6605 coding
->dst_bytes
= 1;
6606 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6607 coding
->dst_multibyte
= 0;
6611 coding
->dst_object
= Qnil
;
6612 coding
->dst_multibyte
= 0;
6615 encode_coding (coding
);
/* For a string result, build it from the destination buffer or from
   the byte array, which is then freed.  */
6617 if (EQ (dst_object
, Qt
))
6619 if (BUFFERP (coding
->dst_object
))
6620 coding
->dst_object
= Fbuffer_string ();
6624 = make_unibyte_string ((char *) coding
->destination
,
6626 xfree (coding
->destination
);
6632 /* This is the case of:
6633 (BUFFERP (src_object) && EQ (src_object, dst_object))
6634 As we have moved PT while replacing the original buffer
6635 contents, we must recover it now. */
6636 set_buffer_internal (XBUFFER (src_object
));
6637 if (saved_pt
< from
)
6638 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6639 else if (saved_pt
< from
+ chars
)
6640 TEMP_SET_PT_BOTH (from
, from_byte
);
6641 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6642 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6643 saved_pt_byte
+ (coding
->produced
- bytes
));
/* Last alternative: both the character and the byte position are
   shifted by the byte delta.  */
6645 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6646 saved_pt_byte
+ (coding
->produced
- bytes
));
6649 unbind_to (count
, Qnil
);
6654 preferred_coding_system ()
6656 int id
= coding_categories
[coding_priorities
[0]].id
;
6658 return CODING_ID_NAME (id
);
6663 /*** 11. Emacs Lisp library functions ***/
6665 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6666 doc
: /* Return t if OBJECT is nil or a coding-system.
6667 See the documentation of `define-coding-system' for information
6668 about coding-system objects. */)
6672 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6675 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6676 Sread_non_nil_coding_system
, 1, 1, 0,
6677 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6684 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6685 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6687 while (SCHARS (val
) == 0);
6688 return (Fintern (val
, Qnil
));
6691 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6692 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6693 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6694 (prompt
, default_coding_system
)
6695 Lisp_Object prompt
, default_coding_system
;
6698 if (SYMBOLP (default_coding_system
))
6699 XSETSTRING (default_coding_system
, XPNTR (SYMBOL_NAME (default_coding_system
)));
6700 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6701 Qt
, Qnil
, Qcoding_system_history
,
6702 default_coding_system
, Qnil
);
6703 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
6706 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6708 doc
: /* Check validity of CODING-SYSTEM.
6709 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6711 Lisp_Object coding_system
;
6713 CHECK_SYMBOL (coding_system
);
6714 if (!NILP (Fcoding_system_p (coding_system
)))
6715 return coding_system
;
6717 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6721 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6722 HIGHEST is nonzero, return the coding system of the highest
6723 priority among the detected coding systems. Otherwise return a
6724 list of detected coding systems sorted by their priorities. If
6725 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6726 multibyte form but contains only ASCII and eight-bit chars.
6727 Otherwise, the bytes are raw bytes.
6729 CODING-SYSTEM controls the detection as below:
6731 If it is nil, detect both text-format and eol-format. If the
6732 text-format part of CODING-SYSTEM is already specified
6733 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6734 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6735 detect only text-format. */
6738 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6739 const unsigned char *src
;
6740 int src_bytes
, highest
;
6742 Lisp_Object coding_system
;
6744 const unsigned char *src_end
= src
+ src_bytes
;
6745 Lisp_Object attrs
, eol_type
;
6747 struct coding_system coding
;
6749 struct coding_detection_info detect_info
;
/* A nil CODING_SYSTEM is treated as `undecided'.  From here on
   CODING_SYSTEM holds the base name of the coding system that was
   set up.  */
6751 if (NILP (coding_system
))
6752 coding_system
= Qundecided
;
6753 setup_coding_system (coding_system
, &coding
);
6754 attrs
= CODING_ID_ATTRS (coding
.id
);
6755 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6756 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6758 coding
.source
= src
;
6759 coding
.src_bytes
= src_bytes
;
6760 coding
.src_multibyte
= multibytep
;
6761 coding
.consumed
= 0;
6762 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6764 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6766 /* At first, detect text-format if necessary. */
6767 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6769 enum coding_category category
;
6770 struct coding_system
*this;
/* Scan forward over the leading bytes; the offset where the scan
   stops is recorded in coding.head_ascii below.  */
6773 for (; src
< src_end
; src
++)
6777 || (c
< 0x20 && (c
== ISO_CODE_ESC
6779 || c
== ISO_CODE_SO
)))
6782 coding
.head_ascii
= src
- coding
.source
;
/* Try the categories in priority order.  */
6785 for (i
= 0; i
< coding_category_raw_text
; i
++)
6787 category
= coding_priorities
[i
];
6788 this = coding_categories
+ category
;
6792 /* No coding system of this category is defined. */
6793 detect_info
.rejected
|= (1 << category
);
6795 else if (category
>= coding_category_raw_text
)
6797 else if (detect_info
.checked
& (1 << category
))
6800 && (detect_info
.found
& (1 << category
)))
6805 if ((*(this->detector
)) (&coding
, &detect_info
)
6807 && (detect_info
.found
& (1 << category
)))
/* Every category was rejected: fall back to raw-text.  */
6813 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6815 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6816 id
= coding_categories
[coding_category_raw_text
].id
;
6817 val
= Fcons (make_number (id
), Qnil
);
/* Nothing rejected and nothing found: report `undecided'.  */
6819 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6821 detect_info
.found
= CATEGORY_MASK_ANY
;
6822 id
= coding_categories
[coding_category_undecided
].id
;
6823 val
= Fcons (make_number (id
), Qnil
);
6827 if (detect_info
.found
)
6829 detect_info
.found
= 1 << category
;
6830 val
= Fcons (make_number (this->id
), Qnil
);
6833 for (i
= 0; i
< coding_category_raw_text
; i
++)
6834 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6836 detect_info
.found
= 1 << coding_priorities
[i
];
6837 id
= coding_categories
[coding_priorities
[i
]].id
;
6838 val
= Fcons (make_number (id
), Qnil
);
6844 int mask
= detect_info
.rejected
| detect_info
.found
;
/* Both loops below walk the priorities from last to first and cons
   onto the front of VAL, so higher priorities end up earlier.  The
   first loop adds categories that were neither rejected nor found,
   the second adds the found ones in front of them.  */
6848 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6850 category
= coding_priorities
[i
];
6851 if (! (mask
& (1 << category
)))
6853 found
|= 1 << category
;
6854 id
= coding_categories
[category
].id
;
6855 val
= Fcons (make_number (id
), val
);
6858 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6860 category
= coding_priorities
[i
];
6861 if (detect_info
.found
& (1 << category
))
6863 id
= coding_categories
[category
].id
;
6864 val
= Fcons (make_number (id
), val
);
6867 detect_info
.found
|= found
;
6872 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6873 val
= Fcons (make_number (coding
.id
), Qnil
);
6876 /* Then, detect eol-format if necessary. */
6878 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
/* A vector eol_type leads to detect_eol calls; otherwise the eol kind
   is taken from eol_type itself.  Detection is done separately for
   the UTF-16 big-endian and little-endian categories and for all the
   other categories.  */
6881 if (VECTORP (eol_type
))
6883 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6884 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6885 coding_category_raw_text
);
6886 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6887 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6888 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6889 coding_category_utf_16_be
);
6890 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6891 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6892 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6893 coding_category_utf_16_le
);
6897 if (EQ (eol_type
, Qunix
))
6898 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6899 else if (EQ (eol_type
, Qdos
))
6900 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6902 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
/* Replace each coding-system id in VAL by a coding-system name: the
   eol-specific variant (elements 0, 1, 2 of the eol_type vector for
   LF, CRLF, CR) when one applies, the plain name otherwise.  */
6905 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6907 enum coding_category category
;
6910 id
= XINT (XCAR (tail
));
6911 attrs
= CODING_ID_ATTRS (id
);
6912 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6913 eol_type
= CODING_ID_EOL_TYPE (id
);
6914 if (VECTORP (eol_type
))
6916 if (category
== coding_category_utf_16_be
6917 || category
== coding_category_utf_16_be_nosig
)
6918 this_eol
= utf_16_be_eol
;
6919 else if (category
== coding_category_utf_16_le
6920 || category
== coding_category_utf_16_le_nosig
)
6921 this_eol
= utf_16_le_eol
;
6923 this_eol
= normal_eol
;
6925 if (this_eol
== EOL_SEEN_LF
)
6926 XSETCAR (tail
, AREF (eol_type
, 0));
6927 else if (this_eol
== EOL_SEEN_CRLF
)
6928 XSETCAR (tail
, AREF (eol_type
, 1));
6929 else if (this_eol
== EOL_SEEN_CR
)
6930 XSETCAR (tail
, AREF (eol_type
, 2));
6932 XSETCAR (tail
, CODING_ID_NAME (id
));
6935 XSETCAR (tail
, CODING_ID_NAME (id
));
6939 return (highest
? XCAR (val
) : val
);
6943 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6945 doc
: /* Detect coding system of the text in the region between START and END.
6946 Return a list of possible coding systems ordered by priority.
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority. */)
6954 (start
, end
, highest
)
6955 Lisp_Object start
, end
, highest
;
6958 int from_byte
, to_byte
;
6960 CHECK_NUMBER_COERCE_MARKER (start
);
6961 CHECK_NUMBER_COERCE_MARKER (end
);
6963 validate_region (&start
, &end
);
6964 from
= XINT (start
), to
= XINT (end
);
6965 from_byte
= CHAR_TO_BYTE (from
);
6966 to_byte
= CHAR_TO_BYTE (to
);
6968 if (from
< GPT
&& to
>= GPT
)
6969 move_gap_both (to
, to_byte
);
6971 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6972 to_byte
- from_byte
,
6974 !NILP (current_buffer
6975 ->enable_multibyte_characters
),
6979 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6981 doc
: /* Detect coding system of the text in STRING.
6982 Return a list of possible coding systems ordered by priority.
6984 If only ASCII characters are found, it returns a list of single element
6985 `undecided' or its subsidiary coding system according to a detected
6988 If optional argument HIGHEST is non-nil, return the coding system of
6989 highest priority. */)
6991 Lisp_Object string
, highest
;
6993 CHECK_STRING (string
);
6995 return detect_coding_system (SDATA (string
), SBYTES (string
),
6996 !NILP (highest
), STRING_MULTIBYTE (string
),
7002 char_encodable_p (c
, attrs
)
7007 struct charset
*charset
;
7009 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7010 CONSP (tail
); tail
= XCDR (tail
))
7012 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7013 if (CHAR_CHARSET_P (c
, charset
))
7016 return (! NILP (tail
));
7020 /* Return a list of coding systems that safely encode the text between
7021 START and END. If EXCLUDE is non-nil, it is a list of coding
7022 systems not to check. The returned list doesn't contain any such
7023 coding systems. In any case, if the text contains only ASCII or is
7024 unibyte, return t. */
7026 DEFUN ("find-coding-systems-region-internal",
7027 Ffind_coding_systems_region_internal
,
7028 Sfind_coding_systems_region_internal
, 2, 3, 0,
7029 doc
: /* Internal use only. */)
7030 (start
, end
, exclude
)
7031 Lisp_Object start
, end
, exclude
;
7033 Lisp_Object coding_attrs_list
, safe_codings
;
7034 EMACS_INT start_byte
, end_byte
;
7035 const unsigned char *p
, *pbeg
, *pend
;
7037 Lisp_Object tail
, elt
;
/* START may be a string, in which case the whole string is examined.
   A unibyte string, or one whose character and byte counts agree, is
   caught by the test below.  */
7039 if (STRINGP (start
))
7041 if (!STRING_MULTIBYTE (start
)
7042 || SCHARS (start
) == SBYTES (start
))
7045 end_byte
= SBYTES (start
);
7049 CHECK_NUMBER_COERCE_MARKER (start
);
7050 CHECK_NUMBER_COERCE_MARKER (end
);
7051 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7052 args_out_of_range (start
, end
);
7053 if (NILP (current_buffer
->enable_multibyte_characters
))
7055 start_byte
= CHAR_TO_BYTE (XINT (start
));
7056 end_byte
= CHAR_TO_BYTE (XINT (end
));
7057 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* If the gap lies strictly inside the region, move it to whichever
   end of the region is nearer.  */
7060 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7062 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7063 move_gap_both (XINT (start
), start_byte
);
7065 move_gap_both (XINT (end
), end_byte
);
/* Collect the attribute vectors of the candidate coding systems: only
   base coding systems (name equal to the base name) whose type is not
   `undecided'.  */
7069 coding_attrs_list
= Qnil
;
7070 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7072 || NILP (Fmemq (XCAR (tail
), exclude
)))
7076 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7077 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7078 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7079 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7082 if (STRINGP (start
))
7083 p
= pbeg
= SDATA (start
);
7085 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7086 pend
= p
+ (end_byte
- start_byte
);
/* Skip the ASCII bytes at both ends of the text.  */
7088 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7089 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7093 if (ASCII_BYTE_P (*p
))
7097 c
= STRING_CHAR_ADVANCE (p
);
7099 charset_map_loaded
= 0;
/* Filter the candidates against character C.  An entry is removed in
   place by copying its successor over it, or by setting its car to
   nil when it is the last cell.  */
7100 for (tail
= coding_attrs_list
; CONSP (tail
);)
7105 else if (char_encodable_p (c
, elt
))
7107 else if (CONSP (XCDR (tail
)))
7109 XSETCAR (tail
, XCAR (XCDR (tail
)));
7110 XSETCDR (tail
, XCDR (XCDR (tail
)));
7114 XSETCAR (tail
, Qnil
);
/* When charset_map_loaded was set during the checks, the text pointers
   are recomputed from their offsets (presumably because loading a
   charset map may relocate the text -- confirm).  */
7118 if (charset_map_loaded
)
7120 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7122 if (STRINGP (start
))
7123 pbeg
= SDATA (start
);
7125 pbeg
= BYTE_POS_ADDR (start_byte
);
7126 p
= pbeg
+ p_offset
;
7127 pend
= pbeg
+ pend_offset
;
/* The result is the base names of the entries that survived.  */
7132 safe_codings
= Qnil
;
7133 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7134 if (! NILP (XCAR (tail
)))
7135 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7137 return safe_codings
;
/* NOTE(review): the `doc:' comment opener and its terminator are not
   visible in this excerpt, so the docstring text below appears bare.  */
7141 DEFUN ("unencodable-char-position", Funencodable_char_position
,
7142 Sunencodable_char_position
, 3, 5, 0,
7144 Return position of first un-encodable character in a region.
7145 START and END specfiy the region and CODING-SYSTEM specifies the
7146 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7148 If optional 4th argument COUNT is non-nil, it specifies at most how
7149 many un-encodable characters to search. In this case, the value is a
7152 If optional 5th argument STRING is non-nil, it is a string to search
7153 for un-encodable characters. In that case, START and END are indexes
7155 (start
, end
, coding_system
, count
, string
)
7156 Lisp_Object start
, end
, coding_system
, count
, string
;
7159 struct coding_system coding
;
7160 Lisp_Object attrs
, charset_list
;
7161 Lisp_Object positions
;
7163 const unsigned char *p
, *stop
, *pend
;
7164 int ascii_compatible
;
/* Fcheck_coding_system validates CODING_SYSTEM before it is set up.  */
7166 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
7167 attrs
= CODING_ID_ATTRS (coding
.id
);
7168 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
7170 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
7171 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* Buffer case: START and END are buffer positions.  */
7175 validate_region (&start
, &end
);
7176 from
= XINT (start
);
7178 if (NILP (current_buffer
->enable_multibyte_characters
)
7179 || (ascii_compatible
7180 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
7182 p
= CHAR_POS_ADDR (from
);
7183 pend
= CHAR_POS_ADDR (to
);
7184 if (from
< GPT
&& to
>= GPT
)
/* String case: START and END are indices into STRING.  */
7191 CHECK_STRING (string
);
7192 CHECK_NATNUM (start
);
7194 from
= XINT (start
);
7197 || to
> SCHARS (string
))
7198 args_out_of_range_3 (string
, start
, end
);
7199 if (! STRING_MULTIBYTE (string
))
7201 p
= SDATA (string
) + string_char_to_byte (string
, from
);
7202 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
7203 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
7211 CHECK_NATNUM (count
);
/* For an ASCII-compatible coding system, ASCII bytes are stepped over
   without further checks.  */
7220 if (ascii_compatible
)
7221 while (p
< stop
&& ASCII_BYTE_P (*p
))
/* A character is recorded when it is not an ASCII character under an
   ASCII-compatible coding system and char_charset finds no charset for
   it in CHARSET_LIST.  */
7231 c
= STRING_CHAR_ADVANCE (p
);
7232 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
7233 && ! char_charset (c
, charset_list
, NULL
))
7235 positions
= Fcons (make_number (from
), positions
);
/* Without COUNT only the first element of POSITIONS is returned;
   otherwise the list is returned after Fnreverse.  */
7244 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
7248 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7249 Scheck_coding_systems_region
, 3, 3, 0,
7250 doc
: /* Check if the region is encodable by coding systems.
7252 START and END are buffer positions specifying the region.
7253 CODING-SYSTEM-LIST is a list of coding systems to check.
7255 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7256 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7257 whole region, POS0, POS1, ... are buffer positions where non-encodable
7258 characters are found.
7260 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7263 START may be a string. In that case, check if the string is
7264 encodable, and the value contains indices to the string instead of
7265 buffer positions. END is ignored. */)
7266 (start
, end
, coding_system_list
)
7267 Lisp_Object start
, end
, coding_system_list
;
7270 EMACS_INT start_byte
, end_byte
;
7272 const unsigned char *p
, *pbeg
, *pend
;
7274 Lisp_Object tail
, elt
;
7276 if (STRINGP (start
))
/* NOTE(review): this test looks like it can never be true -- a string
   that is not multibyte should have SCHARS equal to SBYTES.  The
   corresponding test in find-coding-systems-region-internal uses `||'
   and `=='; confirm whether the same was intended here.  */
7278 if (!STRING_MULTIBYTE (start
)
7279 && SCHARS (start
) != SBYTES (start
))
7282 end_byte
= SBYTES (start
);
7287 CHECK_NUMBER_COERCE_MARKER (start
);
7288 CHECK_NUMBER_COERCE_MARKER (end
);
7289 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7290 args_out_of_range (start
, end
);
7291 if (NILP (current_buffer
->enable_multibyte_characters
))
7293 start_byte
= CHAR_TO_BYTE (XINT (start
));
7294 end_byte
= CHAR_TO_BYTE (XINT (end
));
7295 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* If the gap lies strictly inside the region, move it to whichever
   end of the region is nearer.  */
7298 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7300 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7301 move_gap_both (XINT (start
), start_byte
);
7303 move_gap_both (XINT (end
), end_byte
);
/* Build one working entry per coding system, starting with the coding
   system and its attribute vector; positions are added behind them
   below.  */
7309 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7312 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7317 if (STRINGP (start
))
7318 p
= pbeg
= SDATA (start
);
7320 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7321 pend
= p
+ (end_byte
- start_byte
);
/* Skip the ASCII bytes at both ends of the text, keeping POS in step
   with P at the front.  */
7323 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7324 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7328 if (ASCII_BYTE_P (*p
))
7332 c
= STRING_CHAR_ADVANCE (p
);
7334 charset_map_loaded
= 0;
/* Record POS in every entry whose coding system cannot encode C; the
   newest position is stored first.  */
7335 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7337 elt
= XCDR (XCAR (tail
));
7338 if (! char_encodable_p (c
, XCAR (elt
)))
7339 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* When charset_map_loaded was set during the checks, the text pointers
   are recomputed from their offsets (presumably because loading a
   charset map may relocate the text -- confirm).  */
7341 if (charset_map_loaded
)
7343 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7345 if (STRINGP (start
))
7346 pbeg
= SDATA (start
);
7348 pbeg
= BYTE_POS_ADDR (start_byte
);
7349 p
= pbeg
+ p_offset
;
7350 pend
= pbeg
+ pend_offset
;
/* Keep only the entries that collected at least one position, with the
   positions put back in ascending order by Fnreverse.  */
7358 for (; CONSP (tail
); tail
= XCDR (tail
))
7361 if (CONSP (XCDR (XCDR (elt
))))
7362 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7372 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7373 Lisp_Object start
, end
, coding_system
, dst_object
;
7374 int encodep
, norecord
;
7376 struct coding_system coding
;
7377 EMACS_INT from
, from_byte
, to
, to_byte
;
7378 Lisp_Object src_object
;
7380 CHECK_NUMBER_COERCE_MARKER (start
);
7381 CHECK_NUMBER_COERCE_MARKER (end
);
7382 if (NILP (coding_system
))
7383 coding_system
= Qno_conversion
;
7385 CHECK_CODING_SYSTEM (coding_system
);
7386 src_object
= Fcurrent_buffer ();
7387 if (NILP (dst_object
))
7388 dst_object
= src_object
;
7389 else if (! EQ (dst_object
, Qt
))
7390 CHECK_BUFFER (dst_object
);
7392 validate_region (&start
, &end
);
7393 from
= XFASTINT (start
);
7394 from_byte
= CHAR_TO_BYTE (from
);
7395 to
= XFASTINT (end
);
7396 to_byte
= CHAR_TO_BYTE (to
);
7398 setup_coding_system (coding_system
, &coding
);
7399 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7402 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7405 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7408 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7410 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7411 error ("Code conversion error: %d", coding
.result
);
7413 return (BUFFERP (dst_object
)
7414 ? make_number (coding
.produced_char
)
7415 : coding
.dst_object
);
7419 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7420 3, 4, "r\nzCoding system: ",
7421 doc
: /* Decode the current region from the specified coding system.
7422 When called from a program, takes four arguments:
7423 START, END, CODING-SYSTEM, and DESTINATION.
7424 START and END are buffer positions.
7426 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7427 If nil, the region between START and END is replace by the decoded text.
7428 If buffer, the decoded text is inserted in the buffer.
7429 If t, the decoded text is returned.
7431 This function sets `last-coding-system-used' to the precise coding system
7432 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7433 not fully specified.)
7434 It returns the length of the decoded text. */)
7435 (start
, end
, coding_system
, destination
)
7436 Lisp_Object start
, end
, coding_system
, destination
;
7438 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7441 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7442 3, 4, "r\nzCoding system: ",
7443 doc
: /* Encode the current region by specified coding system.
7444 When called from a program, takes three arguments:
7445 START, END, and CODING-SYSTEM. START and END are buffer positions.
7447 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7448 If nil, the region between START and END is replace by the encoded text.
7449 If buffer, the encoded text is inserted in the buffer.
7450 If t, the encoded text is returned.
7452 This function sets `last-coding-system-used' to the precise coding system
7453 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7454 not fully specified.)
7455 It returns the length of the encoded text. */)
7456 (start
, end
, coding_system
, destination
)
7457 Lisp_Object start
, end
, coding_system
, destination
;
7459 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7463 code_convert_string (string
, coding_system
, dst_object
,
7464 encodep
, nocopy
, norecord
)
7465 Lisp_Object string
, coding_system
, dst_object
;
7466 int encodep
, nocopy
, norecord
;
7468 struct coding_system coding
;
7469 EMACS_INT chars
, bytes
;
7471 CHECK_STRING (string
);
7472 if (NILP (coding_system
))
7475 Vlast_coding_system_used
= Qno_conversion
;
7476 if (NILP (dst_object
))
7477 return (nocopy
? Fcopy_sequence (string
) : string
);
7480 if (NILP (coding_system
))
7481 coding_system
= Qno_conversion
;
7483 CHECK_CODING_SYSTEM (coding_system
);
7484 if (NILP (dst_object
))
7486 else if (! EQ (dst_object
, Qt
))
7487 CHECK_BUFFER (dst_object
);
7489 setup_coding_system (coding_system
, &coding
);
7490 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7491 chars
= SCHARS (string
);
7492 bytes
= SBYTES (string
);
7494 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7496 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7498 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7500 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7501 error ("Code conversion error: %d", coding
.result
);
7503 return (BUFFERP (dst_object
)
7504 ? make_number (coding
.produced_char
)
7505 : coding
.dst_object
);
7509 /* Encode or decode STRING according to CODING_SYSTEM.
7510 Do not set Vlast_coding_system_used.
7512 This function is called only from macros DECODE_FILE and
7513 ENCODE_FILE, thus we ignore character composition. */
7516 code_convert_string_norecord (string
, coding_system
, encodep
)
7517 Lisp_Object string
, coding_system
;
7520 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7524 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7526 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7528 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7529 if the decoding operation is trivial.
7531 Optional fourth arg BUFFER non-nil meant that the decoded text is
7532 inserted in BUFFER instead of returned as a string. In this case,
7533 the return value is BUFFER.
7535 This function sets `last-coding-system-used' to the precise coding system
7536 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7537 not fully specified. */)
7538 (string
, coding_system
, nocopy
, buffer
)
7539 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7541 return code_convert_string (string
, coding_system
, buffer
,
7542 0, ! NILP (nocopy
), 0);
7545 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7547 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7549 Optional third arg NOCOPY non-nil means it is OK to return STRING
7550 itself if the encoding operation is trivial.
7552 Optional fourth arg BUFFER non-nil meant that the encoded text is
7553 inserted in BUFFER instead of returned as a string. In this case,
7554 the return value is BUFFER.
7556 This function sets `last-coding-system-used' to the precise coding system
7557 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7558 not fully specified.) */)
7559 (string
, coding_system
, nocopy
, buffer
)
7560 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7562 return code_convert_string (string
, coding_system
, buffer
,
7563 1, ! NILP (nocopy
), 1);
7567 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7568 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7569 Return the corresponding character. */)
7573 Lisp_Object spec
, attrs
, val
;
7574 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7577 CHECK_NATNUM (code
);
7578 c
= XFASTINT (code
);
7579 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7580 attrs
= AREF (spec
, 0);
7582 if (ASCII_BYTE_P (c
)
7583 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7586 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7587 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7588 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7589 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7592 charset
= charset_roman
;
7593 else if (c
>= 0xA0 && c
< 0xDF)
7595 charset
= charset_kana
;
7600 int s1
= c
>> 8, s2
= c
& 0xFF;
7602 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7603 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7604 error ("Invalid code: %d", code
);
7606 charset
= charset_kanji
;
7608 c
= DECODE_CHAR (charset
, c
);
7610 error ("Invalid code: %d", code
);
7611 return make_number (c
);
7615 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7616 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7617 Return the corresponding code in SJIS. */)
7621 Lisp_Object spec
, attrs
, charset_list
;
7623 struct charset
*charset
;
7626 CHECK_CHARACTER (ch
);
7628 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7629 attrs
= AREF (spec
, 0);
7631 if (ASCII_CHAR_P (c
)
7632 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7635 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7636 charset
= char_charset (c
, charset_list
, &code
);
7637 if (code
== CHARSET_INVALID_CODE (charset
))
7638 error ("Can't encode by shift_jis encoding: %d", c
);
7641 return make_number (code
);
7644 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7645 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7646 Return the corresponding character. */)
7650 Lisp_Object spec
, attrs
, val
;
7651 struct charset
*charset_roman
, *charset_big5
, *charset
;
7654 CHECK_NATNUM (code
);
7655 c
= XFASTINT (code
);
7656 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7657 attrs
= AREF (spec
, 0);
7659 if (ASCII_BYTE_P (c
)
7660 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7663 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7664 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7665 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7668 charset
= charset_roman
;
7671 int b1
= c
>> 8, b2
= c
& 0x7F;
7672 if (b1
< 0xA1 || b1
> 0xFE
7673 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7674 error ("Invalid code: %d", code
);
7675 charset
= charset_big5
;
7677 c
= DECODE_CHAR (charset
, (unsigned )c
);
7679 error ("Invalid code: %d", code
);
7680 return make_number (c
);
7683 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7684 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7685 Return the corresponding character code in Big5. */)
7689 Lisp_Object spec
, attrs
, charset_list
;
7690 struct charset
*charset
;
7694 CHECK_CHARACTER (ch
);
7696 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7697 attrs
= AREF (spec
, 0);
7698 if (ASCII_CHAR_P (c
)
7699 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7702 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7703 charset
= char_charset (c
, charset_list
, &code
);
7704 if (code
== CHARSET_INVALID_CODE (charset
))
7705 error ("Can't encode by Big5 encoding: %d", c
);
7707 return make_number (code
);
7711 DEFUN ("set-terminal-coding-system-internal",
7712 Fset_terminal_coding_system_internal
,
7713 Sset_terminal_coding_system_internal
, 1, 1, 0,
7714 doc
: /* Internal use only. */)
7716 Lisp_Object coding_system
;
7718 CHECK_SYMBOL (coding_system
);
7719 setup_coding_system (Fcheck_coding_system (coding_system
),
7722 /* We had better not send unsafe characters to terminal. */
7723 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7724 /* Characer composition should be disabled. */
7725 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7726 terminal_coding
.src_multibyte
= 1;
7727 terminal_coding
.dst_multibyte
= 0;
7731 DEFUN ("set-safe-terminal-coding-system-internal",
7732 Fset_safe_terminal_coding_system_internal
,
7733 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7734 doc
: /* Internal use only. */)
7736 Lisp_Object coding_system
;
7738 CHECK_SYMBOL (coding_system
);
7739 setup_coding_system (Fcheck_coding_system (coding_system
),
7740 &safe_terminal_coding
);
7741 /* Characer composition should be disabled. */
7742 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7743 safe_terminal_coding
.src_multibyte
= 1;
7744 safe_terminal_coding
.dst_multibyte
= 0;
7748 DEFUN ("terminal-coding-system",
7749 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7750 doc
: /* Return coding system specified for terminal output. */)
7753 return CODING_ID_NAME (terminal_coding
.id
);
7756 DEFUN ("set-keyboard-coding-system-internal",
7757 Fset_keyboard_coding_system_internal
,
7758 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7759 doc
: /* Internal use only. */)
7761 Lisp_Object coding_system
;
7763 CHECK_SYMBOL (coding_system
);
7764 setup_coding_system (Fcheck_coding_system (coding_system
),
7766 /* Characer composition should be disabled. */
7767 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7771 DEFUN ("keyboard-coding-system",
7772 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7773 doc
: /* Return coding system specified for decoding keyboard input. */)
7776 return CODING_ID_NAME (keyboard_coding
.id
);
7780 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7781 Sfind_operation_coding_system
, 1, MANY
, 0,
7782 doc
: /* Choose a coding system for an operation based on the target name.
7783 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7784 DECODING-SYSTEM is the coding system to use for decoding
7785 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7786 for encoding (in case OPERATION does encoding).
7788 The first argument OPERATION specifies an I/O primitive:
7789 For file I/O, `insert-file-contents' or `write-region'.
7790 For process I/O, `call-process', `call-process-region', or `start-process'.
7791 For network I/O, `open-network-stream'.
7793 The remaining arguments should be the same arguments that were passed
7794 to the primitive. Depending on which primitive, one of those arguments
7795 is selected as the TARGET. For example, if OPERATION does file I/O,
7796 whichever argument specifies the file name is TARGET.
7798 TARGET has a meaning which depends on OPERATION:
7799 For file I/O, TARGET is a file name.
7800 For process I/O, TARGET is a process name.
7801 For network I/O, TARGET is a service name or a port number
7803 This function looks up what specified for TARGET in,
7804 `file-coding-system-alist', `process-coding-system-alist',
7805 or `network-coding-system-alist' depending on OPERATION.
7806 They may specify a coding system, a cons of coding systems,
7807 or a function symbol to call.
7808 In the last case, we call the function with one argument,
7809 which is a list of all the arguments given to this function.
7811 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7816 Lisp_Object operation
, target_idx
, target
, val
;
7817 register Lisp_Object chain
;
7820 error ("Too few arguments");
7821 operation
= args
[0];
7822 if (!SYMBOLP (operation
)
7823 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7824 error ("Invalid first arguement");
7825 if (nargs
< 1 + XINT (target_idx
))
7826 error ("Too few arguments for operation: %s",
7827 SDATA (SYMBOL_NAME (operation
)));
7828 target
= args
[XINT (target_idx
) + 1];
7829 if (!(STRINGP (target
)
7830 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7831 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7833 chain
= ((EQ (operation
, Qinsert_file_contents
)
7834 || EQ (operation
, Qwrite_region
))
7835 ? Vfile_coding_system_alist
7836 : (EQ (operation
, Qopen_network_stream
)
7837 ? Vnetwork_coding_system_alist
7838 : Vprocess_coding_system_alist
));
7842 for (; CONSP (chain
); chain
= XCDR (chain
))
7848 && ((STRINGP (target
)
7849 && STRINGP (XCAR (elt
))
7850 && fast_string_match (XCAR (elt
), target
) >= 0)
7851 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7854 /* Here, if VAL is both a valid coding system and a valid
7855 function symbol, we return VAL as a coding system. */
7858 if (! SYMBOLP (val
))
7860 if (! NILP (Fcoding_system_p (val
)))
7861 return Fcons (val
, val
);
7862 if (! NILP (Ffboundp (val
)))
7864 val
= call1 (val
, Flist (nargs
, args
));
7867 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7868 return Fcons (val
, val
);
7876 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7877 Sset_coding_system_priority
, 0, MANY
, 0,
7878 doc
: /* Assign higher priority to the coding systems given as arguments.
7879 If multiple coding systems belongs to the same category,
7880 all but the first one are ignored.
7882 usage: (set-coding-system-priority ...) */)
7888 int changed
[coding_category_max
];
7889 enum coding_category priorities
[coding_category_max
];
7891 bzero (changed
, sizeof changed
);
7893 for (i
= j
= 0; i
< nargs
; i
++)
7895 enum coding_category category
;
7896 Lisp_Object spec
, attrs
;
7898 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7899 attrs
= AREF (spec
, 0);
7900 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7901 if (changed
[category
])
7902 /* Ignore this coding system because a coding system of the
7903 same category already had a higher priority. */
7905 changed
[category
] = 1;
7906 priorities
[j
++] = category
;
7907 if (coding_categories
[category
].id
>= 0
7908 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7909 setup_coding_system (args
[i
], &coding_categories
[category
]);
7910 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7913 /* Now we have decided top J priorities. Reflect the order of the
7914 original priorities to the remaining priorities. */
7916 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7918 while (j
< coding_category_max
7919 && changed
[coding_priorities
[j
]])
7921 if (j
== coding_category_max
)
7923 priorities
[i
] = coding_priorities
[j
];
7926 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7928 /* Update `coding-category-list'. */
7929 Vcoding_category_list
= Qnil
;
7930 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7931 Vcoding_category_list
7932 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7933 Vcoding_category_list
);
7938 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7939 Scoding_system_priority_list
, 0, 1, 0,
7940 doc
: /* Return a list of coding systems ordered by their priorities.
7941 HIGHESTP non-nil means just return the highest priority one. */)
7943 Lisp_Object highestp
;
7948 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7950 enum coding_category category
= coding_priorities
[i
];
7951 int id
= coding_categories
[category
].id
;
7956 attrs
= CODING_ID_ATTRS (id
);
7957 if (! NILP (highestp
))
7958 return CODING_ATTR_BASE_NAME (attrs
);
7959 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7961 return Fnreverse (val
);
7964 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7967 make_subsidiaries (base
)
7970 Lisp_Object subsidiaries
;
7971 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
7972 char *buf
= (char *) alloca (base_name_len
+ 6);
7975 bcopy (SDATA (SYMBOL_NAME (base
)), buf
, base_name_len
);
7976 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7977 for (i
= 0; i
< 3; i
++)
7979 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7980 ASET (subsidiaries
, i
, intern (buf
));
7982 return subsidiaries
;
7986 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7987 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7988 doc
: /* For internal use only.
7989 usage: (define-coding-system-internal ...) */)
7995 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
7996 Lisp_Object attrs
; /* Vector of attributes. */
7997 Lisp_Object eol_type
;
7998 Lisp_Object aliases
;
7999 Lisp_Object coding_type
, charset_list
, safe_charsets
;
8000 enum coding_category category
;
8001 Lisp_Object tail
, val
;
8002 int max_charset_id
= 0;
8005 if (nargs
< coding_arg_max
)
8008 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
8010 name
= args
[coding_arg_name
];
8011 CHECK_SYMBOL (name
);
8012 CODING_ATTR_BASE_NAME (attrs
) = name
;
8014 val
= args
[coding_arg_mnemonic
];
8015 if (! STRINGP (val
))
8016 CHECK_CHARACTER (val
);
8017 CODING_ATTR_MNEMONIC (attrs
) = val
;
8019 coding_type
= args
[coding_arg_coding_type
];
8020 CHECK_SYMBOL (coding_type
);
8021 CODING_ATTR_TYPE (attrs
) = coding_type
;
8023 charset_list
= args
[coding_arg_charset_list
];
8024 if (SYMBOLP (charset_list
))
8026 if (EQ (charset_list
, Qiso_2022
))
8028 if (! EQ (coding_type
, Qiso_2022
))
8029 error ("Invalid charset-list");
8030 charset_list
= Viso_2022_charset_list
;
8032 else if (EQ (charset_list
, Qemacs_mule
))
8034 if (! EQ (coding_type
, Qemacs_mule
))
8035 error ("Invalid charset-list");
8036 charset_list
= Vemacs_mule_charset_list
;
8038 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8039 if (max_charset_id
< XFASTINT (XCAR (tail
)))
8040 max_charset_id
= XFASTINT (XCAR (tail
));
8044 charset_list
= Fcopy_sequence (charset_list
);
8045 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
8047 struct charset
*charset
;
8050 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8051 if (EQ (coding_type
, Qiso_2022
)
8052 ? CHARSET_ISO_FINAL (charset
) < 0
8053 : EQ (coding_type
, Qemacs_mule
)
8054 ? CHARSET_EMACS_MULE_ID (charset
) < 0
8056 error ("Can't handle charset `%s'",
8057 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8059 XSETCAR (tail
, make_number (charset
->id
));
8060 if (max_charset_id
< charset
->id
)
8061 max_charset_id
= charset
->id
;
8064 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
8066 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
8068 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8069 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
8070 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
8072 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
8074 val
= args
[coding_arg_decode_translation_table
];
8076 CHECK_CHAR_TABLE (val
);
8077 CODING_ATTR_DECODE_TBL (attrs
) = val
;
8079 val
= args
[coding_arg_encode_translation_table
];
8081 CHECK_CHAR_TABLE (val
);
8082 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
8084 val
= args
[coding_arg_post_read_conversion
];
8086 CODING_ATTR_POST_READ (attrs
) = val
;
8088 val
= args
[coding_arg_pre_write_conversion
];
8090 CODING_ATTR_PRE_WRITE (attrs
) = val
;
8092 val
= args
[coding_arg_default_char
];
8094 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
8097 CHECK_CHARACTER (val
);
8098 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
8101 val
= args
[coding_arg_for_unibyte
];
8102 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
8104 val
= args
[coding_arg_plist
];
8106 CODING_ATTR_PLIST (attrs
) = val
;
8108 if (EQ (coding_type
, Qcharset
))
8110 /* Generate a lisp vector of 256 elements. Each element is nil,
8111 integer, or a list of charset IDs.
8113 If Nth element is nil, the byte code N is invalid in this
8116 If Nth element is a number NUM, N is the first byte of a
8117 charset whose ID is NUM.
8119 If Nth element is a list of charset IDs, N is the first byte
8120 of one of them. The list is sorted by dimensions of the
8121 charsets. A charset of smaller dimension comes first. */
8123 int maybe_ascii_compatible
= 1;
8125 for (list
= Qnil
, tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8127 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8129 if (charset
->method
== CHARSET_METHOD_SUPERSET
)
8131 val
= CHARSET_SUPERSET (charset
);
8132 for (; CONSP (val
); val
= XCDR (val
))
8133 list
= Fcons (XCAR (XCAR (val
)), list
);
8134 maybe_ascii_compatible
= 0;
8137 list
= Fcons (XCAR (tail
), list
);
8140 val
= Fmake_vector (make_number (256), Qnil
);
8142 for (tail
= Fnreverse (list
); CONSP (tail
); tail
= XCDR (tail
))
8144 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8145 int dim
= CHARSET_DIMENSION (charset
);
8146 int idx
= (dim
- 1) * 4;
8148 if (CHARSET_ASCII_COMPATIBLE_P (charset
)
8149 && maybe_ascii_compatible
)
8150 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8152 for (i
= charset
->code_space
[idx
];
8153 i
<= charset
->code_space
[idx
+ 1]; i
++)
8155 Lisp_Object tmp
, tmp2
;
8158 tmp
= AREF (val
, i
);
8161 else if (NUMBERP (tmp
))
8163 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8165 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8167 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8171 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8173 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8178 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8181 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8182 XSETCAR (tmp2
, XCAR (tail
));
8188 ASET (attrs
, coding_attr_charset_valids
, val
);
8189 category
= coding_category_charset
;
8191 else if (EQ (coding_type
, Qccl
))
8195 if (nargs
< coding_arg_ccl_max
)
8198 val
= args
[coding_arg_ccl_decoder
];
8199 CHECK_CCL_PROGRAM (val
);
8201 val
= Fcopy_sequence (val
);
8202 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8204 val
= args
[coding_arg_ccl_encoder
];
8205 CHECK_CCL_PROGRAM (val
);
8207 val
= Fcopy_sequence (val
);
8208 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8210 val
= args
[coding_arg_ccl_valids
];
8211 valids
= Fmake_string (make_number (256), make_number (0));
8212 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8219 from
= to
= XINT (val
);
8220 if (from
< 0 || from
> 255)
8221 args_out_of_range_3 (val
, make_number (0), make_number (255));
8226 CHECK_NATNUM_CAR (val
);
8227 CHECK_NATNUM_CDR (val
);
8228 from
= XINT (XCAR (val
));
8230 args_out_of_range_3 (XCAR (val
),
8231 make_number (0), make_number (255));
8232 to
= XINT (XCDR (val
));
8233 if (to
< from
|| to
> 255)
8234 args_out_of_range_3 (XCDR (val
),
8235 XCAR (val
), make_number (255));
8237 for (i
= from
; i
<= to
; i
++)
8238 SSET (valids
, i
, 1);
8240 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8242 category
= coding_category_ccl
;
8244 else if (EQ (coding_type
, Qutf_16
))
8246 Lisp_Object bom
, endian
;
8248 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8250 if (nargs
< coding_arg_utf16_max
)
8253 bom
= args
[coding_arg_utf16_bom
];
8254 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8258 CHECK_CODING_SYSTEM (val
);
8260 CHECK_CODING_SYSTEM (val
);
8262 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8264 endian
= args
[coding_arg_utf16_endian
];
8265 CHECK_SYMBOL (endian
);
8268 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8269 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
8270 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8272 category
= (CONSP (bom
)
8273 ? coding_category_utf_16_auto
8275 ? (EQ (endian
, Qbig
)
8276 ? coding_category_utf_16_be_nosig
8277 : coding_category_utf_16_le_nosig
)
8278 : (EQ (endian
, Qbig
)
8279 ? coding_category_utf_16_be
8280 : coding_category_utf_16_le
));
8282 else if (EQ (coding_type
, Qiso_2022
))
8284 Lisp_Object initial
, reg_usage
, request
, flags
;
8287 if (nargs
< coding_arg_iso2022_max
)
8290 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8291 CHECK_VECTOR (initial
);
8292 for (i
= 0; i
< 4; i
++)
8294 val
= Faref (initial
, make_number (i
));
8297 struct charset
*charset
;
8299 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8300 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8301 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8302 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8305 ASET (initial
, i
, make_number (-1));
8308 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8309 CHECK_CONS (reg_usage
);
8310 CHECK_NUMBER_CAR (reg_usage
);
8311 CHECK_NUMBER_CDR (reg_usage
);
8313 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8314 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8322 CHECK_CHARSET_GET_ID (tmp
, id
);
8323 CHECK_NATNUM_CDR (val
);
8324 if (XINT (XCDR (val
)) >= 4)
8325 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8326 XSETCAR (val
, make_number (id
));
8329 flags
= args
[coding_arg_iso2022_flags
];
8330 CHECK_NATNUM (flags
);
8332 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8333 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8335 ASET (attrs
, coding_attr_iso_initial
, initial
);
8336 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8337 ASET (attrs
, coding_attr_iso_request
, request
);
8338 ASET (attrs
, coding_attr_iso_flags
, flags
);
8339 setup_iso_safe_charsets (attrs
);
8341 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8342 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8343 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8344 ? coding_category_iso_7_else
8345 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8346 ? coding_category_iso_7
8347 : coding_category_iso_7_tight
);
8350 int id
= XINT (AREF (initial
, 1));
8352 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8353 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8355 ? coding_category_iso_8_else
8356 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8357 ? coding_category_iso_8_1
8358 : coding_category_iso_8_2
);
8360 if (category
!= coding_category_iso_8_1
8361 && category
!= coding_category_iso_8_2
)
8362 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8364 else if (EQ (coding_type
, Qemacs_mule
))
8366 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8367 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8368 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8369 category
= coding_category_emacs_mule
;
8371 else if (EQ (coding_type
, Qshift_jis
))
8374 struct charset
*charset
;
8376 if (XINT (Flength (charset_list
)) != 3)
8377 error ("There should be just three charsets");
8379 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8380 if (CHARSET_DIMENSION (charset
) != 1)
8381 error ("Dimension of charset %s is not one",
8382 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8383 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8384 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8386 charset_list
= XCDR (charset_list
);
8387 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8388 if (CHARSET_DIMENSION (charset
) != 1)
8389 error ("Dimension of charset %s is not one",
8390 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8392 charset_list
= XCDR (charset_list
);
8393 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8394 if (CHARSET_DIMENSION (charset
) != 2)
8395 error ("Dimension of charset %s is not two",
8396 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8398 category
= coding_category_sjis
;
8399 Vsjis_coding_system
= name
;
8401 else if (EQ (coding_type
, Qbig5
))
8403 struct charset
*charset
;
8405 if (XINT (Flength (charset_list
)) != 2)
8406 error ("There should be just two charsets");
8408 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8409 if (CHARSET_DIMENSION (charset
) != 1)
8410 error ("Dimension of charset %s is not one",
8411 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8412 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8413 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8415 charset_list
= XCDR (charset_list
);
8416 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8417 if (CHARSET_DIMENSION (charset
) != 2)
8418 error ("Dimension of charset %s is not two",
8419 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
8421 category
= coding_category_big5
;
8422 Vbig5_coding_system
= name
;
8424 else if (EQ (coding_type
, Qraw_text
))
8426 category
= coding_category_raw_text
;
8427 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8429 else if (EQ (coding_type
, Qutf_8
))
8431 category
= coding_category_utf_8
;
8432 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8434 else if (EQ (coding_type
, Qundecided
))
8435 category
= coding_category_undecided
;
8437 error ("Invalid coding system type: %s",
8438 SDATA (SYMBOL_NAME (coding_type
)));
8440 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8441 CODING_ATTR_PLIST (attrs
)
8442 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8443 CODING_ATTR_PLIST (attrs
)));
8445 eol_type
= args
[coding_arg_eol_type
];
8446 if (! NILP (eol_type
)
8447 && ! EQ (eol_type
, Qunix
)
8448 && ! EQ (eol_type
, Qdos
)
8449 && ! EQ (eol_type
, Qmac
))
8450 error ("Invalid eol-type");
8452 aliases
= Fcons (name
, Qnil
);
8454 if (NILP (eol_type
))
8456 eol_type
= make_subsidiaries (name
);
8457 for (i
= 0; i
< 3; i
++)
8459 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8461 this_name
= AREF (eol_type
, i
);
8462 this_aliases
= Fcons (this_name
, Qnil
);
8463 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8464 this_spec
= Fmake_vector (make_number (3), attrs
);
8465 ASET (this_spec
, 1, this_aliases
);
8466 ASET (this_spec
, 2, this_eol_type
);
8467 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8468 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8469 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8470 Vcoding_system_alist
);
8474 spec_vec
= Fmake_vector (make_number (3), attrs
);
8475 ASET (spec_vec
, 1, aliases
);
8476 ASET (spec_vec
, 2, eol_type
);
8478 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8479 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8480 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8481 Vcoding_system_alist
);
8484 int id
= coding_categories
[category
].id
;
8486 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8487 setup_coding_system (name
, &coding_categories
[category
]);
8493 return Fsignal (Qwrong_number_of_arguments
,
8494 Fcons (intern ("define-coding-system-internal"),
8495 make_number (nargs
)));
8499 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8500 Sdefine_coding_system_alias
, 2, 2, 0,
8501 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8502 (alias
, coding_system
)
8503 Lisp_Object alias
, coding_system
;
8505 Lisp_Object spec
, aliases
, eol_type
;
8507 CHECK_SYMBOL (alias
);
8508 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8509 aliases
= AREF (spec
, 1);
8510 /* ALISES should be a list of length more than zero, and the first
8511 element is a base coding system. Append ALIAS at the tail of the
8513 while (!NILP (XCDR (aliases
)))
8514 aliases
= XCDR (aliases
);
8515 XSETCDR (aliases
, Fcons (alias
, Qnil
));
8517 eol_type
= AREF (spec
, 2);
8518 if (VECTORP (eol_type
))
8520 Lisp_Object subsidiaries
;
8523 subsidiaries
= make_subsidiaries (alias
);
8524 for (i
= 0; i
< 3; i
++)
8525 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8526 AREF (eol_type
, i
));
8529 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8530 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
8531 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8532 Vcoding_system_alist
);
8537 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8539 doc
: /* Return the base of CODING-SYSTEM.
8540 Any alias or subsidiary coding system is not a base coding system. */)
8542 Lisp_Object coding_system
;
8544 Lisp_Object spec
, attrs
;
8546 if (NILP (coding_system
))
8547 return (Qno_conversion
);
8548 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8549 attrs
= AREF (spec
, 0);
8550 return CODING_ATTR_BASE_NAME (attrs
);
8553 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8555 doc
: "Return the property list of CODING-SYSTEM.")
8557 Lisp_Object coding_system
;
8559 Lisp_Object spec
, attrs
;
8561 if (NILP (coding_system
))
8562 coding_system
= Qno_conversion
;
8563 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8564 attrs
= AREF (spec
, 0);
8565 return CODING_ATTR_PLIST (attrs
);
8569 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8571 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8573 Lisp_Object coding_system
;
8577 if (NILP (coding_system
))
8578 coding_system
= Qno_conversion
;
8579 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8580 return AREF (spec
, 1);
/* Lisp primitive `coding-system-eol-type'.  A nil argument is treated
   as `no-conversion'.  The eol-type is read from element 2 of the
   coding system's spec: a vector is returned as a fresh copy, Qunix
   maps to 0, Qdos to 1, and any other value to 2.  */
8583 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8584 Scoding_system_eol_type
, 1, 1, 0,
8585 doc
: /* Return eol-type of CODING-SYSTEM.
8586 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8588 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8589 and CR respectively.
8591 A vector value indicates that a format of end-of-line should be
8592 detected automatically. Nth element of the vector is the subsidiary
8593 coding system whose eol-type is N. */)
8595 Lisp_Object coding_system
;
8597 Lisp_Object spec
, eol_type
;
/* nil is shorthand for `no-conversion'.  */
8600 if (NILP (coding_system
))
8601 coding_system
= Qno_conversion
;
8602 if (! CODING_SYSTEM_P (coding_system
))
/* NOTE(review): the statement guarded by the test above is not visible
   in this listing; as shown, the test would guard the assignment to
   SPEC below.  Confirm the early-exit value against the full source.  */
8604 spec
= CODING_SYSTEM_SPEC (coding_system
);
8605 eol_type
= AREF (spec
, 2);
/* Hand out a copy so callers cannot modify the stored vector.  */
8606 if (VECTORP (eol_type
))
8607 return Fcopy_sequence (eol_type
);
/* Symbolic eol-type: Qunix -> 0, Qdos -> 1, everything else -> 2.  */
8608 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8609 return make_number (n
);
8615 /*** 12. Post-amble ***/
8622 for (i
= 0; i
< coding_category_max
; i
++)
8624 coding_categories
[i
].id
= -1;
8625 coding_priorities
[i
] = i
;
8628 /* ISO2022 specific initialize routine. */
8629 for (i
= 0; i
< 0x20; i
++)
8630 iso_code_class
[i
] = ISO_control_0
;
8631 for (i
= 0x21; i
< 0x7F; i
++)
8632 iso_code_class
[i
] = ISO_graphic_plane_0
;
8633 for (i
= 0x80; i
< 0xA0; i
++)
8634 iso_code_class
[i
] = ISO_control_1
;
8635 for (i
= 0xA1; i
< 0xFF; i
++)
8636 iso_code_class
[i
] = ISO_graphic_plane_1
;
8637 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8638 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8639 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
8640 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8641 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8642 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8643 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8644 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8645 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8646 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8648 for (i
= 0; i
< 256; i
++)
8650 emacs_mule_bytes
[i
] = 1;
8652 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8653 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8654 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8655 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8663 staticpro (&Vcoding_system_hash_table
);
8665 Lisp_Object args
[2];
8668 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
8671 staticpro (&Vsjis_coding_system
);
8672 Vsjis_coding_system
= Qnil
;
8674 staticpro (&Vbig5_coding_system
);
8675 Vbig5_coding_system
= Qnil
;
8677 staticpro (&Vcode_conversion_work_buf_list
);
8678 Vcode_conversion_work_buf_list
= Qnil
;
8680 staticpro (&Vcode_conversion_reused_work_buf
);
8681 Vcode_conversion_reused_work_buf
= Qnil
;
8683 DEFSYM (Qcharset
, "charset");
8684 DEFSYM (Qtarget_idx
, "target-idx");
8685 DEFSYM (Qcoding_system_history
, "coding-system-history");
8686 Fset (Qcoding_system_history
, Qnil
);
8688 /* Target FILENAME is the first argument. */
8689 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8690 /* Target FILENAME is the third argument. */
8691 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8693 DEFSYM (Qcall_process
, "call-process");
8694 /* Target PROGRAM is the first argument. */
8695 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8697 DEFSYM (Qcall_process_region
, "call-process-region");
8698 /* Target PROGRAM is the third argument. */
8699 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8701 DEFSYM (Qstart_process
, "start-process");
8702 /* Target PROGRAM is the third argument. */
8703 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8705 DEFSYM (Qopen_network_stream
, "open-network-stream");
8706 /* Target SERVICE is the fourth argument. */
8707 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8709 DEFSYM (Qcoding_system
, "coding-system");
8710 DEFSYM (Qcoding_aliases
, "coding-aliases");
8712 DEFSYM (Qeol_type
, "eol-type");
8713 DEFSYM (Qunix
, "unix");
8714 DEFSYM (Qdos
, "dos");
8716 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8717 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8718 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8719 DEFSYM (Qdefault_char
, "default-char");
8720 DEFSYM (Qundecided
, "undecided");
8721 DEFSYM (Qno_conversion
, "no-conversion");
8722 DEFSYM (Qraw_text
, "raw-text");
8724 DEFSYM (Qiso_2022
, "iso-2022");
8726 DEFSYM (Qutf_8
, "utf-8");
8727 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
8729 DEFSYM (Qutf_16
, "utf-16");
8730 DEFSYM (Qbig
, "big");
8731 DEFSYM (Qlittle
, "little");
8733 DEFSYM (Qshift_jis
, "shift-jis");
8734 DEFSYM (Qbig5
, "big5");
8736 DEFSYM (Qcoding_system_p
, "coding-system-p");
8738 DEFSYM (Qcoding_system_error
, "coding-system-error");
8739 Fput (Qcoding_system_error
, Qerror_conditions
,
8740 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8741 Fput (Qcoding_system_error
, Qerror_message
,
8742 build_string ("Invalid coding system"));
8744 /* Intern this now in case it isn't already done.
8745 Setting this variable twice is harmless.
8746 But don't staticpro it here--that is done in alloc.c. */
8747 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8749 DEFSYM (Qtranslation_table
, "translation-table");
8750 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8751 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8752 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8753 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8755 DEFSYM (Qvalid_codes
, "valid-codes");
8757 DEFSYM (Qemacs_mule
, "emacs-mule");
8759 DEFSYM (QCcategory
, ":category");
8761 Vcoding_category_table
8762 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8763 staticpro (&Vcoding_category_table
);
8764 /* The following are targets of code detection. */
8765 ASET (Vcoding_category_table
, coding_category_iso_7
,
8766 intern ("coding-category-iso-7"));
8767 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8768 intern ("coding-category-iso-7-tight"));
8769 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8770 intern ("coding-category-iso-8-1"));
8771 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8772 intern ("coding-category-iso-8-2"));
8773 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8774 intern ("coding-category-iso-7-else"));
8775 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8776 intern ("coding-category-iso-8-else"));
8777 ASET (Vcoding_category_table
, coding_category_utf_8
,
8778 intern ("coding-category-utf-8"));
8779 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8780 intern ("coding-category-utf-16-be"));
8781 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8782 intern ("coding-category-utf-16-auto"));
8783 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8784 intern ("coding-category-utf-16-le"));
8785 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8786 intern ("coding-category-utf-16-be-nosig"));
8787 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8788 intern ("coding-category-utf-16-le-nosig"));
8789 ASET (Vcoding_category_table
, coding_category_charset
,
8790 intern ("coding-category-charset"));
8791 ASET (Vcoding_category_table
, coding_category_sjis
,
8792 intern ("coding-category-sjis"));
8793 ASET (Vcoding_category_table
, coding_category_big5
,
8794 intern ("coding-category-big5"));
8795 ASET (Vcoding_category_table
, coding_category_ccl
,
8796 intern ("coding-category-ccl"));
8797 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8798 intern ("coding-category-emacs-mule"));
8799 /* The following are NOT targets of code detection. */
8800 ASET (Vcoding_category_table
, coding_category_raw_text
,
8801 intern ("coding-category-raw-text"));
8802 ASET (Vcoding_category_table
, coding_category_undecided
,
8803 intern ("coding-category-undecided"));
8805 defsubr (&Scoding_system_p
);
8806 defsubr (&Sread_coding_system
);
8807 defsubr (&Sread_non_nil_coding_system
);
8808 defsubr (&Scheck_coding_system
);
8809 defsubr (&Sdetect_coding_region
);
8810 defsubr (&Sdetect_coding_string
);
8811 defsubr (&Sfind_coding_systems_region_internal
);
8812 defsubr (&Sunencodable_char_position
);
8813 defsubr (&Scheck_coding_systems_region
);
8814 defsubr (&Sdecode_coding_region
);
8815 defsubr (&Sencode_coding_region
);
8816 defsubr (&Sdecode_coding_string
);
8817 defsubr (&Sencode_coding_string
);
8818 defsubr (&Sdecode_sjis_char
);
8819 defsubr (&Sencode_sjis_char
);
8820 defsubr (&Sdecode_big5_char
);
8821 defsubr (&Sencode_big5_char
);
8822 defsubr (&Sset_terminal_coding_system_internal
);
8823 defsubr (&Sset_safe_terminal_coding_system_internal
);
8824 defsubr (&Sterminal_coding_system
);
8825 defsubr (&Sset_keyboard_coding_system_internal
);
8826 defsubr (&Skeyboard_coding_system
);
8827 defsubr (&Sfind_operation_coding_system
);
8828 defsubr (&Sset_coding_system_priority
);
8829 defsubr (&Sdefine_coding_system_internal
);
8830 defsubr (&Sdefine_coding_system_alias
);
8831 defsubr (&Scoding_system_base
);
8832 defsubr (&Scoding_system_plist
);
8833 defsubr (&Scoding_system_aliases
);
8834 defsubr (&Scoding_system_eol_type
);
8835 defsubr (&Scoding_system_priority_list
);
8837 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8838 doc
: /* List of coding systems.
8840 Do not alter the value of this variable manually. This variable should be
8841 updated by the functions `define-coding-system' and
8842 `define-coding-system-alias'. */);
8843 Vcoding_system_list
= Qnil
;
8845 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8846 doc
: /* Alist of coding system names.
8847 Each element is one element list of coding system name.
8848 This variable is given to `completing-read' as TABLE argument.
8850 Do not alter the value of this variable manually. This variable should be
8851 updated by the functions `make-coding-system' and
8852 `define-coding-system-alias'. */);
8853 Vcoding_system_alist
= Qnil
;
8855 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8856 doc
: /* List of coding-categories (symbols) ordered by priority.
8858 On detecting a coding system, Emacs tries code detection algorithms
8859 associated with each coding-category one by one in this order. When
8860 one algorithm agrees with a byte sequence of source text, the coding
8861 system bound to the corresponding coding-category is selected. */);
8865 Vcoding_category_list
= Qnil
;
8866 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8867 Vcoding_category_list
8868 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8869 Vcoding_category_list
);
8872 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8873 doc
: /* Specify the coding system for read operations.
8874 It is useful to bind this variable with `let', but do not set it globally.
8875 If the value is a coding system, it is used for decoding on read operation.
8876 If not, an appropriate element is used from one of the coding system alists:
8877 There are three such tables, `file-coding-system-alist',
8878 `process-coding-system-alist', and `network-coding-system-alist'. */);
8879 Vcoding_system_for_read
= Qnil
;
8881 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8882 doc
: /* Specify the coding system for write operations.
8883 Programs bind this variable with `let', but you should not set it globally.
8884 If the value is a coding system, it is used for encoding of output,
8885 when writing it to a file and when sending it to a file or subprocess.
8887 If this does not specify a coding system, an appropriate element
8888 is used from one of the coding system alists:
8889 There are three such tables, `file-coding-system-alist',
8890 `process-coding-system-alist', and `network-coding-system-alist'.
8891 For output to files, if the above procedure does not specify a coding system,
8892 the value of `buffer-file-coding-system' is used. */);
8893 Vcoding_system_for_write
= Qnil
;
8895 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8897 Coding system used in the latest file or process I/O. */);
8898 Vlast_coding_system_used
= Qnil
;
8900 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8902 *Non-nil means always inhibit code conversion of end-of-line format.
8903 See info node `Coding Systems' and info node `Text and Binary' concerning
8904 such conversion. */);
8905 inhibit_eol_conversion
= 0;
8907 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8909 Non-nil means process buffer inherits coding system of process output.
8910 Bind it to t if the process output is to be treated as if it were a file
8911 read from some filesystem. */);
8912 inherit_process_coding_system
= 0;
8914 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8916 Alist to decide a coding system to use for a file I/O operation.
8917 The format is ((PATTERN . VAL) ...),
8918 where PATTERN is a regular expression matching a file name,
8919 VAL is a coding system, a cons of coding systems, or a function symbol.
8920 If VAL is a coding system, it is used for both decoding and encoding
8922 If VAL is a cons of coding systems, the car part is used for decoding,
8923 and the cdr part is used for encoding.
8924 If VAL is a function symbol, the function must return a coding system
8925 or a cons of coding systems which are used as above. The function gets
8926 the arguments with which `find-operation-coding-system' was called.
8928 See also the function `find-operation-coding-system'
8929 and the variable `auto-coding-alist'. */);
8930 Vfile_coding_system_alist
= Qnil
;
8932 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8934 Alist to decide a coding system to use for a process I/O operation.
8935 The format is ((PATTERN . VAL) ...),
8936 where PATTERN is a regular expression matching a program name,
8937 VAL is a coding system, a cons of coding systems, or a function symbol.
8938 If VAL is a coding system, it is used for both decoding what received
8939 from the program and encoding what sent to the program.
8940 If VAL is a cons of coding systems, the car part is used for decoding,
8941 and the cdr part is used for encoding.
8942 If VAL is a function symbol, the function must return a coding system
8943 or a cons of coding systems which are used as above.
8945 See also the function `find-operation-coding-system'. */);
8946 Vprocess_coding_system_alist
= Qnil
;
8948 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8950 Alist to decide a coding system to use for a network I/O operation.
8951 The format is ((PATTERN . VAL) ...),
8952 where PATTERN is a regular expression matching a network service name
8953 or is a port number to connect to,
8954 VAL is a coding system, a cons of coding systems, or a function symbol.
8955 If VAL is a coding system, it is used for both decoding what received
8956 from the network stream and encoding what sent to the network stream.
8957 If VAL is a cons of coding systems, the car part is used for decoding,
8958 and the cdr part is used for encoding.
8959 If VAL is a function symbol, the function must return a coding system
8960 or a cons of coding systems which are used as above.
8962 See also the function `find-operation-coding-system'. */);
8963 Vnetwork_coding_system_alist
= Qnil
;
8965 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8966 doc
: /* Coding system to use with system messages.
8967 Also used for decoding keyboard input on X Window system. */);
8968 Vlocale_coding_system
= Qnil
;
8970 /* The eol mnemonics are reset in startup.el system-dependently. */
8971 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8973 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8974 eol_mnemonic_unix
= build_string (":");
8976 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8978 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8979 eol_mnemonic_dos
= build_string ("\\");
8981 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8983 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8984 eol_mnemonic_mac
= build_string ("/");
8986 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8988 *String displayed in mode line when end-of-line format is not yet determined. */);
8989 eol_mnemonic_undecided
= build_string (":");
8991 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8993 *Non-nil enables character translation while encoding and decoding. */);
8994 Venable_character_translation
= Qt
;
8996 DEFVAR_LISP ("standard-translation-table-for-decode",
8997 &Vstandard_translation_table_for_decode
,
8998 doc
: /* Table for translating characters while decoding. */);
8999 Vstandard_translation_table_for_decode
= Qnil
;
9001 DEFVAR_LISP ("standard-translation-table-for-encode",
9002 &Vstandard_translation_table_for_encode
,
9003 doc
: /* Table for translating characters while encoding. */);
9004 Vstandard_translation_table_for_encode
= Qnil
;
9006 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
9007 doc
: /* Alist of charsets vs revision numbers.
9008 While encoding, if a charset (car part of an element) is found,
9009 designate it with the escape sequence identifying revision (cdr part
9010 of the element). */);
9011 Vcharset_revision_table
= Qnil
;
9013 DEFVAR_LISP ("default-process-coding-system",
9014 &Vdefault_process_coding_system
,
9015 doc
: /* Cons of coding systems used for process I/O by default.
9016 The car part is used for decoding a process output,
9017 the cdr part is used for encoding a text to be sent to a process. */);
9018 Vdefault_process_coding_system
= Qnil
;
9020 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
9022 Table of extra Latin codes in the range 128..159 (inclusive).
9023 This is a vector of length 256.
9024 If Nth element is non-nil, the existence of code N in a file
9025 \(or output of subprocess) doesn't prevent it from being detected as
9026 a coding system of ISO 2022 variant which has a flag
9027 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9028 or reading output of a subprocess.
9029 Only the 128th through 159th elements have a meaning. */);
9030 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
9032 DEFVAR_LISP ("select-safe-coding-system-function",
9033 &Vselect_safe_coding_system_function
,
9035 Function to call to select safe coding system for encoding a text.
9037 If set, this function is called to force a user to select a proper
9038 coding system which can encode the text in the case that a default
9039 coding system used in each operation can't encode the text.
9041 The default value is `select-safe-coding-system' (which see). */);
9042 Vselect_safe_coding_system_function
= Qnil
;
9044 DEFVAR_BOOL ("coding-system-require-warning",
9045 &coding_system_require_warning
,
9046 doc
: /* Internal use only.
9047 If non-nil, on writing a file, `select-safe-coding-system-function' is
9048 called even if `coding-system-for-write' is non-nil. The command
9049 `universal-coding-system-argument' binds this variable to t temporarily. */);
9050 coding_system_require_warning
= 0;
9053 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9054 &inhibit_iso_escape_detection
,
9056 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9058 By default, on reading a file, Emacs tries to detect how the text is
9059 encoded. This code detection is sensitive to escape sequences. If
9060 the sequence is valid as ISO2022, the code is determined as one of
9061 the ISO2022 encodings, and the file is decoded by the corresponding
9062 coding system (e.g. `iso-2022-7bit').
9064 However, there may be a case that you want to read escape sequences in
9065 a file as is. In such a case, you can set this variable to non-nil.
9066 Then, as the code detection ignores any escape sequences, no file is
9067 detected as encoded in some ISO2022 encoding. The result is that all
9068 escape sequences become visible in a buffer.
9070 The default value is nil, and it is strongly recommended not to change
9071 it. That is because many Emacs Lisp source files that contain
9072 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9073 in Emacs's distribution, and they won't be decoded correctly on
9074 reading if you suppress escape sequence detection.
9076 The other way to read escape sequences in a file without decoding is
9077 to explicitly specify some coding system that doesn't use ISO2022's
9078 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
9079 inhibit_iso_escape_detection
= 0;
9081 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input
,
9082 doc
: /* Char table for translating self-inserting characters.
9083 This is applied to the result of input methods, not their input. See also
9084 `keyboard-translate-table'. */);
9085 Vtranslation_table_for_input
= Qnil
;
9088 Lisp_Object args
[coding_arg_max
];
9089 Lisp_Object plist
[16];
9092 for (i
= 0; i
< coding_arg_max
; i
++)
9095 plist
[0] = intern (":name");
9096 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
9097 plist
[2] = intern (":mnemonic");
9098 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
9099 plist
[4] = intern (":coding-type");
9100 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
9101 plist
[6] = intern (":ascii-compatible-p");
9102 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
9103 plist
[8] = intern (":default-char");
9104 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
9105 plist
[10] = intern (":for-unibyte");
9106 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
9107 plist
[12] = intern (":docstring");
9108 plist
[13] = build_string ("Do no conversion.\n\
9110 When you visit a file with this coding, the file is read into a\n\
9111 unibyte buffer as is, thus each byte of a file is treated as a\n\
9113 plist
[14] = intern (":eol-type");
9114 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
9115 args
[coding_arg_plist
] = Flist (16, plist
);
9116 Fdefine_coding_system_internal (coding_arg_max
, args
);
9119 setup_coding_system (Qno_conversion
, &keyboard_coding
);
9120 setup_coding_system (Qno_conversion
, &terminal_coding
);
9121 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
9126 for (i
= 0; i
< coding_category_max
; i
++)
9127 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
9132 emacs_strerror (error_number
)
9137 synchronize_system_messages_locale ();
9138 str
= strerror (error_number
);
9140 if (! NILP (Vlocale_coding_system
))
9142 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
9143 Vlocale_coding_system
,
9145 str
= (char *) SDATA (dec
);