1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for a text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwise (i.e. the data contains only ASCII
150 characters or an invalid sequence) return 0.
152 They also reset some bits of the integer pointed to by MASK. The
153 macros CATEGORY_MASK_XXX specify each bit of this integer.
155 Below is the template of these functions. */
159 detect_coding_XXX (coding
, mask
)
160 struct coding_system
*coding
;
163 unsigned char *src
= coding
->source
;
164 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
165 int multibytep
= coding
->src_multibyte
;
172 /* Get one byte from the source. If the source is exhausted, jump
173 to no_more_source:. */
175 /* Check if it conforms to XXX. If not, break the loop. */
177 /* As the data is invalid for XXX, reset the proper bits. */
178 *mask
&= ~CODING_CATEGORY_XXX
;
181 /* The source is exhausted. */
183 /* ASCII characters only. */
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask
&= CODING_CATEGORY_XXX
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 unsigned char *src
= coding
->source
+ coding
->consumed
;
211 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source text, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
;
218 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
224 if (charbuf
< charbuf_end
)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce encoded text until it
260 reaches the head of the not-yet-encoded source text.
262 Below is a template of these functions. */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
270 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
;
307 extern Lisp_Object Qmac
; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
314 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
315 Lisp_Object Qcoding_system_history
;
316 Lisp_Object Qvalid_codes
;
318 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
319 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
320 Lisp_Object Qstart_process
, Qopen_network_stream
;
321 Lisp_Object Qtarget_idx
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding systems emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read
;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write
;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used
;
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table
;
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion
;
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection
;
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system
;
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding
;
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding
;
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding
;
373 Lisp_Object Vfile_coding_system_alist
;
374 Lisp_Object Vprocess_coding_system_alist
;
375 Lisp_Object Vnetwork_coding_system_alist
;
377 Lisp_Object Vlocale_coding_system
;
381 /* Flag to tell if we look up translation table on character code
383 Lisp_Object Venable_character_translation
;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode
;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode
;
389 Lisp_Object Qtranslation_table
;
390 Lisp_Object Qtranslation_table_id
;
391 Lisp_Object Qtranslation_table_for_decode
;
392 Lisp_Object Qtranslation_table_for_encode
;
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table
;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system
;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion
;
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system
;
408 Lisp_Object Vbig5_coding_system
;
/* Forward declarations of the per-format handlers defined later in
   this file.  Each format has up to three entry points:

     detect_coding_XXX -- returns int; takes the coding system and a
			  pointer to the integer of category-mask bits.
     decode_coding_XXX -- returns nothing; takes the coding system.
     encode_coding_XXX -- returns int; takes the coding system.

   raw-text has only a decoder and an encoder declared here.
   NOTE(review): P_ is defined elsewhere; presumably it hides the
   parameter list from compilers without prototype support -- confirm.  */

/* UTF-8.  */
411 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
412 static void decode_coding_utf_8
P_ ((struct coding_system
*));
413 static int encode_coding_utf_8
P_ ((struct coding_system
*));
/* UTF-16.  */
415 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
/* ISO 2022.  */
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
420 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
421 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
/* emacs-mule.  */
423 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
424 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
425 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
/* Shift-JIS.  */
427 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
428 static void decode_coding_sjis
P_ ((struct coding_system
*));
429 static int encode_coding_sjis
P_ ((struct coding_system
*));
/* Big5.  */
431 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
432 static void decode_coding_big5
P_ ((struct coding_system
*));
433 static int encode_coding_big5
P_ ((struct coding_system
*));
/* CCL.  */
435 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
436 static void decode_coding_ccl
P_ ((struct coding_system
*));
437 static int encode_coding_ccl
P_ ((struct coding_system
*));
/* raw-text (no detector).  */
439 static void decode_coding_raw_text
P_ ((struct coding_system
*));
440 static int encode_coding_raw_text
P_ ((struct coding_system
*));
443 /* ISO2022 section */
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
482 /* All code (1-byte) of ISO2022 is classified into one of the
484 enum iso_code_class_type
486 ISO_control_0
, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
494 ISO_control_1
, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
506 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
513 /* If set, reset graphic planes and registers at end-of-line to the
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
540 /* If set, assume designation states are reset at beginning of line on
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
544 /* If set, designation sequence should be placed at beginning of line
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
548 /* If set, do not encode unsafe characters on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
551 /* If set, extra latin codes (128..159) are accepted as a valid code
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
/* Accessors for the UTF-16 specific members of CODING (a pointer to
   struct coding_system).  Each expands to the corresponding member
   of CODING->spec.utf_16, so it can be both read and assigned.  */

/* The `bom' member of CODING->spec.utf_16.  */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
/* The `endian' member of CODING->spec.utf_16.  */
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
/* The `surrogate' member of CODING->spec.utf_16.  */
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
590 /* Index for each coding category in `coding_category_table' */
594 coding_category_iso_7
,
595 coding_category_iso_7_tight
,
596 coding_category_iso_8_1
,
597 coding_category_iso_8_2
,
598 coding_category_iso_7_else
,
599 coding_category_iso_8_else
,
600 coding_category_utf_8
,
601 coding_category_utf_16_auto
,
602 coding_category_utf_16_be
,
603 coding_category_utf_16_le
,
604 coding_category_utf_16_be_nosig
,
605 coding_category_utf_16_le_nosig
,
606 coding_category_charset
,
607 coding_category_sjis
,
608 coding_category_big5
,
610 coding_category_emacs_mule
,
611 /* All above are targets of code detection. */
612 coding_category_raw_text
,
613 coding_category_undecided
,
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
635 /* This value is returned if detect_coding_mask () finds nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list
;
687 /* Table of coding categories (Lisp symbols). This variable is for
689 static Lisp_Object Vcoding_category_table
;
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities
[coding_category_max
];
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories
[coding_category_max
];
698 static int detected_mask
[coding_category_raw_text
] =
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_UTF_16
,
708 CATEGORY_MASK_UTF_16
,
709 CATEGORY_MASK_UTF_16
,
710 CATEGORY_MASK_UTF_16
,
711 CATEGORY_MASK_CHARSET
,
715 CATEGORY_MASK_EMACS_MULE
718 /*** Commonly used macros and functions ***/
/* Yield the smaller of A and B.  The argument that is selected is
   evaluated twice, so do not pass expressions with side effects.  */
721 #define min(a, b) ((a) < (b) ? (a) : (b))
/* Yield the larger of A and B.  The argument that is selected is
   evaluated twice, so do not pass expressions with side effects.  */
724 #define max(a, b) ((a) > (b) ? (a) : (b))
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
744 #define ONE_MORE_BYTE(c) \
746 if (src == src_end) \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
766 if (multibytep && (c & 0x80)) \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
783 #define EMIT_ONE_ASCII_BYTE(c) \
790 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte form. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
805 #define EMIT_ONE_BYTE(c) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822 #define EMIT_TWO_BYTES(c1, c2) \
824 produced_chars += 2; \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
866 unsigned char *orig = coding->source; \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
872 src_base += offset; \
878 #define ASSURE_DESTINATION(bytes) \
880 if (dst + (bytes) >= dst_end) \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
892 coding_set_source (coding
)
893 struct coding_system
*coding
;
895 if (BUFFERP (coding
->src_object
))
897 if (coding
->src_pos
< 0)
898 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
901 struct buffer
*buf
= XBUFFER (coding
->src_object
);
902 EMACS_INT gpt_byte
= BUF_GPT_BYTE (buf
);
903 unsigned char *beg_addr
= BUF_BEG_ADDR (buf
);
905 coding
->source
= beg_addr
+ coding
->src_pos_byte
- 1;
906 if (coding
->src_pos_byte
>= gpt_byte
)
907 coding
->source
+= BUF_GAP_SIZE (buf
);
910 else if (STRINGP (coding
->src_object
))
912 coding
->source
= (XSTRING (coding
->src_object
)->data
913 + coding
->src_pos_byte
);
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
922 coding_set_destination (coding
)
923 struct coding_system
*coding
;
925 if (BUFFERP (coding
->dst_object
))
927 /* We are sure that coding->dst_pos_byte is before the gap of the
929 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
930 + coding
->dst_pos_byte
- 1);
931 if (coding
->src_pos
< 0)
932 coding
->dst_bytes
= (GAP_END_ADDR
933 - (coding
->src_bytes
- coding
->consumed
)
934 - coding
->destination
);
936 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
937 - coding
->destination
);
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
947 coding_alloc_by_realloc (coding
, bytes
)
948 struct coding_system
*coding
;
951 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
952 coding
->dst_bytes
+ bytes
);
953 coding
->dst_bytes
+= bytes
;
957 coding_alloc_by_making_gap (coding
, bytes
)
958 struct coding_system
*coding
;
961 if (BUFFERP (coding
->dst_object
)
962 && EQ (coding
->src_object
, coding
->dst_object
))
964 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
966 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
968 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
972 Lisp_Object this_buffer
;
974 this_buffer
= Fcurrent_buffer ();
975 set_buffer_internal (XBUFFER (coding
->dst_object
));
977 set_buffer_internal (XBUFFER (this_buffer
));
982 static unsigned char *
983 alloc_destination (coding
, nbytes
, dst
)
984 struct coding_system
*coding
;
988 EMACS_INT offset
= dst
- coding
->destination
;
990 if (BUFFERP (coding
->dst_object
))
991 coding_alloc_by_making_gap (coding
, nbytes
);
993 coding_alloc_by_realloc (coding
, nbytes
);
994 coding
->result
= CODING_RESULT_SUCCESS
;
995 coding_set_destination (coding
);
996 dst
= coding
->destination
+ offset
;
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009 Check if a text is encoded in UTF-8. If it is, return
1010 CATEGORY_MASK_UTF_8, else return 0. */
/* Predicates classifying one octet C of a UTF-8 sequence by its
   leading bits.  Each evaluates C exactly once.  */

/* Nonzero if C is below 0x80.  */
1012 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* Nonzero if the top two bits of C are 10 (0x80..0xBF), i.e. C is a
   trailing octet of a multi-octet sequence.  */
1013 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* Nonzero if the top three bits of C are 110 (0xC0..0xDF).  */
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* Nonzero if the top four bits of C are 1110 (0xE0..0xEF).  */
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* Nonzero if the top five bits of C are 11110 (0xF0..0xF7).  */
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* Nonzero if the top six bits of C are 111110 (0xF8..0xFB).  */
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1020 detect_coding_utf_8 (coding
, mask
)
1021 struct coding_system
*coding
;
1024 unsigned char *src
= coding
->source
, *src_base
= src
;
1025 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1026 int multibytep
= coding
->src_multibyte
;
1027 int consumed_chars
= 0;
1030 /* A coding system of this category is always ASCII compatible. */
1031 src
+= coding
->head_ascii
;
1035 int c
, c1
, c2
, c3
, c4
;
1038 if (UTF_8_1_OCTET_P (c
))
1041 if (! UTF_8_EXTRA_OCTET_P (c1
))
1043 if (UTF_8_2_OCTET_LEADING_P (c
))
1049 if (! UTF_8_EXTRA_OCTET_P (c2
))
1051 if (UTF_8_3_OCTET_LEADING_P (c
))
1057 if (! UTF_8_EXTRA_OCTET_P (c3
))
1059 if (UTF_8_4_OCTET_LEADING_P (c
))
1065 if (! UTF_8_EXTRA_OCTET_P (c4
))
1067 if (UTF_8_5_OCTET_LEADING_P (c
))
1074 *mask
&= ~CATEGORY_MASK_UTF_8
;
1080 *mask
&= CATEGORY_MASK_UTF_8
;
1085 /* Fixme: deal with surrogates? */
/* Decoder for UTF-8 source text.  Bytes are taken from coding->source
   starting at offset coding->consumed, and decoded characters are
   stored as ints in coding->charbuf (capacity coding->charbuf_size).
   On the way out, consumed_chars_base is added to
   coding->consumed_char, coding->consumed becomes the offset of
   SRC_BASE within the source, and coding->charbuf_used becomes the
   number of ints stored.
   NOTE(review): the loop construct, the byte fetches, the overlong
   tests and the labels are not visible in this excerpt.  */
1087 decode_coding_utf_8 (coding
)
1088 struct coding_system
*coding
;
1090 unsigned char *src
= coding
->source
+ coding
->consumed
;
1091 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1092 unsigned char *src_base
;
1093 int *charbuf
= coding
->charbuf
;
1094 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1095 int consumed_chars
= 0, consumed_chars_base
;
1096 int multibytep
= coding
->src_multibyte
;
1097 Lisp_Object attr
, eol_type
, charset_list
;
1099 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1103 int c
, c1
, c2
, c3
, c4
, c5
;
1106 consumed_chars_base
= consumed_chars
;
1108 if (charbuf
>= charbuf_end
)
/* One-octet case; the EOL_TYPE tests below distinguish Qdos and Qmac.  */
1112 if (UTF_8_1_OCTET_P(c1
))
1117 if (EQ (eol_type
, Qdos
))
1120 goto no_more_source
;
1124 else if (EQ (eol_type
, Qmac
))
1131 if (! UTF_8_EXTRA_OCTET_P (c2
))
/* Two octets: 5 payload bits from C1, 6 from C2.  */
1133 if (UTF_8_2_OCTET_LEADING_P (c1
))
1135 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1136 /* Reject overlong sequences here and below. Encoders
1137 producing them are incorrect, they can be misleading,
1138 and they mess up read/write invariance. */
1145 if (! UTF_8_EXTRA_OCTET_P (c3
))
/* Three octets: 4 + 6 + 6 payload bits.  */
1147 if (UTF_8_3_OCTET_LEADING_P (c1
))
1149 c
= (((c1
& 0xF) << 12)
1150 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1157 if (! UTF_8_EXTRA_OCTET_P (c4
))
/* Four octets: 3 + 6 + 6 + 6 payload bits.  */
1159 if (UTF_8_4_OCTET_LEADING_P (c1
))
1161 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1162 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1169 if (! UTF_8_EXTRA_OCTET_P (c5
))
/* Five octets: 2 payload bits from C1 and 6 from each of C2 .. C4 are
   visible; the end of the expression is not shown in this excerpt.  */
1171 if (UTF_8_5_OCTET_LEADING_P (c1
))
1173 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1174 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
/* Values below 0x200000 fit in the four-octet form above (3 + 18
   payload bits); those and values above MAX_CHAR are singled out
   here (presumably treated as invalid -- the branch body is not
   visible).  */
1176 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1191 consumed_chars
= consumed_chars_base
;
/* Store C unchanged when it is an ASCII byte, otherwise its
   BYTE8_TO_CHAR image.  */
1193 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1198 coding
->consumed_char
+= consumed_chars_base
;
1199 coding
->consumed
= src_base
- coding
->source
;
1200 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encoder producing UTF-8.  Characters are read from coding->charbuf
   (coding->charbuf_used entries) and bytes are written starting at
   coding->destination + coding->produced.  Two loops are visible:
   one reserves MAX_MULTIBYTE_LENGTH * 2 bytes per character, builds
   the character's bytes in the local buffer STR with
   CHAR_STRING_ADVANCE and then walks them one by one; the other
   reserves MAX_MULTIBYTE_LENGTH bytes and stores directly at DST with
   CHAR_STRING.  Finally coding->result is set to
   CODING_RESULT_SUCCESS and coding->produced_char and
   coding->produced are updated.
   NOTE(review): the condition selecting between the two loops is not
   visible here (presumably MULTIBYTEP) -- confirm.  */
1205 encode_coding_utf_8 (coding
)
1206 struct coding_system
*coding
;
1208 int multibytep
= coding
->dst_multibyte
;
1209 int *charbuf
= coding
->charbuf
;
1210 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1211 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1212 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1213 int produced_chars
= 0;
1218 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1220 while (charbuf
< charbuf_end
)
1222 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1224 ASSURE_DESTINATION (safe_room
);
1226 CHAR_STRING_ADVANCE (c
, pend
);
1227 for (p
= str
; p
< pend
; p
++)
1233 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1235 while (charbuf
< charbuf_end
)
1237 ASSURE_DESTINATION (safe_room
);
1239 dst
+= CHAR_STRING (c
, dst
);
1243 coding
->result
= CODING_RESULT_SUCCESS
;
1244 coding
->produced_char
+= produced_chars
;
1245 coding
->produced
= dst
- coding
->destination
;
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252 Little Endian (otherwise). If it is, return
1253 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
/* Nonzero if the bits of VAL selected by 0xFC00 are equal to TAG.  */
#define UTF_16_SURROGATE_CLASS_P(val, tag) (((val) & 0xFC00) == (tag))

/* VAL lies in the 0xD800 block of 0x400 code units.  */
#define UTF_16_HIGH_SURROGATE_P(val) UTF_16_SURROGATE_CLASS_P (val, 0xD800)

/* VAL lies in the 0xDC00 block of 0x400 code units.  */
#define UTF_16_LOW_SURROGATE_P(val) UTF_16_SURROGATE_CLASS_P (val, 0xDC00)

/* VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)			\
  ((0xFFFE == (val))				\
   || (0xFFFF == (val))				\
   || UTF_16_LOW_SURROGATE_P (val))
/* Detection routine for UTF-16.  The visible test looks at two bytes
   C1 and C2: the pair 0xFF 0xFE narrows *MASK to
   CATEGORY_MASK_UTF_16_LE, the pair 0xFE 0xFF narrows it to
   CATEGORY_MASK_UTF_16_BE.
   NOTE(review): how C1 and C2 are fetched, what happens for any other
   pair, and the return statements are not visible in this excerpt.  */
1269 detect_coding_utf_16 (coding
, mask
)
1270 struct coding_system
*coding
;
1273 unsigned char *src
= coding
->source
, *src_base
= src
;
1274 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1275 int multibytep
= coding
->src_multibyte
;
1276 int consumed_chars
= 0;
1282 if ((c1
== 0xFF) && (c2
== 0xFE))
1284 *mask
&= CATEGORY_MASK_UTF_16_LE
;
1287 else if ((c1
== 0xFE) && (c2
== 0xFF))
1289 *mask
&= CATEGORY_MASK_UTF_16_BE
;
/* Decoder for UTF-16 source text.  Bytes are taken from
   coding->source starting at offset coding->consumed; decoded
   characters go to coding->charbuf.  The BOM mode, the endianness and
   a pending surrogate are read from CODING through
   CODING_UTF_16_BOM, CODING_UTF_16_ENDIAN and
   CODING_UTF_16_SURROGATE, and the latter two are written back when
   they change, so that state is kept in CODING between calls.  Each
   16-bit unit C is assembled from two bytes C1, C2 according to
   ENDIAN.  On the way out coding->consumed_char, coding->consumed and
   coding->charbuf_used are updated.
   NOTE(review): the loop construct, the byte fetches, the stores into
   CHARBUF and the labels are not visible in this excerpt.  */
1297 decode_coding_utf_16 (coding
)
1298 struct coding_system
*coding
;
1300 unsigned char *src
= coding
->source
+ coding
->consumed
;
1301 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1302 unsigned char *src_base
;
1303 int *charbuf
= coding
->charbuf
;
1304 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1305 int consumed_chars
= 0, consumed_chars_base
;
1306 int multibytep
= coding
->src_multibyte
;
1307 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1308 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1309 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1310 Lisp_Object attr
, eol_type
, charset_list
;
1312 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1314 if (bom
!= utf_16_without_bom
)
1322 if (bom
== utf_16_with_bom
)
/* NOTE(review): detect_coding_utf_16 treats the byte pair 0xFE 0xFF
   as big endian.  How C is assembled from the two BOM bytes is not
   visible here, so confirm that 0xFFFE is really the value to expect
   in the big-endian case (and 0xFEFF in the little-endian one); the
   same question applies to the endianness chosen from C below.  */
1324 if (endian
== utf_16_big_endian
1325 ? c
!= 0xFFFE : c
!= 0xFEFF)
1327 /* We are sure that there's enough room at CHARBUF. */
1336 CODING_UTF_16_ENDIAN (coding
)
1337 = endian
= utf_16_big_endian
;
1338 else if (c
== 0xFEFF)
1339 CODING_UTF_16_ENDIAN (coding
)
1340 = endian
= utf_16_little_endian
;
1343 CODING_UTF_16_ENDIAN (coding
)
1344 = endian
= utf_16_big_endian
;
1348 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1356 consumed_chars_base
= consumed_chars
;
1358 if (charbuf
+ 2 >= charbuf_end
)
/* Assemble one 16-bit unit from C1 and C2 in the order given by
   ENDIAN.  */
1363 c
= (endian
== utf_16_big_endian
1364 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1367 if (! UTF_16_LOW_SURROGATE_P (c
))
/* Split the remembered surrogate back into two bytes in stream
   order.  */
1369 if (endian
== utf_16_big_endian
)
1370 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1372 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1376 if (UTF_16_HIGH_SURROGATE_P (c
))
1377 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
/* Combine the remembered high surrogate with the low surrogate C.
   NOTE(review): UTF-16 defines the scalar value as
   0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00); no 0x10000 offset
   is visible in this expression -- confirm against the full file.  */
1383 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1384 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1390 if (UTF_16_HIGH_SURROGATE_P (c
))
1391 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1398 coding
->consumed_char
+= consumed_chars_base
;
1399 coding
->consumed
= src_base
- coding
->source
;
1400 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Encoder producing UTF-16.  Characters are read from coding->charbuf
   and bytes are written starting at coding->destination +
   coding->produced.  When the BOM mode is utf_16_with_bom a two-byte
   mark is emitted and the mode stored in CODING is then changed to
   utf_16_without_bom.  A character C that is >= MAX_UNICODE_CHAR is
   replaced by coding->default_char.  Characters are emitted either
   as one 16-bit unit (two bytes) or as a pair C1, C2 of units in the
   0xD800 and 0xDC00 ranges (four bytes); both byte orders are
   present for every emission.
   NOTE(review): the conditions choosing the byte order and choosing
   between the two-byte and four-byte forms are not visible in this
   excerpt.  */
1404 encode_coding_utf_16 (coding
)
1405 struct coding_system
*coding
;
1407 int multibytep
= coding
->dst_multibyte
;
1408 int *charbuf
= coding
->charbuf
;
1409 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1410 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1411 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1413 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1414 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1415 int produced_chars
= 0;
1416 Lisp_Object attrs
, eol_type
, charset_list
;
1419 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1421 if (bom
== utf_16_with_bom
)
1423 ASSURE_DESTINATION (safe_room
);
/* NOTE(review): which of the two marks below belongs to the
   big-endian case is not visible here; detect_coding_utf_16 treats
   0xFE 0xFF as big endian -- confirm the two agree.  */
1425 EMIT_TWO_BYTES (0xFF, 0xFE);
1427 EMIT_TWO_BYTES (0xFE, 0xFF);
1428 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1431 while (charbuf
< charbuf_end
)
1433 ASSURE_DESTINATION (safe_room
);
1435 if (c
>= MAX_UNICODE_CHAR
)
1436 c
= coding
->default_char
;
1441 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1443 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
/* Split C into a 0xD800-based and a 0xDC00-based unit.
   NOTE(review): this yields a valid surrogate pair only if 0x10000
   has been subtracted from C beforehand; that statement is not
   visible in this excerpt -- confirm.  */
1450 c1
= (c
>> 10) + 0xD800;
1451 c2
= (c
& 0x3FF) + 0xDC00;
1453 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1455 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1458 coding
->result
= CODING_RESULT_SUCCESS
;
1459 coding
->produced
= dst
- coding
->destination
;
1460 coding
->produced_char
+= produced_chars
;
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1467 /* Emacs' internal format for representation of multiple character
1468 sets is a kind of multi-byte encoding, i.e. characters are
1469 represented by variable-length sequences of one-byte codes.
1471 ASCII characters and control characters (e.g. `tab', `newline') are
1472 represented by one-byte sequences which are their ASCII codes, in
1473 the range 0x00 through 0x7F.
1475 8-bit characters of the range 0x80..0x9F are represented by
1476 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1479 8-bit characters of the range 0xA0..0xFF are represented by
1480 one-byte sequences which are their 8-bit code.
1482 The other characters are represented by a sequence of `base
1483 leading-code', optional `extended leading-code', and one or two
1484 `position-code's. The length of the sequence is determined by the
1485 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1486 whereas extended leading-code and position-code take the range 0xA0
1487 through 0xFF. See `charset.h' for more details about leading-code
1490 --- CODE RANGE of Emacs' internal format ---
1494 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495 eight-bit-graphic 0xA0..0xBF
1496 ELSE 0x81..0x9D + [0xA0..0xFF]+
1497 ---------------------------------------------
1499 As this is the internal character representation, the format is
1500 usually not used externally (i.e. in a file or in a data sent to a
1501 process). But, it is possible to have a text externally in this
1502 format (i.e. by encoding by the coding system `emacs-mule').
1504 In that case, a sequence of one-byte codes has a slightly different
1507 At first, all characters in eight-bit-control are represented by
1508 one-byte sequences which are their 8-bit code.
1510 Next, character composition data are represented by the byte
1511 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1513 METHOD is 0xF0 plus one of composition method (enum
1514 composition_method),
1516 BYTES is 0xA0 plus a byte length of this composition data,
1518 CHARS is 0x20 plus a number of characters composed by this
1521 COMPONENTs are characters of multibyte form or composition
1522 rules encoded by two-byte of ASCII codes.
1524 In addition, for backward compatibility, the following formats are
1525 also recognized as composition data on decoding.
1528 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1531 MSEQ is a multibyte form but in these special format:
1532 ASCII: 0xA0 ASCII_CODE+0x80,
1533 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534 RULE is a one byte code of the range 0xA0..0xF0 that
1535 represents a composition rule.
/* Table indexed by a byte value.  emacs_mule_char switches on
   emacs_mule_bytes[C], and detect_coding_emacs_mule compares
   emacs_mule_bytes[*src_base] with the length of the byte sequence
   that started at src_base.  */
1538 char emacs_mule_bytes
[256];
1540 /* Leading-code followed by extended leading-code. */
/* The same four values (0x9A .. 0x9D) are what
   EMACS_MULE_LEADING_CODES stores in codes[0] when it also fills
   codes[1]; emacs_mule_char tests a byte against the first two.  */
1541 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1542 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1543 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1544 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
/* Decode one character of emacs-mule form located at SRC within
   CODING's source.  The code dispatches on emacs_mule_bytes[C] and
   looks the charset up in emacs_mule_charset[C]; in two of the
   visible cases the code point is started as (C & 0x7F) << 8.  In
   one case the charset is instead chosen from CODE itself:
   charset_ascii for an ASCII byte, charset_8_bit_control below 0xA0,
   charset_8_bit_graphic otherwise.  The character is then obtained
   with DECODE_CHAR (charset, code).  *NBYTES receives the number of
   bytes SRC advanced and *NCHARS receives consumed_chars.
   NOTE(review): the case labels, the byte fetches, the handling of a
   missing charset and the return statements are not visible in this
   excerpt.  */
1548 emacs_mule_char (coding
, src
, nbytes
, nchars
)
1549 struct coding_system
*coding
;
1551 int *nbytes
, *nchars
;
1553 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1554 int multibytep
= coding
->src_multibyte
;
1555 unsigned char *src_base
= src
;
1556 struct charset
*charset
;
1559 int consumed_chars
= 0;
1562 switch (emacs_mule_bytes
[c
])
1565 if (! (charset
= emacs_mule_charset
[c
]))
1572 if (c
== LEADING_CODE_PRIVATE_11
1573 || c
== LEADING_CODE_PRIVATE_12
)
1576 if (! (charset
= emacs_mule_charset
[c
]))
1583 if (! (charset
= emacs_mule_charset
[c
]))
1586 code
= (c
& 0x7F) << 8;
1594 if (! (charset
= emacs_mule_charset
[c
]))
1597 code
= (c
& 0x7F) << 8;
1604 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
) ? charset_ascii
1605 : code
< 0xA0 ? charset_8_bit_control
1606 : charset_8_bit_graphic
);
1612 c
= DECODE_CHAR (charset
, code
);
1615 *nbytes
= src
- src_base
;
1616 *nchars
= consumed_chars
;
1627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1628 Check if a text is encoded in `emacs-mule'. */
/* Detection routine for the emacs-mule category.  Scanning begins
   coding->head_ascii bytes into coding->source.  Visible checks: a
   composition sequence must be longer than 4 bytes; the bytes
   ISO_CODE_ESC, ISO_CODE_SI and ISO_CODE_SO are tested for (together
   with a condition not visible here); and the length of a multibyte
   sequence must equal emacs_mule_bytes[] of its first byte.  At the
   end CATEGORY_MASK_EMACS_MULE is either cleared from *MASK or *MASK
   is narrowed to it.
   NOTE(review): the loop, the byte fetches and the return statements
   are not visible in this excerpt.  */
1631 detect_coding_emacs_mule (coding
, mask
)
1632 struct coding_system
*coding
;
1635 unsigned char *src
= coding
->source
, *src_base
= src
;
1636 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1637 int multibytep
= coding
->src_multibyte
;
1638 int consumed_chars
= 0;
1642 /* A coding system of this category is always ASCII compatible. */
1643 src
+= coding
->head_ascii
;
1651 /* Perhaps the start of composite character. We simply skip
1652 it because analyzing it is too heavy for detecting. But,
1653 at least, we check that the composite character
1654 consists of more than 4 bytes. */
1655 unsigned char *src_base
;
1665 if (src
- src_base
<= 4)
1675 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
/* SRC_BASE marks the byte just fetched, i.e. the first byte of the
   sequence whose length is checked below.  */
1680 unsigned char *src_base
= src
- 1;
1687 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1692 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1698 *mask
&= CATEGORY_MASK_EMACS_MULE
;
1703 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1705 /* Decode a character represented as a component of composition
1706 sequence of Emacs 20/21 style at SRC. Set C to that character and
1707 update SRC to the head of next character (or an encoded composition
1708 rule). If SRC doesn't point to a composition component, set C to -1.
1709 If SRC points an invalid byte sequence, global exit by a return
/* Relies on `coding', `src', `src_end', `c' and `consumed_chars' of
   the expanding scope and on an `invalid_code' label there.  Calls
   emacs_mule_char at SRC and adds the character count it reports to
   consumed_chars.
   NOTE(review): several lines of this macro (including the use of
   BUF and the advance of SRC) are not visible in this excerpt.  */
1712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1716 int nbytes, nchars; \
1718 if (src == src_end) \
1720 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1725 goto invalid_code; \
1729 consumed_chars += nchars; \
1734 /* Decode a composition rule represented as a component of composition
1735 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1736 and increment BUF. If SRC points an invalid byte sequence, set C
/* Reads one byte (jumping to `invalid_code' when SRC has reached
   SRC_END), requires the resulting value C to lie in 0 .. 80, splits
   it into GREF = C / 9 and NREF = C % 9 (so each is 0 .. 8), and
   appends COMPOSITION_ENCODE_RULE (GREF, NREF) to BUF.
   NOTE(review): any adjustment of C between the fetch and the range
   check is not visible in this excerpt.  */
1739 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1741 int c, gref, nref; \
1743 if (src >= src_end) \
1744 goto invalid_code; \
1745 ONE_MORE_BYTE_NO_CHECK (c); \
1747 if (c < 0 || c >= 81) \
1748 goto invalid_code; \
1750 gref = c / 9, nref = c % 9; \
1751 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1755 /* Decode a composition rule represented as a component of composition
1756 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1757 and increment BUF. If SRC points an invalid byte sequence, set C
/* Reads two bytes, GREF then NREF (jumping to `invalid_code' when
   fewer than two remain), range-checks both and appends
   COMPOSITION_ENCODE_RULE (GREF, NREF) to BUF.
   NOTE(review): GREF and NREF are each accepted up to 80 here, while
   DECODE_EMACS_MULE_COMPOSITION_RULE_20 derives them as C / 9 and
   C % 9 (0 .. 8) -- confirm the intended range.  Any adjustment of
   the fetched bytes is not visible in this excerpt.  */
1760 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1764 if (src + 1>= src_end) \
1765 goto invalid_code; \
1766 ONE_MORE_BYTE_NO_CHECK (gref); \
1768 ONE_MORE_BYTE_NO_CHECK (nref); \
1770 if (gref < 0 || gref >= 81 \
1771 || nref < 0 || nref >= 81) \
1772 goto invalid_code; \
1773 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
/* Appends annotation entries to BUF; the visible ones are
   coding->produced_char + char_offset and
   CODING_ANNOTATE_COMPOSITION_MASK.  Relies on `coding' and
   `char_offset' of the expanding scope.
   NOTE(review): the entries built from METHOD and NCHARS are not
   visible in this excerpt.  */
1777 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1780 *buf++ = coding->produced_char + char_offset; \
1781 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
/* Decodes an Emacs 21 style composition whose method byte is C.  The
   method is C - 0xF2; the next two bytes minus 0xA0 give NBYTES and
   NCHARS.  Components are then decoded into CHARBUF while
   consumed_chars is below consumed_chars_base + NBYTES; at odd
   positions a rule is decoded instead of a character unless the
   method is COMPOSITION_WITH_ALTCHARS.
   NOTE(review): the format description earlier in this file says
   METHOD is 0xF0 plus the method and CHARS is 0x20 plus the count,
   whereas 0xF2 and 0xA0 are subtracted here -- confirm which is
   right.  Several lines of this macro are not visible in this
   excerpt.  */
1787 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1789 /* Emacs 21 style format. The first three bytes at SRC are \
1790 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1791 the byte length of this composition information, CHARS is the \
1792 number of characters composed by this composition. */ \
1793 enum composition_method method = c - 0xF2; \
1794 int *charbuf_base = charbuf; \
1795 int consumed_chars_limit; \
1796 int nbytes, nchars; \
1798 ONE_MORE_BYTE (c); \
1799 nbytes = c - 0xA0; \
1801 goto invalid_code; \
1802 ONE_MORE_BYTE (c); \
1803 nchars = c - 0xA0; \
1804 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1805 consumed_chars_limit = consumed_chars_base + nbytes; \
1806 if (method != COMPOSITION_RELATIVE) \
1809 while (consumed_chars < consumed_chars_limit) \
1811 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1812 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1814 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1817 if (consumed_chars < consumed_chars_limit) \
1818 goto invalid_code; \
1819 charbuf_base[0] -= i; \
/* Decodes an Emacs 20 style relative composition.  Up to
   MAX_COMPOSITION_COMPONENTS characters are collected in the local
   array COMPONENTS, then the composition annotation is added to
   CHARBUF with method COMPOSITION_RELATIVE and the I collected
   components are copied to CHARBUF.
   NOTE(review): the loop exit and the test that leads to
   `invalid_code' are not visible in this excerpt.  */
1824 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1826 /* Emacs 20 style format for relative composition. */ \
1827 /* Store multibyte form of characters to be composed. */ \
1828 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1829 int *buf = components; \
1833 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1834 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1835 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1837 goto invalid_code; \
1838 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1839 for (j = 0; j < i; j++) \
1840 *charbuf++ = components[j]; \
/* Decodes an Emacs 20 style rule-base composition: a first character,
   then up to MAX_COMPOSITION_COMPONENTS (rule, character) pairs, all
   collected in the local array COMPONENTS; an even number of
   collected entries is rejected as invalid.  The last visible loop
   copies components[j] for even J only (j += 2).
   NOTE(review), points to confirm against the full file:
   - The capacity test jumps to `no_more_source' when
     `charbuf + i + (i / 2) + 1 < charbuf_end', i.e. when the computed
     end lies below CHARBUF_END; the comparison looks inverted for an
     out-of-room check.
   - ADD_COMPOSITION_DATA is handed BUF (the cursor into COMPONENTS),
     whereas DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION hands it
     CHARBUF.
   Several lines of this macro are not visible in this excerpt.  */
1844 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1846 /* Emacs 20 style format for rule-base composition. */ \
1847 /* Store multibyte form of characters to be composed. */ \
1848 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1849 int *buf = components; \
1852 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1853 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1855 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1858 if (i < 1 || (buf - components) % 2 == 0) \
1859 goto invalid_code; \
1860 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1861 goto no_more_source; \
1862 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1863 for (j = 0; j < i; j++) \
1864 *charbuf++ = components[j]; \
1865 for (j = 0; j < i; j += 2) \
1866 *charbuf++ = components[j]; \
/* Decoder for emacs-mule source text.  Bytes are taken from
   coding->source starting at offset coding->consumed; decoded
   characters and composition annotations go to coding->charbuf.
   Visible cases: end-of-line handling depending on EOL_TYPE (Qdos,
   Qmac); a composition sequence, dispatched to one of the three
   DECODE_EMACS_MULE_*_COMPOSITION macros after checking that CHARBUF
   has room for 5 + MAX_COMPOSITION_COMPONENTS * 2 - 1 more entries,
   which also sets coding->annotated; a multibyte character (C below
   0xA0 with emacs_mule_bytes[C] > 1), decoded by emacs_mule_char; and
   a store of C itself or of BYTE8_TO_CHAR (C).  On the way out
   coding->consumed_char, coding->consumed and coding->charbuf_used
   are updated.
   NOTE(review): the loop construct, the byte fetches, the conditions
   selecting among the composition macros and the labels are not
   visible in this excerpt.  */
1871 decode_coding_emacs_mule (coding
)
1872 struct coding_system
*coding
;
1874 unsigned char *src
= coding
->source
+ coding
->consumed
;
1875 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1876 unsigned char *src_base
;
1877 int *charbuf
= coding
->charbuf
;
1878 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1879 int consumed_chars
= 0, consumed_chars_base
;
1880 int char_offset
= 0;
1881 int multibytep
= coding
->src_multibyte
;
1882 Lisp_Object attrs
, eol_type
, charset_list
;
1884 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1891 consumed_chars_base
= consumed_chars
;
1893 if (charbuf
>= charbuf_end
)
1902 if (EQ (eol_type
, Qdos
))
1905 goto no_more_source
;
1909 else if (EQ (eol_type
, Qmac
))
1917 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
/* C - 0xF2 within the composition_method range selects the Emacs 21
   format.  */
1920 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1921 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1922 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1924 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1926 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1929 coding
->annotated
= 1;
1931 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1935 consumed_chars
= consumed_chars_base
;
1936 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
);
1945 consumed_chars
+= nchars
;
1952 consumed_chars
= consumed_chars_base
;
/* Store C unchanged when it is an ASCII byte, otherwise its
   BYTE8_TO_CHAR image.  */
1954 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1959 coding
->consumed_char
+= consumed_chars_base
;
1960 coding
->consumed
= src_base
- coding
->source
;
1961 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
/* Fills CODES[0] and CODES[1] from the emacs-mule charset id ID.  In
   the first case CODES[0] is ID itself and CODES[1] is 0; otherwise
   CODES[1] is ID and CODES[0] is 0x9A for ID < 0xE0, 0x9B for
   ID < 0xF0, 0x9C for ID < 0xF5, and 0x9D in the remaining case.
   These are the values of LEADING_CODE_PRIVATE_11 .. _22.
   NOTE(review): the condition of the first case is not visible in
   this excerpt.  */
1965 #define EMACS_MULE_LEADING_CODES(id, codes) \
1968 codes[0] = id, codes[1] = 0; \
1969 else if (id < 0xE0) \
1970 codes[0] = 0x9A, codes[1] = id; \
1971 else if (id < 0xF0) \
1972 codes[0] = 0x9B, codes[1] = id; \
1973 else if (id < 0xF5) \
1974 codes[0] = 0x9C, codes[1] = id; \
1976 codes[0] = 0x9D, codes[1] = id; \
/* Encoder producing emacs-mule.  Characters are read from
   coding->charbuf and bytes are written starting at
   coding->destination + coding->produced.  An ASCII character is
   emitted as one byte.  For any other character the charset and code
   point are obtained with char_charset from CHARSET_LIST; a fallback
   to coding->default_char (emitted directly when it is ASCII) is
   visible.  The leading code(s) computed by EMACS_MULE_LEADING_CODES
   from the charset's emacs-mule id are emitted first (the second one
   only when nonzero), followed by CODE as one byte or as its high and
   low bytes.  Finally coding->result is set to CODING_RESULT_SUCCESS
   and coding->produced_char and coding->produced are updated.
   NOTE(review): the fetch of C, the condition triggering the
   default_char fallback and the test on DIMENSION are not visible in
   this excerpt.  */
1981 encode_coding_emacs_mule (coding
)
1982 struct coding_system
*coding
;
1984 int multibytep
= coding
->dst_multibyte
;
1985 int *charbuf
= coding
->charbuf
;
1986 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1987 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1988 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1990 int produced_chars
= 0;
1991 Lisp_Object attrs
, eol_type
, charset_list
;
1994 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1996 while (charbuf
< charbuf_end
)
1998 ASSURE_DESTINATION (safe_room
);
2000 if (ASCII_CHAR_P (c
))
2001 EMIT_ONE_ASCII_BYTE (c
);
2004 struct charset
*charset
;
2008 unsigned char leading_codes
[2];
2010 charset
= char_charset (c
, charset_list
, &code
);
2013 c
= coding
->default_char
;
2014 if (ASCII_CHAR_P (c
))
2016 EMIT_ONE_ASCII_BYTE (c
);
2019 charset
= char_charset (c
, charset_list
, &code
);
2021 dimension
= CHARSET_DIMENSION (charset
);
2022 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2023 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2024 EMIT_ONE_BYTE (leading_codes
[0]);
2025 if (leading_codes
[1])
2026 EMIT_ONE_BYTE (leading_codes
[1]);
2028 EMIT_ONE_BYTE (code
);
2031 EMIT_ONE_BYTE (code
>> 8);
2032 EMIT_ONE_BYTE (code
& 0xFF);
2036 coding
->result
= CODING_RESULT_SUCCESS
;
2037 coding
->produced_char
+= produced_chars
;
2038 coding
->produced
= dst
- coding
->destination
;
2043 /*** 7. ISO2022 handlers ***/
2045 /* The following note describes the coding system ISO2022 briefly.
2046 Since the intention of this note is to help understand the
2047 functions in this file, some parts are NOT ACCURATE or OVERLY
2048 SIMPLIFIED. For thorough understanding, please refer to the
2049 original document of ISO2022.
2051 ISO2022 provides many mechanisms to encode several character sets
2052 in 7-bit and 8-bit environments. For 7-bit environments, all text
2053 is encoded using bytes less than 128. This may make the encoded
2054 text a little bit longer, but the text passes more easily through
2055 several gateways, some of which strip off the MSB (Most Significant Bit).
2057 There are two kinds of character sets: control character set and
2058 graphic character set. The former contains control characters such
2059 as `newline' and `escape' to provide control functions (control
2060 functions are also provided by escape sequences). The latter
2061 contains graphic characters such as 'A' and '-'. Emacs recognizes
2062 two control character sets and many graphic character sets.
2064 Graphic character sets are classified into one of the following
2065 four classes, according to the number of bytes (DIMENSION) and
2066 number of characters in one dimension (CHARS) of the set:
2067 - DIMENSION1_CHARS94
2068 - DIMENSION1_CHARS96
2069 - DIMENSION2_CHARS94
2070 - DIMENSION2_CHARS96
2072 In addition, each character set is assigned an identification tag,
2073 unique for each set, called "final character" (denoted as <F>
2074 hereafter). The <F> of each character set is decided by ECMA(*)
2075 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2076 (0x30..0x3F are for private use only).
2078 Note (*): ECMA = European Computer Manufacturers Association
2080 Here are examples of graphic character set [NAME(<F>)]:
2081 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2082 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2083 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2084 o DIMENSION2_CHARS96 -- none for the moment
2086 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2087 C0 [0x00..0x1F] -- control character plane 0
2088 GL [0x20..0x7F] -- graphic character plane 0
2089 C1 [0x80..0x9F] -- control character plane 1
2090 GR [0xA0..0xFF] -- graphic character plane 1
2092 A control character set is directly designated and invoked to C0 or
2093 C1 by an escape sequence. The most common case is that:
2094 - ISO646's control character set is designated/invoked to C0, and
2095 - ISO6429's control character set is designated/invoked to C1,
2096 and usually these designations/invocations are omitted in encoded
2097 text. In a 7-bit environment, only C0 can be used, and a control
2098 character for C1 is encoded by an appropriate escape sequence to
2099 fit into the environment. All control characters for C1 are
2100 defined to have corresponding escape sequences.
2102 A graphic character set is at first designated to one of four
2103 graphic registers (G0 through G3), then these graphic registers are
2104 invoked to GL or GR. These designations and invocations can be
2105 done independently. The most common case is that G0 is invoked to
2106 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2107 these invocations and designations are omitted in encoded text.
2108 In a 7-bit environment, only GL can be used.
2110 When a graphic character set of CHARS94 is invoked to GL, codes
2111 0x20 and 0x7F of the GL area work as control characters SPACE and
2112 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2115 There are two ways of invocation: locking-shift and single-shift.
2116 With locking-shift, the invocation lasts until the next different
2117 invocation, whereas with single-shift, the invocation affects the
2118 following character only and doesn't affect the locking-shift
2119 state. Invocations are done by the following control characters or
2122 ----------------------------------------------------------------------
2123 abbrev function cntrl escape seq description
2124 ----------------------------------------------------------------------
2125 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2126 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2127 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2128 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2129 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2130 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2131 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2132 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2133 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2134 ----------------------------------------------------------------------
2135 (*) These are not used by any known coding system.
2137 Control characters for these functions are defined by macros
2138 ISO_CODE_XXX in `coding.h'.
2140 Designations are done by the following escape sequences:
2141 ----------------------------------------------------------------------
2142 escape sequence description
2143 ----------------------------------------------------------------------
2144 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2145 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2146 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2147 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2148 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2149 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2150 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2151 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2152 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2153 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2154 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2155 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2156 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2157 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2158 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2159 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2160 ----------------------------------------------------------------------
2162 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2163 of dimension 1, chars 94, and final character <F>, etc...
2165 Note (*): Although these designations are not allowed in ISO2022,
2166 Emacs accepts them on decoding, and produces them on encoding
2167 CHARS96 character sets in a coding system which is characterized as
2168 7-bit environment, non-locking-shift, and non-single-shift.
2170 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2171 '(' must be omitted. We refer to this as "short-form" hereafter.
2173 Now you may notice that there are a lot of ways for encoding the
2174 same multilingual text in ISO2022. Actually, there exist many
2175 coding systems such as Compound Text (used in X11's inter-client
2176 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2177 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2178 localized platforms), and all of these are variants of ISO2022.
2180 In addition to the above, Emacs handles two more kinds of escape
2181 sequences: ISO6429's direction specification and Emacs' private
2182 sequence for specifying character composition.
2184 ISO6429's direction specification takes the following form:
2185 o CSI ']' -- end of the current direction
2186 o CSI '0' ']' -- end of the current direction
2187 o CSI '1' ']' -- start of left-to-right text
2188 o CSI '2' ']' -- start of right-to-left text
2189 The control character CSI (0x9B: control sequence introducer) is
2190 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2192 Character composition specification takes the following form:
2193 o ESC '0' -- start relative composition
2194 o ESC '1' -- end composition
2195 o ESC '2' -- start rule-base composition (*)
2196 o ESC '3' -- start relative composition with alternate chars (**)
2197 o ESC '4' -- start rule-base composition with alternate chars (**)
2198 Since these are not standard escape sequences of any ISO standard,
2199 the use of them for these meaning is restricted to Emacs only.
2201 (*) This form is used only in Emacs 20.5 and the older versions,
2202 but the newer versions can safely decode it.
2203 (**) This form is used only in Emacs 21.1 and the newer versions,
2204 and the older versions can't decode it.
2206 Here's a list of example usages of these composition escape
2207 sequences (categorized by `enum composition_method').
2209 COMPOSITION_RELATIVE:
2210 ESC 0 CHAR [ CHAR ] ESC 1
2211 COMPOSITION_WITH_RULE:
2212 ESC 2 CHAR [ RULE CHAR ] ESC 1
2213 COMPOSITION_WITH_ALTCHARS:
2214 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2215 COMPOSITION_WITH_RULE_ALTCHARS:
2216 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2218 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID is within CODING's safe_charsets table
   (ID <= max_charset_id) and its entry there is non-negative.  The
   table entry is only read when ID is within the table.  */
#define SAFE_CHARSET_P(coding, id)			\
  (! ((coding)->max_charset_id < (id)			\
      || 0 > (coding)->safe_charsets[id]))

/* Nonzero if CODING_ISO_INITIAL (..., 1) of the coding system
   registered for CATEGORY in coding_categories is non-negative.  */
#define SHIFT_OUT_OK(category)				\
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
/* Build the safe-charsets string for the coding-system attribute
   vector ATTRS and store it in the coding_attr_safe_charsets slot.
   If the ISO flags contain CODING_ISO_FLAG_FULL_SUPPORT and the
   charset list is not Viso_2022_charset_list, the list is first
   replaced by Viso_2022_charset_list and the slot reset to Qnil.  The
   string has max_charset_id + 1 bytes, where max_charset_id is the
   largest id in the charset list.  For every charset id in the list
   the byte at index id is set to one of: the register found for id in
   the coding_attr_iso_request alist, REG96 (the cdr of
   coding_attr_iso_usage), or REG94 (its car); the REG96 branch is
   guarded by charset->iso_chars_96.
   NOTE(review): what happens when the slot already holds a string,
   the fill byte of the new string, and the remaining branch
   conditions are not visible in this excerpt.  */
2229 setup_iso_safe_charsets (attrs
)
2232 Lisp_Object charset_list
, safe_charsets
;
2233 Lisp_Object request
;
2234 Lisp_Object reg_usage
;
2237 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2240 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2241 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2242 && ! EQ (charset_list
, Viso_2022_charset_list
))
2244 CODING_ATTR_CHARSET_LIST (attrs
)
2245 = charset_list
= Viso_2022_charset_list
;
2246 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2249 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
/* First pass: find the largest charset id to size the string.  */
2253 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2255 int id
= XINT (XCAR (tail
));
2256 if (max_charset_id
< id
)
2257 max_charset_id
= id
;
2260 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2262 request
= AREF (attrs
, coding_attr_iso_request
);
2263 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2264 reg94
= XINT (XCAR (reg_usage
));
2265 reg96
= XINT (XCDR (reg_usage
));
/* Second pass: record a register number for each charset id.  */
2267 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2271 struct charset
*charset
;
2274 charset
= CHARSET_FROM_ID (XINT (id
));
2275 reg
= Fcdr (Fassq (id
, request
));
2277 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2278 else if (charset
->iso_chars_96
)
2281 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2286 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2289 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2294 Check if a text is encoded in ISO2022. If it is, returns an
2295 integer in which appropriate flag bits any of:
2297 CATEGORY_MASK_ISO_7_TIGHT
2298 CATEGORY_MASK_ISO_8_1
2299 CATEGORY_MASK_ISO_8_2
2300 CATEGORY_MASK_ISO_7_ELSE
2301 CATEGORY_MASK_ISO_8_ELSE
2302 are set. If a code which should never appear in ISO2022 is found,
2306 detect_coding_iso_2022 (coding
, mask
)
2307 struct coding_system
*coding
;
2310 unsigned char *src
= coding
->source
, *src_base
= src
;
2311 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2312 int multibytep
= coding
->src_multibyte
;
2313 int mask_iso
= CATEGORY_MASK_ISO
;
2314 int mask_found
= 0, mask_8bit_found
= 0;
2315 int reg
[4], shift_out
= 0, single_shifting
= 0;
2318 int consumed_chars
= 0;
2321 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2323 struct coding_system
*this = &(coding_categories
[i
]);
2324 Lisp_Object attrs
, val
;
2326 attrs
= CODING_ID_ATTRS (this->id
);
2327 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2328 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2329 setup_iso_safe_charsets (attrs
);
2330 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2331 this->max_charset_id
= XSTRING (val
)->size
- 1;
2332 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2335 /* A coding system of this category is always ASCII compatible. */
2336 src
+= coding
->head_ascii
;
2338 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2339 while (mask_iso
&& src
< src_end
)
2345 if (inhibit_iso_escape_detection
)
2347 single_shifting
= 0;
2349 if (c
>= '(' && c
<= '/')
2351 /* Designation sequence for a charset of dimension 1. */
2353 if (c1
< ' ' || c1
>= 0x80
2354 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2355 /* Invalid designation sequence. Just ignore. */
2357 reg
[(c
- '(') % 4] = id
;
2361 /* Designation sequence for a charset of dimension 2. */
2363 if (c
>= '@' && c
<= 'B')
2364 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2365 reg
[0] = id
= iso_charset_table
[1][0][c
];
2366 else if (c
>= '(' && c
<= '/')
2369 if (c1
< ' ' || c1
>= 0x80
2370 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2371 /* Invalid designation sequence. Just ignore. */
2373 reg
[(c
- '(') % 4] = id
;
2376 /* Invalid designation sequence. Just ignore. */
2379 else if (c
== 'N' || c
== 'O')
2381 /* ESC <Fe> for SS2 or SS3. */
2382 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2385 else if (c
>= '0' && c
<= '4')
2387 /* ESC <Fp> for start/end composition. */
2388 mask_found
|= CATEGORY_MASK_ISO
;
2393 /* Invalid escape sequence. */
2394 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2398 /* We found a valid designation sequence for CHARSET. */
2399 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2400 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2402 mask_found
|= CATEGORY_MASK_ISO_7
;
2404 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2405 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2407 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2409 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2410 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2412 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2414 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2415 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2417 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2419 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2423 if (inhibit_iso_escape_detection
)
2425 single_shifting
= 0;
2428 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2429 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2431 /* Locking shift out. */
2432 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2433 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2438 if (inhibit_iso_escape_detection
)
2440 single_shifting
= 0;
2443 /* Locking shift in. */
2444 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2445 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2450 single_shifting
= 0;
2454 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2456 if (inhibit_iso_escape_detection
)
2458 if (c
!= ISO_CODE_CSI
)
2460 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2461 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2462 newmask
|= CATEGORY_MASK_ISO_8_1
;
2463 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2464 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2465 newmask
|= CATEGORY_MASK_ISO_8_2
;
2466 single_shifting
= 1;
2468 if (VECTORP (Vlatin_extra_code_table
)
2469 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2471 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2472 & CODING_ISO_FLAG_LATIN_EXTRA
)
2473 newmask
|= CATEGORY_MASK_ISO_8_1
;
2474 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2475 & CODING_ISO_FLAG_LATIN_EXTRA
)
2476 newmask
|= CATEGORY_MASK_ISO_8_2
;
2478 mask_iso
&= newmask
;
2479 mask_found
|= newmask
;
2486 single_shifting
= 0;
2491 single_shifting
= 0;
2492 mask_8bit_found
= 1;
2493 if (VECTORP (Vlatin_extra_code_table
)
2494 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2498 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2499 & CODING_ISO_FLAG_LATIN_EXTRA
)
2500 newmask
|= CATEGORY_MASK_ISO_8_1
;
2501 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2502 & CODING_ISO_FLAG_LATIN_EXTRA
)
2503 newmask
|= CATEGORY_MASK_ISO_8_2
;
2504 mask_iso
&= newmask
;
2505 mask_found
|= newmask
;
2512 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2513 | CATEGORY_MASK_ISO_7_ELSE
);
2514 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2515 mask_8bit_found
= 1;
2516 /* Check the length of succeeding codes of the range
2517 0xA0..0xFF. If the byte length is odd, we exclude
2518 CATEGORY_MASK_ISO_8_2. We can check this only
2519 when we are not single shifting. */
2520 if (!single_shifting
2521 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2524 while (src
< src_end
)
2532 if (i
& 1 && src
< src_end
)
2533 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2535 mask_found
|= CATEGORY_MASK_ISO_8_2
;
2544 *mask
&= ~CATEGORY_MASK_ISO
;
2549 *mask
&= mask_iso
& mask_found
;
2550 if (! mask_8bit_found
)
2551 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2556 /* Set designation state into CODING. */
2557 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2561 if (final < '0' || final >= 128 \
2562 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2563 || !SAFE_CHARSET_P (coding, id)) \
2565 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2566 goto invalid_code; \
2568 prev = CODING_ISO_DESIGNATION (coding, reg); \
2569 if (id == charset_jisx0201_roman) \
2571 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2572 id = charset_ascii; \
2574 else if (id == charset_jisx0208_1978) \
2576 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2577 id = charset_jisx0208; \
2579 CODING_ISO_DESIGNATION (coding, reg) = id; \
2580 /* If there was an invalid designation to REG previously, and this \
2581 designation is ASCII to REG, we should keep this designation \
2583 if (prev == -2 && id == charset_ascii) \
2584 goto invalid_code; \
2588 #define MAYBE_FINISH_COMPOSITION() \
2591 if (composition_state == COMPOSING_NO) \
2593 /* It is assured that we have enough room for producing \
2594 characters stored in the table `components'. */ \
2595 if (charbuf + component_idx > charbuf_end) \
2596 goto no_more_source; \
2597 composition_state = COMPOSING_NO; \
2598 if (method == COMPOSITION_RELATIVE \
2599 || method == COMPOSITION_WITH_ALTCHARS) \
2601 for (i = 0; i < component_idx; i++) \
2602 *charbuf++ = components[i]; \
2603 char_offset += component_idx; \
2607 for (i = 0; i < component_idx; i += 2) \
2608 *charbuf++ = components[i]; \
2609 char_offset += (component_idx / 2) + 1; \
2614 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2615 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2616 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2617 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2618 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2621 #define DECODE_COMPOSITION_START(c1) \
2624 && composition_state == COMPOSING_COMPONENT_RULE) \
2626 component_len = component_idx; \
2627 composition_state = COMPOSING_CHAR; \
2633 MAYBE_FINISH_COMPOSITION (); \
2634 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2635 goto no_more_source; \
2636 for (p = src; p < src_end - 1; p++) \
2637 if (*p == ISO_CODE_ESC && p[1] == '1') \
2639 if (p == src_end - 1) \
2641 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2642 goto invalid_code; \
2643 goto no_more_source; \
2646 /* This is surely the start of a composition. */ \
2647 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2648 : c1 == '2' ? COMPOSITION_WITH_RULE \
2649 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2650 : COMPOSITION_WITH_RULE_ALTCHARS); \
2651 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2652 : COMPOSING_COMPONENT_CHAR); \
2653 component_idx = component_len = 0; \
2658 /* Handle composition end sequence ESC 1. */
2660 #define DECODE_COMPOSITION_END() \
2662 int nchars = (component_len > 0 ? component_idx - component_len \
2663 : method == COMPOSITION_RELATIVE ? component_idx \
2664 : (component_idx + 1) / 2); \
2666 int *saved_charbuf = charbuf; \
2668 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2669 if (method != COMPOSITION_RELATIVE) \
2671 if (component_len == 0) \
2672 for (i = 0; i < component_idx; i++) \
2673 *charbuf++ = components[i]; \
2675 for (i = 0; i < component_len; i++) \
2676 *charbuf++ = components[i]; \
2677 *saved_charbuf = saved_charbuf - charbuf; \
2679 if (method == COMPOSITION_WITH_RULE) \
2680 for (i = 0; i < component_idx; i += 2, char_offset++) \
2681 *charbuf++ = components[i]; \
2683 for (i = component_len; i < component_idx; i++, char_offset++) \
2684 *charbuf++ = components[i]; \
2685 coding->annotated = 1; \
2686 composition_state = COMPOSING_NO; \
2690 /* Decode a composition rule from the byte C1 (and maybe one more byte
2691 from SRC) and store one encoded composition rule in
2692 coding->cmp_data. */
2694 #define DECODE_COMPOSITION_RULE(c1) \
2697 if (c1 < 81) /* old format (before ver.21) */ \
2699 int gref = (c1) / 9; \
2700 int nref = (c1) % 9; \
2701 if (gref == 4) gref = 10; \
2702 if (nref == 4) nref = 10; \
2703 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2705 else if (c1 < 93) /* new format (after ver.21) */ \
2707 ONE_MORE_BYTE (c2); \
2708 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2718 decode_coding_iso_2022 (coding
)
2719 struct coding_system
*coding
;
2721 unsigned char *src
= coding
->source
+ coding
->consumed
;
2722 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2723 unsigned char *src_base
;
2724 int *charbuf
= coding
->charbuf
;
2725 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2726 int consumed_chars
= 0, consumed_chars_base
;
2727 int char_offset
= 0;
2728 int multibytep
= coding
->src_multibyte
;
2729 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2730 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2731 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2732 struct charset
*charset
;
2734 /* For handling composition sequence. */
2735 #define COMPOSING_NO 0
2736 #define COMPOSING_CHAR 1
2737 #define COMPOSING_RULE 2
2738 #define COMPOSING_COMPONENT_CHAR 3
2739 #define COMPOSING_COMPONENT_RULE 4
2741 int composition_state
= COMPOSING_NO
;
2742 enum composition_method method
;
2743 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2746 Lisp_Object attrs
, eol_type
, charset_list
;
2748 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2749 setup_iso_safe_charsets (attrs
);
2756 consumed_chars_base
= consumed_chars
;
2758 if (charbuf
>= charbuf_end
)
2763 /* We produce no character or one character. */
2764 switch (iso_code_class
[c1
])
2766 case ISO_0x20_or_0x7F
:
2767 if (composition_state
!= COMPOSING_NO
)
2769 if (composition_state
== COMPOSING_RULE
2770 || composition_state
== COMPOSING_COMPONENT_RULE
)
2772 DECODE_COMPOSITION_RULE (c1
);
2773 components
[component_idx
++] = c1
;
2774 composition_state
--;
2778 if (charset_id_0
< 0
2779 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2780 /* This is SPACE or DEL. */
2781 charset
= CHARSET_FROM_ID (charset_ascii
);
2783 charset
= CHARSET_FROM_ID (charset_id_0
);
2786 case ISO_graphic_plane_0
:
2787 if (composition_state
!= COMPOSING_NO
)
2789 if (composition_state
== COMPOSING_RULE
2790 || composition_state
== COMPOSING_COMPONENT_RULE
)
2792 DECODE_COMPOSITION_RULE (c1
);
2793 components
[component_idx
++] = c1
;
2794 composition_state
--;
2798 charset
= CHARSET_FROM_ID (charset_id_0
);
2801 case ISO_0xA0_or_0xFF
:
2802 if (charset_id_1
< 0
2803 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2804 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2806 /* This is a graphic character, we fall down ... */
2808 case ISO_graphic_plane_1
:
2809 if (charset_id_1
< 0)
2811 charset
= CHARSET_FROM_ID (charset_id_1
);
2814 case ISO_carriage_return
:
2817 if (EQ (eol_type
, Qdos
))
2820 goto no_more_source
;
2824 else if (EQ (eol_type
, Qmac
))
2830 MAYBE_FINISH_COMPOSITION ();
2831 charset
= CHARSET_FROM_ID (charset_ascii
);
2835 MAYBE_FINISH_COMPOSITION ();
2839 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2840 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2842 CODING_ISO_INVOCATION (coding
, 0) = 1;
2843 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2847 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2849 CODING_ISO_INVOCATION (coding
, 0) = 0;
2850 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2853 case ISO_single_shift_2_7
:
2854 case ISO_single_shift_2
:
2855 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2857 /* SS2 is handled as an escape sequence of ESC 'N' */
2859 goto label_escape_sequence
;
2861 case ISO_single_shift_3
:
2862 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2864 /* SS3 is handled as an escape sequence of ESC 'O' */
2866 goto label_escape_sequence
;
2868 case ISO_control_sequence_introducer
:
2869 /* CSI is handled as an escape sequence of ESC '[' ... */
2871 goto label_escape_sequence
;
2875 label_escape_sequence
:
2876 /* Escape sequences handled here are invocation,
2877 designation, direction specification, and character
2878 composition specification. */
2881 case '&': /* revision of following character set */
2883 if (!(c1
>= '@' && c1
<= '~'))
2886 if (c1
!= ISO_CODE_ESC
)
2889 goto label_escape_sequence
;
2891 case '$': /* designation of 2-byte character set */
2892 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2895 if (c1
>= '@' && c1
<= 'B')
2896 { /* designation of JISX0208.1978, GB2312.1980,
2898 DECODE_DESIGNATION (0, 2, 0, c1
);
2900 else if (c1
>= 0x28 && c1
<= 0x2B)
2901 { /* designation of DIMENSION2_CHARS94 character set */
2903 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2905 else if (c1
>= 0x2C && c1
<= 0x2F)
2906 { /* designation of DIMENSION2_CHARS96 character set */
2908 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2912 /* We must update these variables now. */
2913 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2914 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2917 case 'n': /* invocation of locking-shift-2 */
2918 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2919 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2921 CODING_ISO_INVOCATION (coding
, 0) = 2;
2922 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2925 case 'o': /* invocation of locking-shift-3 */
2926 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2927 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2929 CODING_ISO_INVOCATION (coding
, 0) = 3;
2930 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2933 case 'N': /* invocation of single-shift-2 */
2934 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2935 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2937 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2939 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2943 case 'O': /* invocation of single-shift-3 */
2944 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2945 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2947 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2949 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2953 case '0': case '2': case '3': case '4': /* start composition */
2954 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2956 DECODE_COMPOSITION_START (c1
);
2959 case '1': /* end composition */
2960 if (composition_state
== COMPOSING_NO
)
2962 DECODE_COMPOSITION_END ();
2965 case '[': /* specification of direction */
2966 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
2968 /* For the moment, nested direction is not supported.
2969 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2970 left-to-right, and nonzero means right-to-left. */
2974 case ']': /* end of the current direction */
2975 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2977 case '0': /* end of the current direction */
2978 case '1': /* start of left-to-right direction */
2981 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2986 case '2': /* start of right-to-left direction */
2989 coding
->mode
|= CODING_MODE_DIRECTION
;
3000 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3002 if (c1
>= 0x28 && c1
<= 0x2B)
3003 { /* designation of DIMENSION1_CHARS94 character set */
3005 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3007 else if (c1
>= 0x2C && c1
<= 0x2F)
3008 { /* designation of DIMENSION1_CHARS96 character set */
3010 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3014 /* We must update these variables now. */
3015 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3016 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3021 /* Now we know CHARSET and 1st position code C1 of a character.
3022 Produce a decoded character while getting 2nd position code
3025 if (CHARSET_DIMENSION (charset
) > 1)
3028 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3029 /* C2 is not in a valid range. */
3031 c1
= (c1
<< 8) | (c2
& 0x7F);
3032 if (CHARSET_DIMENSION (charset
) > 2)
3035 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3036 /* C2 is not in a valid range. */
3038 c1
= (c1
<< 8) | (c2
& 0x7F);
3042 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3045 MAYBE_FINISH_COMPOSITION ();
3046 for (; src_base
< src
; src_base
++, char_offset
++)
3048 if (ASCII_BYTE_P (*src_base
))
3049 *charbuf
++ = *src_base
;
3051 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3054 else if (composition_state
== COMPOSING_NO
)
3061 components
[component_idx
++] = c
;
3062 if (method
== COMPOSITION_WITH_RULE
3063 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3064 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3065 composition_state
++;
3070 MAYBE_FINISH_COMPOSITION ();
3072 consumed_chars
= consumed_chars_base
;
3074 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3079 coding
->consumed_char
+= consumed_chars_base
;
3080 coding
->consumed
= src_base
- coding
->source
;
3081 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3085 /* ISO2022 encoding stuff. */
3088 It is not enough to say just "ISO2022" on encoding, we have to
3089 specify more details. In Emacs, each coding system of ISO2022
3090 variant has the following specifications:
3091 1. Initial designation to G0 thru G3.
3092 2. Allows short-form designation?
3093 3. ASCII should be designated to G0 before control characters?
3094 4. ASCII should be designated to G0 at end of line?
3095 5. 7-bit environment or 8-bit environment?
3096 6. Use locking-shift?
3097 7. Use Single-shift?
3098 And the following two are only for Japanese:
3099 8. Use ASCII in place of JIS0201-1976-Roman?
3100 9. Use JISX0208-1983 in place of JISX0208-1978?
3101 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3102 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3106 /* Produce codes (escape sequence) for designating CHARSET to graphic
3107 register REG at DST, and increment DST. If <final-char> of CHARSET is
3108 '@', 'A', or 'B' and the coding system CODING allows, produce
3109 designation sequence of short-form. */
3111 #define ENCODE_DESIGNATION(charset, reg, coding) \
3113 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3114 char *intermediate_char_94 = "()*+"; \
3115 char *intermediate_char_96 = ",-./"; \
3116 int revision = -1; \
3119 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3120 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3122 if (revision >= 0) \
3124 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3125 EMIT_ONE_BYTE ('@' + revision); \
3127 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3128 if (CHARSET_DIMENSION (charset) == 1) \
3130 if (! CHARSET_ISO_CHARS_96 (charset)) \
3131 c = intermediate_char_94[reg]; \
3133 c = intermediate_char_96[reg]; \
3134 EMIT_ONE_ASCII_BYTE (c); \
3138 EMIT_ONE_ASCII_BYTE ('$'); \
3139 if (! CHARSET_ISO_CHARS_96 (charset)) \
3141 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3143 || final_char < '@' || final_char > 'B') \
3144 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3147 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3149 EMIT_ONE_ASCII_BYTE (final_char); \
3151 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3155 /* The following two macros produce codes (control character or escape
3156 sequence) for ISO2022 single-shift functions (single-shift-2 and
3159 #define ENCODE_SINGLE_SHIFT_2 \
3161 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3162 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3164 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3165 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3169 #define ENCODE_SINGLE_SHIFT_3 \
3171 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3172 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3174 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3175 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3179 /* The following four macros produce codes (control character or
3180 escape sequence) for ISO2022 locking-shift functions (shift-in,
3181 shift-out, locking-shift-2, and locking-shift-3). */
3183 #define ENCODE_SHIFT_IN \
3185 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3186 CODING_ISO_INVOCATION (coding, 0) = 0; \
3190 #define ENCODE_SHIFT_OUT \
3192 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3193 CODING_ISO_INVOCATION (coding, 0) = 1; \
3197 #define ENCODE_LOCKING_SHIFT_2 \
3199 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3200 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit the locking-shift-3 sequence ESC 'o' and record that graphic
   register 3 is now invoked to graphic plane 0 of CODING.

   The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   ISO-2022 decoder in this file treats ESC 'o' as the invocation of
   locking-shift-3.  Emitting 'n' here would make the produced byte
   stream disagree with the invocation state recorded below.

   The expansion site must have `coding' in scope, together with
   whatever EMIT_TWO_ASCII_BYTES itself relies on.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3211 /* Produce codes for a DIMENSION1 character whose character set is
3212 CHARSET and whose position-code is C1. Designation and invocation
3213 sequences are also produced in advance if necessary. */
3215 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3217 int id = CHARSET_ID (charset); \
3219 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3220 && id == charset_ascii) \
3222 id = charset_jisx0201_roman; \
3223 charset = CHARSET_FROM_ID (id); \
3226 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3228 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3229 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3231 EMIT_ONE_BYTE (c1 | 0x80); \
3232 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3235 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3237 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3240 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3242 EMIT_ONE_BYTE (c1 | 0x80); \
3246 /* Since CHARSET is not yet invoked to any graphic planes, we \
3247 must invoke it, or, at first, designate it to some graphic \
3248 register. Then repeat the loop to actually produce the \
3250 dst = encode_invocation_designation (charset, coding, dst, \
3255 /* Produce codes for a DIMENSION2 character whose character set is
3256 CHARSET and whose position-codes are C1 and C2. Designation and
3257 invocation codes are also produced in advance if necessary. */
3259 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3261 int id = CHARSET_ID (charset); \
3263 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3264 && id == charset_jisx0208) \
3266 id = charset_jisx0208_1978; \
3267 charset = CHARSET_FROM_ID (id); \
3270 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3272 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3273 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3275 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3276 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3279 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3281 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3284 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3286 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3290 /* Since CHARSET is not yet invoked to any graphic planes, we \
3291 must invoke it, or, at first, designate it to some graphic \
3292 register. Then repeat the loop to actually produce the \
3294 dst = encode_invocation_designation (charset, coding, dst, \
3299 #define ENCODE_ISO_CHARACTER(charset, c) \
3301 int code = ENCODE_CHAR ((charset),(c)); \
3303 if (CHARSET_DIMENSION (charset) == 1) \
3304 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3306 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3310 /* Produce designation and invocation codes at a place pointed by DST
3311 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3315 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3316 struct charset
*charset
;
3317 struct coding_system
*coding
;
3321 int multibytep
= coding
->dst_multibyte
;
3322 int produced_chars
= *p_nchars
;
3323 int reg
; /* graphic register number */
3324 int id
= CHARSET_ID (charset
);
3326 /* At first, check designations. */
3327 for (reg
= 0; reg
< 4; reg
++)
3328 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3333 /* CHARSET is not yet designated to any graphic registers. */
3334 /* At first check the requested designation. */
3335 reg
= CODING_ISO_REQUEST (coding
, id
);
3337 /* Since CHARSET requests no special designation, designate it
3338 to graphic register 0. */
3341 ENCODE_DESIGNATION (charset
, reg
, coding
);
3344 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3345 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3347 /* Since the graphic register REG is not invoked to any graphic
3348 planes, invoke it to graphic plane 0. */
3351 case 0: /* graphic register 0 */
3355 case 1: /* graphic register 1 */
3359 case 2: /* graphic register 2 */
3360 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3361 ENCODE_SINGLE_SHIFT_2
;
3363 ENCODE_LOCKING_SHIFT_2
;
3366 case 3: /* graphic register 3 */
3367 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3368 ENCODE_SINGLE_SHIFT_3
;
3370 ENCODE_LOCKING_SHIFT_3
;
3375 *p_nchars
= produced_chars
;
3379 /* The following three macros produce codes for indicating direction
/* Emit a control sequence introducer: the two-byte form ESC '[' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single
   byte ISO_CODE_CSI.

   The flag word holds several independent bits, so the seven-bit
   flag is tested with `&'.  Comparing the whole word with `==' would
   only match when no other flag is set.

   The expansion site must have `coding' in scope, together with
   whatever the EMIT_* macros rely on.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.  The decoder in this file
   treats '2' after the introducer as the start of right-to-left
   direction.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave stray tokens behind its expansion.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
/* Emit the sequence that returns to left-to-right text: a control
   sequence introducer followed by '0' ']'.  The decoder in this file
   treats '0' after the introducer as the end of the current
   direction and clears CODING_MODE_DIRECTION.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave stray tokens behind its expansion.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3404 /* Produce codes for designation and invocation to reset the graphic
3405 planes and registers to initial state. */
3406 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3409 struct charset *charset; \
3411 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3413 for (reg = 0; reg < 4; reg++) \
3414 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3415 && (CODING_ISO_DESIGNATION (coding, reg) \
3416 != CODING_ISO_INITIAL (coding, reg))) \
3418 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3419 ENCODE_DESIGNATION (charset, reg, coding); \
3424 /* Produce designation sequences of charsets in the line started from
3425 SRC to a place pointed by DST, and return updated DST.
3427 If the current block ends before any end-of-line, we may fail to
3428 find all the necessary designations. */
3430 static unsigned char *
3431 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3432 struct coding_system
*coding
;
3433 int *charbuf
, *charbuf_end
;
3436 struct charset
*charset
;
3437 /* Table of charsets to be designated to each graphic register. */
3439 int c
, found
= 0, reg
;
3440 int produced_chars
= 0;
3441 int multibytep
= coding
->dst_multibyte
;
3443 Lisp_Object charset_list
;
3445 attrs
= CODING_ID_ATTRS (coding
->id
);
3446 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3447 if (EQ (charset_list
, Qiso_2022
))
3448 charset_list
= Viso_2022_charset_list
;
3450 for (reg
= 0; reg
< 4; reg
++)
3460 charset
= char_charset (c
, charset_list
, NULL
);
3461 id
= CHARSET_ID (charset
);
3462 reg
= CODING_ISO_REQUEST (coding
, id
);
3463 if (reg
>= 0 && r
[reg
] < 0)
3472 for (reg
= 0; reg
< 4; reg
++)
3474 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3475 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3484 encode_coding_iso_2022 (coding
)
3485 struct coding_system
*coding
;
3487 int multibytep
= coding
->dst_multibyte
;
3488 int *charbuf
= coding
->charbuf
;
3489 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3490 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3491 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3494 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3495 && CODING_ISO_BOL (coding
));
3496 int produced_chars
= 0;
3497 Lisp_Object attrs
, eol_type
, charset_list
;
3498 int ascii_compatible
;
3501 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3502 setup_iso_safe_charsets (attrs
);
3503 coding
->safe_charsets
3504 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3506 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3508 while (charbuf
< charbuf_end
)
3510 ASSURE_DESTINATION (safe_room
);
3512 if (bol_designation
)
3514 unsigned char *dst_prev
= dst
;
3516 /* We have to produce designation sequences if any now. */
3517 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3518 bol_designation
= 0;
3519 /* We are sure that designation sequences are all ASCII bytes. */
3520 produced_chars
+= dst
- dst_prev
;
3525 /* Now encode the character C. */
3526 if (c
< 0x20 || c
== 0x7F)
3529 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3531 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3532 ENCODE_RESET_PLANE_AND_REGISTER ();
3533 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3537 for (i
= 0; i
< 4; i
++)
3538 CODING_ISO_DESIGNATION (coding
, i
)
3539 = CODING_ISO_INITIAL (coding
, i
);
3542 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3544 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3545 ENCODE_RESET_PLANE_AND_REGISTER ();
3546 EMIT_ONE_ASCII_BYTE (c
);
3548 else if (ASCII_CHAR_P (c
))
3550 if (ascii_compatible
)
3551 EMIT_ONE_ASCII_BYTE (c
);
3554 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3555 ENCODE_ISO_CHARACTER (charset
, c
);
3560 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3564 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3566 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3567 charset
= CHARSET_FROM_ID (charset_ascii
);
3571 c
= coding
->default_char
;
3572 charset
= char_charset (c
, charset_list
, NULL
);
3575 ENCODE_ISO_CHARACTER (charset
, c
);
3579 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3580 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3582 ASSURE_DESTINATION (safe_room
);
3583 ENCODE_RESET_PLANE_AND_REGISTER ();
3585 coding
->result
= CODING_RESULT_SUCCESS
;
3586 CODING_ISO_BOL (coding
) = bol_designation
;
3587 coding
->produced_char
+= produced_chars
;
3588 coding
->produced
= dst
- coding
->destination
;
3593 /*** 8,9. SJIS and BIG5 handlers ***/
3595 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3596 quite widely. So, for the moment, Emacs supports them in the bare
3597 C code. But, in the future, they may be supported only by CCL. */
3599 /* SJIS is a coding system encoding three character sets: ASCII, right
3600 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3601 as is. A character of charset katakana-jisx0201 is encoded by
3602 "position-code + 0x80". A character of charset japanese-jisx0208
3603 is encoded in 2-byte but two position-codes are divided and shifted
3604 so that they fit in the range below.
3606 --- CODE RANGE of SJIS ---
3607 (character set) (range)
3609 KATAKANA-JISX0201 0xA0 .. 0xDF
3610 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3611 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3612 -------------------------------
3616 /* BIG5 is a coding system encoding two character sets: ASCII and
3617 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3618 character set and is encoded in two-byte.
3620 --- CODE RANGE of BIG5 ---
3621 (character set) (range)
3623 Big5 (1st byte) 0xA1 .. 0xFE
3624 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3625 --------------------------
3629 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3630 Check if a text is encoded in SJIS. If it is, return
3631 CATEGORY_MASK_SJIS, else return 0. */
3634 detect_coding_sjis (coding
, mask
)
3635 struct coding_system
*coding
;
3638 unsigned char *src
= coding
->source
, *src_base
= src
;
3639 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3640 int multibytep
= coding
->src_multibyte
;
3641 int consumed_chars
= 0;
3645 /* A coding system of this category is always ASCII compatible. */
3646 src
+= coding
->head_ascii
;
3653 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3656 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3660 else if (c
>= 0xA0 && c
< 0xE0)
3665 *mask
&= ~CATEGORY_MASK_SJIS
;
3671 *mask
&= CATEGORY_MASK_SJIS
;
3675 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3676 Check if a text is encoded in BIG5. If it is, return
3677 CATEGORY_MASK_BIG5, else return 0. */
3680 detect_coding_big5 (coding
, mask
)
3681 struct coding_system
*coding
;
3684 unsigned char *src
= coding
->source
, *src_base
= src
;
3685 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3686 int multibytep
= coding
->src_multibyte
;
3687 int consumed_chars
= 0;
3691 /* A coding system of this category is always ASCII compatible. */
3692 src
+= coding
->head_ascii
;
3702 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3709 *mask
&= ~CATEGORY_MASK_BIG5
;
3715 *mask
&= CATEGORY_MASK_BIG5
;
3719 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3720 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decoder for SJIS.  Reads bytes from CODING->source starting at
   offset CODING->consumed and stores the decoded characters in
   CODING->charbuf.  On exit CODING->consumed_char, CODING->consumed
   and CODING->charbuf_used are updated.  */
3723 decode_coding_sjis (coding
)
3724 struct coding_system
*coding
;
3726 unsigned char *src
= coding
->source
+ coding
->consumed
;
3727 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3728 unsigned char *src_base
;
3729 int *charbuf
= coding
->charbuf
;
3730 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3731 int consumed_chars
= 0, consumed_chars_base
;
3732 int multibytep
= coding
->src_multibyte
;
3733 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3734 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3736 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* The three charsets are taken from successive elements of the list
   VAL (presumably initialized from CHARSET_LIST).
   NOTE(review): here the 2nd element becomes charset_kanji and the
   3rd charset_kana, whereas encode_coding_sjis binds the 2nd element
   to charset_kana and the 3rd to charset_kanji.  One of the two
   orders is presumably wrong -- confirm against how the charset list
   of an SJIS coding system is built.  */
3739 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3740 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3741 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
/* Remember the character count at the start of this character;
   presumably so that a partially read character can be rolled back
   (see the restore of consumed_chars further down).  */
3748 consumed_chars_base
= consumed_chars
;
/* Check whether the character buffer is already full.  */
3750 if (charbuf
>= charbuf_end
)
/* End-of-line handling depends on EOL_TYPE (Qdos / Qmac).  */
3757 if (EQ (eol_type
, Qdos
))
3760 goto no_more_source
;
3764 else if (EQ (eol_type
, Qmac
))
3769 struct charset
*charset
;
/* Start from the roman charset; the tests below may replace it.  */
3772 charset
= charset_roman
;
/* A byte outside 0xA0..0xDF is handled as the first byte of a
   two-byte character; a byte inside that range is single-byte kana
   (see the two comments below).  */
3777 if (c
< 0xA0 || c
>= 0xE0)
3779 /* SJIS -> JISX0208 */
/* The second byte is rejected unless it lies in 0x40..0xFC and is
   not 0x7F.  */
3781 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3785 charset
= charset_kanji
;
3788 /* SJIS -> JISX0201-Kana */
3789 charset
= charset_kana
;
3791 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
/* Restore the character count saved at the start of this
   character.  */
3798 consumed_chars
= consumed_chars_base
;
/* Store the byte itself if it is ASCII, otherwise the character
   that BYTE8_TO_CHAR yields for it.  */
3800 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Report progress up to the last saved base position.  */
3805 coding
->consumed_char
+= consumed_chars_base
;
3806 coding
->consumed
= src_base
- coding
->source
;
3807 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3811 decode_coding_big5 (coding
)
3812 struct coding_system
*coding
;
3814 unsigned char *src
= coding
->source
+ coding
->consumed
;
3815 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3816 unsigned char *src_base
;
3817 int *charbuf
= coding
->charbuf
;
3818 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3819 int consumed_chars
= 0, consumed_chars_base
;
3820 int multibytep
= coding
->src_multibyte
;
3821 struct charset
*charset_roman
, *charset_big5
;
3822 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3824 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3826 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3827 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3834 consumed_chars_base
= consumed_chars
;
3836 if (charbuf
>= charbuf_end
)
3843 if (EQ (eol_type
, Qdos
))
3846 goto no_more_source
;
3850 else if (EQ (eol_type
, Qmac
))
3855 struct charset
*charset
;
3857 charset
= charset_roman
;
3861 if (c
< 0xA1 || c
> 0xFE)
3864 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3867 charset
= charset_big5
;
3869 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3877 consumed_chars
= consumed_chars_base
;
3879 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3884 coding
->consumed_char
+= consumed_chars_base
;
3885 coding
->consumed
= src_base
- coding
->source
;
3886 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3889 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3890 This function can encode charsets `ascii', `katakana-jisx0201',
3891 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3892 are sure that all these charsets are registered as official charset
3893 (i.e. do not have extended leading-codes). Characters of other
3894 charsets are produced without any encoding. If SJIS_P is 1, encode
3895 SJIS text, else encode BIG5 text. */
3898 encode_coding_sjis (coding
)
3899 struct coding_system
*coding
;
3901 int multibytep
= coding
->dst_multibyte
;
3902 int *charbuf
= coding
->charbuf
;
3903 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3904 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3905 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3907 int produced_chars
= 0;
3908 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3909 int ascii_compatible
;
3910 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3913 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3915 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3916 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3917 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3919 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3921 while (charbuf
< charbuf_end
)
3923 ASSURE_DESTINATION (safe_room
);
3925 /* Now encode the character C. */
3926 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3927 EMIT_ONE_ASCII_BYTE (c
);
3931 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3935 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3937 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3938 charset
= CHARSET_FROM_ID (charset_ascii
);
3942 c
= coding
->default_char
;
3943 charset
= char_charset (c
, charset_list
, &code
);
3946 if (code
== CHARSET_INVALID_CODE (charset
))
3948 if (charset
== charset_kanji
)
3952 c1
= code
>> 8, c2
= code
& 0xFF;
3953 EMIT_TWO_BYTES (c1
, c2
);
3955 else if (charset
== charset_kana
)
3956 EMIT_ONE_BYTE (code
| 0x80);
3958 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3961 coding
->result
= CODING_RESULT_SUCCESS
;
3962 coding
->produced_char
+= produced_chars
;
3963 coding
->produced
= dst
- coding
->destination
;
3968 encode_coding_big5 (coding
)
3969 struct coding_system
*coding
;
3971 int multibytep
= coding
->dst_multibyte
;
3972 int *charbuf
= coding
->charbuf
;
3973 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3974 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3975 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3977 int produced_chars
= 0;
3978 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3979 int ascii_compatible
;
3980 struct charset
*charset_roman
, *charset_big5
;
3983 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3985 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3986 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3987 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3989 while (charbuf
< charbuf_end
)
3991 ASSURE_DESTINATION (safe_room
);
3993 /* Now encode the character C. */
3994 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3995 EMIT_ONE_ASCII_BYTE (c
);
3999 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4003 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4005 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4006 charset
= CHARSET_FROM_ID (charset_ascii
);
4010 c
= coding
->default_char
;
4011 charset
= char_charset (c
, charset_list
, &code
);
4014 if (code
== CHARSET_INVALID_CODE (charset
))
4016 if (charset
== charset_big5
)
4020 c1
= code
>> 8, c2
= code
& 0xFF;
4021 EMIT_TWO_BYTES (c1
, c2
);
4024 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4027 coding
->result
= CODING_RESULT_SUCCESS
;
4028 coding
->produced_char
+= produced_chars
;
4029 coding
->produced
= dst
- coding
->destination
;
4034 /*** 10. CCL handlers ***/
4036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4037 Check if a text is encoded in a coding system of which
4038 encoder/decoder are written in CCL program. If it is, return
4039 CATEGORY_MASK_CCL, else return 0. */
4042 detect_coding_ccl (coding
, mask
)
4043 struct coding_system
*coding
;
4046 unsigned char *src
= coding
->source
, *src_base
= src
;
4047 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4048 int multibytep
= coding
->src_multibyte
;
4049 int consumed_chars
= 0;
4051 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4052 int head_ascii
= coding
->head_ascii
;
4055 coding
= &coding_categories
[coding_category_ccl
];
4056 attrs
= CODING_ID_ATTRS (coding
->id
);
4057 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4066 if (!found
&& valids
[c
] > 1)
4069 *mask
&= ~CATEGORY_MASK_CCL
;
4075 *mask
&= CATEGORY_MASK_CCL
;
4080 decode_coding_ccl (coding
)
4081 struct coding_system
*coding
;
4083 unsigned char *src
= coding
->source
+ coding
->consumed
;
4084 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4085 int *charbuf
= coding
->charbuf
;
4086 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4087 int consumed_chars
= 0;
4088 int multibytep
= coding
->src_multibyte
;
4089 struct ccl_program ccl
;
4090 int source_charbuf
[1024];
4091 int source_byteidx
[1024];
4093 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4095 while (src
< src_end
)
4097 unsigned char *p
= src
;
4098 int *source
, *source_end
;
4102 while (i
< 1024 && p
< src_end
)
4104 source_byteidx
[i
] = p
- src
;
4105 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4108 while (i
< 1024 && p
< src_end
)
4109 source_charbuf
[i
++] = *p
++;
4111 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4114 source
= source_charbuf
;
4115 source_end
= source
+ i
;
4116 while (source
< source_end
)
4118 ccl_driver (&ccl
, source
, charbuf
,
4119 source_end
- source
, charbuf_end
- charbuf
);
4120 source
+= ccl
.consumed
;
4121 charbuf
+= ccl
.produced
;
4122 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4125 if (source
< source_end
)
4126 src
+= source_byteidx
[source
- source_charbuf
];
4129 consumed_chars
+= source
- source_charbuf
;
4131 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4132 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4138 case CCL_STAT_SUSPEND_BY_SRC
:
4139 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4141 case CCL_STAT_SUSPEND_BY_DST
:
4144 case CCL_STAT_INVALID_CMD
:
4145 coding
->result
= CODING_RESULT_INTERRUPT
;
4148 coding
->result
= CODING_RESULT_SUCCESS
;
4151 coding
->consumed_char
+= consumed_chars
;
4152 coding
->consumed
= src
- coding
->source
;
4153 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4157 encode_coding_ccl (coding
)
4158 struct coding_system
*coding
;
4160 struct ccl_program ccl
;
4161 int multibytep
= coding
->dst_multibyte
;
4162 int *charbuf
= coding
->charbuf
;
4163 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4164 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4165 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4166 unsigned char *adjusted_dst_end
= dst_end
- 1;
4167 int destination_charbuf
[1024];
4168 int i
, produced_chars
= 0;
4170 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4172 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4173 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4175 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4177 int dst_bytes
= dst_end
- dst
;
4178 if (dst_bytes
> 1024)
4181 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4182 charbuf_end
- charbuf
, dst_bytes
);
4183 charbuf
+= ccl
.consumed
;
4185 for (i
= 0; i
< ccl
.produced
; i
++)
4186 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4189 for (i
= 0; i
< ccl
.produced
; i
++)
4190 *dst
++ = destination_charbuf
[i
] & 0xFF;
4191 produced_chars
+= ccl
.produced
;
4197 case CCL_STAT_SUSPEND_BY_SRC
:
4198 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4200 case CCL_STAT_SUSPEND_BY_DST
:
4201 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4204 case CCL_STAT_INVALID_CMD
:
4205 coding
->result
= CODING_RESULT_INTERRUPT
;
4208 coding
->result
= CODING_RESULT_SUCCESS
;
4212 coding
->produced_char
+= produced_chars
;
4213 coding
->produced
= dst
- coding
->destination
;
4219 /*** 10, 11. no-conversion handlers ***/
4221 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decoder for raw text.  The statements below only record state in
   CODING: chars_at_source is set to 1, consumed_char and consumed
   are zeroed, and result is set to CODING_RESULT_SUCCESS.  */
4224 decode_coding_raw_text (coding
)
4225 struct coding_system
*coding
;
/* NOTE(review): presumably this flag tells the caller to take the
   characters straight from the source -- confirm where
   chars_at_source is read.  */
4227 coding
->chars_at_source
= 1;
/* Nothing is counted as consumed here.  */
4228 coding
->consumed_char
= 0;
4229 coding
->consumed
= 0;
4230 coding
->result
= CODING_RESULT_SUCCESS
;
/* Encoder for raw text.  Writes the CODING->charbuf_used elements of
   CODING->charbuf to CODING->destination, starting at offset
   CODING->produced.  On exit CODING->result is
   CODING_RESULT_SUCCESS, CODING->produced_char is increased by the
   local count PRODUCED_CHARS, and CODING->produced is the offset of
   the end of the output.  */
4234 encode_coding_raw_text (coding
)
4235 struct coding_system
*coding
;
4237 int multibytep
= coding
->dst_multibyte
;
4238 int *charbuf
= coding
->charbuf
;
4239 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4240 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4241 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4242 int produced_chars
= 0;
/* Amount passed to ASSURE_DESTINATION for each character in the
   loops below; presumably the number of free destination bytes it
   guarantees -- confirm against the macro.  */
4247 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4249 if (coding
->src_multibyte
)
4250 while (charbuf
< charbuf_end
)
4252 ASSURE_DESTINATION (safe_room
);
/* ASCII characters are emitted as one byte.  */
4254 if (ASCII_CHAR_P (c
))
4255 EMIT_ONE_ASCII_BYTE (c
);
/* An eight-bit character is converted back to its raw byte.  */
4256 else if (CHAR_BYTE8_P (c
))
4258 c
= CHAR_TO_BYTE8 (c
);
/* Any other character: build its multibyte form in STR with
   CHAR_STRING_ADVANCE and emit it with EMIT_ONE_BYTE.  */
4263 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4265 CHAR_STRING_ADVANCE (c
, p1
);
4267 EMIT_ONE_BYTE (*p0
);
4271 while (charbuf
< charbuf_end
)
4273 ASSURE_DESTINATION (safe_room
);
4280 if (coding
->src_multibyte
)
/* In this branch MAX_MULTIBYTE_LENGTH is passed to
   ASSURE_DESTINATION for each character.  */
4282 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4284 while (charbuf
< charbuf_end
)
4286 ASSURE_DESTINATION (safe_room
);
4288 if (ASCII_CHAR_P (c
))
/* Here bytes are stored directly through DST rather than with the
   EMIT_* macros.  */
4290 else if (CHAR_BYTE8_P (c
))
4291 *dst
++ = CHAR_TO_BYTE8 (c
);
4293 CHAR_STRING_ADVANCE (c
, dst
);
/* Ask for room for all remaining elements at once, then copy each
   charbuf element to one destination byte.  */
4299 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4300 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4301 *dst
++ = *charbuf
++;
/* NOTE(review): this subtracts the END of the destination buffer
   (coding->destination + coding->dst_bytes, the same expression
   that initializes dst_end) from DST.  The loop above keeps DST at
   or below dst_end, so the value looks non-positive and would
   decrease CODING->produced_char below.  The start of this call's
   output is coding->destination + coding->produced (see DST's
   initializer); presumably that was intended.  Confirm, taking into
   account whatever ASSURE_DESTINATION does to DST and dst_end.  */
4302 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
/* Publish the result and the amount of output.  */
4305 coding
->result
= CODING_RESULT_SUCCESS
;
4306 coding
->produced_char
+= produced_chars
;
4307 coding
->produced
= dst
- coding
->destination
;
4312 detect_coding_charset (coding
, mask
)
4313 struct coding_system
*coding
;
4316 unsigned char *src
= coding
->source
, *src_base
= src
;
4317 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4318 int multibytep
= coding
->src_multibyte
;
4319 int consumed_chars
= 0;
4320 Lisp_Object attrs
, valids
;
4322 coding
= &coding_categories
[coding_category_charset
];
4323 attrs
= CODING_ID_ATTRS (coding
->id
);
4324 valids
= AREF (attrs
, coding_attr_charset_valids
);
4326 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4327 src
+= coding
->head_ascii
;
4334 if (NILP (AREF (valids
, c
)))
4337 *mask
&= ~CATEGORY_MASK_CHARSET
;
4341 *mask
&= CATEGORY_MASK_CHARSET
;
4346 decode_coding_charset (coding
)
4347 struct coding_system
*coding
;
4349 unsigned char *src
= coding
->source
+ coding
->consumed
;
4350 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4351 unsigned char *src_base
;
4352 int *charbuf
= coding
->charbuf
;
4353 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4354 int consumed_chars
= 0, consumed_chars_base
;
4355 int multibytep
= coding
->src_multibyte
;
4356 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4358 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4359 valids
= AREF (attrs
, coding_attr_charset_valids
);
4366 consumed_chars_base
= consumed_chars
;
4368 if (charbuf
>= charbuf_end
)
4374 /* Here we assume that no charset maps '\r' to something
4376 if (EQ (eol_type
, Qdos
))
4382 else if (EQ (eol_type
, Qmac
))
4388 struct charset
*charset
;
4393 val
= AREF (valids
, c
);
4398 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4399 dim
= CHARSET_DIMENSION (charset
);
4403 code
= (code
<< 8) | c
;
4406 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4411 /* VAL is a list of charset IDs. It is assured that the
4412 list is sorted by charset dimensions (smaller one
4416 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4417 dim
= CHARSET_DIMENSION (charset
);
4421 code
= (code
<< 8) | c
;
4424 CODING_DECODE_CHAR (coding
, src
, src_base
,
4425 src_end
, charset
, code
, c
);
4439 consumed_chars
= consumed_chars_base
;
4441 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4446 coding
->consumed_char
+= consumed_chars_base
;
4447 coding
->consumed
= src_base
- coding
->source
;
4448 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4452 encode_coding_charset (coding
)
4453 struct coding_system
*coding
;
4455 int multibytep
= coding
->dst_multibyte
;
4456 int *charbuf
= coding
->charbuf
;
4457 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4458 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4459 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4460 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4461 int produced_chars
= 0;
4462 Lisp_Object attrs
, eol_type
, charset_list
;
4463 int ascii_compatible
;
4466 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4467 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4469 while (charbuf
< charbuf_end
)
4471 struct charset
*charset
;
4474 ASSURE_DESTINATION (safe_room
);
4476 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4477 EMIT_ONE_ASCII_BYTE (c
);
4480 charset
= char_charset (c
, charset_list
, &code
);
4483 if (CHARSET_DIMENSION (charset
) == 1)
4484 EMIT_ONE_BYTE (code
);
4485 else if (CHARSET_DIMENSION (charset
) == 2)
4486 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4487 else if (CHARSET_DIMENSION (charset
) == 3)
4488 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4490 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4491 (code
>> 8) & 0xFF, code
& 0xFF);
4495 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4496 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4498 c
= coding
->default_char
;
4504 coding
->result
= CODING_RESULT_SUCCESS
;
4505 coding
->produced_char
+= produced_chars
;
4506 coding
->produced
= dst
- coding
->destination
;
4511 /*** 10. C library functions ***/
4513 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4514 has a property `coding-system'. The value of this property is a
4515 vector of length 5 (called as coding-vector). Among elements of
4516 this vector, the first (element[0]) and the fifth (element[4])
4517 carry important information for decoding/encoding. Before
4518 decoding/encoding, this information should be set in fields of a
4519 structure of type `coding_system'.
4521 A value of property `coding-system' can be a symbol of another
4522 subsidiary coding-system. In that case, Emacs gets coding-vector
4525 `element[0]' contains information to be set in `coding->type'. The
4526 value and its meaning is as follows:
4528 0 -- coding_type_emacs_mule
4529 1 -- coding_type_sjis
4530 2 -- coding_type_iso_2022
4531 3 -- coding_type_big5
4532 4 -- coding_type_ccl encoder/decoder written in CCL
4533 nil -- coding_type_no_conversion
4534 t -- coding_type_undecided (automatic conversion on decoding,
4535 no-conversion on encoding)
4537 `element[4]' contains information to be set in `coding->flags' and
4538 `coding->spec'. The meaning varies by `coding->type'.
4540 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4541 of length 32 (of which the first 13 sub-elements are used now).
4542 Meanings of these sub-elements are:
4544 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4545 If the value is an integer of valid charset, the charset is
4546 assumed to be designated to graphic register N initially.
4548 If the value is minus, it is a minus value of charset which
4549 reserves graphic register N, which means that the charset is
4550 not designated initially but should be designated to graphic
4551 register N just before encoding a character in that charset.
4553 If the value is nil, graphic register N is never used on
4556 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4557 Each value takes t or nil. See the section ISO2022 of
4558 `coding.h' for more information.
4560 If `coding->type' is `coding_type_big5', element[4] is t to denote
4561 BIG5-ETen or nil to denote BIG5-HKU.
4563 If `coding->type' takes the other value, element[4] is ignored.
4565 Emacs Lisp's coding system also carries information about format of
4566 end-of-line in a value of property `eol-type'. If the value is
4567 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4568 it is not integer, it should be a vector of subsidiary coding
4569 systems of which property `eol-type' has one of above values.
4573 /* Setup coding context CODING from information about CODING_SYSTEM.
4574 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4575 CODING_SYSTEM is invalid, signal an error. */
4578 setup_coding_system (coding_system
, coding
)
4579 Lisp_Object coding_system
;
4580 struct coding_system
*coding
;
4583 Lisp_Object eol_type
;
4584 Lisp_Object coding_type
;
4587 if (NILP (coding_system
))
4588 coding_system
= Qno_conversion
;
4590 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4592 attrs
= CODING_ID_ATTRS (coding
->id
);
4593 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4596 coding
->head_ascii
= -1;
4597 coding
->common_flags
4598 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4600 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4601 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4602 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4603 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4605 coding_type
= CODING_ATTR_TYPE (attrs
);
4606 if (EQ (coding_type
, Qundecided
))
4608 coding
->detector
= NULL
;
4609 coding
->decoder
= decode_coding_raw_text
;
4610 coding
->encoder
= encode_coding_raw_text
;
4611 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4613 else if (EQ (coding_type
, Qiso_2022
))
4616 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4618 /* Invoke graphic register 0 to plane 0. */
4619 CODING_ISO_INVOCATION (coding
, 0) = 0;
4620 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4621 CODING_ISO_INVOCATION (coding
, 1)
4622 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4623 /* Setup the initial status of designation. */
4624 for (i
= 0; i
< 4; i
++)
4625 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4626 /* Not single shifting initially. */
4627 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4628 /* Beginning of buffer should also be regarded as bol. */
4629 CODING_ISO_BOL (coding
) = 1;
4630 coding
->detector
= detect_coding_iso_2022
;
4631 coding
->decoder
= decode_coding_iso_2022
;
4632 coding
->encoder
= encode_coding_iso_2022
;
4633 if (flags
& CODING_ISO_FLAG_SAFE
)
4634 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4635 coding
->common_flags
4636 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4637 | CODING_REQUIRE_FLUSHING_MASK
);
4638 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4639 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4640 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4642 setup_iso_safe_charsets (attrs
);
4643 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4644 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4645 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4647 CODING_ISO_FLAGS (coding
) = flags
;
4649 else if (EQ (coding_type
, Qcharset
))
4651 coding
->detector
= detect_coding_charset
;
4652 coding
->decoder
= decode_coding_charset
;
4653 coding
->encoder
= encode_coding_charset
;
4654 coding
->common_flags
4655 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4657 else if (EQ (coding_type
, Qutf_8
))
4659 coding
->detector
= detect_coding_utf_8
;
4660 coding
->decoder
= decode_coding_utf_8
;
4661 coding
->encoder
= encode_coding_utf_8
;
4662 coding
->common_flags
4663 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4665 else if (EQ (coding_type
, Qutf_16
))
4667 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4668 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4669 : EQ (val
, Qt
) ? utf_16_with_bom
4670 : utf_16_without_bom
);
4671 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4672 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4673 : utf_16_little_endian
);
4674 CODING_UTF_16_SURROGATE (coding
) = 0;
4675 coding
->detector
= detect_coding_utf_16
;
4676 coding
->decoder
= decode_coding_utf_16
;
4677 coding
->encoder
= encode_coding_utf_16
;
4678 coding
->common_flags
4679 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4681 else if (EQ (coding_type
, Qccl
))
4683 coding
->detector
= detect_coding_ccl
;
4684 coding
->decoder
= decode_coding_ccl
;
4685 coding
->encoder
= encode_coding_ccl
;
4686 coding
->common_flags
4687 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4688 | CODING_REQUIRE_FLUSHING_MASK
);
4690 else if (EQ (coding_type
, Qemacs_mule
))
4692 coding
->detector
= detect_coding_emacs_mule
;
4693 coding
->decoder
= decode_coding_emacs_mule
;
4694 coding
->encoder
= encode_coding_emacs_mule
;
4695 coding
->common_flags
4696 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4697 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4698 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4700 Lisp_Object tail
, safe_charsets
;
4701 int max_charset_id
= 0;
4703 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4705 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4706 max_charset_id
= XFASTINT (XCAR (tail
));
4707 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4709 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4711 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4712 coding
->max_charset_id
= max_charset_id
;
4713 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4716 else if (EQ (coding_type
, Qshift_jis
))
4718 coding
->detector
= detect_coding_sjis
;
4719 coding
->decoder
= decode_coding_sjis
;
4720 coding
->encoder
= encode_coding_sjis
;
4721 coding
->common_flags
4722 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4724 else if (EQ (coding_type
, Qbig5
))
4726 coding
->detector
= detect_coding_big5
;
4727 coding
->decoder
= decode_coding_big5
;
4728 coding
->encoder
= encode_coding_big5
;
4729 coding
->common_flags
4730 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4732 else /* EQ (coding_type, Qraw_text) */
4734 coding
->detector
= NULL
;
4735 coding
->decoder
= decode_coding_raw_text
;
4736 coding
->encoder
= encode_coding_raw_text
;
4737 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4743 /* Return raw-text or one of its subsidiaries that has the same
4744 eol_type as CODING-SYSTEM. */
4747 raw_text_coding_system (coding_system
)
4748 Lisp_Object coding_system
;
4750 Lisp_Object spec
, attrs
;
4751 Lisp_Object eol_type
, raw_text_eol_type
;
4753 spec
= CODING_SYSTEM_SPEC (coding_system
);
4754 attrs
= AREF (spec
, 0);
4756 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4757 return coding_system
;
4759 eol_type
= AREF (spec
, 2);
4760 if (VECTORP (eol_type
))
4762 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4763 raw_text_eol_type
= AREF (spec
, 2);
4764 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4765 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4766 : AREF (raw_text_eol_type
, 2));
4770 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4771 does, return one of the subsidiary that has the same eol-spec as
4772 PARENT. Otherwise, return CODING_SYSTEM. */
4775 coding_inherit_eol_type (coding_system
, parent
)
4776 Lisp_Object coding_system
, parent
;
4778 Lisp_Object spec
, attrs
, eol_type
;
4780 spec
= CODING_SYSTEM_SPEC (coding_system
);
4781 attrs
= AREF (spec
, 0);
4782 eol_type
= AREF (spec
, 2);
4783 if (VECTORP (eol_type
))
4785 Lisp_Object parent_spec
;
4786 Lisp_Object parent_eol_type
;
4789 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4790 parent_eol_type
= AREF (parent_spec
, 2);
4791 if (EQ (parent_eol_type
, Qunix
))
4792 coding_system
= AREF (eol_type
, 0);
4793 else if (EQ (parent_eol_type
, Qdos
))
4794 coding_system
= AREF (eol_type
, 1);
4795 else if (EQ (parent_eol_type
, Qmac
))
4796 coding_system
= AREF (eol_type
, 2);
4798 return coding_system
;
4801 /* Emacs has a mechanism to automatically detect a coding system if it
4802 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4803 it's impossible to distinguish some coding systems accurately
4804 because they use the same range of codes. So, at first, coding
4805 systems are grouped into the following categories:
4807 o coding-category-emacs-mule
4809 The category for a coding system which has the same code range
4810 as Emacs' internal format. Assigned the coding-system (Lisp
4811 symbol) `emacs-mule' by default.
4813 o coding-category-sjis
4815 The category for a coding system which has the same code range
4816 as SJIS. Assigned the coding-system (Lisp
4817 symbol) `japanese-shift-jis' by default.
4819 o coding-category-iso-7
4821 The category for a coding system which has the same code range
4822 as ISO2022 of 7-bit environment. This doesn't use any locking
4823 shift and single shift functions. This can encode/decode all
4824 charsets. Assigned the coding-system (Lisp symbol)
4825 `iso-2022-7bit' by default.
4827 o coding-category-iso-7-tight
4829 Same as coding-category-iso-7 except that this can
4830 encode/decode only the specified charsets.
4832 o coding-category-iso-8-1
4834 The category for a coding system which has the same code range
4835 as ISO2022 of 8-bit environment and graphic plane 1 used only
4836 for DIMENSION1 charset. This doesn't use any locking shift
4837 and single shift functions. Assigned the coding-system (Lisp
4838 symbol) `iso-latin-1' by default.
4840 o coding-category-iso-8-2
4842 The category for a coding system which has the same code range
4843 as ISO2022 of 8-bit environment and graphic plane 1 used only
4844 for DIMENSION2 charset. This doesn't use any locking shift
4845 and single shift functions. Assigned the coding-system (Lisp
4846 symbol) `japanese-iso-8bit' by default.
4848 o coding-category-iso-7-else
4850 The category for a coding system which has the same code range
4851 as ISO2022 of 7-bit environment but uses locking shift or
4852 single shift functions. Assigned the coding-system (Lisp
4853 symbol) `iso-2022-7bit-lock' by default.
4855 o coding-category-iso-8-else
4857 The category for a coding system which has the same code range
4858 as ISO2022 of 8-bit environment but uses locking shift or
4859 single shift functions. Assigned the coding-system (Lisp
4860 symbol) `iso-2022-8bit-ss2' by default.
4862 o coding-category-big5
4864 The category for a coding system which has the same code range
4865 as BIG5. Assigned the coding-system (Lisp symbol)
4866 `cn-big5' by default.
4868 o coding-category-utf-8
4870 The category for a coding system which has the same code range
4871 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4872 symbol) `utf-8' by default.
4874 o coding-category-utf-16-be
4876 The category for a coding system in which a text has a
4877 Unicode signature (cf. Unicode Standard) in the order of BIG
4878 endian at the head. Assigned the coding-system (Lisp symbol)
4879 `utf-16-be' by default.
4881 o coding-category-utf-16-le
4883 The category for a coding system in which a text has a
4884 Unicode signature (cf. Unicode Standard) in the order of
4885 LITTLE endian at the head. Assigned the coding-system (Lisp
4886 symbol) `utf-16-le' by default.
4888 o coding-category-ccl
4890 The category for a coding system of which encoder/decoder is
4891 written in CCL programs. The default value is nil, i.e., no
4892 coding system is assigned.
4894 o coding-category-binary
4896 The category for a coding system not categorized in any of the
4897 above. Assigned the coding-system (Lisp symbol)
4898 `no-conversion' by default.
4900 Each of them is a Lisp symbol and the value is an actual
4901 `coding-system' (this is also a Lisp symbol) assigned by a user.
4902 What Emacs does actually is to detect a category of coding system.
4903 Then, it uses a `coding-system' assigned to it. If Emacs can't
4904 decide only one possible category, it selects a category of the
4905 highest priority. Priorities of categories are also specified by a
4906 user in a Lisp variable `coding-category-list'.
4910 #define EOL_SEEN_NONE 0
4911 #define EOL_SEEN_LF 1
4912 #define EOL_SEEN_CR 2
4913 #define EOL_SEEN_CRLF 4
4915 /* Detect how end-of-line of a text of length CODING->src_bytes
4916 pointed by CODING->source is encoded. Return one of
4919 #define MAX_EOL_CHECK_COUNT 3
4922 detect_eol (coding
, source
, src_bytes
)
4923 struct coding_system
*coding
;
4924 unsigned char *source
;
4925 EMACS_INT src_bytes
;
4927 Lisp_Object attrs
, coding_type
;
4928 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4931 int eol_seen
= EOL_SEEN_NONE
;
4933 attrs
= CODING_ID_ATTRS (coding
->id
);
4934 coding_type
= CODING_ATTR_TYPE (attrs
);
4936 if (EQ (coding_type
, Qccl
))
4940 msb
= coding
->spec
.utf_16
.endian
== utf_16_little_endian
;
4943 while (src
+ 1 < src_end
)
4946 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4951 this_eol
= EOL_SEEN_LF
;
4952 else if (src
+ 3 >= src_end
4953 || src
[msb
+ 2] != 0
4954 || src
[lsb
+ 2] != '\n')
4955 this_eol
= EOL_SEEN_CR
;
4957 this_eol
= EOL_SEEN_CRLF
;
4959 if (eol_seen
== EOL_SEEN_NONE
)
4960 /* This is the first end-of-line. */
4961 eol_seen
= this_eol
;
4962 else if (eol_seen
!= this_eol
)
4964 /* The found type is different from what was found before. */
4965 eol_seen
= EOL_SEEN_LF
;
4968 if (++total
== MAX_EOL_CHECK_COUNT
)
4976 while (src
< src_end
)
4979 if (c
== '\n' || c
== '\r')
4984 this_eol
= EOL_SEEN_LF
;
4985 else if (src
>= src_end
|| *src
!= '\n')
4986 this_eol
= EOL_SEEN_CR
;
4988 this_eol
= EOL_SEEN_CRLF
, src
++;
4990 if (eol_seen
== EOL_SEEN_NONE
)
4991 /* This is the first end-of-line. */
4992 eol_seen
= this_eol
;
4993 else if (eol_seen
!= this_eol
)
4995 /* The found type is different from what was found before. */
4996 eol_seen
= EOL_SEEN_LF
;
4999 if (++total
== MAX_EOL_CHECK_COUNT
)
5009 adjust_coding_eol_type (coding
, eol_seen
)
5010 struct coding_system
*coding
;
5013 Lisp_Object eol_type
;
5015 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5016 if (eol_seen
& EOL_SEEN_LF
)
5017 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5018 else if (eol_type
& EOL_SEEN_CRLF
)
5019 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5020 else if (eol_type
& EOL_SEEN_CR
)
5021 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5024 /* Detect how a text specified in CODING is encoded. If a coding
5025 system is detected, update fields of CODING by the detected coding
5029 detect_coding (coding
)
5030 struct coding_system
*coding
;
5032 unsigned char *src
, *src_end
;
5033 Lisp_Object attrs
, coding_type
;
5035 coding
->consumed
= coding
->consumed_char
= 0;
5036 coding
->produced
= coding
->produced_char
= 0;
5037 coding_set_source (coding
);
5039 src_end
= coding
->source
+ coding
->src_bytes
;
5041 /* If we have not yet decided the text encoding type, detect it
5043 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5045 int mask
= CATEGORY_MASK_ANY
;
5048 for (src
= coding
->source
; src
< src_end
; src
++)
5051 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5053 || c
== ISO_CODE_SO
)))
5056 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5058 if (coding
->head_ascii
< coding
->src_bytes
)
5062 for (i
= 0; i
< coding_category_raw_text
; i
++)
5064 enum coding_category category
= coding_priorities
[i
];
5065 struct coding_system
*this = coding_categories
+ category
;
5067 if (category
>= coding_category_raw_text
5068 || detected
& (1 << category
))
5073 /* No coding system of this category is defined. */
5074 mask
&= ~(1 << category
);
5078 detected
|= detected_mask
[category
];
5079 if ((*(this->detector
)) (coding
, &mask
))
5084 setup_coding_system (Qraw_text
, coding
);
5085 else if (mask
!= CATEGORY_MASK_ANY
)
5086 for (i
= 0; i
< coding_category_raw_text
; i
++)
5088 enum coding_category category
= coding_priorities
[i
];
5089 struct coding_system
*this = coding_categories
+ category
;
5091 if (mask
& (1 << category
))
5093 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5100 attrs
= CODING_ID_ATTRS (coding
->id
);
5101 coding_type
= CODING_ATTR_TYPE (attrs
);
5103 /* If we have not yet decided the EOL type, detect it now. But, the
5104 detection is impossible for a CCL based coding system, in which
5105 case, we detect the EOL type after decoding. */
5106 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5107 && ! EQ (coding_type
, Qccl
))
5109 int eol_seen
= detect_eol (coding
, coding
->source
, coding
->src_bytes
);
5111 if (eol_seen
!= EOL_SEEN_NONE
)
5112 adjust_coding_eol_type (coding
, eol_seen
);
5119 struct coding_system
*coding
;
5121 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5123 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5124 unsigned char *pend
= p
+ coding
->produced
;
5125 int eol_seen
= EOL_SEEN_NONE
;
5127 for (; p
< pend
; p
++)
5130 eol_seen
|= EOL_SEEN_LF
;
5131 else if (*p
== '\r')
5133 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5135 eol_seen
|= EOL_SEEN_CRLF
;
5139 eol_seen
|= EOL_SEEN_CR
;
5142 if (eol_seen
!= EOL_SEEN_NONE
)
5143 adjust_coding_eol_type (coding
, eol_seen
);
5146 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5148 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5149 unsigned char *pend
= p
+ coding
->produced
;
5151 for (; p
< pend
; p
++)
5155 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5157 unsigned char *p
, *pbeg
, *pend
;
5158 Lisp_Object undo_list
;
5160 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5161 coding
->dst_pos_byte
+ coding
->produced
);
5162 undo_list
= current_buffer
->undo_list
;
5163 current_buffer
->undo_list
= Qt
;
5164 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, Qnil
);
5165 current_buffer
->undo_list
= undo_list
;
5167 pend
= pbeg
+ coding
->produced
;
5169 for (p
= pend
- 1; p
>= pbeg
; p
--)
5172 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5175 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5176 coding
->produced
= pend
- pbeg
;
5177 insert_from_gap (coding
->produced_char
, coding
->produced
);
5182 translate_chars (coding
, table
)
5183 struct coding_system
*coding
;
5186 int *charbuf
= coding
->charbuf
;
5187 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5190 if (coding
->chars_at_source
)
5193 while (charbuf
< charbuf_end
)
5199 *charbuf
++ = translate_char (table
, c
);
5204 produce_chars (coding
)
5205 struct coding_system
*coding
;
5207 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5208 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5210 int produced_chars
= 0;
5212 if (! coding
->chars_at_source
)
5214 /* Characters are in coding->charbuf. */
5215 int *buf
= coding
->charbuf
;
5216 int *buf_end
= buf
+ coding
->charbuf_used
;
5217 unsigned char *adjusted_dst_end
;
5219 if (BUFFERP (coding
->src_object
)
5220 && EQ (coding
->src_object
, coding
->dst_object
))
5221 dst_end
= coding
->source
+ coding
->consumed
;
5222 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5224 while (buf
< buf_end
)
5228 if (dst
>= adjusted_dst_end
)
5230 dst
= alloc_destination (coding
,
5231 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5233 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5234 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5238 if (coding
->dst_multibyte
5239 || ! CHAR_BYTE8_P (c
))
5240 CHAR_STRING_ADVANCE (c
, dst
);
5242 *dst
++ = CHAR_TO_BYTE8 (c
);
5246 /* This is an annotation data. */
5252 unsigned char *src
= coding
->source
;
5253 unsigned char *src_end
= src
+ coding
->src_bytes
;
5254 Lisp_Object eol_type
;
5256 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5258 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5260 if (coding
->src_multibyte
)
5267 unsigned char *src_base
= src
;
5273 if (EQ (eol_type
, Qdos
))
5279 else if (EQ (eol_type
, Qmac
))
5284 coding
->consumed
= src
- coding
->source
;
5286 if (EQ (coding
->src_object
, coding
->dst_object
))
5290 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5292 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5293 coding_set_source (coding
);
5294 src
= coding
->source
+ coding
->consumed
;
5295 src_end
= coding
->source
+ coding
->src_bytes
;
5305 while (src
< src_end
)
5312 if (EQ (eol_type
, Qdos
))
5318 else if (EQ (eol_type
, Qmac
))
5321 if (dst
>= dst_end
- 1)
5323 coding
->consumed
= src
- coding
->source
;
5325 if (EQ (coding
->src_object
, coding
->dst_object
))
5327 if (dst
>= dst_end
- 1)
5329 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5331 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5332 coding_set_source (coding
);
5333 src
= coding
->source
+ coding
->consumed
;
5334 src_end
= coding
->source
+ coding
->src_bytes
;
5342 if (!EQ (coding
->src_object
, coding
->dst_object
))
5344 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5348 EMACS_INT offset
= src
- coding
->source
;
5350 dst
= alloc_destination (coding
, require
, dst
);
5351 coding_set_source (coding
);
5352 src
= coding
->source
+ offset
;
5353 src_end
= coding
->source
+ coding
->src_bytes
;
5356 produced_chars
= coding
->src_chars
;
5357 while (src
< src_end
)
5363 if (EQ (eol_type
, Qdos
))
5370 else if (EQ (eol_type
, Qmac
))
5376 coding
->consumed
= coding
->src_bytes
;
5377 coding
->consumed_char
= coding
->src_chars
;
5380 produced
= dst
- (coding
->destination
+ coding
->produced
);
5381 if (BUFFERP (coding
->dst_object
))
5382 insert_from_gap (produced_chars
, produced
);
5383 coding
->produced
+= produced
;
5384 coding
->produced_char
+= produced_chars
;
5385 return produced_chars
;
5388 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5390 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5394 produce_composition (coding
, charbuf
)
5395 struct coding_system
*coding
;
5401 enum composition_method method
;
5403 Lisp_Object components
;
5405 buffer
= coding
->dst_object
;
5407 pos
= coding
->dst_pos
+ charbuf
[1];
5408 method
= (enum composition_method
) (charbuf
[3]);
5409 cmp_len
= charbuf
[4];
5411 if (method
== COMPOSITION_RELATIVE
)
5415 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5420 for (i
= 0; i
< len
; i
++)
5421 args
[i
] = make_number (charbuf
[i
]);
5422 components
= (method
== COMPOSITION_WITH_ALTCHARS
5423 ? Fstring (len
, args
) : Fvector (len
, args
));
5425 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
5429 save_composition_data (buf
, buf_end
, prop
)
5433 enum composition_method method
= COMPOSITION_METHOD (prop
);
5434 int cmp_len
= COMPOSITION_LENGTH (prop
);
5436 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5439 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5443 if (method
== COMPOSITION_RELATIVE
)
5447 Lisp_Object components
;
5450 components
= COMPOSITION_COMPONENTS (prop
);
5451 if (VECTORP (components
))
5453 len
= XVECTOR (components
)->size
;
5454 for (i
= 0; i
< len
; i
++)
5455 buf
[4 + i
] = XINT (AREF (components
, i
));
5457 else if (STRINGP (components
))
5461 len
= XSTRING (components
)->size
;
5464 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5466 else if (INTEGERP (components
))
5469 buf
[4] = XINT (components
);
5471 else if (CONSP (components
))
5473 for (len
= 0; CONSP (components
);
5474 len
++, components
= XCDR (components
))
5475 buf
[4 + len
] = XINT (XCAR (components
));
5481 return (buf
+ buf
[0]);
5484 #define CHARBUF_SIZE 0x4000
5486 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5488 int size = CHARBUF_SIZE;; \
5490 coding->charbuf = NULL; \
5491 while (size > 1024) \
5493 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5494 if (coding->charbuf) \
5498 if (! coding->charbuf) \
5500 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5501 return coding->result; \
5503 coding->charbuf_size = size; \
5508 produce_annotation (coding
)
5509 struct coding_system
*coding
;
5511 int *charbuf
= coding
->charbuf
;
5512 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5514 while (charbuf
< charbuf_end
)
5520 int len
= -*charbuf
;
5523 case CODING_ANNOTATE_COMPOSITION_MASK
:
5524 produce_composition (coding
, charbuf
);
5534 /* Decode the data at CODING->src_object into CODING->dst_object.
5535 CODING->src_object is a buffer, a string, or nil.
5536 CODING->dst_object is a buffer.
5538 If CODING->src_object is a buffer, it must be the current buffer.
5539 In this case, if CODING->src_pos is positive, it is a position of
5540 the source text in the buffer, otherwise, the source text is in the
5541 gap area of the buffer, and CODING->src_pos specifies the offset of
5542 the text from GPT (which must be the same as PT). If this is the
5543 same buffer as CODING->dst_object, CODING->src_pos must be
5546 If CODING->src_object is a string, CODING->src_pos is an index to
5549 If CODING->src_object is nil, CODING->source must already point to
5550 the non-relocatable memory area. In this case, CODING->src_pos is
5551 an offset from CODING->source.
5553 The decoded data is inserted at the current point of the buffer
5558 decode_coding (coding
)
5559 struct coding_system
*coding
;
/* When the source text lies in the buffer and straddles the gap,
   move the gap to the start of the source first.  */
5563 if (BUFFERP (coding
->src_object
)
5564 && coding
->src_pos
> 0
5565 && coding
->src_pos
< GPT
5566 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5567 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5569 if (BUFFERP (coding
->dst_object
))
5571 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5572 set_buffer_internal (XBUFFER (coding
->dst_object
));
5574 move_gap_both (PT
, PT_BYTE
);
5577 coding
->consumed
= coding
->consumed_char
= 0;
5578 coding
->produced
= coding
->produced_char
= 0;
5579 coding
->chars_at_source
= 0;
5580 coding
->result
= CODING_RESULT_SUCCESS
;
5583 ALLOC_CONVERSION_WORK_AREA (coding
);
5585 attrs
= CODING_ID_ATTRS (coding
->id
);
5589 coding_set_source (coding
);
5590 coding
->annotated
= 0;
5591 (*(coding
->decoder
)) (coding
);
5592 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
/* translate_chars is defined as translate_chars (coding, table):
   the coding system comes first and the translation table second.  */
5593 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5594 coding_set_destination (coding
);
5595 produce_chars (coding
);
5596 if (coding
->annotated
)
5597 produce_annotation (coding
);
5599 while (coding
->consumed
< coding
->src_bytes
5600 && ! coding
->result
);
5602 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5603 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5604 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5605 decode_eol (coding
);
5607 coding
->carryover_bytes
= 0;
5608 if (coding
->consumed
< coding
->src_bytes
)
5610 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5613 coding_set_source (coding
);
5614 coding_set_destination (coding
);
5615 src
= coding
->source
+ coding
->consumed
;
5617 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5619 /* Flush out unprocessed data as binary chars. We are sure
5620 that the number of data is less than the size of
5622 int *charbuf
= coding
->charbuf
;
5624 while (nbytes
-- > 0)
5627 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5629 produce_chars (coding
);
5633 /* Record unprocessed bytes in coding->carryover. We are
5634 sure that the number of data is less than the size of
5635 coding->carryover. */
5636 unsigned char *p
= coding
->carryover
;
5638 coding
->carryover_bytes
= nbytes
;
5639 while (nbytes
-- > 0)
5642 coding
->consumed
= coding
->src_bytes
;
5645 return coding
->result
;
5649 consume_chars (coding
)
5650 struct coding_system
*coding
;
5652 int *buf
= coding
->charbuf
;
5653 /* -1 is to compensate for CRLF. */
5654 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5655 unsigned char *src
= coding
->source
+ coding
->consumed
;
5656 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5657 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5658 int multibytep
= coding
->src_multibyte
;
5659 Lisp_Object eol_type
;
5661 int start
, end
, stop
;
5662 Lisp_Object object
, prop
;
5664 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5665 if (VECTORP (eol_type
))
5668 object
= coding
->src_object
;
5670 /* Note: composition handling is not yet implemented. */
5671 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5673 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5674 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5677 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5678 && end
<= end_pos
)))
5683 while (buf
< buf_end
)
5691 p
= save_composition_data (buf
, buf_end
, prop
);
5695 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5705 c
= STRING_CHAR_ADVANCE (src
);
5706 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5708 if (! EQ (eol_type
, Qunix
))
5712 if (EQ (eol_type
, Qdos
))
5722 coding
->consumed
= src
- coding
->source
;
5723 coding
->consumed_char
= pos
- coding
->src_pos
;
5724 coding
->charbuf_used
= buf
- coding
->charbuf
;
5725 coding
->chars_at_source
= 0;
5729 /* Encode the text at CODING->src_object into CODING->dst_object.
5730 CODING->src_object is a buffer or a string.
5731 CODING->dst_object is a buffer or nil.
5733 If CODING->src_object is a buffer, it must be the current buffer.
5734 In this case, if CODING->src_pos is positive, it is a position of
5735 the source text in the buffer, otherwise, the source text is in the
5736 gap area of the buffer, and coding->src_pos specifies the offset of
5737 the text from GPT (which must be the same as PT). If this is the
5738 same buffer as CODING->dst_object, CODING->src_pos must be
5739 negative and CODING should not have `pre-write-conversion'.
5741 If CODING->src_object is a string, CODING should not have
5742 `pre-write-conversion'.
5744 If CODING->dst_object is a buffer, the encoded data is inserted at
5745 the current point of that buffer.
5747 If CODING->dst_object is nil, the encoded data is placed at the
5748 memory area specified by CODING->destination. */
5751 encode_coding (coding
)
5752 struct coding_system
*coding
;
5756 attrs
= CODING_ID_ATTRS (coding
->id
);
/* When encoding into a buffer, make it current and take the
   destination multibyteness from that buffer.  */
5758 if (BUFFERP (coding
->dst_object
))
5760 set_buffer_internal (XBUFFER (coding
->dst_object
));
5761 coding
->dst_multibyte
5762 = ! NILP (current_buffer
->enable_multibyte_characters
);
5765 coding
->consumed
= coding
->consumed_char
= 0;
5766 coding
->produced
= coding
->produced_char
= 0;
5767 coding
->result
= CODING_RESULT_SUCCESS
;
5770 ALLOC_CONVERSION_WORK_AREA (coding
);
5773 coding_set_source (coding
);
5774 consume_chars (coding
);
5776 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
/* translate_chars is defined as translate_chars (coding, table):
   the coding system comes first and the translation table second.  */
5777 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
5779 coding_set_destination (coding
);
5780 (*(coding
->encoder
)) (coding
);
5781 } while (coding
->consumed_char
< coding
->src_chars
);
5783 if (BUFFERP (coding
->dst_object
))
5784 insert_from_gap (coding
->produced_char
, coding
->produced
);
5786 return (coding
->result
);
5791 /* List of currently used working buffer. */
5792 Lisp_Object Vcode_conversion_work_buf_list
;
5794 /* A working buffer used by the top level conversion. */
5795 Lisp_Object Vcode_conversion_reused_work_buf
;
5798 /* Return a working buffer that can be freely used by the following
5799 code conversion. MULTIBYTEP specifies the multibyteness of the
5803 make_conversion_work_buffer (multibytep
)
5806 struct buffer
*current
= current_buffer
;
/* No work buffer is in use yet: take the reusable top-level buffer,
   creating it on first use, and start the in-use list with it.  */
5809 if (NILP (Vcode_conversion_work_buf_list
))
5811 if (NILP (Vcode_conversion_reused_work_buf
))
5812 Vcode_conversion_reused_work_buf
5813 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5814 Vcode_conversion_work_buf_list
5815 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
/* Flength returns a Lisp integer; unbox it with XINT before storing
   it in a C int and formatting it with %d.  */
5819 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5822 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5823 Vcode_conversion_work_buf_list
5824 = Fcons (Fget_buffer_create (build_string (str
)),
5825 Vcode_conversion_work_buf_list
);
/* Configure the buffer at the head of the list (undo recording off,
   requested multibyteness), then switch back to the buffer that was
   current on entry.  */
5828 buf
= XCAR (Vcode_conversion_work_buf_list
);
5829 set_buffer_internal (XBUFFER (buf
));
5830 current_buffer
->undo_list
= Qt
;
5832 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
5833 set_buffer_internal (current
);
5837 static struct coding_system
*saved_coding
;
/* Handler registered with record_unwind_protect by the conversion
   entry points.  It removes the head of
   Vcode_conversion_work_buf_list, frees saved_coding->destination
   when saved_coding->dst_object is Qt and a destination is set, and
   returns the value of save_excursion_restore (INFO).  */
5840 code_conversion_restore (info
)
/* Flength returns a Lisp integer; unbox it with XINT before comparing
   it as a C int.  */
5843 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5848 buf
= XCAR (Vcode_conversion_work_buf_list
);
5849 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5850 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
/* Lisp objects are compared with EQ, as elsewhere in this file.  */
5854 if (EQ (saved_coding
->dst_object
, Qt
)
5855 && saved_coding
->destination
)
5856 xfree (saved_coding
->destination
);
5858 return save_excursion_restore (info
);
5863 decode_coding_gap (coding
, chars
, bytes
)
5864 struct coding_system
*coding
;
5865 EMACS_INT chars
, bytes
;
5867 int count
= specpdl_ptr
- specpdl
;
5869 saved_coding
= coding
;
5870 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5872 coding
->src_object
= Fcurrent_buffer ();
5873 coding
->src_chars
= chars
;
5874 coding
->src_bytes
= bytes
;
5875 coding
->src_pos
= -chars
;
5876 coding
->src_pos_byte
= -bytes
;
5877 coding
->src_multibyte
= chars
< bytes
;
5878 coding
->dst_object
= coding
->src_object
;
5879 coding
->dst_pos
= PT
;
5880 coding
->dst_pos_byte
= PT_BYTE
;
5881 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
5883 if (CODING_REQUIRE_DETECTION (coding
))
5884 detect_coding (coding
);
5886 decode_coding (coding
);
5888 unbind_to (count
, Qnil
);
5889 return coding
->result
;
5893 encode_coding_gap (coding
, chars
, bytes
)
5894 struct coding_system
*coding
;
5895 EMACS_INT chars
, bytes
;
5897 int count
= specpdl_ptr
- specpdl
;
5900 saved_coding
= coding
;
5901 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5903 buffer
= Fcurrent_buffer ();
5904 coding
->src_object
= buffer
;
5905 coding
->src_chars
= chars
;
5906 coding
->src_bytes
= bytes
;
5907 coding
->src_pos
= -chars
;
5908 coding
->src_pos_byte
= -bytes
;
5909 coding
->src_multibyte
= chars
< bytes
;
5910 coding
->dst_object
= coding
->src_object
;
5911 coding
->dst_pos
= PT
;
5912 coding
->dst_pos_byte
= PT_BYTE
;
5914 encode_coding (coding
);
5916 unbind_to (count
, Qnil
);
5917 return coding
->result
;
5921 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5922 SRC_OBJECT into DST_OBJECT by coding context CODING.
5924 SRC_OBJECT is a buffer, a string, or Qnil.
5926 If it is a buffer, the text is at point of the buffer. FROM and TO
5927 are positions in the buffer.
5929 If it is a string, the text is at the beginning of the string.
5930 FROM and TO are indices to the string.
5932 If it is nil, the text is at coding->source. FROM and TO are
5933 indices to coding->source.
5935 DST_OBJECT is a buffer, Qt, or Qnil.
5937 If it is a buffer, the decoded text is inserted at point of the
5938 buffer. If the buffer is the same as SRC_OBJECT, the source text
5941 If it is Qt, a string is made from the decoded text, and
5942 set in CODING->dst_object.
5944 If it is Qnil, the decoded text is stored at CODING->destination.
5945 The caller must allocate CODING->dst_bytes bytes at
5946 CODING->destination by xmalloc. If the decoded text is longer than
5947 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5951 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5953 struct coding_system
*coding
;
5954 Lisp_Object src_object
;
5955 EMACS_INT from
, from_byte
, to
, to_byte
;
5956 Lisp_Object dst_object
;
5958 int count
= specpdl_ptr
- specpdl
;
5959 unsigned char *destination
;
5960 EMACS_INT dst_bytes
;
5961 EMACS_INT chars
= to
- from
;
5962 EMACS_INT bytes
= to_byte
- from_byte
;
5965 saved_coding
= coding
;
5966 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5968 if (NILP (dst_object
))
5970 destination
= coding
->destination
;
5971 dst_bytes
= coding
->dst_bytes
;
5974 coding
->src_object
= src_object
;
5975 coding
->src_chars
= chars
;
5976 coding
->src_bytes
= bytes
;
5977 coding
->src_multibyte
= chars
< bytes
;
5979 if (STRINGP (src_object
))
5981 coding
->src_pos
= from
;
5982 coding
->src_pos_byte
= from_byte
;
5984 else if (BUFFERP (src_object
))
5986 set_buffer_internal (XBUFFER (src_object
));
5988 move_gap_both (from
, from_byte
);
5989 if (EQ (src_object
, dst_object
))
5991 TEMP_SET_PT_BOTH (from
, from_byte
);
5992 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5993 coding
->src_pos
= -chars
;
5994 coding
->src_pos_byte
= -bytes
;
5998 coding
->src_pos
= from
;
5999 coding
->src_pos_byte
= from_byte
;
6003 if (CODING_REQUIRE_DETECTION (coding
))
6004 detect_coding (coding
);
6005 attrs
= CODING_ID_ATTRS (coding
->id
);
6007 if (! NILP (CODING_ATTR_POST_READ (attrs
))
6008 || EQ (dst_object
, Qt
))
6010 coding
->dst_object
= make_conversion_work_buffer (1);
6011 coding
->dst_pos
= BEG
;
6012 coding
->dst_pos_byte
= BEG_BYTE
;
6013 coding
->dst_multibyte
= 1;
6015 else if (BUFFERP (dst_object
))
6017 coding
->dst_object
= dst_object
;
6018 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6019 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6020 coding
->dst_multibyte
6021 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6025 coding
->dst_object
= Qnil
;
6026 coding
->dst_multibyte
= 1;
6029 decode_coding (coding
);
6031 if (BUFFERP (coding
->dst_object
))
6032 set_buffer_internal (XBUFFER (coding
->dst_object
));
6034 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6036 struct gcpro gcpro1
, gcpro2
;
6037 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6040 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6041 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6042 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6043 make_number (coding
->produced_char
));
6046 coding
->produced_char
+= Z
- prev_Z
;
6047 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6050 if (EQ (dst_object
, Qt
))
6052 coding
->dst_object
= Fbuffer_string ();
6054 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6056 set_buffer_internal (XBUFFER (coding
->dst_object
));
6057 if (dst_bytes
< coding
->produced
)
6060 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6063 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6064 unbind_to (count
, Qnil
);
6067 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6068 move_gap_both (BEGV
, BEGV_BYTE
);
6069 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6070 coding
->destination
= destination
;
6074 unbind_to (count
, Qnil
);
6079 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6081 struct coding_system
*coding
;
6082 Lisp_Object src_object
;
6083 EMACS_INT from
, from_byte
, to
, to_byte
;
6084 Lisp_Object dst_object
;
6086 int count
= specpdl_ptr
- specpdl
;
6087 EMACS_INT chars
= to
- from
;
6088 EMACS_INT bytes
= to_byte
- from_byte
;
6091 saved_coding
= coding
;
6092 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
6094 coding
->src_object
= src_object
;
6095 coding
->src_chars
= chars
;
6096 coding
->src_bytes
= bytes
;
6097 coding
->src_multibyte
= chars
< bytes
;
6099 attrs
= CODING_ID_ATTRS (coding
->id
);
6101 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6103 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
6104 set_buffer_internal (XBUFFER (coding
->src_object
));
6105 if (STRINGP (src_object
))
6106 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6107 else if (BUFFERP (src_object
))
6108 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6110 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6112 if (EQ (src_object
, dst_object
))
6114 set_buffer_internal (XBUFFER (src_object
));
6115 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6116 set_buffer_internal (XBUFFER (coding
->src_object
));
6119 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6120 make_number (BEG
), make_number (Z
));
6121 coding
->src_object
= Fcurrent_buffer ();
6123 move_gap_both (BEG
, BEG_BYTE
);
6124 coding
->src_chars
= Z
- BEG
;
6125 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6126 coding
->src_pos
= BEG
;
6127 coding
->src_pos_byte
= BEG_BYTE
;
6128 coding
->src_multibyte
= Z
< Z_BYTE
;
6130 else if (STRINGP (src_object
))
6132 coding
->src_pos
= from
;
6133 coding
->src_pos_byte
= from_byte
;
6135 else if (BUFFERP (src_object
))
6137 set_buffer_internal (XBUFFER (src_object
));
6139 move_gap_both (from
, from_byte
);
6140 if (EQ (src_object
, dst_object
))
6142 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6143 coding
->src_pos
= -chars
;
6144 coding
->src_pos_byte
= -bytes
;
6148 coding
->src_pos
= from
;
6149 coding
->src_pos_byte
= from_byte
;
6153 if (BUFFERP (dst_object
))
6155 coding
->dst_object
= dst_object
;
6156 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6157 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6158 coding
->dst_multibyte
6159 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6161 else if (EQ (dst_object
, Qt
))
6163 coding
->dst_object
= Qnil
;
6164 coding
->dst_bytes
= coding
->src_chars
;
6165 if (coding
->dst_bytes
== 0)
6166 coding
->dst_bytes
= 1;
6167 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6168 coding
->dst_multibyte
= 0;
6172 coding
->dst_object
= Qnil
;
6173 coding
->dst_multibyte
= 0;
6176 encode_coding (coding
);
6178 if (EQ (dst_object
, Qt
))
6180 if (BUFFERP (coding
->dst_object
))
6181 coding
->dst_object
= Fbuffer_string ();
6185 = make_unibyte_string ((char *) coding
->destination
,
6187 xfree (coding
->destination
);
6191 unbind_to (count
, Qnil
);
6196 preferred_coding_system ()
6198 int id
= coding_categories
[coding_priorities
[0]].id
;
6200 return CODING_ID_NAME (id
);
6205 /*** 11. Emacs Lisp library functions ***/
6207 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6208 doc
: /* Return t if OBJECT is nil or a coding-system.
6209 See the documentation of `define-coding-system' for information
6210 about coding-system objects. */)
6214 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6217 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6218 Sread_non_nil_coding_system
, 1, 1, 0,
6219 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6226 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6227 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6229 while (XSTRING (val
)->size
== 0);
6230 return (Fintern (val
, Qnil
));
6233 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6234 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6235 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6236 (prompt
, default_coding_system
)
6237 Lisp_Object prompt
, default_coding_system
;
6240 if (SYMBOLP (default_coding_system
))
6241 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6242 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6243 Qt
, Qnil
, Qcoding_system_history
,
6244 default_coding_system
, Qnil
);
6245 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
/* Requires CODING_SYSTEM to be a symbol, returns it unchanged when
   Fcoding_system_p accepts it, and otherwise signals
   Qcoding_system_error with the rejected value as the error data.
   NOTE(review): the doc string's description of validity (a
   `coding-system' property holding a vector of length 5) cannot be
   matched against the check made here -- it may be outdated.  */
6248 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6250 doc
: /* Check validity of CODING-SYSTEM.
6251 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6252 It is valid if it is a symbol with a non-nil `coding-system' property.
6253 The value of property should be a vector of length 5. */)
6255 Lisp_Object coding_system
;
6257 CHECK_SYMBOL (coding_system
);
6258 if (!NILP (Fcoding_system_p (coding_system
)))
6259 return coding_system
;
6261 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
/* Work out which coding systems could have produced the SRC_BYTES
   bytes starting at SRC.  MULTIBYTEP is stored as the source
   multibyteness.  A nil CODING_SYSTEM is treated as Qundecided.
   The result is a list of coding system names, or only its first
   element when HIGHEST is non-zero.
   NOTE(review): several statements of this function are not visible
   in this excerpt; the comments below describe the visible lines
   only.  */
6266 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6268 int src_bytes
, highest
;
6270 Lisp_Object coding_system
;
6272 unsigned char *src_end
= src
+ src_bytes
;
6273 int mask
= CATEGORY_MASK_ANY
;
6276 Lisp_Object attrs
, eol_type
;
6278 struct coding_system coding
;
6280 if (NILP (coding_system
))
6281 coding_system
= Qundecided
;
6282 setup_coding_system (coding_system
, &coding
);
6283 attrs
= CODING_ID_ATTRS (coding
.id
);
6284 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6286 coding
.source
= src
;
6287 coding
.src_bytes
= src_bytes
;
6288 coding
.src_multibyte
= multibytep
;
6289 coding
.consumed
= 0;
/* A coding system whose category is not `undecided' narrows the
   candidate mask to that single category.  */
6291 if (XINT (CODING_ATTR_CATEGORY (attrs
)) != coding_category_undecided
)
6293 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6297 coding_system
= Qnil
;
/* Scan for a byte with the high bit set or one of the ISO escape
   controls tested below; the offset reached is remembered in
   coding.head_ascii.  */
6298 for (; src
< src_end
; src
++)
6301 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
6303 || c
== ISO_CODE_SO
)))
6306 coding
.head_ascii
= src
- coding
.source
;
/* Try the categories below raw-text in priority order, calling each
   category's detector and updating MASK with detected_mask.  */
6309 for (i
= 0; i
< coding_category_raw_text
; i
++)
6311 enum coding_category category
= coding_priorities
[i
];
6312 struct coding_system
*this = coding_categories
+ category
;
6314 if (category
>= coding_category_raw_text
6315 || detected
& (1 << category
))
6320 /* No coding system of this category is defined. */
6321 mask
&= ~(1 << category
);
6325 detected
|= detected_mask
[category
];
6326 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6329 mask
&= detected_mask
[category
];
/* Turn MASK into a list of category numbers.  */
6337 val
= Fcons (make_number (coding_category_raw_text
), Qnil
);
6338 else if (mask
== CATEGORY_MASK_ANY
)
6339 val
= Fcons (make_number (coding_category_undecided
), Qnil
);
6342 for (i
= 0; i
< coding_category_raw_text
; i
++)
6343 if (mask
& (1 << coding_priorities
[i
]))
6345 val
= Fcons (make_number (coding_priorities
[i
]), Qnil
);
/* Walking the priorities backwards while consing at the front leaves
   the highest-priority category first in VAL.  */
6352 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6353 if (mask
& (1 << coding_priorities
[i
]))
6354 val
= Fcons (make_number (coding_priorities
[i
]), val
);
/* End-of-line detection results are cached here (-1 means not yet
   computed): one value for utf-16 typed coding systems, one for all
   the others.  */
6358 int one_byte_eol
= -1, two_byte_eol
= -1;
/* Replace each category number in VAL by a coding system name.  */
6361 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6363 struct coding_system
*this
6364 = (NILP (coding_system
) ? coding_categories
+ XINT (XCAR (tail
))
6368 attrs
= CODING_ID_ATTRS (this->id
);
6369 eol_type
= CODING_ID_EOL_TYPE (this->id
);
6370 XSETCAR (tail
, CODING_ID_NAME (this->id
));
/* A vector eol_type holds the subsidiary coding systems; pick the one
   matching the detected end-of-line convention, if any.  */
6371 if (VECTORP (eol_type
))
6373 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_16
))
6375 if (two_byte_eol
< 0)
6376 two_byte_eol
= detect_eol (this, coding
.source
, src_bytes
);
6377 this_eol
= two_byte_eol
;
6381 if (one_byte_eol
< 0)
6382 one_byte_eol
=detect_eol (this, coding
.source
, src_bytes
);
6383 this_eol
= one_byte_eol
;
/* Element 0 is used for LF, 1 for CRLF and 2 for CR.  */
6385 if (this_eol
== EOL_SEEN_LF
)
6386 XSETCAR (tail
, AREF (eol_type
, 0));
6387 else if (this_eol
== EOL_SEEN_CRLF
)
6388 XSETCAR (tail
, AREF (eol_type
, 1));
6389 else if (this_eol
== EOL_SEEN_CR
)
6390 XSETCAR (tail
, AREF (eol_type
, 2));
6395 return (highest
? XCAR (val
) : val
);
/* Lisp entry point for detection on buffer text: validates the
   region, converts it to byte positions and hands the bytes to
   detect_coding_system together with the current buffer's
   multibyteness.  */
6399 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6401 doc
: /* Detect coding system of the text in the region between START and END.
6402 Return a list of possible coding systems ordered by priority.
6404 If only ASCII characters are found, it returns a list of single element
6405 `undecided' or its subsidiary coding system according to a detected
6408 If optional argument HIGHEST is non-nil, return the coding system of
6409 highest priority. */)
6410 (start
, end
, highest
)
6411 Lisp_Object start
, end
, highest
;
6414 int from_byte
, to_byte
;
6416 CHECK_NUMBER_COERCE_MARKER (start
);
6417 CHECK_NUMBER_COERCE_MARKER (end
);
6419 validate_region (&start
, &end
);
6420 from
= XINT (start
), to
= XINT (end
);
6421 from_byte
= CHAR_TO_BYTE (from
);
6422 to_byte
= CHAR_TO_BYTE (to
);
/* When the region straddles the gap, the gap is moved to TO --
   presumably so that the bytes passed on are contiguous in memory.  */
6424 if (from
< GPT
&& to
>= GPT
)
6425 move_gap_both (to
, to_byte
);
6427 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6428 to_byte
- from_byte
,
6430 !NILP (current_buffer
6431 ->enable_multibyte_characters
),
/* Lisp entry point for detection on a string: checks that STRING is a
   string and passes its data, byte length and multibyteness to
   detect_coding_system.  */
6435 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6437 doc
: /* Detect coding system of the text in STRING.
6438 Return a list of possible coding systems ordered by priority.
6440 If only ASCII characters are found, it returns a list of single element
6441 `undecided' or its subsidiary coding system according to a detected
6444 If optional argument HIGHEST is non-nil, return the coding system of
6445 highest priority. */)
6447 Lisp_Object string
, highest
;
6449 CHECK_STRING (string
);
6451 return detect_coding_system (XSTRING (string
)->data
,
6452 STRING_BYTES (XSTRING (string
)),
6454 STRING_MULTIBYTE (string
),
/* Walk the charset list stored in ATTRS, testing character C against
   each charset with CHAR_CHARSET_P.  The result is non-zero when TAIL
   is not nil after the walk.
   NOTE(review): the statement that leaves the loop on a match is not
   visible in this excerpt; presumably a `break' -- confirm.  */
6460 char_encodable_p (c
, attrs
)
6465 struct charset
*charset
;
6467 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6468 CONSP (tail
); tail
= XCDR (tail
))
6470 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6471 if (CHAR_CHARSET_P (c
, charset
))
6474 return (! NILP (tail
));
6478 /* Return a list of coding systems that safely encode the text between
6479 START and END. If EXCLUDE is non-nil, it is a list of coding
6480 systems not to check. The returned list doesn't contain any such
6481 coding systems. In any case, if the text contains only ASCII or is
6482 unibyte, return t. */
6484 DEFUN ("find-coding-systems-region-internal",
6485 Ffind_coding_systems_region_internal
,
6486 Sfind_coding_systems_region_internal
, 2, 3, 0,
6487 doc
: /* Internal use only. */)
6488 (start
, end
, exclude
)
6489 Lisp_Object start
, end
, exclude
;
6491 Lisp_Object coding_attrs_list
, safe_codings
;
6492 EMACS_INT start_byte
, end_byte
;
6493 unsigned char *p
, *pbeg
, *pend
;
6495 Lisp_Object tail
, elt
;
/* START may be a string instead of a buffer position; END_BYTE is
   then taken from the string's byte length.  */
6497 if (STRINGP (start
))
6499 if (!STRING_MULTIBYTE (start
)
6500 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6503 end_byte
= STRING_BYTES (XSTRING (start
));
/* Buffer case: both positions must lie within BEG..Z and be ordered.  */
6507 CHECK_NUMBER_COERCE_MARKER (start
);
6508 CHECK_NUMBER_COERCE_MARKER (end
);
6509 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6510 args_out_of_range (start
, end
);
6511 if (NILP (current_buffer
->enable_multibyte_characters
))
6513 start_byte
= CHAR_TO_BYTE (XINT (start
));
6514 end_byte
= CHAR_TO_BYTE (XINT (end
));
/* Equal character and byte lengths are tested here; the action taken
   is not visible in this excerpt.  */
6515 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* NOTE(review): START and END are Lisp_Objects (XINT is applied to
   them just above), yet here they are compared with GPT and passed to
   move_gap_both without XINT -- confirm this is intended.  */
6518 if (start
< GPT
&& end
> GPT
)
6520 if ((GPT
- start
) < (end
- GPT
))
6521 move_gap_both (start
, start_byte
);
6523 move_gap_both (end
, end_byte
);
/* Collect the attribute vectors of the coding systems in
   Vcoding_system_list that are their own base name and whose type is
   not `undecided'; EXCLUDE is consulted through Fmemq.  */
6527 coding_attrs_list
= Qnil
;
6528 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6530 || NILP (Fmemq (XCAR (tail
), exclude
)))
6534 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
6535 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6536 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6537 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6540 if (STRINGP (start
))
6541 p
= pbeg
= XSTRING (start
)->data
;
6543 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6544 pend
= p
+ (end_byte
- start_byte
);
/* Skip the ASCII bytes at both ends of the text.  */
6546 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6547 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6551 if (ASCII_BYTE_P (*p
))
6555 c
= STRING_CHAR_ADVANCE (p
);
6557 charset_map_loaded
= 0;
/* Check C against every remaining candidate.  A candidate that fails
   is removed in place: its cell takes over the contents of the next
   cell, or gets a nil car when it is the last cell.  */
6558 for (tail
= coding_attrs_list
; CONSP (tail
);)
6563 else if (char_encodable_p (c
, elt
))
6565 else if (CONSP (XCDR (tail
)))
6567 XSETCAR (tail
, XCAR (XCDR (tail
)));
6568 XSETCDR (tail
, XCDR (XCDR (tail
)));
6572 XSETCAR (tail
, Qnil
);
/* When charset_map_loaded became non-zero during the checks, the
   text pointers are recomputed from their saved offsets -- presumably
   because loading a map may relocate the text; confirm.  */
6576 if (charset_map_loaded
)
6578 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6580 if (STRINGP (start
))
6581 pbeg
= XSTRING (start
)->data
;
6583 pbeg
= BYTE_POS_ADDR (start_byte
);
6584 p
= pbeg
+ p_offset
;
6585 pend
= pbeg
+ pend_offset
;
/* Return the base names of the candidates that were not set to nil.  */
6590 safe_codings
= Qnil
;
6591 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6592 if (! NILP (XCAR (tail
)))
6593 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6595 return safe_codings
;
6599 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6600 Scheck_coding_systems_region
, 3, 3, 0,
6601 doc
: /* Check if the region is encodable by coding systems.
6603 START and END are buffer positions specifying the region.
6604 CODING-SYSTEM-LIST is a list of coding systems to check.
6606 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6607 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
6608 whole region, POS0, POS1, ... are buffer positions where non-encodable
6609 characters are found.
6611 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6614 START may be a string. In that case, check if the string is
6615 encodable, and the value contains indices to the string instead of
6616 buffer positions. END is ignored. */)
6617 (start
, end
, coding_system_list
)
6618 Lisp_Object start
, end
, coding_system_list
;
6621 EMACS_INT start_byte
, end_byte
;
6623 unsigned char *p
, *pbeg
, *pend
;
6625 Lisp_Object tail
, elt
;
/* String case: END_BYTE comes from the string's byte length.  */
6627 if (STRINGP (start
))
6629 if (!STRING_MULTIBYTE (start
)
6630 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6633 end_byte
= STRING_BYTES (XSTRING (start
));
/* Buffer case: both positions must lie within BEG..Z and be ordered.  */
6638 CHECK_NUMBER_COERCE_MARKER (start
);
6639 CHECK_NUMBER_COERCE_MARKER (end
);
6640 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6641 args_out_of_range (start
, end
);
6642 if (NILP (current_buffer
->enable_multibyte_characters
))
6644 start_byte
= CHAR_TO_BYTE (XINT (start
));
6645 end_byte
= CHAR_TO_BYTE (XINT (end
));
6646 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* NOTE(review): START and END are Lisp_Objects (XINT is applied to
   them just above), yet here they are compared with GPT and passed to
   move_gap_both without XINT -- confirm this is intended.  */
6649 if (start
< GPT
&& end
> GPT
)
6651 if ((GPT
- start
) < (end
- GPT
))
6652 move_gap_both (start
, start_byte
);
6654 move_gap_both (end
, end_byte
);
/* Build LIST with one entry per requested coding system; the visible
   part of each entry pairs the coding system with element 0 of its
   spec (its attribute vector).  */
6660 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6663 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6668 if (STRINGP (start
))
6669 p
= pbeg
= XSTRING (start
)->data
;
6671 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6672 pend
= p
+ (end_byte
- start_byte
);
/* Skip the ASCII bytes at both ends; POS is advanced together with P
   at the front.  */
6674 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6675 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6679 if (ASCII_BYTE_P (*p
))
6683 c
= STRING_CHAR_ADVANCE (p
);
6685 charset_map_loaded
= 0;
/* Push POS onto the entry of every coding system that cannot encode
   C.  Positions are pushed at the front, so they are reversed when
   the result is assembled below.  */
6686 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6688 elt
= XCDR (XCAR (tail
));
6689 if (! char_encodable_p (c
, XCAR (elt
)))
6690 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* When charset_map_loaded became non-zero during the checks, the
   text pointers are recomputed from their saved offsets -- presumably
   because loading a map may relocate the text; confirm.  */
6692 if (charset_map_loaded
)
6694 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6696 if (STRINGP (start
))
6697 pbeg
= XSTRING (start
)->data
;
6699 pbeg
= BYTE_POS_ADDR (start_byte
);
6700 p
= pbeg
+ p_offset
;
6701 pend
= pbeg
+ pend_offset
;
/* Only coding systems with at least one recorded position end up in
   the result.  */
6709 for (; CONSP (tail
); tail
= XCDR (tail
))
6712 if (CONSP (XCDR (XCDR (elt
))))
6713 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
/* Convert the text between START and END of the current buffer with
   CODING_SYSTEM.  The callers visible in this file pass ENCODEP = 1
   to encode and 0 to decode.  DST_OBJECT nil means the current buffer
   itself; otherwise it must be t or a buffer.  The result is the
   number of produced characters when DST_OBJECT is a buffer, and
   coding.dst_object otherwise.  An error is signalled when the
   conversion does not end with CODING_RESULT_SUCCESS.
   NOTE(review): how NORECORD affects the assignment to
   Vlast_coding_system_used is not visible in this excerpt.  */
6723 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6724 Lisp_Object start
, end
, coding_system
, dst_object
;
6725 int encodep
, norecord
;
6727 struct coding_system coding
;
6728 EMACS_INT from
, from_byte
, to
, to_byte
;
6729 Lisp_Object src_object
;
6731 CHECK_NUMBER_COERCE_MARKER (start
);
6732 CHECK_NUMBER_COERCE_MARKER (end
);
6733 if (NILP (coding_system
))
6734 coding_system
= Qno_conversion
;
6736 CHECK_CODING_SYSTEM (coding_system
);
6737 src_object
= Fcurrent_buffer ();
6738 if (NILP (dst_object
))
6739 dst_object
= src_object
;
6740 else if (! EQ (dst_object
, Qt
))
6741 CHECK_BUFFER (dst_object
);
6743 validate_region (&start
, &end
);
6744 from
= XFASTINT (start
);
6745 from_byte
= CHAR_TO_BYTE (from
);
6746 to
= XFASTINT (end
);
6747 to_byte
= CHAR_TO_BYTE (to
);
/* The whole region is converted in one go, so it is flagged as the
   last block.  */
6749 setup_coding_system (coding_system
, &coding
);
6750 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6753 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6756 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6759 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6761 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6762 error ("Code conversion error: %d", coding
.result
);
6764 return (BUFFERP (dst_object
)
6765 ? make_number (coding
.produced_char
)
6766 : coding
.dst_object
);
/* Thin wrapper: forwards to code_convert_region with ENCODEP = 0 and
   NORECORD = 0.
   NOTE(review): code_convert_region returns a character count only
   when the destination is a buffer; with DESTINATION t it returns
   coding.dst_object, which the last doc sentence does not mention.  */
6770 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6771 3, 4, "r\nzCoding system: ",
6772 doc
: /* Decode the current region from the specified coding system.
6773 When called from a program, takes four arguments:
6774 START, END, CODING-SYSTEM, and DESTINATION.
6775 START and END are buffer positions.
6777 Optional 4th argument DESTINATION specifies where the decoded text goes.
6778 If nil, the region between START and END is replaced by the decoded text.
6779 If buffer, the decoded text is inserted in the buffer.
6780 If t, the decoded text is returned.
6782 This function sets `last-coding-system-used' to the precise coding system
6783 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6784 not fully specified.)
6785 It returns the length of the decoded text. */)
6786 (start
, end
, coding_system
, destination
)
6787 Lisp_Object start
, end
, coding_system
, destination
;
6789 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
/* Thin wrapper: forwards to code_convert_region with ENCODEP = 1 and
   NORECORD = 0.
   NOTE(review): code_convert_region returns a character count only
   when the destination is a buffer; with DESTINATION t it returns
   coding.dst_object, which the last doc sentence does not mention.  */
6792 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6793 3, 4, "r\nzCoding system: ",
6794 doc
: /* Encode the current region by specified coding system.
6795 When called from a program, takes four arguments:
6796 START, END, CODING-SYSTEM, and DESTINATION. START and END are buffer positions.
6798 Optional 4th argument DESTINATION specifies where the encoded text goes.
6799 If nil, the region between START and END is replaced by the encoded text.
6800 If buffer, the encoded text is inserted in the buffer.
6801 If t, the encoded text is returned.
6803 This function sets `last-coding-system-used' to the precise coding system
6804 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6805 not fully specified.)
6806 It returns the length of the encoded text. */)
6807 (start
, end
, coding_system
, destination
)
6808 Lisp_Object start
, end
, coding_system
, destination
;
6810 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
/* Convert STRING with CODING_SYSTEM.  The callers visible in this
   file pass ENCODEP = 0 to decode.  DST_OBJECT, when neither nil nor
   t, must be a buffer.  The result is the number of produced
   characters when DST_OBJECT is a buffer, and coding.dst_object
   otherwise.  An error is signalled when the conversion does not end
   with CODING_RESULT_SUCCESS.
   NOTE(review): how NORECORD affects the assignments to
   Vlast_coding_system_used is not visible in this excerpt.  */
6814 code_convert_string (string
, coding_system
, dst_object
,
6815 encodep
, nocopy
, norecord
)
6816 Lisp_Object string
, coding_system
, dst_object
;
6817 int encodep
, nocopy
, norecord
;
6819 struct coding_system coding
;
6820 EMACS_INT chars
, bytes
;
6822 CHECK_STRING (string
);
6823 if (NILP (coding_system
))
6826 Vlast_coding_system_used
= Qno_conversion
;
/* NOTE(review): this copies STRING when NOCOPY is non-zero and
   returns STRING itself otherwise.  The doc strings of
   decode-coding-string and encode-coding-string say NOCOPY non-nil
   means STRING itself may be returned, so the two arms look swapped
   -- confirm.  */
6827 if (NILP (dst_object
))
6828 return (nocopy
? Fcopy_sequence (string
) : string
);
6831 if (NILP (coding_system
))
6832 coding_system
= Qno_conversion
;
6834 CHECK_CODING_SYSTEM (coding_system
);
6835 if (NILP (dst_object
))
6837 else if (! EQ (dst_object
, Qt
))
6838 CHECK_BUFFER (dst_object
);
/* The whole string is converted in one go, so it is flagged as the
   last block.  */
6840 setup_coding_system (coding_system
, &coding
);
6841 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6842 chars
= XSTRING (string
)->size
;
6843 bytes
= STRING_BYTES (XSTRING (string
));
6845 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6847 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6849 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6851 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6852 error ("Code conversion error: %d", coding
.result
);
6854 return (BUFFERP (dst_object
)
6855 ? make_number (coding
.produced_char
)
6856 : coding
.dst_object
);
6860 /* Encode or decode STRING according to CODING_SYSTEM.
6861 Do not set Vlast_coding_system_used.
6863 This function is called only from macros DECODE_FILE and
6864 ENCODE_FILE, thus we ignore character composition. */
/* Implemented as a call to code_convert_string with DST_OBJECT = Qt,
   NOCOPY = 0 and NORECORD = 1.  */
6867 code_convert_string_norecord (string
, coding_system
, encodep
)
6868 Lisp_Object string
, coding_system
;
6871 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
/* Thin wrapper: forwards to code_convert_string with BUFFER as the
   destination, ENCODEP = 0, NOCOPY = !NILP (nocopy) and NORECORD = 0.
   NOTE(review): when the destination is a buffer, code_convert_string
   returns make_number (coding.produced_char), not the buffer as the
   doc string states -- confirm which is intended.  */
6875 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6877 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6879 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6880 if the decoding operation is trivial.
6882 Optional fourth arg BUFFER non-nil means that the decoded text is
6883 inserted in BUFFER instead of returned as a string. In this case,
6884 the return value is BUFFER.
6886 This function sets `last-coding-system-used' to the precise coding system
6887 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6888 not fully specified.) */)
6889 (string
, coding_system
, nocopy
, buffer
)
6890 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6892 return code_convert_string (string
, coding_system
, buffer
,
6893 0, ! NILP (nocopy
), 0);
/* Lisp entry point for encoding a string; forwards to
   code_convert_string with BUFFER as the destination.  */
6896 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6898 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6900 Optional third arg NOCOPY non-nil means it is OK to return STRING
6901 itself if the encoding operation is trivial.
6903 Optional fourth arg BUFFER non-nil means that the encoded text is
6904 inserted in BUFFER instead of returned as a string. In this case,
6905 the return value is BUFFER.
6907 This function sets `last-coding-system-used' to the precise coding system
6908 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6909 not fully specified.) */)
6910 (string
, coding_system
, nocopy
, buffer
)
6911 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* NOTE(review): the fourth argument of code_convert_string is the int
   ENCODEP, but the Lisp_Object NOCOPY is passed in that position;
   decode-coding-string passes the literal 0 there, so a literal 1
   looks intended.  The last argument (NORECORD) is 1 here, whereas
   the doc string says `last-coding-system-used' is set -- confirm
   both.  */
6913 return code_convert_string (string
, coding_system
, buffer
,
6914 nocopy
, ! NILP (nocopy
), 1);
/* CODE must be a natural number.  The charsets used for decoding are
   taken from the attributes of Vsjis_coding_system.  */
6918 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6919 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6920 Return the corresponding character. */)
6924 Lisp_Object spec
, attrs
, val
;
6925 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6928 CHECK_NATNUM (code
);
6929 c
= XFASTINT (code
);
6930 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6931 attrs
= AREF (spec
, 0);
/* ASCII bytes get separate treatment when the coding system is marked
   ASCII compatible; the action taken is not visible in this excerpt.  */
6933 if (ASCII_BYTE_P (c
)
6934 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* The charset list is read positionally: first roman, second kana,
   third kanji.  */
6937 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6938 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6939 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6940 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6943 charset
= charset_roman
;
/* NOTE(review): the upper bound is exclusive, so 0xDF itself is not
   routed to the kana charset -- confirm this is intended.  */
6944 else if (c
>= 0xA0 && c
< 0xDF)
6946 charset
= charset_kana
;
/* Two-byte code: S1 is the high byte and S2 the low byte.  S1 must be
   in 0x81..0x9F or 0xE0..0xEF, and S2 in 0x40..0xFC but not 0x7F.  */
6951 int s1
= c
>> 8, s2
= c
& 0xFF;
6953 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
6954 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
/* NOTE(review): CODE is the Lisp argument (XFASTINT is applied to it
   above) but is handed to a %d conversion as is -- confirm.  */
6955 error ("Invalid code: %d", code
);
6957 charset
= charset_kanji
;
6959 c
= DECODE_CHAR (charset
, c
);
6961 error ("Invalid code: %d", code
);
6962 return make_number (c
);
/* CH must be a character.  The charset list comes from the attributes
   of Vsjis_coding_system; char_charset looks the character up in that
   list and stores its code in CODE.  An error is signalled when CODE
   equals CHARSET_INVALID_CODE for the charset found.  */
6966 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
6967 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
6968 Return the corresponding code in SJIS. */)
6972 Lisp_Object spec
, attrs
, charset_list
;
6974 struct charset
*charset
;
6977 CHECK_CHARACTER (ch
);
6979 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6980 attrs
= AREF (spec
, 0);
/* ASCII characters get separate treatment when the coding system is
   marked ASCII compatible; the action taken is not visible in this
   excerpt.  */
6982 if (ASCII_CHAR_P (c
)
6983 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6986 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
6987 charset
= char_charset (c
, charset_list
, &code
);
6988 if (code
== CHARSET_INVALID_CODE (charset
))
6989 error ("Can't encode by shift_jis encoding: %d", c
);
6992 return make_number (code
);
/* CODE must be a natural number.  The charsets used for decoding are
   taken from the attributes of Vbig5_coding_system.  */
6995 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
6996 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
6997 Return the corresponding character. */)
7001 Lisp_Object spec
, attrs
, val
;
7002 struct charset
*charset_roman
, *charset_big5
, *charset
;
7005 CHECK_NATNUM (code
);
7006 c
= XFASTINT (code
);
7007 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7008 attrs
= AREF (spec
, 0);
/* ASCII bytes get separate treatment when the coding system is marked
   ASCII compatible; the action taken is not visible in this excerpt.  */
7010 if (ASCII_BYTE_P (c
)
7011 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* The charset list is read positionally: first roman, second Big5.  */
7014 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7015 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7016 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7019 charset
= charset_roman
;
/* NOTE(review): B2 is masked with 0x7F, so it can never exceed 0x7F;
   the comparisons against 0xA1 and 0xFE below can then only reject
   0x7F, and low bytes 0xA1..0xFE are folded onto 0x21..0x7E before the
   `< 0x40' test.  A mask of 0xFF looks intended -- confirm.  */
7022 int b1
= c
>> 8, b2
= c
& 0x7F;
7023 if (b1
< 0xA1 || b1
> 0xFE
7024 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
/* NOTE(review): CODE is the Lisp argument (XFASTINT is applied to it
   above) but is handed to a %d conversion as is -- confirm.  */
7025 error ("Invalid code: %d", code
);
7026 charset
= charset_big5
;
7028 c
= DECODE_CHAR (charset
, (unsigned )c
);
7030 error ("Invalid code: %d", code
);
7031 return make_number (c
);
/* CH must be a character.  The charset list comes from the attributes
   of Vbig5_coding_system; char_charset looks the character up in that
   list and stores its code in CODE.  An error is signalled when CODE
   equals CHARSET_INVALID_CODE for the charset found.  */
7034 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7035 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7036 Return the corresponding character code in Big5. */)
7040 Lisp_Object spec
, attrs
, charset_list
;
7041 struct charset
*charset
;
7045 CHECK_CHARACTER (ch
);
7047 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7048 attrs
= AREF (spec
, 0);
/* ASCII characters get separate treatment when the coding system is
   marked ASCII compatible; the action taken is not visible in this
   excerpt.  */
7049 if (ASCII_CHAR_P (c
)
7050 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7053 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7054 charset
= char_charset (c
, charset_list
, &code
);
7055 if (code
== CHARSET_INVALID_CODE (charset
))
7056 error ("Can't encode by Big5 encoding: %d", c
);
7058 return make_number (code
);
/* Validates CODING_SYSTEM with Fcheck_coding_system and sets up a
   coding structure from it (the destination argument is not visible
   in this excerpt; presumably terminal_coding), then adjusts
   terminal_coding: safe-encoding mode on, composition annotation off,
   multibyte source, unibyte destination.  */
7062 DEFUN ("set-terminal-coding-system-internal",
7063 Fset_terminal_coding_system_internal
,
7064 Sset_terminal_coding_system_internal
, 1, 1, 0,
7065 doc
: /* Internal use only. */)
7067 Lisp_Object coding_system
;
7069 CHECK_SYMBOL (coding_system
);
7070 setup_coding_system (Fcheck_coding_system (coding_system
),
7073 /* We had better not send unsafe characters to terminal. */
7074 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7075 /* Character composition should be disabled. */
7076 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7077 terminal_coding
.src_multibyte
= 1;
7078 terminal_coding
.dst_multibyte
= 0;
/* Validates CODING_SYSTEM with Fcheck_coding_system, sets up
   safe_terminal_coding from it, then turns composition annotation off
   and marks the source as multibyte and the destination as unibyte.  */
7082 DEFUN ("set-safe-terminal-coding-system-internal",
7083 Fset_safe_terminal_coding_system_internal
,
7084 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7085 doc
: /* Internal use only. */)
7087 Lisp_Object coding_system
;
7089 CHECK_SYMBOL (coding_system
);
7090 setup_coding_system (Fcheck_coding_system (coding_system
),
7091 &safe_terminal_coding
);
7092 /* Character composition should be disabled. */
7093 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7094 safe_terminal_coding
.src_multibyte
= 1;
7095 safe_terminal_coding
.dst_multibyte
= 0;
/* Accessor: reports the name recorded for terminal_coding.id.  */
7099 DEFUN ("terminal-coding-system",
7100 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7101 doc
: /* Return coding system specified for terminal output. */)
7104 return CODING_ID_NAME (terminal_coding
.id
);
/* Validates CODING_SYSTEM with Fcheck_coding_system and sets up a
   coding structure from it (the destination argument is not visible
   in this excerpt; presumably keyboard_coding), then turns composition
   annotation off on keyboard_coding.  */
7107 DEFUN ("set-keyboard-coding-system-internal",
7108 Fset_keyboard_coding_system_internal
,
7109 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7110 doc
: /* Internal use only. */)
7112 Lisp_Object coding_system
;
7114 CHECK_SYMBOL (coding_system
);
7115 setup_coding_system (Fcheck_coding_system (coding_system
),
7117 /* Character composition should be disabled. */
7118 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Accessor: reports the name recorded for keyboard_coding.id.  */
7122 DEFUN ("keyboard-coding-system",
7123 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7124 doc
: /* Return coding system specified for decoding keyboard input. */)
7127 return CODING_ID_NAME (keyboard_coding
.id
);
7131 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7132 Sfind_operation_coding_system
, 1, MANY
, 0,
7133 doc
: /* Choose a coding system for an operation based on the target name.
7134 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7135 DECODING-SYSTEM is the coding system to use for decoding
7136 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7137 for encoding (in case OPERATION does encoding).
7139 The first argument OPERATION specifies an I/O primitive:
7140 For file I/O, `insert-file-contents' or `write-region'.
7141 For process I/O, `call-process', `call-process-region', or `start-process'.
7142 For network I/O, `open-network-stream'.
7144 The remaining arguments should be the same arguments that were passed
7145 to the primitive. Depending on which primitive, one of those arguments
7146 is selected as the TARGET. For example, if OPERATION does file I/O,
7147 whichever argument specifies the file name is TARGET.
7149 TARGET has a meaning which depends on OPERATION:
7150 For file I/O, TARGET is a file name.
7151 For process I/O, TARGET is a process name.
7152 For network I/O, TARGET is a service name or a port number
7154 This function looks up what is specified for TARGET in
7155 `file-coding-system-alist', `process-coding-system-alist',
7156 or `network-coding-system-alist' depending on OPERATION.
7157 They may specify a coding system, a cons of coding systems,
7158 or a function symbol to call.
7159 In the last case, we call the function with one argument,
7160 which is a list of all the arguments given to this function.
7162 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7167 Lisp_Object operation
, target_idx
, target
, val
;
7168 register Lisp_Object chain
;
7171 error ("Too few arguments");
/* OPERATION must be a symbol whose Qtarget_idx property is an
   integer; that integer selects which of the remaining arguments is
   TARGET.
   NOTE(review): "arguement" in the message below is misspelled; it is
   runtime text and is left untouched here.  */
7172 operation
= args
[0];
7173 if (!SYMBOLP (operation
)
7174 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7175 error ("Invalid first arguement");
7176 if (nargs
< 1 + XINT (target_idx
))
7177 error ("Too few arguments for operation: %s",
7178 XSYMBOL (operation
)->name
->data
);
/* TARGET must be a string, except that open-network-stream also
   accepts an integer.  */
7179 target
= args
[XINT (target_idx
) + 1];
7180 if (!(STRINGP (target
)
7181 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7182 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist to search: the file alist for insert-file-contents
   and write-region, the network alist for open-network-stream, and
   the process alist for everything else.  */
7184 chain
= ((EQ (operation
, Qinsert_file_contents
)
7185 || EQ (operation
, Qwrite_region
))
7186 ? Vfile_coding_system_alist
7187 : (EQ (operation
, Qopen_network_stream
)
7188 ? Vnetwork_coding_system_alist
7189 : Vprocess_coding_system_alist
));
/* An alist entry applies when its car is a string that matches TARGET
   through fast_string_match, or when TARGET is an integer and is EQ to
   the car.  */
7193 for (; CONSP (chain
); chain
= XCDR (chain
))
7199 && ((STRINGP (target
)
7200 && STRINGP (XCAR (elt
))
7201 && fast_string_match (XCAR (elt
), target
) >= 0)
7202 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7205 /* Here, if VAL is both a valid coding system and a valid
7206 function symbol, we return VAL as a coding system. */
7209 if (! SYMBOLP (val
))
7211 if (! NILP (Fcoding_system_p (val
)))
7212 return Fcons (val
, val
);
/* A function symbol is called with the list of all arguments; a
   symbol result that is a coding system is returned as a pair.  */
7213 if (! NILP (Ffboundp (val
)))
7215 val
= call1 (val
, Flist (nargs
, args
));
7218 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7219 return Fcons (val
, val
);
/* Builds a new priority order in PRIORITIES and copies it over
   coding_priorities at the end.  CHANGED marks the categories already
   claimed by an argument.  */
7227 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7228 Sset_coding_system_priority
, 1, MANY
, 0,
7229 doc
: /* Assign higher priority to the coding systems given as arguments.
7230 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7236 int changed
[coding_category_max
];
7237 enum coding_category priorities
[coding_category_max
];
7239 bzero (changed
, sizeof changed
);
7241 for (i
= j
= 0; i
< nargs
; i
++)
7243 enum coding_category category
;
7244 Lisp_Object spec
, attrs
;
7246 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7247 attrs
= AREF (spec
, 0);
7248 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7249 if (changed
[category
])
7250 /* Ignore this coding system because a coding system of the
7251 same category already had a higher priority. */
7253 changed
[category
] = 1;
7254 priorities
[j
++] = category
;
/* If the category currently holds a different coding system, set the
   category's slot up anew from this argument.  */
7255 if (coding_categories
[category
].id
>= 0
7256 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7257 setup_coding_system (args
[i
], &coding_categories
[category
]);
7260 /* Now we have decided top J priorities. Reflect the order of the
7261 original priorities to the remaining priorities. */
7263 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7265 while (j
< coding_category_max
7266 && changed
[coding_priorities
[j
]])
7268 if (j
== coding_category_max
)
7270 priorities
[i
] = coding_priorities
[j
];
7273 bcopy (priorities
, coding_priorities
, sizeof priorities
);
/* Walks coding_priorities from the first slot.  For each category the
   base name of its coding system is either returned at once (HIGHESTP
   non-nil) or pushed onto VAL; VAL is reversed before it is returned,
   which restores priority order.  */
7277 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7278 Scoding_system_priority_list
, 0, 1, 0,
7279 doc
: /* Return a list of coding systems ordered by their priorities.
7280 HIGHESTP non-nil means just return the highest priority one. */)
7282 Lisp_Object highestp
;
7287 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7289 enum coding_category category
= coding_priorities
[i
];
7290 int id
= coding_categories
[category
].id
;
7295 attrs
= CODING_ID_ATTRS (id
);
7296 if (! NILP (highestp
))
7297 return CODING_ATTR_BASE_NAME (attrs
);
7298 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7300 return Fnreverse (val
);
/* Name suffixes appended to a base coding system name by
   make_subsidiaries, in the order of the vector it returns.  */
7303 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
/* Return a vector of three symbols whose names are the name of BASE
   followed by each entry of suffixes, in that order.  */
7306 make_subsidiaries (base
)
7309 Lisp_Object subsidiaries
;
7310 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
/* The extra 6 bytes hold the longest suffix ("-unix", 5 bytes) plus
   its terminating NUL.  */
7311 char *buf
= (char *) alloca (base_name_len
+ 6);
7314 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7315 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7316 for (i
= 0; i
< 3; i
++)
/* Copying strlen + 1 bytes includes the suffix's NUL, so BUF is a
   terminated C string when it is interned.  */
7318 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7319 ASET (subsidiaries
, i
, intern (buf
));
7321 return subsidiaries
;
7325 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7326 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7327 doc
: /* For internal use only.
7328 usage: (define-coding-system-internal ...) */)
7334 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7335 Lisp_Object attrs
; /* Vector of attributes. */
7336 Lisp_Object eol_type
;
7337 Lisp_Object aliases
;
7338 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7339 enum coding_category category
;
7340 Lisp_Object tail
, val
;
7341 int max_charset_id
= 0;
7344 if (nargs
< coding_arg_max
)
7347 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7349 name
= args
[coding_arg_name
];
7350 CHECK_SYMBOL (name
);
7351 CODING_ATTR_BASE_NAME (attrs
) = name
;
7353 val
= args
[coding_arg_mnemonic
];
7354 if (! STRINGP (val
))
7355 CHECK_CHARACTER (val
);
7356 CODING_ATTR_MNEMONIC (attrs
) = val
;
7358 coding_type
= args
[coding_arg_coding_type
];
7359 CHECK_SYMBOL (coding_type
);
7360 CODING_ATTR_TYPE (attrs
) = coding_type
;
7362 charset_list
= args
[coding_arg_charset_list
];
7363 if (SYMBOLP (charset_list
))
7365 if (EQ (charset_list
, Qiso_2022
))
7367 if (! EQ (coding_type
, Qiso_2022
))
7368 error ("Invalid charset-list");
7369 charset_list
= Viso_2022_charset_list
;
7371 else if (EQ (charset_list
, Qemacs_mule
))
7373 if (! EQ (coding_type
, Qemacs_mule
))
7374 error ("Invalid charset-list");
7375 charset_list
= Vemacs_mule_charset_list
;
7377 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7378 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7379 max_charset_id
= XFASTINT (XCAR (tail
));
7383 charset_list
= Fcopy_sequence (charset_list
);
7384 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7386 struct charset
*charset
;
7389 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7390 if (EQ (coding_type
, Qiso_2022
)
7391 ? CHARSET_ISO_FINAL (charset
) < 0
7392 : EQ (coding_type
, Qemacs_mule
)
7393 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7395 error ("Can't handle charset `%s'",
7396 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7398 XCAR (tail
) = make_number (charset
->id
);
7399 if (max_charset_id
< charset
->id
)
7400 max_charset_id
= charset
->id
;
7403 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7405 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7407 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7408 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7409 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7411 val
= args
[coding_arg_decode_translation_table
];
7413 CHECK_CHAR_TABLE (val
);
7414 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7416 val
= args
[coding_arg_encode_translation_table
];
7418 CHECK_CHAR_TABLE (val
);
7419 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7421 val
= args
[coding_arg_post_read_conversion
];
7423 CODING_ATTR_POST_READ (attrs
) = val
;
7425 val
= args
[coding_arg_pre_write_conversion
];
7427 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7429 val
= args
[coding_arg_default_char
];
7431 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7434 CHECK_CHARACTER (val
);
7435 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7438 val
= args
[coding_arg_plist
];
7440 CODING_ATTR_PLIST (attrs
) = val
;
7442 if (EQ (coding_type
, Qcharset
))
7444 /* Generate a lisp vector of 256 elements. Each element is nil,
7445 integer, or a list of charset IDs.
7447 If Nth element is nil, the byte code N is invalid in this
7450 If Nth element is a number NUM, N is the first byte of a
7451 charset whose ID is NUM.
7453 If Nth element is a list of charset IDs, N is the first byte
7454 of one of them. The list is sorted by dimensions of the
7455 charsets. A charset of smaller dimension comes first.
7457 val
= Fmake_vector (make_number (256), Qnil
);
7459 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7461 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7462 int dim
= CHARSET_DIMENSION (charset
);
7463 int idx
= (dim
- 1) * 4;
7465 for (i
= charset
->code_space
[idx
];
7466 i
<= charset
->code_space
[idx
+ 1]; i
++)
7468 Lisp_Object tmp
, tmp2
;
7471 tmp
= AREF (val
, i
);
7474 else if (NUMBERP (tmp
))
7476 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
7478 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
7480 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
7484 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
7486 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
7491 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
7494 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
7495 XSETCAR (tmp2
, XCAR (tail
));
7501 ASET (attrs
, coding_attr_charset_valids
, val
);
7502 category
= coding_category_charset
;
7504 else if (EQ (coding_type
, Qccl
))
7508 if (nargs
< coding_arg_ccl_max
)
7511 val
= args
[coding_arg_ccl_decoder
];
7512 CHECK_CCL_PROGRAM (val
);
7514 val
= Fcopy_sequence (val
);
7515 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7517 val
= args
[coding_arg_ccl_encoder
];
7518 CHECK_CCL_PROGRAM (val
);
7520 val
= Fcopy_sequence (val
);
7521 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7523 val
= args
[coding_arg_ccl_valids
];
7524 valids
= Fmake_string (make_number (256), make_number (0));
7525 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7529 ASET (valids
, XINT (val
), 1);
7535 CHECK_NUMBER (XCAR (val
));
7536 CHECK_NUMBER (XCDR (val
));
7537 from
= XINT (XCAR (val
));
7538 to
= XINT (XCDR (val
));
7539 for (i
= from
; i
<= to
; i
++)
7540 ASET (valids
, i
, 1);
7543 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7545 category
= coding_category_ccl
;
7547 else if (EQ (coding_type
, Qutf_16
))
7549 Lisp_Object bom
, endian
;
7551 if (nargs
< coding_arg_utf16_max
)
7554 bom
= args
[coding_arg_utf16_bom
];
7555 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7558 CHECK_CODING_SYSTEM (XCAR (bom
));
7559 CHECK_CODING_SYSTEM (XCDR (bom
));
7561 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7563 endian
= args
[coding_arg_utf16_endian
];
7564 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7566 category
= (CONSP (bom
)
7567 ? coding_category_utf_16_auto
7570 ? coding_category_utf_16_be_nosig
7571 : coding_category_utf_16_le_nosig
)
7573 ? coding_category_utf_16_be
7574 : coding_category_utf_16_le
));
7576 else if (EQ (coding_type
, Qiso_2022
))
7578 Lisp_Object initial
, reg_usage
, request
, flags
;
7581 if (nargs
< coding_arg_iso2022_max
)
7584 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7585 CHECK_VECTOR (initial
);
7586 for (i
= 0; i
< 4; i
++)
7588 val
= Faref (initial
, make_number (i
));
7591 CHECK_CHARSET_GET_ID (val
, id
);
7592 ASET (initial
, i
, make_number (id
));
7595 ASET (initial
, i
, make_number (-1));
7598 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7599 CHECK_CONS (reg_usage
);
7600 CHECK_NATNUM (XCAR (reg_usage
));
7601 CHECK_NATNUM (XCDR (reg_usage
));
7603 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7604 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7610 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7611 CHECK_NATNUM (XCDR (val
));
7612 if (XINT (XCDR (val
)) >= 4)
7613 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7614 XCAR (val
) = make_number (id
);
7617 flags
= args
[coding_arg_iso2022_flags
];
7618 CHECK_NATNUM (flags
);
7620 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7621 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7623 ASET (attrs
, coding_attr_iso_initial
, initial
);
7624 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7625 ASET (attrs
, coding_attr_iso_request
, request
);
7626 ASET (attrs
, coding_attr_iso_flags
, flags
);
7627 setup_iso_safe_charsets (attrs
);
7629 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7630 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7631 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7632 ? coding_category_iso_7_else
7633 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7634 ? coding_category_iso_7
7635 : coding_category_iso_7_tight
);
7638 int id
= XINT (AREF (initial
, 1));
7640 category
= (((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7641 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7642 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7644 ? coding_category_iso_8_else
7645 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7646 ? coding_category_iso_8_1
7647 : coding_category_iso_8_2
);
7650 else if (EQ (coding_type
, Qemacs_mule
))
7652 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7653 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7655 category
= coding_category_emacs_mule
;
7657 else if (EQ (coding_type
, Qshift_jis
))
7660 struct charset
*charset
;
7662 if (XINT (Flength (charset_list
)) != 3)
7663 error ("There should be just three charsets");
7665 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7666 if (CHARSET_DIMENSION (charset
) != 1)
7667 error ("Dimension of charset %s is not one",
7668 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7670 charset_list
= XCDR (charset_list
);
7671 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7672 if (CHARSET_DIMENSION (charset
) != 1)
7673 error ("Dimension of charset %s is not one",
7674 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7676 charset_list
= XCDR (charset_list
);
7677 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7678 if (CHARSET_DIMENSION (charset
) != 2)
7679 error ("Dimension of charset %s is not two",
7680 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7682 category
= coding_category_sjis
;
7683 Vsjis_coding_system
= name
;
7685 else if (EQ (coding_type
, Qbig5
))
7687 struct charset
*charset
;
7689 if (XINT (Flength (charset_list
)) != 2)
7690 error ("There should be just two charsets");
7692 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7693 if (CHARSET_DIMENSION (charset
) != 1)
7694 error ("Dimension of charset %s is not one",
7695 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7697 charset_list
= XCDR (charset_list
);
7698 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7699 if (CHARSET_DIMENSION (charset
) != 2)
7700 error ("Dimension of charset %s is not two",
7701 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7703 category
= coding_category_big5
;
7704 Vbig5_coding_system
= name
;
7706 else if (EQ (coding_type
, Qraw_text
))
7707 category
= coding_category_raw_text
;
7708 else if (EQ (coding_type
, Qutf_8
))
7709 category
= coding_category_utf_8
;
7710 else if (EQ (coding_type
, Qundecided
))
7711 category
= coding_category_undecided
;
7713 error ("Invalid coding system type: %s",
7714 XSYMBOL (coding_type
)->name
->data
);
7716 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7718 eol_type
= args
[coding_arg_eol_type
];
7719 if (! NILP (eol_type
)
7720 && ! EQ (eol_type
, Qunix
)
7721 && ! EQ (eol_type
, Qdos
)
7722 && ! EQ (eol_type
, Qmac
))
7723 error ("Invalid eol-type");
7725 aliases
= Fcons (name
, Qnil
);
7727 if (NILP (eol_type
))
7729 eol_type
= make_subsidiaries (name
);
7730 for (i
= 0; i
< 3; i
++)
7732 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7734 this_name
= AREF (eol_type
, i
);
7735 this_aliases
= Fcons (this_name
, Qnil
);
7736 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7737 this_spec
= Fmake_vector (make_number (3), attrs
);
7738 ASET (this_spec
, 1, this_aliases
);
7739 ASET (this_spec
, 2, this_eol_type
);
7740 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7741 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7742 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7743 Vcoding_system_alist
);
7747 spec_vec
= Fmake_vector (make_number (3), attrs
);
7748 ASET (spec_vec
, 1, aliases
);
7749 ASET (spec_vec
, 2, eol_type
);
7751 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7752 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7753 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7754 Vcoding_system_alist
);
7757 int id
= coding_categories
[category
].id
;
7759 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7760 setup_coding_system (name
, &coding_categories
[category
]);
7766 return Fsignal (Qwrong_number_of_arguments
,
7767 Fcons (intern ("define-coding-system-internal"),
7768 make_number (nargs
)));
7771 /* Fixme: should this record the alias relationships for
/* Lisp primitive `define-coding-system-alias' (takes exactly two
   arguments).  ALIAS is validated with CHECK_SYMBOL and the spec for
   CODING-SYSTEM is fetched with CHECK_CODING_SYSTEM_GET_SPEC.  ALIAS is
   appended to the alias list in slot 1 of that spec, the spec is stored
   under ALIAS in Vcoding_system_hash_table, and ALIAS's name is pushed
   onto Vcoding_system_alist.  */
7773 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7774 Sdefine_coding_system_alias
, 2, 2, 0,
7775 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7776 (alias
, coding_system
)
7777 Lisp_Object alias
, coding_system
;
7779 Lisp_Object spec
, aliases
, eol_type
;
7781 CHECK_SYMBOL (alias
);
7782 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Walk to the last cons of the alias list (slot 1 of SPEC) and append
   ALIAS to it destructively.  */
7783 aliases
= AREF (spec
, 1);
7784 while (!NILP (XCDR (aliases
)))
7785 aliases
= XCDR (aliases
);
7786 XCDR (aliases
) = Fcons (alias
, Qnil
);
/* Slot 2 of SPEC is the eol-type.  When it is a vector, its three
   elements are coding systems; each one is given an alias taken from
   the corresponding element of make_subsidiaries (ALIAS), by calling
   this function recursively.  */
7788 eol_type
= AREF (spec
, 2);
7789 if (VECTORP (eol_type
))
7791 Lisp_Object subsidiaries
;
7794 subsidiaries
= make_subsidiaries (alias
);
7795 for (i
= 0; i
< 3; i
++)
7796 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7797 AREF (eol_type
, i
));
/* NOTE(review): SPEC is the object fetched for CODING-SYSTEM and is
   stored unchanged under ALIAS below.  If it is the registered vector
   itself and not a copy, this ASET also replaces the eol-type vector
   seen through CODING-SYSTEM with ALIAS's subsidiaries -- confirm
   whether that is intended.  */
7799 ASET (spec
, 2, subsidiaries
);
/* Register ALIAS under the same spec and make its name available for
   completion via Vcoding_system_alist.  */
7802 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7803 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
7804 Vcoding_system_alist
);
/* Lisp primitive `coding-system-base'.  Returns the base-name
   attribute (CODING_ATTR_BASE_NAME) taken from the attribute vector in
   slot 0 of CODING-SYSTEM's spec.  A nil argument yields
   Qno_conversion directly, without any lookup.  */
7809 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7811 doc
: /* Return the base of CODING-SYSTEM.
7812 Any alias or subsidiary coding system is not a base coding system. */)
7814 Lisp_Object coding_system
;
7816 Lisp_Object spec
, attrs
;
/* nil is answered with `no-conversion' before the argument is
   validated.  */
7818 if (NILP (coding_system
))
7819 return (Qno_conversion
);
7820 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Slot 0 of the spec holds the attribute vector.  */
7821 attrs
= AREF (spec
, 0);
7822 return CODING_ATTR_BASE_NAME (attrs
);
/* Lisp primitive `coding-system-plist'.  Returns the plist attribute
   (CODING_ATTR_PLIST) taken from the attribute vector in slot 0 of
   CODING-SYSTEM's spec.  A nil argument is treated as Qno_conversion.

   NOTE(review): the doc here is written as a string literal, whereas
   the neighbouring DEFUNs write it as a comment after `doc:' --
   confirm that the documentation extractor accepts this form.  */
7825 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7827 doc
: "Return the property list of CODING-SYSTEM.")
7829 Lisp_Object coding_system
;
7831 Lisp_Object spec
, attrs
;
/* Substitute `no-conversion' for nil, then validate and look up as
   usual.  */
7833 if (NILP (coding_system
))
7834 coding_system
= Qno_conversion
;
7835 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
/* Slot 0 of the spec holds the attribute vector.  */
7836 attrs
= AREF (spec
, 0);
7837 return CODING_ATTR_PLIST (attrs
);
/* Lisp primitive `coding-system-aliases'.  Returns slot 1 of
   CODING-SYSTEM's spec, which holds the alias list.  The list object
   itself is returned, not a copy.  A nil argument is treated as
   Qno_conversion.  */
7841 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7843 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
7845 Lisp_Object coding_system
;
/* Substitute `no-conversion' for nil before the lookup.  */
7849 if (NILP (coding_system
))
7850 coding_system
= Qno_conversion
;
7851 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7852 return AREF (spec
, 1);
/* Lisp primitive `coding-system-eol-type' (exactly one argument).
   Reads the eol-type from slot 2 of CODING-SYSTEM's spec.  A vector
   eol-type is returned as a copy made with Fcopy_sequence; otherwise
   the eol-type symbol is converted to an integer: Qunix -> 0,
   Qdos -> 1, anything else -> 2.  A nil argument is treated as
   Qno_conversion.  */
7855 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7856 Scoding_system_eol_type
, 1, 1, 0,
7857 doc
: /* Return eol-type of CODING-SYSTEM.
7858 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7860 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7861 and CR respectively.
7863 A vector value indicates that a format of end-of-line should be
7864 detected automatically. Nth element of the vector is the subsidiary
7865 coding system whose eol-type is N. */)
7867 Lisp_Object coding_system
;
7869 Lisp_Object spec
, eol_type
;
/* Substitute `no-conversion' for nil.  */
7872 if (NILP (coding_system
))
7873 coding_system
= Qno_conversion
;
/* Unlike the sibling accessors, this one tests with CODING_SYSTEM_P
   instead of using CHECK_CODING_SYSTEM_GET_SPEC.  */
7874 if (! CODING_SYSTEM_P (coding_system
))
7876 spec
= CODING_SYSTEM_SPEC (coding_system
);
/* Slot 2 of the spec holds the eol-type.  */
7877 eol_type
= AREF (spec
, 2);
7878 if (VECTORP (eol_type
))
7879 return Fcopy_sequence (eol_type
);
/* Map the eol-type symbol to its integer code; any symbol other than
   Qunix or Qdos maps to 2.  */
7880 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7881 return make_number (n
);
7887 /*** 9. Post-amble ***/
/* One-time initialisation of the static lookup tables used by the
   coding-system code.  */
/* Reset every coding category's id to -1 and make the priority table
   the identity ordering.  */
7894 for (i
= 0; i
< coding_category_max
; i
++)
7896 coding_categories
[i
].id
= -1;
7897 coding_priorities
[i
] = i
;
7900 /* ISO2022 specific initialize routine. */
/* Classify byte values by range first: 0x00-0x1F control_0, 0x21-0x7E
   graphic plane 0, 0x80-0x9F control_1, 0xA1-0xFE graphic plane 1.
   The range boundaries 0x20/0x7F and 0xA0/0xFF are left out of the
   loops and given their own classes just below.  */
7901 for (i
= 0; i
< 0x20; i
++)
7902 iso_code_class
[i
] = ISO_control_0
;
7903 for (i
= 0x21; i
< 0x7F; i
++)
7904 iso_code_class
[i
] = ISO_graphic_plane_0
;
7905 for (i
= 0x80; i
< 0xA0; i
++)
7906 iso_code_class
[i
] = ISO_control_1
;
7907 for (i
= 0xA1; i
< 0xFF; i
++)
7908 iso_code_class
[i
] = ISO_graphic_plane_1
;
7909 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7910 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
/* These individual assignments run after the range loops, so they
   replace the class the loops gave to each of these codes.  */
7911 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7912 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7913 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7914 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7915 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7916 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7917 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7918 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
7920 inhibit_pre_post_conversion
= 0;
/* Default every entry of emacs_mule_bytes to 1, then set the four
   private leading codes to 3 or 4.  Presumably each entry is the byte
   length of a sequence introduced by that leading code -- TODO
   confirm against the users of this table.  */
7922 for (i
= 0; i
< 256; i
++)
7924 emacs_mule_bytes
[i
] = 1;
7926 emacs_mule_bytes
[LEADING_CODE_PRIVATE_11
] = 3;
7927 emacs_mule_bytes
[LEADING_CODE_PRIVATE_12
] = 3;
7928 emacs_mule_bytes
[LEADING_CODE_PRIVATE_21
] = 4;
7929 emacs_mule_bytes
[LEADING_CODE_PRIVATE_22
] = 4;
7937 staticpro (&Vcoding_system_hash_table
);
7938 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7940 staticpro (&Vsjis_coding_system
);
7941 Vsjis_coding_system
= Qnil
;
7943 staticpro (&Vbig5_coding_system
);
7944 Vbig5_coding_system
= Qnil
;
7946 staticpro (&Vcode_conversion_work_buf_list
);
7947 Vcode_conversion_work_buf_list
= Qnil
;
7949 staticpro (&Vcode_conversion_reused_work_buf
);
7950 Vcode_conversion_reused_work_buf
= Qnil
;
7952 DEFSYM (Qcharset
, "charset");
7953 DEFSYM (Qtarget_idx
, "target-idx");
7954 DEFSYM (Qcoding_system_history
, "coding-system-history");
7955 Fset (Qcoding_system_history
, Qnil
);
7957 /* Target FILENAME is the first argument. */
7958 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
7959 /* Target FILENAME is the third argument. */
7960 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
7962 DEFSYM (Qcall_process
, "call-process");
7963 /* Target PROGRAM is the first argument. */
7964 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
7966 DEFSYM (Qcall_process_region
, "call-process-region");
7967 /* Target PROGRAM is the third argument. */
7968 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
7970 DEFSYM (Qstart_process
, "start-process");
7971 /* Target PROGRAM is the third argument. */
7972 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
7974 DEFSYM (Qopen_network_stream
, "open-network-stream");
7975 /* Target SERVICE is the fourth argument. */
7976 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
7978 DEFSYM (Qcoding_system
, "coding-system");
7979 DEFSYM (Qcoding_aliases
, "coding-aliases");
7981 DEFSYM (Qeol_type
, "eol-type");
7982 DEFSYM (Qunix
, "unix");
7983 DEFSYM (Qdos
, "dos");
7985 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
7986 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
7987 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
7988 DEFSYM (Qdefault_char
, "default-char");
7989 DEFSYM (Qundecided
, "undecided");
7990 DEFSYM (Qno_conversion
, "no-conversion");
7991 DEFSYM (Qraw_text
, "raw-text");
7993 DEFSYM (Qiso_2022
, "iso-2022");
7995 DEFSYM (Qutf_8
, "utf-8");
7997 DEFSYM (Qutf_16
, "utf-16");
7998 DEFSYM (Qutf_16_be
, "utf-16-be");
7999 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
/* Name the symbol "utf-16-le", in line with the sibling symbols
   "utf-16-be" and "utf-16-le-nosig"; the previous spelling "utf-16-l3"
   was a typo, so Qutf_16_le never matched the Lisp symbol `utf-16-le'.  */
8000 DEFSYM (Qutf_16_le
, "utf-16-le");
8001 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
8002 DEFSYM (Qsignature
, "signature");
8003 DEFSYM (Qendian
, "endian");
8004 DEFSYM (Qbig
, "big");
8005 DEFSYM (Qlittle
, "little");
8007 DEFSYM (Qshift_jis
, "shift-jis");
8008 DEFSYM (Qbig5
, "big5");
8010 DEFSYM (Qcoding_system_p
, "coding-system-p");
8012 DEFSYM (Qcoding_system_error
, "coding-system-error");
8013 Fput (Qcoding_system_error
, Qerror_conditions
,
8014 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8015 Fput (Qcoding_system_error
, Qerror_message
,
8016 build_string ("Invalid coding system"));
8018 /* Intern this now in case it isn't already done.
8019 Setting this variable twice is harmless.
8020 But don't staticpro it here--that is done in alloc.c. */
8021 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8023 DEFSYM (Qtranslation_table
, "translation-table");
8024 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8025 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8026 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8027 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8029 DEFSYM (Qvalid_codes
, "valid-codes");
8031 DEFSYM (Qemacs_mule
, "emacs-mule");
8033 Vcoding_category_table
8034 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8035 staticpro (&Vcoding_category_table
);
8036 /* The following are targets of code detection. */
8037 ASET (Vcoding_category_table
, coding_category_iso_7
,
8038 intern ("coding-category-iso-7"));
8039 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8040 intern ("coding-category-iso-7-tight"));
8041 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8042 intern ("coding-category-iso-8-1"));
8043 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8044 intern ("coding-category-iso-8-2"));
8045 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8046 intern ("coding-category-iso-7-else"));
8047 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8048 intern ("coding-category-iso-8-else"));
8049 ASET (Vcoding_category_table
, coding_category_utf_8
,
8050 intern ("coding-category-utf-8"));
8051 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8052 intern ("coding-category-utf-16-be"));
8053 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8054 intern ("coding-category-utf-16-le"));
8055 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8056 intern ("coding-category-utf-16-be-nosig"));
8057 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8058 intern ("coding-category-utf-16-le-nosig"));
8059 ASET (Vcoding_category_table
, coding_category_charset
,
8060 intern ("coding-category-charset"));
8061 ASET (Vcoding_category_table
, coding_category_sjis
,
8062 intern ("coding-category-sjis"));
8063 ASET (Vcoding_category_table
, coding_category_big5
,
8064 intern ("coding-category-big5"));
8065 ASET (Vcoding_category_table
, coding_category_ccl
,
8066 intern ("coding-category-ccl"));
8067 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8068 intern ("coding-category-emacs-mule"));
8069 /* The following are NOT targets of code detection. */
8070 ASET (Vcoding_category_table
, coding_category_raw_text
,
8071 intern ("coding-category-raw-text"));
8072 ASET (Vcoding_category_table
, coding_category_undecided
,
8073 intern ("coding-category-undecided"));
8075 defsubr (&Scoding_system_p
);
8076 defsubr (&Sread_coding_system
);
8077 defsubr (&Sread_non_nil_coding_system
);
8078 defsubr (&Scheck_coding_system
);
8079 defsubr (&Sdetect_coding_region
);
8080 defsubr (&Sdetect_coding_string
);
8081 defsubr (&Sfind_coding_systems_region_internal
);
8082 defsubr (&Scheck_coding_systems_region
);
8083 defsubr (&Sdecode_coding_region
);
8084 defsubr (&Sencode_coding_region
);
8085 defsubr (&Sdecode_coding_string
);
8086 defsubr (&Sencode_coding_string
);
8087 defsubr (&Sdecode_sjis_char
);
8088 defsubr (&Sencode_sjis_char
);
8089 defsubr (&Sdecode_big5_char
);
8090 defsubr (&Sencode_big5_char
);
8091 defsubr (&Sset_terminal_coding_system_internal
);
8092 defsubr (&Sset_safe_terminal_coding_system_internal
);
8093 defsubr (&Sterminal_coding_system
);
8094 defsubr (&Sset_keyboard_coding_system_internal
);
8095 defsubr (&Skeyboard_coding_system
);
8096 defsubr (&Sfind_operation_coding_system
);
8097 defsubr (&Sset_coding_system_priority
);
8098 defsubr (&Sdefine_coding_system_internal
);
8099 defsubr (&Sdefine_coding_system_alias
);
8100 defsubr (&Scoding_system_base
);
8101 defsubr (&Scoding_system_plist
);
8102 defsubr (&Scoding_system_aliases
);
8103 defsubr (&Scoding_system_eol_type
);
8104 defsubr (&Scoding_system_priority_list
);
8106 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8107 doc
: /* List of coding systems.
8109 Do not alter the value of this variable manually. This variable should be
8110 updated by the functions `define-coding-system' and
8111 `define-coding-system-alias'. */);
8112 Vcoding_system_list
= Qnil
;
8114 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8115 doc
: /* Alist of coding system names.
8116 Each element is one element list of coding system name.
8117 This variable is given to `completing-read' as TABLE argument.
8119 Do not alter the value of this variable manually. This variable should be
8120 updated by the functions `define-coding-system' and
8121 `define-coding-system-alias'. */);
8122 Vcoding_system_alist
= Qnil
;
8124 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8125 doc
: /* List of coding-categories (symbols) ordered by priority.
8127 On detecting a coding system, Emacs tries code detection algorithms
8128 associated with each coding-category one by one in this order. When
8129 one algorithm agrees with a byte sequence of source text, the coding
8130 system bound to the corresponding coding-category is selected. */);
8134 Vcoding_category_list
= Qnil
;
8135 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8136 Vcoding_category_list
8137 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8138 Vcoding_category_list
);
8141 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8142 doc
: /* Specify the coding system for read operations.
8143 It is useful to bind this variable with `let', but do not set it globally.
8144 If the value is a coding system, it is used for decoding on read operation.
8145 If not, an appropriate element is used from one of the coding system alists:
8146 There are three such tables, `file-coding-system-alist',
8147 `process-coding-system-alist', and `network-coding-system-alist'. */);
8148 Vcoding_system_for_read
= Qnil
;
8150 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8151 doc
: /* Specify the coding system for write operations.
8152 Programs bind this variable with `let', but you should not set it globally.
8153 If the value is a coding system, it is used for encoding of output,
8154 when writing it to a file and when sending it to a file or subprocess.
8156 If this does not specify a coding system, an appropriate element
8157 is used from one of the coding system alists:
8158 There are three such tables, `file-coding-system-alist',
8159 `process-coding-system-alist', and `network-coding-system-alist'.
8160 For output to files, if the above procedure does not specify a coding system,
8161 the value of `buffer-file-coding-system' is used. */);
8162 Vcoding_system_for_write
= Qnil
;
8164 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8166 Coding system used in the latest file or process I/O. */);
8167 Vlast_coding_system_used
= Qnil
;
8169 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8171 *Non-nil means always inhibit code conversion of end-of-line format.
8172 See info node `Coding Systems' and info node `Text and Binary' concerning
8173 such conversion. */);
8174 inhibit_eol_conversion
= 0;
8176 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8178 Non-nil means process buffer inherits coding system of process output.
8179 Bind it to t if the process output is to be treated as if it were a file
8180 read from some filesystem. */);
8181 inherit_process_coding_system
= 0;
8183 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8185 Alist to decide a coding system to use for a file I/O operation.
8186 The format is ((PATTERN . VAL) ...),
8187 where PATTERN is a regular expression matching a file name,
8188 VAL is a coding system, a cons of coding systems, or a function symbol.
8189 If VAL is a coding system, it is used for both decoding and encoding
8191 If VAL is a cons of coding systems, the car part is used for decoding,
8192 and the cdr part is used for encoding.
8193 If VAL is a function symbol, the function must return a coding system
8194 or a cons of coding systems which are used as above. The function gets
8195 the arguments with which `find-operation-coding-system' was called.
8197 See also the function `find-operation-coding-system'
8198 and the variable `auto-coding-alist'. */);
8199 Vfile_coding_system_alist
= Qnil
;
8201 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8203 Alist to decide a coding system to use for a process I/O operation.
8204 The format is ((PATTERN . VAL) ...),
8205 where PATTERN is a regular expression matching a program name,
8206 VAL is a coding system, a cons of coding systems, or a function symbol.
8207 If VAL is a coding system, it is used for both decoding what is received
8208 from the program and encoding what is sent to the program.
8209 If VAL is a cons of coding systems, the car part is used for decoding,
8210 and the cdr part is used for encoding.
8211 If VAL is a function symbol, the function must return a coding system
8212 or a cons of coding systems which are used as above.
8214 See also the function `find-operation-coding-system'. */);
8215 Vprocess_coding_system_alist
= Qnil
;
8217 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8219 Alist to decide a coding system to use for a network I/O operation.
8220 The format is ((PATTERN . VAL) ...),
8221 where PATTERN is a regular expression matching a network service name
8222 or is a port number to connect to,
8223 VAL is a coding system, a cons of coding systems, or a function symbol.
8224 If VAL is a coding system, it is used for both decoding what is received
8225 from the network stream and encoding what is sent to the network stream.
8226 If VAL is a cons of coding systems, the car part is used for decoding,
8227 and the cdr part is used for encoding.
8228 If VAL is a function symbol, the function must return a coding system
8229 or a cons of coding systems which are used as above.
8231 See also the function `find-operation-coding-system'. */);
8232 Vnetwork_coding_system_alist
= Qnil
;
8234 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8235 doc
: /* Coding system to use with system messages.
8236 Also used for decoding keyboard input on X Window system. */);
8237 Vlocale_coding_system
= Qnil
;
8239 /* The eol mnemonics are reset in startup.el system-dependently. */
8240 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8242 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8243 eol_mnemonic_unix
= build_string (":");
8245 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8247 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8248 eol_mnemonic_dos
= build_string ("\\");
8250 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8252 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8253 eol_mnemonic_mac
= build_string ("/");
8255 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8257 *String displayed in mode line when end-of-line format is not yet determined. */);
8258 eol_mnemonic_undecided
= build_string (":");
8260 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8262 *Non-nil enables character translation while encoding and decoding. */);
8263 Venable_character_translation
= Qt
;
8265 DEFVAR_LISP ("standard-translation-table-for-decode",
8266 &Vstandard_translation_table_for_decode
,
8267 doc
: /* Table for translating characters while decoding. */);
8268 Vstandard_translation_table_for_decode
= Qnil
;
8270 DEFVAR_LISP ("standard-translation-table-for-encode",
8271 &Vstandard_translation_table_for_encode
,
8272 doc
: /* Table for translating characters while encoding. */);
8273 Vstandard_translation_table_for_encode
= Qnil
;
8275 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8276 doc
: /* Alist of charsets vs revision numbers.
8277 While encoding, if a charset (car part of an element) is found,
8278 designate it with the escape sequence identifying revision (cdr part
8279 of the element). */);
8280 Vcharset_revision_table
= Qnil
;
8282 DEFVAR_LISP ("default-process-coding-system",
8283 &Vdefault_process_coding_system
,
8284 doc
: /* Cons of coding systems used for process I/O by default.
8285 The car part is used for decoding a process output,
8286 the cdr part is used for encoding a text to be sent to a process. */);
8287 Vdefault_process_coding_system
= Qnil
;
8289 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8291 Table of extra Latin codes in the range 128..159 (inclusive).
8292 This is a vector of length 256.
8293 If Nth element is non-nil, the existence of code N in a file
8294 \(or output of subprocess) doesn't prevent it from being detected as
8295 a coding system of ISO 2022 variant which has a flag
8296 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8297 or reading output of a subprocess.
8298 Only the 128th through 159th elements have a meaning. */);
8299 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8301 DEFVAR_LISP ("select-safe-coding-system-function",
8302 &Vselect_safe_coding_system_function
,
8304 Function to call to select safe coding system for encoding a text.
8306 If set, this function is called to force a user to select a proper
8307 coding system which can encode the text in the case that a default
8308 coding system used in each operation can't encode the text.
8310 The default value is `select-safe-coding-system' (which see). */);
8311 Vselect_safe_coding_system_function
= Qnil
;
8313 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8314 &inhibit_iso_escape_detection
,
8316 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8318 By default, on reading a file, Emacs tries to detect how the text is
8319 encoded. This code detection is sensitive to escape sequences. If
8320 the sequence is valid as ISO2022, the code is determined as one of
8321 the ISO2022 encodings, and the file is decoded by the corresponding
8322 coding system (e.g. `iso-2022-7bit').
8324 However, there may be a case that you want to read escape sequences in
8325 a file as is. In such a case, you can set this variable to non-nil.
8326 Then, as the code detection ignores any escape sequences, no file is
8327 detected as encoded in some ISO2022 encoding. The result is that all
8328 escape sequences become visible in a buffer.
8330 The default value is nil, and it is strongly recommended not to change
8331 it. That is because many Emacs Lisp source files that contain
8332 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8333 in Emacs's distribution, and they won't be decoded correctly on
8334 reading if you suppress escape sequence detection.
8336 The other way to read escape sequences in a file without decoding is
8337 to explicitly specify some coding system that doesn't use ISO2022's
8338 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
8339 inhibit_iso_escape_detection
= 0;
8342 Lisp_Object args
[coding_arg_max
];
8343 Lisp_Object plist
[14];
8346 for (i
= 0; i
< coding_arg_max
; i
++)
8349 plist
[0] = intern (":name");
8350 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8351 plist
[2] = intern (":mnemonic");
8352 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8353 plist
[4] = intern (":coding-type");
8354 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8355 plist
[6] = intern (":ascii-compatible-p");
8356 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8357 plist
[8] = intern (":default-char");
8358 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8359 plist
[10] = intern (":docstring");
8360 plist
[11] = build_string ("Do no conversion.\n\
8362 When you visit a file with this coding, the file is read into a\n\
8363 unibyte buffer as is, thus each byte of a file is treated as a\n\
8365 plist
[12] = intern (":eol-type");
8366 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8367 args
[coding_arg_plist
] = Flist (14, plist
);
8368 Fdefine_coding_system_internal (coding_arg_max
, args
);
8371 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8372 setup_coding_system (Qno_conversion
, &terminal_coding
);
8373 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8377 emacs_strerror (error_number
)
8382 synchronize_system_messages_locale ();
8383 str
= strerror (error_number
);
8385 if (! NILP (Vlocale_coding_system
))
8387 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8388 Vlocale_coding_system
,
8390 str
= (char *) XSTRING (dec
)->data
;