1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for a text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
155 Below is the template of these functions. */
159 detect_coding_XXX (coding
, mask
)
160 struct coding_system
*coding
;
163 unsigned char *src
= coding
->source
;
164 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
165 int multibytep
= coding
->src_multibyte
;
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
175 /* Check if it conforms to XXX. If not, break the loop. */
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask
&= ~CODING_CATEGORY_XXX
;
181 /* The source exausted. */
183 /* ASCII characters only. */
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask
&= CODING_CATEGORY_XXX
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 unsigned char *src
= coding
->source
+ coding
->consumed
;
211 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
;
218 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
224 if (charbuf
< charbuf_end
)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
262 Below is a template of these functions. */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
270 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
;
307 extern Lisp_Object Qmac
; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
314 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
315 Lisp_Object Qcoding_system_history
;
316 Lisp_Object Qvalid_codes
;
318 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
319 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
320 Lisp_Object Qstart_process
, Qopen_network_stream
;
321 Lisp_Object Qtarget_idx
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read
;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write
;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used
;
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table
;
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion
;
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection
;
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system
;
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding
;
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding
;
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding
;
373 Lisp_Object Vfile_coding_system_alist
;
374 Lisp_Object Vprocess_coding_system_alist
;
375 Lisp_Object Vnetwork_coding_system_alist
;
377 Lisp_Object Vlocale_coding_system
;
381 /* Flag to tell if we look up translation table on character code
383 Lisp_Object Venable_character_translation
;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode
;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode
;
389 Lisp_Object Qtranslation_table
;
390 Lisp_Object Qtranslation_table_id
;
391 Lisp_Object Qtranslation_table_for_decode
;
392 Lisp_Object Qtranslation_table_for_encode
;
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table
;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system
;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion
;
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system
;
408 Lisp_Object Vbig5_coding_system
;
411 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
412 static void decode_coding_utf_8
P_ ((struct coding_system
*));
413 static int encode_coding_utf_8
P_ ((struct coding_system
*));
415 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
420 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
421 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
423 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
424 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
425 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
428 static void decode_coding_sjis
P_ ((struct coding_system
*));
429 static int encode_coding_sjis
P_ ((struct coding_system
*));
431 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
432 static void decode_coding_big5
P_ ((struct coding_system
*));
433 static int encode_coding_big5
P_ ((struct coding_system
*));
435 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
436 static void decode_coding_ccl
P_ ((struct coding_system
*));
437 static int encode_coding_ccl
P_ ((struct coding_system
*));
439 static void decode_coding_raw_text
P_ ((struct coding_system
*));
440 static int encode_coding_raw_text
P_ ((struct coding_system
*));
443 /* ISO2022 section */
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
482 /* All code (1-byte) of ISO2022 is classified into one of the
484 enum iso_code_class_type
486 ISO_control_0
, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape
, /* ISO_CODE_SO (0x1B) */
494 ISO_control_1
, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
513 /* If set, reset graphic planes and registers at end-of-line to the
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
540 /* If set, assume designation states are reset at beginning of line on
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
544 /* If set, designation sequence should be placed at beginning of line
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
548 /* If set, do not encode unsafe charactes on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
551 /* If set, extra latin codes (128..159) are accepted as a valid code
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
559 #define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
561 /* A character to be produced on output if encoding of the original
562 character is prohibited by CODING_ISO_FLAG_SAFE. */
563 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
567 #define CODING_UTF_16_BOM(coding) \
568 ((coding)->spec.utf_16.bom)
570 #define CODING_UTF_16_ENDIAN(coding) \
571 ((coding)->spec.utf_16.endian)
573 #define CODING_UTF_16_SURROGATE(coding) \
574 ((coding)->spec.utf_16.surrogate)
578 #define CODING_CCL_DECODER(coding) \
579 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
580 #define CODING_CCL_ENCODER(coding) \
581 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
582 #define CODING_CCL_VALIDS(coding) \
583 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
586 /* Index for each coding category in `coding_category_table' */
590 coding_category_iso_7
,
591 coding_category_iso_7_tight
,
592 coding_category_iso_8_1
,
593 coding_category_iso_8_2
,
594 coding_category_iso_7_else
,
595 coding_category_iso_8_else
,
596 coding_category_utf_8
,
597 coding_category_utf_16_auto
,
598 coding_category_utf_16_be
,
599 coding_category_utf_16_le
,
600 coding_category_utf_16_be_nosig
,
601 coding_category_utf_16_le_nosig
,
602 coding_category_charset
,
603 coding_category_sjis
,
604 coding_category_big5
,
606 coding_category_emacs_mule
,
607 /* All above are targets of code detection. */
608 coding_category_raw_text
,
609 coding_category_undecided
,
613 /* Definitions of flag bits used in detect_coding_XXXX. */
614 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
615 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
616 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
617 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
618 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
619 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
620 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
621 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
622 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
623 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
624 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
625 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
626 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
627 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
628 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
629 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
631 /* This value is returned if detect_coding_mask () find nothing other
632 than ASCII characters. */
633 #define CATEGORY_MASK_ANY \
634 (CATEGORY_MASK_ISO_7 \
635 | CATEGORY_MASK_ISO_7_TIGHT \
636 | CATEGORY_MASK_ISO_8_1 \
637 | CATEGORY_MASK_ISO_8_2 \
638 | CATEGORY_MASK_ISO_7_ELSE \
639 | CATEGORY_MASK_ISO_8_ELSE \
640 | CATEGORY_MASK_UTF_8 \
641 | CATEGORY_MASK_UTF_16_BE \
642 | CATEGORY_MASK_UTF_16_LE \
643 | CATEGORY_MASK_UTF_16_BE_NOSIG \
644 | CATEGORY_MASK_UTF_16_LE_NOSIG \
645 | CATEGORY_MASK_CHARSET \
646 | CATEGORY_MASK_SJIS \
647 | CATEGORY_MASK_BIG5 \
648 | CATEGORY_MASK_CCL \
649 | CATEGORY_MASK_EMACS_MULE)
652 #define CATEGORY_MASK_ISO_7BIT \
653 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
655 #define CATEGORY_MASK_ISO_8BIT \
656 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
658 #define CATEGORY_MASK_ISO_ELSE \
659 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
661 #define CATEGORY_MASK_ISO_ESCAPE \
662 (CATEGORY_MASK_ISO_7 \
663 | CATEGORY_MASK_ISO_7_TIGHT \
664 | CATEGORY_MASK_ISO_7_ELSE \
665 | CATEGORY_MASK_ISO_8_ELSE)
667 #define CATEGORY_MASK_ISO \
668 ( CATEGORY_MASK_ISO_7BIT \
669 | CATEGORY_MASK_ISO_8BIT \
670 | CATEGORY_MASK_ISO_ELSE)
672 #define CATEGORY_MASK_UTF_16 \
673 (CATEGORY_MASK_UTF_16_BE \
674 | CATEGORY_MASK_UTF_16_LE \
675 | CATEGORY_MASK_UTF_16_BE_NOSIG \
676 | CATEGORY_MASK_UTF_16_LE_NOSIG)
679 /* List of symbols `coding-category-xxx' ordered by priority. This
680 variable is exposed to Emacs Lisp. */
681 static Lisp_Object Vcoding_category_list
;
683 /* Table of coding categories (Lisp symbols). This variable is for
685 static Lisp_Object Vcoding_category_table
;
687 /* Table of coding-categories ordered by priority. */
688 static enum coding_category coding_priorities
[coding_category_max
];
690 /* Nth element is a coding context for the coding system bound to the
691 Nth coding category. */
692 static struct coding_system coding_categories
[coding_category_max
];
694 static int detected_mask
[coding_category_raw_text
] =
702 CATEGORY_MASK_UTF_16
,
703 CATEGORY_MASK_UTF_16
,
704 CATEGORY_MASK_UTF_16
,
705 CATEGORY_MASK_UTF_16
,
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_CHARSET
,
711 CATEGORY_MASK_EMACS_MULE
714 /*** Commonly used macros and functions ***/
717 #define min(a, b) ((a) < (b) ? (a) : (b))
720 #define max(a, b) ((a) > (b) ? (a) : (b))
723 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
725 attrs = CODING_ID_ATTRS (coding->id); \
726 eol_type = CODING_ID_EOL_TYPE (coding->id); \
727 if (VECTORP (eol_type)) \
729 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
733 /* Safely get one byte from the source text pointed by SRC which ends
734 at SRC_END, and set C to that byte. If there are not enough bytes
735 in the source, it jumps to `no_more_source'. The caller
736 should declare and set these variables appropriately in advance:
737 src, src_end, multibytep
740 #define ONE_MORE_BYTE(c) \
742 if (src == src_end) \
744 if (src_base < src) \
745 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
746 goto no_more_source; \
749 if (multibytep && (c & 0x80)) \
751 if ((c & 0xFE) != 0xC0) \
752 error ("Undecodable char found"); \
753 c = ((c & 1) << 6) | *src++; \
759 #define ONE_MORE_BYTE_NO_CHECK(c) \
762 if (multibytep && (c & 0x80)) \
764 if ((c & 0xFE) != 0xC0) \
765 error ("Undecodable char found"); \
766 c = ((c & 1) << 6) | *src++; \
772 /* Store a byte C in the place pointed by DST and increment DST to the
773 next free point, and increment PRODUCED_CHARS. The caller should
774 assure that C is 0..127, and declare and set the variable `dst'
775 appropriately in advance.
779 #define EMIT_ONE_ASCII_BYTE(c) \
786 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
788 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
790 produced_chars += 2; \
791 *dst++ = (c1), *dst++ = (c2); \
795 /* Store a byte C in the place pointed by DST and increment DST to the
796 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
797 nonzero, store in an appropriate multibyte from. The caller should
798 declare and set the variables `dst' and `multibytep' appropriately
801 #define EMIT_ONE_BYTE(c) \
808 ch = BYTE8_TO_CHAR (ch); \
809 CHAR_STRING_ADVANCE (ch, dst); \
816 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
818 #define EMIT_TWO_BYTES(c1, c2) \
820 produced_chars += 2; \
827 ch = BYTE8_TO_CHAR (ch); \
828 CHAR_STRING_ADVANCE (ch, dst); \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
842 #define EMIT_THREE_BYTES(c1, c2, c3) \
844 EMIT_ONE_BYTE (c1); \
845 EMIT_TWO_BYTES (c2, c3); \
849 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
851 EMIT_TWO_BYTES (c1, c2); \
852 EMIT_TWO_BYTES (c3, c4); \
856 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
858 charset_map_loaded = 0; \
859 c = DECODE_CHAR (charset, code); \
860 if (charset_map_loaded) \
862 unsigned char *orig = coding->source; \
865 coding_set_source (coding); \
866 offset = coding->source - orig; \
868 src_base += offset; \
874 #define ASSURE_DESTINATION(bytes) \
876 if (dst + (bytes) >= dst_end) \
878 int more_bytes = charbuf_end - charbuf + (bytes); \
880 dst = alloc_destination (coding, more_bytes, dst); \
881 dst_end = coding->destination + coding->dst_bytes; \
888 coding_set_source (coding
)
889 struct coding_system
*coding
;
891 if (BUFFERP (coding
->src_object
))
893 if (coding
->src_pos
< 0)
894 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
897 struct buffer
*buf
= XBUFFER (coding
->src_object
);
898 EMACS_INT gpt_byte
= BUF_GPT_BYTE (buf
);
899 unsigned char *beg_addr
= BUF_BEG_ADDR (buf
);
901 coding
->source
= beg_addr
+ coding
->src_pos_byte
- 1;
902 if (coding
->src_pos_byte
>= gpt_byte
)
903 coding
->source
+= BUF_GAP_SIZE (buf
);
906 else if (STRINGP (coding
->src_object
))
908 coding
->source
= (XSTRING (coding
->src_object
)->data
909 + coding
->src_pos_byte
);
912 /* Otherwise, the source is C string and is never relocated
913 automatically. Thus we don't have to update anything. */
918 coding_set_destination (coding
)
919 struct coding_system
*coding
;
921 if (BUFFERP (coding
->dst_object
))
923 /* We are sure that coding->dst_pos_byte is before the gap of the
925 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
926 + coding
->dst_pos_byte
- 1);
927 if (coding
->src_pos
< 0)
928 coding
->dst_bytes
= (GAP_END_ADDR
929 - (coding
->src_bytes
- coding
->consumed
)
930 - coding
->destination
);
932 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
933 - coding
->destination
);
936 /* Otherwise, the destination is C string and is never relocated
937 automatically. Thus we don't have to update anything. */
943 coding_alloc_by_realloc (coding
, bytes
)
944 struct coding_system
*coding
;
947 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
948 coding
->dst_bytes
+ bytes
);
949 coding
->dst_bytes
+= bytes
;
953 coding_alloc_by_making_gap (coding
, bytes
)
954 struct coding_system
*coding
;
957 if (BUFFERP (coding
->dst_object
)
958 && EQ (coding
->src_object
, coding
->dst_object
))
960 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
962 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
964 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
968 Lisp_Object this_buffer
;
970 this_buffer
= Fcurrent_buffer ();
971 set_buffer_internal (XBUFFER (coding
->dst_object
));
973 set_buffer_internal (XBUFFER (this_buffer
));
978 static unsigned char *
979 alloc_destination (coding
, nbytes
, dst
)
980 struct coding_system
*coding
;
984 EMACS_INT offset
= dst
- coding
->destination
;
986 if (BUFFERP (coding
->dst_object
))
987 coding_alloc_by_making_gap (coding
, nbytes
);
989 coding_alloc_by_realloc (coding
, nbytes
);
990 coding
->result
= CODING_RESULT_SUCCESS
;
991 coding_set_destination (coding
);
992 dst
= coding
->destination
+ offset
;
997 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1005 Check if a text is encoded in UTF-8. If it is, return
1006 CATEGORY_MASK_UTF_8, else return 0. */
1008 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1009 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1010 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1011 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1012 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1013 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1016 detect_coding_utf_8 (coding
, mask
)
1017 struct coding_system
*coding
;
1020 unsigned char *src
= coding
->source
, *src_base
= src
;
1021 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1022 int multibytep
= coding
->src_multibyte
;
1023 int consumed_chars
= 0;
1026 /* A coding system of this category is always ASCII compatible. */
1027 src
+= coding
->head_ascii
;
1031 int c
, c1
, c2
, c3
, c4
;
1034 if (UTF_8_1_OCTET_P (c
))
1037 if (! UTF_8_EXTRA_OCTET_P (c1
))
1039 if (UTF_8_2_OCTET_LEADING_P (c
))
1045 if (! UTF_8_EXTRA_OCTET_P (c2
))
1047 if (UTF_8_3_OCTET_LEADING_P (c
))
1053 if (! UTF_8_EXTRA_OCTET_P (c3
))
1055 if (UTF_8_4_OCTET_LEADING_P (c
))
1061 if (! UTF_8_EXTRA_OCTET_P (c4
))
1063 if (UTF_8_5_OCTET_LEADING_P (c
))
1070 *mask
&= ~CATEGORY_MASK_UTF_8
;
1076 *mask
&= CATEGORY_MASK_UTF_8
;
1081 /* Fixme: deal with surrogates? */
1083 decode_coding_utf_8 (coding
)
1084 struct coding_system
*coding
;
1086 unsigned char *src
= coding
->source
+ coding
->consumed
;
1087 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1088 unsigned char *src_base
;
1089 int *charbuf
= coding
->charbuf
;
1090 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1091 int consumed_chars
= 0, consumed_chars_base
;
1092 int multibytep
= coding
->src_multibyte
;
1093 Lisp_Object attr
, eol_type
, charset_list
;
1095 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1099 int c
, c1
, c2
, c3
, c4
, c5
;
1102 consumed_chars_base
= consumed_chars
;
1104 if (charbuf
>= charbuf_end
)
1108 if (UTF_8_1_OCTET_P(c1
))
1113 if (EQ (eol_type
, Qdos
))
1116 goto no_more_source
;
1120 else if (EQ (eol_type
, Qmac
))
1127 if (! UTF_8_EXTRA_OCTET_P (c2
))
1129 if (UTF_8_2_OCTET_LEADING_P (c1
))
1131 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1132 /* Reject overlong sequences here and below. Encoders
1133 producing them are incorrect, they can be misleading,
1134 and they mess up read/write invariance. */
1141 if (! UTF_8_EXTRA_OCTET_P (c3
))
1143 if (UTF_8_3_OCTET_LEADING_P (c1
))
1145 c
= (((c1
& 0xF) << 12)
1146 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1153 if (! UTF_8_EXTRA_OCTET_P (c4
))
1155 if (UTF_8_4_OCTET_LEADING_P (c1
))
1157 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1158 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1165 if (! UTF_8_EXTRA_OCTET_P (c5
))
1167 if (UTF_8_5_OCTET_LEADING_P (c1
))
1169 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1170 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1172 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1187 consumed_chars
= consumed_chars_base
;
1189 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1194 coding
->consumed_char
+= consumed_chars_base
;
1195 coding
->consumed
= src_base
- coding
->source
;
1196 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1201 encode_coding_utf_8 (coding
)
1202 struct coding_system
*coding
;
1204 int multibytep
= coding
->dst_multibyte
;
1205 int *charbuf
= coding
->charbuf
;
1206 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1207 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1208 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1209 int produced_chars
= 0;
1214 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1216 while (charbuf
< charbuf_end
)
1218 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1220 ASSURE_DESTINATION (safe_room
);
1222 CHAR_STRING_ADVANCE (c
, pend
);
1223 for (p
= str
; p
< pend
; p
++)
1229 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1231 while (charbuf
< charbuf_end
)
1233 ASSURE_DESTINATION (safe_room
);
1235 dst
+= CHAR_STRING (c
, dst
);
1239 coding
->result
= CODING_RESULT_SUCCESS
;
1240 coding
->produced_char
+= produced_chars
;
1241 coding
->produced
= dst
- coding
->destination
;
1246 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1247 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1248 Little Endian (otherwise). If it is, return
1249 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1252 #define UTF_16_HIGH_SURROGATE_P(val) \
1253 (((val) & 0xFC00) == 0xD800)
1255 #define UTF_16_LOW_SURROGATE_P(val) \
1256 (((val) & 0xFC00) == 0xDC00)
1258 #define UTF_16_INVALID_P(val) \
1259 (((val) == 0xFFFE) \
1260 || ((val) == 0xFFFF) \
1261 || UTF_16_LOW_SURROGATE_P (val))
1265 detect_coding_utf_16 (coding
, mask
)
1266 struct coding_system
*coding
;
1269 unsigned char *src
= coding
->source
, *src_base
= src
;
1270 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1271 int multibytep
= coding
->src_multibyte
;
1272 int consumed_chars
= 0;
1278 if ((c1
== 0xFF) && (c2
== 0xFE))
1280 *mask
&= CATEGORY_MASK_UTF_16_LE
;
1283 else if ((c1
== 0xFE) && (c2
== 0xFF))
1285 *mask
&= CATEGORY_MASK_UTF_16_BE
;
1293 decode_coding_utf_16 (coding
)
1294 struct coding_system
*coding
;
1296 unsigned char *src
= coding
->source
+ coding
->consumed
;
1297 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1298 unsigned char *src_base
;
1299 int *charbuf
= coding
->charbuf
;
1300 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1301 int consumed_chars
= 0, consumed_chars_base
;
1302 int multibytep
= coding
->src_multibyte
;
1303 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1304 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1305 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1306 Lisp_Object attr
, eol_type
, charset_list
;
1308 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1310 if (bom
!= utf_16_without_bom
)
1318 if (bom
== utf_16_with_bom
)
1320 if (endian
== utf_16_big_endian
1321 ? c
!= 0xFFFE : c
!= 0xFEFF)
1323 /* We are sure that there's enouph room at CHARBUF. */
1332 CODING_UTF_16_ENDIAN (coding
)
1333 = endian
= utf_16_big_endian
;
1334 else if (c
== 0xFEFF)
1335 CODING_UTF_16_ENDIAN (coding
)
1336 = endian
= utf_16_little_endian
;
1339 CODING_UTF_16_ENDIAN (coding
)
1340 = endian
= utf_16_big_endian
;
1344 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1352 consumed_chars_base
= consumed_chars
;
1354 if (charbuf
+ 2 >= charbuf_end
)
1359 c
= (endian
== utf_16_big_endian
1360 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1363 if (! UTF_16_LOW_SURROGATE_P (c
))
1365 if (endian
== utf_16_big_endian
)
1366 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1368 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1372 if (UTF_16_HIGH_SURROGATE_P (c
))
1373 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1379 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1380 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1386 if (UTF_16_HIGH_SURROGATE_P (c
))
1387 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1394 coding
->consumed_char
+= consumed_chars_base
;
1395 coding
->consumed
= src_base
- coding
->source
;
1396 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1400 encode_coding_utf_16 (coding
)
1401 struct coding_system
*coding
;
1403 int multibytep
= coding
->dst_multibyte
;
1404 int *charbuf
= coding
->charbuf
;
1405 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1406 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1407 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1409 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1410 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1411 int produced_chars
= 0;
1412 Lisp_Object attrs
, eol_type
, charset_list
;
1415 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1417 if (bom
== utf_16_with_bom
)
1419 ASSURE_DESTINATION (safe_room
);
1421 EMIT_TWO_BYTES (0xFF, 0xFE);
1423 EMIT_TWO_BYTES (0xFE, 0xFF);
1424 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1427 while (charbuf
< charbuf_end
)
1429 ASSURE_DESTINATION (safe_room
);
1431 if (c
>= MAX_UNICODE_CHAR
)
1432 c
= coding
->default_char
;
1437 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1439 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1446 c1
= (c
>> 10) + 0xD800;
1447 c2
= (c
& 0x3FF) + 0xDC00;
1449 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1451 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1454 coding
->result
= CODING_RESULT_SUCCESS
;
1455 coding
->produced
= dst
- coding
->destination
;
1456 coding
->produced_char
+= produced_chars
;
1461 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1463 /* Emacs' internal format for representation of multiple character
1464 sets is a kind of multi-byte encoding, i.e. characters are
1465 represented by variable-length sequences of one-byte codes.
1467 ASCII characters and control characters (e.g. `tab', `newline') are
1468 represented by one-byte sequences which are their ASCII codes, in
1469 the range 0x00 through 0x7F.
1471 8-bit characters of the range 0x80..0x9F are represented by
1472 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1475 8-bit characters of the range 0xA0..0xFF are represented by
1476 one-byte sequences which are their 8-bit code.
1478 The other characters are represented by a sequence of `base
1479 leading-code', optional `extended leading-code', and one or two
1480 `position-code's. The length of the sequence is determined by the
1481 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1482 whereas extended leading-code and position-code take the range 0xA0
1483 through 0xFF. See `charset.h' for more details about leading-code
1486 --- CODE RANGE of Emacs' internal format ---
1490 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1491 eight-bit-graphic 0xA0..0xBF
1492 ELSE 0x81..0x9D + [0xA0..0xFF]+
1493 ---------------------------------------------
1495 As this is the internal character representation, the format is
1496 usually not used externally (i.e. in a file or in a data sent to a
1497 process). But, it is possible to have a text externally in this
1498 format (i.e. by encoding by the coding system `emacs-mule').
1500 In that case, a sequence of one-byte codes has a slightly different
1503 At first, all characters in eight-bit-control are represented by
1504 one-byte sequences which are their 8-bit code.
1506 Next, character composition data are represented by the byte
1507 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1509 METHOD is 0xF0 plus one of composition method (enum
1510 composition_method),
1512 BYTES is 0xA0 plus a byte length of this composition data,
1514 CHARS is 0x20 plus a number of characters composed by this
1517 COMPONENTs are characters of multibye form or composition
1518 rules encoded by two-byte of ASCII codes.
1520 In addition, for backward compatibility, the following formats are
1521 also recognized as composition data on decoding.
1524 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1527 MSEQ is a multibyte form but in these special format:
1528 ASCII: 0xA0 ASCII_CODE+0x80,
1529 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1530 RULE is a one byte code of the range 0xA0..0xF0 that
1531 represents a composition rule.
1534 char emacs_mule_bytes
[256];
1536 /* Leading-code followed by extended leading-code. */
1537 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1538 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1539 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1540 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1544 emacs_mule_char (coding
, src
, nbytes
, nchars
)
1545 struct coding_system
*coding
;
1547 int *nbytes
, *nchars
;
1549 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1550 int multibytep
= coding
->src_multibyte
;
1551 unsigned char *src_base
= src
;
1552 struct charset
*charset
;
1555 int consumed_chars
= 0;
1558 switch (emacs_mule_bytes
[c
])
1561 if (! (charset
= emacs_mule_charset
[c
]))
1568 if (c
== LEADING_CODE_PRIVATE_11
1569 || c
== LEADING_CODE_PRIVATE_12
)
1572 if (! (charset
= emacs_mule_charset
[c
]))
1579 if (! (charset
= emacs_mule_charset
[c
]))
1582 code
= (c
& 0x7F) << 8;
1590 if (! (charset
= emacs_mule_charset
[c
]))
1593 code
= (c
& 0x7F) << 8;
1600 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
) ? charset_ascii
1601 : code
< 0xA0 ? charset_8_bit_control
1602 : charset_8_bit_graphic
);
1608 c
= DECODE_CHAR (charset
, code
);
1611 *nbytes
= src
- src_base
;
1612 *nchars
= consumed_chars
;
1623 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1624 Check if a text is encoded in `emacs-mule'. */
1627 detect_coding_emacs_mule (coding
, mask
)
1628 struct coding_system
*coding
;
1631 unsigned char *src
= coding
->source
, *src_base
= src
;
1632 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1633 int multibytep
= coding
->src_multibyte
;
1634 int consumed_chars
= 0;
1638 /* A coding system of this category is always ASCII compatible. */
1639 src
+= coding
->head_ascii
;
1647 /* Perhaps the start of composite character. We simple skip
1648 it because analyzing it is too heavy for detecting. But,
1649 at least, we check that the composite character
1650 constitues of more than 4 bytes. */
1651 unsigned char *src_base
;
1661 if (src
- src_base
<= 4)
1671 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1676 unsigned char *src_base
= src
- 1;
1683 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1688 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1694 *mask
&= CATEGORY_MASK_EMACS_MULE
;
1699 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1701 /* Decode a character represented as a component of composition
1702 sequence of Emacs 20/21 style at SRC. Set C to that character and
1703 update SRC to the head of next character (or an encoded composition
1704 rule). If SRC doesn't points a composition component, set C to -1.
1705 If SRC points an invalid byte sequence, global exit by a return
1708 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1712 int nbytes, nchars; \
1714 if (src == src_end) \
1716 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1721 goto invalid_code; \
1725 consumed_chars += nchars; \
1730 /* Decode a composition rule represented as a component of composition
1731 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1732 and increment BUF. If SRC points an invalid byte sequence, set C
1735 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1737 int c, gref, nref; \
1739 if (src >= src_end) \
1740 goto invalid_code; \
1741 ONE_MORE_BYTE_NO_CHECK (c); \
1743 if (c < 0 || c >= 81) \
1744 goto invalid_code; \
1746 gref = c / 9, nref = c % 9; \
1747 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1751 /* Decode a composition rule represented as a component of composition
1752 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1753 and increment BUF. If SRC points an invalid byte sequence, set C
1756 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1760 if (src + 1>= src_end) \
1761 goto invalid_code; \
1762 ONE_MORE_BYTE_NO_CHECK (gref); \
1764 ONE_MORE_BYTE_NO_CHECK (nref); \
1766 if (gref < 0 || gref >= 81 \
1767 || nref < 0 || nref >= 81) \
1768 goto invalid_code; \
1769 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1773 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1776 *buf++ = coding->produced_char + char_offset; \
1777 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1783 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1785 /* Emacs 21 style format. The first three bytes at SRC are \
1786 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1787 the byte length of this composition information, CHARS is the \
1788 number of characters composed by this composition. */ \
1789 enum composition_method method = c - 0xF2; \
1790 int *charbuf_base = charbuf; \
1791 int consumed_chars_limit; \
1792 int nbytes, nchars; \
1794 ONE_MORE_BYTE (c); \
1795 nbytes = c - 0xA0; \
1797 goto invalid_code; \
1798 ONE_MORE_BYTE (c); \
1799 nchars = c - 0xA0; \
1800 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1801 consumed_chars_limit = consumed_chars_base + nbytes; \
1802 if (method != COMPOSITION_RELATIVE) \
1805 while (consumed_chars < consumed_chars_limit) \
1807 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1808 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1810 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1813 if (consumed_chars < consumed_chars_limit) \
1814 goto invalid_code; \
1815 charbuf_base[0] -= i; \
1820 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1822 /* Emacs 20 style format for relative composition. */ \
1823 /* Store multibyte form of characters to be composed. */ \
1824 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1825 int *buf = components; \
1829 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1830 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1831 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1833 goto invalid_code; \
1834 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1835 for (j = 0; j < i; j++) \
1836 *charbuf++ = components[j]; \
1840 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1842 /* Emacs 20 style format for rule-base composition. */ \
1843 /* Store multibyte form of characters to be composed. */ \
1844 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1845 int *buf = components; \
1848 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1849 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1851 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1852 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1854 if (i < 1 || (buf - components) % 2 == 0) \
1855 goto invalid_code; \
1856 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1857 goto no_more_source; \
1858 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1859 for (j = 0; j < i; j++) \
1860 *charbuf++ = components[j]; \
1861 for (j = 0; j < i; j += 2) \
1862 *charbuf++ = components[j]; \
1867 decode_coding_emacs_mule (coding
)
1868 struct coding_system
*coding
;
1870 unsigned char *src
= coding
->source
+ coding
->consumed
;
1871 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1872 unsigned char *src_base
;
1873 int *charbuf
= coding
->charbuf
;
1874 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1875 int consumed_chars
= 0, consumed_chars_base
;
1876 int char_offset
= 0;
1877 int multibytep
= coding
->src_multibyte
;
1878 Lisp_Object attrs
, eol_type
, charset_list
;
1880 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1887 consumed_chars_base
= consumed_chars
;
1889 if (charbuf
>= charbuf_end
)
1898 if (EQ (eol_type
, Qdos
))
1901 goto no_more_source
;
1905 else if (EQ (eol_type
, Qmac
))
1913 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
1916 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1917 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1918 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1920 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1922 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1925 coding
->annotated
= 1;
1927 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1931 consumed_chars
= consumed_chars_base
;
1932 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
);
1941 consumed_chars
+= nchars
;
1948 consumed_chars
= consumed_chars_base
;
1950 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1955 coding
->consumed_char
+= consumed_chars_base
;
1956 coding
->consumed
= src_base
- coding
->source
;
1957 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1961 #define EMACS_MULE_LEADING_CODES(id, codes) \
1964 codes[0] = id, codes[1] = 0; \
1965 else if (id < 0xE0) \
1966 codes[0] = 0x9A, codes[1] = id; \
1967 else if (id < 0xF0) \
1968 codes[0] = 0x9B, codes[1] = id; \
1969 else if (id < 0xF5) \
1970 codes[0] = 0x9C, codes[1] = id; \
1972 codes[0] = 0x9D, codes[1] = id; \
1977 encode_coding_emacs_mule (coding
)
1978 struct coding_system
*coding
;
1980 int multibytep
= coding
->dst_multibyte
;
1981 int *charbuf
= coding
->charbuf
;
1982 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1983 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1984 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1986 int produced_chars
= 0;
1987 Lisp_Object attrs
, eol_type
, charset_list
;
1990 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1992 while (charbuf
< charbuf_end
)
1994 ASSURE_DESTINATION (safe_room
);
1996 if (ASCII_CHAR_P (c
))
1997 EMIT_ONE_ASCII_BYTE (c
);
2000 struct charset
*charset
;
2004 unsigned char leading_codes
[2];
2006 charset
= char_charset (c
, charset_list
, &code
);
2009 c
= coding
->default_char
;
2010 if (ASCII_CHAR_P (c
))
2012 EMIT_ONE_ASCII_BYTE (c
);
2015 charset
= char_charset (c
, charset_list
, &code
);
2017 dimension
= CHARSET_DIMENSION (charset
);
2018 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2019 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2020 EMIT_ONE_BYTE (leading_codes
[0]);
2021 if (leading_codes
[1])
2022 EMIT_ONE_BYTE (leading_codes
[1]);
2024 EMIT_ONE_BYTE (code
);
2027 EMIT_ONE_BYTE (code
>> 8);
2028 EMIT_ONE_BYTE (code
& 0xFF);
2032 coding
->result
= CODING_RESULT_SUCCESS
;
2033 coding
->produced_char
+= produced_chars
;
2034 coding
->produced
= dst
- coding
->destination
;
2039 /*** 7. ISO2022 handlers ***/
2041 /* The following note describes the coding system ISO2022 briefly.
2042 Since the intention of this note is to help understand the
2043 functions in this file, some parts are NOT ACCURATE or OVERLY
2044 SIMPLIFIED. For thorough understanding, please refer to the
2045 original document of ISO2022.
2047 ISO2022 provides many mechanisms to encode several character sets
2048 in 7-bit and 8-bit environments. For 7-bite environments, all text
2049 is encoded using bytes less than 128. This may make the encoded
2050 text a little bit longer, but the text passes more easily through
2051 several gateways, some of which strip off MSB (Most Signigant Bit).
2053 There are two kinds of character sets: control character set and
2054 graphic character set. The former contains control characters such
2055 as `newline' and `escape' to provide control functions (control
2056 functions are also provided by escape sequences). The latter
2057 contains graphic characters such as 'A' and '-'. Emacs recognizes
2058 two control character sets and many graphic character sets.
2060 Graphic character sets are classified into one of the following
2061 four classes, according to the number of bytes (DIMENSION) and
2062 number of characters in one dimension (CHARS) of the set:
2063 - DIMENSION1_CHARS94
2064 - DIMENSION1_CHARS96
2065 - DIMENSION2_CHARS94
2066 - DIMENSION2_CHARS96
2068 In addition, each character set is assigned an identification tag,
2069 unique for each set, called "final character" (denoted as <F>
2070 hereafter). The <F> of each character set is decided by ECMA(*)
2071 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2072 (0x30..0x3F are for private use only).
2074 Note (*): ECMA = European Computer Manufacturers Association
2076 Here are examples of graphic character set [NAME(<F>)]:
2077 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2078 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2079 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2080 o DIMENSION2_CHARS96 -- none for the moment
2082 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2083 C0 [0x00..0x1F] -- control character plane 0
2084 GL [0x20..0x7F] -- graphic character plane 0
2085 C1 [0x80..0x9F] -- control character plane 1
2086 GR [0xA0..0xFF] -- graphic character plane 1
2088 A control character set is directly designated and invoked to C0 or
2089 C1 by an escape sequence. The most common case is that:
2090 - ISO646's control character set is designated/invoked to C0, and
2091 - ISO6429's control character set is designated/invoked to C1,
2092 and usually these designations/invocations are omitted in encoded
2093 text. In a 7-bit environment, only C0 can be used, and a control
2094 character for C1 is encoded by an appropriate escape sequence to
2095 fit into the environment. All control characters for C1 are
2096 defined to have corresponding escape sequences.
2098 A graphic character set is at first designated to one of four
2099 graphic registers (G0 through G3), then these graphic registers are
2100 invoked to GL or GR. These designations and invocations can be
2101 done independently. The most common case is that G0 is invoked to
2102 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2103 these invocations and designations are omitted in encoded text.
2104 In a 7-bit environment, only GL can be used.
2106 When a graphic character set of CHARS94 is invoked to GL, codes
2107 0x20 and 0x7F of the GL area work as control characters SPACE and
2108 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2111 There are two ways of invocation: locking-shift and single-shift.
2112 With locking-shift, the invocation lasts until the next different
2113 invocation, whereas with single-shift, the invocation affects the
2114 following character only and doesn't affect the locking-shift
2115 state. Invocations are done by the following control characters or
2118 ----------------------------------------------------------------------
2119 abbrev function cntrl escape seq description
2120 ----------------------------------------------------------------------
2121 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2122 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2123 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2124 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2125 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2126 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2127 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2128 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2129 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2130 ----------------------------------------------------------------------
2131 (*) These are not used by any known coding system.
2133 Control characters for these functions are defined by macros
2134 ISO_CODE_XXX in `coding.h'.
2136 Designations are done by the following escape sequences:
2137 ----------------------------------------------------------------------
2138 escape sequence description
2139 ----------------------------------------------------------------------
2140 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2141 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2142 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2143 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2144 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2145 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2146 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2147 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2148 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2149 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2150 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2151 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2152 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2153 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2154 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2155 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2156 ----------------------------------------------------------------------
2158 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2159 of dimension 1, chars 94, and final character <F>, etc...
2161 Note (*): Although these designations are not allowed in ISO2022,
2162 Emacs accepts them on decoding, and produces them on encoding
2163 CHARS96 character sets in a coding system which is characterized as
2164 7-bit environment, non-locking-shift, and non-single-shift.
2166 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2167 '(' must be omitted. We refer to this as "short-form" hereafter.
2169 Now you may notice that there are a lot of ways for encoding the
2170 same multilingual text in ISO2022. Actually, there exist many
2171 coding systems such as Compound Text (used in X11's inter client
2172 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2173 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2174 localized platforms), and all of these are variants of ISO2022.
2176 In addition to the above, Emacs handles two more kinds of escape
2177 sequences: ISO6429's direction specification and Emacs' private
2178 sequence for specifying character composition.
2180 ISO6429's direction specification takes the following form:
2181 o CSI ']' -- end of the current direction
2182 o CSI '0' ']' -- end of the current direction
2183 o CSI '1' ']' -- start of left-to-right text
2184 o CSI '2' ']' -- start of right-to-left text
2185 The control character CSI (0x9B: control sequence introducer) is
2186 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2188 Character composition specification takes the following form:
2189 o ESC '0' -- start relative composition
2190 o ESC '1' -- end composition
2191 o ESC '2' -- start rule-base composition (*)
2192 o ESC '3' -- start relative composition with alternate chars (**)
2193 o ESC '4' -- start rule-base composition with alternate chars (**)
2194 Since these are not standard escape sequences of any ISO standard,
2195 the use of them for these meaning is restricted to Emacs only.
2197 (*) This form is used only in Emacs 20.5 and the older versions,
2198 but the newer versions can safely decode it.
2199 (**) This form is used only in Emacs 21.1 and the newer versions,
2200 and the older versions can't decode it.
2202 Here's a list of examples usages of these composition escape
2203 sequences (categorized by `enum composition_method').
2205 COMPOSITION_RELATIVE:
2206 ESC 0 CHAR [ CHAR ] ESC 1
2207 COMPOSITOIN_WITH_RULE:
2208 ESC 2 CHAR [ RULE CHAR ] ESC 1
2209 COMPOSITION_WITH_ALTCHARS:
2210 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2211 COMPOSITION_WITH_RULE_ALTCHARS:
2212 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2214 enum iso_code_class_type iso_code_class
[256];
2216 #define SAFE_CHARSET_P(coding, id) \
2217 ((id) <= (coding)->max_charset_id \
2218 && (coding)->safe_charsets[id] >= 0)
2221 #define SHIFT_OUT_OK(category) \
2222 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2225 setup_iso_safe_charsets (attrs
)
2228 Lisp_Object charset_list
, safe_charsets
;
2229 Lisp_Object request
;
2230 Lisp_Object reg_usage
;
2233 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2236 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2237 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2238 && ! EQ (charset_list
, Viso_2022_charset_list
))
2240 CODING_ATTR_CHARSET_LIST (attrs
)
2241 = charset_list
= Viso_2022_charset_list
;
2242 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2245 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2249 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2251 int id
= XINT (XCAR (tail
));
2252 if (max_charset_id
< id
)
2253 max_charset_id
= id
;
2256 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2258 request
= AREF (attrs
, coding_attr_iso_request
);
2259 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2260 reg94
= XINT (XCAR (reg_usage
));
2261 reg96
= XINT (XCDR (reg_usage
));
2263 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2267 struct charset
*charset
;
2270 charset
= CHARSET_FROM_ID (XINT (id
));
2271 reg
= Fcdr (Fassq (request
, id
));
2273 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2274 else if (charset
->iso_chars_96
)
2277 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2282 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2285 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2289 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2290 Check if a text is encoded in ISO2022. If it is, returns an
2291 integer in which appropriate flag bits any of:
2293 CATEGORY_MASK_ISO_7_TIGHT
2294 CATEGORY_MASK_ISO_8_1
2295 CATEGORY_MASK_ISO_8_2
2296 CATEGORY_MASK_ISO_7_ELSE
2297 CATEGORY_MASK_ISO_8_ELSE
2298 are set. If a code which should never appear in ISO2022 is found,
2302 detect_coding_iso_2022 (coding
, mask
)
2303 struct coding_system
*coding
;
2306 unsigned char *src
= coding
->source
, *src_base
= src
;
2307 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2308 int multibytep
= coding
->src_multibyte
;
2309 int mask_iso
= CATEGORY_MASK_ISO
;
2310 int mask_found
= 0, mask_8bit_found
= 0;
2311 int reg
[4], shift_out
= 0, single_shifting
= 0;
2314 int consumed_chars
= 0;
2317 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2319 struct coding_system
*this = &(coding_categories
[i
]);
2320 Lisp_Object attrs
, val
;
2322 attrs
= CODING_ID_ATTRS (this->id
);
2323 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2324 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2325 setup_iso_safe_charsets (attrs
);
2326 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2327 this->max_charset_id
= XSTRING (val
)->size
- 1;
2328 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2331 /* A coding system of this category is always ASCII compatible. */
2332 src
+= coding
->head_ascii
;
2334 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2335 while (mask_iso
&& src
< src_end
)
2341 if (inhibit_iso_escape_detection
)
2343 single_shifting
= 0;
2345 if (c
>= '(' && c
<= '/')
2347 /* Designation sequence for a charset of dimension 1. */
2349 if (c1
< ' ' || c1
>= 0x80
2350 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2351 /* Invalid designation sequence. Just ignore. */
2353 reg
[(c
- '(') % 4] = id
;
2357 /* Designation sequence for a charset of dimension 2. */
2359 if (c
>= '@' && c
<= 'B')
2360 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2361 reg
[0] = id
= iso_charset_table
[1][0][c
];
2362 else if (c
>= '(' && c
<= '/')
2365 if (c1
< ' ' || c1
>= 0x80
2366 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2367 /* Invalid designation sequence. Just ignore. */
2369 reg
[(c
- '(') % 4] = id
;
2372 /* Invalid designation sequence. Just ignore. */
2375 else if (c
== 'N' || c
== 'O')
2377 /* ESC <Fe> for SS2 or SS3. */
2378 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2381 else if (c
>= '0' && c
<= '4')
2383 /* ESC <Fp> for start/end composition. */
2384 mask_found
|= CATEGORY_MASK_ISO
;
2389 /* Invalid escape sequence. */
2390 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2394 /* We found a valid designation sequence for CHARSET. */
2395 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2396 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2398 mask_found
|= CATEGORY_MASK_ISO_7
;
2400 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2401 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2403 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2405 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2406 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2408 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2410 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2411 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2413 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2415 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2419 if (inhibit_iso_escape_detection
)
2421 single_shifting
= 0;
2424 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2425 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2427 /* Locking shift out. */
2428 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2429 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2434 if (inhibit_iso_escape_detection
)
2436 single_shifting
= 0;
2439 /* Locking shift in. */
2440 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2441 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2446 single_shifting
= 0;
2450 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2452 if (inhibit_iso_escape_detection
)
2454 if (c
!= ISO_CODE_CSI
)
2456 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2457 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2458 newmask
|= CATEGORY_MASK_ISO_8_1
;
2459 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2460 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2461 newmask
|= CATEGORY_MASK_ISO_8_2
;
2462 single_shifting
= 1;
2464 if (VECTORP (Vlatin_extra_code_table
)
2465 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2467 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2468 & CODING_ISO_FLAG_LATIN_EXTRA
)
2469 newmask
|= CATEGORY_MASK_ISO_8_1
;
2470 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2471 & CODING_ISO_FLAG_LATIN_EXTRA
)
2472 newmask
|= CATEGORY_MASK_ISO_8_2
;
2474 mask_iso
&= newmask
;
2475 mask_found
|= newmask
;
2482 single_shifting
= 0;
2487 single_shifting
= 0;
2488 mask_8bit_found
= 1;
2489 if (VECTORP (Vlatin_extra_code_table
)
2490 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2494 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2495 & CODING_ISO_FLAG_LATIN_EXTRA
)
2496 newmask
|= CATEGORY_MASK_ISO_8_1
;
2497 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2498 & CODING_ISO_FLAG_LATIN_EXTRA
)
2499 newmask
|= CATEGORY_MASK_ISO_8_2
;
2500 mask_iso
&= newmask
;
2501 mask_found
|= newmask
;
2508 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2509 | CATEGORY_MASK_ISO_7_ELSE
);
2510 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2511 mask_8bit_found
= 1;
2512 /* Check the length of succeeding codes of the range
2513 0xA0..0FF. If the byte length is odd, we exclude
2514 CATEGORY_MASK_ISO_8_2. We can check this only
2515 when we are not single shifting. */
2516 if (!single_shifting
2517 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2520 while (src
< src_end
)
2528 if (i
& 1 && src
< src_end
)
2529 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2531 mask_found
|= CATEGORY_MASK_ISO_8_2
;
2540 *mask
&= ~CATEGORY_MASK_ISO
;
2545 *mask
&= mask_iso
& mask_found
;
2546 if (! mask_8bit_found
)
2547 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2552 /* Set designation state into CODING. */
2553 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2557 if (final < '0' || final >= 128 \
2558 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2559 || !SAFE_CHARSET_P (coding, id)) \
2561 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2562 goto invalid_code; \
2564 prev = CODING_ISO_DESIGNATION (coding, reg); \
2565 CODING_ISO_DESIGNATION (coding, reg) = id; \
2566 /* If there was an invalid designation to REG previously, and this \
2567 designation is ASCII to REG, we should keep this designation \
2569 if (prev == -2 && id == charset_ascii) \
2570 goto invalid_code; \
2574 #define MAYBE_FINISH_COMPOSITION() \
2577 if (composition_state == COMPOSING_NO) \
2579 /* It is assured that we have enough room for producing \
2580 characters stored in the table `components'. */ \
2581 if (charbuf + component_idx > charbuf_end) \
2582 goto no_more_source; \
2583 composition_state = COMPOSING_NO; \
2584 if (method == COMPOSITION_RELATIVE \
2585 || method == COMPOSITION_WITH_ALTCHARS) \
2587 for (i = 0; i < component_idx; i++) \
2588 *charbuf++ = components[i]; \
2589 char_offset += component_idx; \
2593 for (i = 0; i < component_idx; i += 2) \
2594 *charbuf++ = components[i]; \
2595 char_offset += (component_idx / 2) + 1; \
2600 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2601 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2602 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2603 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2604 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2607 #define DECODE_COMPOSITION_START(c1) \
2610 && composition_state == COMPOSING_COMPONENT_RULE) \
2612 component_len = component_idx; \
2613 composition_state = COMPOSING_CHAR; \
2619 MAYBE_FINISH_COMPOSITION (); \
2620 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2621 goto no_more_source; \
2622 for (p = src; p < src_end - 1; p++) \
2623 if (*p == ISO_CODE_ESC && p[1] == '1') \
2625 if (p == src_end - 1) \
2627 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2628 goto invalid_code; \
2629 goto no_more_source; \
2632 /* This is surely the start of a composition. */ \
2633 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2634 : c1 == '2' ? COMPOSITION_WITH_RULE \
2635 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2636 : COMPOSITION_WITH_RULE_ALTCHARS); \
2637 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2638 : COMPOSING_COMPONENT_CHAR); \
2639 component_idx = component_len = 0; \
2644 /* Handle compositoin end sequence ESC 1. */
2646 #define DECODE_COMPOSITION_END() \
2648 int nchars = (component_len > 0 ? component_idx - component_len \
2649 : method == COMPOSITION_RELATIVE ? component_idx \
2650 : (component_idx + 1) / 2); \
2652 int *saved_charbuf = charbuf; \
2654 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2655 if (method != COMPOSITION_RELATIVE) \
2657 if (component_len == 0) \
2658 for (i = 0; i < component_idx; i++) \
2659 *charbuf++ = components[i]; \
2661 for (i = 0; i < component_len; i++) \
2662 *charbuf++ = components[i]; \
2663 *saved_charbuf = saved_charbuf - charbuf; \
2665 if (method == COMPOSITION_WITH_RULE) \
2666 for (i = 0; i < component_idx; i += 2, char_offset++) \
2667 *charbuf++ = components[i]; \
2669 for (i = component_len; i < component_idx; i++, char_offset++) \
2670 *charbuf++ = components[i]; \
2671 coding->annotated = 1; \
2672 composition_state = COMPOSING_NO; \
2676 /* Decode a composition rule from the byte C1 (and maybe one more byte
2677 from SRC) and store one encoded composition rule in
2678 coding->cmp_data. */
2680 #define DECODE_COMPOSITION_RULE(c1) \
2683 if (c1 < 81) /* old format (before ver.21) */ \
2685 int gref = (c1) / 9; \
2686 int nref = (c1) % 9; \
2687 if (gref == 4) gref = 10; \
2688 if (nref == 4) nref = 10; \
2689 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2691 else if (c1 < 93) /* new format (after ver.21) */ \
2693 ONE_MORE_BYTE (c2); \
2694 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2701 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2704 decode_coding_iso_2022 (coding
)
2705 struct coding_system
*coding
;
2707 unsigned char *src
= coding
->source
+ coding
->consumed
;
2708 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2709 unsigned char *src_base
;
2710 int *charbuf
= coding
->charbuf
;
2711 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2712 int consumed_chars
= 0, consumed_chars_base
;
2713 int char_offset
= 0;
2714 int multibytep
= coding
->src_multibyte
;
2715 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2716 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2717 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2718 struct charset
*charset
;
2720 /* For handling composition sequence. */
2721 #define COMPOSING_NO 0
2722 #define COMPOSING_CHAR 1
2723 #define COMPOSING_RULE 2
2724 #define COMPOSING_COMPONENT_CHAR 3
2725 #define COMPOSING_COMPONENT_RULE 4
2727 int composition_state
= COMPOSING_NO
;
2728 enum composition_method method
;
2729 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2732 Lisp_Object attrs
, eol_type
, charset_list
;
2734 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2735 setup_iso_safe_charsets (attrs
);
2742 consumed_chars_base
= consumed_chars
;
2744 if (charbuf
>= charbuf_end
)
2749 /* We produce no character or one character. */
2750 switch (iso_code_class
[c1
])
2752 case ISO_0x20_or_0x7F
:
2753 if (composition_state
!= COMPOSING_NO
)
2755 if (composition_state
== COMPOSING_RULE
2756 || composition_state
== COMPOSING_COMPONENT_RULE
)
2758 DECODE_COMPOSITION_RULE (c1
);
2759 components
[component_idx
++] = c1
;
2760 composition_state
--;
2764 if (charset_id_0
< 0
2765 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2766 /* This is SPACE or DEL. */
2767 charset
= CHARSET_FROM_ID (charset_ascii
);
2769 charset
= CHARSET_FROM_ID (charset_id_0
);
2772 case ISO_graphic_plane_0
:
2773 if (composition_state
!= COMPOSING_NO
)
2775 if (composition_state
== COMPOSING_RULE
2776 || composition_state
== COMPOSING_COMPONENT_RULE
)
2778 DECODE_COMPOSITION_RULE (c1
);
2779 components
[component_idx
++] = c1
;
2780 composition_state
--;
2784 charset
= CHARSET_FROM_ID (charset_id_0
);
2787 case ISO_0xA0_or_0xFF
:
2788 if (charset_id_1
< 0
2789 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2790 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2792 /* This is a graphic character, we fall down ... */
2794 case ISO_graphic_plane_1
:
2795 if (charset_id_1
< 0)
2797 charset
= CHARSET_FROM_ID (charset_id_1
);
2800 case ISO_carriage_return
:
2803 if (EQ (eol_type
, Qdos
))
2806 goto no_more_source
;
2810 else if (EQ (eol_type
, Qmac
))
2816 MAYBE_FINISH_COMPOSITION ();
2817 charset
= CHARSET_FROM_ID (charset_ascii
);
2821 MAYBE_FINISH_COMPOSITION ();
2825 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2826 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2828 CODING_ISO_INVOCATION (coding
, 0) = 1;
2829 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2833 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2835 CODING_ISO_INVOCATION (coding
, 0) = 0;
2836 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2839 case ISO_single_shift_2_7
:
2840 case ISO_single_shift_2
:
2841 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2843 /* SS2 is handled as an escape sequence of ESC 'N' */
2845 goto label_escape_sequence
;
2847 case ISO_single_shift_3
:
2848 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2850 /* SS2 is handled as an escape sequence of ESC 'O' */
2852 goto label_escape_sequence
;
2854 case ISO_control_sequence_introducer
:
2855 /* CSI is handled as an escape sequence of ESC '[' ... */
2857 goto label_escape_sequence
;
2861 label_escape_sequence
:
2862 /* Escape sequences handled here are invocation,
2863 designation, direction specification, and character
2864 composition specification. */
2867 case '&': /* revision of following character set */
2869 if (!(c1
>= '@' && c1
<= '~'))
2872 if (c1
!= ISO_CODE_ESC
)
2875 goto label_escape_sequence
;
2877 case '$': /* designation of 2-byte character set */
2878 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2881 if (c1
>= '@' && c1
<= 'B')
2882 { /* designation of JISX0208.1978, GB2312.1980,
2884 DECODE_DESIGNATION (0, 2, 0, c1
);
2886 else if (c1
>= 0x28 && c1
<= 0x2B)
2887 { /* designation of DIMENSION2_CHARS94 character set */
2889 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2891 else if (c1
>= 0x2C && c1
<= 0x2F)
2892 { /* designation of DIMENSION2_CHARS96 character set */
2894 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2898 /* We must update these variables now. */
2899 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2900 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2903 case 'n': /* invocation of locking-shift-2 */
2904 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2905 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2907 CODING_ISO_INVOCATION (coding
, 0) = 2;
2908 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2911 case 'o': /* invocation of locking-shift-3 */
2912 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2913 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2915 CODING_ISO_INVOCATION (coding
, 0) = 3;
2916 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2919 case 'N': /* invocation of single-shift-2 */
2920 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2921 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2923 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2925 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2929 case 'O': /* invocation of single-shift-3 */
2930 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2931 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2933 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2935 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2939 case '0': case '2': case '3': case '4': /* start composition */
2940 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2942 DECODE_COMPOSITION_START (c1
);
2945 case '1': /* end composition */
2946 if (composition_state
== COMPOSING_NO
)
2948 DECODE_COMPOSITION_END ();
2951 case '[': /* specification of direction */
2952 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
2954 /* For the moment, nested direction is not supported.
2955 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2956 left-to-right, and nozero means right-to-left. */
2960 case ']': /* end of the current direction */
2961 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2963 case '0': /* end of the current direction */
2964 case '1': /* start of left-to-right direction */
2967 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2972 case '2': /* start of right-to-left direction */
2975 coding
->mode
|= CODING_MODE_DIRECTION
;
2986 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2988 if (c1
>= 0x28 && c1
<= 0x2B)
2989 { /* designation of DIMENSION1_CHARS94 character set */
2991 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
2993 else if (c1
>= 0x2C && c1
<= 0x2F)
2994 { /* designation of DIMENSION1_CHARS96 character set */
2996 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3000 /* We must update these variables now. */
3001 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3002 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3007 /* Now we know CHARSET and 1st position code C1 of a character.
3008 Produce a decoded character while getting 2nd position code
3011 if (CHARSET_DIMENSION (charset
) > 1)
3014 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3015 /* C2 is not in a valid range. */
3017 c1
= (c1
<< 8) | (c2
& 0x7F);
3018 if (CHARSET_DIMENSION (charset
) > 2)
3021 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3022 /* C2 is not in a valid range. */
3024 c1
= (c1
<< 8) | (c2
& 0x7F);
3028 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3031 MAYBE_FINISH_COMPOSITION ();
3032 for (; src_base
< src
; src_base
++, char_offset
++)
3034 if (ASCII_BYTE_P (*src_base
))
3035 *charbuf
++ = *src_base
;
3037 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3040 else if (composition_state
== COMPOSING_NO
)
3047 components
[component_idx
++] = c
;
3048 if (method
== COMPOSITION_WITH_RULE
3049 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3050 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3051 composition_state
++;
3056 MAYBE_FINISH_COMPOSITION ();
3058 consumed_chars
= consumed_chars_base
;
3060 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3065 coding
->consumed_char
+= consumed_chars_base
;
3066 coding
->consumed
= src_base
- coding
->source
;
3067 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3071 /* ISO2022 encoding stuff. */
3074 It is not enough to say just "ISO2022" on encoding, we have to
3075 specify more details. In Emacs, each coding system of ISO2022
3076 variant has the following specifications:
3077 1. Initial designation to G0 thru G3.
3078 2. Allows short-form designation?
3079 3. ASCII should be designated to G0 before control characters?
3080 4. ASCII should be designated to G0 at end of line?
3081 5. 7-bit environment or 8-bit environment?
3082 6. Use locking-shift?
3083 7. Use Single-shift?
3084 And the following two are only for Japanese:
3085 8. Use ASCII in place of JIS0201-1976-Roman?
3086 9. Use JISX0208-1983 in place of JISX0208-1978?
3087 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3088 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3092 /* Produce codes (escape sequence) for designating CHARSET to graphic
3093 register REG at DST, and increment DST. If <final-char> of CHARSET is
3094 '@', 'A', or 'B' and the coding system CODING allows, produce
3095 designation sequence of short-form. */
3097 #define ENCODE_DESIGNATION(charset, reg, coding) \
3099 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3100 char *intermediate_char_94 = "()*+"; \
3101 char *intermediate_char_96 = ",-./"; \
3102 int revision = -1; \
3105 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3106 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3108 if (revision >= 0) \
3110 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3111 EMIT_ONE_BYTE ('@' + revision); \
3113 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3114 if (CHARSET_DIMENSION (charset) == 1) \
3116 if (! CHARSET_ISO_CHARS_96 (charset)) \
3117 c = intermediate_char_94[reg]; \
3119 c = intermediate_char_96[reg]; \
3120 EMIT_ONE_ASCII_BYTE (c); \
3124 EMIT_ONE_ASCII_BYTE ('$'); \
3125 if (! CHARSET_ISO_CHARS_96 (charset)) \
3127 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3129 || final_char < '@' || final_char > 'B') \
3130 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3133 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3135 EMIT_ONE_ASCII_BYTE (final_char); \
3137 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3141 /* The following two macros produce codes (control character or escape
3142 sequence) for ISO2022 single-shift functions (single-shift-2 and
3145 #define ENCODE_SINGLE_SHIFT_2 \
3147 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3148 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3150 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3151 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3155 #define ENCODE_SINGLE_SHIFT_3 \
3157 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3158 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3160 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3161 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3165 /* The following four macros produce codes (control character or
3166 escape sequence) for ISO2022 locking-shift functions (shift-in,
3167 shift-out, locking-shift-2, and locking-shift-3). */
3169 #define ENCODE_SHIFT_IN \
3171 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3172 CODING_ISO_INVOCATION (coding, 0) = 0; \
3176 #define ENCODE_SHIFT_OUT \
3178 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3179 CODING_ISO_INVOCATION (coding, 0) = 1; \
3183 #define ENCODE_LOCKING_SHIFT_2 \
3185 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3186 CODING_ISO_INVOCATION (coding, 0) = 2; \
3190 #define ENCODE_LOCKING_SHIFT_3 \
3192 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3193 CODING_ISO_INVOCATION (coding, 0) = 3; \
3197 /* Produce codes for a DIMENSION1 character whose character set is
3198 CHARSET and whose position-code is C1. Designation and invocation
3199 sequences are also produced in advance if necessary. */
3201 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3203 int id = CHARSET_ID (charset); \
3204 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3207 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3209 EMIT_ONE_BYTE (c1 | 0x80); \
3210 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3213 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3215 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3218 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3220 EMIT_ONE_BYTE (c1 | 0x80); \
3224 /* Since CHARSET is not yet invoked to any graphic planes, we \
3225 must invoke it, or, at first, designate it to some graphic \
3226 register. Then repeat the loop to actually produce the \
3228 dst = encode_invocation_designation (charset, coding, dst, \
3233 /* Produce codes for a DIMENSION2 character whose character set is
3234 CHARSET and whose position-codes are C1 and C2. Designation and
3235 invocation codes are also produced in advance if necessary. */
3237 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3239 int id = CHARSET_ID (charset); \
3240 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3242 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3243 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3245 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3246 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3249 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3251 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3254 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3256 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3260 /* Since CHARSET is not yet invoked to any graphic planes, we \
3261 must invoke it, or, at first, designate it to some graphic \
3262 register. Then repeat the loop to actually produce the \
3264 dst = encode_invocation_designation (charset, coding, dst, \
3269 #define ENCODE_ISO_CHARACTER(charset, c) \
3271 int code = ENCODE_CHAR ((charset),(c)); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3274 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3276 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3280 /* Produce designation and invocation codes at a place pointed by DST
3281 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3285 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3286 struct charset
*charset
;
3287 struct coding_system
*coding
;
3291 int multibytep
= coding
->dst_multibyte
;
3292 int produced_chars
= *p_nchars
;
3293 int reg
; /* graphic register number */
3294 int id
= CHARSET_ID (charset
);
3296 /* At first, check designations. */
3297 for (reg
= 0; reg
< 4; reg
++)
3298 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3303 /* CHARSET is not yet designated to any graphic registers. */
3304 /* At first check the requested designation. */
3305 reg
= CODING_ISO_REQUEST (coding
, id
);
3307 /* Since CHARSET requests no special designation, designate it
3308 to graphic register 0. */
3311 ENCODE_DESIGNATION (charset
, reg
, coding
);
3314 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3315 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3317 /* Since the graphic register REG is not invoked to any graphic
3318 planes, invoke it to graphic plane 0. */
3321 case 0: /* graphic register 0 */
3325 case 1: /* graphic register 1 */
3329 case 2: /* graphic register 2 */
3330 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3331 ENCODE_SINGLE_SHIFT_2
;
3333 ENCODE_LOCKING_SHIFT_2
;
3336 case 3: /* graphic register 3 */
3337 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3338 ENCODE_SINGLE_SHIFT_3
;
3340 ENCODE_LOCKING_SHIFT_3
;
3345 *p_nchars
= produced_chars
;
3349 /* The following three macros produce codes for indicating direction
3351 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3353 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3354 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3356 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3360 #define ENCODE_DIRECTION_R2L() \
3362 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3363 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3367 #define ENCODE_DIRECTION_L2R() \
3369 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3370 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3374 /* Produce codes for designation and invocation to reset the graphic
3375 planes and registers to initial state. */
3376 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3379 struct charset *charset; \
3381 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3383 for (reg = 0; reg < 4; reg++) \
3384 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3385 && (CODING_ISO_DESIGNATION (coding, reg) \
3386 != CODING_ISO_INITIAL (coding, reg))) \
3388 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3389 ENCODE_DESIGNATION (charset, reg, coding); \
3394 /* Produce designation sequences of charsets in the line started from
3395 SRC to a place pointed by DST, and return updated DST.
3397 If the current block ends before any end-of-line, we may fail to
3398 find all the necessary designations. */
3400 static unsigned char *
3401 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3402 struct coding_system
*coding
;
3403 int *charbuf
, *charbuf_end
;
3406 struct charset
*charset
;
3407 /* Table of charsets to be designated to each graphic register. */
3409 int c
, found
= 0, reg
;
3410 int produced_chars
= 0;
3411 int multibytep
= coding
->dst_multibyte
;
3413 Lisp_Object charset_list
;
3415 attrs
= CODING_ID_ATTRS (coding
->id
);
3416 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3417 if (EQ (charset_list
, Qiso_2022
))
3418 charset_list
= Viso_2022_charset_list
;
3420 for (reg
= 0; reg
< 4; reg
++)
3430 charset
= char_charset (c
, charset_list
, NULL
);
3431 id
= CHARSET_ID (charset
);
3432 reg
= CODING_ISO_REQUEST (coding
, id
);
3433 if (reg
>= 0 && r
[reg
] < 0)
3442 for (reg
= 0; reg
< 4; reg
++)
3444 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3445 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3451 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3454 encode_coding_iso_2022 (coding
)
3455 struct coding_system
*coding
;
3457 int multibytep
= coding
->dst_multibyte
;
3458 int *charbuf
= coding
->charbuf
;
3459 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3460 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3461 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3464 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3465 && CODING_ISO_BOL (coding
));
3466 int produced_chars
= 0;
3467 Lisp_Object attrs
, eol_type
, charset_list
;
3468 int ascii_compatible
;
3471 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3472 setup_iso_safe_charsets (attrs
);
3473 coding
->safe_charsets
3474 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3476 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3478 while (charbuf
< charbuf_end
)
3480 ASSURE_DESTINATION (safe_room
);
3482 if (bol_designation
)
3484 unsigned char *dst_prev
= dst
;
3486 /* We have to produce designation sequences if any now. */
3487 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3488 bol_designation
= 0;
3489 /* We are sure that designation sequences are all ASCII bytes. */
3490 produced_chars
+= dst
- dst_prev
;
3495 /* Now encode the character C. */
3496 if (c
< 0x20 || c
== 0x7F)
3499 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3501 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3502 ENCODE_RESET_PLANE_AND_REGISTER ();
3503 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3507 for (i
= 0; i
< 4; i
++)
3508 CODING_ISO_DESIGNATION (coding
, i
)
3509 = CODING_ISO_INITIAL (coding
, i
);
3512 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3514 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3515 ENCODE_RESET_PLANE_AND_REGISTER ();
3516 EMIT_ONE_ASCII_BYTE (c
);
3518 else if (ASCII_CHAR_P (c
))
3520 if (ascii_compatible
)
3521 EMIT_ONE_ASCII_BYTE (c
);
3523 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii
), c
);
3527 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3531 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3533 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3534 charset
= CHARSET_FROM_ID (charset_ascii
);
3538 c
= coding
->default_char
;
3539 charset
= char_charset (c
, charset_list
, NULL
);
3542 ENCODE_ISO_CHARACTER (charset
, c
);
3546 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3547 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3549 ASSURE_DESTINATION (safe_room
);
3550 ENCODE_RESET_PLANE_AND_REGISTER ();
3552 coding
->result
= CODING_RESULT_SUCCESS
;
3553 CODING_ISO_BOL (coding
) = bol_designation
;
3554 coding
->produced_char
+= produced_chars
;
3555 coding
->produced
= dst
- coding
->destination
;
3560 /*** 8,9. SJIS and BIG5 handlers ***/
3562 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3563 quite widely. So, for the moment, Emacs supports them in the bare
3564 C code. But, in the future, they may be supported only by CCL. */
3566 /* SJIS is a coding system encoding three character sets: ASCII, right
3567 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3568 as is. A character of charset katakana-jisx0201 is encoded by
3569 "position-code + 0x80". A character of charset japanese-jisx0208
3570 is encoded in 2-byte but two position-codes are divided and shifted
3571 so that it fit in the range below.
3573 --- CODE RANGE of SJIS ---
3574 (character set) (range)
3576 KATAKANA-JISX0201 0xA0 .. 0xDF
3577 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3578 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3579 -------------------------------
3583 /* BIG5 is a coding system encoding two character sets: ASCII and
3584 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3585 character set and is encoded in two-byte.
3587 --- CODE RANGE of BIG5 ---
3588 (character set) (range)
3590 Big5 (1st byte) 0xA1 .. 0xFE
3591 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3592 --------------------------
3596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3597 Check if a text is encoded in SJIS. If it is, return
3598 CATEGORY_MASK_SJIS, else return 0. */
3601 detect_coding_sjis (coding
, mask
)
3602 struct coding_system
*coding
;
3605 unsigned char *src
= coding
->source
, *src_base
= src
;
3606 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3607 int multibytep
= coding
->src_multibyte
;
3608 int consumed_chars
= 0;
3612 /* A coding system of this category is always ASCII compatible. */
3613 src
+= coding
->head_ascii
;
3620 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3623 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3627 else if (c
>= 0xA0 && c
< 0xE0)
3632 *mask
&= ~CATEGORY_MASK_SJIS
;
3638 *mask
&= CATEGORY_MASK_SJIS
;
3642 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3643 Check if a text is encoded in BIG5. If it is, return
3644 CATEGORY_MASK_BIG5, else return 0. */
3647 detect_coding_big5 (coding
, mask
)
3648 struct coding_system
*coding
;
3651 unsigned char *src
= coding
->source
, *src_base
= src
;
3652 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3653 int multibytep
= coding
->src_multibyte
;
3654 int consumed_chars
= 0;
3658 /* A coding system of this category is always ASCII compatible. */
3659 src
+= coding
->head_ascii
;
3669 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3676 *mask
&= ~CATEGORY_MASK_BIG5
;
3682 *mask
&= CATEGORY_MASK_BIG5
;
3686 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3687 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3690 decode_coding_sjis (coding
)
3691 struct coding_system
*coding
;
3693 unsigned char *src
= coding
->source
+ coding
->consumed
;
3694 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3695 unsigned char *src_base
;
3696 int *charbuf
= coding
->charbuf
;
3697 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3698 int consumed_chars
= 0, consumed_chars_base
;
3699 int multibytep
= coding
->src_multibyte
;
3700 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3701 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3703 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3706 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3707 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3708 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3715 consumed_chars_base
= consumed_chars
;
3717 if (charbuf
>= charbuf_end
)
3724 if (EQ (eol_type
, Qdos
))
3727 goto no_more_source
;
3731 else if (EQ (eol_type
, Qmac
))
3736 struct charset
*charset
;
3739 charset
= charset_roman
;
3744 if (c
< 0xA0 || c
>= 0xE0)
3746 /* SJIS -> JISX0208 */
3748 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3752 charset
= charset_kanji
;
3755 /* SJIS -> JISX0201-Kana */
3756 charset
= charset_kana
;
3758 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3765 consumed_chars
= consumed_chars_base
;
3767 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3772 coding
->consumed_char
+= consumed_chars_base
;
3773 coding
->consumed
= src_base
- coding
->source
;
3774 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3778 decode_coding_big5 (coding
)
3779 struct coding_system
*coding
;
3781 unsigned char *src
= coding
->source
+ coding
->consumed
;
3782 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3783 unsigned char *src_base
;
3784 int *charbuf
= coding
->charbuf
;
3785 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3786 int consumed_chars
= 0, consumed_chars_base
;
3787 int multibytep
= coding
->src_multibyte
;
3788 struct charset
*charset_roman
, *charset_big5
;
3789 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3791 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3793 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3794 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3801 consumed_chars_base
= consumed_chars
;
3803 if (charbuf
>= charbuf_end
)
3810 if (EQ (eol_type
, Qdos
))
3813 goto no_more_source
;
3817 else if (EQ (eol_type
, Qmac
))
3822 struct charset
*charset
;
3824 charset
= charset_roman
;
3828 if (c
< 0xA1 || c
> 0xFE)
3831 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3834 charset
= charset_big5
;
3836 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3844 consumed_chars
= consumed_chars_base
;
3846 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3851 coding
->consumed_char
+= consumed_chars_base
;
3852 coding
->consumed
= src_base
- coding
->source
;
3853 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3856 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3857 This function can encode charsets `ascii', `katakana-jisx0201',
3858 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3859 are sure that all these charsets are registered as official charset
3860 (i.e. do not have extended leading-codes). Characters of other
3861 charsets are produced without any encoding. If SJIS_P is 1, encode
3862 SJIS text, else encode BIG5 text. */
3865 encode_coding_sjis (coding
)
3866 struct coding_system
*coding
;
3868 int multibytep
= coding
->dst_multibyte
;
3869 int *charbuf
= coding
->charbuf
;
3870 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3871 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3872 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3874 int produced_chars
= 0;
3875 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3876 int ascii_compatible
;
3877 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3880 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3882 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3883 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3884 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3886 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3888 while (charbuf
< charbuf_end
)
3890 ASSURE_DESTINATION (safe_room
);
3892 /* Now encode the character C. */
3893 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3894 EMIT_ONE_ASCII_BYTE (c
);
3898 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3902 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3904 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3905 charset
= CHARSET_FROM_ID (charset_ascii
);
3909 c
= coding
->default_char
;
3910 charset
= char_charset (c
, charset_list
, &code
);
3913 if (code
== CHARSET_INVALID_CODE (charset
))
3915 if (charset
== charset_kanji
)
3919 c1
= code
>> 8, c2
= code
& 0xFF;
3920 EMIT_TWO_BYTES (c1
, c2
);
3922 else if (charset
== charset_kana
)
3923 EMIT_ONE_BYTE (code
| 0x80);
3925 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3928 coding
->result
= CODING_RESULT_SUCCESS
;
3929 coding
->produced_char
+= produced_chars
;
3930 coding
->produced
= dst
- coding
->destination
;
3935 encode_coding_big5 (coding
)
3936 struct coding_system
*coding
;
3938 int multibytep
= coding
->dst_multibyte
;
3939 int *charbuf
= coding
->charbuf
;
3940 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3941 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3942 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3944 int produced_chars
= 0;
3945 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3946 int ascii_compatible
;
3947 struct charset
*charset_roman
, *charset_big5
;
3950 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3952 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3953 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3954 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3956 while (charbuf
< charbuf_end
)
3958 ASSURE_DESTINATION (safe_room
);
3960 /* Now encode the character C. */
3961 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3962 EMIT_ONE_ASCII_BYTE (c
);
3966 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3970 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3972 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3973 charset
= CHARSET_FROM_ID (charset_ascii
);
3977 c
= coding
->default_char
;
3978 charset
= char_charset (c
, charset_list
, &code
);
3981 if (code
== CHARSET_INVALID_CODE (charset
))
3983 if (charset
== charset_big5
)
3987 c1
= code
>> 8, c2
= code
& 0xFF;
3988 EMIT_TWO_BYTES (c1
, c2
);
3991 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3994 coding
->result
= CODING_RESULT_SUCCESS
;
3995 coding
->produced_char
+= produced_chars
;
3996 coding
->produced
= dst
- coding
->destination
;
4001 /*** 10. CCL handlers ***/
4003 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4004 Check if a text is encoded in a coding system of which
4005 encoder/decoder are written in CCL program. If it is, return
4006 CATEGORY_MASK_CCL, else return 0. */
4009 detect_coding_ccl (coding
, mask
)
4010 struct coding_system
*coding
;
4013 unsigned char *src
= coding
->source
, *src_base
= src
;
4014 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4015 int multibytep
= coding
->src_multibyte
;
4016 int consumed_chars
= 0;
4018 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4019 int head_ascii
= coding
->head_ascii
;
4022 coding
= &coding_categories
[coding_category_ccl
];
4023 attrs
= CODING_ID_ATTRS (coding
->id
);
4024 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4033 if (!found
&& valids
[c
] > 1)
4036 *mask
&= ~CATEGORY_MASK_CCL
;
4042 *mask
&= CATEGORY_MASK_CCL
;
4047 decode_coding_ccl (coding
)
4048 struct coding_system
*coding
;
4050 unsigned char *src
= coding
->source
+ coding
->consumed
;
4051 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4052 int *charbuf
= coding
->charbuf
;
4053 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4054 int consumed_chars
= 0;
4055 int multibytep
= coding
->src_multibyte
;
4056 struct ccl_program ccl
;
4057 int source_charbuf
[1024];
4058 int source_byteidx
[1024];
4060 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4062 while (src
< src_end
)
4064 unsigned char *p
= src
;
4065 int *source
, *source_end
;
4069 while (i
< 1024 && p
< src_end
)
4071 source_byteidx
[i
] = p
- src
;
4072 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4075 while (i
< 1024 && p
< src_end
)
4076 source_charbuf
[i
++] = *p
++;
4078 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4081 source
= source_charbuf
;
4082 source_end
= source
+ i
;
4083 while (source
< source_end
)
4085 ccl_driver (&ccl
, source
, charbuf
,
4086 source_end
- source
, charbuf_end
- charbuf
);
4087 source
+= ccl
.consumed
;
4088 charbuf
+= ccl
.produced
;
4089 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4092 if (source
< source_end
)
4093 src
+= source_byteidx
[source
- source_charbuf
];
4096 consumed_chars
+= source
- source_charbuf
;
4098 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4099 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4105 case CCL_STAT_SUSPEND_BY_SRC
:
4106 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4108 case CCL_STAT_SUSPEND_BY_DST
:
4111 case CCL_STAT_INVALID_CMD
:
4112 coding
->result
= CODING_RESULT_INTERRUPT
;
4115 coding
->result
= CODING_RESULT_SUCCESS
;
4118 coding
->consumed_char
+= consumed_chars
;
4119 coding
->consumed
= src
- coding
->source
;
4120 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4124 encode_coding_ccl (coding
)
4125 struct coding_system
*coding
;
4127 struct ccl_program ccl
;
4128 int multibytep
= coding
->dst_multibyte
;
4129 int *charbuf
= coding
->charbuf
;
4130 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4131 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4132 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4133 unsigned char *adjusted_dst_end
= dst_end
- 1;
4134 int destination_charbuf
[1024];
4135 int i
, produced_chars
= 0;
4137 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4139 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4140 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4142 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4144 int dst_bytes
= dst_end
- dst
;
4145 if (dst_bytes
> 1024)
4148 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4149 charbuf_end
- charbuf
, dst_bytes
);
4150 charbuf
+= ccl
.consumed
;
4152 for (i
= 0; i
< ccl
.produced
; i
++)
4153 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4156 for (i
= 0; i
< ccl
.produced
; i
++)
4157 *dst
++ = destination_charbuf
[i
] & 0xFF;
4158 produced_chars
+= ccl
.produced
;
4164 case CCL_STAT_SUSPEND_BY_SRC
:
4165 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4167 case CCL_STAT_SUSPEND_BY_DST
:
4168 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4171 case CCL_STAT_INVALID_CMD
:
4172 coding
->result
= CODING_RESULT_INTERRUPT
;
4175 coding
->result
= CODING_RESULT_SUCCESS
;
4179 coding
->produced_char
+= produced_chars
;
4180 coding
->produced
= dst
- coding
->destination
;
4186 /*** 10, 11. no-conversion handlers ***/
4188 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4191 decode_coding_raw_text (coding
)
4192 struct coding_system
*coding
;
4194 coding
->chars_at_source
= 1;
4195 coding
->consumed_char
= 0;
4196 coding
->consumed
= 0;
4197 coding
->result
= CODING_RESULT_SUCCESS
;
4201 encode_coding_raw_text (coding
)
4202 struct coding_system
*coding
;
4204 int multibytep
= coding
->dst_multibyte
;
4205 int *charbuf
= coding
->charbuf
;
4206 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4207 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4208 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4209 int produced_chars
= 0;
4214 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4216 if (coding
->src_multibyte
)
4217 while (charbuf
< charbuf_end
)
4219 ASSURE_DESTINATION (safe_room
);
4221 if (ASCII_CHAR_P (c
))
4222 EMIT_ONE_ASCII_BYTE (c
);
4223 else if (CHAR_BYTE8_P (c
))
4225 c
= CHAR_TO_BYTE8 (c
);
4230 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4232 CHAR_STRING_ADVANCE (c
, p1
);
4234 EMIT_ONE_BYTE (*p0
);
4238 while (charbuf
< charbuf_end
)
4240 ASSURE_DESTINATION (safe_room
);
4247 if (coding
->src_multibyte
)
4249 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4251 while (charbuf
< charbuf_end
)
4253 ASSURE_DESTINATION (safe_room
);
4255 if (ASCII_CHAR_P (c
))
4257 else if (CHAR_BYTE8_P (c
))
4258 *dst
++ = CHAR_TO_BYTE8 (c
);
4260 CHAR_STRING_ADVANCE (c
, dst
);
4266 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4267 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4268 *dst
++ = *charbuf
++;
4269 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4272 coding
->result
= CODING_RESULT_SUCCESS
;
4273 coding
->produced_char
+= produced_chars
;
4274 coding
->produced
= dst
- coding
->destination
;
4279 detect_coding_charset (coding
, mask
)
4280 struct coding_system
*coding
;
4283 unsigned char *src
= coding
->source
, *src_base
= src
;
4284 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4285 int multibytep
= coding
->src_multibyte
;
4286 int consumed_chars
= 0;
4287 Lisp_Object attrs
, valids
;
4289 coding
= &coding_categories
[coding_category_charset
];
4290 attrs
= CODING_ID_ATTRS (coding
->id
);
4291 valids
= AREF (attrs
, coding_attr_charset_valids
);
4293 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4294 src
+= coding
->head_ascii
;
4301 if (NILP (AREF (valids
, c
)))
4304 *mask
&= ~CATEGORY_MASK_CHARSET
;
4308 *mask
&= CATEGORY_MASK_CHARSET
;
4313 decode_coding_charset (coding
)
4314 struct coding_system
*coding
;
4316 unsigned char *src
= coding
->source
+ coding
->consumed
;
4317 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4318 unsigned char *src_base
;
4319 int *charbuf
= coding
->charbuf
;
4320 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4321 int consumed_chars
= 0, consumed_chars_base
;
4322 int multibytep
= coding
->src_multibyte
;
4323 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4325 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4326 valids
= AREF (attrs
, coding_attr_charset_valids
);
4333 consumed_chars_base
= consumed_chars
;
4335 if (charbuf
>= charbuf_end
)
4341 /* Here we assume that no charset maps '\r' to something
4343 if (EQ (eol_type
, Qdos
))
4349 else if (EQ (eol_type
, Qmac
))
4355 struct charset
*charset
;
4360 val
= AREF (valids
, c
);
4365 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4366 dim
= CHARSET_DIMENSION (charset
);
4370 code
= (code
<< 8) | c
;
4373 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4378 /* VAL is a list of charset IDs. It is assured that the
4379 list is sorted by charset dimensions (smaller one
4383 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4384 dim
= CHARSET_DIMENSION (charset
);
4388 code
= (code
<< 8) | c
;
4391 CODING_DECODE_CHAR (coding
, src
, src_base
,
4392 src_end
, charset
, code
, c
);
4406 consumed_chars
= consumed_chars_base
;
4408 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4413 coding
->consumed_char
+= consumed_chars_base
;
4414 coding
->consumed
= src_base
- coding
->source
;
4415 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4419 encode_coding_charset (coding
)
4420 struct coding_system
*coding
;
4422 int multibytep
= coding
->dst_multibyte
;
4423 int *charbuf
= coding
->charbuf
;
4424 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4425 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4426 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4427 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4428 int produced_chars
= 0;
4429 Lisp_Object attrs
, eol_type
, charset_list
;
4430 int ascii_compatible
;
4433 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4434 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4436 while (charbuf
< charbuf_end
)
4438 struct charset
*charset
;
4441 ASSURE_DESTINATION (safe_room
);
4443 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4444 EMIT_ONE_ASCII_BYTE (c
);
4447 charset
= char_charset (c
, charset_list
, &code
);
4450 if (CHARSET_DIMENSION (charset
) == 1)
4451 EMIT_ONE_BYTE (code
);
4452 else if (CHARSET_DIMENSION (charset
) == 2)
4453 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4454 else if (CHARSET_DIMENSION (charset
) == 3)
4455 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4457 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4458 (code
>> 8) & 0xFF, code
& 0xFF);
4462 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4463 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4465 c
= coding
->default_char
;
4471 coding
->result
= CODING_RESULT_SUCCESS
;
4472 coding
->produced_char
+= produced_chars
;
4473 coding
->produced
= dst
- coding
->destination
;
4478 /*** 7. C library functions ***/
4480 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4481 has a property `coding-system'. The value of this property is a
4482 vector of length 5 (called as coding-vector). Among elements of
4483 this vector, the first (element[0]) and the fifth (element[4])
4484 carry important information for decoding/encoding. Before
4485 decoding/encoding, this information should be set in fields of a
4486 structure of type `coding_system'.
4488 A value of property `coding-system' can be a symbol of another
4489 subsidiary coding-system. In that case, Emacs gets coding-vector
4492 `element[0]' contains information to be set in `coding->type'. The
4493 value and its meaning is as follows:
4495 0 -- coding_type_emacs_mule
4496 1 -- coding_type_sjis
4497 2 -- coding_type_iso_2022
4498 3 -- coding_type_big5
4499 4 -- coding_type_ccl encoder/decoder written in CCL
4500 nil -- coding_type_no_conversion
4501 t -- coding_type_undecided (automatic conversion on decoding,
4502 no-conversion on encoding)
4504 `element[4]' contains information to be set in `coding->flags' and
4505 `coding->spec'. The meaning varies by `coding->type'.
4507 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4508 of length 32 (of which the first 13 sub-elements are used now).
4509 Meanings of these sub-elements are:
4511 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4512 If the value is an integer of valid charset, the charset is
4513 assumed to be designated to graphic register N initially.
4515 If the value is minus, it is a minus value of charset which
4516 reserves graphic register N, which means that the charset is
4517 not designated initially but should be designated to graphic
4518 register N just before encoding a character in that charset.
4520 If the value is nil, graphic register N is never used on
4523 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4524 Each value takes t or nil. See the section ISO2022 of
4525 `coding.h' for more information.
4527 If `coding->type' is `coding_type_big5', element[4] is t to denote
4528 BIG5-ETen or nil to denote BIG5-HKU.
4530 If `coding->type' takes the other value, element[4] is ignored.
4532 Emacs Lisp's coding system also carries information about format of
4533 end-of-line in a value of property `eol-type'. If the value is
4534 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4535 it is not integer, it should be a vector of subsidiary coding
4536 systems of which property `eol-type' has one of above values.
4540 /* Setup coding context CODING from information about CODING_SYSTEM.
4541 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4542 CODING_SYSTEM is invalid, signal an error. */
4545 setup_coding_system (coding_system
, coding
)
4546 Lisp_Object coding_system
;
4547 struct coding_system
*coding
;
4550 Lisp_Object eol_type
;
4551 Lisp_Object coding_type
;
4554 if (NILP (coding_system
))
4555 coding_system
= Qno_conversion
;
4557 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4559 attrs
= CODING_ID_ATTRS (coding
->id
);
4560 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4563 coding
->head_ascii
= -1;
4564 coding
->common_flags
4565 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4567 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4568 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4569 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4570 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4572 coding_type
= CODING_ATTR_TYPE (attrs
);
4573 if (EQ (coding_type
, Qundecided
))
4575 coding
->detector
= NULL
;
4576 coding
->decoder
= decode_coding_raw_text
;
4577 coding
->encoder
= encode_coding_raw_text
;
4578 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4580 else if (EQ (coding_type
, Qiso_2022
))
4583 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4585 /* Invoke graphic register 0 to plane 0. */
4586 CODING_ISO_INVOCATION (coding
, 0) = 0;
4587 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4588 CODING_ISO_INVOCATION (coding
, 1)
4589 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4590 /* Setup the initial status of designation. */
4591 for (i
= 0; i
< 4; i
++)
4592 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4593 /* Not single shifting initially. */
4594 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4595 /* Beginning of buffer should also be regarded as bol. */
4596 CODING_ISO_BOL (coding
) = 1;
4597 coding
->detector
= detect_coding_iso_2022
;
4598 coding
->decoder
= decode_coding_iso_2022
;
4599 coding
->encoder
= encode_coding_iso_2022
;
4600 if (flags
& CODING_ISO_FLAG_SAFE
)
4601 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4602 coding
->common_flags
4603 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4604 | CODING_REQUIRE_FLUSHING_MASK
);
4605 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4606 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4607 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4609 setup_iso_safe_charsets (attrs
);
4610 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4611 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4612 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4614 CODING_ISO_FLAGS (coding
) = flags
;
4616 else if (EQ (coding_type
, Qcharset
))
4618 coding
->detector
= detect_coding_charset
;
4619 coding
->decoder
= decode_coding_charset
;
4620 coding
->encoder
= encode_coding_charset
;
4621 coding
->common_flags
4622 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4624 else if (EQ (coding_type
, Qutf_8
))
4626 coding
->detector
= detect_coding_utf_8
;
4627 coding
->decoder
= decode_coding_utf_8
;
4628 coding
->encoder
= encode_coding_utf_8
;
4629 coding
->common_flags
4630 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4632 else if (EQ (coding_type
, Qutf_16
))
4634 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4635 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4636 : EQ (val
, Qt
) ? utf_16_with_bom
4637 : utf_16_without_bom
);
4638 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4639 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4640 : utf_16_little_endian
);
4641 CODING_UTF_16_SURROGATE (coding
) = 0;
4642 coding
->detector
= detect_coding_utf_16
;
4643 coding
->decoder
= decode_coding_utf_16
;
4644 coding
->encoder
= encode_coding_utf_16
;
4645 coding
->common_flags
4646 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4648 else if (EQ (coding_type
, Qccl
))
4650 coding
->detector
= detect_coding_ccl
;
4651 coding
->decoder
= decode_coding_ccl
;
4652 coding
->encoder
= encode_coding_ccl
;
4653 coding
->common_flags
4654 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4655 | CODING_REQUIRE_FLUSHING_MASK
);
4657 else if (EQ (coding_type
, Qemacs_mule
))
4659 coding
->detector
= detect_coding_emacs_mule
;
4660 coding
->decoder
= decode_coding_emacs_mule
;
4661 coding
->encoder
= encode_coding_emacs_mule
;
4662 coding
->common_flags
4663 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4664 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4665 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4667 Lisp_Object tail
, safe_charsets
;
4668 int max_charset_id
= 0;
4670 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4672 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4673 max_charset_id
= XFASTINT (XCAR (tail
));
4674 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4676 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4678 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4679 coding
->max_charset_id
= max_charset_id
;
4680 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4683 else if (EQ (coding_type
, Qshift_jis
))
4685 coding
->detector
= detect_coding_sjis
;
4686 coding
->decoder
= decode_coding_sjis
;
4687 coding
->encoder
= encode_coding_sjis
;
4688 coding
->common_flags
4689 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4691 else if (EQ (coding_type
, Qbig5
))
4693 coding
->detector
= detect_coding_big5
;
4694 coding
->decoder
= decode_coding_big5
;
4695 coding
->encoder
= encode_coding_big5
;
4696 coding
->common_flags
4697 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4699 else /* EQ (coding_type, Qraw_text) */
4701 coding
->detector
= NULL
;
4702 coding
->decoder
= decode_coding_raw_text
;
4703 coding
->encoder
= encode_coding_raw_text
;
4704 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4710 /* Return raw-text or one of its subsidiaries that has the same
4711 eol_type as CODING-SYSTEM. */
4714 raw_text_coding_system (coding_system
)
4715 Lisp_Object coding_system
;
4717 Lisp_Object spec
, attrs
;
4718 Lisp_Object eol_type
, raw_text_eol_type
;
4720 spec
= CODING_SYSTEM_SPEC (coding_system
);
4721 attrs
= AREF (spec
, 0);
4723 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4724 return coding_system
;
4726 eol_type
= AREF (spec
, 2);
4727 if (VECTORP (eol_type
))
4729 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4730 raw_text_eol_type
= AREF (spec
, 2);
4731 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4732 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4733 : AREF (raw_text_eol_type
, 2));
4737 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4738 does, return one of the subsidiary that has the same eol-spec as
4739 PARENT. Otherwise, return CODING_SYSTEM. */
4742 coding_inherit_eol_type (coding_system
, parent
)
4743 Lisp_Object coding_system
, parent
;
4745 Lisp_Object spec
, attrs
, eol_type
;
4747 spec
= CODING_SYSTEM_SPEC (coding_system
);
4748 attrs
= AREF (spec
, 0);
4749 eol_type
= AREF (spec
, 2);
4750 if (VECTORP (eol_type
))
4752 Lisp_Object parent_spec
;
4753 Lisp_Object parent_eol_type
;
4756 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4757 parent_eol_type
= AREF (parent_spec
, 2);
4758 if (EQ (parent_eol_type
, Qunix
))
4759 coding_system
= AREF (eol_type
, 0);
4760 else if (EQ (parent_eol_type
, Qdos
))
4761 coding_system
= AREF (eol_type
, 1);
4762 else if (EQ (parent_eol_type
, Qmac
))
4763 coding_system
= AREF (eol_type
, 2);
4765 return coding_system
;
4768 /* Emacs has a mechanism to automatically detect a coding system if it
4769 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4770 it's impossible to distinguish some coding systems accurately
4771 because they use the same range of codes. So, at first, coding
4772 systems are categorized into 7, those are:
4774 o coding-category-emacs-mule
4776 The category for a coding system which has the same code range
4777 as Emacs' internal format. Assigned the coding-system (Lisp
4778 symbol) `emacs-mule' by default.
4780 o coding-category-sjis
4782 The category for a coding system which has the same code range
4783 as SJIS. Assigned the coding-system (Lisp
4784 symbol) `japanese-shift-jis' by default.
4786 o coding-category-iso-7
4788 The category for a coding system which has the same code range
4789 as ISO2022 of 7-bit environment. This doesn't use any locking
4790 shift and single shift functions. This can encode/decode all
4791 charsets. Assigned the coding-system (Lisp symbol)
4792 `iso-2022-7bit' by default.
4794 o coding-category-iso-7-tight
4796 Same as coding-category-iso-7 except that this can
4797 encode/decode only the specified charsets.
4799 o coding-category-iso-8-1
4801 The category for a coding system which has the same code range
4802 as ISO2022 of 8-bit environment and graphic plane 1 used only
4803 for DIMENSION1 charset. This doesn't use any locking shift
4804 and single shift functions. Assigned the coding-system (Lisp
4805 symbol) `iso-latin-1' by default.
4807 o coding-category-iso-8-2
4809 The category for a coding system which has the same code range
4810 as ISO2022 of 8-bit environment and graphic plane 1 used only
4811 for DIMENSION2 charset. This doesn't use any locking shift
4812 and single shift functions. Assigned the coding-system (Lisp
4813 symbol) `japanese-iso-8bit' by default.
4815 o coding-category-iso-7-else
4817 The category for a coding system which has the same code range
4818 as ISO2022 of 7-bit environemnt but uses locking shift or
4819 single shift functions. Assigned the coding-system (Lisp
4820 symbol) `iso-2022-7bit-lock' by default.
4822 o coding-category-iso-8-else
4824 The category for a coding system which has the same code range
4825 as ISO2022 of 8-bit environemnt but uses locking shift or
4826 single shift functions. Assigned the coding-system (Lisp
4827 symbol) `iso-2022-8bit-ss2' by default.
4829 o coding-category-big5
4831 The category for a coding system which has the same code range
4832 as BIG5. Assigned the coding-system (Lisp symbol)
4833 `cn-big5' by default.
4835 o coding-category-utf-8
4837 The category for a coding system which has the same code range
4838 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4839 symbol) `utf-8' by default.
4841 o coding-category-utf-16-be
4843 The category for a coding system in which a text has an
4844 Unicode signature (cf. Unicode Standard) in the order of BIG
4845 endian at the head. Assigned the coding-system (Lisp symbol)
4846 `utf-16-be' by default.
4848 o coding-category-utf-16-le
4850 The category for a coding system in which a text has an
4851 Unicode signature (cf. Unicode Standard) in the order of
4852 LITTLE endian at the head. Assigned the coding-system (Lisp
4853 symbol) `utf-16-le' by default.
4855 o coding-category-ccl
4857 The category for a coding system of which encoder/decoder is
4858 written in CCL programs. The default value is nil, i.e., no
4859 coding system is assigned.
4861 o coding-category-binary
4863 The category for a coding system not categorized in any of the
4864 above. Assigned the coding-system (Lisp symbol)
4865 `no-conversion' by default.
4867 Each of them is a Lisp symbol and the value is an actual
4868 `coding-system's (this is also a Lisp symbol) assigned by a user.
4869 What Emacs does actually is to detect a category of coding system.
4870 Then, it uses a `coding-system' assigned to it. If Emacs can't
4871 decide only one possible category, it selects a category of the
4872 highest priority. Priorities of categories are also specified by a
4873 user in a Lisp variable `coding-category-list'.
4877 #define EOL_SEEN_NONE 0
4878 #define EOL_SEEN_LF 1
4879 #define EOL_SEEN_CR 2
4880 #define EOL_SEEN_CRLF 4
4882 /* Detect how end-of-line of a text of length CODING->src_bytes
4883 pointed by CODING->source is encoded. Return one of
4886 #define MAX_EOL_CHECK_COUNT 3
4889 detect_eol (coding
, source
, src_bytes
)
4890 struct coding_system
*coding
;
4891 unsigned char *source
;
4892 EMACS_INT src_bytes
;
4894 Lisp_Object attrs
, coding_type
;
4895 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4898 int eol_seen
= EOL_SEEN_NONE
;
4900 attrs
= CODING_ID_ATTRS (coding
->id
);
4901 coding_type
= CODING_ATTR_TYPE (attrs
);
4903 if (EQ (coding_type
, Qccl
))
4907 msb
= coding
->spec
.utf_16
.endian
== utf_16_little_endian
;
4910 while (src
+ 1 < src_end
)
4913 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4918 this_eol
= EOL_SEEN_LF
;
4919 else if (src
+ 3 >= src_end
4920 || src
[msb
+ 2] != 0
4921 || src
[lsb
+ 2] != '\n')
4922 this_eol
= EOL_SEEN_CR
;
4924 this_eol
= EOL_SEEN_CRLF
;
4926 if (eol_seen
== EOL_SEEN_NONE
)
4927 /* This is the first end-of-line. */
4928 eol_seen
= this_eol
;
4929 else if (eol_seen
!= this_eol
)
4931 /* The found type is different from what found before. */
4932 eol_seen
= EOL_SEEN_LF
;
4935 if (++total
== MAX_EOL_CHECK_COUNT
)
4943 while (src
< src_end
)
4946 if (c
== '\n' || c
== '\r')
4951 this_eol
= EOL_SEEN_LF
;
4952 else if (src
>= src_end
|| *src
!= '\n')
4953 this_eol
= EOL_SEEN_CR
;
4955 this_eol
= EOL_SEEN_CRLF
, src
++;
4957 if (eol_seen
== EOL_SEEN_NONE
)
4958 /* This is the first end-of-line. */
4959 eol_seen
= this_eol
;
4960 else if (eol_seen
!= this_eol
)
4962 /* The found type is different from what found before. */
4963 eol_seen
= EOL_SEEN_LF
;
4966 if (++total
== MAX_EOL_CHECK_COUNT
)
4976 adjust_coding_eol_type (coding
, eol_seen
)
4977 struct coding_system
*coding
;
4980 Lisp_Object eol_type
;
4982 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4983 if (eol_seen
& EOL_SEEN_LF
)
4984 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
4985 else if (eol_type
& EOL_SEEN_CRLF
)
4986 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
4987 else if (eol_type
& EOL_SEEN_CR
)
4988 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
4991 /* Detect how a text specified in CODING is encoded. If a coding
4992 system is detected, update fields of CODING by the detected coding
4996 detect_coding (coding
)
4997 struct coding_system
*coding
;
4999 unsigned char *src
, *src_end
;
5000 Lisp_Object attrs
, coding_type
;
5002 coding
->consumed
= coding
->consumed_char
= 0;
5003 coding
->produced
= coding
->produced_char
= 0;
5004 coding_set_source (coding
);
5006 src_end
= coding
->source
+ coding
->src_bytes
;
5008 /* If we have not yet decided the text encoding type, detect it
5010 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5012 int mask
= CATEGORY_MASK_ANY
;
5015 for (src
= coding
->source
; src
< src_end
; src
++)
5018 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5020 || c
== ISO_CODE_SO
)))
5023 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5025 if (coding
->head_ascii
< coding
->src_bytes
)
5029 for (i
= 0; i
< coding_category_raw_text
; i
++)
5031 enum coding_category category
= coding_priorities
[i
];
5032 struct coding_system
*this = coding_categories
+ category
;
5034 if (category
>= coding_category_raw_text
5035 || detected
& (1 << category
))
5040 /* No coding system of this category is defined. */
5041 mask
&= ~(1 << category
);
5045 detected
|= detected_mask
[category
];
5046 if ((*(this->detector
)) (coding
, &mask
))
5051 setup_coding_system (Qraw_text
, coding
);
5052 else if (mask
!= CATEGORY_MASK_ANY
)
5053 for (i
= 0; i
< coding_category_raw_text
; i
++)
5055 enum coding_category category
= coding_priorities
[i
];
5056 struct coding_system
*this = coding_categories
+ category
;
5058 if (mask
& (1 << category
))
5060 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5067 attrs
= CODING_ID_ATTRS (coding
->id
);
5068 coding_type
= CODING_ATTR_TYPE (attrs
);
5070 /* If we have not yet decided the EOL type, detect it now. But, the
5071 detection is impossible for a CCL based coding system, in which
5072 case, we detct the EOL type after decoding. */
5073 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5074 && ! EQ (coding_type
, Qccl
))
5076 int eol_seen
= detect_eol (coding
, coding
->source
, coding
->src_bytes
);
5078 if (eol_seen
!= EOL_SEEN_NONE
)
5079 adjust_coding_eol_type (coding
, eol_seen
);
5086 struct coding_system
*coding
;
5088 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5090 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5091 unsigned char *pend
= p
+ coding
->produced
;
5092 int eol_seen
= EOL_SEEN_NONE
;
5094 for (; p
< pend
; p
++)
5097 eol_seen
|= EOL_SEEN_LF
;
5098 else if (*p
== '\r')
5100 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5102 eol_seen
|= EOL_SEEN_CRLF
;
5106 eol_seen
|= EOL_SEEN_CR
;
5109 if (eol_seen
!= EOL_SEEN_NONE
)
5110 adjust_coding_eol_type (coding
, eol_seen
);
5113 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5115 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5116 unsigned char *pend
= p
+ coding
->produced
;
5118 for (; p
< pend
; p
++)
5122 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5124 unsigned char *p
, *pbeg
, *pend
;
5125 Lisp_Object undo_list
;
5127 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5128 coding
->dst_pos_byte
+ coding
->produced
);
5129 undo_list
= current_buffer
->undo_list
;
5130 current_buffer
->undo_list
= Qt
;
5131 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, Qnil
);
5132 current_buffer
->undo_list
= undo_list
;
5134 pend
= pbeg
+ coding
->produced
;
5136 for (p
= pend
- 1; p
>= pbeg
; p
--)
5139 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5142 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5143 coding
->produced
= pend
- pbeg
;
5144 insert_from_gap (coding
->produced_char
, coding
->produced
);
5149 translate_chars (coding
, table
)
5150 struct coding_system
*coding
;
5153 int *charbuf
= coding
->charbuf
;
5154 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5157 if (coding
->chars_at_source
)
5160 while (charbuf
< charbuf_end
)
5166 *charbuf
++ = translate_char (table
, c
);
5171 produce_chars (coding
)
5172 struct coding_system
*coding
;
5174 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5175 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5177 int produced_chars
= 0;
5179 if (! coding
->chars_at_source
)
5181 /* Characters are in coding->charbuf. */
5182 int *buf
= coding
->charbuf
;
5183 int *buf_end
= buf
+ coding
->charbuf_used
;
5184 unsigned char *adjusted_dst_end
;
5186 if (BUFFERP (coding
->src_object
)
5187 && EQ (coding
->src_object
, coding
->dst_object
))
5188 dst_end
= coding
->source
+ coding
->consumed
;
5189 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5191 while (buf
< buf_end
)
5195 if (dst
>= adjusted_dst_end
)
5197 dst
= alloc_destination (coding
,
5198 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5200 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5201 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5205 if (coding
->dst_multibyte
5206 || ! CHAR_BYTE8_P (c
))
5207 CHAR_STRING_ADVANCE (c
, dst
);
5209 *dst
++ = CHAR_TO_BYTE8 (c
);
5213 /* This is an annotation data. */
5219 unsigned char *src
= coding
->source
;
5220 unsigned char *src_end
= src
+ coding
->src_bytes
;
5221 Lisp_Object eol_type
;
5223 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5225 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5227 if (coding
->src_multibyte
)
5234 unsigned char *src_base
= src
;
5240 if (EQ (eol_type
, Qdos
))
5246 else if (EQ (eol_type
, Qmac
))
5251 coding
->consumed
= src
- coding
->source
;
5253 if (EQ (coding
->src_object
, coding
->dst_object
))
5257 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5259 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5260 coding_set_source (coding
);
5261 src
= coding
->source
+ coding
->consumed
;
5262 src_end
= coding
->source
+ coding
->src_bytes
;
5272 while (src
< src_end
)
5279 if (EQ (eol_type
, Qdos
))
5285 else if (EQ (eol_type
, Qmac
))
5288 if (dst
>= dst_end
- 1)
5290 coding
->consumed
= src
- coding
->source
;
5292 if (EQ (coding
->src_object
, coding
->dst_object
))
5294 if (dst
>= dst_end
- 1)
5296 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5298 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5299 coding_set_source (coding
);
5300 src
= coding
->source
+ coding
->consumed
;
5301 src_end
= coding
->source
+ coding
->src_bytes
;
5309 if (!EQ (coding
->src_object
, coding
->dst_object
))
5311 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5315 EMACS_INT offset
= src
- coding
->source
;
5317 dst
= alloc_destination (coding
, require
, dst
);
5318 coding_set_source (coding
);
5319 src
= coding
->source
+ offset
;
5320 src_end
= coding
->source
+ coding
->src_bytes
;
5323 produced_chars
= coding
->src_chars
;
5324 while (src
< src_end
)
5330 if (EQ (eol_type
, Qdos
))
5337 else if (EQ (eol_type
, Qmac
))
5343 coding
->consumed
= coding
->src_bytes
;
5344 coding
->consumed_char
= coding
->src_chars
;
5347 produced
= dst
- (coding
->destination
+ coding
->produced
);
5348 if (BUFFERP (coding
->dst_object
))
5349 insert_from_gap (produced_chars
, produced
);
5350 coding
->produced
+= produced
;
5351 coding
->produced_char
+= produced_chars
;
5352 return produced_chars
;
5355 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5357 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5361 produce_composition (coding
, charbuf
)
5362 struct coding_system
*coding
;
5368 enum composition_method method
;
5370 Lisp_Object components
;
5372 buffer
= coding
->dst_object
;
5374 pos
= coding
->dst_pos
+ charbuf
[1];
5375 method
= (enum composition_method
) (charbuf
[3]);
5376 cmp_len
= charbuf
[4];
5378 if (method
== COMPOSITION_RELATIVE
)
5382 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5387 for (i
= 0; i
< len
; i
++)
5388 args
[i
] = make_number (charbuf
[i
]);
5389 components
= (method
== COMPOSITION_WITH_ALTCHARS
5390 ? Fstring (len
, args
) : Fvector (len
, args
));
5392 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
5396 save_composition_data (buf
, buf_end
, prop
)
5400 enum composition_method method
= COMPOSITION_METHOD (prop
);
5401 int cmp_len
= COMPOSITION_LENGTH (prop
);
5403 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5406 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5410 if (method
== COMPOSITION_RELATIVE
)
5414 Lisp_Object components
;
5417 components
= COMPOSITION_COMPONENTS (prop
);
5418 if (VECTORP (components
))
5420 len
= XVECTOR (components
)->size
;
5421 for (i
= 0; i
< len
; i
++)
5422 buf
[4 + i
] = XINT (AREF (components
, i
));
5424 else if (STRINGP (components
))
5428 len
= XSTRING (components
)->size
;
5431 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5433 else if (INTEGERP (components
))
5436 buf
[4] = XINT (components
);
5438 else if (CONSP (components
))
5440 for (len
= 0; CONSP (components
);
5441 len
++, components
= XCDR (components
))
5442 buf
[4 + len
] = XINT (XCAR (components
));
5448 return (buf
+ buf
[0]);
5451 #define CHARBUF_SIZE 0x4000
5453 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5455 int size = CHARBUF_SIZE;; \
5457 coding->charbuf = NULL; \
5458 while (size > 1024) \
5460 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5461 if (coding->charbuf) \
5465 if (! coding->charbuf) \
5467 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5468 return coding->result; \
5470 coding->charbuf_size = size; \
5475 produce_annotation (coding
)
5476 struct coding_system
*coding
;
5478 int *charbuf
= coding
->charbuf
;
5479 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5481 while (charbuf
< charbuf_end
)
5487 int len
= -*charbuf
;
5490 case CODING_ANNOTATE_COMPOSITION_MASK
:
5491 produce_composition (coding
, charbuf
);
5501 /* Decode the data at CODING->src_object into CODING->dst_object.
5502 CODING->src_object is a buffer, a string, or nil.
5503 CODING->dst_object is a buffer.
5505 If CODING->src_object is a buffer, it must be the current buffer.
5506 In this case, if CODING->src_pos is positive, it is a position of
5507 the source text in the buffer, otherwise, the source text is in the
5508 gap area of the buffer, and CODING->src_pos specifies the offset of
5509 the text from GPT (which must be the same as PT). If this is the
5510 same buffer as CODING->dst_object, CODING->src_pos must be
5513 If CODING->src_object is a string, CODING->src_pos in an index to
5516 If CODING->src_object is nil, CODING->source must already point to
5517 the non-relocatable memory area. In this case, CODING->src_pos is
5518 an offset from CODING->source.
5520 The decoded data is inserted at the current point of the buffer
5525 decode_coding (coding
)
5526 struct coding_system
*coding
;
5530 if (BUFFERP (coding
->src_object
)
5531 && coding
->src_pos
> 0
5532 && coding
->src_pos
< GPT
5533 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5534 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5536 if (BUFFERP (coding
->dst_object
))
5538 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5539 set_buffer_internal (XBUFFER (coding
->dst_object
));
5541 move_gap_both (PT
, PT_BYTE
);
5544 coding
->consumed
= coding
->consumed_char
= 0;
5545 coding
->produced
= coding
->produced_char
= 0;
5546 coding
->chars_at_source
= 0;
5547 coding
->result
= CODING_RESULT_SUCCESS
;
5550 ALLOC_CONVERSION_WORK_AREA (coding
);
5552 attrs
= CODING_ID_ATTRS (coding
->id
);
5556 coding_set_source (coding
);
5557 coding
->annotated
= 0;
5558 (*(coding
->decoder
)) (coding
);
5559 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5560 translate_chars (CODING_ATTR_DECODE_TBL (attrs
), coding
);
5561 coding_set_destination (coding
);
5562 produce_chars (coding
);
5563 if (coding
->annotated
)
5564 produce_annotation (coding
);
5566 while (coding
->consumed
< coding
->src_bytes
5567 && ! coding
->result
);
5569 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5570 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5571 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5572 decode_eol (coding
);
5574 coding
->carryover_bytes
= 0;
5575 if (coding
->consumed
< coding
->src_bytes
)
5577 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5580 coding_set_source (coding
);
5581 coding_set_destination (coding
);
5582 src
= coding
->source
+ coding
->consumed
;
5584 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5586 /* Flush out unprocessed data as binary chars. We are sure
5587 that the number of data is less than the size of
5589 int *charbuf
= coding
->charbuf
;
5591 while (nbytes
-- > 0)
5594 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5596 produce_chars (coding
);
5600 /* Record unprocessed bytes in coding->carryover. We are
5601 sure that the number of data is less than the size of
5602 coding->carryover. */
5603 unsigned char *p
= coding
->carryover
;
5605 coding
->carryover_bytes
= nbytes
;
5606 while (nbytes
-- > 0)
5609 coding
->consumed
= coding
->src_bytes
;
5612 return coding
->result
;
5616 consume_chars (coding
)
5617 struct coding_system
*coding
;
5619 int *buf
= coding
->charbuf
;
5620 /* -1 is to compensate for CRLF. */
5621 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5622 unsigned char *src
= coding
->source
+ coding
->consumed
;
5623 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5624 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5625 int multibytep
= coding
->src_multibyte
;
5626 Lisp_Object eol_type
;
5628 int start
, end
, stop
;
5629 Lisp_Object object
, prop
;
5631 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5632 if (VECTORP (eol_type
))
5635 object
= coding
->src_object
;
5637 /* Note: composition handling is not yet implemented. */
5638 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5640 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5641 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5644 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5645 && end
<= end_pos
)))
5650 while (buf
< buf_end
)
5658 p
= save_composition_data (buf
, buf_end
, prop
);
5662 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5672 c
= STRING_CHAR_ADVANCE (src
);
5673 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5675 if (! EQ (eol_type
, Qunix
))
5679 if (EQ (eol_type
, Qdos
))
5689 coding
->consumed
= src
- coding
->source
;
5690 coding
->consumed_char
= pos
- coding
->src_pos
;
5691 coding
->charbuf_used
= buf
- coding
->charbuf
;
5692 coding
->chars_at_source
= 0;
5696 /* Encode the text at CODING->src_object into CODING->dst_object.
5697 CODING->src_object is a buffer or a string.
5698 CODING->dst_object is a buffer or nil.
5700 If CODING->src_object is a buffer, it must be the current buffer.
5701 In this case, if CODING->src_pos is positive, it is a position of
5702 the source text in the buffer, otherwise. the source text is in the
5703 gap area of the buffer, and coding->src_pos specifies the offset of
5704 the text from GPT (which must be the same as PT). If this is the
5705 same buffer as CODING->dst_object, CODING->src_pos must be
5706 negative and CODING should not have `pre-write-conversion'.
5708 If CODING->src_object is a string, CODING should not have
5709 `pre-write-conversion'.
5711 If CODING->dst_object is a buffer, the encoded data is inserted at
5712 the current point of that buffer.
5714 If CODING->dst_object is nil, the encoded data is placed at the
5715 memory area specified by CODING->destination. */
5718 encode_coding (coding
)
5719 struct coding_system
*coding
;
5723 attrs
= CODING_ID_ATTRS (coding
->id
);
5725 if (BUFFERP (coding
->dst_object
))
5727 set_buffer_internal (XBUFFER (coding
->dst_object
));
5728 coding
->dst_multibyte
5729 = ! NILP (current_buffer
->enable_multibyte_characters
);
5732 coding
->consumed
= coding
->consumed_char
= 0;
5733 coding
->produced
= coding
->produced_char
= 0;
5734 coding
->result
= CODING_RESULT_SUCCESS
;
5737 ALLOC_CONVERSION_WORK_AREA (coding
);
5740 coding_set_source (coding
);
5741 consume_chars (coding
);
5743 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
5744 translate_chars (CODING_ATTR_ENCODE_TBL (attrs
), coding
);
5746 coding_set_destination (coding
);
5747 (*(coding
->encoder
)) (coding
);
5748 } while (coding
->consumed_char
< coding
->src_chars
);
5750 if (BUFFERP (coding
->dst_object
))
5751 insert_from_gap (coding
->produced_char
, coding
->produced
);
5753 return (coding
->result
);
5758 /* List of currently used working buffer. */
5759 Lisp_Object Vcode_conversion_work_buf_list
;
5761 /* A working buffer used by the top level conversion. */
5762 Lisp_Object Vcode_conversion_reused_work_buf
;
5765 /* Return a working buffer that can be freely used by the following
5766 code conversion. MULTIBYTEP specifies the multibyteness of the
5770 make_conversion_work_buffer (multibytep
)
5773 struct buffer
*current
= current_buffer
;
5776 if (NILP (Vcode_conversion_work_buf_list
))
5778 if (NILP (Vcode_conversion_reused_work_buf
))
5779 Vcode_conversion_reused_work_buf
5780 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5781 Vcode_conversion_work_buf_list
5782 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
5786 int depth
= Flength (Vcode_conversion_work_buf_list
);
5789 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5790 Vcode_conversion_work_buf_list
5791 = Fcons (Fget_buffer_create (build_string (str
)),
5792 Vcode_conversion_work_buf_list
);
5795 buf
= XCAR (Vcode_conversion_work_buf_list
);
5796 set_buffer_internal (XBUFFER (buf
));
5797 current_buffer
->undo_list
= Qt
;
5799 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
5800 set_buffer_internal (current
);
5804 static struct coding_system
*saved_coding
;
5807 code_conversion_restore (info
)
5810 int depth
= Flength (Vcode_conversion_work_buf_list
);
5815 buf
= XCAR (Vcode_conversion_work_buf_list
);
5816 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5817 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
5821 if (saved_coding
->dst_object
== Qt
5822 && saved_coding
->destination
)
5823 xfree (saved_coding
->destination
);
5825 return save_excursion_restore (info
);
5830 decode_coding_gap (coding
, chars
, bytes
)
5831 struct coding_system
*coding
;
5832 EMACS_INT chars
, bytes
;
5834 int count
= specpdl_ptr
- specpdl
;
5836 saved_coding
= coding
;
5837 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5839 coding
->src_object
= Fcurrent_buffer ();
5840 coding
->src_chars
= chars
;
5841 coding
->src_bytes
= bytes
;
5842 coding
->src_pos
= -chars
;
5843 coding
->src_pos_byte
= -bytes
;
5844 coding
->src_multibyte
= chars
< bytes
;
5845 coding
->dst_object
= coding
->src_object
;
5846 coding
->dst_pos
= PT
;
5847 coding
->dst_pos_byte
= PT_BYTE
;
5848 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
5850 if (CODING_REQUIRE_DETECTION (coding
))
5851 detect_coding (coding
);
5853 decode_coding (coding
);
5855 unbind_to (count
, Qnil
);
5856 return coding
->result
;
5860 encode_coding_gap (coding
, chars
, bytes
)
5861 struct coding_system
*coding
;
5862 EMACS_INT chars
, bytes
;
5864 int count
= specpdl_ptr
- specpdl
;
5867 saved_coding
= coding
;
5868 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5870 buffer
= Fcurrent_buffer ();
5871 coding
->src_object
= buffer
;
5872 coding
->src_chars
= chars
;
5873 coding
->src_bytes
= bytes
;
5874 coding
->src_pos
= -chars
;
5875 coding
->src_pos_byte
= -bytes
;
5876 coding
->src_multibyte
= chars
< bytes
;
5877 coding
->dst_object
= coding
->src_object
;
5878 coding
->dst_pos
= PT
;
5879 coding
->dst_pos_byte
= PT_BYTE
;
5881 encode_coding (coding
);
5883 unbind_to (count
, Qnil
);
5884 return coding
->result
;
5888 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5889 SRC_OBJECT into DST_OBJECT by coding context CODING.
5891 SRC_OBJECT is a buffer, a string, or Qnil.
5893 If it is a buffer, the text is at point of the buffer. FROM and TO
5894 are positions in the buffer.
5896 If it is a string, the text is at the beginning of the string.
5897 FROM and TO are indices to the string.
5899 If it is nil, the text is at coding->source. FROM and TO are
5900 indices to coding->source.
5902 DST_OBJECT is a buffer, Qt, or Qnil.
5904 If it is a buffer, the decoded text is inserted at point of the
5905 buffer. If the buffer is the same as SRC_OBJECT, the source text
5908 If it is Qt, a string is made from the decoded text, and
5909 set in CODING->dst_object.
5911 If it is Qnil, the decoded text is stored at CODING->destination.
5912 The called must allocate CODING->dst_bytes bytes at
5913 CODING->destination by xmalloc. If the decoded text is longer than
5914 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5918 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5920 struct coding_system
*coding
;
5921 Lisp_Object src_object
;
5922 EMACS_INT from
, from_byte
, to
, to_byte
;
5923 Lisp_Object dst_object
;
5925 int count
= specpdl_ptr
- specpdl
;
5926 unsigned char *destination
;
5927 EMACS_INT dst_bytes
;
5928 EMACS_INT chars
= to
- from
;
5929 EMACS_INT bytes
= to_byte
- from_byte
;
5932 saved_coding
= coding
;
5933 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5935 if (NILP (dst_object
))
5937 destination
= coding
->destination
;
5938 dst_bytes
= coding
->dst_bytes
;
5941 coding
->src_object
= src_object
;
5942 coding
->src_chars
= chars
;
5943 coding
->src_bytes
= bytes
;
5944 coding
->src_multibyte
= chars
< bytes
;
5946 if (STRINGP (src_object
))
5948 coding
->src_pos
= from
;
5949 coding
->src_pos_byte
= from_byte
;
5951 else if (BUFFERP (src_object
))
5953 set_buffer_internal (XBUFFER (src_object
));
5955 move_gap_both (from
, from_byte
);
5956 if (EQ (src_object
, dst_object
))
5958 TEMP_SET_PT_BOTH (from
, from_byte
);
5959 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5960 coding
->src_pos
= -chars
;
5961 coding
->src_pos_byte
= -bytes
;
5965 coding
->src_pos
= from
;
5966 coding
->src_pos_byte
= from_byte
;
5970 if (CODING_REQUIRE_DETECTION (coding
))
5971 detect_coding (coding
);
5972 attrs
= CODING_ID_ATTRS (coding
->id
);
5974 if (! NILP (CODING_ATTR_POST_READ (attrs
))
5975 || EQ (dst_object
, Qt
))
5977 coding
->dst_object
= make_conversion_work_buffer (1);
5978 coding
->dst_pos
= BEG
;
5979 coding
->dst_pos_byte
= BEG_BYTE
;
5980 coding
->dst_multibyte
= 1;
5982 else if (BUFFERP (dst_object
))
5984 coding
->dst_object
= dst_object
;
5985 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5986 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5987 coding
->dst_multibyte
5988 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
5992 coding
->dst_object
= Qnil
;
5993 coding
->dst_multibyte
= 1;
5996 decode_coding (coding
);
5998 if (BUFFERP (coding
->dst_object
))
5999 set_buffer_internal (XBUFFER (coding
->dst_object
));
6001 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6003 struct gcpro gcpro1
, gcpro2
;
6004 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6007 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6008 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6009 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6010 make_number (coding
->produced_char
));
6013 coding
->produced_char
+= Z
- prev_Z
;
6014 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6017 if (EQ (dst_object
, Qt
))
6019 coding
->dst_object
= Fbuffer_string ();
6021 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6023 set_buffer_internal (XBUFFER (coding
->dst_object
));
6024 if (dst_bytes
< coding
->produced
)
6027 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6030 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6031 unbind_to (count
, Qnil
);
6034 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6035 move_gap_both (BEGV
, BEGV_BYTE
);
6036 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6037 coding
->destination
= destination
;
6041 unbind_to (count
, Qnil
);
6046 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6048 struct coding_system
*coding
;
6049 Lisp_Object src_object
;
6050 EMACS_INT from
, from_byte
, to
, to_byte
;
6051 Lisp_Object dst_object
;
6053 int count
= specpdl_ptr
- specpdl
;
6054 EMACS_INT chars
= to
- from
;
6055 EMACS_INT bytes
= to_byte
- from_byte
;
6058 saved_coding
= coding
;
6059 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
6061 coding
->src_object
= src_object
;
6062 coding
->src_chars
= chars
;
6063 coding
->src_bytes
= bytes
;
6064 coding
->src_multibyte
= chars
< bytes
;
6066 attrs
= CODING_ID_ATTRS (coding
->id
);
6068 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6070 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
6071 set_buffer_internal (XBUFFER (coding
->src_object
));
6072 if (STRINGP (src_object
))
6073 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6074 else if (BUFFERP (src_object
))
6075 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6077 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6079 if (EQ (src_object
, dst_object
))
6081 set_buffer_internal (XBUFFER (src_object
));
6082 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6083 set_buffer_internal (XBUFFER (coding
->src_object
));
6086 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6087 make_number (BEG
), make_number (Z
));
6088 coding
->src_object
= Fcurrent_buffer ();
6090 move_gap_both (BEG
, BEG_BYTE
);
6091 coding
->src_chars
= Z
- BEG
;
6092 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6093 coding
->src_pos
= BEG
;
6094 coding
->src_pos_byte
= BEG_BYTE
;
6095 coding
->src_multibyte
= Z
< Z_BYTE
;
6097 else if (STRINGP (src_object
))
6099 coding
->src_pos
= from
;
6100 coding
->src_pos_byte
= from_byte
;
6102 else if (BUFFERP (src_object
))
6104 set_buffer_internal (XBUFFER (src_object
));
6106 move_gap_both (from
, from_byte
);
6107 if (EQ (src_object
, dst_object
))
6109 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6110 coding
->src_pos
= -chars
;
6111 coding
->src_pos_byte
= -bytes
;
6115 coding
->src_pos
= from
;
6116 coding
->src_pos_byte
= from_byte
;
6120 if (BUFFERP (dst_object
))
6122 coding
->dst_object
= dst_object
;
6123 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6124 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6125 coding
->dst_multibyte
6126 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6128 else if (EQ (dst_object
, Qt
))
6130 coding
->dst_object
= Qnil
;
6131 coding
->dst_bytes
= coding
->src_chars
;
6132 if (coding
->dst_bytes
== 0)
6133 coding
->dst_bytes
= 1;
6134 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6135 coding
->dst_multibyte
= 0;
6139 coding
->dst_object
= Qnil
;
6140 coding
->dst_multibyte
= 0;
6143 encode_coding (coding
);
6145 if (EQ (dst_object
, Qt
))
6147 if (BUFFERP (coding
->dst_object
))
6148 coding
->dst_object
= Fbuffer_string ();
6152 = make_unibyte_string ((char *) coding
->destination
,
6154 xfree (coding
->destination
);
6158 unbind_to (count
, Qnil
);
6163 preferred_coding_system ()
6165 int id
= coding_categories
[coding_priorities
[0]].id
;
6167 return CODING_ID_NAME (id
);
6172 /*** 8. Emacs Lisp library functions ***/
6174 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6175 doc
: /* Return t if OBJECT is nil or a coding-system.
6176 See the documentation of `define-coding-system' for information
6177 about coding-system objects. */)
6181 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6184 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6185 Sread_non_nil_coding_system
, 1, 1, 0,
6186 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6193 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6194 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6196 while (XSTRING (val
)->size
== 0);
6197 return (Fintern (val
, Qnil
));
6200 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6201 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6202 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6203 (prompt
, default_coding_system
)
6204 Lisp_Object prompt
, default_coding_system
;
6207 if (SYMBOLP (default_coding_system
))
6208 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6209 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6210 Qt
, Qnil
, Qcoding_system_history
,
6211 default_coding_system
, Qnil
);
6212 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6215 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6217 doc
: /* Check validity of CODING-SYSTEM.
6218 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6219 It is valid if it is a symbol with a non-nil `coding-system' property.
6220 The value of property should be a vector of length 5. */)
6222 Lisp_Object coding_system
;
6224 CHECK_SYMBOL (coding_system
);
6225 if (!NILP (Fcoding_system_p (coding_system
)))
6226 return coding_system
;
6228 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6233 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6235 int src_bytes
, highest
;
6237 Lisp_Object coding_system
;
6239 unsigned char *src_end
= src
+ src_bytes
;
6240 int mask
= CATEGORY_MASK_ANY
;
6243 Lisp_Object attrs
, eol_type
;
6245 struct coding_system coding
;
6247 if (NILP (coding_system
))
6248 coding_system
= Qundecided
;
6249 setup_coding_system (coding_system
, &coding
);
6250 attrs
= CODING_ID_ATTRS (coding
.id
);
6251 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6253 coding
.source
= src
;
6254 coding
.src_bytes
= src_bytes
;
6255 coding
.src_multibyte
= multibytep
;
6256 coding
.consumed
= 0;
6258 if (XINT (CODING_ATTR_CATEGORY (attrs
)) != coding_category_undecided
)
6260 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6264 coding_system
= Qnil
;
6265 for (; src
< src_end
; src
++)
6268 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
6270 || c
== ISO_CODE_SO
)))
6273 coding
.head_ascii
= src
- coding
.source
;
6276 for (i
= 0; i
< coding_category_raw_text
; i
++)
6278 enum coding_category category
= coding_priorities
[i
];
6279 struct coding_system
*this = coding_categories
+ category
;
6281 if (category
>= coding_category_raw_text
6282 || detected
& (1 << category
))
6287 /* No coding system of this category is defined. */
6288 mask
&= ~(1 << category
);
6292 detected
|= detected_mask
[category
];
6293 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6296 mask
&= detected_mask
[category
];
6304 val
= Fcons (make_number (coding_category_raw_text
), Qnil
);
6305 else if (mask
== CATEGORY_MASK_ANY
)
6306 val
= Fcons (make_number (coding_category_undecided
), Qnil
);
6309 for (i
= 0; i
< coding_category_raw_text
; i
++)
6310 if (mask
& (1 << coding_priorities
[i
]))
6312 val
= Fcons (make_number (coding_priorities
[i
]), Qnil
);
6319 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6320 if (mask
& (1 << coding_priorities
[i
]))
6321 val
= Fcons (make_number (coding_priorities
[i
]), val
);
6325 int one_byte_eol
= -1, two_byte_eol
= -1;
6328 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6330 struct coding_system
*this
6331 = (NILP (coding_system
) ? coding_categories
+ XINT (XCAR (tail
))
6335 attrs
= CODING_ID_ATTRS (this->id
);
6336 eol_type
= CODING_ID_EOL_TYPE (this->id
);
6337 XSETCAR (tail
, CODING_ID_NAME (this->id
));
6338 if (VECTORP (eol_type
))
6340 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_16
))
6342 if (two_byte_eol
< 0)
6343 two_byte_eol
= detect_eol (this, coding
.source
, src_bytes
);
6344 this_eol
= two_byte_eol
;
6348 if (one_byte_eol
< 0)
6349 one_byte_eol
=detect_eol (this, coding
.source
, src_bytes
);
6350 this_eol
= one_byte_eol
;
6352 if (this_eol
== EOL_SEEN_LF
)
6353 XSETCAR (tail
, AREF (eol_type
, 0));
6354 else if (this_eol
== EOL_SEEN_CRLF
)
6355 XSETCAR (tail
, AREF (eol_type
, 1));
6356 else if (this_eol
== EOL_SEEN_CR
)
6357 XSETCAR (tail
, AREF (eol_type
, 2));
6362 return (highest
? XCAR (val
) : val
);
6366 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6368 doc
: /* Detect coding system of the text in the region between START and END.
6369 Return a list of possible coding systems ordered by priority.
6371 If only ASCII characters are found, it returns a list of single element
6372 `undecided' or its subsidiary coding system according to a detected
6375 If optional argument HIGHEST is non-nil, return the coding system of
6376 highest priority. */)
6377 (start
, end
, highest
)
6378 Lisp_Object start
, end
, highest
;
6381 int from_byte
, to_byte
;
6383 CHECK_NUMBER_COERCE_MARKER (start
);
6384 CHECK_NUMBER_COERCE_MARKER (end
);
6386 validate_region (&start
, &end
);
6387 from
= XINT (start
), to
= XINT (end
);
6388 from_byte
= CHAR_TO_BYTE (from
);
6389 to_byte
= CHAR_TO_BYTE (to
);
6391 if (from
< GPT
&& to
>= GPT
)
6392 move_gap_both (to
, to_byte
);
6394 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6395 to_byte
- from_byte
,
6397 !NILP (current_buffer
6398 ->enable_multibyte_characters
),
6402 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6404 doc
: /* Detect coding system of the text in STRING.
6405 Return a list of possible coding systems ordered by priority.
6407 If only ASCII characters are found, it returns a list of single element
6408 `undecided' or its subsidiary coding system according to a detected
6411 If optional argument HIGHEST is non-nil, return the coding system of
6412 highest priority. */)
6414 Lisp_Object string
, highest
;
6416 CHECK_STRING (string
);
6418 return detect_coding_system (XSTRING (string
)->data
,
6419 STRING_BYTES (XSTRING (string
)),
6421 STRING_MULTIBYTE (string
),
6427 char_encodable_p (c
, attrs
)
6432 struct charset
*charset
;
6434 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6435 CONSP (tail
); tail
= XCDR (tail
))
6437 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6438 if (CHAR_CHARSET_P (c
, charset
))
6441 return (! NILP (tail
));
6445 /* Return a list of coding systems that safely encode the text between
6446 START and END. If EXCLUDE is non-nil, it is a list of coding
6447 systems not to check. The returned list doesn't contain any such
6448 coding systems. In any case, If the text contains only ASCII or is
6449 unibyte, return t. */
6451 DEFUN ("find-coding-systems-region-internal",
6452 Ffind_coding_systems_region_internal
,
6453 Sfind_coding_systems_region_internal
, 2, 3, 0,
6454 doc
: /* Internal use only. */)
6455 (start
, end
, exclude
)
6456 Lisp_Object start
, end
, exclude
;
6458 Lisp_Object coding_attrs_list
, safe_codings
;
6459 EMACS_INT start_byte
, end_byte
;
6460 unsigned char *p
, *pbeg
, *pend
;
6462 Lisp_Object tail
, elt
;
6464 if (STRINGP (start
))
6466 if (!STRING_MULTIBYTE (start
)
6467 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6470 end_byte
= STRING_BYTES (XSTRING (start
));
6474 CHECK_NUMBER_COERCE_MARKER (start
);
6475 CHECK_NUMBER_COERCE_MARKER (end
);
6476 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6477 args_out_of_range (start
, end
);
6478 if (NILP (current_buffer
->enable_multibyte_characters
))
6480 start_byte
= CHAR_TO_BYTE (XINT (start
));
6481 end_byte
= CHAR_TO_BYTE (XINT (end
));
6482 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6485 if (start
< GPT
&& end
> GPT
)
6487 if ((GPT
- start
) < (end
- GPT
))
6488 move_gap_both (start
, start_byte
);
6490 move_gap_both (end
, end_byte
);
6494 coding_attrs_list
= Qnil
;
6495 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6497 || NILP (Fmemq (XCAR (tail
), exclude
)))
6501 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
6502 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6503 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6504 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6507 if (STRINGP (start
))
6508 p
= pbeg
= XSTRING (start
)->data
;
6510 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6511 pend
= p
+ (end_byte
- start_byte
);
6513 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6514 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6518 if (ASCII_BYTE_P (*p
))
6522 c
= STRING_CHAR_ADVANCE (p
);
6524 charset_map_loaded
= 0;
6525 for (tail
= coding_attrs_list
; CONSP (tail
);)
6530 else if (char_encodable_p (c
, elt
))
6532 else if (CONSP (XCDR (tail
)))
6534 XSETCAR (tail
, XCAR (XCDR (tail
)));
6535 XSETCDR (tail
, XCDR (XCDR (tail
)));
6539 XSETCAR (tail
, Qnil
);
6543 if (charset_map_loaded
)
6545 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6547 if (STRINGP (start
))
6548 pbeg
= XSTRING (start
)->data
;
6550 pbeg
= BYTE_POS_ADDR (start_byte
);
6551 p
= pbeg
+ p_offset
;
6552 pend
= pbeg
+ pend_offset
;
6557 safe_codings
= Qnil
;
6558 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6559 if (! NILP (XCAR (tail
)))
6560 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6562 return safe_codings
;
6566 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6567 Scheck_coding_systems_region
, 3, 3, 0,
6568 doc
: /* Check if the region is encodable by coding systems.
6570 START and END are buffer positions specifying the region.
6571 CODING-SYSTEM-LIST is a list of coding systems to check.
6573 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6574 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6575 whole region, POS0, POS1, ... are buffer positions where non-encodable
6576 characters are found.
6578 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6581 START may be a string. In that case, check if the string is
6582 encodable, and the value contains indices to the string instead of
6583 buffer positions. END is ignored. */)
6584 (start
, end
, coding_system_list
)
6585 Lisp_Object start
, end
, coding_system_list
;
6588 EMACS_INT start_byte
, end_byte
;
6590 unsigned char *p
, *pbeg
, *pend
;
6592 Lisp_Object tail
, elt
;
6594 if (STRINGP (start
))
6596 if (!STRING_MULTIBYTE (start
)
6597 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6600 end_byte
= STRING_BYTES (XSTRING (start
));
6605 CHECK_NUMBER_COERCE_MARKER (start
);
6606 CHECK_NUMBER_COERCE_MARKER (end
);
6607 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6608 args_out_of_range (start
, end
);
6609 if (NILP (current_buffer
->enable_multibyte_characters
))
6611 start_byte
= CHAR_TO_BYTE (XINT (start
));
6612 end_byte
= CHAR_TO_BYTE (XINT (end
));
6613 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6616 if (start
< GPT
&& end
> GPT
)
6618 if ((GPT
- start
) < (end
- GPT
))
6619 move_gap_both (start
, start_byte
);
6621 move_gap_both (end
, end_byte
);
6627 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6630 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6635 if (STRINGP (start
))
6636 p
= pbeg
= XSTRING (start
)->data
;
6638 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6639 pend
= p
+ (end_byte
- start_byte
);
6641 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6642 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6646 if (ASCII_BYTE_P (*p
))
6650 c
= STRING_CHAR_ADVANCE (p
);
6652 charset_map_loaded
= 0;
6653 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6655 elt
= XCDR (XCAR (tail
));
6656 if (! char_encodable_p (c
, XCAR (elt
)))
6657 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
6659 if (charset_map_loaded
)
6661 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6663 if (STRINGP (start
))
6664 pbeg
= XSTRING (start
)->data
;
6666 pbeg
= BYTE_POS_ADDR (start_byte
);
6667 p
= pbeg
+ p_offset
;
6668 pend
= pbeg
+ pend_offset
;
6676 for (; CONSP (tail
); tail
= XCDR (tail
))
6679 if (CONSP (XCDR (XCDR (elt
))))
6680 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
6690 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6691 Lisp_Object start
, end
, coding_system
, dst_object
;
6692 int encodep
, norecord
;
6694 struct coding_system coding
;
6695 EMACS_INT from
, from_byte
, to
, to_byte
;
6696 Lisp_Object src_object
;
6698 CHECK_NUMBER_COERCE_MARKER (start
);
6699 CHECK_NUMBER_COERCE_MARKER (end
);
6700 if (NILP (coding_system
))
6701 coding_system
= Qno_conversion
;
6703 CHECK_CODING_SYSTEM (coding_system
);
6704 src_object
= Fcurrent_buffer ();
6705 if (NILP (dst_object
))
6706 dst_object
= src_object
;
6707 else if (! EQ (dst_object
, Qt
))
6708 CHECK_BUFFER (dst_object
);
6710 validate_region (&start
, &end
);
6711 from
= XFASTINT (start
);
6712 from_byte
= CHAR_TO_BYTE (from
);
6713 to
= XFASTINT (end
);
6714 to_byte
= CHAR_TO_BYTE (to
);
6716 setup_coding_system (coding_system
, &coding
);
6717 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6720 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6723 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6726 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6728 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6729 error ("Code conversion error: %d", coding
.result
);
6731 return (BUFFERP (dst_object
)
6732 ? make_number (coding
.produced_char
)
6733 : coding
.dst_object
);
6737 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6738 3, 4, "r\nzCoding system: ",
6739 doc
: /* Decode the current region from the specified coding system.
6740 When called from a program, takes four arguments:
6741 START, END, CODING-SYSTEM, and DESTINATION.
6742 START and END are buffer positions.
6744 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6745 If nil, the region between START and END is replace by the decoded text.
6746 If buffer, the decoded text is inserted in the buffer.
6747 If t, the decoded text is returned.
6749 This function sets `last-coding-system-used' to the precise coding system
6750 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6751 not fully specified.)
6752 It returns the length of the decoded text. */)
6753 (start
, end
, coding_system
, destination
)
6754 Lisp_Object start
, end
, coding_system
, destination
;
6756 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
6759 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6760 3, 4, "r\nzCoding system: ",
6761 doc
: /* Encode the current region by specified coding system.
6762 When called from a program, takes three arguments:
6763 START, END, and CODING-SYSTEM. START and END are buffer positions.
6765 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6766 If nil, the region between START and END is replace by the encoded text.
6767 If buffer, the encoded text is inserted in the buffer.
6768 If t, the encoded text is returned.
6770 This function sets `last-coding-system-used' to the precise coding system
6771 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6772 not fully specified.)
6773 It returns the length of the encoded text. */)
6774 (start
, end
, coding_system
, destination
)
6775 Lisp_Object start
, end
, coding_system
, destination
;
6777 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
6781 code_convert_string (string
, coding_system
, dst_object
,
6782 encodep
, nocopy
, norecord
)
6783 Lisp_Object string
, coding_system
, dst_object
;
6784 int encodep
, nocopy
, norecord
;
6786 struct coding_system coding
;
6787 EMACS_INT chars
, bytes
;
6789 CHECK_STRING (string
);
6790 if (NILP (coding_system
))
6793 Vlast_coding_system_used
= Qno_conversion
;
6794 if (NILP (dst_object
))
6795 return (nocopy
? Fcopy_sequence (string
) : string
);
6798 if (NILP (coding_system
))
6799 coding_system
= Qno_conversion
;
6801 CHECK_CODING_SYSTEM (coding_system
);
6802 if (NILP (dst_object
))
6804 else if (! EQ (dst_object
, Qt
))
6805 CHECK_BUFFER (dst_object
);
6807 setup_coding_system (coding_system
, &coding
);
6808 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6809 chars
= XSTRING (string
)->size
;
6810 bytes
= STRING_BYTES (XSTRING (string
));
6812 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6814 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6816 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6818 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6819 error ("Code conversion error: %d", coding
.result
);
6821 return (BUFFERP (dst_object
)
6822 ? make_number (coding
.produced_char
)
6823 : coding
.dst_object
);
6827 /* Encode or decode STRING according to CODING_SYSTEM.
6828 Do not set Vlast_coding_system_used.
6830 This function is called only from macros DECODE_FILE and
6831 ENCODE_FILE, thus we ignore character composition. */
6834 code_convert_string_norecord (string
, coding_system
, encodep
)
6835 Lisp_Object string
, coding_system
;
6838 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
6842 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6844 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6846 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6847 if the decoding operation is trivial.
6849 Optional fourth arg BUFFER non-nil meant that the decoded text is
6850 inserted in BUFFER instead of returned as a astring. In this case,
6851 the return value is BUFFER.
6853 This function sets `last-coding-system-used' to the precise coding system
6854 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6855 not fully specified. */)
6856 (string
, coding_system
, nocopy
, buffer
)
6857 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6859 return code_convert_string (string
, coding_system
, buffer
,
6860 0, ! NILP (nocopy
), 0);
6863 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6865 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6867 Optional third arg NOCOPY non-nil means it is OK to return STRING
6868 itself if the encoding operation is trivial.
6870 Optional fourth arg BUFFER non-nil meant that the encoded text is
6871 inserted in BUFFER instead of returned as a astring. In this case,
6872 the return value is BUFFER.
6874 This function sets `last-coding-system-used' to the precise coding system
6875 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6876 not fully specified.) */)
6877 (string
, coding_system
, nocopy
, buffer
)
6878 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6880 return code_convert_string (string
, coding_system
, buffer
,
6881 nocopy
, ! NILP (nocopy
), 1);
6885 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6886 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6887 Return the corresponding character. */)
6891 Lisp_Object spec
, attrs
, val
;
6892 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6895 CHECK_NATNUM (code
);
6896 c
= XFASTINT (code
);
6897 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6898 attrs
= AREF (spec
, 0);
6900 if (ASCII_BYTE_P (c
)
6901 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6904 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6905 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6906 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6907 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6910 charset
= charset_roman
;
6911 else if (c
>= 0xA0 && c
< 0xDF)
6913 charset
= charset_kana
;
6918 int s1
= c
>> 8, s2
= c
& 0xFF;
6920 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
6921 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
6922 error ("Invalid code: %d", code
);
6924 charset
= charset_kanji
;
6926 c
= DECODE_CHAR (charset
, c
);
6928 error ("Invalid code: %d", code
);
6929 return make_number (c
);
6933 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
6934 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
6935 Return the corresponding code in SJIS. */)
6939 Lisp_Object spec
, attrs
, charset_list
;
6941 struct charset
*charset
;
6944 CHECK_CHARACTER (ch
);
6946 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6947 attrs
= AREF (spec
, 0);
6949 if (ASCII_CHAR_P (c
)
6950 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6953 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
6954 charset
= char_charset (c
, charset_list
, &code
);
6955 if (code
== CHARSET_INVALID_CODE (charset
))
6956 error ("Can't encode by shift_jis encoding: %d", c
);
6959 return make_number (code
);
6962 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
6963 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
6964 Return the corresponding character. */)
6968 Lisp_Object spec
, attrs
, val
;
6969 struct charset
*charset_roman
, *charset_big5
, *charset
;
6972 CHECK_NATNUM (code
);
6973 c
= XFASTINT (code
);
6974 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6975 attrs
= AREF (spec
, 0);
6977 if (ASCII_BYTE_P (c
)
6978 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6981 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6982 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6983 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6986 charset
= charset_roman
;
6989 int b1
= c
>> 8, b2
= c
& 0x7F;
6990 if (b1
< 0xA1 || b1
> 0xFE
6991 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
6992 error ("Invalid code: %d", code
);
6993 charset
= charset_big5
;
6995 c
= DECODE_CHAR (charset
, (unsigned )c
);
6997 error ("Invalid code: %d", code
);
6998 return make_number (c
);
7001 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7002 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7003 Return the corresponding character code in Big5. */)
7007 Lisp_Object spec
, attrs
, charset_list
;
7008 struct charset
*charset
;
7012 CHECK_CHARACTER (ch
);
7014 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7015 attrs
= AREF (spec
, 0);
7016 if (ASCII_CHAR_P (c
)
7017 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7020 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7021 charset
= char_charset (c
, charset_list
, &code
);
7022 if (code
== CHARSET_INVALID_CODE (charset
))
7023 error ("Can't encode by Big5 encoding: %d", c
);
7025 return make_number (code
);
7029 DEFUN ("set-terminal-coding-system-internal",
7030 Fset_terminal_coding_system_internal
,
7031 Sset_terminal_coding_system_internal
, 1, 1, 0,
7032 doc
: /* Internal use only. */)
7034 Lisp_Object coding_system
;
7036 CHECK_SYMBOL (coding_system
);
7037 setup_coding_system (Fcheck_coding_system (coding_system
),
7040 /* We had better not send unsafe characters to terminal. */
7041 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7042 /* Characer composition should be disabled. */
7043 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7044 terminal_coding
.src_multibyte
= 1;
7045 terminal_coding
.dst_multibyte
= 0;
7049 DEFUN ("set-safe-terminal-coding-system-internal",
7050 Fset_safe_terminal_coding_system_internal
,
7051 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7052 doc
: /* Internal use only. */)
7054 Lisp_Object coding_system
;
7056 CHECK_SYMBOL (coding_system
);
7057 setup_coding_system (Fcheck_coding_system (coding_system
),
7058 &safe_terminal_coding
);
7059 /* Characer composition should be disabled. */
7060 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7061 safe_terminal_coding
.src_multibyte
= 1;
7062 safe_terminal_coding
.dst_multibyte
= 0;
7066 DEFUN ("terminal-coding-system",
7067 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7068 doc
: /* Return coding system specified for terminal output. */)
7071 return CODING_ID_NAME (terminal_coding
.id
);
7074 DEFUN ("set-keyboard-coding-system-internal",
7075 Fset_keyboard_coding_system_internal
,
7076 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7077 doc
: /* Internal use only. */)
7079 Lisp_Object coding_system
;
7081 CHECK_SYMBOL (coding_system
);
7082 setup_coding_system (Fcheck_coding_system (coding_system
),
7084 /* Characer composition should be disabled. */
7085 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7089 DEFUN ("keyboard-coding-system",
7090 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7091 doc
: /* Return coding system specified for decoding keyboard input. */)
7094 return CODING_ID_NAME (keyboard_coding
.id
);
7098 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7099 Sfind_operation_coding_system
, 1, MANY
, 0,
7100 doc
: /* Choose a coding system for an operation based on the target name.
7101 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7102 DECODING-SYSTEM is the coding system to use for decoding
7103 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7104 for encoding (in case OPERATION does encoding).
7106 The first argument OPERATION specifies an I/O primitive:
7107 For file I/O, `insert-file-contents' or `write-region'.
7108 For process I/O, `call-process', `call-process-region', or `start-process'.
7109 For network I/O, `open-network-stream'.
7111 The remaining arguments should be the same arguments that were passed
7112 to the primitive. Depending on which primitive, one of those arguments
7113 is selected as the TARGET. For example, if OPERATION does file I/O,
7114 whichever argument specifies the file name is TARGET.
7116 TARGET has a meaning which depends on OPERATION:
7117 For file I/O, TARGET is a file name.
7118 For process I/O, TARGET is a process name.
7119 For network I/O, TARGET is a service name or a port number
7121 This function looks up what specified for TARGET in,
7122 `file-coding-system-alist', `process-coding-system-alist',
7123 or `network-coding-system-alist' depending on OPERATION.
7124 They may specify a coding system, a cons of coding systems,
7125 or a function symbol to call.
7126 In the last case, we call the function with one argument,
7127 which is a list of all the arguments given to this function.
7129 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7134 Lisp_Object operation
, target_idx
, target
, val
;
7135 register Lisp_Object chain
;
7138 error ("Too few arguments");
7139 operation
= args
[0];
7140 if (!SYMBOLP (operation
)
7141 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7142 error ("Invalid first arguement");
7143 if (nargs
< 1 + XINT (target_idx
))
7144 error ("Too few arguments for operation: %s",
7145 XSYMBOL (operation
)->name
->data
);
7146 target
= args
[XINT (target_idx
) + 1];
7147 if (!(STRINGP (target
)
7148 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7149 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7151 chain
= ((EQ (operation
, Qinsert_file_contents
)
7152 || EQ (operation
, Qwrite_region
))
7153 ? Vfile_coding_system_alist
7154 : (EQ (operation
, Qopen_network_stream
)
7155 ? Vnetwork_coding_system_alist
7156 : Vprocess_coding_system_alist
));
7160 for (; CONSP (chain
); chain
= XCDR (chain
))
7166 && ((STRINGP (target
)
7167 && STRINGP (XCAR (elt
))
7168 && fast_string_match (XCAR (elt
), target
) >= 0)
7169 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7172 /* Here, if VAL is both a valid coding system and a valid
7173 function symbol, we return VAL as a coding system. */
7176 if (! SYMBOLP (val
))
7178 if (! NILP (Fcoding_system_p (val
)))
7179 return Fcons (val
, val
);
7180 if (! NILP (Ffboundp (val
)))
7182 val
= call1 (val
, Flist (nargs
, args
));
7185 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7186 return Fcons (val
, val
);
7194 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7195 Sset_coding_system_priority
, 1, MANY
, 0,
7196 doc
: /* Assign higher priority to the coding systems given as arguments.
7197 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7203 int changed
[coding_category_max
];
7204 enum coding_category priorities
[coding_category_max
];
7206 bzero (changed
, sizeof changed
);
7208 for (i
= j
= 0; i
< nargs
; i
++)
7210 enum coding_category category
;
7211 Lisp_Object spec
, attrs
;
7213 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7214 attrs
= AREF (spec
, 0);
7215 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7216 if (changed
[category
])
7217 /* Ignore this coding system because a coding system of the
7218 same category already had a higher priority. */
7220 changed
[category
] = 1;
7221 priorities
[j
++] = category
;
7222 if (coding_categories
[category
].id
>= 0
7223 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7224 setup_coding_system (args
[i
], &coding_categories
[category
]);
7227 /* Now we have decided top J priorities. Reflect the order of the
7228 original priorities to the remaining priorities. */
7230 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7232 while (j
< coding_category_max
7233 && changed
[coding_priorities
[j
]])
7235 if (j
== coding_category_max
)
7237 priorities
[i
] = coding_priorities
[j
];
7240 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7244 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7245 Scoding_system_priority_list
, 0, 1, 0,
7246 doc
: /* Return a list of coding systems ordered by their priorities.
7247 HIGHESTP non-nil means just return the highest priority one. */)
7249 Lisp_Object highestp
;
7254 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7256 enum coding_category category
= coding_priorities
[i
];
7257 int id
= coding_categories
[category
].id
;
7262 attrs
= CODING_ID_ATTRS (id
);
7263 if (! NILP (highestp
))
7264 return CODING_ATTR_BASE_NAME (attrs
);
7265 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7267 return Fnreverse (val
);
7270 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7273 make_subsidiaries (base
)
7276 Lisp_Object subsidiaries
;
7277 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
7278 char *buf
= (char *) alloca (base_name_len
+ 6);
7281 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7282 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7283 for (i
= 0; i
< 3; i
++)
7285 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7286 ASET (subsidiaries
, i
, intern (buf
));
7288 return subsidiaries
;
7292 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7293 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7294 doc
: /* For internal use only.
7295 usage: (define-coding-system-internal ...) */)
7301 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7302 Lisp_Object attrs
; /* Vector of attributes. */
7303 Lisp_Object eol_type
;
7304 Lisp_Object aliases
;
7305 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7306 enum coding_category category
;
7307 Lisp_Object tail
, val
;
7308 int max_charset_id
= 0;
7311 if (nargs
< coding_arg_max
)
7314 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7316 name
= args
[coding_arg_name
];
7317 CHECK_SYMBOL (name
);
7318 CODING_ATTR_BASE_NAME (attrs
) = name
;
7320 val
= args
[coding_arg_mnemonic
];
7321 if (! STRINGP (val
))
7322 CHECK_CHARACTER (val
);
7323 CODING_ATTR_MNEMONIC (attrs
) = val
;
7325 coding_type
= args
[coding_arg_coding_type
];
7326 CHECK_SYMBOL (coding_type
);
7327 CODING_ATTR_TYPE (attrs
) = coding_type
;
7329 charset_list
= args
[coding_arg_charset_list
];
7330 if (SYMBOLP (charset_list
))
7332 if (EQ (charset_list
, Qiso_2022
))
7334 if (! EQ (coding_type
, Qiso_2022
))
7335 error ("Invalid charset-list");
7336 charset_list
= Viso_2022_charset_list
;
7338 else if (EQ (charset_list
, Qemacs_mule
))
7340 if (! EQ (coding_type
, Qemacs_mule
))
7341 error ("Invalid charset-list");
7342 charset_list
= Vemacs_mule_charset_list
;
7344 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7345 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7346 max_charset_id
= XFASTINT (XCAR (tail
));
7350 charset_list
= Fcopy_sequence (charset_list
);
7351 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7353 struct charset
*charset
;
7356 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7357 if (EQ (coding_type
, Qiso_2022
)
7358 ? CHARSET_ISO_FINAL (charset
) < 0
7359 : EQ (coding_type
, Qemacs_mule
)
7360 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7362 error ("Can't handle charset `%s'",
7363 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7365 XCAR (tail
) = make_number (charset
->id
);
7366 if (max_charset_id
< charset
->id
)
7367 max_charset_id
= charset
->id
;
7370 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7372 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7374 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7375 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7376 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7378 val
= args
[coding_arg_decode_translation_table
];
7380 CHECK_CHAR_TABLE (val
);
7381 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7383 val
= args
[coding_arg_encode_translation_table
];
7385 CHECK_CHAR_TABLE (val
);
7386 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7388 val
= args
[coding_arg_post_read_conversion
];
7390 CODING_ATTR_POST_READ (attrs
) = val
;
7392 val
= args
[coding_arg_pre_write_conversion
];
7394 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7396 val
= args
[coding_arg_default_char
];
7398 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7401 CHECK_CHARACTER (val
);
7402 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7405 val
= args
[coding_arg_plist
];
7407 CODING_ATTR_PLIST (attrs
) = val
;
7409 if (EQ (coding_type
, Qcharset
))
7411 /* Generate a lisp vector of 256 elements. Each element is nil,
7412 integer, or a list of charset IDs.
7414 If Nth element is nil, the byte code N is invalid in this
7417 If Nth element is a number NUM, N is the first byte of a
7418 charset whose ID is NUM.
7420 If Nth element is a list of charset IDs, N is the first byte
7421 of one of them. The list is sorted by dimensions of the
7422 charsets. A charset of smaller dimension comes firtst.
7424 val
= Fmake_vector (make_number (256), Qnil
);
7426 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7428 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7429 int dim
= CHARSET_DIMENSION (charset
);
7430 int idx
= (dim
- 1) * 4;
7432 for (i
= charset
->code_space
[idx
];
7433 i
<= charset
->code_space
[idx
+ 1]; i
++)
7435 Lisp_Object tmp
, tmp2
;
7438 tmp
= AREF (val
, i
);
7441 else if (NUMBERP (tmp
))
7443 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
7445 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
7447 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
7451 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
7453 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
7458 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
7461 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
7462 XSETCAR (tmp2
, XCAR (tail
));
7468 ASET (attrs
, coding_attr_charset_valids
, val
);
7469 category
= coding_category_charset
;
7471 else if (EQ (coding_type
, Qccl
))
7475 if (nargs
< coding_arg_ccl_max
)
7478 val
= args
[coding_arg_ccl_decoder
];
7479 CHECK_CCL_PROGRAM (val
);
7481 val
= Fcopy_sequence (val
);
7482 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7484 val
= args
[coding_arg_ccl_encoder
];
7485 CHECK_CCL_PROGRAM (val
);
7487 val
= Fcopy_sequence (val
);
7488 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7490 val
= args
[coding_arg_ccl_valids
];
7491 valids
= Fmake_string (make_number (256), make_number (0));
7492 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7496 ASET (valids
, XINT (val
), 1);
7502 CHECK_NUMBER (XCAR (val
));
7503 CHECK_NUMBER (XCDR (val
));
7504 from
= XINT (XCAR (val
));
7505 to
= XINT (XCDR (val
));
7506 for (i
= from
; i
<= to
; i
++)
7507 ASET (valids
, i
, 1);
7510 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7512 category
= coding_category_ccl
;
7514 else if (EQ (coding_type
, Qutf_16
))
7516 Lisp_Object bom
, endian
;
7518 if (nargs
< coding_arg_utf16_max
)
7521 bom
= args
[coding_arg_utf16_bom
];
7522 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7525 CHECK_CODING_SYSTEM (XCAR (bom
));
7526 CHECK_CODING_SYSTEM (XCDR (bom
));
7528 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7530 endian
= args
[coding_arg_utf16_endian
];
7531 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7533 category
= (CONSP (bom
)
7534 ? coding_category_utf_16_auto
7537 ? coding_category_utf_16_be_nosig
7538 : coding_category_utf_16_le_nosig
)
7540 ? coding_category_utf_16_be
7541 : coding_category_utf_16_le
));
7543 else if (EQ (coding_type
, Qiso_2022
))
7545 Lisp_Object initial
, reg_usage
, request
, flags
;
7548 if (nargs
< coding_arg_iso2022_max
)
7551 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7552 CHECK_VECTOR (initial
);
7553 for (i
= 0; i
< 4; i
++)
7555 val
= Faref (initial
, make_number (i
));
7558 CHECK_CHARSET_GET_ID (val
, id
);
7559 ASET (initial
, i
, make_number (id
));
7562 ASET (initial
, i
, make_number (-1));
7565 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7566 CHECK_CONS (reg_usage
);
7567 CHECK_NATNUM (XCAR (reg_usage
));
7568 CHECK_NATNUM (XCDR (reg_usage
));
7570 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7571 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7577 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7578 CHECK_NATNUM (XCDR (val
));
7579 if (XINT (XCDR (val
)) >= 4)
7580 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7581 XCAR (val
) = make_number (id
);
7584 flags
= args
[coding_arg_iso2022_flags
];
7585 CHECK_NATNUM (flags
);
7587 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7588 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7590 ASET (attrs
, coding_attr_iso_initial
, initial
);
7591 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7592 ASET (attrs
, coding_attr_iso_request
, request
);
7593 ASET (attrs
, coding_attr_iso_flags
, flags
);
7594 setup_iso_safe_charsets (attrs
);
7596 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7597 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7598 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7599 ? coding_category_iso_7_else
7600 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7601 ? coding_category_iso_7
7602 : coding_category_iso_7_tight
);
7605 int id
= XINT (AREF (initial
, 1));
7607 category
= (((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7608 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7609 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7611 ? coding_category_iso_8_else
7612 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7613 ? coding_category_iso_8_1
7614 : coding_category_iso_8_2
);
7617 else if (EQ (coding_type
, Qemacs_mule
))
7619 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7620 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7622 category
= coding_category_emacs_mule
;
7624 else if (EQ (coding_type
, Qshift_jis
))
7627 struct charset
*charset
;
7629 if (XINT (Flength (charset_list
)) != 3)
7630 error ("There should be just three charsets");
7632 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7633 if (CHARSET_DIMENSION (charset
) != 1)
7634 error ("Dimension of charset %s is not one",
7635 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7637 charset_list
= XCDR (charset_list
);
7638 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7639 if (CHARSET_DIMENSION (charset
) != 1)
7640 error ("Dimension of charset %s is not one",
7641 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7643 charset_list
= XCDR (charset_list
);
7644 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7645 if (CHARSET_DIMENSION (charset
) != 2)
7646 error ("Dimension of charset %s is not two",
7647 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7649 category
= coding_category_sjis
;
7650 Vsjis_coding_system
= name
;
7652 else if (EQ (coding_type
, Qbig5
))
7654 struct charset
*charset
;
7656 if (XINT (Flength (charset_list
)) != 2)
7657 error ("There should be just two charsets");
7659 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7660 if (CHARSET_DIMENSION (charset
) != 1)
7661 error ("Dimension of charset %s is not one",
7662 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7664 charset_list
= XCDR (charset_list
);
7665 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7666 if (CHARSET_DIMENSION (charset
) != 2)
7667 error ("Dimension of charset %s is not two",
7668 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7670 category
= coding_category_big5
;
7671 Vbig5_coding_system
= name
;
7673 else if (EQ (coding_type
, Qraw_text
))
7674 category
= coding_category_raw_text
;
7675 else if (EQ (coding_type
, Qutf_8
))
7676 category
= coding_category_utf_8
;
7677 else if (EQ (coding_type
, Qundecided
))
7678 category
= coding_category_undecided
;
7680 error ("Invalid coding system type: %s",
7681 XSYMBOL (coding_type
)->name
->data
);
7683 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7685 eol_type
= args
[coding_arg_eol_type
];
7686 if (! NILP (eol_type
)
7687 && ! EQ (eol_type
, Qunix
)
7688 && ! EQ (eol_type
, Qdos
)
7689 && ! EQ (eol_type
, Qmac
))
7690 error ("Invalid eol-type");
7692 aliases
= Fcons (name
, Qnil
);
7694 if (NILP (eol_type
))
7696 eol_type
= make_subsidiaries (name
);
7697 for (i
= 0; i
< 3; i
++)
7699 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7701 this_name
= AREF (eol_type
, i
);
7702 this_aliases
= Fcons (this_name
, Qnil
);
7703 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7704 this_spec
= Fmake_vector (make_number (3), attrs
);
7705 ASET (this_spec
, 1, this_aliases
);
7706 ASET (this_spec
, 2, this_eol_type
);
7707 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7708 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7709 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7710 Vcoding_system_alist
);
7714 spec_vec
= Fmake_vector (make_number (3), attrs
);
7715 ASET (spec_vec
, 1, aliases
);
7716 ASET (spec_vec
, 2, eol_type
);
7718 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7719 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7720 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7721 Vcoding_system_alist
);
7724 int id
= coding_categories
[category
].id
;
7726 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7727 setup_coding_system (name
, &coding_categories
[category
]);
7733 return Fsignal (Qwrong_number_of_arguments
,
7734 Fcons (intern ("define-coding-system-internal"),
7735 make_number (nargs
)));
7738 /* Fixme: should this record the alias relationships for
7740 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7741 Sdefine_coding_system_alias
, 2, 2, 0,
7742 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7743 (alias
, coding_system
)
7744 Lisp_Object alias
, coding_system
;
7746 Lisp_Object spec
, aliases
, eol_type
;
7748 CHECK_SYMBOL (alias
);
7749 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7750 aliases
= AREF (spec
, 1);
7751 while (!NILP (XCDR (aliases
)))
7752 aliases
= XCDR (aliases
);
7753 XCDR (aliases
) = Fcons (alias
, Qnil
);
7755 eol_type
= AREF (spec
, 2);
7756 if (VECTORP (eol_type
))
7758 Lisp_Object subsidiaries
;
7761 subsidiaries
= make_subsidiaries (alias
);
7762 for (i
= 0; i
< 3; i
++)
7763 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7764 AREF (eol_type
, i
));
7766 ASET (spec
, 2, subsidiaries
);
7769 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7770 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
7771 Vcoding_system_alist
);
7776 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7778 doc
: /* Return the base of CODING-SYSTEM.
7779 Any alias or subsidiary coding system is not a base coding system. */)
7781 Lisp_Object coding_system
;
7783 Lisp_Object spec
, attrs
;
7785 if (NILP (coding_system
))
7786 return (Qno_conversion
);
7787 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7788 attrs
= AREF (spec
, 0);
7789 return CODING_ATTR_BASE_NAME (attrs
);
7792 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7794 doc
: "Return the property list of CODING-SYSTEM.")
7796 Lisp_Object coding_system
;
7798 Lisp_Object spec
, attrs
;
7800 if (NILP (coding_system
))
7801 coding_system
= Qno_conversion
;
7802 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7803 attrs
= AREF (spec
, 0);
7804 return CODING_ATTR_PLIST (attrs
);
7808 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7810 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
7812 Lisp_Object coding_system
;
7816 if (NILP (coding_system
))
7817 coding_system
= Qno_conversion
;
7818 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7819 return AREF (spec
, 1);
7822 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7823 Scoding_system_eol_type
, 1, 1, 0,
7824 doc
: /* Return eol-type of CODING-SYSTEM.
7825 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7827 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7828 and CR respectively.
7830 A vector value indicates that a format of end-of-line should be
7831 detected automatically. Nth element of the vector is the subsidiary
7832 coding system whose eol-type is N. */)
7834 Lisp_Object coding_system
;
7836 Lisp_Object spec
, eol_type
;
7839 if (NILP (coding_system
))
7840 coding_system
= Qno_conversion
;
7841 if (! CODING_SYSTEM_P (coding_system
))
7843 spec
= CODING_SYSTEM_SPEC (coding_system
);
7844 eol_type
= AREF (spec
, 2);
7845 if (VECTORP (eol_type
))
7846 return Fcopy_sequence (eol_type
);
7847 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7848 return make_number (n
);
7854 /*** 9. Post-amble ***/
7861 for (i
= 0; i
< coding_category_max
; i
++)
7863 coding_categories
[i
].id
= -1;
7864 coding_priorities
[i
] = i
;
7867 /* ISO2022 specific initialize routine. */
7868 for (i
= 0; i
< 0x20; i
++)
7869 iso_code_class
[i
] = ISO_control_0
;
7870 for (i
= 0x21; i
< 0x7F; i
++)
7871 iso_code_class
[i
] = ISO_graphic_plane_0
;
7872 for (i
= 0x80; i
< 0xA0; i
++)
7873 iso_code_class
[i
] = ISO_control_1
;
7874 for (i
= 0xA1; i
< 0xFF; i
++)
7875 iso_code_class
[i
] = ISO_graphic_plane_1
;
7876 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7877 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
7878 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7879 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7880 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7881 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7882 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7883 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7884 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7885 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
7887 inhibit_pre_post_conversion
= 0;
7889 for (i
= 0; i
< 256; i
++)
7891 emacs_mule_bytes
[i
] = 1;
7893 emacs_mule_bytes
[LEADING_CODE_PRIVATE_11
] = 3;
7894 emacs_mule_bytes
[LEADING_CODE_PRIVATE_12
] = 3;
7895 emacs_mule_bytes
[LEADING_CODE_PRIVATE_21
] = 4;
7896 emacs_mule_bytes
[LEADING_CODE_PRIVATE_22
] = 4;
7904 staticpro (&Vcoding_system_hash_table
);
7905 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7907 staticpro (&Vsjis_coding_system
);
7908 Vsjis_coding_system
= Qnil
;
7910 staticpro (&Vbig5_coding_system
);
7911 Vbig5_coding_system
= Qnil
;
7913 staticpro (&Vcode_conversion_work_buf_list
);
7914 Vcode_conversion_work_buf_list
= Qnil
;
7916 staticpro (&Vcode_conversion_reused_work_buf
);
7917 Vcode_conversion_reused_work_buf
= Qnil
;
7919 DEFSYM (Qcharset
, "charset");
7920 DEFSYM (Qtarget_idx
, "target-idx");
7921 DEFSYM (Qcoding_system_history
, "coding-system-history");
7922 Fset (Qcoding_system_history
, Qnil
);
7924 /* Target FILENAME is the first argument. */
7925 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
7926 /* Target FILENAME is the third argument. */
7927 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
7929 DEFSYM (Qcall_process
, "call-process");
7930 /* Target PROGRAM is the first argument. */
7931 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
7933 DEFSYM (Qcall_process_region
, "call-process-region");
7934 /* Target PROGRAM is the third argument. */
7935 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
7937 DEFSYM (Qstart_process
, "start-process");
7938 /* Target PROGRAM is the third argument. */
7939 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
7941 DEFSYM (Qopen_network_stream
, "open-network-stream");
7942 /* Target SERVICE is the fourth argument. */
7943 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
7945 DEFSYM (Qcoding_system
, "coding-system");
7946 DEFSYM (Qcoding_aliases
, "coding-aliases");
7948 DEFSYM (Qeol_type
, "eol-type");
7949 DEFSYM (Qunix
, "unix");
7950 DEFSYM (Qdos
, "dos");
7952 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
7953 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
7954 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
7955 DEFSYM (Qdefault_char
, "default-char");
7956 DEFSYM (Qundecided
, "undecided");
7957 DEFSYM (Qno_conversion
, "no-conversion");
7958 DEFSYM (Qraw_text
, "raw-text");
7960 DEFSYM (Qiso_2022
, "iso-2022");
7962 DEFSYM (Qutf_8
, "utf-8");
7964 DEFSYM (Qutf_16
, "utf-16");
7965 DEFSYM (Qutf_16_be
, "utf-16-be");
7966 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
7967 DEFSYM (Qutf_16_le
, "utf-16-l3");
7968 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
7969 DEFSYM (Qsignature
, "signature");
7970 DEFSYM (Qendian
, "endian");
7971 DEFSYM (Qbig
, "big");
7972 DEFSYM (Qlittle
, "little");
7974 DEFSYM (Qshift_jis
, "shift-jis");
7975 DEFSYM (Qbig5
, "big5");
7977 DEFSYM (Qcoding_system_p
, "coding-system-p");
7979 DEFSYM (Qcoding_system_error
, "coding-system-error");
7980 Fput (Qcoding_system_error
, Qerror_conditions
,
7981 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
7982 Fput (Qcoding_system_error
, Qerror_message
,
7983 build_string ("Invalid coding system"));
7985 /* Intern this now in case it isn't already done.
7986 Setting this variable twice is harmless.
7987 But don't staticpro it here--that is done in alloc.c. */
7988 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
7990 DEFSYM (Qtranslation_table
, "translation-table");
7991 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
7992 DEFSYM (Qtranslation_table_id
, "translation-table-id");
7993 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
7994 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
7996 DEFSYM (Qvalid_codes
, "valid-codes");
7998 DEFSYM (Qemacs_mule
, "emacs-mule");
8000 Vcoding_category_table
8001 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8002 staticpro (&Vcoding_category_table
);
8003 /* Followings are target of code detection. */
8004 ASET (Vcoding_category_table
, coding_category_iso_7
,
8005 intern ("coding-category-iso-7"));
8006 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8007 intern ("coding-category-iso-7-tight"));
8008 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8009 intern ("coding-category-iso-8-1"));
8010 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8011 intern ("coding-category-iso-8-2"));
8012 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8013 intern ("coding-category-iso-7-else"));
8014 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8015 intern ("coding-category-iso-8-else"));
8016 ASET (Vcoding_category_table
, coding_category_utf_8
,
8017 intern ("coding-category-utf-8"));
8018 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8019 intern ("coding-category-utf-16-be"));
8020 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8021 intern ("coding-category-utf-16-le"));
8022 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8023 intern ("coding-category-utf-16-be-nosig"));
8024 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8025 intern ("coding-category-utf-16-le-nosig"));
8026 ASET (Vcoding_category_table
, coding_category_charset
,
8027 intern ("coding-category-charset"));
8028 ASET (Vcoding_category_table
, coding_category_sjis
,
8029 intern ("coding-category-sjis"));
8030 ASET (Vcoding_category_table
, coding_category_big5
,
8031 intern ("coding-category-big5"));
8032 ASET (Vcoding_category_table
, coding_category_ccl
,
8033 intern ("coding-category-ccl"));
8034 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8035 intern ("coding-category-emacs-mule"));
8036 /* Followings are NOT target of code detection. */
8037 ASET (Vcoding_category_table
, coding_category_raw_text
,
8038 intern ("coding-category-raw-text"));
8039 ASET (Vcoding_category_table
, coding_category_undecided
,
8040 intern ("coding-category-undecided"));
8042 defsubr (&Scoding_system_p
);
8043 defsubr (&Sread_coding_system
);
8044 defsubr (&Sread_non_nil_coding_system
);
8045 defsubr (&Scheck_coding_system
);
8046 defsubr (&Sdetect_coding_region
);
8047 defsubr (&Sdetect_coding_string
);
8048 defsubr (&Sfind_coding_systems_region_internal
);
8049 defsubr (&Scheck_coding_systems_region
);
8050 defsubr (&Sdecode_coding_region
);
8051 defsubr (&Sencode_coding_region
);
8052 defsubr (&Sdecode_coding_string
);
8053 defsubr (&Sencode_coding_string
);
8054 defsubr (&Sdecode_sjis_char
);
8055 defsubr (&Sencode_sjis_char
);
8056 defsubr (&Sdecode_big5_char
);
8057 defsubr (&Sencode_big5_char
);
8058 defsubr (&Sset_terminal_coding_system_internal
);
8059 defsubr (&Sset_safe_terminal_coding_system_internal
);
8060 defsubr (&Sterminal_coding_system
);
8061 defsubr (&Sset_keyboard_coding_system_internal
);
8062 defsubr (&Skeyboard_coding_system
);
8063 defsubr (&Sfind_operation_coding_system
);
8064 defsubr (&Sset_coding_system_priority
);
8065 defsubr (&Sdefine_coding_system_internal
);
8066 defsubr (&Sdefine_coding_system_alias
);
8067 defsubr (&Scoding_system_base
);
8068 defsubr (&Scoding_system_plist
);
8069 defsubr (&Scoding_system_aliases
);
8070 defsubr (&Scoding_system_eol_type
);
8071 defsubr (&Scoding_system_priority_list
);
8073 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8074 doc
: /* List of coding systems.
8076 Do not alter the value of this variable manually. This variable should be
8077 updated by the functions `define-coding-system' and
8078 `define-coding-system-alias'. */);
8079 Vcoding_system_list
= Qnil
;
8081 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8082 doc
: /* Alist of coding system names.
8083 Each element is one element list of coding system name.
8084 This variable is given to `completing-read' as TABLE argument.
8086 Do not alter the value of this variable manually. This variable should be
8087 updated by the functions `make-coding-system' and
8088 `define-coding-system-alias'. */);
8089 Vcoding_system_alist
= Qnil
;
8091 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8092 doc
: /* List of coding-categories (symbols) ordered by priority.
8094 On detecting a coding system, Emacs tries code detection algorithms
8095 associated with each coding-category one by one in this order. When
8096 one algorithm agrees with a byte sequence of source text, the coding
8097 system bound to the corresponding coding-category is selected. */);
8101 Vcoding_category_list
= Qnil
;
8102 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8103 Vcoding_category_list
8104 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8105 Vcoding_category_list
);
8108 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8109 doc
: /* Specify the coding system for read operations.
8110 It is useful to bind this variable with `let', but do not set it globally.
8111 If the value is a coding system, it is used for decoding on read operation.
8112 If not, an appropriate element is used from one of the coding system alists:
8113 There are three such tables, `file-coding-system-alist',
8114 `process-coding-system-alist', and `network-coding-system-alist'. */);
8115 Vcoding_system_for_read
= Qnil
;
8117 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8118 doc
: /* Specify the coding system for write operations.
8119 Programs bind this variable with `let', but you should not set it globally.
8120 If the value is a coding system, it is used for encoding of output,
8121 when writing it to a file and when sending it to a file or subprocess.
8123 If this does not specify a coding system, an appropriate element
8124 is used from one of the coding system alists:
8125 There are three such tables, `file-coding-system-alist',
8126 `process-coding-system-alist', and `network-coding-system-alist'.
8127 For output to files, if the above procedure does not specify a coding system,
8128 the value of `buffer-file-coding-system' is used. */);
8129 Vcoding_system_for_write
= Qnil
;
8131 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8133 Coding system used in the latest file or process I/O. */);
8134 Vlast_coding_system_used
= Qnil
;
8136 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8138 *Non-nil means always inhibit code conversion of end-of-line format.
8139 See info node `Coding Systems' and info node `Text and Binary' concerning
8140 such conversion. */);
8141 inhibit_eol_conversion
= 0;
8143 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8145 Non-nil means process buffer inherits coding system of process output.
8146 Bind it to t if the process output is to be treated as if it were a file
8147 read from some filesystem. */);
8148 inherit_process_coding_system
= 0;
8150 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8152 Alist to decide a coding system to use for a file I/O operation.
8153 The format is ((PATTERN . VAL) ...),
8154 where PATTERN is a regular expression matching a file name,
8155 VAL is a coding system, a cons of coding systems, or a function symbol.
8156 If VAL is a coding system, it is used for both decoding and encoding
8158 If VAL is a cons of coding systems, the car part is used for decoding,
8159 and the cdr part is used for encoding.
8160 If VAL is a function symbol, the function must return a coding system
8161 or a cons of coding systems which are used as above. The function gets
8162 the arguments with which `find-operation-coding-systems' was called.
8164 See also the function `find-operation-coding-system'
8165 and the variable `auto-coding-alist'. */);
8166 Vfile_coding_system_alist
= Qnil
;
8168 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8170 Alist to decide a coding system to use for a process I/O operation.
8171 The format is ((PATTERN . VAL) ...),
8172 where PATTERN is a regular expression matching a program name,
8173 VAL is a coding system, a cons of coding systems, or a function symbol.
8174 If VAL is a coding system, it is used for both decoding what received
8175 from the program and encoding what sent to the program.
8176 If VAL is a cons of coding systems, the car part is used for decoding,
8177 and the cdr part is used for encoding.
8178 If VAL is a function symbol, the function must return a coding system
8179 or a cons of coding systems which are used as above.
8181 See also the function `find-operation-coding-system'. */);
8182 Vprocess_coding_system_alist
= Qnil
;
8184 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8186 Alist to decide a coding system to use for a network I/O operation.
8187 The format is ((PATTERN . VAL) ...),
8188 where PATTERN is a regular expression matching a network service name
8189 or is a port number to connect to,
8190 VAL is a coding system, a cons of coding systems, or a function symbol.
8191 If VAL is a coding system, it is used for both decoding what received
8192 from the network stream and encoding what sent to the network stream.
8193 If VAL is a cons of coding systems, the car part is used for decoding,
8194 and the cdr part is used for encoding.
8195 If VAL is a function symbol, the function must return a coding system
8196 or a cons of coding systems which are used as above.
8198 See also the function `find-operation-coding-system'. */);
8199 Vnetwork_coding_system_alist
= Qnil
;
8201 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8202 doc
: /* Coding system to use with system messages.
8203 Also used for decoding keyboard input on X Window system. */);
8204 Vlocale_coding_system
= Qnil
;
8206 /* The eol mnemonics are reset in startup.el system-dependently. */
8207 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8209 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8210 eol_mnemonic_unix
= build_string (":");
8212 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8214 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8215 eol_mnemonic_dos
= build_string ("\\");
8217 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8219 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8220 eol_mnemonic_mac
= build_string ("/");
8222 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8224 *String displayed in mode line when end-of-line format is not yet determined. */);
8225 eol_mnemonic_undecided
= build_string (":");
8227 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8229 *Non-nil enables character translation while encoding and decoding. */);
8230 Venable_character_translation
= Qt
;
8232 DEFVAR_LISP ("standard-translation-table-for-decode",
8233 &Vstandard_translation_table_for_decode
,
8234 doc
: /* Table for translating characters while decoding. */);
8235 Vstandard_translation_table_for_decode
= Qnil
;
8237 DEFVAR_LISP ("standard-translation-table-for-encode",
8238 &Vstandard_translation_table_for_encode
,
8239 doc
: /* Table for translating characters while encoding. */);
8240 Vstandard_translation_table_for_encode
= Qnil
;
8242 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8243 doc
: /* Alist of charsets vs revision numbers.
8244 While encoding, if a charset (car part of an element) is found,
8245 designate it with the escape sequence identifying revision (cdr part
8246 of the element). */);
8247 Vcharset_revision_table
= Qnil
;
8249 DEFVAR_LISP ("default-process-coding-system",
8250 &Vdefault_process_coding_system
,
8251 doc
: /* Cons of coding systems used for process I/O by default.
8252 The car part is used for decoding a process output,
8253 the cdr part is used for encoding a text to be sent to a process. */);
8254 Vdefault_process_coding_system
= Qnil
;
8256 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8258 Table of extra Latin codes in the range 128..159 (inclusive).
8259 This is a vector of length 256.
8260 If Nth element is non-nil, the existence of code N in a file
8261 \(or output of subprocess) doesn't prevent it to be detected as
8262 a coding system of ISO 2022 variant which has a flag
8263 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8264 or reading output of a subprocess.
8265 Only 128th through 159th elements has a meaning. */);
8266 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8268 DEFVAR_LISP ("select-safe-coding-system-function",
8269 &Vselect_safe_coding_system_function
,
8271 Function to call to select safe coding system for encoding a text.
8273 If set, this function is called to force a user to select a proper
8274 coding system which can encode the text in the case that a default
8275 coding system used in each operation can't encode the text.
8277 The default value is `select-safe-coding-system' (which see). */);
8278 Vselect_safe_coding_system_function
= Qnil
;
8280 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8281 &inhibit_iso_escape_detection
,
8283 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8285 By default, on reading a file, Emacs tries to detect how the text is
8286 encoded. This code detection is sensitive to escape sequences. If
8287 the sequence is valid as ISO2022, the code is determined as one of
8288 the ISO2022 encodings, and the file is decoded by the corresponding
8289 coding system (e.g. `iso-2022-7bit').
8291 However, there may be a case that you want to read escape sequences in
8292 a file as is. In such a case, you can set this variable to non-nil.
8293 Then, as the code detection ignores any escape sequences, no file is
8294 detected as encoded in some ISO2022 encoding. The result is that all
8295 escape sequences become visible in a buffer.
8297 The default value is nil, and it is strongly recommended not to change
8298 it. That is because many Emacs Lisp source files that contain
8299 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8300 in Emacs's distribution, and they won't be decoded correctly on
8301 reading if you suppress escape sequence detection.
8303 The other way to read escape sequences in a file without decoding is
8304 to explicitly specify some coding system that doesn't use ISO2022's
8305 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8306 inhibit_iso_escape_detection
= 0;
8309 Lisp_Object args
[coding_arg_max
];
8310 Lisp_Object plist
[14];
8313 for (i
= 0; i
< coding_arg_max
; i
++)
8316 plist
[0] = intern (":name");
8317 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8318 plist
[2] = intern (":mnemonic");
8319 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8320 plist
[4] = intern (":coding-type");
8321 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8322 plist
[6] = intern (":ascii-compatible-p");
8323 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8324 plist
[8] = intern (":default-char");
8325 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8326 plist
[10] = intern (":docstring");
8327 plist
[11] = build_string ("Do no conversion.\n\
8329 When you visit a file with this coding, the file is read into a\n\
8330 unibyte buffer as is, thus each byte of a file is treated as a\n\
8332 plist
[12] = intern (":eol-type");
8333 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8334 args
[coding_arg_plist
] = Flist (14, plist
);
8335 Fdefine_coding_system_internal (coding_arg_max
, args
);
8338 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8339 setup_coding_system (Qno_conversion
, &terminal_coding
);
8340 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8344 emacs_strerror (error_number
)
8349 synchronize_system_messages_locale ();
8350 str
= strerror (error_number
);
8352 if (! NILP (Vlocale_coding_system
))
8354 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8355 Vlocale_coding_system
,
8357 str
= (char *) XSTRING (dec
)->data
;