1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 Coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequence to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from a
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the mechanism of encoding. Here's a brief description of each type.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by code converter defined for each
77 o Old Emacs' internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode a text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for a text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How end-of-line of a text is encoded depends on a system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text characters encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwise (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specify each bit of this integer.
155 Below is the template of these functions. */
/* Template of a detector.  Only a skeleton: the statements that fetch
   and classify each byte are given as comments.  The bits cleared from
   or kept in *MASK are named by the CATEGORY_MASK_XXX macros.  */
159 detect_coding_XXX (coding
, mask
)
160 struct coding_system
*coding
;
163 unsigned char *src
= coding
->source
;
164 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
165 int multibytep
= coding
->src_multibyte
;
172 /* Get one byte from the source. If the source is exhausted, jump
173 to no_more_source:. */
175 /* Check if it conforms to XXX. If not, break the loop. */
177 /* As the data is invalid for XXX, reset the proper bits. */
178 *mask
&= ~CATEGORY_MASK_XXX
;
181 /* The source is exhausted. */
183 /* ASCII characters only. */
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask
&= CATEGORY_MASK_XXX
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
/* Template of a decoder.  Decoding starts at CODING->source +
   CODING->consumed and the decoded characters go to CODING->charbuf,
   which holds at most CODING->charbuf_size elements.  */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 unsigned char *src
= coding
->source
+ coding
->consumed
;
211 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
;
218 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
/* CHARBUF is full once it reaches CHARBUF_END, hence `>='.  */
224 if (charbuf
>= charbuf_end
)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce an encoded text until it
260 reaches at the head of not-yet-encoded source text.
262 Below is a template of these functions. */
/* Template of an encoder.  The characters to encode are the first
   CODING->charbuf_used elements of CODING->charbuf; the produced bytes
   go to CODING->destination starting at offset CODING->produced.  */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
/* CHARBUF is a plain `int *', so the end is CHARBUF plus the number of
   used elements.  */
270 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
/* Stop early enough that one more loop iteration cannot write past
   DST_END.  */
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
, Qmac
;
307 Lisp_Object Qbuffer_file_coding_system
;
308 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
309 Lisp_Object Qdefault_char
;
310 Lisp_Object Qno_conversion
, Qundecided
;
311 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
312 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
313 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
314 Lisp_Object Qcoding_system_history
;
315 Lisp_Object Qvalid_codes
;
317 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
318 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
319 Lisp_Object Qstart_process
, Qopen_network_stream
;
320 Lisp_Object Qtarget_idx
;
322 Lisp_Object Vselect_safe_coding_system_function
;
324 /* Mnemonic string for each format of end-of-line. */
325 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
326 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
328 Lisp_Object eol_mnemonic_undecided
;
332 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
334 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
336 /* Coding system emacs-mule and raw-text are for converting only
337 end-of-line format. */
338 Lisp_Object Qemacs_mule
, Qraw_text
;
340 /* Coding-systems are handed between Emacs Lisp programs and C internal
341 routines by the following three variables. */
342 /* Coding-system for reading files and receiving data from process. */
343 Lisp_Object Vcoding_system_for_read
;
344 /* Coding-system for writing files and sending data to process. */
345 Lisp_Object Vcoding_system_for_write
;
346 /* Coding-system actually used in the latest I/O. */
347 Lisp_Object Vlast_coding_system_used
;
349 /* A vector of length 256 which contains information about special
350 Latin codes (especially for dealing with Microsoft codes). */
351 Lisp_Object Vlatin_extra_code_table
;
353 /* Flag to inhibit code conversion of end-of-line format. */
354 int inhibit_eol_conversion
;
356 /* Flag to inhibit ISO2022 escape sequence detection. */
357 int inhibit_iso_escape_detection
;
359 /* Flag to make buffer-file-coding-system inherit from process-coding. */
360 int inherit_process_coding_system
;
362 /* Coding system to be used to encode text for terminal display. */
363 struct coding_system terminal_coding
;
365 /* Coding system to be used to encode text for terminal display when
366 terminal coding system is nil. */
367 struct coding_system safe_terminal_coding
;
369 /* Coding system of what is sent from terminal keyboard. */
370 struct coding_system keyboard_coding
;
372 Lisp_Object Vfile_coding_system_alist
;
373 Lisp_Object Vprocess_coding_system_alist
;
374 Lisp_Object Vnetwork_coding_system_alist
;
376 Lisp_Object Vlocale_coding_system
;
380 /* Flag to tell if we look up translation table on character code
382 Lisp_Object Venable_character_translation
;
383 /* Standard translation table to look up on decoding (reading). */
384 Lisp_Object Vstandard_translation_table_for_decode
;
385 /* Standard translation table to look up on encoding (writing). */
386 Lisp_Object Vstandard_translation_table_for_encode
;
388 Lisp_Object Qtranslation_table
;
389 Lisp_Object Qtranslation_table_id
;
390 Lisp_Object Qtranslation_table_for_decode
;
391 Lisp_Object Qtranslation_table_for_encode
;
393 /* Alist of charsets vs revision number. */
394 static Lisp_Object Vcharset_revision_table
;
396 /* Default coding systems used for process I/O. */
397 Lisp_Object Vdefault_process_coding_system
;
399 /* Global flag to tell that we can't call post-read-conversion and
400 pre-write-conversion functions. Usually the value is zero, but it
401 is set to 1 temporarily while such functions are running. This is
402 to avoid infinite recursive call. */
403 static int inhibit_pre_post_conversion
;
405 /* Char-table containing safe coding systems of each character. */
406 Lisp_Object Vchar_coding_system_table
;
407 Lisp_Object Qchar_coding_system
;
409 /* Two special coding systems. */
410 Lisp_Object Vsjis_coding_system
;
411 Lisp_Object Vbig5_coding_system
;
414 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
415 static void decode_coding_utf_8
P_ ((struct coding_system
*));
416 static int encode_coding_utf_8
P_ ((struct coding_system
*));
418 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
419 static void decode_coding_utf_16
P_ ((struct coding_system
*));
420 static int encode_coding_utf_16
P_ ((struct coding_system
*));
422 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
423 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
424 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
426 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
427 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
428 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
430 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
431 static void decode_coding_sjis
P_ ((struct coding_system
*));
432 static int encode_coding_sjis
P_ ((struct coding_system
*));
434 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
435 static void decode_coding_big5
P_ ((struct coding_system
*));
436 static int encode_coding_big5
P_ ((struct coding_system
*));
438 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
439 static void decode_coding_ccl
P_ ((struct coding_system
*));
440 static int encode_coding_ccl
P_ ((struct coding_system
*));
442 static void decode_coding_raw_text
P_ ((struct coding_system
*));
443 static int encode_coding_raw_text
P_ ((struct coding_system
*));
446 /* ISO2022 section */
448 #define CODING_ISO_INITIAL(coding, reg) \
449 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
450 coding_attr_iso_initial), \
454 #define CODING_ISO_REQUEST(coding, charset_id) \
455 ((charset_id <= (coding)->max_charset_id \
456 ? (coding)->safe_charsets[charset_id] \
460 #define CODING_ISO_FLAGS(coding) \
461 ((coding)->spec.iso_2022.flags)
462 #define CODING_ISO_DESIGNATION(coding, reg) \
463 ((coding)->spec.iso_2022.current_designation[reg])
464 #define CODING_ISO_INVOCATION(coding, plane) \
465 ((coding)->spec.iso_2022.current_invocation[plane])
466 #define CODING_ISO_SINGLE_SHIFTING(coding) \
467 ((coding)->spec.iso_2022.single_shifting)
468 #define CODING_ISO_BOL(coding) \
469 ((coding)->spec.iso_2022.bol)
470 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
471 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
473 /* Control characters of ISO2022. */
474 /* code */ /* function */
475 #define ISO_CODE_LF 0x0A /* line-feed */
476 #define ISO_CODE_CR 0x0D /* carriage-return */
477 #define ISO_CODE_SO 0x0E /* shift-out */
478 #define ISO_CODE_SI 0x0F /* shift-in */
479 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
480 #define ISO_CODE_ESC 0x1B /* escape */
481 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
482 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
483 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
485 /* All code (1-byte) of ISO2022 is classified into one of the
487 enum iso_code_class_type
489 ISO_control_0
, /* Control codes in the range
490 0x00..0x1F and 0x7F, except for the
491 following 5 codes. */
492 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
493 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
494 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
495 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
496 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
497 ISO_control_1
, /* Control codes in the range
498 0x80..0x9F, except for the
499 following 3 codes. */
500 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
501 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
502 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
503 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
504 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
505 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
506 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
509 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
510 `iso-flags' attribute of an iso2022 coding system. */
512 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
513 instead of the correct short-form sequence (e.g. ESC $ A). */
514 #define CODING_ISO_FLAG_LONG_FORM 0x0001
516 /* If set, reset graphic planes and registers at end-of-line to the initial state. */
518 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
520 /* If set, reset graphic planes and registers before any control
521 characters to the initial state. */
522 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
524 /* If set, encode by 7-bit environment. */
525 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
527 /* If set, use locking-shift function. */
528 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
530 /* If set, use single-shift function. Overwrite
531 CODING_ISO_FLAG_LOCKING_SHIFT. */
532 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
534 /* If set, use designation escape sequence. */
535 #define CODING_ISO_FLAG_DESIGNATION 0x0040
537 /* If set, produce revision number sequence. */
538 #define CODING_ISO_FLAG_REVISION 0x0080
540 /* If set, produce ISO6429's direction specifying sequence. */
541 #define CODING_ISO_FLAG_DIRECTION 0x0100
543 /* If set, assume designation states are reset at beginning of line on
545 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
547 /* If set, designation sequence should be placed at beginning of line
549 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
551 /* If set, do not encode unsafe characters on output. */
552 #define CODING_ISO_FLAG_SAFE 0x0800
554 /* If set, extra latin codes (128..159) are accepted as a valid code
556 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
558 #define CODING_ISO_FLAG_COMPOSITION 0x2000
560 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
562 #define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
564 /* A character to be produced on output if encoding of the original
565 character is prohibited by CODING_ISO_FLAG_SAFE. */
566 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
570 #define CODING_UTF_16_BOM(coding) \
571 ((coding)->spec.utf_16.bom)
573 #define CODING_UTF_16_ENDIAN(coding) \
574 ((coding)->spec.utf_16.endian)
576 #define CODING_UTF_16_SURROGATE(coding) \
577 ((coding)->spec.utf_16.surrogate)
581 #define CODING_CCL_DECODER(coding) \
582 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
583 #define CODING_CCL_ENCODER(coding) \
584 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
585 #define CODING_CCL_VALIDS(coding) \
586 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
589 /* Index for each coding category in `coding_category_table' */
593 coding_category_iso_7
,
594 coding_category_iso_7_tight
,
595 coding_category_iso_8_1
,
596 coding_category_iso_8_2
,
597 coding_category_iso_7_else
,
598 coding_category_iso_8_else
,
599 coding_category_utf_8
,
600 coding_category_utf_16_auto
,
601 coding_category_utf_16_be
,
602 coding_category_utf_16_le
,
603 coding_category_utf_16_be_nosig
,
604 coding_category_utf_16_le_nosig
,
605 coding_category_charset
,
606 coding_category_sjis
,
607 coding_category_big5
,
609 coding_category_emacs_mule
,
610 /* All above are targets of code detection. */
611 coding_category_raw_text
,
612 coding_category_undecided
,
616 /* Definitions of flag bits used in detect_coding_XXXX. */
617 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
618 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
619 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
620 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
621 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
622 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
623 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
624 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
625 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
626 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
627 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
628 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
629 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
630 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
631 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
632 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
634 /* This value is returned if detect_coding_mask () finds nothing other
635 than ASCII characters. */
636 #define CATEGORY_MASK_ANY \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_8_1 \
640 | CATEGORY_MASK_ISO_8_2 \
641 | CATEGORY_MASK_ISO_7_ELSE \
642 | CATEGORY_MASK_ISO_8_ELSE \
643 | CATEGORY_MASK_UTF_8 \
644 | CATEGORY_MASK_UTF_16_BE \
645 | CATEGORY_MASK_UTF_16_LE \
646 | CATEGORY_MASK_UTF_16_BE_NOSIG \
647 | CATEGORY_MASK_UTF_16_LE_NOSIG \
648 | CATEGORY_MASK_CHARSET \
649 | CATEGORY_MASK_SJIS \
650 | CATEGORY_MASK_BIG5 \
651 | CATEGORY_MASK_CCL \
652 | CATEGORY_MASK_EMACS_MULE)
655 #define CATEGORY_MASK_ISO_7BIT \
656 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
658 #define CATEGORY_MASK_ISO_8BIT \
659 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
661 #define CATEGORY_MASK_ISO_ELSE \
662 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
664 #define CATEGORY_MASK_ISO_ESCAPE \
665 (CATEGORY_MASK_ISO_7 \
666 | CATEGORY_MASK_ISO_7_TIGHT \
667 | CATEGORY_MASK_ISO_7_ELSE \
668 | CATEGORY_MASK_ISO_8_ELSE)
670 #define CATEGORY_MASK_ISO \
671 ( CATEGORY_MASK_ISO_7BIT \
672 | CATEGORY_MASK_ISO_8BIT \
673 | CATEGORY_MASK_ISO_ELSE)
675 #define CATEGORY_MASK_UTF_16 \
676 (CATEGORY_MASK_UTF_16_BE \
677 | CATEGORY_MASK_UTF_16_LE \
678 | CATEGORY_MASK_UTF_16_BE_NOSIG \
679 | CATEGORY_MASK_UTF_16_LE_NOSIG)
682 /* List of symbols `coding-category-xxx' ordered by priority. This
683 variable is exposed to Emacs Lisp. */
684 static Lisp_Object Vcoding_category_list
;
686 /* Table of coding categories (Lisp symbols). This variable is for
688 static Lisp_Object Vcoding_category_table
;
690 /* Table of coding-categories ordered by priority. */
691 static enum coding_category coding_priorities
[coding_category_max
];
693 /* Nth element is a coding context for the coding system bound to the
694 Nth coding category. */
695 static struct coding_system coding_categories
[coding_category_max
];
697 static int detected_mask
[coding_category_raw_text
] =
705 CATEGORY_MASK_UTF_16
,
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_UTF_16
,
708 CATEGORY_MASK_UTF_16
,
709 CATEGORY_MASK_UTF_16
,
710 CATEGORY_MASK_CHARSET
,
714 CATEGORY_MASK_EMACS_MULE
717 /*** Commonly used macros and functions ***/
720 #define min(a, b) ((a) < (b) ? (a) : (b))
723 #define max(a, b) ((a) > (b) ? (a) : (b))
726 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
728 attrs = CODING_ID_ATTRS (coding->id); \
729 eol_type = CODING_ID_EOL_TYPE (coding->id); \
730 if (VECTORP (eol_type)) \
732 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
736 /* Safely get one byte from the source text pointed by SRC which ends
737 at SRC_END, and set C to that byte. If there are not enough bytes
738 in the source, it jumps to `no_more_source'. The caller
739 should declare and set these variables appropriately in advance:
740 src, src_end, multibytep
743 #define ONE_MORE_BYTE(c) \
745 if (src == src_end) \
747 if (src_base < src) \
748 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
749 goto no_more_source; \
752 if (multibytep && (c & 0x80)) \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
762 #define ONE_MORE_BYTE_NO_CHECK(c) \
765 if (multibytep && (c & 0x80)) \
767 if ((c & 0xFE) != 0xC0) \
768 error ("Undecodable char found"); \
769 c = ((c & 1) << 6) | *src++; \
774 /* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. The caller should
776 assure that C is 0..127, and declare and set the variable `dst'
777 appropriately in advance.
781 #define EMIT_ONE_ASCII_BYTE(c) \
788 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
790 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
792 produced_chars += 2; \
793 *dst++ = (c1), *dst++ = (c2); \
797 /* Store a byte C in the place pointed by DST and increment DST to the
798 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
799 nonzero, store in an appropriate multibyte form. The caller should
800 declare and set the variables `dst' and `multibytep' appropriately
803 #define EMIT_ONE_BYTE(c) \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
820 #define EMIT_TWO_BYTES(c1, c2) \
822 produced_chars += 2; \
829 ch = BYTE8_TO_CHAR (ch); \
830 CHAR_STRING_ADVANCE (ch, dst); \
833 ch = BYTE8_TO_CHAR (ch); \
834 CHAR_STRING_ADVANCE (ch, dst); \
844 #define EMIT_THREE_BYTES(c1, c2, c3) \
846 EMIT_ONE_BYTE (c1); \
847 EMIT_TWO_BYTES (c2, c3); \
851 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
853 EMIT_TWO_BYTES (c1, c2); \
854 EMIT_TWO_BYTES (c3, c4); \
858 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
860 charset_map_loaded = 0; \
861 c = DECODE_CHAR (charset, code); \
862 if (charset_map_loaded) \
864 unsigned char *orig = coding->source; \
867 coding_set_source (coding); \
868 offset = coding->source - orig; \
870 src_base += offset; \
876 #define ASSURE_DESTINATION(bytes) \
878 if (dst + (bytes) >= dst_end) \
880 int more_bytes = charbuf_end - charbuf + (bytes); \
882 dst = alloc_destination (coding, more_bytes, dst); \
883 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte.  Buffer and string sources are handled here;
   any other source is left untouched.  */
890 coding_set_source (coding
)
891 struct coding_system
*coding
;
893 if (BUFFERP (coding
->src_object
))
/* A negative src_pos: address the source relative to GAP_END_ADDR.  */
895 if (coding
->src_pos
< 0)
896 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
/* NOTE(review): what follows is presumably the `else' branch; the
   keyword itself is not visible in this excerpt.  */
899 struct buffer
*buf
= XBUFFER (coding
->src_object
);
900 EMACS_INT beg_byte
= BUF_BEG_BYTE (buf
);
901 EMACS_INT gpt_byte
= BUF_GPT_BYTE (buf
);
902 unsigned char *beg_addr
= BUF_BEG_ADDR (buf
);
/* Offset from the buffer's beginning address by src_pos_byte - 1.  */
904 coding
->source
= beg_addr
+ coding
->src_pos_byte
- 1;
/* A position at or after the gap position must also skip the gap.  */
905 if (coding
->src_pos_byte
>= gpt_byte
)
906 coding
->source
+= BUF_GAP_SIZE (buf
);
909 else if (STRINGP (coding
->src_object
))
/* String source: the string's data plus the byte offset.  */
911 coding
->source
= (XSTRING (coding
->src_object
)->data
912 + coding
->src_pos_byte
);
915 /* Otherwise, the source is C string and is never relocated
916 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when the
   destination object is a buffer.  Any other destination is left
   untouched.  */
921 coding_set_destination (coding
)
922 struct coding_system
*coding
;
924 if (BUFFERP (coding
->dst_object
))
926 /* We are sure that coding->dst_pos_byte is before the gap of the buffer. */
928 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
929 + coding
->dst_pos_byte
- 1);
/* With a negative src_pos, the room ends (src_bytes - consumed) bytes
   before GAP_END_ADDR, i.e. the not yet consumed source bytes are kept
   out of the destination area.  */
930 if (coding
->src_pos
< 0)
931 /* The source and destination is in the same buffer. */
932 coding
->dst_bytes
= (GAP_END_ADDR
933 - (coding
->src_bytes
- coding
->consumed
)
934 - coding
->destination
);
/* NOTE(review): presumably the `else' case (keyword not visible in
   this excerpt): the room extends up to the gap end of the destination
   buffer.  */
936 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
937 - coding
->destination
);
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
/* Enlarge the memory at CODING->destination by BYTES bytes with
   xrealloc, and add BYTES to CODING->dst_bytes.  CODING->destination
   is replaced by the pointer xrealloc returns.  */
947 coding_alloc_by_realloc (coding
, bytes
)
948 struct coding_system
*coding
;
951 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
952 coding
->dst_bytes
+ bytes
);
953 coding
->dst_bytes
+= bytes
;
/* Make room of BYTES bytes for a destination that is a buffer.
   NOTE(review): the call that actually enlarges the gap is not visible
   in this excerpt; only the bookkeeping around it is.  */
957 coding_alloc_by_making_gap (coding
, bytes
)
958 struct coding_system
*coding
;
961 Lisp_Object this_buffer
;
963 this_buffer
= Fcurrent_buffer ();
/* Case 1: the destination is the current buffer.  */
964 if (EQ (this_buffer
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed.  */
966 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily count ADD bytes of the gap as buffer text ...  */
968 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* ... and undo exactly that adjustment afterwards.  */
970 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
/* Case 2 (presumably the `else' branch): switch to the destination
   buffer, then switch back to the buffer that was current on entry.  */
974 set_buffer_internal (XBUFFER (coding
->dst_object
));
976 set_buffer_internal (XBUFFER (this_buffer
));
/* Make NBYTES more bytes of room in the destination of CODING.  DST
   points into the current destination; it is rebased onto the possibly
   relocated CODING->destination.  NOTE(review): the return statement is
   not visible in this excerpt; presumably the rebased DST is
   returned.  */
981 static unsigned char *
982 alloc_destination (coding
, nbytes
, dst
)
983 struct coding_system
*coding
;
/* Remember DST as an offset, since the destination may move.  */
987 EMACS_INT offset
= dst
- coding
->destination
;
/* A buffer destination gets its room from the gap; the other call is
   presumably the `else' branch (keyword not visible here).  */
989 if (BUFFERP (coding
->dst_object
))
990 coding_alloc_by_making_gap (coding
, nbytes
);
992 coding_alloc_by_realloc (coding
, nbytes
);
993 coding
->result
= CODING_RESULT_SUCCESS
;
/* Refresh CODING->destination before rebasing DST on it.  */
994 coding_set_destination (coding
);
995 dst
= coding
->destination
+ offset
;
1000 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1007 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1008 Check if a text is encoded in UTF-8. If it is, return
1009 CATEGORY_MASK_UTF_8, else return 0. */
1011 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1012 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1013 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1014 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1015 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1016 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* Detector for UTF-8.  It examines the SRC_BYTES bytes at
   CODING->source with the UTF_8_*_P predicates and updates *MASK with
   respect to CATEGORY_MASK_UTF_8.  NOTE(review): the statements that
   fetch the bytes and the branch bodies are not visible in this
   excerpt.  */
1019 detect_coding_utf_8 (coding
, mask
)
1020 struct coding_system
*coding
;
1023 unsigned char *src
= coding
->source
, *src_base
= src
;
1024 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1025 int multibytep
= coding
->src_multibyte
;
1026 int consumed_chars
= 0;
1029 /* A coding system of this category is always ASCII compatible. */
1030 src
+= coding
->head_ascii
;
/* C is tested as the leading octet, C1..C4 as the extra octets.  */
1034 int c
, c1
, c2
, c3
, c4
;
1037 if (UTF_8_1_OCTET_P (c
))
1040 if (! UTF_8_EXTRA_OCTET_P (c1
))
1042 if (UTF_8_2_OCTET_LEADING_P (c
))
1048 if (! UTF_8_EXTRA_OCTET_P (c2
))
1050 if (UTF_8_3_OCTET_LEADING_P (c
))
1056 if (! UTF_8_EXTRA_OCTET_P (c3
))
1058 if (UTF_8_4_OCTET_LEADING_P (c
))
1064 if (! UTF_8_EXTRA_OCTET_P (c4
))
1066 if (UTF_8_5_OCTET_LEADING_P (c
))
/* Clear the UTF-8 bit from *MASK.  */
1073 *mask
&= ~CATEGORY_MASK_UTF_8
;
/* Keep only the UTF-8 bit in *MASK.  */
1079 *mask
&= CATEGORY_MASK_UTF_8
;
/* Decoder for UTF-8.  Decoding starts at CODING->source +
   CODING->consumed; the decoded characters are stored in
   CODING->charbuf.  On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  NOTE(review): the byte-fetching
   statements and several branch bodies are not visible in this
   excerpt.  */
1085 decode_coding_utf_8 (coding
)
1086 struct coding_system
*coding
;
1088 unsigned char *src
= coding
->source
+ coding
->consumed
;
1089 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1090 unsigned char *src_base
;
1091 int *charbuf
= coding
->charbuf
;
1092 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1093 int consumed_chars
= 0, consumed_chars_base
;
1094 int multibytep
= coding
->src_multibyte
;
1095 Lisp_Object attr
, eol_type
, charset_list
;
1097 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
/* C is the decoded character; C1 is the leading octet and C2..C5 are
   the extra octets.  */
1101 int c
, c1
, c2
, c3
, c4
, c5
;
/* Remember the count at the start of this character; it is restored
   below and is what gets added to CODING->consumed_char at the end.  */
1104 consumed_chars_base
= consumed_chars
;
/* CHARBUF is full.  */
1106 if (charbuf
>= charbuf_end
)
1110 if (UTF_8_1_OCTET_P(c1
))
/* End-of-line handling depends on EOL_TYPE (Qdos or Qmac).  */
1115 if (EQ (eol_type
, Qdos
))
1118 goto no_more_source
;
1122 else if (EQ (eol_type
, Qmac
))
1129 if (! UTF_8_EXTRA_OCTET_P (c2
))
/* Two octets: 5 bits from C1, 6 bits from C2.  */
1131 if (UTF_8_2_OCTET_LEADING_P (c1
))
1132 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1136 if (! UTF_8_EXTRA_OCTET_P (c3
))
/* Three octets: 4 bits from C1, 6 bits from each of C2 and C3.  */
1138 if (UTF_8_3_OCTET_LEADING_P (c1
))
1139 c
= (((c1
& 0xF) << 12)
1140 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1144 if (! UTF_8_EXTRA_OCTET_P (c4
))
/* Four octets: 3 bits from C1, 6 bits from each of C2..C4.  */
1146 if (UTF_8_4_OCTET_LEADING_P (c1
))
1147 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1148 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1152 if (! UTF_8_EXTRA_OCTET_P (c5
))
/* Five octets: 2 bits from C1, 6 bits from each following octet (the
   tail of this expression is not visible in this excerpt).  */
1154 if (UTF_8_5_OCTET_LEADING_P (c1
))
1156 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1157 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
/* Roll the count back to the start of the current character, then
   store C itself if it is an ASCII byte, else BYTE8_TO_CHAR (C).  */
1174 consumed_chars
= consumed_chars_base
;
1176 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Record how much source was consumed and how many characters were
   produced.  */
1181 coding
->consumed_char
+= consumed_chars_base
;
1182 coding
->consumed
= src_base
- coding
->source
;
1183 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1188 encode_coding_utf_8 (coding
)
1189 struct coding_system
*coding
;
1191 int multibytep
= coding
->dst_multibyte
;
1192 int *charbuf
= coding
->charbuf
;
1193 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1194 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1195 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1196 int produced_chars
= 0;
1201 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1203 while (charbuf
< charbuf_end
)
1205 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1207 ASSURE_DESTINATION (safe_room
);
1209 CHAR_STRING_ADVANCE (c
, pend
);
1210 for (p
= str
; p
< pend
; p
++)
1216 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1218 while (charbuf
< charbuf_end
)
1220 ASSURE_DESTINATION (safe_room
);
1222 dst
+= CHAR_STRING (c
, dst
);
1226 coding
->result
= CODING_RESULT_SUCCESS
;
1227 coding
->produced_char
+= produced_chars
;
1228 coding
->produced
= dst
- coding
->destination
;
1233 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1234 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1235 Little Endian (otherwise). If it is, return
1236 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
/* True if the 16-bit code unit VAL lies in 0xD800..0xDBFF, i.e. its
   top six bits are 110110 (the first unit of a surrogate pair).  */
1239 #define UTF_16_HIGH_SURROGATE_P(val) \
1240 (((val) & 0xFC00) == 0xD800)
/* True if the 16-bit code unit VAL lies in 0xDC00..0xDFFF, i.e. its
   top six bits are 110111 (the second unit of a surrogate pair).  */
1242 #define UTF_16_LOW_SURROGATE_P(val) \
1243 (((val) & 0xFC00) == 0xDC00)
/* True if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL may be
   expanded up to three times, so it must not have side effects.  */
1245 #define UTF_16_INVALID_P(val) \
1246 (((val) == 0xFFFE) \
1247 || ((val) == 0xFFFF) \
1248 || UTF_16_LOW_SURROGATE_P (val))
1252 detect_coding_utf_16 (coding
, mask
)
1253 struct coding_system
*coding
;
1256 unsigned char *src
= coding
->source
, *src_base
= src
;
1257 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1258 int multibytep
= coding
->src_multibyte
;
1259 int consumed_chars
= 0;
1265 if ((c1
== 0xFF) && (c2
== 0xFE))
1267 *mask
&= CATEGORY_MASK_UTF_16_LE
;
1270 else if ((c1
== 0xFE) && (c2
== 0xFF))
1272 *mask
&= CATEGORY_MASK_UTF_16_BE
;
1280 decode_coding_utf_16 (coding
)
1281 struct coding_system
*coding
;
1283 unsigned char *src
= coding
->source
+ coding
->consumed
;
1284 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1285 unsigned char *src_base
;
1286 int *charbuf
= coding
->charbuf
;
1287 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1288 int consumed_chars
= 0, consumed_chars_base
;
1289 int multibytep
= coding
->src_multibyte
;
1290 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1291 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1292 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1293 Lisp_Object attr
, eol_type
, charset_list
;
1295 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1297 if (bom
!= utf_16_without_bom
)
1305 if (bom
== utf_16_with_bom
)
1307 if (endian
== utf_16_big_endian
1308 ? c
!= 0xFFFE : c
!= 0xFEFF)
1310 /* We are sure that there's enough room at CHARBUF. */
1319 CODING_UTF_16_ENDIAN (coding
)
1320 = endian
= utf_16_big_endian
;
1321 else if (c
== 0xFEFF)
1322 CODING_UTF_16_ENDIAN (coding
)
1323 = endian
= utf_16_little_endian
;
1326 CODING_UTF_16_ENDIAN (coding
)
1327 = endian
= utf_16_big_endian
;
1331 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1339 consumed_chars_base
= consumed_chars
;
1341 if (charbuf
+ 2 >= charbuf_end
)
1346 c
= (endian
== utf_16_big_endian
1347 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1350 if (! UTF_16_LOW_SURROGATE_P (c
))
1352 if (endian
== utf_16_big_endian
)
1353 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1355 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1359 if (UTF_16_HIGH_SURROGATE_P (c
))
1360 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1366 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1367 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1373 if (UTF_16_HIGH_SURROGATE_P (c
))
1374 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1381 coding
->consumed_char
+= consumed_chars_base
;
1382 coding
->consumed
= src_base
- coding
->source
;
1383 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1387 encode_coding_utf_16 (coding
)
1388 struct coding_system
*coding
;
1390 int multibytep
= coding
->dst_multibyte
;
1391 int *charbuf
= coding
->charbuf
;
1392 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1393 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1394 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1396 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1397 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1398 int produced_chars
= 0;
1399 Lisp_Object attrs
, eol_type
, charset_list
;
1402 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1404 if (bom
== utf_16_with_bom
)
1406 ASSURE_DESTINATION (safe_room
);
1408 EMIT_TWO_BYTES (0xFF, 0xFE);
1410 EMIT_TWO_BYTES (0xFE, 0xFF);
1411 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1414 while (charbuf
< charbuf_end
)
1416 ASSURE_DESTINATION (safe_room
);
1418 if (c
>= MAX_UNICODE_CHAR
)
1419 c
= coding
->default_char
;
1424 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1426 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1433 c1
= (c
>> 10) + 0xD800;
1434 c2
= (c
& 0x3FF) + 0xDC00;
1436 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1438 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1441 coding
->result
= CODING_RESULT_SUCCESS
;
1442 coding
->produced
= dst
- coding
->destination
;
1443 coding
->produced_char
+= produced_chars
;
1448 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1450 /* Emacs' internal format for representation of multiple character
1451 sets is a kind of multi-byte encoding, i.e. characters are
1452 represented by variable-length sequences of one-byte codes.
1454 ASCII characters and control characters (e.g. `tab', `newline') are
1455 represented by one-byte sequences which are their ASCII codes, in
1456 the range 0x00 through 0x7F.
1458 8-bit characters of the range 0x80..0x9F are represented by
1459 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1462 8-bit characters of the range 0xA0..0xFF are represented by
1463 one-byte sequences which are their 8-bit code.
1465 The other characters are represented by a sequence of `base
1466 leading-code', optional `extended leading-code', and one or two
1467 `position-code's. The length of the sequence is determined by the
1468 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1469 whereas extended leading-code and position-code take the range 0xA0
1470 through 0xFF. See `charset.h' for more details about leading-code
1473 --- CODE RANGE of Emacs' internal format ---
1477 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1478 eight-bit-graphic 0xA0..0xFF
1479 ELSE 0x81..0x9D + [0xA0..0xFF]+
1480 ---------------------------------------------
1482 As this is the internal character representation, the format is
1483 usually not used externally (i.e. in a file or in data sent to a
1484 process). But, it is possible to have a text externally in this
1485 format (i.e. by encoding by the coding system `emacs-mule').
1487 In that case, a sequence of one-byte codes has a slightly different
1490 At first, all characters in eight-bit-control are represented by
1491 one-byte sequences which are their 8-bit code.
1493 Next, character composition data are represented by the byte
1494 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1496 METHOD is 0xF0 plus one of composition method (enum
1497 composition_method),
1499 BYTES is 0xA0 plus a byte length of this composition data,
1501 CHARS is 0xA0 plus a number of characters composed by this
1504 COMPONENTs are characters of multibyte form or composition
1505 rules encoded by two-byte of ASCII codes.
1507 In addition, for backward compatibility, the following formats are
1508 also recognized as composition data on decoding.
1511 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1514 MSEQ is a multibyte form but in one of these special formats:
1515 ASCII: 0xA0 ASCII_CODE+0x80,
1516 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1517 RULE is a one byte code of the range 0xA0..0xF0 that
1518 represents a composition rule.
/* Table indexed by a byte value (256 entries).  The decoder switches
   on the entry for a leading byte, and the detector compares the entry
   for a sequence's first byte with the number of bytes it consumed, so
   each entry presumably holds the total byte length of the sequence
   introduced by that byte -- TODO confirm where the table is filled.  */
1521 char emacs_mule_bytes
[256];
1523 /* Leading-codes that are followed by an extended leading-code.
   EMACS_MULE_LEADING_CODES emits these same values (0x9A..0x9D) as the
   first code, with the charset id as the second.  */
1524 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1525 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1526 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1527 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1531 emacs_mule_char (coding
, composition
, nbytes
, nchars
)
1532 struct coding_system
*coding
;
1534 int *nbytes
, *nchars
;
1536 unsigned char *src
= coding
->source
+ coding
->consumed
;
1537 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1538 int multibytep
= coding
->src_multibyte
;
1539 unsigned char *src_base
= src
;
1540 struct charset
*charset
;
1543 int consumed_chars
= 0;
1554 *nbytes
= src
- src_base
;
1555 *nchars
= consumed_chars
;
1560 switch (emacs_mule_bytes
[c
])
1563 if (! (charset
= emacs_mule_charset
[c
]))
1570 if (c
== LEADING_CODE_PRIVATE_11
1571 || c
== LEADING_CODE_PRIVATE_12
)
1574 if (! (charset
= emacs_mule_charset
[c
]))
1581 if (! (charset
= emacs_mule_charset
[c
]))
1584 code
= (c
& 0x7F) << 7;
1591 if (! (charset
= emacs_mule_charset
[c
]))
1594 code
= (c
& 0x7F) << 7;
1601 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
) ? charset_ascii
1602 : code
< 0xA0 ? charset_8_bit_control
1603 : charset_8_bit_graphic
);
1609 c
= DECODE_CHAR (charset
, code
);
1612 *nbytes
= src
- src_base
;
1613 *nchars
= consumed_chars
;
1624 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1625 Check if a text is encoded in `emacs-mule'. */
1628 detect_coding_emacs_mule (coding
, mask
)
1629 struct coding_system
*coding
;
1632 unsigned char *src
= coding
->source
, *src_base
= src
;
1633 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1634 int multibytep
= coding
->src_multibyte
;
1635 int consumed_chars
= 0;
1639 /* A coding system of this category is always ASCII compatible. */
1640 src
+= coding
->head_ascii
;
1648 /* Perhaps the start of composite character. We simply skip
1649 it because analyzing it is too heavy for detecting. But,
1650 at least, we check that the composite character
1651 consists of more than 4 bytes. */
1652 unsigned char *src_base
;
1662 if (src
- src_base
<= 4)
1672 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1677 unsigned char *src_base
= src
- 1;
1684 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1689 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1695 *mask
&= CATEGORY_MASK_EMACS_MULE
;
1700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1702 /* Decode a character represented as a component of composition
1703 sequence of Emacs 20/21 style at SRC. Set C to that character and
1704 update SRC to the head of next character (or an encoded composition
1705 rule). If SRC doesn't point to a composition component, set C to -1.
1706 If SRC points to an invalid byte sequence, global exit by a return
1709 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1713 int nbytes, nchars; \
1715 if (src == src_end) \
1717 c = emacs_mule_char (coding, 1, &nbytes, &nchars); \
1722 goto invalid_code; \
1726 consumed_chars += nchars; \
/* Decode a composition rule that appears as one component of an
   Emacs 20 style composition sequence at SRC, and append the rule,
   encoded by COMPOSITION_ENCODE_RULE, to BUF (BUF is advanced by one
   element and SRC by one byte).

   A rule is one byte in the range 0xA0..0xF0.  After the 0xA0 bias is
   removed, the value is GREF * 9 + NREF and must be less than 81.  If
   no byte is left at SRC, or the byte is outside that range, control
   jumps to the label `invalid_code' of the expanding function.

   The expansion site must provide `src', `src_end', the label
   `invalid_code', and the macros ONE_MORE_BYTE_NO_CHECK and
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(buf)                 \
  do {                                                          \
    int c, gref, nref;                                          \
                                                                \
    /* ONE_MORE_BYTE_NO_CHECK (going by its name) does not      \
       test for the end of the source, so reject the sequence   \
       when the source is exhausted -- not when bytes remain.   \
       The former test `src < src_end' was inverted.  */        \
    if (src >= src_end)                                         \
      goto invalid_code;                                        \
    ONE_MORE_BYTE_NO_CHECK (c);                                 \
    /* Remove the 0xA0 bias of an encoded rule byte.  */        \
    c -= 0xA0;                                                  \
    if (c < 0 || c >= 81)                                       \
      goto invalid_code;                                        \
                                                                \
    gref = c / 9, nref = c % 9;                                 \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);              \
  } while (0)
1751 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1754 *buf++ = coding->produced_char + char_offset; \
1755 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1761 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1763 /* Emacs 21 style format. The first three bytes at SRC are \
1764 (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1765 the byte length of this composition information, CHARS is the \
1766 number of characters composed by this composition. */ \
1767 enum composition_method method = c - 0xF0; \
1768 int consumed_chars_limit; \
1769 int nbytes, nchars; \
1771 ONE_MORE_BYTE (c); \
1772 nbytes = c - 0xA0; \
1774 goto invalid_code; \
1775 ONE_MORE_BYTE (c); \
1776 nchars = c - 0xA0; \
1777 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1778 consumed_chars_limit = consumed_chars_base + nbytes; \
1779 if (method != COMPOSITION_RELATIVE) \
1782 while (consumed_chars < consumed_chars_limit) \
1784 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1785 DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \
1787 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1789 if (consumed_chars < consumed_chars_limit) \
1790 goto invalid_code; \
1795 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1797 /* Emacs 20 style format for relative composition. */ \
1798 /* Store multibyte form of characters to be composed. */ \
1799 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1800 int *buf = components; \
1804 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1805 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1806 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1808 goto invalid_code; \
1809 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1810 for (j = 0; j < i; j++) \
1811 *charbuf++ = components[j]; \
1815 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1817 /* Emacs 20 style format for rule-base composition. */ \
1818 /* Store multibyte form of characters to be composed. */ \
1819 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1820 int *buf = components; \
1823 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1824 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1826 DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \
1827 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1829 if (i < 1 || (buf - components) % 2 == 0) \
1830 goto invalid_code; \
1831 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1832 goto no_more_source; \
1833 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1834 for (j = 0; j < i; j++) \
1835 *charbuf++ = components[j]; \
1836 for (j = 0; j < i; j += 2) \
1837 *charbuf++ = components[j]; \
1842 decode_coding_emacs_mule (coding
)
1843 struct coding_system
*coding
;
1845 unsigned char *src
= coding
->source
+ coding
->consumed
;
1846 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1847 unsigned char *src_base
;
1848 int *charbuf
= coding
->charbuf
;
1849 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1850 int consumed_chars
= 0, consumed_chars_base
;
1851 int char_offset
= 0;
1852 int multibytep
= coding
->src_multibyte
;
1853 Lisp_Object attrs
, eol_type
, charset_list
;
1855 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1862 consumed_chars_base
= consumed_chars
;
1864 if (charbuf
>= charbuf_end
)
1873 if (EQ (eol_type
, Qdos
))
1876 goto no_more_source
;
1880 else if (EQ (eol_type
, Qmac
))
1888 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
1891 if (c
- 0xF0 >= COMPOSITION_RELATIVE
1892 && c
- 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1893 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1895 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1897 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1901 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1905 c
= emacs_mule_char (coding
, 0, &nbytes
, &nchars
);
1919 consumed_chars
= consumed_chars_base
;
1921 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1926 coding
->consumed_char
+= consumed_chars_base
;
1927 coding
->consumed
= src_base
- coding
->source
;
1928 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1932 #define EMACS_MULE_LEADING_CODES(id, codes) \
1935 codes[0] = id, codes[1] = 0; \
1936 else if (id < 0xE0) \
1937 codes[0] = 0x9A, codes[1] = id; \
1938 else if (id < 0xF0) \
1939 codes[0] = 0x9B, codes[1] = id; \
1940 else if (id < 0xF5) \
1941 codes[0] = 0x9C, codes[1] = id; \
1943 codes[0] = 0x9D, codes[1] = id; \
1948 encode_coding_emacs_mule (coding
)
1949 struct coding_system
*coding
;
1951 int multibytep
= coding
->dst_multibyte
;
1952 int *charbuf
= coding
->charbuf
;
1953 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1954 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1955 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1957 int produced_chars
= 0;
1958 Lisp_Object attrs
, eol_type
, charset_list
;
1961 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1963 while (charbuf
< charbuf_end
)
1965 ASSURE_DESTINATION (safe_room
);
1967 if (ASCII_CHAR_P (c
))
1968 EMIT_ONE_ASCII_BYTE (c
);
1971 struct charset
*charset
;
1975 unsigned char leading_codes
[2];
1977 charset
= char_charset (c
, charset_list
, &code
);
1980 c
= coding
->default_char
;
1981 if (ASCII_CHAR_P (c
))
1983 EMIT_ONE_ASCII_BYTE (c
);
1986 charset
= char_charset (c
, charset_list
, &code
);
1988 dimension
= CHARSET_DIMENSION (charset
);
1989 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
1990 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
1991 EMIT_ONE_BYTE (leading_codes
[0]);
1992 if (leading_codes
[1])
1993 EMIT_ONE_BYTE (leading_codes
[1]);
1995 EMIT_ONE_BYTE (code
);
1998 EMIT_ONE_BYTE (code
>> 8);
1999 EMIT_ONE_BYTE (code
& 0xFF);
2003 coding
->result
= CODING_RESULT_SUCCESS
;
2004 coding
->produced_char
+= produced_chars
;
2005 coding
->produced
= dst
- coding
->destination
;
2010 /*** 7. ISO2022 handlers ***/
2012 /* The following note describes the coding system ISO2022 briefly.
2013 Since the intention of this note is to help understand the
2014 functions in this file, some parts are NOT ACCURATE or OVERLY
2015 SIMPLIFIED. For thorough understanding, please refer to the
2016 original document of ISO2022.
2018 ISO2022 provides many mechanisms to encode several character sets
2019 in 7-bit and 8-bit environments. For 7-bit environments, all text
2020 is encoded using bytes less than 128. This may make the encoded
2021 text a little bit longer, but the text passes more easily through
2022 several gateways, some of which strip off MSB (Most Significant Bit).
2024 There are two kinds of character sets: control character set and
2025 graphic character set. The former contains control characters such
2026 as `newline' and `escape' to provide control functions (control
2027 functions are also provided by escape sequences). The latter
2028 contains graphic characters such as 'A' and '-'. Emacs recognizes
2029 two control character sets and many graphic character sets.
2031 Graphic character sets are classified into one of the following
2032 four classes, according to the number of bytes (DIMENSION) and
2033 number of characters in one dimension (CHARS) of the set:
2034 - DIMENSION1_CHARS94
2035 - DIMENSION1_CHARS96
2036 - DIMENSION2_CHARS94
2037 - DIMENSION2_CHARS96
2039 In addition, each character set is assigned an identification tag,
2040 unique for each set, called "final character" (denoted as <F>
2041 hereafter). The <F> of each character set is decided by ECMA(*)
2042 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2043 (0x30..0x3F are for private use only).
2045 Note (*): ECMA = European Computer Manufacturers Association
2047 Here are examples of graphic character set [NAME(<F>)]:
2048 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2049 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2050 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2051 o DIMENSION2_CHARS96 -- none for the moment
2053 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2054 C0 [0x00..0x1F] -- control character plane 0
2055 GL [0x20..0x7F] -- graphic character plane 0
2056 C1 [0x80..0x9F] -- control character plane 1
2057 GR [0xA0..0xFF] -- graphic character plane 1
2059 A control character set is directly designated and invoked to C0 or
2060 C1 by an escape sequence. The most common case is that:
2061 - ISO646's control character set is designated/invoked to C0, and
2062 - ISO6429's control character set is designated/invoked to C1,
2063 and usually these designations/invocations are omitted in encoded
2064 text. In a 7-bit environment, only C0 can be used, and a control
2065 character for C1 is encoded by an appropriate escape sequence to
2066 fit into the environment. All control characters for C1 are
2067 defined to have corresponding escape sequences.
2069 A graphic character set is at first designated to one of four
2070 graphic registers (G0 through G3), then these graphic registers are
2071 invoked to GL or GR. These designations and invocations can be
2072 done independently. The most common case is that G0 is invoked to
2073 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2074 these invocations and designations are omitted in encoded text.
2075 In a 7-bit environment, only GL can be used.
2077 When a graphic character set of CHARS94 is invoked to GL, codes
2078 0x20 and 0x7F of the GL area work as control characters SPACE and
2079 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2082 There are two ways of invocation: locking-shift and single-shift.
2083 With locking-shift, the invocation lasts until the next different
2084 invocation, whereas with single-shift, the invocation affects the
2085 following character only and doesn't affect the locking-shift
2086 state. Invocations are done by the following control characters or
2089 ----------------------------------------------------------------------
2090 abbrev function cntrl escape seq description
2091 ----------------------------------------------------------------------
2092 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2093 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2094 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2095 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2096 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2097 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2098 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2099 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2100 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2101 ----------------------------------------------------------------------
2102 (*) These are not used by any known coding system.
2104 Control characters for these functions are defined by macros
2105 ISO_CODE_XXX in `coding.h'.
2107 Designations are done by the following escape sequences:
2108 ----------------------------------------------------------------------
2109 escape sequence description
2110 ----------------------------------------------------------------------
2111 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2112 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2113 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2114 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2115 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2116 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2117 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2118 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2119 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2120 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2121 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2122 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2123 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2124 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2125 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2126 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2127 ----------------------------------------------------------------------
2129 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2130 of dimension 1, chars 94, and final character <F>, etc...
2132 Note (*): Although these designations are not allowed in ISO2022,
2133 Emacs accepts them on decoding, and produces them on encoding
2134 CHARS96 character sets in a coding system which is characterized as
2135 7-bit environment, non-locking-shift, and non-single-shift.
2137 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2138 '(' must be omitted. We refer to this as "short-form" hereafter.
2140 Now you may notice that there are a lot of ways for encoding the
2141 same multilingual text in ISO2022. Actually, there exist many
2142 coding systems such as Compound Text (used in X11's inter client
2143 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2144 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2145 localized platforms), and all of these are variants of ISO2022.
2147 In addition to the above, Emacs handles two more kinds of escape
2148 sequences: ISO6429's direction specification and Emacs' private
2149 sequence for specifying character composition.
2151 ISO6429's direction specification takes the following form:
2152 o CSI ']' -- end of the current direction
2153 o CSI '0' ']' -- end of the current direction
2154 o CSI '1' ']' -- start of left-to-right text
2155 o CSI '2' ']' -- start of right-to-left text
2156 The control character CSI (0x9B: control sequence introducer) is
2157 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2159 Character composition specification takes the following form:
2160 o ESC '0' -- start relative composition
2161 o ESC '1' -- end composition
2162 o ESC '2' -- start rule-base composition (*)
2163 o ESC '3' -- start relative composition with alternate chars (**)
2164 o ESC '4' -- start rule-base composition with alternate chars (**)
2165 Since these are not standard escape sequences of any ISO standard,
2166 the use of them for these meaning is restricted to Emacs only.
2168 (*) This form is used only in Emacs 20.5 and the older versions,
2169 but the newer versions can safely decode it.
2170 (**) This form is used only in Emacs 21.1 and the newer versions,
2171 and the older versions can't decode it.
2173 Here's a list of example usages of these composition escape
2174 sequences (categorized by `enum composition_method').
2176 COMPOSITION_RELATIVE:
2177 ESC 0 CHAR [ CHAR ] ESC 1
2178 COMPOSITION_WITH_RULE:
2179 ESC 2 CHAR [ RULE CHAR ] ESC 1
2180 COMPOSITION_WITH_ALTCHARS:
2181 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2182 COMPOSITION_WITH_RULE_ALTCHARS:
2183 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table of 256 `enum iso_code_class_type' values; presumably indexed
   by a byte value to give that byte's ISO2022 code class -- TODO
   confirm against the code that fills and reads it.  */
2185 enum iso_code_class_type iso_code_class
[256];
/* True if charset ID is not above CODING's max_charset_id and its
   entry in CODING's safe_charsets table is non-negative.  ID is
   expanded twice, and no lower bound is checked, so callers must pass
   a non-negative ID without side effects.
   NOTE(review): safe_charsets is assigned through a `char *' cast in
   detect_coding_iso_2022, so `>= 0' depends on whether plain char is
   signed -- confirm the fill value used for unsupported charsets.  */
2187 #define SAFE_CHARSET_P(coding, id) \
2188 ((id) <= (coding)->max_charset_id \
2189 && (coding)->safe_charsets[id] >= 0)
/* True if CODING_ISO_INITIAL for index 1 of the coding system stored
   in coding_categories[CATEGORY] is non-negative.  Index 1 presumably
   names graphic register G1, the target of a locking shift-out --
   TODO confirm against the definition of CODING_ISO_INITIAL.  */
2192 #define SHIFT_OUT_OK(category) \
2193 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2196 setup_iso_safe_charsets (Lisp_Object attrs
)
2198 Lisp_Object charset_list
, safe_charsets
;
2199 Lisp_Object request
;
2200 Lisp_Object reg_usage
;
2203 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2206 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2207 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2208 && ! EQ (charset_list
, Viso_2022_charset_list
))
2210 CODING_ATTR_CHARSET_LIST (attrs
)
2211 = charset_list
= Viso_2022_charset_list
;
2212 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2215 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2219 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2221 int id
= XINT (XCAR (tail
));
2222 if (max_charset_id
< id
)
2223 max_charset_id
= id
;
2226 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2228 request
= AREF (attrs
, coding_attr_iso_request
);
2229 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2230 reg94
= XINT (XCAR (reg_usage
));
2231 reg96
= XINT (XCDR (reg_usage
));
2233 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2237 struct charset
*charset
;
2240 charset
= CHARSET_FROM_ID (XINT (id
));
2241 reg
= Fcdr (Fassq (request
, id
));
2243 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2244 else if (charset
->iso_chars_96
)
2247 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2252 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2255 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2260 Check if a text is encoded in ISO2022. If it is, returns an
2261 integer in which appropriate flag bits any of:
2263 CATEGORY_MASK_ISO_7_TIGHT
2264 CATEGORY_MASK_ISO_8_1
2265 CATEGORY_MASK_ISO_8_2
2266 CATEGORY_MASK_ISO_7_ELSE
2267 CATEGORY_MASK_ISO_8_ELSE
2268 are set. If a code which should never appear in ISO2022 is found,
2272 detect_coding_iso_2022 (coding
, mask
)
2273 struct coding_system
*coding
;
2276 unsigned char *src
= coding
->source
, *src_base
= src
;
2277 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2278 int multibytep
= coding
->src_multibyte
;
2279 int mask_iso
= CATEGORY_MASK_ISO
;
2280 int mask_found
= 0, mask_8bit_found
= 0;
2281 int reg
[4], shift_out
= 0, single_shifting
= 0;
2284 int consumed_chars
= 0;
2287 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2289 struct coding_system
*this = &(coding_categories
[i
]);
2290 Lisp_Object attrs
, val
;
2292 attrs
= CODING_ID_ATTRS (this->id
);
2293 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2294 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2295 setup_iso_safe_charsets (attrs
);
2296 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2297 this->max_charset_id
= XSTRING (val
)->size
- 1;
2298 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2301 /* A coding system of this category is always ASCII compatible. */
2302 src
+= coding
->head_ascii
;
2304 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2305 while (mask_iso
&& src
< src_end
)
2311 if (inhibit_iso_escape_detection
)
2313 single_shifting
= 0;
2315 if (c
>= '(' && c
<= '/')
2317 /* Designation sequence for a charset of dimension 1. */
2319 if (c1
< ' ' || c1
>= 0x80
2320 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2321 /* Invalid designation sequence. Just ignore. */
2323 reg
[(c
- '(') % 4] = id
;
2327 /* Designation sequence for a charset of dimension 2. */
2329 if (c
>= '@' && c
<= 'B')
2330 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2331 reg
[0] = id
= iso_charset_table
[1][0][c
];
2332 else if (c
>= '(' && c
<= '/')
2335 if (c1
< ' ' || c1
>= 0x80
2336 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2337 /* Invalid designation sequence. Just ignore. */
2339 reg
[(c
- '(') % 4] = id
;
2342 /* Invalid designation sequence. Just ignore. */
2345 else if (c
== 'N' || c
== 'O')
2347 /* ESC <Fe> for SS2 or SS3. */
2348 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2351 else if (c
>= '0' && c
<= '4')
2353 /* ESC <Fp> for start/end composition. */
2354 mask_found
|= CATEGORY_MASK_ISO
;
2359 /* Invalid escape sequence. */
2360 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2364 /* We found a valid designation sequence for CHARSET. */
2365 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2366 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2368 mask_found
|= CATEGORY_MASK_ISO_7
;
2370 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2371 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2373 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2375 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2376 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2378 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2380 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2381 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2383 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2385 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2389 if (inhibit_iso_escape_detection
)
2391 single_shifting
= 0;
2394 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2395 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2397 /* Locking shift out. */
2398 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2399 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2404 if (inhibit_iso_escape_detection
)
2406 single_shifting
= 0;
2409 /* Locking shift in. */
2410 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2411 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2416 single_shifting
= 0;
2420 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2422 if (inhibit_iso_escape_detection
)
2424 if (c
!= ISO_CODE_CSI
)
2426 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2427 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2428 newmask
|= CATEGORY_MASK_ISO_8_1
;
2429 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2430 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2431 newmask
|= CATEGORY_MASK_ISO_8_2
;
2432 single_shifting
= 1;
2434 if (VECTORP (Vlatin_extra_code_table
)
2435 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2437 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2438 & CODING_ISO_FLAG_LATIN_EXTRA
)
2439 newmask
|= CATEGORY_MASK_ISO_8_1
;
2440 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2441 & CODING_ISO_FLAG_LATIN_EXTRA
)
2442 newmask
|= CATEGORY_MASK_ISO_8_2
;
2444 mask_iso
&= newmask
;
2445 mask_found
|= newmask
;
2452 single_shifting
= 0;
2457 single_shifting
= 0;
2458 mask_8bit_found
= 1;
2459 if (VECTORP (Vlatin_extra_code_table
)
2460 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2464 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2465 & CODING_ISO_FLAG_LATIN_EXTRA
)
2466 newmask
|= CATEGORY_MASK_ISO_8_1
;
2467 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2468 & CODING_ISO_FLAG_LATIN_EXTRA
)
2469 newmask
|= CATEGORY_MASK_ISO_8_2
;
2470 mask_iso
&= newmask
;
2471 mask_found
|= newmask
;
2478 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2479 | CATEGORY_MASK_ISO_7_ELSE
);
2480 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2481 mask_8bit_found
= 1;
2482 /* Check the length of succeeding codes of the range
2483 0xA0..0xFF. If the byte length is odd, we exclude
2484 CATEGORY_MASK_ISO_8_2. We can check this only
2485 when we are not single shifting. */
2486 if (!single_shifting
2487 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2490 while (src
< src_end
)
2498 if (i
& 1 && src
< src_end
)
2499 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2501 mask_found
|= CATEGORY_MASK_ISO_8_2
;
2510 *mask
&= ~CATEGORY_MASK_ISO
;
2515 *mask
&= mask_iso
& mask_found
;
2516 if (! mask_8bit_found
)
2517 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2522 /* Set designation state into CODING. */
2523 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2527 if (final < '0' || final >= 128 \
2528 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2529 || !SAFE_CHARSET_P (coding, id)) \
2531 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2532 goto invalid_code; \
2534 prev = CODING_ISO_DESIGNATION (coding, reg); \
2535 CODING_ISO_DESIGNATION (coding, reg) = id; \
2536 /* If there was an invalid designation to REG previously, and this \
2537 designation is ASCII to REG, we should keep this designation \
2539 if (prev == -2 && id == charset_ascii) \
2540 goto invalid_code; \
2544 #define MAYBE_FINISH_COMPOSITION() \
2547 if (composition_state == COMPOSING_NO) \
2549 /* It is assured that we have enough room for producing \
2550 characters stored in the table `components'. */ \
2551 if (charbuf + component_idx > charbuf_end) \
2552 goto no_more_source; \
2553 composition_state = COMPOSING_NO; \
2554 if (method == COMPOSITION_RELATIVE \
2555 || method == COMPOSITION_WITH_ALTCHARS) \
2557 for (i = 0; i < component_idx; i++) \
2558 *charbuf++ = components[i]; \
2559 char_offset += component_idx; \
2563 for (i = 0; i < component_idx; i += 2) \
2564 *charbuf++ = components[i]; \
2565 char_offset += (component_idx / 2) + 1; \
2570 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2571 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2572 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2573 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2574 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2577 #define DECODE_COMPOSITION_START(c1) \
2580 && composition_state == COMPOSING_COMPONENT_CHAR) \
2582 component_len = component_idx; \
2583 composition_state = COMPOSING_CHAR; \
2589 MAYBE_FINISH_COMPOSITION (); \
2590 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2591 goto no_more_source; \
2592 for (p = src; p < src_end - 1; p++) \
2593 if (*p == ISO_CODE_ESC && p[1] == '1') \
2595 if (p == src_end - 1) \
2597 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2598 goto invalid_code; \
2599 goto no_more_source; \
2602 /* This is surely the start of a composition. */ \
2603 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2604 : c1 == '2' ? COMPOSITION_WITH_RULE \
2605 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2606 : COMPOSITION_WITH_RULE_ALTCHARS); \
2607 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2608 : COMPOSING_COMPONENT_CHAR); \
2609 component_idx = component_len = 0; \
2614 /* Handle composition end sequence ESC 1. */
2616 #define DECODE_COMPOSITION_END() \
2618 int nchars = (component_len > 0 ? component_idx - component_len \
2619 : method == COMPOSITION_RELATIVE ? component_idx \
2620 : (component_idx + 1) / 2); \
2622 int *saved_charbuf = charbuf; \
2624 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2625 if (method != COMPOSITION_RELATIVE) \
2627 if (component_len == 0) \
2628 for (i = 0; i < component_idx; i++) \
2629 *charbuf++ = components[i]; \
2631 for (i = 0; i < component_len; i++) \
2632 *charbuf++ = components[i]; \
2633 *saved_charbuf = saved_charbuf - charbuf; \
2635 if (method == COMPOSITION_WITH_RULE) \
2636 for (i = 0; i < component_idx; i += 2, char_offset++) \
2637 *charbuf++ = components[i]; \
2639 for (i = component_len; i < component_idx; i++, char_offset++) \
2640 *charbuf++ = components[i]; \
2641 coding->annotated = 1; \
2642 composition_state = COMPOSING_NO; \
2646 /* Decode a composition rule from the byte C1 (and maybe one more byte
2647 from SRC) and store one encoded composition rule in
2648 coding->cmp_data. */
2650 #define DECODE_COMPOSITION_RULE(c1) \
2653 if (c1 < 81) /* old format (before ver.21) */ \
2655 int gref = (c1) / 9; \
2656 int nref = (c1) % 9; \
2657 if (gref == 4) gref = 10; \
2658 if (nref == 4) nref = 10; \
2659 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2661 else if (c1 < 93) /* new format (after ver.21) */ \
2663 ONE_MORE_BYTE (c2); \
2664 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2671 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode ISO-2022 bytes from CODING->source (starting at offset
   CODING->consumed) into character codes stored in CODING->charbuf.
   Before returning, CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  */
2674 decode_coding_iso_2022 (coding
)
2675 struct coding_system
*coding
;
2677 unsigned char *src
= coding
->source
+ coding
->consumed
;
2678 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2679 unsigned char *src_base
;
2680 int *charbuf
= coding
->charbuf
;
2681 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2682 int consumed_chars
= 0, consumed_chars_base
;
2683 int char_offset
= 0;
2684 int multibytep
= coding
->src_multibyte
;
2685 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2686 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2687 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2688 struct charset
*charset
;
2690 /* For handling composition sequence. */
2691 #define COMPOSING_NO 0
2692 #define COMPOSING_CHAR 1
2693 #define COMPOSING_RULE 2
2694 #define COMPOSING_COMPONENT_CHAR 3
2695 #define COMPOSING_COMPONENT_RULE 4
2697 int composition_state
= COMPOSING_NO
;
2698 enum composition_method method
;
2699 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2702 Lisp_Object attrs
, eol_type
, charset_list
;
2704 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2705 setup_iso_safe_charsets (attrs
);
2712 consumed_chars_base
= consumed_chars
;
2714 if (charbuf
>= charbuf_end
)
2719 /* We produce no character or one character. */
2720 switch (iso_code_class
[c1
])
2722 case ISO_0x20_or_0x7F
:
2723 if (composition_state
!= COMPOSING_NO
)
2725 if (composition_state
== COMPOSING_RULE
2726 || composition_state
== COMPOSING_COMPONENT_RULE
)
2728 DECODE_COMPOSITION_RULE (c1
);
2729 components
[component_idx
++] = c1
;
/* Decrementing maps COMPOSING_RULE (2) to COMPOSING_CHAR (1) and
   COMPOSING_COMPONENT_RULE (4) to COMPOSING_COMPONENT_CHAR (3).  */
2730 composition_state
--;
2733 else if (method
== COMPOSITION_WITH_RULE
)
2734 composition_state
= COMPOSING_RULE
;
/* NOTE(review): the assignment below stores the very value the
   condition has just tested for, so it is a no-op.  By analogy with
   the COMPOSITION_WITH_RULE branch above, COMPOSING_COMPONENT_RULE
   looks like the intended new state -- confirm.  */
2735 else if (method
== COMPOSITION_WITH_RULE_ALTCHARS
2736 && composition_state
== COMPOSING_COMPONENT_CHAR
)
2737 composition_state
= COMPOSING_COMPONENT_CHAR
;
2739 if (charset_id_0
< 0
2740 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2742 /* This is SPACE or DEL. */
2743 charset
= CHARSET_FROM_ID (charset_ascii
);
2746 /* This is a graphic character, we fall down ... */
2748 case ISO_graphic_plane_0
:
2749 if (composition_state
== COMPOSING_RULE
)
2751 DECODE_COMPOSITION_RULE (c1
);
2752 components
[component_idx
++] = c1
;
2753 composition_state
= COMPOSING_CHAR
;
2755 charset
= CHARSET_FROM_ID (charset_id_0
);
2758 case ISO_0xA0_or_0xFF
:
2759 if (charset_id_1
< 0
2760 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2761 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2763 /* This is a graphic character, we fall down ... */
2765 case ISO_graphic_plane_1
:
2766 if (charset_id_1
< 0)
2768 charset
= CHARSET_FROM_ID (charset_id_1
);
2771 case ISO_carriage_return
:
2774 if (EQ (eol_type
, Qdos
))
2777 goto no_more_source
;
2781 else if (EQ (eol_type
, Qmac
))
2787 MAYBE_FINISH_COMPOSITION ();
2788 charset
= CHARSET_FROM_ID (charset_ascii
);
2792 MAYBE_FINISH_COMPOSITION ();
2796 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2797 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2799 CODING_ISO_INVOCATION (coding
, 0) = 1;
2800 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2804 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2806 CODING_ISO_INVOCATION (coding
, 0) = 0;
2807 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2810 case ISO_single_shift_2_7
:
2811 case ISO_single_shift_2
:
2812 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2814 /* SS2 is handled as an escape sequence of ESC 'N' */
2816 goto label_escape_sequence
;
2818 case ISO_single_shift_3
:
2819 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2821 /* SS3 is handled as an escape sequence of ESC 'O' */
2823 goto label_escape_sequence
;
2825 case ISO_control_sequence_introducer
:
2826 /* CSI is handled as an escape sequence of ESC '[' ... */
2828 goto label_escape_sequence
;
2832 label_escape_sequence
:
2833 /* Escape sequences handled here are invocation,
2834 designation, direction specification, and character
2835 composition specification. */
2838 case '&': /* revision of following character set */
2840 if (!(c1
>= '@' && c1
<= '~'))
2843 if (c1
!= ISO_CODE_ESC
)
2846 goto label_escape_sequence
;
2848 case '$': /* designation of 2-byte character set */
2849 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2852 if (c1
>= '@' && c1
<= 'B')
2853 { /* designation of JISX0208.1978, GB2312.1980,
2855 DECODE_DESIGNATION (0, 2, 0, c1
);
2857 else if (c1
>= 0x28 && c1
<= 0x2B)
2858 { /* designation of DIMENSION2_CHARS94 character set */
2860 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2862 else if (c1
>= 0x2C && c1
<= 0x2F)
2863 { /* designation of DIMENSION2_CHARS96 character set */
2865 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2869 /* We must update these variables now. */
2870 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2871 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2874 case 'n': /* invocation of locking-shift-2 */
2875 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2876 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2878 CODING_ISO_INVOCATION (coding
, 0) = 2;
2879 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2882 case 'o': /* invocation of locking-shift-3 */
2883 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2884 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2886 CODING_ISO_INVOCATION (coding
, 0) = 3;
2887 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2890 case 'N': /* invocation of single-shift-2 */
2891 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2892 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2894 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2896 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2900 case 'O': /* invocation of single-shift-3 */
2901 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2902 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2904 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2906 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2910 case '0': case '2': case '3': case '4': /* start composition */
2911 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2913 DECODE_COMPOSITION_START (c1
);
2916 case '1': /* end composition */
2917 if (composition_state
== COMPOSING_NO
)
2919 DECODE_COMPOSITION_END ();
2922 case '[': /* specification of direction */
/* NOTE(review): unary `!' binds tighter than binary `&', so the test
   below evaluates (! CODING_ISO_FLAGS (coding)) & CODING_ISO_FLAG_DIRECTION
   rather than ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION),
   which is the form every other flag test in this function uses.
   Probably a missing pair of parentheses -- confirm and fix.  */
2923 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
2925 /* For the moment, nested direction is not supported.
2926 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2927 left-to-right, and nonzero means right-to-left. */
2931 case ']': /* end of the current direction */
2932 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2934 case '0': /* end of the current direction */
2935 case '1': /* start of left-to-right direction */
2938 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2943 case '2': /* start of right-to-left direction */
2946 coding
->mode
|= CODING_MODE_DIRECTION
;
2957 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2959 if (c1
>= 0x28 && c1
<= 0x2B)
2960 { /* designation of DIMENSION1_CHARS94 character set */
2962 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
2964 else if (c1
>= 0x2C && c1
<= 0x2F)
2965 { /* designation of DIMENSION1_CHARS96 character set */
2967 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
2971 /* We must update these variables now. */
2972 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2973 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2978 /* Now we know CHARSET and 1st position code C1 of a character.
2979 Produce a decoded character while getting 2nd position code
2982 if (CHARSET_DIMENSION (charset
) > 1)
2985 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
2986 /* C2 is not in a valid range. */
2988 c1
= (c1
<< 8) | (c2
& 0x7F);
2989 if (CHARSET_DIMENSION (charset
) > 2)
2992 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
2993 /* C2 is not in a valid range. */
2995 c1
= (c1
<< 8) | (c2
& 0x7F);
2999 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3002 MAYBE_FINISH_COMPOSITION ();
3003 for (; src_base
< src
; src_base
++, char_offset
++)
3005 if (ASCII_BYTE_P (*src_base
))
3006 *charbuf
++ = *src_base
;
3008 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3011 else if (composition_state
== COMPOSING_NO
)
3017 components
[component_idx
++] = c
;
3021 MAYBE_FINISH_COMPOSITION ();
3023 consumed_chars
= consumed_chars_base
;
3025 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Publish how far decoding got back into CODING.  */
3030 coding
->consumed_char
+= consumed_chars_base
;
3031 coding
->consumed
= src_base
- coding
->source
;
3032 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3036 /* ISO2022 encoding stuff. */
3039 It is not enough to say just "ISO2022" on encoding, we have to
3040 specify more details. In Emacs, each coding system of ISO2022
3041 variant has the following specifications:
3042 1. Initial designation to G0 thru G3.
3043 2. Allows short-form designation?
3044 3. ASCII should be designated to G0 before control characters?
3045 4. ASCII should be designated to G0 at end of line?
3046 5. 7-bit environment or 8-bit environment?
3047 6. Use locking-shift?
3048 7. Use Single-shift?
3049 And the following two are only for Japanese:
3050 8. Use ASCII in place of JIS0201-1976-Roman?
3051 9. Use JISX0208-1983 in place of JISX0208-1978?
3052 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3053 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3057 /* Produce codes (escape sequence) for designating CHARSET to graphic
3058 register REG at DST, and increment DST. If <final-char> of CHARSET is
3059 '@', 'A', or 'B' and the coding system CODING allows, produce
3060 designation sequence of short-form. */
3062 #define ENCODE_DESIGNATION(charset, reg, coding) \
3064 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3065 char *intermediate_char_94 = "()*+"; \
3066 char *intermediate_char_96 = ",-./"; \
3067 int revision = -1; \
3070 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3071 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3073 if (revision >= 0) \
3075 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3076 EMIT_ONE_BYTE ('@' + revision); \
3078 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3079 if (CHARSET_DIMENSION (charset) == 1) \
3081 if (! CHARSET_ISO_CHARS_96 (charset)) \
3082 c = intermediate_char_94[reg]; \
3084 c = intermediate_char_96[reg]; \
3085 EMIT_ONE_ASCII_BYTE (c); \
3089 EMIT_ONE_ASCII_BYTE ('$'); \
3090 if (! CHARSET_ISO_CHARS_96 (charset)) \
3092 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3094 || final_char < '@' || final_char > 'B') \
3095 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3098 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3100 EMIT_ONE_ASCII_BYTE (final_char); \
3102 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3106 /* The following two macros produce codes (control character or escape
3107 sequence) for ISO2022 single-shift functions (single-shift-2 and
3110 #define ENCODE_SINGLE_SHIFT_2 \
3112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3113 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3115 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3116 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3120 #define ENCODE_SINGLE_SHIFT_3 \
3122 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3125 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3126 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3130 /* The following four macros produce codes (control character or
3131 escape sequence) for ISO2022 locking-shift functions (shift-in,
3132 shift-out, locking-shift-2, and locking-shift-3). */
3134 #define ENCODE_SHIFT_IN \
3136 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3137 CODING_ISO_INVOCATION (coding, 0) = 0; \
3141 #define ENCODE_SHIFT_OUT \
3143 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3144 CODING_ISO_INVOCATION (coding, 0) = 1; \
3148 #define ENCODE_LOCKING_SHIFT_2 \
3150 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3151 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Produce the locking-shift-3 escape sequence and record that graphic
   register 3 is now invoked to graphic plane 0.  Locking-shift-3 is
   ESC 'o'; ESC 'n' is locking-shift-2 (see the `case 'n'' and
   `case 'o'' arms of the ISO-2022 decoder), so the final byte emitted
   here must be 'o'.  */
#define ENCODE_LOCKING_SHIFT_3			\
  do {						\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');	\
    CODING_ISO_INVOCATION (coding, 0) = 3;	\
  } while (0)
3162 /* Produce codes for a DIMENSION1 character whose character set is
3163 CHARSET and whose position-code is C1. Designation and invocation
3164 sequences are also produced in advance if necessary. */
3166 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3168 int id = CHARSET_ID (charset); \
3169 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3171 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3172 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3174 EMIT_ONE_BYTE (c1 | 0x80); \
3175 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3178 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3180 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3183 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3185 EMIT_ONE_BYTE (c1 | 0x80); \
3189 /* Since CHARSET is not yet invoked to any graphic planes, we \
3190 must invoke it, or, at first, designate it to some graphic \
3191 register. Then repeat the loop to actually produce the \
3193 dst = encode_invocation_designation (charset, coding, dst, \
3198 /* Produce codes for a DIMENSION2 character whose character set is
3199 CHARSET and whose position-codes are C1 and C2. Designation and
3200 invocation codes are also produced in advance if necessary. */
3202 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3204 int id = CHARSET_ID (charset); \
3205 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3207 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3208 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3210 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3211 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3214 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3216 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3219 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3221 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3225 /* Since CHARSET is not yet invoked to any graphic planes, we \
3226 must invoke it, or, at first, designate it to some graphic \
3227 register. Then repeat the loop to actually produce the \
3229 dst = encode_invocation_designation (charset, coding, dst, \
3234 #define ENCODE_ISO_CHARACTER(charset, c) \
3236 int code = ENCODE_CHAR ((charset),(c)); \
3238 if (CHARSET_DIMENSION (charset) == 1) \
3239 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3241 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3245 /* Produce designation and invocation codes at a place pointed by DST
3246 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3250 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3251 struct charset
*charset
;
3252 struct coding_system
*coding
;
3256 int multibytep
= coding
->dst_multibyte
;
3257 int produced_chars
= *p_nchars
;
3258 int reg
; /* graphic register number */
3259 int id
= CHARSET_ID (charset
);
3261 /* At first, check designations. */
3262 for (reg
= 0; reg
< 4; reg
++)
3263 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3268 /* CHARSET is not yet designated to any graphic registers. */
3269 /* At first check the requested designation. */
3270 reg
= CODING_ISO_REQUEST (coding
, id
);
3272 /* Since CHARSET requests no special designation, designate it
3273 to graphic register 0. */
3276 ENCODE_DESIGNATION (charset
, reg
, coding
);
3279 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3280 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3282 /* Since the graphic register REG is not invoked to any graphic
3283 planes, invoke it to graphic plane 0. */
3286 case 0: /* graphic register 0 */
3290 case 1: /* graphic register 1 */
3294 case 2: /* graphic register 2 */
3295 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3296 ENCODE_SINGLE_SHIFT_2
;
3298 ENCODE_LOCKING_SHIFT_2
;
3301 case 3: /* graphic register 3 */
3302 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3303 ENCODE_SINGLE_SHIFT_3
;
3305 ENCODE_LOCKING_SHIFT_3
;
3310 *p_nchars
= produced_chars
;
3314 /* The following three macros produce codes for indicating direction
/* Produce a Control Sequence Introducer: the two-byte form ESC '['
   when the coding system has CODING_ISO_FLAG_SEVEN_BITS set, else the
   single byte ISO_CODE_CSI.  The flag is tested with `&' (as the
   single-shift macros do); comparing the whole flags word with `=='
   would select the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Produce the sequence announcing right-to-left text: a Control
   Sequence Introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   invoked without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its expansion and fail to compile.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Produce the sequence announcing left-to-right text: a Control
   Sequence Introducer followed by '0' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   invoked without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its expansion and fail to compile.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3339 /* Produce codes for designation and invocation to reset the graphic
3340 planes and registers to initial state. */
3341 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3344 struct charset *charset; \
3346 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3348 for (reg = 0; reg < 4; reg++) \
3349 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3350 && (CODING_ISO_DESIGNATION (coding, reg) \
3351 != CODING_ISO_INITIAL (coding, reg))) \
3353 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3354 ENCODE_DESIGNATION (charset, reg, coding); \
3359 /* Produce designation sequences of charsets in the line started from
3360 SRC to a place pointed by DST, and return updated DST.
3362 If the current block ends before any end-of-line, we may fail to
3363 find all the necessary designations. */
3365 static unsigned char *
3366 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3367 struct coding_system
*coding
;
3368 int *charbuf
, *charbuf_end
;
3371 struct charset
*charset
;
3372 /* Table of charsets to be designated to each graphic register. */
3374 int c
, found
= 0, reg
;
3375 int produced_chars
= 0;
3376 int multibytep
= coding
->dst_multibyte
;
3378 Lisp_Object charset_list
;
3380 attrs
= CODING_ID_ATTRS (coding
->id
);
3381 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3382 if (EQ (charset_list
, Qiso_2022
))
3383 charset_list
= Viso_2022_charset_list
;
3385 for (reg
= 0; reg
< 4; reg
++)
3395 charset
= char_charset (c
, charset_list
, NULL
);
3396 id
= CHARSET_ID (charset
);
3397 reg
= CODING_ISO_REQUEST (coding
, id
);
3398 if (reg
>= 0 && r
[reg
] < 0)
3407 for (reg
= 0; reg
< 4; reg
++)
3409 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3410 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3416 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3419 encode_coding_iso_2022 (coding
)
3420 struct coding_system
*coding
;
3422 int multibytep
= coding
->dst_multibyte
;
3423 int *charbuf
= coding
->charbuf
;
3424 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3425 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3426 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3429 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3430 && CODING_ISO_BOL (coding
));
3431 int produced_chars
= 0;
3432 Lisp_Object attrs
, eol_type
, charset_list
;
3433 int ascii_compatible
;
3436 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3438 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3440 while (charbuf
< charbuf_end
)
3442 ASSURE_DESTINATION (safe_room
);
3444 if (bol_designation
)
3446 unsigned char *dst_prev
= dst
;
3448 /* We have to produce designation sequences if any now. */
3449 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3450 bol_designation
= 0;
3451 /* We are sure that designation sequences are all ASCII bytes. */
3452 produced_chars
+= dst
- dst_prev
;
3457 /* Now encode the character C. */
3458 if (c
< 0x20 || c
== 0x7F)
3461 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3463 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3464 ENCODE_RESET_PLANE_AND_REGISTER ();
3465 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3469 for (i
= 0; i
< 4; i
++)
3470 CODING_ISO_DESIGNATION (coding
, i
)
3471 = CODING_ISO_INITIAL (coding
, i
);
3474 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3476 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3477 ENCODE_RESET_PLANE_AND_REGISTER ();
3478 EMIT_ONE_ASCII_BYTE (c
);
3480 else if (ASCII_CHAR_P (c
))
3482 if (ascii_compatible
)
3483 EMIT_ONE_ASCII_BYTE (c
);
3485 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii
), c
);
3489 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3493 c
= coding
->default_char
;
3494 charset
= char_charset (c
, charset_list
, NULL
);
3496 ENCODE_ISO_CHARACTER (charset
, c
);
3500 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3501 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3503 ASSURE_DESTINATION (safe_room
);
3504 ENCODE_RESET_PLANE_AND_REGISTER ();
3506 coding
->result
= CODING_RESULT_SUCCESS
;
3507 CODING_ISO_BOL (coding
) = bol_designation
;
3508 coding
->produced_char
+= produced_chars
;
3509 coding
->produced
= dst
- coding
->destination
;
3514 /*** 8,9. SJIS and BIG5 handlers ***/
3516 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3517 quite widely. So, for the moment, Emacs supports them in the bare
3518 C code. But, in the future, they may be supported only by CCL. */
3520 /* SJIS is a coding system encoding three character sets: ASCII, right
3521 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3522 as is. A character of charset katakana-jisx0201 is encoded by
3523 "position-code + 0x80". A character of charset japanese-jisx0208
3524 is encoded in 2-byte but two position-codes are divided and shifted
3525 so that it fit in the range below.
3527 --- CODE RANGE of SJIS ---
3528 (character set) (range)
3530 KATAKANA-JISX0201 0xA0 .. 0xDF
3531 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3532 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3533 -------------------------------
3537 /* BIG5 is a coding system encoding two character sets: ASCII and
3538 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3539 character set and is encoded in two-byte.
3541 --- CODE RANGE of BIG5 ---
3542 (character set) (range)
3544 Big5 (1st byte) 0xA1 .. 0xFE
3545 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3546 --------------------------
3550 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3551 Check if a text is encoded in SJIS. If it is, return
3552 CATEGORY_MASK_SJIS, else return 0. */
3555 detect_coding_sjis (coding
, mask
)
3556 struct coding_system
*coding
;
3559 unsigned char *src
= coding
->source
, *src_base
= src
;
3560 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3561 int multibytep
= coding
->src_multibyte
;
3562 int consumed_chars
= 0;
3566 /* A coding system of this category is always ASCII compatible. */
3567 src
+= coding
->head_ascii
;
3574 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3577 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3581 else if (c
>= 0xA0 && c
< 0xE0)
3586 *mask
&= ~CATEGORY_MASK_SJIS
;
3592 *mask
&= CATEGORY_MASK_SJIS
;
3596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3597 Check if a text is encoded in BIG5. If it is, return
3598 CATEGORY_MASK_BIG5, else return 0. */
3601 detect_coding_big5 (coding
, mask
)
3602 struct coding_system
*coding
;
3605 unsigned char *src
= coding
->source
, *src_base
= src
;
3606 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3607 int multibytep
= coding
->src_multibyte
;
3608 int consumed_chars
= 0;
3612 /* A coding system of this category is always ASCII compatible. */
3613 src
+= coding
->head_ascii
;
3623 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3630 *mask
&= ~CATEGORY_MASK_BIG5
;
3636 *mask
&= CATEGORY_MASK_BIG5
;
3640 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3641 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3644 decode_coding_sjis (coding
)
3645 struct coding_system
*coding
;
3647 unsigned char *src
= coding
->source
+ coding
->consumed
;
3648 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3649 unsigned char *src_base
;
3650 int *charbuf
= coding
->charbuf
;
3651 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3652 int consumed_chars
= 0, consumed_chars_base
;
3653 int multibytep
= coding
->src_multibyte
;
3654 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3655 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3657 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3660 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3661 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3662 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3669 consumed_chars_base
= consumed_chars
;
3671 if (charbuf
>= charbuf_end
)
3678 if (EQ (eol_type
, Qdos
))
3681 goto no_more_source
;
3685 else if (EQ (eol_type
, Qmac
))
3690 struct charset
*charset
;
3693 charset
= charset_roman
;
3698 if (c
< 0xA0 || c
>= 0xE0)
3700 /* SJIS -> JISX0208 */
3702 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3706 charset
= charset_kanji
;
3709 /* SJIS -> JISX0201-Kana */
3710 charset
= charset_kana
;
3712 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3719 consumed_chars
= consumed_chars_base
;
3721 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3726 coding
->consumed_char
+= consumed_chars_base
;
3727 coding
->consumed
= src_base
- coding
->source
;
3728 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3732 decode_coding_big5 (coding
)
3733 struct coding_system
*coding
;
3735 unsigned char *src
= coding
->source
+ coding
->consumed
;
3736 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3737 unsigned char *src_base
;
3738 int *charbuf
= coding
->charbuf
;
3739 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3740 int consumed_chars
= 0, consumed_chars_base
;
3741 int multibytep
= coding
->src_multibyte
;
3742 struct charset
*charset_roman
, *charset_big5
;
3743 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3745 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3747 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3748 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3755 consumed_chars_base
= consumed_chars
;
3757 if (charbuf
>= charbuf_end
)
3764 if (EQ (eol_type
, Qdos
))
3767 goto no_more_source
;
3771 else if (EQ (eol_type
, Qmac
))
3776 struct charset
*charset
;
3778 charset
= charset_roman
;
3782 if (c
< 0xA1 || c
> 0xFE)
3785 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3788 charset
= charset_big5
;
3790 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3798 consumed_chars
= consumed_chars_base
;
3800 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3805 coding
->consumed_char
+= consumed_chars_base
;
3806 coding
->consumed
= src_base
- coding
->source
;
3807 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3810 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3811 This function can encode charsets `ascii', `katakana-jisx0201',
3812 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3813 are sure that all these charsets are registered as official charset
3814 (i.e. do not have extended leading-codes). Characters of other
3815 charsets are produced without any encoding. If SJIS_P is 1, encode
3816 SJIS text, else encode BIG5 text. */
3819 encode_coding_sjis (coding
)
3820 struct coding_system
*coding
;
3822 int multibytep
= coding
->dst_multibyte
;
3823 int *charbuf
= coding
->charbuf
;
3824 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3825 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3826 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3828 int produced_chars
= 0;
3829 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3830 int ascii_compatible
;
3831 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3834 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3836 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3837 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3838 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3840 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3842 while (charbuf
< charbuf_end
)
3844 ASSURE_DESTINATION (safe_room
);
3846 /* Now encode the character C. */
3847 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3848 EMIT_ONE_ASCII_BYTE (c
);
3852 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3856 c
= coding
->default_char
;
3857 charset
= char_charset (c
, charset_list
, &code
);
3859 if (code
== CHARSET_INVALID_CODE (charset
))
3861 if (charset
== charset_kanji
)
3865 c1
= code
>> 8, c2
= code
& 0xFF;
3866 EMIT_TWO_BYTES (c1
, c2
);
3868 else if (charset
== charset_kana
)
3869 EMIT_ONE_BYTE (code
| 0x80);
3871 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3874 coding
->result
= CODING_RESULT_SUCCESS
;
3875 coding
->produced_char
+= produced_chars
;
3876 coding
->produced
= dst
- coding
->destination
;
3881 encode_coding_big5 (coding
)
3882 struct coding_system
*coding
;
3884 int multibytep
= coding
->dst_multibyte
;
3885 int *charbuf
= coding
->charbuf
;
3886 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3887 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3888 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3890 int produced_chars
= 0;
3891 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3892 int ascii_compatible
;
3893 struct charset
*charset_roman
, *charset_big5
;
3896 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3898 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3899 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3900 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3902 while (charbuf
< charbuf_end
)
3904 ASSURE_DESTINATION (safe_room
);
3906 /* Now encode the character C. */
3907 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3908 EMIT_ONE_ASCII_BYTE (c
);
3912 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3916 c
= coding
->default_char
;
3917 charset
= char_charset (c
, charset_list
, &code
);
3919 if (code
== CHARSET_INVALID_CODE (charset
))
3921 if (charset
== charset_big5
)
3925 c1
= code
>> 8, c2
= code
& 0xFF;
3926 EMIT_TWO_BYTES (c1
, c2
);
3929 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3932 coding
->result
= CODING_RESULT_SUCCESS
;
3933 coding
->produced_char
+= produced_chars
;
3934 coding
->produced
= dst
- coding
->destination
;
3939 /*** 9. CCL handlers ***/
3941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3942 Check if a text is encoded in a coding system of which
3943 encoder/decoder are written in CCL program. If it is, return
3944 CATEGORY_MASK_CCL, else return 0. */
3947 detect_coding_ccl (coding
, mask
)
3948 struct coding_system
*coding
;
3951 unsigned char *src
= coding
->source
, *src_base
= src
;
3952 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3953 int multibytep
= coding
->src_multibyte
;
3954 int consumed_chars
= 0;
3956 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
3957 int head_ascii
= coding
->head_ascii
;
3960 coding
= &coding_categories
[coding_category_ccl
];
3961 attrs
= CODING_ID_ATTRS (coding
->id
);
3962 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
3971 if (!found
&& valids
[c
] > 1)
3974 *mask
&= ~CATEGORY_MASK_CCL
;
3980 *mask
&= CATEGORY_MASK_CCL
;
3985 decode_coding_ccl (coding
)
3986 struct coding_system
*coding
;
3988 unsigned char *src
= coding
->source
+ coding
->consumed
;
3989 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3990 int *charbuf
= coding
->charbuf
;
3991 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3992 int consumed_chars
= 0;
3993 int multibytep
= coding
->src_multibyte
;
3994 struct ccl_program ccl
;
3995 int source_charbuf
[1024];
3996 int source_byteidx
[1024];
3998 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4000 while (src
< src_end
)
4002 unsigned char *p
= src
;
4003 int *source
, *source_end
;
4007 while (i
< 1024 && p
< src_end
)
4009 source_byteidx
[i
] = p
- src
;
4010 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4013 while (i
< 1024 && p
< src_end
)
4014 source_charbuf
[i
++] = *p
++;
4016 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4019 source
= source_charbuf
;
4020 source_end
= source
+ i
;
4021 while (source
< source_end
)
4023 ccl_driver (&ccl
, source
, charbuf
,
4024 source_end
- source
, charbuf_end
- charbuf
);
4025 source
+= ccl
.consumed
;
4026 charbuf
+= ccl
.produced
;
4027 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4030 if (source
< source_end
)
4031 src
+= source_byteidx
[source
- source_charbuf
];
4034 consumed_chars
+= source
- source_charbuf
;
4036 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4037 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4043 case CCL_STAT_SUSPEND_BY_SRC
:
4044 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4046 case CCL_STAT_SUSPEND_BY_DST
:
4049 case CCL_STAT_INVALID_CMD
:
4050 coding
->result
= CODING_RESULT_INTERRUPT
;
4053 coding
->result
= CODING_RESULT_SUCCESS
;
4056 coding
->consumed_char
+= consumed_chars
;
4057 coding
->consumed
= src
- coding
->source
;
4058 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4062 encode_coding_ccl (coding
)
4063 struct coding_system
*coding
;
4065 struct ccl_program ccl
;
4066 int multibytep
= coding
->dst_multibyte
;
4067 int *charbuf
= coding
->charbuf
;
4068 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4069 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4070 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4071 unsigned char *adjusted_dst_end
= dst_end
- 1;
4072 int destination_charbuf
[1024];
4073 int i
, produced_chars
= 0;
4075 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4077 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4078 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4080 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4082 int dst_bytes
= dst_end
- dst
;
4083 if (dst_bytes
> 1024)
4086 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4087 charbuf_end
- charbuf
, dst_bytes
);
4088 charbuf
+= ccl
.consumed
;
4090 for (i
= 0; i
< ccl
.produced
; i
++)
4091 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4094 for (i
= 0; i
< ccl
.produced
; i
++)
4095 *dst
++ = destination_charbuf
[i
] & 0xFF;
4096 produced_chars
+= ccl
.produced
;
4102 case CCL_STAT_SUSPEND_BY_SRC
:
4103 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4105 case CCL_STAT_SUSPEND_BY_DST
:
4106 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4109 case CCL_STAT_INVALID_CMD
:
4110 coding
->result
= CODING_RESULT_INTERRUPT
;
4113 coding
->result
= CODING_RESULT_SUCCESS
;
4117 coding
->produced_char
+= produced_chars
;
4118 coding
->produced
= dst
- coding
->destination
;
4124 /*** 10, 11. no-conversion handlers ***/
4126 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4129 decode_coding_raw_text (coding
)
4130 struct coding_system
*coding
;
4132 coding
->chars_at_source
= 1;
4133 coding
->consumed_char
= coding
->src_chars
;
4134 coding
->consumed
= coding
->src_bytes
;
4135 coding
->result
= CODING_RESULT_SUCCESS
;
4139 encode_coding_raw_text (coding
)
4140 struct coding_system
*coding
;
4142 int multibytep
= coding
->dst_multibyte
;
4143 int *charbuf
= coding
->charbuf
;
4144 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4145 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4146 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4147 int produced_chars
= 0;
4152 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4154 if (coding
->src_multibyte
)
4155 while (charbuf
< charbuf_end
)
4157 ASSURE_DESTINATION (safe_room
);
4159 if (ASCII_CHAR_P (c
))
4160 EMIT_ONE_ASCII_BYTE (c
);
4161 else if (CHAR_BYTE8_P (c
))
4163 c
= CHAR_TO_BYTE8 (c
);
4168 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4170 CHAR_STRING_ADVANCE (c
, p1
);
4172 EMIT_ONE_BYTE (*p0
);
4176 while (charbuf
< charbuf_end
)
4178 ASSURE_DESTINATION (safe_room
);
4185 if (coding
->src_multibyte
)
4187 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4189 while (charbuf
< charbuf_end
)
4191 ASSURE_DESTINATION (safe_room
);
4193 if (ASCII_CHAR_P (c
))
4195 else if (CHAR_BYTE8_P (c
))
4196 *dst
++ = CHAR_TO_BYTE8 (c
);
4198 CHAR_STRING_ADVANCE (c
, dst
);
4204 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4205 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4206 *dst
++ = *charbuf
++;
4207 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4210 coding
->result
= CODING_RESULT_SUCCESS
;
4211 coding
->produced_char
+= produced_chars
;
4212 coding
->produced
= dst
- coding
->destination
;
4217 detect_coding_charset (coding
, mask
)
4218 struct coding_system
*coding
;
4221 unsigned char *src
= coding
->source
, *src_base
= src
;
4222 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4223 int multibytep
= coding
->src_multibyte
;
4224 int consumed_chars
= 0;
4225 Lisp_Object attrs
, valids
;
4227 coding
= &coding_categories
[coding_category_charset
];
4228 attrs
= CODING_ID_ATTRS (coding
->id
);
4229 valids
= AREF (attrs
, coding_attr_charset_valids
);
4231 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4232 src
+= coding
->head_ascii
;
4239 if (NILP (AREF (valids
, c
)))
4242 *mask
&= ~CATEGORY_MASK_CHARSET
;
4246 *mask
&= CATEGORY_MASK_CHARSET
;
4251 decode_coding_charset (coding
)
4252 struct coding_system
*coding
;
4254 unsigned char *src
= coding
->source
+ coding
->consumed
;
4255 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4256 unsigned char *src_base
;
4257 int *charbuf
= coding
->charbuf
;
4258 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4259 int consumed_chars
= 0, consumed_chars_base
;
4260 int multibytep
= coding
->src_multibyte
;
4261 struct charset
*charset
;
4262 Lisp_Object attrs
, eol_type
, charset_list
;
4264 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4265 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
4272 consumed_chars_base
= consumed_chars
;
4274 if (charbuf
>= charbuf_end
)
4280 if (EQ (eol_type
, Qdos
))
4283 goto no_more_source
;
4287 else if (EQ (eol_type
, Qmac
))
4292 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
4301 consumed_chars
= consumed_chars_base
;
4303 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4308 coding
->consumed_char
+= consumed_chars_base
;
4309 coding
->consumed
= src_base
- coding
->source
;
4310 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4314 encode_coding_charset (coding
)
4315 struct coding_system
*coding
;
4317 int multibytep
= coding
->dst_multibyte
;
4318 int *charbuf
= coding
->charbuf
;
4319 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4320 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4321 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4322 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4323 int produced_chars
= 0;
4324 struct charset
*charset
;
4325 Lisp_Object attrs
, eol_type
, charset_list
;
4326 int ascii_compatible
;
4329 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4330 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
4331 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4333 while (charbuf
< charbuf_end
)
4337 ASSURE_DESTINATION (safe_room
);
4339 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4340 EMIT_ONE_ASCII_BYTE (c
);
4341 else if ((code
= ENCODE_CHAR (charset
, c
))
4342 != CHARSET_INVALID_CODE (charset
))
4343 EMIT_ONE_BYTE (code
);
4345 EMIT_ONE_BYTE (coding
->default_char
);
4348 coding
->result
= CODING_RESULT_SUCCESS
;
4349 coding
->produced_char
+= produced_chars
;
4350 coding
->produced
= dst
- coding
->destination
;
4355 /*** 10. C library functions ***/
4357 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4358 has a property `coding-system'. The value of this property is a
4359 vector of length 5 (called the coding-vector). Among the elements of
4360 this vector, the first (element[0]) and the fifth (element[4])
4361 carry important information for decoding/encoding. Before
4362 decoding/encoding, this information should be set in fields of a
4363 structure of type `coding_system'.
4365 A value of property `coding-system' can be a symbol of another
4366 subsidiary coding-system. In that case, Emacs gets coding-vector
4369 `element[0]' contains information to be set in `coding->type'. The
4370 values and their meanings are as follows:
4372 0 -- coding_type_emacs_mule
4373 1 -- coding_type_sjis
4374 2 -- coding_type_iso_2022
4375 3 -- coding_type_big5
4376 4 -- coding_type_ccl encoder/decoder written in CCL
4377 nil -- coding_type_no_conversion
4378 t -- coding_type_undecided (automatic conversion on decoding,
4379 no-conversion on encoding)
4381 `element[4]' contains information to be set in `coding->flags' and
4382 `coding->spec'. The meaning varies by `coding->type'.
4384 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4385 of length 32 (of which the first 13 sub-elements are used now).
4386 Meanings of these sub-elements are:
4388 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4389 If the value is an integer of valid charset, the charset is
4390 assumed to be designated to graphic register N initially.
4392 If the value is negative, it is the negated value of a charset which
4393 reserves graphic register N, which means that the charset is
4394 not designated initially but should be designated to graphic
4395 register N just before encoding a character in that charset.
4397 If the value is nil, graphic register N is never used on
4400 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4401 Each value takes t or nil. See the section ISO2022 of
4402 `coding.h' for more information.
4404 If `coding->type' is `coding_type_big5', element[4] is t to denote
4405 BIG5-ETen or nil to denote BIG5-HKU.
4407 If `coding->type' takes the other value, element[4] is ignored.
4409 Emacs Lisp's coding system also carries information about format of
4410 end-of-line in a value of property `eol-type'. If the value is
4411 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4412 it is not integer, it should be a vector of subsidiary coding
4413 systems of which property `eol-type' has one of above values.
4417 /* Setup coding context CODING from information about CODING_SYSTEM.
4418 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4419 CODING_SYSTEM is invalid, signal an error. */
4422 setup_coding_system (coding_system
, coding
)
4423 Lisp_Object coding_system
;
4424 struct coding_system
*coding
;
4427 Lisp_Object eol_type
;
4428 Lisp_Object coding_type
;
4431 if (NILP (coding_system
))
4432 coding_system
= Qno_conversion
;
4434 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4436 attrs
= CODING_ID_ATTRS (coding
->id
);
4437 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4440 coding
->head_ascii
= -1;
4441 coding
->common_flags
4442 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4444 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4445 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4446 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4447 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4449 coding_type
= CODING_ATTR_TYPE (attrs
);
4450 if (EQ (coding_type
, Qundecided
))
4452 coding
->detector
= NULL
;
4453 coding
->decoder
= decode_coding_raw_text
;
4454 coding
->encoder
= encode_coding_raw_text
;
4455 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4457 else if (EQ (coding_type
, Qiso_2022
))
4460 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4462 /* Invoke graphic register 0 to plane 0. */
4463 CODING_ISO_INVOCATION (coding
, 0) = 0;
4464 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4465 CODING_ISO_INVOCATION (coding
, 1)
4466 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4467 /* Setup the initial status of designation. */
4468 for (i
= 0; i
< 4; i
++)
4469 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4470 /* Not single shifting initially. */
4471 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4472 /* Beginning of buffer should also be regarded as bol. */
4473 CODING_ISO_BOL (coding
) = 1;
4474 coding
->detector
= detect_coding_iso_2022
;
4475 coding
->decoder
= decode_coding_iso_2022
;
4476 coding
->encoder
= encode_coding_iso_2022
;
4477 if (flags
& CODING_ISO_FLAG_SAFE
)
4478 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4479 coding
->common_flags
4480 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4481 | CODING_REQUIRE_FLUSHING_MASK
);
4482 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4483 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4484 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4486 setup_iso_safe_charsets (attrs
);
4487 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4488 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4489 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4491 CODING_ISO_FLAGS (coding
) = flags
;
4493 else if (EQ (coding_type
, Qcharset
))
4495 coding
->detector
= detect_coding_charset
;
4496 coding
->decoder
= decode_coding_charset
;
4497 coding
->encoder
= encode_coding_charset
;
4498 coding
->common_flags
4499 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4501 else if (EQ (coding_type
, Qutf_8
))
4503 coding
->detector
= detect_coding_utf_8
;
4504 coding
->decoder
= decode_coding_utf_8
;
4505 coding
->encoder
= encode_coding_utf_8
;
4506 coding
->common_flags
4507 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4509 else if (EQ (coding_type
, Qutf_16
))
4511 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4512 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4513 : EQ (val
, Qt
) ? utf_16_with_bom
4514 : utf_16_without_bom
);
4515 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4516 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4517 : utf_16_little_endian
);
4518 CODING_UTF_16_SURROGATE (coding
) = 0;
4519 coding
->detector
= detect_coding_utf_16
;
4520 coding
->decoder
= decode_coding_utf_16
;
4521 coding
->encoder
= encode_coding_utf_16
;
4522 coding
->common_flags
4523 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4525 else if (EQ (coding_type
, Qccl
))
4527 coding
->detector
= detect_coding_ccl
;
4528 coding
->decoder
= decode_coding_ccl
;
4529 coding
->encoder
= encode_coding_ccl
;
4530 coding
->common_flags
4531 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4532 | CODING_REQUIRE_FLUSHING_MASK
);
4534 else if (EQ (coding_type
, Qemacs_mule
))
4536 coding
->detector
= detect_coding_emacs_mule
;
4537 coding
->decoder
= decode_coding_emacs_mule
;
4538 coding
->encoder
= encode_coding_emacs_mule
;
4539 coding
->common_flags
4540 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4541 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4542 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4544 Lisp_Object tail
, safe_charsets
;
4545 int max_charset_id
= 0;
4547 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4549 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4550 max_charset_id
= XFASTINT (XCAR (tail
));
4551 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4553 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4555 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4556 coding
->max_charset_id
= max_charset_id
;
4557 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4560 else if (EQ (coding_type
, Qshift_jis
))
4562 coding
->detector
= detect_coding_sjis
;
4563 coding
->decoder
= decode_coding_sjis
;
4564 coding
->encoder
= encode_coding_sjis
;
4565 coding
->common_flags
4566 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4568 else if (EQ (coding_type
, Qbig5
))
4570 coding
->detector
= detect_coding_big5
;
4571 coding
->decoder
= decode_coding_big5
;
4572 coding
->encoder
= encode_coding_big5
;
4573 coding
->common_flags
4574 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4576 else /* EQ (coding_type, Qraw_text) */
4578 coding
->detector
= NULL
;
4579 coding
->decoder
= decode_coding_raw_text
;
4580 coding
->encoder
= encode_coding_raw_text
;
4581 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4587 /* Return raw-text or one of its subsidiaries that has the same
4588 eol_type as CODING-SYSTEM. */
4591 raw_text_coding_system (coding_system
)
4592 Lisp_Object coding_system
;
4594 Lisp_Object spec
, attrs
;
4595 Lisp_Object eol_type
, raw_text_eol_type
;
4597 spec
= CODING_SYSTEM_SPEC (coding_system
);
4598 attrs
= AREF (spec
, 0);
4600 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4601 return coding_system
;
4603 eol_type
= AREF (spec
, 2);
4604 if (VECTORP (eol_type
))
4606 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4607 raw_text_eol_type
= AREF (spec
, 2);
4608 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4609 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4610 : AREF (raw_text_eol_type
, 2));
4614 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4615 does, return the subsidiary of CODING_SYSTEM that has the same eol-spec as
4616 PARENT. Otherwise, return CODING_SYSTEM. */
4619 coding_inherit_eol_type (coding_system
, parent
)
4621 Lisp_Object spec
, attrs
, eol_type
;
4623 spec
= CODING_SYSTEM_SPEC (coding_system
);
4624 attrs
= AREF (spec
, 0);
4625 eol_type
= AREF (spec
, 2);
4626 if (VECTORP (eol_type
))
4628 Lisp_Object parent_spec
;
4629 Lisp_Object parent_eol_type
;
4632 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4633 parent_eol_type
= AREF (parent_spec
, 2);
4634 if (EQ (parent_eol_type
, Qunix
))
4635 coding_system
= AREF (eol_type
, 0);
4636 else if (EQ (parent_eol_type
, Qdos
))
4637 coding_system
= AREF (eol_type
, 1);
4638 else if (EQ (parent_eol_type
, Qmac
))
4639 coding_system
= AREF (eol_type
, 2);
4641 return coding_system
;
4644 /* Emacs has a mechanism to automatically detect a coding system if it
4645 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4646 it's impossible to distinguish some coding systems accurately
4647 because they use the same range of codes. So, at first, coding
4648 systems are categorized into the following categories:
4650 o coding-category-emacs-mule
4652 The category for a coding system which has the same code range
4653 as Emacs' internal format. Assigned the coding-system (Lisp
4654 symbol) `emacs-mule' by default.
4656 o coding-category-sjis
4658 The category for a coding system which has the same code range
4659 as SJIS. Assigned the coding-system (Lisp
4660 symbol) `japanese-shift-jis' by default.
4662 o coding-category-iso-7
4664 The category for a coding system which has the same code range
4665 as ISO2022 of 7-bit environment. This doesn't use any locking
4666 shift and single shift functions. This can encode/decode all
4667 charsets. Assigned the coding-system (Lisp symbol)
4668 `iso-2022-7bit' by default.
4670 o coding-category-iso-7-tight
4672 Same as coding-category-iso-7 except that this can
4673 encode/decode only the specified charsets.
4675 o coding-category-iso-8-1
4677 The category for a coding system which has the same code range
4678 as ISO2022 of 8-bit environment and graphic plane 1 used only
4679 for DIMENSION1 charset. This doesn't use any locking shift
4680 and single shift functions. Assigned the coding-system (Lisp
4681 symbol) `iso-latin-1' by default.
4683 o coding-category-iso-8-2
4685 The category for a coding system which has the same code range
4686 as ISO2022 of 8-bit environment and graphic plane 1 used only
4687 for DIMENSION2 charset. This doesn't use any locking shift
4688 and single shift functions. Assigned the coding-system (Lisp
4689 symbol) `japanese-iso-8bit' by default.
4691 o coding-category-iso-7-else
4693 The category for a coding system which has the same code range
4694 as ISO2022 of 7-bit environment but uses locking shift or
4695 single shift functions. Assigned the coding-system (Lisp
4696 symbol) `iso-2022-7bit-lock' by default.
4698 o coding-category-iso-8-else
4700 The category for a coding system which has the same code range
4701 as ISO2022 of 8-bit environment but uses locking shift or
4702 single shift functions. Assigned the coding-system (Lisp
4703 symbol) `iso-2022-8bit-ss2' by default.
4705 o coding-category-big5
4707 The category for a coding system which has the same code range
4708 as BIG5. Assigned the coding-system (Lisp symbol)
4709 `cn-big5' by default.
4711 o coding-category-utf-8
4713 The category for a coding system which has the same code range
4714 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4715 symbol) `utf-8' by default.
4717 o coding-category-utf-16-be
4719 The category for a coding system in which a text has an
4720 Unicode signature (cf. Unicode Standard) in the order of BIG
4721 endian at the head. Assigned the coding-system (Lisp symbol)
4722 `utf-16-be' by default.
4724 o coding-category-utf-16-le
4726 The category for a coding system in which a text has an
4727 Unicode signature (cf. Unicode Standard) in the order of
4728 LITTLE endian at the head. Assigned the coding-system (Lisp
4729 symbol) `utf-16-le' by default.
4731 o coding-category-ccl
4733 The category for a coding system of which encoder/decoder is
4734 written in CCL programs. The default value is nil, i.e., no
4735 coding system is assigned.
4737 o coding-category-binary
4739 The category for a coding system not categorized in any of the
4740 above. Assigned the coding-system (Lisp symbol)
4741 `no-conversion' by default.
4743 Each of them is a Lisp symbol and the value is an actual
4744 `coding-system' (which is also a Lisp symbol) assigned by a user.
4745 What Emacs does actually is to detect a category of coding system.
4746 Then, it uses a `coding-system' assigned to it. If Emacs can't
4747 decide only one possible category, it selects a category of the
4748 highest priority. Priorities of categories are also specified by a
4749 user in a Lisp variable `coding-category-list'.
4753 #define EOL_SEEN_NONE 0
4754 #define EOL_SEEN_LF 1
4755 #define EOL_SEEN_CR 2
4756 #define EOL_SEEN_CRLF 4
4758 /* Detect how end-of-line of a text of length CODING->src_bytes
4759 pointed by CODING->source is encoded. Return one of
4762 #define MAX_EOL_CHECK_COUNT 3
4765 detect_eol (coding
, source
, src_bytes
)
4766 struct coding_system
*coding
;
4767 unsigned char *source
;
4768 EMACS_INT src_bytes
;
4770 Lisp_Object attrs
, coding_type
;
4771 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4774 int eol_seen
= EOL_SEEN_NONE
;
4776 attrs
= CODING_ID_ATTRS (coding
->id
);
4777 coding_type
= CODING_ATTR_TYPE (attrs
);
4779 if (EQ (coding_type
, Qccl
))
4783 msb
= coding
->spec
.utf_16
.endian
== utf_16_little_endian
;
4786 while (src
+ 1 < src_end
)
4789 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4794 this_eol
= EOL_SEEN_LF
;
4795 else if (src
+ 3 >= src_end
4796 || src
[msb
+ 2] != 0
4797 || src
[lsb
+ 2] != '\n')
4798 this_eol
= EOL_SEEN_CR
;
4800 this_eol
= EOL_SEEN_CRLF
;
4802 if (eol_seen
== EOL_SEEN_NONE
)
4803 /* This is the first end-of-line. */
4804 eol_seen
= this_eol
;
4805 else if (eol_seen
!= this_eol
)
4807 /* The found type is different from what was found before. */
4808 eol_seen
= EOL_SEEN_LF
;
4811 if (++total
== MAX_EOL_CHECK_COUNT
)
4819 while (src
< src_end
)
4822 if (c
== '\n' || c
== '\r')
4827 this_eol
= EOL_SEEN_LF
;
4828 else if (src
>= src_end
|| *src
!= '\n')
4829 this_eol
= EOL_SEEN_CR
;
4831 this_eol
= EOL_SEEN_CRLF
, src
++;
4833 if (eol_seen
== EOL_SEEN_NONE
)
4834 /* This is the first end-of-line. */
4835 eol_seen
= this_eol
;
4836 else if (eol_seen
!= this_eol
)
4838 /* The found type is different from what was found before. */
4839 eol_seen
= EOL_SEEN_LF
;
4842 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Replace CODING->id by the subsidiary coding system matching the
   end-of-line kinds recorded in the bit mask EOL_SEEN.  The EOL-type
   vector of the current coding system is indexed 0 for LF, 1 for CRLF
   and 2 for CR; LF wins when several bits are set.  */
4852 adjust_coding_eol_type (coding
, eol_seen
)
4853 struct coding_system
*coding
;
4856 Lisp_Object eol_type
;
4858 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4859 if (eol_seen
& EOL_SEEN_LF
)
4860 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
/* The flags live in EOL_SEEN.  EOL_TYPE is the Lisp vector of
   subsidiaries, so masking it (as was done before) never tested the
   detected end-of-line kind.  */
4861 else if (eol_seen
& EOL_SEEN_CRLF
)
4862 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
4863 else if (eol_seen
& EOL_SEEN_CR
)
4864 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
4867 /* Detect how a text specified in CODING is encoded. If a coding
4868 system is detected, update fields of CODING by the detected coding
/* Detect the text encoding and the end-of-line type of the source
   text described by CODING and update CODING accordingly.  The
   encoding is examined only when the current coding system has type
   `undecided'; the end-of-line type is examined only while it is still
   a vector and the coding type is not CCL.  */
4872 detect_coding (coding
)
4873 struct coding_system
*coding
;
4875 unsigned char *src
, *src_end
;
4876 Lisp_Object attrs
, coding_type
;
/* Reset the progress counters and make CODING->source current.  */
4878 coding
->consumed
= coding
->consumed_char
= 0;
4879 coding
->produced
= coding
->produced_char
= 0;
4880 coding_set_source (coding
);
4882 src_end
= coding
->source
+ coding
->src_bytes
;
4884 /* If we have not yet decided the text encoding type, detect it
4886 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
4888 int mask
= CATEGORY_MASK_ANY
;
4891 for (src
= coding
->source
; src
< src_end
; src
++)
4894 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
4896 || c
== ISO_CODE_SO
)))
4899 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
4901 if (coding
->head_ascii
< coding
->src_bytes
)
4905 for (i
= 0; i
< coding_category_raw_text
; i
++)
4907 enum coding_category category
= coding_priorities
[i
];
4908 struct coding_system
*this = coding_categories
+ category
;
4910 if (category
>= coding_category_raw_text
4911 || detected
& (1 << category
))
4916 /* No coding system of this category is defined. */
4917 mask
&= ~(1 << category
);
/* Mark every category this detector covers as examined, then run
   it; the detector receives MASK by address.  */
4921 detected
|= detected_mask
[category
];
4922 if ((*(this->detector
)) (coding
, &mask
))
4927 setup_coding_system (Qraw_text
, coding
);
/* Some categories were ruled out: walk the priority list and set up
   CODING from a category whose bit is still set in MASK.  */
4928 else if (mask
!= CATEGORY_MASK_ANY
)
4929 for (i
= 0; i
< coding_category_raw_text
; i
++)
4931 enum coding_category category
= coding_priorities
[i
];
4932 struct coding_system
*this = coding_categories
+ category
;
4934 if (mask
& (1 << category
))
4936 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
/* Read the attributes again; CODING->id may have been replaced by
   setup_coding_system above.  */
4943 attrs
= CODING_ID_ATTRS (coding
->id
);
4944 coding_type
= CODING_ATTR_TYPE (attrs
);
4946 /* If we have not yet decided the EOL type, detect it now. But, the
4947 detection is impossible for a CCL based coding system, in which
4948 case, we detect the EOL type after decoding. */
4949 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
4950 && ! EQ (coding_type
, Qccl
))
4952 int eol_seen
= detect_eol (coding
, coding
->source
, coding
->src_bytes
);
4954 if (eol_seen
!= EOL_SEEN_NONE
)
4955 adjust_coding_eol_type (coding
, eol_seen
);
/* Post-process the end-of-lines of the text already produced in the
   buffer at CODING->dst_pos.
   NOTE(review): the line naming this function is not visible in this
   view; decode_coding calls decode_eol (coding) after a CCL decode,
   which presumably is this function -- confirm.  */
4962 struct coding_system
*coding
;
/* EOL type still undecided (a vector): collect the end-of-line kinds
   present in the produced bytes and select the matching subsidiary
   coding system.  */
4964 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
4966 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
4967 unsigned char *pend
= p
+ coding
->produced
;
4968 int eol_seen
= EOL_SEEN_NONE
;
4970 for (; p
< pend
; p
++)
4973 eol_seen
|= EOL_SEEN_LF
;
4974 else if (*p
== '\r')
/* A CR immediately followed by LF inside the produced range is
   recorded as CRLF, any other CR as CR.  */
4976 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
4978 eol_seen
|= EOL_SEEN_CRLF
;
4982 eol_seen
|= EOL_SEEN_CR
;
4985 if (eol_seen
!= EOL_SEEN_NONE
)
4986 adjust_coding_eol_type (coding
, eol_seen
);
/* Mac end-of-lines: walk over every produced byte.  */
4989 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
4991 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
4992 unsigned char *pend
= p
+ coding
->produced
;
4994 for (; p
< pend
; p
++)
/* DOS end-of-lines: the produced text is deleted from the buffer
   with undo recording switched off, bytes are removed by shifting the
   tail down one position with safe_bcopy, and the shortened text is
   put back with insert_from_gap.  */
4998 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5000 unsigned char *p
, *pbeg
, *pend
;
5001 Lisp_Object undo_list
;
5003 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5004 coding
->dst_pos_byte
+ coding
->produced
);
/* Save the undo list and disable recording around the deletion.  */
5005 undo_list
= current_buffer
->undo_list
;
5006 current_buffer
->undo_list
= Qt
;
5007 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, Qnil
);
5008 current_buffer
->undo_list
= undo_list
;
5010 pend
= pbeg
+ coding
->produced
;
/* Scan backwards so that each shift moves only the already
   processed tail.  */
5012 for (p
= pend
- 1; p
>= pbeg
; p
--)
5015 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
/* Each removed byte was one character, so both counters shrink by
   the same amount.  */
5018 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5019 coding
->produced
= pend
- pbeg
;
5020 insert_from_gap (coding
->produced_char
, coding
->produced
);
/* Translate the characters held in CODING->charbuf through TABLE with
   translate_char, storing each result back in place.  Note the
   parameter order: CODING first, TABLE second.  CODING->chars_at_source
   is tested before the loop.  */
5025 translate_chars (coding
, table
)
5026 struct coding_system
*coding
;
5029 int *charbuf
= coding
->charbuf
;
5030 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5033 if (coding
->chars_at_source
)
5036 while (charbuf
< charbuf_end
)
5042 *charbuf
++ = translate_char (table
, c
);
/* Write the result of one decoding step to CODING->destination and
   return the number of characters produced.  When
   CODING->chars_at_source is zero the characters come from
   CODING->charbuf; otherwise the bytes at CODING->source are used
   directly.  On exit CODING->produced and CODING->produced_char are
   advanced, and for a buffer destination the new bytes are adopted
   with insert_from_gap.  */
5047 produce_chars (coding
)
5048 struct coding_system
*coding
;
5050 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5051 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5053 int produced_chars
= 0;
5055 if (! coding
->chars_at_source
)
5057 /* Characters are in coding->charbuf. */
5058 int *buf
= coding
->charbuf
;
5059 int *buf_end
= buf
+ coding
->charbuf_used
;
5060 unsigned char *adjusted_dst_end
;
/* Source and destination are the same buffer: the destination may
   grow only up to the unconsumed source bytes.  */
5062 if (BUFFERP (coding
->src_object
)
5063 && EQ (coding
->src_object
, coding
->dst_object
))
5064 dst_end
= coding
->source
+ coding
->consumed
;
/* Keep room for one complete multibyte sequence.  */
5065 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5067 while (buf
< buf_end
)
5071 if (dst
>= adjusted_dst_end
)
/* Enlarge the destination and recompute both limits from the
   possibly relocated CODING->destination.  */
5073 dst
= alloc_destination (coding
,
5074 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5076 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5077 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
/* A byte8 character going to a unibyte destination is stored as the
   raw byte; everything else as its multibyte string.  */
5081 if (coding
->dst_multibyte
5082 || ! CHAR_BYTE8_P (c
))
5083 CHAR_STRING_ADVANCE (c
, dst
);
5085 *dst
++ = CHAR_TO_BYTE8 (c
);
5089 /* This is an annotation data. */
/* From here on the characters are taken from CODING->source.  */
5095 int multibytep
= coding
->src_multibyte
;
5096 unsigned char *src
= coding
->source
;
5097 unsigned char *src_end
= src
+ coding
->src_bytes
;
5098 Lisp_Object eol_type
;
5100 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
/* Multibyteness of source and destination differ.  */
5102 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5104 if (coding
->src_multibyte
)
5110 unsigned char *src_base
= src
;
5116 if (EQ (eol_type
, Qdos
))
5122 else if (EQ (eol_type
, Qmac
))
/* alloc_destination may relocate the text, so SRC is remembered as
   an offset and rebuilt after coding_set_source.  */
5127 EMACS_INT offset
= src
- coding
->source
;
5129 dst
= alloc_destination (coding
, src_end
- src
+ 1, dst
);
5130 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5131 coding_set_source (coding
);
5132 src
= coding
->source
+ offset
;
5133 src_end
= coding
->source
+ coding
->src_bytes
;
5142 while (src
< src_end
)
5148 if (EQ (eol_type
, Qdos
))
5154 else if (EQ (eol_type
, Qmac
))
5157 if (dst
>= dst_end
- 1)
/* Same relocation-safe pattern as above.  */
5159 EMACS_INT offset
= src
- coding
->source
;
5161 dst
= alloc_destination (coding
, src_end
- src
+ 2, dst
);
5162 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5163 coding_set_source (coding
);
5164 src
= coding
->source
+ offset
;
5165 src_end
= coding
->source
+ coding
->src_bytes
;
/* Source and destination objects differ.  */
5172 if (!EQ (coding
->src_object
, coding
->dst_object
))
5174 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5178 EMACS_INT offset
= src
- coding
->source
;
5180 dst
= alloc_destination (coding
, require
, dst
);
5181 coding_set_source (coding
);
5182 src
= coding
->source
+ offset
;
5183 src_end
= coding
->source
+ coding
->src_bytes
;
/* Start from the source character count.  */
5186 produced_chars
= coding
->src_chars
;
5187 while (src
< src_end
)
5193 if (EQ (eol_type
, Qdos
))
5200 else if (EQ (eol_type
, Qmac
))
/* Bytes written by this call only.  */
5208 produced
= dst
- (coding
->destination
+ coding
->produced
);
5209 if (BUFFERP (coding
->dst_object
))
5210 insert_from_gap (produced_chars
, produced
);
5211 coding
->produced
+= produced
;
5212 coding
->produced_char
+= produced_chars
;
5213 return produced_chars
;
5216 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5218 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
/* Apply the composition annotation stored at CHARBUF to the text in
   CODING->dst_object.  This function reads CHARBUF[1] as a character
   offset from CODING->dst_pos, CHARBUF[3] as the composition method
   and CHARBUF[4] as the number of composed characters, then calls
   compose_text.  */
5222 produce_composition (coding
, charbuf
)
5223 struct coding_system
*coding
;
5229 enum composition_method method
;
5231 Lisp_Object components
;
5233 buffer
= coding
->dst_object
;
5235 pos
= coding
->dst_pos
+ charbuf
[1];
5236 method
= (enum composition_method
) (charbuf
[3]);
5237 cmp_len
= charbuf
[4];
5239 if (method
== COMPOSITION_RELATIVE
)
5243 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
/* Box LEN integers from CHARBUF as Lisp numbers.  */
5248 for (i
= 0; i
< len
; i
++)
5249 args
[i
] = make_number (charbuf
[i
]);
/* Alternate characters are passed as a string, any other component
   list as a vector.  */
5250 components
= (method
== COMPOSITION_WITH_ALTCHARS
5251 ? Fstring (len
, args
) : Fvector (len
, args
));
5253 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
/* Store the composition described by PROP as annotation data at BUF,
   which must not run past BUF_END, and return BUF advanced by BUF[0].
   Components are taken from a vector, a string, a single integer or a
   list, depending on the form of COMPOSITION_COMPONENTS (PROP).
   NOTE(review): this function writes the annotation mask to BUF[1]
   and the components from BUF[4], while produce_composition reads
   CHARBUF[1] as a position offset and CHARBUF[4] as the composition
   length -- the two layouts appear to disagree; confirm.  */
5257 save_composition_data (buf
, buf_end
, prop
)
5261 enum composition_method method
= COMPOSITION_METHOD (prop
);
5262 int cmp_len
= COMPOSITION_LENGTH (prop
);
/* Require room for the fixed slots plus the largest component
   list.  */
5264 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5267 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5271 if (method
== COMPOSITION_RELATIVE
)
5275 Lisp_Object components
;
5278 components
= COMPOSITION_COMPONENTS (prop
);
5279 if (VECTORP (components
))
5281 len
= XVECTOR (components
)->size
;
5282 for (i
= 0; i
< len
; i
++)
5283 buf
[4 + i
] = XINT (AREF (components
, i
));
5285 else if (STRINGP (components
))
5289 len
= XSTRING (components
)->size
;
5292 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5294 else if (INTEGERP (components
))
5297 buf
[4] = XINT (components
);
5299 else if (CONSP (components
))
/* LEN ends up as the number of list elements copied.  */
5301 for (len
= 0; CONSP (components
);
5302 len
++, components
= XCDR (components
))
5303 buf
[4 + len
] = XINT (XCAR (components
));
5309 return (buf
+ buf
[0]);
/* CHARBUF_SIZE is the initial number of `int' elements requested for
   the conversion work area.  ALLOC_CONVERSION_WORK_AREA obtains
   CODING->charbuf with alloca inside a loop bounded by `size > 1024',
   records the size in CODING->charbuf_size, and on failure stores
   CODING_RESULT_INSUFFICIENT_MEM in CODING->result and returns it
   from the enclosing function.  */
5312 #define CHARBUF_SIZE 0x4000
5314 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5316 int size = CHARBUF_SIZE;; \
5318 coding->charbuf = NULL; \
5319 while (size > 1024) \
5321 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5322 if (coding->charbuf) \
5326 if (! coding->charbuf) \
5328 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5329 return coding->result; \
5331 coding->charbuf_size = size; \
/* Walk CODING->charbuf and act on the annotation records in it.  A
   record's length is stored negated in its first element; composition
   records are handed to produce_composition.  */
5336 produce_annotation (coding
)
5337 struct coding_system
*coding
;
5339 int *charbuf
= coding
->charbuf
;
5340 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5342 while (charbuf
< charbuf_end
)
/* Recover the positive record length.  */
5348 int len
= -*charbuf
;
5351 case CODING_ANNOTATE_COMPOSITION_MASK
:
5352 produce_composition (coding
, charbuf
);
5362 /* Decode the data at CODING->src_object into CODING->dst_object.
5363 CODING->src_object is a buffer, a string, or nil.
5364 CODING->dst_object is a buffer.
5366 If CODING->src_object is a buffer, it must be the current buffer.
5367 In this case, if CODING->src_pos is positive, it is a position of
5368 the source text in the buffer, otherwise, the source text is in the
5369 gap area of the buffer, and CODING->src_pos specifies the offset of
5370 the text from GPT (which must be the same as PT). If this is the
5371 same buffer as CODING->dst_object, CODING->src_pos must be
5374 If CODING->src_object is a string, CODING->src_pos is an index to
5377 If CODING->src_object is nil, CODING->source must already point to
5378 the non-relocatable memory area. In this case, CODING->src_pos is
5379 an offset from CODING->source.
5381 The decoded data is inserted at the current point of the buffer
/* Decode the source text described by CODING into CODING->dst_object
   and return CODING->result.  Each pass runs the decoder into
   CODING->charbuf, optionally translates the characters through the
   decode translation table, writes them out with produce_chars and
   then applies annotations.  Bytes left unconsumed at the end are
   either flushed as binary characters (last block) or saved in
   CODING->carryover.  */
5386 decode_coding (coding
)
5387 struct coding_system
*coding
;
/* The source range straddles the gap: move the gap to the start of
   the source so the text is contiguous.  */
5391 if (BUFFERP (coding
->src_object
)
5392 && coding
->src_pos
> 0
5393 && coding
->src_pos
< GPT
5394 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5395 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5397 if (BUFFERP (coding
->dst_object
))
5399 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5400 set_buffer_internal (XBUFFER (coding
->dst_object
));
5402 move_gap_both (PT
, PT_BYTE
);
5405 coding
->consumed
= coding
->consumed_char
= 0;
5406 coding
->produced
= coding
->produced_char
= 0;
5407 coding
->chars_at_source
= 0;
5408 coding
->result
= CODING_RESULT_SUCCESS
;
5411 ALLOC_CONVERSION_WORK_AREA (coding
);
5413 attrs
= CODING_ID_ATTRS (coding
->id
);
5417 coding_set_source (coding
);
5418 coding
->annotated
= 0;
5419 (*(coding
->decoder
)) (coding
);
/* translate_chars is defined as (coding, table).  The arguments used
   to be passed in the opposite order, which an unprototyped K&R call
   does not diagnose.  */
5420 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5421 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5422 coding_set_destination (coding
);
5423 produce_chars (coding
);
5424 if (coding
->annotated
)
5425 produce_annotation (coding
);
/* Repeat while source bytes remain and no error result is set.  */
5427 while (coding
->consumed
< coding
->src_bytes
5428 && ! coding
->result
);
/* End-of-line conversion for CCL is done after decoding, and only
   when the EOL type is a decided, non-unix symbol.  */
5430 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5431 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5432 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5433 decode_eol (coding
);
5435 coding
->carryover_bytes
= 0;
5436 if (coding
->consumed
< coding
->src_bytes
)
5438 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5441 coding_set_source (coding
);
5442 coding_set_destination (coding
);
5443 src
= coding
->source
+ coding
->consumed
;
5445 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5447 /* Flush out unprocessed data as binary chars. We are sure
5448 that the number of data is less than the size of
coding->charbuf. */
5450 int *charbuf
= coding
->charbuf
;
5452 while (nbytes
-- > 0)
/* Bytes with the high bit set are stored negated.  */
5455 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5457 produce_chars (coding
);
5461 /* Record unprocessed bytes in coding->carryover. We are
5462 sure that the number of data is less than the size of
5463 coding->carryover. */
5464 unsigned char *p
= coding
->carryover
;
5466 coding
->carryover_bytes
= nbytes
;
5467 while (nbytes
-- > 0)
5470 coding
->consumed
= coding
->src_bytes
;
5473 return coding
->result
;
/* Read characters from the source text of CODING, starting after the
   part already consumed, into CODING->charbuf for the encoder.  On
   exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read and
   CODING->chars_at_source is cleared.  */
5477 consume_chars (coding
)
5478 struct coding_system
*coding
;
5480 int *buf
= coding
->charbuf
;
5481 /* -1 is to compensate for CRLF. */
5482 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5483 unsigned char *src
= coding
->source
+ coding
->consumed
;
5484 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5485 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5486 int multibytep
= coding
->src_multibyte
;
5487 Lisp_Object eol_type
;
5489 int start
, end
, stop
;
5490 Lisp_Object object
, prop
;
5492 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5493 if (VECTORP (eol_type
))
5496 object
= coding
->src_object
;
5498 /* Note: composition handling is not yet implemented. */
5499 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* The flag tested here was cleared by the statement just above, so
   as written this composition lookup is not reached.  */
5501 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5502 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5505 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5506 && end
<= end_pos
)))
5511 while (buf
< buf_end
)
5519 p
= save_composition_data (buf
, buf_end
, prop
);
5523 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5533 c
= STRING_CHAR_ADVANCE (src
);
/* In selective-display mode a CR is treated specially.  */
5534 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5536 if (! EQ (eol_type
, Qunix
))
5540 if (EQ (eol_type
, Qdos
))
/* Publish how far this call got.  */
5550 coding
->consumed
= src
- coding
->source
;
5551 coding
->consumed_char
= pos
- coding
->src_pos
;
5552 coding
->charbuf_used
= buf
- coding
->charbuf
;
5553 coding
->chars_at_source
= 0;
5557 /* Encode the text at CODING->src_object into CODING->dst_object.
5558 CODING->src_object is a buffer or a string.
5559 CODING->dst_object is a buffer or nil.
5561 If CODING->src_object is a buffer, it must be the current buffer.
5562 In this case, if CODING->src_pos is positive, it is a position of
5563 the source text in the buffer, otherwise, the source text is in the
5564 gap area of the buffer, and coding->src_pos specifies the offset of
5565 the text from GPT (which must be the same as PT). If this is the
5566 same buffer as CODING->dst_object, CODING->src_pos must be
5567 negative and CODING should not have `pre-write-conversion'.
5569 If CODING->src_object is a string, CODING should not have
5570 `pre-write-conversion'.
5572 If CODING->dst_object is a buffer, the encoded data is inserted at
5573 the current point of that buffer.
5575 If CODING->dst_object is nil, the encoded data is placed at the
5576 memory area specified by CODING->destination. */
5579 encode_coding (coding
)
5580 struct coding_system
*coding
;
5584 attrs
= CODING_ID_ATTRS (coding
->id
);
/* A buffer destination becomes current and determines the
   multibyteness of the output.  */
5586 if (BUFFERP (coding
->dst_object
))
5588 set_buffer_internal (XBUFFER (coding
->dst_object
));
5589 coding
->dst_multibyte
5590 = ! NILP (current_buffer
->enable_multibyte_characters
);
5593 coding
->consumed
= coding
->consumed_char
= 0;
5594 coding
->produced
= coding
->produced_char
= 0;
5595 coding
->result
= CODING_RESULT_SUCCESS
;
5598 ALLOC_CONVERSION_WORK_AREA (coding
);
/* One pass: read characters, translate them, run the encoder.  */
5601 coding_set_source (coding
);
5602 consume_chars (coding
);
/* translate_chars is defined as (coding, table).  The arguments used
   to be passed in the opposite order, which an unprototyped K&R call
   does not diagnose.  */
5604 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
5605 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
5607 coding_set_destination (coding
);
5608 (*(coding
->encoder
)) (coding
);
5609 } while (coding
->consumed_char
< coding
->src_chars
);
5611 if (BUFFERP (coding
->dst_object
))
5612 insert_from_gap (coding
->produced_char
, coding
->produced
);
5614 return (coding
->result
);
5619 /* List of currently used working buffer. */
5620 Lisp_Object Vcode_conversion_work_buf_list
;
5622 /* A working buffer used by the top level conversion. */
5623 Lisp_Object Vcode_conversion_reused_work_buf
;
5626 /* Return a working buffer that can be freely used by the following
5627 code conversion. MULTIBYTEP specifies the multibyteness of the
/* Push a work buffer onto Vcode_conversion_work_buf_list and prepare
   it: undo recording is disabled and its multibyteness is set from
   MULTIBYTEP.  The outermost conversion reuses
   Vcode_conversion_reused_work_buf (named " *code-conversion-work*");
   nested conversions get a buffer named
   " *code-conversion-work*<DEPTH>".  The caller's current buffer is
   restored before returning.  */
5631 make_conversion_work_buffer (multibytep
)
5634 struct buffer
*current
= current_buffer
;
5637 if (NILP (Vcode_conversion_work_buf_list
))
5639 if (NILP (Vcode_conversion_reused_work_buf
))
5640 Vcode_conversion_reused_work_buf
5641 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5642 Vcode_conversion_work_buf_list
5643 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
/* Flength returns a Lisp integer; extract the C value with XINT
   before using it as an int and as a "%d" argument.  */
5647 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5650 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5651 Vcode_conversion_work_buf_list
5652 = Fcons (Fget_buffer_create (build_string (str
)),
5653 Vcode_conversion_work_buf_list
);
5656 buf
= XCAR (Vcode_conversion_work_buf_list
);
5657 set_buffer_internal (XBUFFER (buf
));
5658 current_buffer
->undo_list
= Qt
;
5660 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
5661 set_buffer_internal (current
);
5665 static struct coding_system
*saved_coding
;
/* Unwind handler for the conversion entry points.  It pops the
   innermost work buffer off Vcode_conversion_work_buf_list, handles
   a still-live buffer of a nested conversion (depth > 1), frees
   saved_coding->destination when the destination was Qt, and finally
   restores the excursion saved in INFO.  */
5668 code_conversion_restore (info
)
/* Flength returns a Lisp integer; extract the C value with XINT
   before comparing it as an int.  */
5671 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5676 buf
= XCAR (Vcode_conversion_work_buf_list
);
5677 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5678 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
/* Lisp objects are compared with EQ, as everywhere else in this
   file, rather than with the C `==' operator.  */
5682 if (EQ (saved_coding
->dst_object
, Qt
)
5683 && saved_coding
->destination
)
5684 xfree (saved_coding
->destination
);
5686 return save_excursion_restore (info
);
/* Decode CHARS characters (BYTES bytes) of the current buffer in
   place and return CODING->result.  The source position is given as
   the negative offsets -CHARS / -BYTES, and the destination is the
   same buffer at point.  The coding system is detected first when
   CODING requires it.  */
5691 decode_coding_gap (coding
, chars
, bytes
)
5692 struct coding_system
*coding
;
5693 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so unbind_to can run the unwind
   handler.  */
5695 int count
= specpdl_ptr
- specpdl
;
5697 saved_coding
= coding
;
5698 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5700 coding
->src_object
= Fcurrent_buffer ();
5701 coding
->src_chars
= chars
;
5702 coding
->src_bytes
= bytes
;
5703 coding
->src_pos
= -chars
;
5704 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source is multibyte.  */
5705 coding
->src_multibyte
= chars
< bytes
;
5706 coding
->dst_object
= coding
->src_object
;
5707 coding
->dst_pos
= PT
;
5708 coding
->dst_pos_byte
= PT_BYTE
;
5710 if (CODING_REQUIRE_DETECTION (coding
))
5711 detect_coding (coding
);
5713 decode_coding (coding
);
5715 unbind_to (count
, Qnil
);
5716 return coding
->result
;
/* Encode CHARS characters (BYTES bytes) of the current buffer in
   place and return CODING->result.  The source position is given as
   the negative offsets -CHARS / -BYTES, and the destination is the
   same buffer at point.  */
5720 encode_coding_gap (coding
, chars
, bytes
)
5721 struct coding_system
*coding
;
5722 EMACS_INT chars
, bytes
;
5724 int count
= specpdl_ptr
- specpdl
;
5727 saved_coding
= coding
;
5728 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5730 buffer
= Fcurrent_buffer ();
5731 coding
->src_object
= buffer
;
5732 coding
->src_chars
= chars
;
5733 coding
->src_bytes
= bytes
;
5734 coding
->src_pos
= -chars
;
5735 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source is multibyte.  */
5736 coding
->src_multibyte
= chars
< bytes
;
5737 coding
->dst_object
= coding
->src_object
;
5738 coding
->dst_pos
= PT
;
5739 coding
->dst_pos_byte
= PT_BYTE
;
5741 encode_coding (coding
);
5743 unbind_to (count
, Qnil
);
5744 return coding
->result
;
5748 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5749 SRC_OBJECT into DST_OBJECT by coding context CODING.
5751 SRC_OBJECT is a buffer, a string, or Qnil.
5753 If it is a buffer, the text is at point of the buffer. FROM and TO
5754 are positions in the buffer.
5756 If it is a string, the text is at the beginning of the string.
5757 FROM and TO are indices to the string.
5759 If it is nil, the text is at coding->source. FROM and TO are
5760 indices to coding->source.
5762 DST_OBJECT is a buffer, Qt, or Qnil.
5764 If it is a buffer, the decoded text is inserted at point of the
5765 buffer. If the buffer is the same as SRC_OBJECT, the source text
5768 If it is Qt, a string is made from the decoded text, and
5769 set in CODING->dst_object.
5771 If it is Qnil, the decoded text is stored at CODING->destination.
5772 The caller must allocate CODING->dst_bytes bytes at
5773 CODING->destination by xmalloc. If the decoded text is longer than
5774 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
/* Decode the text FROM..TO (bytes FROM_BYTE..TO_BYTE) of SRC_OBJECT
   with CODING and deliver it according to DST_OBJECT: into a buffer,
   as a new string left in CODING->dst_object (Qt), or into the memory
   at CODING->destination (Qnil).  A work buffer is used when the
   coding system has a post-read function or a string result is
   wanted.  */
5778 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5780 struct coding_system
*coding
;
5781 Lisp_Object src_object
;
5782 EMACS_INT from
, from_byte
, to
, to_byte
;
5783 Lisp_Object dst_object
;
5785 int count
= specpdl_ptr
- specpdl
;
5786 unsigned char *destination
;
5787 EMACS_INT dst_bytes
;
5788 EMACS_INT chars
= to
- from
;
5789 EMACS_INT bytes
= to_byte
- from_byte
;
5792 saved_coding
= coding
;
5793 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
/* For a memory destination remember the caller's area and size.  */
5795 if (NILP (dst_object
))
5797 destination
= coding
->destination
;
5798 dst_bytes
= coding
->dst_bytes
;
5801 coding
->src_object
= src_object
;
5802 coding
->src_chars
= chars
;
5803 coding
->src_bytes
= bytes
;
5804 coding
->src_multibyte
= chars
< bytes
;
5806 if (STRINGP (src_object
))
5808 coding
->src_pos
= from
;
5809 coding
->src_pos_byte
= from_byte
;
5811 else if (BUFFERP (src_object
))
5813 set_buffer_internal (XBUFFER (src_object
));
5815 move_gap_both (from
, from_byte
);
/* Decoding in place: the source text is deleted and then addressed
   by negative offsets, as in decode_coding_gap.  */
5816 if (EQ (src_object
, dst_object
))
5818 TEMP_SET_PT_BOTH (from
, from_byte
);
5819 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5820 coding
->src_pos
= -chars
;
5821 coding
->src_pos_byte
= -bytes
;
5825 coding
->src_pos
= from
;
5826 coding
->src_pos_byte
= from_byte
;
5830 if (CODING_REQUIRE_DETECTION (coding
))
5831 detect_coding (coding
);
5832 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Choose the destination.  */
5834 if (! NILP (CODING_ATTR_POST_READ (attrs
))
5835 || EQ (dst_object
, Qt
))
5837 coding
->dst_object
= make_conversion_work_buffer (1);
5838 coding
->dst_pos
= BEG
;
5839 coding
->dst_pos_byte
= BEG_BYTE
;
5840 coding
->dst_multibyte
= 1;
5842 else if (BUFFERP (dst_object
))
5844 coding
->dst_object
= dst_object
;
5845 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5846 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5847 coding
->dst_multibyte
5848 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
5852 coding
->dst_object
= Qnil
;
5853 coding
->dst_multibyte
= 1;
5856 decode_coding (coding
);
5858 if (BUFFERP (coding
->dst_object
))
5859 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* Run the post-read function on the decoded text and account for
   any change it made to the buffer size.  */
5861 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5863 struct gcpro gcpro1
, gcpro2
;
5864 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
5867 GCPRO2 (coding
->src_object
, coding
->dst_object
);
5868 val
= call1 (CODING_ATTR_POST_READ (attrs
),
5869 make_number (coding
->produced_char
));
5872 coding
->produced_char
+= Z
- prev_Z
;
5873 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
5876 if (EQ (dst_object
, Qt
))
5878 coding
->dst_object
= Fbuffer_string ();
/* Memory destination, but the text went through a work buffer: copy
   it out, growing the caller's area first when it is too small.  */
5880 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
5882 set_buffer_internal (XBUFFER (coding
->dst_object
));
5883 if (dst_bytes
< coding
->produced
)
5886 = (unsigned char *) xrealloc (destination
, coding
->produced
);
5889 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
5890 unbind_to (count
, Qnil
);
/* Make the text contiguous before copying.  */
5893 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
5894 move_gap_both (BEGV
, BEGV_BYTE
);
5895 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
5896 coding
->destination
= destination
;
5900 unbind_to (count
, Qnil
);
/* Encode the text FROM..TO (bytes FROM_BYTE..TO_BYTE) of SRC_OBJECT
   with CODING and deliver it according to DST_OBJECT: into a buffer,
   or, for Qt, as a unibyte string left in CODING->dst_object.  When
   the coding system has a pre-write function, the text is first
   copied to a work buffer and the function is called there.  */
5905 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5907 struct coding_system
*coding
;
5908 Lisp_Object src_object
;
5909 EMACS_INT from
, from_byte
, to
, to_byte
;
5910 Lisp_Object dst_object
;
5912 int count
= specpdl_ptr
- specpdl
;
5913 EMACS_INT chars
= to
- from
;
5914 EMACS_INT bytes
= to_byte
- from_byte
;
5917 saved_coding
= coding
;
5918 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5920 coding
->src_object
= src_object
;
5921 coding
->src_chars
= chars
;
5922 coding
->src_bytes
= bytes
;
5923 coding
->src_multibyte
= chars
< bytes
;
5925 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Pre-write conversion: copy the source into a work buffer, call
   the function on it, and encode the whole work buffer instead.  */
5927 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5931 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
5932 set_buffer_internal (XBUFFER (coding
->src_object
));
5933 if (STRINGP (src_object
))
5934 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
5935 else if (BUFFERP (src_object
))
5936 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
5938 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
/* Encoding in place: remove the original text from the source
   buffer, then return to the work buffer.  */
5940 if (EQ (src_object
, dst_object
))
5942 set_buffer_internal (XBUFFER (src_object
));
5943 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5944 set_buffer_internal (XBUFFER (coding
->src_object
));
5947 val
= call2 (CODING_ATTR_PRE_WRITE (attrs
),
5948 make_number (1), make_number (chars
));
/* The function may have changed the text; take sizes from the
   buffer.  */
5951 move_gap_both (BEG
, BEG_BYTE
);
5952 coding
->src_chars
= Z
- BEG
;
5953 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
5954 coding
->src_pos
= BEG
;
5955 coding
->src_pos_byte
= BEG_BYTE
;
5956 coding
->src_multibyte
= Z
< Z_BYTE
;
5958 else if (STRINGP (src_object
))
5960 coding
->src_pos
= from
;
5961 coding
->src_pos_byte
= from_byte
;
5963 else if (BUFFERP (src_object
))
5965 set_buffer_internal (XBUFFER (src_object
));
5967 move_gap_both (from
, from_byte
);
5968 if (EQ (src_object
, dst_object
))
5970 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5971 coding
->src_pos
= -chars
;
5972 coding
->src_pos_byte
= -bytes
;
5976 coding
->src_pos
= from
;
5977 coding
->src_pos_byte
= from_byte
;
/* Choose the destination.  */
5981 if (BUFFERP (dst_object
))
5983 coding
->dst_object
= dst_object
;
5984 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5985 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5986 coding
->dst_multibyte
5987 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
/* String result: encode into freshly allocated memory whose initial
   size is the source character count.  */
5989 else if (EQ (dst_object
, Qt
))
5991 coding
->dst_object
= Qnil
;
5992 coding
->destination
= (unsigned char *) xmalloc (coding
->src_chars
);
5993 coding
->dst_bytes
= coding
->src_chars
;
5994 coding
->dst_multibyte
= 0;
5998 coding
->dst_object
= Qnil
;
5999 coding
->dst_multibyte
= 0;
6002 encode_coding (coding
);
6004 if (EQ (dst_object
, Qt
))
6006 if (BUFFERP (coding
->dst_object
))
6007 coding
->dst_object
= Fbuffer_string ();
6011 = make_unibyte_string ((char *) coding
->destination
,
6013 xfree (coding
->destination
);
6017 unbind_to (count
, Qnil
);
/* Return the name of the coding system assigned to the category that
   comes first in coding_priorities.  */
6022 preferred_coding_system ()
6024 int id
= coding_categories
[coding_priorities
[0]].id
;
6026 return CODING_ID_NAME (id
);
6031 /*** 8. Emacs Lisp library functions ***/
/* Lisp predicate: Qt when OBJ is nil or satisfies CODING_SYSTEM_P,
   Qnil otherwise.  */
6033 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6034 doc
: /* Return t if OBJECT is nil or a coding-system.
6035 See the documentation of `define-coding-system' for information
6036 about coding-system objects. */)
6040 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
/* Prompt with completion over Vcoding_system_alist, repeating while
   the answer is the empty string, and return the answer interned as a
   symbol.  */
6043 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6044 Sread_non_nil_coding_system
, 1, 1, 0,
6045 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6052 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6053 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6055 while (XSTRING (val
)->size
== 0);
6056 return (Fintern (val
, Qnil
));
/* Prompt once with completion over Vcoding_system_alist.  A symbol
   given as DEFAULT-CODING-SYSTEM is converted to its name string
   before being handed to Fcompleting_read.  An empty answer yields
   Qnil, anything else the interned symbol.  */
6059 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6060 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6061 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6062 (prompt
, default_coding_system
)
6063 Lisp_Object prompt
, default_coding_system
;
6066 if (SYMBOLP (default_coding_system
))
6067 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6068 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6069 Qt
, Qnil
, Qcoding_system_history
,
6070 default_coding_system
, Qnil
);
6071 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
/* Return CODING_SYSTEM when it is a symbol accepted by
   Fcoding_system_p; the code also contains a signal of
   Qcoding_system_error carrying the offending value.  A non-symbol
   argument is rejected by CHECK_SYMBOL.  */
6074 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6076 doc
: /* Check validity of CODING-SYSTEM.
6077 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6078 It is valid if it is a symbol with a non-nil `coding-system' property.
6079 The value of property should be a vector of length 5. */)
6081 Lisp_Object coding_system
;
6083 CHECK_SYMBOL (coding_system
);
6084 if (!NILP (Fcoding_system_p (coding_system
)))
6085 return coding_system
;
6087 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
/* Detect the coding systems that the SRC_BYTES bytes at SRC may be
   encoded in.  The result is a list of coding-system names in
   priority order, or only its first element when HIGHEST is nonzero.
   CODING_SYSTEM, defaulting to `undecided', restricts the search to
   its own category unless that category is `undecided'.  Where a
   candidate's end-of-line type is undecided, the detected
   end-of-line kind selects one of its subsidiaries.  */
6092 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6094 int src_bytes
, highest
;
6096 Lisp_Object coding_system
;
6098 unsigned char *src_end
= src
+ src_bytes
;
6099 int mask
= CATEGORY_MASK_ANY
;
6102 Lisp_Object attrs
, eol_type
;
6104 struct coding_system coding
;
6106 if (NILP (coding_system
))
6107 coding_system
= Qundecided
;
6108 setup_coding_system (coding_system
, &coding
);
6109 attrs
= CODING_ID_ATTRS (coding
.id
);
6110 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
/* Describe the text to the detectors through a local coding
   structure.  */
6112 coding
.source
= src
;
6113 coding
.src_bytes
= src_bytes
;
6114 coding
.src_multibyte
= multibytep
;
6115 coding
.consumed
= 0;
/* A decided category limits MASK to that single category.  */
6117 if (XINT (CODING_ATTR_CATEGORY (attrs
)) != coding_category_undecided
)
6119 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6123 coding_system
= Qnil
;
6124 for (; src
< src_end
; src
++)
6127 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
6129 || c
== ISO_CODE_SO
)))
6132 coding
.head_ascii
= src
- coding
.source
;
6135 for (i
= 0; i
< coding_category_raw_text
; i
++)
6137 enum coding_category category
= coding_priorities
[i
];
6138 struct coding_system
*this = coding_categories
+ category
;
6140 if (category
>= coding_category_raw_text
6141 || detected
& (1 << category
))
6146 /* No coding system of this category is defined. */
6147 mask
&= ~(1 << category
);
6151 detected
|= detected_mask
[category
];
6152 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6155 mask
&= detected_mask
[category
];
/* Turn MASK into a list of category numbers.  */
6163 val
= Fcons (make_number (coding_category_raw_text
), Qnil
);
6164 else if (mask
== CATEGORY_MASK_ANY
)
6165 val
= Fcons (make_number (coding_category_undecided
), Qnil
);
6168 for (i
= 0; i
< coding_category_raw_text
; i
++)
6169 if (mask
& (1 << coding_priorities
[i
]))
6171 val
= Fcons (make_number (coding_priorities
[i
]), Qnil
);
/* Consing from the lowest priority upwards leaves the list in
   priority order.  */
6178 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6179 if (mask
& (1 << coding_priorities
[i
]))
6180 val
= Fcons (make_number (coding_priorities
[i
]), val
);
/* -1 means "not computed yet"; each kind of end-of-line scan is run
   at most once and its result reused.  */
6184 int one_byte_eol
= -1, two_byte_eol
= -1;
6187 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6189 struct coding_system
*this
6190 = (NILP (coding_system
) ? coding_categories
+ XINT (XCAR (tail
))
6194 attrs
= CODING_ID_ATTRS (this->id
);
6195 eol_type
= CODING_ID_EOL_TYPE (this->id
);
/* Replace the category number by the coding system's name.  */
6196 XSETCAR (tail
, CODING_ID_NAME (this->id
));
6197 if (VECTORP (eol_type
))
6199 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_16
))
6201 if (two_byte_eol
< 0)
6202 two_byte_eol
= detect_eol (this, coding
.source
, src_bytes
);
6203 this_eol
= two_byte_eol
;
6207 if (one_byte_eol
< 0)
6208 one_byte_eol
=detect_eol (this, coding
.source
, src_bytes
);
6209 this_eol
= one_byte_eol
;
/* Substitute the subsidiary for the detected end-of-line kind.  */
6211 if (this_eol
== EOL_SEEN_LF
)
6212 XSETCAR (tail
, AREF (eol_type
, 0));
6213 else if (this_eol
== EOL_SEEN_CRLF
)
6214 XSETCAR (tail
, AREF (eol_type
, 1));
6215 else if (this_eol
== EOL_SEEN_CR
)
6216 XSETCAR (tail
, AREF (eol_type
, 2));
6221 return (highest
? XCAR (val
) : val
);
/* Lisp entry point: validate START and END, convert them to byte
   positions, move the gap out of the region when the region contains
   it, and run detect_coding_system on the region's bytes.  */
6225 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6227 doc
: /* Detect coding system of the text in the region between START and END.
6228 Return a list of possible coding systems ordered by priority.
6230 If only ASCII characters are found, it returns a list of single element
6231 `undecided' or its subsidiary coding system according to a detected
6234 If optional argument HIGHEST is non-nil, return the coding system of
6235 highest priority. */)
6236 (start
, end
, highest
)
6237 Lisp_Object start
, end
, highest
;
6240 int from_byte
, to_byte
;
6242 CHECK_NUMBER_COERCE_MARKER (start
);
6243 CHECK_NUMBER_COERCE_MARKER (end
);
6245 validate_region (&start
, &end
);
6246 from
= XINT (start
), to
= XINT (end
);
6247 from_byte
= CHAR_TO_BYTE (from
);
6248 to_byte
= CHAR_TO_BYTE (to
);
/* The detector needs contiguous bytes: move the gap to the end of
   the region when the region spans it.  */
6250 if (from
< GPT
&& to
>= GPT
)
6251 move_gap_both (to
, to_byte
);
6253 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6254 to_byte
- from_byte
,
6256 !NILP (current_buffer
6257 ->enable_multibyte_characters
),
/* Lisp primitive `detect-coding-string'.  Checks that STRING is a
   string and runs detect_coding_system over its STRING_BYTES bytes,
   passing along whether the string is multibyte.  */
6261 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6263 doc
: /* Detect coding system of the text in STRING.
6264 Return a list of possible coding systems ordered by priority.
6266 If only ASCII characters are found, it returns a list of single element
6267 `undecided' or its subsidiary coding system according to a detected
6270 If optional argument HIGHEST is non-nil, return the coding system of
6271 highest priority. */)
6273 Lisp_Object string
, highest
;
6275 CHECK_STRING (string
);
6277 return detect_coding_system (XSTRING (string
)->data
,
6278 STRING_BYTES (XSTRING (string
)),
6280 STRING_MULTIBYTE (string
),
/* Test whether character C belongs to one of the charsets listed in
   the coding-system attribute vector ATTRS.  Walks
   CODING_ATTR_CHARSET_LIST (ATTRS) and checks C against each charset
   with CHAR_CHARSET_P.  The result is nonzero when the walk ended with
   TAIL still non-nil, i.e. before the list was exhausted.
   NOTE(review): the statement that leaves the loop on a match is not
   visible in this excerpt.  */
6286 char_encodable_p (c
, attrs
)
6291 struct charset
*charset
;
6293 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6294 CONSP (tail
); tail
= XCDR (tail
))
/* List elements are charset ids; map each to its struct charset.  */
6296 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6297 if (CHAR_CHARSET_P (c
, charset
))
6300 return (! NILP (tail
));
6304 /* Return a list of coding systems that safely encode the text between
6305 START and END. If EXCLUDE is non-nil, it is a list of coding
6306 systems not to check. The returned list doesn't contain any such
6307 coding systems. In any case, if the text contains only ASCII or is
6308 unibyte, return t. */
/* Lisp primitive `find-coding-systems-region-internal'.  START is
   either a string or a buffer position (with END).  The function
   collects the attribute vectors of the base coding systems from
   Vcoding_system_list, skips leading and trailing ASCII bytes of the
   text, then removes from that list every entry for which
   char_encodable_p rejects one of the remaining characters.  The names
   of the surviving coding systems are returned.  */
6310 DEFUN ("find-coding-systems-region-internal",
6311 Ffind_coding_systems_region_internal
,
6312 Sfind_coding_systems_region_internal
, 2, 3, 0,
6313 doc
: /* Internal use only. */)
6314 (start
, end
, exclude
)
6315 Lisp_Object start
, end
, exclude
;
6317 Lisp_Object coding_attrs_list
, safe_codings
;
6318 EMACS_INT start_byte
, end_byte
;
6319 unsigned char *p
, *pbeg
, *pend
;
6321 Lisp_Object tail
, elt
;
/* String source: the whole string is examined, so only END_BYTE is
   computed from the string's byte length.  */
6323 if (STRINGP (start
))
6325 if (!STRING_MULTIBYTE (start
)
6326 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6329 end_byte
= STRING_BYTES (XSTRING (start
));
/* Buffer source: START and END must satisfy BEG <= START <= END <= Z.  */
6333 CHECK_NUMBER_COERCE_MARKER (start
);
6334 CHECK_NUMBER_COERCE_MARKER (end
);
6335 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6336 args_out_of_range (start
, end
);
6337 if (NILP (current_buffer
->enable_multibyte_characters
))
6339 start_byte
= CHAR_TO_BYTE (XINT (start
));
6340 end_byte
= CHAR_TO_BYTE (XINT (end
));
/* Character count equal to byte count: every character occupies one
   byte.  */
6341 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* NOTE(review): START and END are Lisp_Objects; every other use in
   this function unwraps them with XINT.  The raw comparisons with GPT
   and the raw arguments to move_gap_both below look like missing XINT
   calls -- confirm.  */
6344 if (start
< GPT
&& end
> GPT
)
6346 if ((GPT
- start
) < (end
- GPT
))
6347 move_gap_both (start
, start_byte
);
6349 move_gap_both (end
, end_byte
);
/* Gather candidate attribute vectors.  Only a symbol equal to its own
   base name whose type is not `undecided' is consed on; EXCLUDE is
   consulted with Fmemq.  */
6353 coding_attrs_list
= Qnil
;
6354 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6356 || NILP (Fmemq (XCAR (tail
), exclude
)))
6360 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
6361 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6362 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6363 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6366 if (STRINGP (start
))
6367 p
= pbeg
= XSTRING (start
)->data
;
6369 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6370 pend
= p
+ (end_byte
- start_byte
);
/* Trim ASCII bytes from both ends of the text.  */
6372 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6373 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6377 if (ASCII_BYTE_P (*p
))
6381 c
= STRING_CHAR_ADVANCE (p
);
6383 charset_map_loaded
= 0;
/* Filter the candidate list in place.  An entry that cannot encode C
   is overwritten by its successor (car and cdr copied from the next
   cell); when there is no successor its car is set to nil.  */
6384 for (tail
= coding_attrs_list
; CONSP (tail
);)
6389 else if (char_encodable_p (c
, elt
))
6391 else if (CONSP (XCDR (tail
)))
6393 XSETCAR (tail
, XCAR (XCDR (tail
)));
6394 XSETCDR (tail
, XCDR (XCDR (tail
)));
6398 XSETCAR (tail
, Qnil
);
/* The flag was cleared before the checks; when it is set again the
   scan pointers are rebuilt from their offsets -- presumably because
   loading a charset map may relocate the text; confirm.  */
6402 if (charset_map_loaded
)
6404 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6406 if (STRINGP (start
))
6407 pbeg
= XSTRING (start
)->data
;
6409 pbeg
= BYTE_POS_ADDR (start_byte
);
6410 p
= pbeg
+ p_offset
;
6411 pend
= pbeg
+ pend_offset
;
/* Collect the base names of the entries that were not nil'ed out.  */
6416 safe_codings
= Qnil
;
6417 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6418 if (! NILP (XCAR (tail
)))
6419 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6421 return safe_codings
;
/* Lisp primitive `check-coding-systems-region'.  For each coding
   system in CODING-SYSTEM-LIST an entry (NAME ATTRS) is built; the
   text is then scanned after skipping ASCII at both ends, and for
   every character rejected by char_encodable_p the current position
   POS is pushed onto that entry.  Entries that collected positions
   are turned into (NAME POS0 POS1 ...) with the positions reversed
   into ascending order.  */
6425 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6426 Scheck_coding_systems_region
, 3, 3, 0,
6427 doc
: /* Check if the region is encodable by coding systems.
6429 START and END are buffer positions specifying the region.
6430 CODING-SYSTEM-LIST is a list of coding systems to check.
6432 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6433 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
6434 whole region, POS0, POS1, ... are buffer positions where non-encodable
6435 characters are found.
6437 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6440 START may be a string. In that case, check if the string is
6441 encodable, and the value contains indices to the string instead of
6442 buffer positions. END is ignored. */)
6443 (start
, end
, coding_system_list
)
6444 Lisp_Object start
, end
, coding_system_list
;
6447 EMACS_INT start_byte
, end_byte
;
6449 unsigned char *p
, *pbeg
, *pend
;
6451 Lisp_Object tail
, elt
;
/* String source: the whole string is examined.  */
6453 if (STRINGP (start
))
6455 if (!STRING_MULTIBYTE (start
)
6456 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6459 end_byte
= STRING_BYTES (XSTRING (start
));
/* Buffer source: START and END must satisfy BEG <= START <= END <= Z.  */
6464 CHECK_NUMBER_COERCE_MARKER (start
);
6465 CHECK_NUMBER_COERCE_MARKER (end
);
6466 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6467 args_out_of_range (start
, end
);
6468 if (NILP (current_buffer
->enable_multibyte_characters
))
6470 start_byte
= CHAR_TO_BYTE (XINT (start
));
6471 end_byte
= CHAR_TO_BYTE (XINT (end
));
/* Character count equal to byte count: every character occupies one
   byte.  */
6472 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* NOTE(review): START and END are Lisp_Objects and are unwrapped with
   XINT everywhere else in this function; the raw comparisons with GPT
   and raw arguments to move_gap_both look like missing XINT calls --
   confirm.  */
6475 if (start
< GPT
&& end
> GPT
)
6477 if ((GPT
- start
) < (end
- GPT
))
6478 move_gap_both (start
, start_byte
);
6480 move_gap_both (end
, end_byte
);
/* Build one working entry per requested coding system; each entry
   starts with the name followed by its attribute vector.  */
6486 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6489 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6494 if (STRINGP (start
))
6495 p
= pbeg
= XSTRING (start
)->data
;
6497 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6498 pend
= p
+ (end_byte
- start_byte
);
/* Skip ASCII at both ends; POS advances with P over the leading run so
   that it keeps naming the position of the byte at P.  */
6500 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6501 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6505 if (ASCII_BYTE_P (*p
))
6509 c
= STRING_CHAR_ADVANCE (p
);
6511 charset_map_loaded
= 0;
/* Record POS on every entry whose attributes cannot encode C.  ELT is
   the (ATTRS POS...) tail of the entry, so the new position is spliced
   in right after ATTRS.  */
6512 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6514 elt
= XCDR (XCAR (tail
));
6515 if (! char_encodable_p (c
, XCAR (elt
)))
6516 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* The flag was cleared before the checks; when it is set again the
   scan pointers are rebuilt from their offsets -- presumably because
   loading a charset map may relocate the text; confirm.  */
6518 if (charset_map_loaded
)
6520 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6522 if (STRINGP (start
))
6523 pbeg
= XSTRING (start
)->data
;
6525 pbeg
= BYTE_POS_ADDR (start_byte
);
6526 p
= pbeg
+ p_offset
;
6527 pend
= pbeg
+ pend_offset
;
/* Keep only the entries that collected at least one position, dropping
   ATTRS and reversing the positions.  */
6535 for (; CONSP (tail
); tail
= XCDR (tail
))
6538 if (CONSP (XCDR (XCDR (elt
))))
6539 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
/* Encode or decode the text between START and END of the current
   buffer with CODING_SYSTEM (nil means `no-conversion').  DST_OBJECT
   nil means the current buffer; otherwise it must be t or a buffer.
   Signals an error when the conversion result is not
   CODING_RESULT_SUCCESS.  Returns the number of produced characters
   when DST_OBJECT is a buffer, otherwise coding.dst_object.
   NOTE(review): the tests on ENCODEP and NORECORD that select between
   the two conversion calls and guard the Vlast_coding_system_used
   assignment are not visible in this excerpt.  */
6549 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6550 Lisp_Object start
, end
, coding_system
, dst_object
;
6551 int encodep
, norecord
;
6553 struct coding_system coding
;
6554 EMACS_INT from
, from_byte
, to
, to_byte
;
6555 Lisp_Object src_object
;
6557 CHECK_NUMBER_COERCE_MARKER (start
);
6558 CHECK_NUMBER_COERCE_MARKER (end
);
6559 if (NILP (coding_system
))
6560 coding_system
= Qno_conversion
;
6562 CHECK_CODING_SYSTEM (coding_system
);
/* The source is always the current buffer.  */
6563 src_object
= Fcurrent_buffer ();
6564 if (NILP (dst_object
))
6565 dst_object
= src_object
;
6566 else if (! EQ (dst_object
, Qt
))
6567 CHECK_BUFFER (dst_object
);
6569 validate_region (&start
, &end
);
6570 from
= XFASTINT (start
);
6571 from_byte
= CHAR_TO_BYTE (from
);
6572 to
= XFASTINT (end
);
6573 to_byte
= CHAR_TO_BYTE (to
);
6575 setup_coding_system (coding_system
, &coding
);
/* The whole region is converted in one call, so it is the last block.  */
6576 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6579 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6582 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6585 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6587 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6588 error ("Code conversion error: %d", coding
.result
);
6590 return (BUFFERP (dst_object
)
6591 ? make_number (coding
.produced_char
)
6592 : coding
.dst_object
);
/* Lisp command `decode-coding-region': thin wrapper that calls
   code_convert_region with ENCODEP = 0 and NORECORD = 0.  */
6596 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6597 3, 4, "r\nzCoding system: ",
6598 doc
: /* Decode the current region from the specified coding system.
6599 When called from a program, takes four arguments:
6600 START, END, CODING-SYSTEM, and DESTINATION.
6601 START and END are buffer positions.
6603 Optional 4th argument DESTINATION specifies where the decoded text goes.
6604 If nil, the region between START and END is replaced by the decoded text.
6605 If buffer, the decoded text is inserted in the buffer.
6606 If t, the decoded text is returned.
6608 This function sets `last-coding-system-used' to the precise coding system
6609 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6610 not fully specified.)
6611 It returns the length of the decoded text. */)
6612 (start
, end
, coding_system
, destination
)
6613 Lisp_Object start
, end
, coding_system
, destination
;
6615 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
/* Lisp command `encode-coding-region': thin wrapper that calls
   code_convert_region with ENCODEP = 1 and NORECORD = 0.  */
6618 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6619 3, 4, "r\nzCoding system: ",
6620 doc
: /* Encode the current region by specified coding system.
6621 When called from a program, takes three arguments:
6622 START, END, and CODING-SYSTEM. START and END are buffer positions.
6624 Optional 4th argument DESTINATION specifies where the encoded text goes.
6625 If nil, the region between START and END is replaced by the encoded text.
6626 If buffer, the encoded text is inserted in the buffer.
6627 If t, the encoded text is returned.
6629 This function sets `last-coding-system-used' to the precise coding system
6630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6631 not fully specified.)
6632 It returns the length of the encoded text. */)
6633 (start
, end
, coding_system
, destination
)
6634 Lisp_Object start
, end
, coding_system
, destination
;
6636 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
/* Encode or decode STRING with CODING_SYSTEM.  DST_OBJECT is nil, t or
   a buffer.  The whole string (all its characters and bytes) is
   converted as one last block.  Signals an error when the result is
   not CODING_RESULT_SUCCESS.  Returns the number of produced
   characters when DST_OBJECT is a buffer, otherwise coding.dst_object.
   NOTE(review): the tests on ENCODEP and NORECORD are on lines not
   visible in this excerpt.  */
6640 code_convert_string (string
, coding_system
, dst_object
,
6641 encodep
, nocopy
, norecord
)
6642 Lisp_Object string
, coding_system
, dst_object
;
6643 int encodep
, nocopy
, norecord
;
6645 struct coding_system coding
;
6646 EMACS_INT chars
, bytes
;
6648 CHECK_STRING (string
);
/* Trivial case: no coding system and no destination object.  */
6649 if (NILP (coding_system
))
6652 Vlast_coding_system_used
= Qno_conversion
;
6653 if (NILP (dst_object
))
/* NOTE(review): this returns a fresh copy when NOCOPY is set and
   STRING itself otherwise.  The docstrings of `decode-coding-string'
   and `encode-coding-string' say NOCOPY non-nil means it is OK to
   return STRING itself, so the two arms look swapped -- confirm.  */
6654 return (nocopy
? Fcopy_sequence (string
) : string
);
6657 if (NILP (coding_system
))
6658 coding_system
= Qno_conversion
;
6660 CHECK_CODING_SYSTEM (coding_system
);
6661 if (NILP (dst_object
))
6663 else if (! EQ (dst_object
, Qt
))
6664 CHECK_BUFFER (dst_object
);
6666 setup_coding_system (coding_system
, &coding
);
6667 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6668 chars
= XSTRING (string
)->size
;
6669 bytes
= STRING_BYTES (XSTRING (string
));
6671 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6673 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6675 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6677 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6678 error ("Code conversion error: %d", coding
.result
);
6680 return (BUFFERP (dst_object
)
6681 ? make_number (coding
.produced_char
)
6682 : coding
.dst_object
);
6686 /* Encode or decode STRING according to CODING_SYSTEM.
6687 Do not set Vlast_coding_system_used.
6689 This function is called only from macros DECODE_FILE and
6690 ENCODE_FILE, thus we ignore character composition.

   Implemented as a call to code_convert_string with DST_OBJECT = t,
   NOCOPY = 0 and NORECORD = 1.  */
6693 code_convert_string_norecord (string
, coding_system
, encodep
)
6694 Lisp_Object string
, coding_system
;
6697 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
/* Lisp primitive `decode-coding-string': wrapper around
   code_convert_string with ENCODEP = 0, NOCOPY converted to a C
   boolean, and NORECORD = 0.  */
6701 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6703 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6705 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6706 if the decoding operation is trivial.
6708 Optional fourth arg BUFFER non-nil means that the decoded text is
6709 inserted in BUFFER instead of returned as a string. In this case,
6710 the return value is BUFFER.
6712 This function sets `last-coding-system-used' to the precise coding system
6713 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6714 not fully specified.) */)
6715 (string
, coding_system
, nocopy
, buffer
)
6716 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6718 return code_convert_string (string
, coding_system
, buffer
,
6719 0, ! NILP (nocopy
), 0);
/* Lisp primitive `encode-coding-string': wrapper around
   code_convert_string with ENCODEP = 1 and NOCOPY converted to a C
   boolean.  */
6722 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6724 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6726 Optional third arg NOCOPY non-nil means it is OK to return STRING
6727 itself if the encoding operation is trivial.
6729 Optional fourth arg BUFFER non-nil means that the encoded text is
6730 inserted in BUFFER instead of returned as a string. In this case,
6731 the return value is BUFFER.
6733 This function sets `last-coding-system-used' to the precise coding system
6734 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6735 not fully specified.) */)
6736 (string
, coding_system
, nocopy
, buffer
)
6737 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* ENCODEP is a C int flag, so pass the constant 1 as the sibling
   `encode-coding-region' does.  The Lisp object NOCOPY was passed here
   before, which selected encoding only through its raw representation.
   NOTE(review): NORECORD is 1 here but 0 in `decode-coding-string',
   and code_convert_string_norecord passes 1 to avoid setting
   Vlast_coding_system_used; that appears to contradict the docstring
   above -- confirm before changing.  */
6739 return code_convert_string (string
, coding_system
, buffer
,
6740 1, ! NILP (nocopy
), 1);
/* Lisp primitive `decode-sjis-char'.  Takes the charset list of
   Vsjis_coding_system (roman, kanji, kana, in that order), selects a
   charset from the numeric range of CODE, and decodes the code point
   with DECODE_CHAR.  */
6744 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6745 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6746 Return the corresponding character. */)
6750 Lisp_Object spec
, attrs
, val
;
6751 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6754 CHECK_NATNUM (code
);
6755 c
= XFASTINT (code
);
6756 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6757 attrs
= AREF (spec
, 0);
6759 if (ASCII_BYTE_P (c
)
6760 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* The charset list is consumed positionally: first roman, then kanji,
   then kana.  */
6763 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6764 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6765 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6766 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6769 charset
= charset_roman
;
/* NOTE(review): the upper bound is exclusive, so 0xDF is not treated
   as kana here -- confirm against the Shift_JIS single-byte range.  */
6770 else if (c
>= 0xA0 && c
< 0xDF)
6772 charset
= charset_kana
;
/* NOTE(review): S2 is masked with 0x7F, so it can never exceed 0x7F;
   the `s2 > 0xFC' test below is then dead and trail bytes >= 0x80 are
   folded into the low range.  A mask of 0xFF looks intended --
   confirm.  */
6777 int s1
= c
>> 8, s2
= c
& 0x7F;
6779 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
6780 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
/* NOTE(review): CODE is the Lisp object (it is unwrapped with XFASTINT
   above), yet it is passed for a %d conversion -- confirm.  */
6781 error ("Invalid code: %d", code
);
6783 charset
= charset_kanji
;
6785 c
= DECODE_CHAR (charset
, c
);
6787 error ("Invalid code: %d", code
);
6788 return make_number (c
);
/* Lisp primitive `encode-sjis-char'.  Looks the character up in the
   charset list of Vsjis_coding_system with char_charset and returns
   the resulting code; signals an error when char_charset yields the
   charset's invalid code.  */
6792 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
6793 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
6794 Return the corresponding code in SJIS. */)
6798 Lisp_Object spec
, attrs
, charset_list
;
6800 struct charset
*charset
;
6803 CHECK_CHARACTER (ch
);
6805 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6806 attrs
= AREF (spec
, 0);
6808 if (ASCII_CHAR_P (c
)
6809 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6812 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* char_charset returns the charset and stores the code point in CODE.  */
6813 charset
= char_charset (c
, charset_list
, &code
);
6814 if (code
== CHARSET_INVALID_CODE (charset
))
6815 error ("Can't encode by shift_jis encoding: %d", c
);
6818 return make_number (code
);
/* Lisp primitive `decode-big5-char'.  Takes the charset list of
   Vbig5_coding_system (roman first, then big5), selects a charset from
   the numeric range of CODE, and decodes the code point with
   DECODE_CHAR.  */
6821 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
6822 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
6823 Return the corresponding character. */)
6827 Lisp_Object spec
, attrs
, val
;
6828 struct charset
*charset_roman
, *charset_big5
, *charset
;
6831 CHECK_NATNUM (code
);
6832 c
= XFASTINT (code
);
6833 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6834 attrs
= AREF (spec
, 0);
6836 if (ASCII_BYTE_P (c
)
6837 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
/* The charset list is consumed positionally: roman, then big5.  */
6840 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6841 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6842 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6845 charset
= charset_roman
;
/* NOTE(review): B2 is masked with 0x7F, so the `b2 > 0x7E && b2 < 0xA1'
   and `b2 > 0xFE' tests below cannot match as written and trail bytes
   >= 0xA1 are folded into the low range.  A mask of 0xFF looks
   intended -- confirm.  */
6848 int b1
= c
>> 8, b2
= c
& 0x7F;
6849 if (b1
< 0xA1 || b1
> 0xFE
6850 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
/* NOTE(review): CODE is the Lisp object (it is unwrapped with XFASTINT
   above), yet it is passed for a %d conversion -- confirm.  */
6851 error ("Invalid code: %d", code
);
6852 charset
= charset_big5
;
6854 c
= DECODE_CHAR (charset
, (unsigned )c
);
6856 error ("Invalid code: %d", code
);
6857 return make_number (c
);
/* Lisp primitive `encode-big5-char'.  Looks the character up in the
   charset list of Vbig5_coding_system with char_charset and returns
   the resulting code; signals an error when char_charset yields the
   charset's invalid code.  */
6860 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
6861 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
6862 Return the corresponding character code in Big5. */)
6866 Lisp_Object spec
, attrs
, charset_list
;
6867 struct charset
*charset
;
6871 CHECK_CHARACTER (ch
);
6873 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6874 attrs
= AREF (spec
, 0);
6875 if (ASCII_CHAR_P (c
)
6876 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6879 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* char_charset returns the charset and stores the code point in CODE.  */
6880 charset
= char_charset (c
, charset_list
, &code
);
6881 if (code
== CHARSET_INVALID_CODE (charset
))
6882 error ("Can't encode by Big5 encoding: %d", c
);
6884 return make_number (code
);
/* Lisp primitive `set-terminal-coding-system-internal'.  Sets up
   terminal_coding from CODING-SYSTEM and then adjusts it: safe
   encoding on, composition annotation off, multibyte source and
   unibyte destination.  */
6888 DEFUN ("set-terminal-coding-system-internal",
6889 Fset_terminal_coding_system_internal
,
6890 Sset_terminal_coding_system_internal
, 1, 1, 0,
6891 doc
: /* Internal use only. */)
6894 CHECK_SYMBOL (coding_system
);
6895 setup_coding_system (Fcheck_coding_system (coding_system
),
6898 /* We had better not send unsafe characters to terminal. */
6899 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
6900 /* Character composition should be disabled. */
6901 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6902 terminal_coding
.src_multibyte
= 1;
6903 terminal_coding
.dst_multibyte
= 0;
/* Lisp primitive `set-safe-terminal-coding-system-internal'.  Sets up
   safe_terminal_coding from CODING-SYSTEM, then turns composition
   annotation off and marks the source multibyte and the destination
   unibyte.  */
6907 DEFUN ("set-safe-terminal-coding-system-internal",
6908 Fset_safe_terminal_coding_system_internal
,
6909 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
6910 doc
: /* Internal use only. */)
6913 CHECK_SYMBOL (coding_system
);
6914 setup_coding_system (Fcheck_coding_system (coding_system
),
6915 &safe_terminal_coding
);
6916 /* Character composition should be disabled. */
6917 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6918 safe_terminal_coding
.src_multibyte
= 1;
6919 safe_terminal_coding
.dst_multibyte
= 0;
/* Lisp primitive `terminal-coding-system': returns the name recorded
   for terminal_coding's coding-system id.  */
6923 DEFUN ("terminal-coding-system",
6924 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
6925 doc
: /* Return coding system specified for terminal output. */)
6928 return CODING_ID_NAME (terminal_coding
.id
);
/* Lisp primitive `set-keyboard-coding-system-internal'.  Sets up a
   coding structure from CODING-SYSTEM and turns composition annotation
   off in keyboard_coding.  */
6931 DEFUN ("set-keyboard-coding-system-internal",
6932 Fset_keyboard_coding_system_internal
,
6933 Sset_keyboard_coding_system_internal
, 1, 1, 0,
6934 doc
: /* Internal use only. */)
6936 Lisp_Object coding_system
;
6938 CHECK_SYMBOL (coding_system
);
6939 setup_coding_system (Fcheck_coding_system (coding_system
),
6941 /* Character composition should be disabled. */
6942 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Lisp primitive `keyboard-coding-system': returns the name recorded
   for keyboard_coding's coding-system id.  */
6946 DEFUN ("keyboard-coding-system",
6947 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
6948 doc
: /* Return coding system specified for decoding keyboard input. */)
6951 return CODING_ID_NAME (keyboard_coding
.id
);
/* Lisp primitive `find-operation-coding-system'.  ARGS[0] is the
   operation symbol; its Qtarget_idx property gives the index of the
   TARGET argument.  The alist matching the operation is scanned for an
   entry whose key matches TARGET (a regexp match for strings, EQ for
   integers), and the entry's value is turned into a cons of coding
   systems.  */
6955 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
6956 Sfind_operation_coding_system
, 1, MANY
, 0,
6957 doc
: /* Choose a coding system for an operation based on the target name.
6958 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6959 DECODING-SYSTEM is the coding system to use for decoding
6960 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6961 for encoding (in case OPERATION does encoding).
6963 The first argument OPERATION specifies an I/O primitive:
6964 For file I/O, `insert-file-contents' or `write-region'.
6965 For process I/O, `call-process', `call-process-region', or `start-process'.
6966 For network I/O, `open-network-stream'.
6968 The remaining arguments should be the same arguments that were passed
6969 to the primitive. Depending on which primitive, one of those arguments
6970 is selected as the TARGET. For example, if OPERATION does file I/O,
6971 whichever argument specifies the file name is TARGET.
6973 TARGET has a meaning which depends on OPERATION:
6974 For file I/O, TARGET is a file name.
6975 For process I/O, TARGET is a process name.
6976 For network I/O, TARGET is a service name or a port number
6978 This function looks up what is specified for TARGET in
6979 `file-coding-system-alist', `process-coding-system-alist',
6980 or `network-coding-system-alist' depending on OPERATION.
6981 They may specify a coding system, a cons of coding systems,
6982 or a function symbol to call.
6983 In the last case, we call the function with one argument,
6984 which is a list of all the arguments given to this function.
6986 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
6991 Lisp_Object operation
, target_idx
, target
, val
;
6992 register Lisp_Object chain
;
6995 error ("Too few arguments");
6996 operation
= args
[0];
/* The operation must be a symbol carrying an integer Qtarget_idx
   property.  */
6997 if (!SYMBOLP (operation
)
6998 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
6999 error ("Invalid first arguement");
/* NOTE(review): this check admits NARGS == 1 + target_idx, but the
   read below indexes ARGS[target_idx + 1], which needs
   NARGS >= target_idx + 2.  Looks like an off-by-one (`<=' intended)
   -- confirm.  */
7000 if (nargs
< 1 + XINT (target_idx
))
7001 error ("Too few arguments for operation: %s",
7002 XSYMBOL (operation
)->name
->data
);
7003 target
= args
[XINT (target_idx
) + 1];
/* TARGET must be a string; an integer is accepted only for
   `open-network-stream'.  */
7004 if (!(STRINGP (target
)
7005 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7006 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Choose the alist: file operations, network streams, or (the default)
   processes.  */
7008 chain
= ((EQ (operation
, Qinsert_file_contents
)
7009 || EQ (operation
, Qwrite_region
))
7010 ? Vfile_coding_system_alist
7011 : (EQ (operation
, Qopen_network_stream
)
7012 ? Vnetwork_coding_system_alist
7013 : Vprocess_coding_system_alist
));
7017 for (; CONSP (chain
); chain
= XCDR (chain
))
7023 && ((STRINGP (target
)
7024 && STRINGP (XCAR (elt
))
7025 && fast_string_match (XCAR (elt
), target
) >= 0)
7026 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7029 /* Here, if VAL is both a valid coding system and a valid
7030 function symbol, we return VAL as a coding system. */
7033 if (! SYMBOLP (val
))
7035 if (! NILP (Fcoding_system_p (val
)))
7036 return Fcons (val
, val
);
/* Otherwise a bound function symbol is called with the full argument
   list, and its result is accepted when it is itself a coding
   system.  */
7037 if (! NILP (Ffboundp (val
)))
7039 val
= call1 (val
, Flist (nargs
, args
));
7042 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7043 return Fcons (val
, val
);
/* Lisp primitive `set-coding-system-priority'.  The categories of the
   argument coding systems are placed, in argument order, at the front
   of a new priority array; the remaining slots are filled with the
   categories not mentioned, in their previous relative order.  The
   result is copied over coding_priorities.  */
7051 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7052 Sset_coding_system_priority
, 1, MANY
, 0,
7053 doc
: /* Put higher priority to coding systems of the arguments. */)
/* CHANGED[cat] is nonzero once category CAT has been given a slot.  */
7059 int changed
[coding_category_max
];
7060 enum coding_category priorities
[coding_category_max
];
7062 bzero (changed
, sizeof changed
);
7064 for (i
= j
= 0; i
< nargs
; i
++)
7066 enum coding_category category
;
7067 Lisp_Object spec
, attrs
;
7069 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7070 attrs
= AREF (spec
, 0);
7071 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7072 if (changed
[category
])
7073 /* Ignore this coding system because a coding system of the
7074 same category already had a higher priority. */
7076 changed
[category
] = 1;
7077 priorities
[j
++] = category
;
/* When the category already has a coding system set up and it is not
   this one, set the category up again from this argument.  */
7078 if (coding_categories
[category
].id
>= 0
7079 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7080 setup_coding_system (args
[i
], &coding_categories
[category
]);
7083 /* Now we have decided top J priorities. Reflect the order of the
7084 original priorities to the remaining priorities. */
/* From here I indexes the new array and J scans the old one, skipping
   categories that were already placed.  */
7086 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7088 while (j
< coding_category_max
7089 && changed
[coding_priorities
[j
]])
7091 if (j
== coding_category_max
)
7093 priorities
[i
] = coding_priorities
[j
];
7096 bcopy (priorities
, coding_priorities
, sizeof priorities
);
/* Lisp primitive `coding-system-priority-list'.  Walks
   coding_priorities in order and looks at the coding system id
   installed for each category.  With HIGHESTP non-nil the base name of
   the first one reached is returned immediately; otherwise the base
   names are consed up and the list is reversed back into priority
   order.  */
7100 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7101 Scoding_system_priority_list
, 0, 1, 0,
7102 doc
: /* Return a list of coding systems ordered by their priorities. */)
7104 Lisp_Object highestp
;
7109 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7111 enum coding_category category
= coding_priorities
[i
];
7112 int id
= coding_categories
[category
].id
;
7117 attrs
= CODING_ID_ATTRS (id
);
7118 if (! NILP (highestp
))
7119 return CODING_ATTR_BASE_NAME (attrs
);
7120 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
/* VAL was built by pushing, so it is in reverse priority order.  */
7122 return Fnreverse (val
);
/* Return a three-element vector of symbols named after BASE with the
   suffixes "-unix", "-dos" and "-mac" appended, in that order.  */
7126 make_subsidiaries (base
)
7129 Lisp_Object subsidiaries
;
7130 char *suffixes
[] = { "-unix", "-dos", "-mac" };
7131 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
/* Six extra bytes hold the longest suffix ("-unix", five characters)
   plus the terminating NUL.  */
7132 char *buf
= (char *) alloca (base_name_len
+ 6);
7135 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7136 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7137 for (i
= 0; i
< 3; i
++)
/* Copy the suffix together with its NUL right after the base name,
   overwriting the previous suffix.  */
7139 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7140 ASET (subsidiaries
, i
, intern (buf
));
7142 return subsidiaries
;
7146 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7147 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7148 doc
: /* For internal use only. */)
7154 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7155 Lisp_Object attrs
; /* Vector of attributes. */
7156 Lisp_Object eol_type
;
7157 Lisp_Object aliases
;
7158 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7159 enum coding_category category
;
7160 Lisp_Object tail
, val
;
7161 int max_charset_id
= 0;
7164 if (nargs
< coding_arg_max
)
7167 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7169 name
= args
[coding_arg_name
];
7170 CHECK_SYMBOL (name
);
7171 CODING_ATTR_BASE_NAME (attrs
) = name
;
7173 val
= args
[coding_arg_mnemonic
];
7174 if (! STRINGP (val
))
7175 CHECK_CHARACTER (val
);
7176 CODING_ATTR_MNEMONIC (attrs
) = val
;
7178 coding_type
= args
[coding_arg_coding_type
];
7179 CHECK_SYMBOL (coding_type
);
7180 CODING_ATTR_TYPE (attrs
) = coding_type
;
7182 charset_list
= args
[coding_arg_charset_list
];
7183 if (SYMBOLP (charset_list
))
7185 if (EQ (charset_list
, Qiso_2022
))
7187 if (! EQ (coding_type
, Qiso_2022
))
7188 error ("Invalid charset-list");
7189 charset_list
= Viso_2022_charset_list
;
7191 else if (EQ (charset_list
, Qemacs_mule
))
7193 if (! EQ (coding_type
, Qemacs_mule
))
7194 error ("Invalid charset-list");
7195 charset_list
= Vemacs_mule_charset_list
;
7197 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7198 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7199 max_charset_id
= XFASTINT (XCAR (tail
));
7203 charset_list
= Fcopy_sequence (charset_list
);
7204 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7206 struct charset
*charset
;
7209 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7210 if (EQ (coding_type
, Qiso_2022
)
7211 ? CHARSET_ISO_FINAL (charset
) < 0
7212 : EQ (coding_type
, Qemacs_mule
)
7213 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7215 error ("Can't handle charset `%s'",
7216 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7218 XCAR (tail
) = make_number (charset
->id
);
7219 if (max_charset_id
< charset
->id
)
7220 max_charset_id
= charset
->id
;
7223 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7225 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7227 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7228 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7229 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7231 val
= args
[coding_arg_decode_translation_table
];
7233 CHECK_CHAR_TABLE (val
);
7234 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7236 val
= args
[coding_arg_encode_translation_table
];
7238 CHECK_CHAR_TABLE (val
);
7239 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7241 val
= args
[coding_arg_post_read_conversion
];
7243 CODING_ATTR_POST_READ (attrs
) = val
;
7245 val
= args
[coding_arg_pre_write_conversion
];
7247 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7249 val
= args
[coding_arg_default_char
];
7251 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7254 CHECK_CHARACTER (val
);
7255 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7258 val
= args
[coding_arg_plist
];
7260 CODING_ATTR_PLIST (attrs
) = val
;
7262 if (EQ (coding_type
, Qcharset
))
7264 val
= Fmake_vector (make_number (256), Qnil
);
7266 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7268 struct charset
*charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7270 for (i
= charset
->code_space
[0]; i
<= charset
->code_space
[1]; i
++)
7271 if (NILP (AREF (val
, i
)))
7272 ASET (val
, i
, XCAR (tail
));
7274 ASET (attrs
, coding_attr_charset_valids
, val
);
7275 category
= coding_category_charset
;
7277 else if (EQ (coding_type
, Qccl
))
7281 if (nargs
< coding_arg_ccl_max
)
7284 val
= args
[coding_arg_ccl_decoder
];
7285 CHECK_CCL_PROGRAM (val
);
7287 val
= Fcopy_sequence (val
);
7288 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7290 val
= args
[coding_arg_ccl_encoder
];
7291 CHECK_CCL_PROGRAM (val
);
7293 val
= Fcopy_sequence (val
);
7294 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7296 val
= args
[coding_arg_ccl_valids
];
7297 valids
= Fmake_string (make_number (256), make_number (0));
7298 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7302 ASET (valids
, XINT (val
), 1);
7308 CHECK_NUMBER (XCAR (val
));
7309 CHECK_NUMBER (XCDR (val
));
7310 from
= XINT (XCAR (val
));
7311 to
= XINT (XCDR (val
));
7312 for (i
= from
; i
<= to
; i
++)
7313 ASET (valids
, i
, 1);
7316 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7318 category
= coding_category_ccl
;
7320 else if (EQ (coding_type
, Qutf_16
))
7322 Lisp_Object bom
, endian
;
7324 if (nargs
< coding_arg_utf16_max
)
7327 bom
= args
[coding_arg_utf16_bom
];
7328 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7331 CHECK_CODING_SYSTEM (XCAR (bom
));
7332 CHECK_CODING_SYSTEM (XCDR (bom
));
7334 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7336 endian
= args
[coding_arg_utf16_endian
];
7337 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7339 category
= (CONSP (bom
)
7340 ? coding_category_utf_16_auto
7343 ? coding_category_utf_16_be_nosig
7344 : coding_category_utf_16_le_nosig
)
7346 ? coding_category_utf_16_be
7347 : coding_category_utf_16_le
));
7349 else if (EQ (coding_type
, Qiso_2022
))
7351 Lisp_Object initial
, reg_usage
, request
, flags
;
7352 struct charset
*charset
;
7355 if (nargs
< coding_arg_iso2022_max
)
7358 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7359 CHECK_VECTOR (initial
);
7360 for (i
= 0; i
< 4; i
++)
7362 val
= Faref (initial
, make_number (i
));
7365 CHECK_CHARSET_GET_ID (val
, id
);
7366 ASET (initial
, i
, make_number (id
));
7369 ASET (initial
, i
, make_number (-1));
7372 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7373 CHECK_CONS (reg_usage
);
7374 CHECK_NATNUM (XCAR (reg_usage
));
7375 CHECK_NATNUM (XCDR (reg_usage
));
7377 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7378 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7384 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7385 CHECK_NATNUM (XCDR (val
));
7386 if (XINT (XCDR (val
)) >= 4)
7387 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7388 XCAR (val
) = make_number (id
);
7391 flags
= args
[coding_arg_iso2022_flags
];
7392 CHECK_NATNUM (flags
);
7394 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7395 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7397 ASET (attrs
, coding_attr_iso_initial
, initial
);
7398 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7399 ASET (attrs
, coding_attr_iso_request
, request
);
7400 ASET (attrs
, coding_attr_iso_flags
, flags
);
7401 setup_iso_safe_charsets (attrs
);
7403 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7404 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7405 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7406 ? coding_category_iso_7_else
7407 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7408 ? coding_category_iso_7
7409 : coding_category_iso_7_tight
);
7412 int id
= XINT (AREF (initial
, 1));
7414 category
= (((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7415 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7416 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7418 ? coding_category_iso_8_else
7419 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7420 ? coding_category_iso_8_1
7421 : coding_category_iso_8_2
);
7424 else if (EQ (coding_type
, Qemacs_mule
))
7426 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7427 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7429 category
= coding_category_emacs_mule
;
7431 else if (EQ (coding_type
, Qshift_jis
))
7434 struct charset
*charset
;
7436 if (XINT (Flength (charset_list
)) != 3)
7437 error ("There should be just three charsets");
7439 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7440 if (CHARSET_DIMENSION (charset
) != 1)
7441 error ("Dimension of charset %s is not one",
7442 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7444 charset_list
= XCDR (charset_list
);
7445 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7446 if (CHARSET_DIMENSION (charset
) != 1)
7447 error ("Dimension of charset %s is not one",
7448 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7450 charset_list
= XCDR (charset_list
);
7451 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7452 if (CHARSET_DIMENSION (charset
) != 2)
7453 error ("Dimension of charset %s is not two",
7454 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7456 category
= coding_category_sjis
;
7457 Vsjis_coding_system
= name
;
7459 else if (EQ (coding_type
, Qbig5
))
7461 struct charset
*charset
;
7463 if (XINT (Flength (charset_list
)) != 2)
7464 error ("There should be just two charsets");
7466 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7467 if (CHARSET_DIMENSION (charset
) != 1)
7468 error ("Dimension of charset %s is not one",
7469 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7471 charset_list
= XCDR (charset_list
);
7472 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7473 if (CHARSET_DIMENSION (charset
) != 2)
7474 error ("Dimension of charset %s is not two",
7475 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7477 category
= coding_category_big5
;
7478 Vbig5_coding_system
= name
;
7480 else if (EQ (coding_type
, Qraw_text
))
7481 category
= coding_category_raw_text
;
7482 else if (EQ (coding_type
, Qutf_8
))
7483 category
= coding_category_utf_8
;
7484 else if (EQ (coding_type
, Qundecided
))
7485 category
= coding_category_undecided
;
7487 error ("Invalid coding system type: %s",
7488 XSYMBOL (coding_type
)->name
->data
);
7490 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7492 eol_type
= args
[coding_arg_eol_type
];
7493 if (! NILP (eol_type
)
7494 && ! EQ (eol_type
, Qunix
)
7495 && ! EQ (eol_type
, Qdos
)
7496 && ! EQ (eol_type
, Qmac
))
7497 error ("Invalid eol-type");
7499 aliases
= Fcons (name
, Qnil
);
7501 if (NILP (eol_type
))
7503 eol_type
= make_subsidiaries (name
);
7504 for (i
= 0; i
< 3; i
++)
7506 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7508 this_name
= AREF (eol_type
, i
);
7509 this_aliases
= Fcons (this_name
, Qnil
);
7510 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7511 this_spec
= Fmake_vector (make_number (3), attrs
);
7512 ASET (this_spec
, 1, this_aliases
);
7513 ASET (this_spec
, 2, this_eol_type
);
7514 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7515 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7516 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7517 Vcoding_system_alist
);
7521 spec_vec
= Fmake_vector (make_number (3), attrs
);
7522 ASET (spec_vec
, 1, aliases
);
7523 ASET (spec_vec
, 2, eol_type
);
7525 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7526 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7527 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7528 Vcoding_system_alist
);
7531 int id
= coding_categories
[category
].id
;
7533 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7534 setup_coding_system (name
, &coding_categories
[category
]);
7540 return Fsignal (Qwrong_number_of_arguments
,
7541 Fcons (intern ("define-coding-system-internal"),
7542 make_number (nargs
)));
7545 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7546 Sdefine_coding_system_alias
, 2, 2, 0,
7547 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7548 (alias
, coding_system
)
7549 Lisp_Object alias
, coding_system
;
7551 Lisp_Object spec
, aliases
, eol_type
;
7553 CHECK_SYMBOL (alias
);
7554 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7555 aliases
= AREF (spec
, 1);
7556 while (!NILP (XCDR (aliases
)))
7557 aliases
= XCDR (aliases
);
7558 XCDR (aliases
) = Fcons (alias
, Qnil
);
7560 eol_type
= AREF (spec
, 2);
7561 if (VECTORP (eol_type
))
7563 Lisp_Object subsidiaries
;
7566 subsidiaries
= make_subsidiaries (alias
);
7567 for (i
= 0; i
< 3; i
++)
7568 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7569 AREF (eol_type
, i
));
7571 ASET (spec
, 2, subsidiaries
);
7574 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7579 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7581 doc
: /* Return the base of CODING-SYSTEM.
7582 Any alias or subsidiary coding systems are not base coding system. */)
7584 Lisp_Object coding_system
;
7586 Lisp_Object spec
, attrs
;
7588 if (NILP (coding_system
))
7589 return (Qno_conversion
);
7590 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7591 attrs
= AREF (spec
, 0);
7592 return CODING_ATTR_BASE_NAME (attrs
);
7595 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7597 doc
: "Return the property list of CODING-SYSTEM.")
7599 Lisp_Object coding_system
;
7601 Lisp_Object spec
, attrs
;
7603 if (NILP (coding_system
))
7604 coding_system
= Qno_conversion
;
7605 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7606 attrs
= AREF (spec
, 0);
7607 return CODING_ATTR_PLIST (attrs
);
7611 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7613 doc
: /* Return the list of aliases of CODING-SYSTEM.
7614 A base coding system is what made by `define-coding-system'.
7615 Any alias nor subsidiary coding systems are not base coding system. */)
7617 Lisp_Object coding_system
;
7621 if (NILP (coding_system
))
7622 coding_system
= Qno_conversion
;
7623 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7624 return AREF (spec
, 2);
/* Lisp primitive `coding-system-eol-type': report the end-of-line
   convention recorded for CODING-SYSTEM, either as an integer code
   or as a copy of the vector of subsidiary coding systems.  */
7627 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7628 Scoding_system_eol_type
, 1, 1, 0,
7629 doc
: /* Return eol-type of CODING-SYSTEM.
7630 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7632 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7633 and CR respectively.
7635 A vector value indicates that a format of end-of-line should be
7636 detected automatically. Nth element of the vector is the subsidiary
7637 coding system whose eol-type is N. */)
7639 Lisp_Object coding_system
;
7641 Lisp_Object spec
, eol_type
;
/* A nil argument is treated as `no-conversion'.  */
7644 if (NILP (coding_system
))
7645 coding_system
= Qno_conversion
;
/* NOTE(review): the statement controlled by this validity check is
   not present in this excerpt -- confirm what is returned when the
   argument does not name a coding system.  */
7646 if (! CODING_SYSTEM_P (coding_system
))
/* Slot 2 of the spec vector holds the eol-type.  */
7648 spec
= CODING_SYSTEM_SPEC (coding_system
);
7649 eol_type
= AREF (spec
, 2);
/* A vector is returned as a copy rather than the stored object.  */
7650 if (VECTORP (eol_type
))
7651 return Fcopy_sequence (eol_type
);
/* Otherwise map the symbol to its integer code:
   `unix' -> 0, `dos' -> 1, anything else -> 2.  */
7652 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7653 return make_number (n
);
7659 /*** 9. Post-amble ***/
7666 for (i
= 0; i
< coding_category_max
; i
++)
7668 coding_categories
[i
].id
= -1;
7669 coding_priorities
[i
] = i
;
7672 /* ISO2022 specific initialize routine. */
7673 for (i
= 0; i
< 0x20; i
++)
7674 iso_code_class
[i
] = ISO_control_0
;
7675 for (i
= 0x21; i
< 0x7F; i
++)
7676 iso_code_class
[i
] = ISO_graphic_plane_0
;
7677 for (i
= 0x80; i
< 0xA0; i
++)
7678 iso_code_class
[i
] = ISO_control_1
;
7679 for (i
= 0xA1; i
< 0xFF; i
++)
7680 iso_code_class
[i
] = ISO_graphic_plane_1
;
7681 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7682 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
7683 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7684 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7685 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7686 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7687 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7688 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7689 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7690 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
7692 inhibit_pre_post_conversion
= 0;
7694 for (i
= 0; i
< 256; i
++)
7696 emacs_mule_bytes
[i
] = 1;
7705 staticpro (&Vcoding_system_hash_table
);
7706 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7708 staticpro (&Vsjis_coding_system
);
7709 Vsjis_coding_system
= Qnil
;
7711 staticpro (&Vbig5_coding_system
);
7712 Vbig5_coding_system
= Qnil
;
7714 staticpro (&Vcode_conversion_work_buf_list
);
7715 Vcode_conversion_work_buf_list
= Qnil
;
7717 staticpro (&Vcode_conversion_reused_work_buf
);
7718 Vcode_conversion_reused_work_buf
= Qnil
;
7720 DEFSYM (Qcharset
, "charset");
7721 DEFSYM (Qtarget_idx
, "target-idx");
7722 DEFSYM (Qcoding_system_history
, "coding-system-history");
7723 Fset (Qcoding_system_history
, Qnil
);
7725 /* Target FILENAME is the first argument. */
7726 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
7727 /* Target FILENAME is the third argument. */
7728 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
7730 DEFSYM (Qcall_process
, "call-process");
7731 /* Target PROGRAM is the first argument. */
7732 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
7734 DEFSYM (Qcall_process_region
, "call-process-region");
7735 /* Target PROGRAM is the third argument. */
7736 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
7738 DEFSYM (Qstart_process
, "start-process");
7739 /* Target PROGRAM is the third argument. */
7740 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
7742 DEFSYM (Qopen_network_stream
, "open-network-stream");
7743 /* Target SERVICE is the fourth argument. */
7744 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
7746 DEFSYM (Qcoding_system
, "coding-system");
7747 DEFSYM (Qcoding_aliases
, "coding-aliases");
7749 DEFSYM (Qeol_type
, "eol-type");
7750 DEFSYM (Qunix
, "unix");
7751 DEFSYM (Qdos
, "dos");
7752 DEFSYM (Qmac
, "mac");
7754 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
7755 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
7756 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
7757 DEFSYM (Qdefault_char
, "default-char");
7758 DEFSYM (Qundecided
, "undecided");
7759 DEFSYM (Qno_conversion
, "no-conversion");
7760 DEFSYM (Qraw_text
, "raw-text");
7762 DEFSYM (Qiso_2022
, "iso-2022");
7764 DEFSYM (Qutf_8
, "utf-8");
7766 DEFSYM (Qutf_16
, "utf-16");
7767 DEFSYM (Qutf_16_be
, "utf-16-be");
7768 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
7769 DEFSYM (Qutf_16_le
, "utf-16-l3");
7770 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
7771 DEFSYM (Qsignature
, "signature");
7772 DEFSYM (Qendian
, "endian");
7773 DEFSYM (Qbig
, "big");
7774 DEFSYM (Qlittle
, "little");
7776 DEFSYM (Qshift_jis
, "shift-jis");
7777 DEFSYM (Qbig5
, "big5");
7779 DEFSYM (Qcoding_system_p
, "coding-system-p");
7781 DEFSYM (Qcoding_system_error
, "coding-system-error");
7782 Fput (Qcoding_system_error
, Qerror_conditions
,
7783 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
7784 Fput (Qcoding_system_error
, Qerror_message
,
7785 build_string ("Invalid coding system"));
7787 /* Intern this now in case it isn't already done.
7788 Setting this variable twice is harmless.
7789 But don't staticpro it here--that is done in alloc.c. */
7790 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
7792 DEFSYM (Qtranslation_table
, "translation-table");
7793 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
7794 DEFSYM (Qtranslation_table_id
, "translation-table-id");
7795 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
7796 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
7798 DEFSYM (Qchar_coding_system
, "char-coding-system");
7800 Fput (Qchar_coding_system
, Qchar_table_extra_slots
, make_number (2));
7802 DEFSYM (Qvalid_codes
, "valid-codes");
7804 DEFSYM (Qemacs_mule
, "emacs-mule");
7806 Vcoding_category_table
7807 = Fmake_vector (make_number (coding_category_max
), Qnil
);
7808 staticpro (&Vcoding_category_table
);
7809 /* The following are targets of code detection. */
7810 ASET (Vcoding_category_table
, coding_category_iso_7
,
7811 intern ("coding-category-iso-7"));
7812 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
7813 intern ("coding-category-iso-7-tight"));
7814 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
7815 intern ("coding-category-iso-8-1"));
7816 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
7817 intern ("coding-category-iso-8-2"));
7818 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
7819 intern ("coding-category-iso-7-else"));
7820 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
7821 intern ("coding-category-iso-8-else"));
7822 ASET (Vcoding_category_table
, coding_category_utf_8
,
7823 intern ("coding-category-utf-8"));
7824 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
7825 intern ("coding-category-utf-16-be"));
7826 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
7827 intern ("coding-category-utf-16-le"));
7828 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
7829 intern ("coding-category-utf-16-be-nosig"));
7830 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
7831 intern ("coding-category-utf-16-le-nosig"));
7832 ASET (Vcoding_category_table
, coding_category_charset
,
7833 intern ("coding-category-charset"));
7834 ASET (Vcoding_category_table
, coding_category_sjis
,
7835 intern ("coding-category-sjis"));
7836 ASET (Vcoding_category_table
, coding_category_big5
,
7837 intern ("coding-category-big5"));
7838 ASET (Vcoding_category_table
, coding_category_ccl
,
7839 intern ("coding-category-ccl"));
7840 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
7841 intern ("coding-category-emacs-mule"));
7842 /* The following are NOT targets of code detection. */
7843 ASET (Vcoding_category_table
, coding_category_raw_text
,
7844 intern ("coding-category-raw-text"));
7845 ASET (Vcoding_category_table
, coding_category_undecided
,
7846 intern ("coding-category-undecided"));
7849 Lisp_Object args
[coding_arg_max
];
7850 Lisp_Object plist
[14];
7853 for (i
= 0; i
< coding_arg_max
; i
++)
7856 plist
[0] = intern (":name");
7857 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
7858 plist
[2] = intern (":mnemonic");
7859 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
7860 plist
[4] = intern (":coding-type");
7861 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
7862 plist
[6] = intern (":ascii-compatible-p");
7863 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
7864 plist
[8] = intern (":default-char");
7865 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
7866 plist
[10] = intern (":docstring");
7867 plist
[11] = build_string ("Do no conversion.\n\
7869 When you visit a file with this coding, the file is read into a\n\
7870 unibyte buffer as is, thus each byte of a file is treated as a\n\
7872 plist
[12] = intern (":eol-type");
7873 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
7874 args
[coding_arg_plist
] = Flist (14, plist
);
7875 Fdefine_coding_system_internal (coding_arg_max
, args
);
7878 setup_coding_system (Qno_conversion
, &keyboard_coding
);
7879 setup_coding_system (Qno_conversion
, &terminal_coding
);
7880 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
7882 defsubr (&Scoding_system_p
);
7883 defsubr (&Sread_coding_system
);
7884 defsubr (&Sread_non_nil_coding_system
);
7885 defsubr (&Scheck_coding_system
);
7886 defsubr (&Sdetect_coding_region
);
7887 defsubr (&Sdetect_coding_string
);
7888 defsubr (&Sfind_coding_systems_region_internal
);
7889 defsubr (&Scheck_coding_systems_region
);
7890 defsubr (&Sdecode_coding_region
);
7891 defsubr (&Sencode_coding_region
);
7892 defsubr (&Sdecode_coding_string
);
7893 defsubr (&Sencode_coding_string
);
7894 defsubr (&Sdecode_sjis_char
);
7895 defsubr (&Sencode_sjis_char
);
7896 defsubr (&Sdecode_big5_char
);
7897 defsubr (&Sencode_big5_char
);
7898 defsubr (&Sset_terminal_coding_system_internal
);
7899 defsubr (&Sset_safe_terminal_coding_system_internal
);
7900 defsubr (&Sterminal_coding_system
);
7901 defsubr (&Sset_keyboard_coding_system_internal
);
7902 defsubr (&Skeyboard_coding_system
);
7903 defsubr (&Sfind_operation_coding_system
);
7904 defsubr (&Sset_coding_system_priority
);
7905 defsubr (&Sdefine_coding_system_internal
);
7906 defsubr (&Sdefine_coding_system_alias
);
7907 defsubr (&Scoding_system_base
);
7908 defsubr (&Scoding_system_plist
);
7909 defsubr (&Scoding_system_aliases
);
7910 defsubr (&Scoding_system_eol_type
);
7911 defsubr (&Scoding_system_priority_list
);
7913 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
7914 doc
: /* List of coding systems.
7916 Do not alter the value of this variable manually. This variable should be
7917 updated by the functions `define-coding-system' and
7918 `define-coding-system-alias'. */);
7919 Vcoding_system_list
= Qnil
;
7921 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
7922 doc
: /* Alist of coding system names.
7923 Each element is one element list of coding system name.
7924 This variable is given to `completing-read' as TABLE argument.
7926 Do not alter the value of this variable manually. This variable should be
7927 updated by the functions `define-coding-system' and
7928 `define-coding-system-alias'. */);
7929 Vcoding_system_alist
= Qnil
;
7931 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
7932 doc
: /* List of coding-categories (symbols) ordered by priority.
7934 On detecting a coding system, Emacs tries code detection algorithms
7935 associated with each coding-category one by one in this order. When
7936 one algorithm agrees with a byte sequence of source text, the coding
7937 system bound to the corresponding coding-category is selected. */);
7941 Vcoding_category_list
= Qnil
;
7942 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7943 Vcoding_category_list
7944 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
7945 Vcoding_category_list
);
7948 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
7949 doc
: /* Specify the coding system for read operations.
7950 It is useful to bind this variable with `let', but do not set it globally.
7951 If the value is a coding system, it is used for decoding on read operation.
7952 If not, an appropriate element is used from one of the coding system alists:
7953 There are three such tables, `file-coding-system-alist',
7954 `process-coding-system-alist', and `network-coding-system-alist'. */);
7955 Vcoding_system_for_read
= Qnil
;
7957 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
7958 doc
: /* Specify the coding system for write operations.
7959 Programs bind this variable with `let', but you should not set it globally.
7960 If the value is a coding system, it is used for encoding of output,
7961 when writing it to a file and when sending it to a file or subprocess.
7963 If this does not specify a coding system, an appropriate element
7964 is used from one of the coding system alists:
7965 There are three such tables, `file-coding-system-alist',
7966 `process-coding-system-alist', and `network-coding-system-alist'.
7967 For output to files, if the above procedure does not specify a coding system,
7968 the value of `buffer-file-coding-system' is used. */);
7969 Vcoding_system_for_write
= Qnil
;
7971 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
7973 Coding system used in the latest file or process I/O. */);
7974 Vlast_coding_system_used
= Qnil
;
7976 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
7978 *Non-nil means always inhibit code conversion of end-of-line format.
7979 See info node `Coding Systems' and info node `Text and Binary' concerning
7980 such conversion. */);
7981 inhibit_eol_conversion
= 0;
7983 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
7985 Non-nil means process buffer inherits coding system of process output.
7986 Bind it to t if the process output is to be treated as if it were a file
7987 read from some filesystem. */);
7988 inherit_process_coding_system
= 0;
7990 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
7992 Alist to decide a coding system to use for a file I/O operation.
7993 The format is ((PATTERN . VAL) ...),
7994 where PATTERN is a regular expression matching a file name,
7995 VAL is a coding system, a cons of coding systems, or a function symbol.
7996 If VAL is a coding system, it is used for both decoding and encoding
7998 If VAL is a cons of coding systems, the car part is used for decoding,
7999 and the cdr part is used for encoding.
8000 If VAL is a function symbol, the function must return a coding system
8001 or a cons of coding systems which are used as above. The function gets
8002 the arguments with which `find-operation-coding-system' was called.
8004 See also the function `find-operation-coding-system'
8005 and the variable `auto-coding-alist'. */);
8006 Vfile_coding_system_alist
= Qnil
;
8008 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8010 Alist to decide a coding system to use for a process I/O operation.
8011 The format is ((PATTERN . VAL) ...),
8012 where PATTERN is a regular expression matching a program name,
8013 VAL is a coding system, a cons of coding systems, or a function symbol.
8014 If VAL is a coding system, it is used for both decoding what received
8015 from the program and encoding what sent to the program.
8016 If VAL is a cons of coding systems, the car part is used for decoding,
8017 and the cdr part is used for encoding.
8018 If VAL is a function symbol, the function must return a coding system
8019 or a cons of coding systems which are used as above.
8021 See also the function `find-operation-coding-system'. */);
8022 Vprocess_coding_system_alist
= Qnil
;
8024 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8026 Alist to decide a coding system to use for a network I/O operation.
8027 The format is ((PATTERN . VAL) ...),
8028 where PATTERN is a regular expression matching a network service name
8029 or is a port number to connect to,
8030 VAL is a coding system, a cons of coding systems, or a function symbol.
8031 If VAL is a coding system, it is used for both decoding what received
8032 from the network stream and encoding what sent to the network stream.
8033 If VAL is a cons of coding systems, the car part is used for decoding,
8034 and the cdr part is used for encoding.
8035 If VAL is a function symbol, the function must return a coding system
8036 or a cons of coding systems which are used as above.
8038 See also the function `find-operation-coding-system'. */);
8039 Vnetwork_coding_system_alist
= Qnil
;
8041 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8042 doc
: /* Coding system to use with system messages.
8043 Also used for decoding keyboard input on X Window system. */);
8044 Vlocale_coding_system
= Qnil
;
8046 /* The eol mnemonics are reset in startup.el system-dependently. */
8047 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8049 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8050 eol_mnemonic_unix
= build_string (":");
8052 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8054 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8055 eol_mnemonic_dos
= build_string ("\\");
8057 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8059 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8060 eol_mnemonic_mac
= build_string ("/");
8062 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8064 *String displayed in mode line when end-of-line format is not yet determined. */);
8065 eol_mnemonic_undecided
= build_string (":");
8067 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8069 *Non-nil enables character translation while encoding and decoding. */);
8070 Venable_character_translation
= Qt
;
8072 DEFVAR_LISP ("standard-translation-table-for-decode",
8073 &Vstandard_translation_table_for_decode
,
8074 doc
: /* Table for translating characters while decoding. */);
8075 Vstandard_translation_table_for_decode
= Qnil
;
8077 DEFVAR_LISP ("standard-translation-table-for-encode",
8078 &Vstandard_translation_table_for_encode
,
8079 doc
: /* Table for translating characters while encoding. */);
8080 Vstandard_translation_table_for_encode
= Qnil
;
8082 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8083 doc
: /* Alist of charsets vs revision numbers.
8084 While encoding, if a charset (car part of an element) is found,
8085 designate it with the escape sequence identifying revision (cdr part
8086 of the element). */);
8087 Vcharset_revision_table
= Qnil
;
8089 DEFVAR_LISP ("default-process-coding-system",
8090 &Vdefault_process_coding_system
,
8091 doc
: /* Cons of coding systems used for process I/O by default.
8092 The car part is used for decoding a process output,
8093 the cdr part is used for encoding a text to be sent to a process. */);
8094 Vdefault_process_coding_system
= Qnil
;
8096 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8098 Table of extra Latin codes in the range 128..159 (inclusive).
8099 This is a vector of length 256.
8100 If Nth element is non-nil, the existence of code N in a file
8101 \(or output of subprocess) doesn't prevent it from being detected as
8102 a coding system of ISO 2022 variant which has a flag
8103 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8104 or reading output of a subprocess.
8105 Only the 128th through 159th elements have a meaning. */);
8106 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8108 DEFVAR_LISP ("select-safe-coding-system-function",
8109 &Vselect_safe_coding_system_function
,
8111 Function to call to select safe coding system for encoding a text.
8113 If set, this function is called to force a user to select a proper
8114 coding system which can encode the text in the case that a default
8115 coding system used in each operation can't encode the text.
8117 The default value is `select-safe-coding-system' (which see). */);
8118 Vselect_safe_coding_system_function
= Qnil
;
8120 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table
,
8122 Char-table containing safe coding systems of each character.
8123 Each element doesn't include such generic coding systems that can
8124 encode any characters. They are in the first extra slot. */);
8125 Vchar_coding_system_table
= Fmake_char_table (Qchar_coding_system
, Qnil
);
8127 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8128 &inhibit_iso_escape_detection
,
8130 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8132 By default, on reading a file, Emacs tries to detect how the text is
8133 encoded. This code detection is sensitive to escape sequences. If
8134 the sequence is valid as ISO2022, the code is determined as one of
8135 the ISO2022 encodings, and the file is decoded by the corresponding
8136 coding system (e.g. `iso-2022-7bit').
8138 However, there may be a case that you want to read escape sequences in
8139 a file as is. In such a case, you can set this variable to non-nil.
8140 Then, as the code detection ignores any escape sequences, no file is
8141 detected as encoded in some ISO2022 encoding. The result is that all
8142 escape sequences become visible in a buffer.
8144 The default value is nil, and it is strongly recommended not to change
8145 it. That is because many Emacs Lisp source files that contain
8146 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8147 in Emacs's distribution, and they won't be decoded correctly on
8148 reading if you suppress escape sequence detection.
8150 The other way to read escape sequences in a file without decoding is
8151 to explicitly specify some coding system that doesn't use ISO2022's
8152 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8153 inhibit_iso_escape_detection
= 0;
8157 emacs_strerror (error_number
)
8162 synchronize_system_messages_locale ();
8163 str
= strerror (error_number
);
8165 if (! NILP (Vlocale_coding_system
))
8167 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8168 Vlocale_coding_system
,
8170 str
= (char *) XSTRING (dec
)->data
;