1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 Coding system is an encoding mechanism of one or more character
50 sets. Here's a list of coding system types supported by Emacs.
51 When we say "decode", it means converting a text encoded by some
52 coding system into Emacs' internal format (emacs-utf-8), and when we
53 say "encode", it means converting a text of emacs-utf-8 to some
56 Emacs represents a coding system by a Lisp symbol. Each symbol is a
57 key to the hash table Vcoding_system_hash_table. This hash table
58 associates the symbol with the corresponding detailed specifications.
60 Before using a coding system for decoding and encoding, we setup a
61 structure of type `struct coding_system'. This structure keeps
62 various information about a specific code conversion (e.g. the
63 location of source and destination data).
65 Coding systems are classified into the following types by how they
66 represent a character in a byte sequence. Here's a brief description
69 o Emacs' internal format (emacs-utf-8)
71 The extended UTF-8 which allows eight-bit raw bytes mixed with
72 character codes. Emacs holds characters in buffers and strings by
79 o Charset-base coding system
81 A coding system defined by one or more (coded) character sets.
82 Decoding and encoding are done by code converter defined for each
85 o Old Emacs' internal format (emacs-mule)
87 The coding system adopted by old versions of Emacs (20 and 21).
89 o ISO2022-base coding system
91 The most famous coding system for multiple character sets. X's
92 Compound Text, various EUCs (Extended Unix Code), and coding systems
93 used in the Internet communication such as ISO-2022-JP are all
96 o SJIS (or Shift-JIS or MS-Kanji-Code)
98 A coding system to encode character sets: ASCII, JISX0201, and
99 JISX0208. Widely used for PC's in Japan. Details are described in
104 A coding system to encode character sets: ASCII and Big5. Widely
105 used by Chinese (mainly in Taiwan and Hong Kong). Details are
106 described in section 8. In this file, when we write "big5" (all
107 lowercase), we mean the coding system, and when we write "Big5"
108 (capitalized), we mean the character set.
112 If a user wants to decode/encode a text encoded in a coding system
113 not listed above, he can supply a decoder and an encoder for it in
114 CCL (Code Conversion Language) programs. Emacs executes the CCL
115 program while decoding/encoding.
119 A coding system for a text containing raw eight-bit data. Emacs
120 treats each byte of the source text as a character (except for
121 end-of-line conversion).
125 Like raw text, but don't do end-of-line conversion.
130 How end-of-line of a text is encoded depends on a system. For
131 instance, Unix's format is just one byte of LF (line-feed) code,
132 whereas DOS's format is two-byte sequence of `carriage-return' and
133 `line-feed' codes. MacOS's format is usually one byte of
136 Since text characters encoding and end-of-line encoding are
137 independent, any coding system described above can take any format
138 of end-of-line (except for no-conversion).
145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
147 These functions check if a byte sequence specified as a source in
148 CODING conforms to the format of XXX. Return 1 if the data contains
149 a byte sequence which can be decoded into non-ASCII characters by
150 the coding system. Otherwise (i.e. the data contains only ASCII
151 characters or an invalid sequence), return 0.
153 They also reset some bits of the integer pointed to by MASK. The
154 macros CATEGORY_MASK_XXX specify each bit of this integer.
156 Below is the template of these functions. */
/* Template only: skeleton of a detector for coding category XXX.
   CODING supplies the source bytes (CODING->source, CODING->src_bytes)
   and MASK points to the integer whose bits are named by the
   CATEGORY_MASK_XXX macros.  The mask is narrowed with
   CATEGORY_MASK_XXX, the same family of macros the real detectors in
   this file use (e.g. CATEGORY_MASK_UTF_8).  */
160 detect_coding_XXX (coding
, mask
)
161 struct coding_system
*coding
;
164 unsigned char *src
= coding
->source
;
165 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
166 int multibytep
= coding
->src_multibyte
;
173 /* Get one byte from the source. If the source is exhausted, jump
174 to no_more_source:. */
176 /* Check if it conforms to XXX. If not, break the loop. */
178 /* As the data is invalid for XXX, reset the proper bits. */
179 *mask
&= ~CATEGORY_MASK_XXX
;
182 /* The source is exhausted. */
184 /* ASCII characters only. */
186 /* Some data should be decoded into non-ASCII characters. */
187 *mask
&= CATEGORY_MASK_XXX
;
192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
194 These functions decode a byte sequence specified as a source by
195 CODING. The resulting multibyte text goes to a place pointed to by
196 CODING->charbuf, the length of which should not exceed
197 CODING->charbuf_size;
199 These functions set the information of original and decoded texts in
200 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
201 They also set CODING->result to one of CODING_RESULT_XXX indicating
202 how the decoding is finished.
204 Below is the template of these functions. */
/* Template only: skeleton of a decoder.  It reads bytes starting at
   CODING->source + CODING->consumed and stores decoded characters
   into CODING->charbuf, then records the progress in
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used.  */
208 decode_coding_XXXX (coding
)
209 struct coding_system
*coding
;
211 unsigned char *src
= coding
->source
+ coding
->consumed
;
212 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
213 /* SRC_BASE remembers the start position in source in each loop.
214 The loop will be exited when there's not enough source bytes, or
215 when there's no room in CHARBUF for a decoded character. */
216 unsigned char *src_base
;
217 /* A buffer to produce decoded characters. */
218 int *charbuf
= coding
->charbuf
;
219 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
220 int multibytep
= coding
->src_multibyte
;
/* The buffer is full once CHARBUF has reached CHARBUF_END; this is the
   same test the real decoders use (charbuf >= charbuf_end).  */
225 if (charbuf
>= charbuf_end
)
226 /* No more room to produce a decoded character. */
233 if (src_base
< src_end
234 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
235 /* If the source ends by partial bytes to construct a character,
236 treat them as eight-bit raw data. */
237 while (src_base
< src_end
&& charbuf
< charbuf_end
)
238 *charbuf
++ = *src_base
++;
239 /* Remember how many bytes and characters we consumed. If the
240 source is multibyte, the bytes and chars are not identical. */
241 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
242 /* Remember how many characters we produced. */
243 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
249 These functions encode SRC_BYTES length text at SOURCE of Emacs'
250 internal multibyte format by CODING. The resulting byte sequence
251 goes to a place pointed to by DESTINATION, the length of which
252 should not exceed DST_BYTES.
254 These functions set the information of original and encoded texts in
255 the members produced, produced_char, consumed, and consumed_char of
256 the structure *CODING. They also set the member result to one of
257 CODING_RESULT_XXX indicating how the encoding finished.
259 DST_BYTES zero means that the source area and destination area
260 overlap, which means that we can produce encoded text only until it
261 reaches the head of the not-yet-encoded source text.
263 Below is a template of these functions. */
/* Template only: skeleton of an encoder.  It walks the characters in
   CODING->charbuf (CODING->charbuf_used of them), writes bytes at
   CODING->destination + CODING->produced, and records the progress in
   CODING->produced_char and CODING->produced.  */
266 encode_coding_XXX (coding
)
267 struct coding_system
*coding
;
269 int multibytep
= coding
->dst_multibyte
;
270 int *charbuf
= coding
->charbuf
;
/* CHARBUF is a plain `int *', so the end pointer is CHARBUF plus the
   number of used elements, as in encode_coding_utf_8.  */
271 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
272 unsigned char *dst
= coding
->destination
+ coding
->produced
;
273 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
/* Stop early enough that one more loop iteration cannot run past
   DST_END.  */
274 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
275 int produced_chars
= 0;
277 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
280 /* Encode C into DST, and increment DST. */
282 label_no_more_destination
:
283 /* How many chars and bytes we produced. */
284 coding
->produced_char
+= produced_chars
;
285 coding
->produced
= dst
- coding
->destination
;
290 /*** 1. Preamble ***/
297 #include "character.h"
300 #include "composite.h"
304 Lisp_Object Vcoding_system_hash_table
;
306 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
307 Lisp_Object Qunix
, Qdos
, Qmac
;
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
314 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
315 Lisp_Object Qcoding_system_history
;
316 Lisp_Object Qvalid_codes
;
318 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
319 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
320 Lisp_Object Qstart_process
, Qopen_network_stream
;
321 Lisp_Object Qtarget_idx
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read
;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write
;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used
;
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table
;
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion
;
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection
;
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system
;
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding
;
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding
;
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding
;
373 Lisp_Object Vfile_coding_system_alist
;
374 Lisp_Object Vprocess_coding_system_alist
;
375 Lisp_Object Vnetwork_coding_system_alist
;
377 Lisp_Object Vlocale_coding_system
;
381 /* Flag to tell if we look up translation table on character code
383 Lisp_Object Venable_character_translation
;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode
;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode
;
389 Lisp_Object Qtranslation_table
;
390 Lisp_Object Qtranslation_table_id
;
391 Lisp_Object Qtranslation_table_for_decode
;
392 Lisp_Object Qtranslation_table_for_encode
;
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table
;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system
;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion
;
406 /* Char-table containing safe coding systems of each character. */
407 Lisp_Object Vchar_coding_system_table
;
408 Lisp_Object Qchar_coding_system
;
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system
;
412 Lisp_Object Vbig5_coding_system
;
415 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
416 static void decode_coding_utf_8
P_ ((struct coding_system
*));
417 static int encode_coding_utf_8
P_ ((struct coding_system
*));
419 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
420 static void decode_coding_utf_16
P_ ((struct coding_system
*));
421 static int encode_coding_utf_16
P_ ((struct coding_system
*));
423 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
424 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
425 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
427 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
428 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
429 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
431 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
432 static void decode_coding_sjis
P_ ((struct coding_system
*));
433 static int encode_coding_sjis
P_ ((struct coding_system
*));
435 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
436 static void decode_coding_big5
P_ ((struct coding_system
*));
437 static int encode_coding_big5
P_ ((struct coding_system
*));
439 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
440 static void decode_coding_ccl
P_ ((struct coding_system
*));
441 static int encode_coding_ccl
P_ ((struct coding_system
*));
443 static void decode_coding_raw_text
P_ ((struct coding_system
*));
444 static int encode_coding_raw_text
P_ ((struct coding_system
*));
447 /* ISO2022 section */
449 #define CODING_ISO_INITIAL(coding, reg) \
450 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
451 coding_attr_iso_initial), \
455 #define CODING_ISO_REQUEST(coding, charset_id) \
456 ((charset_id <= (coding)->max_charset_id \
457 ? (coding)->safe_charsets[charset_id] \
/* Accessors for the `spec.iso_2022' member of the coding system
   pointed to by CODING.  */
/* The `flags' field (presumably a combination of the
   CODING_ISO_FLAG_XXX bits -- confirm at the point of assignment).  */
461 #define CODING_ISO_FLAGS(coding) \
462 ((coding)->spec.iso_2022.flags)
/* Element REG of the `current_designation' array.  */
463 #define CODING_ISO_DESIGNATION(coding, reg) \
464 ((coding)->spec.iso_2022.current_designation[reg])
/* Element PLANE of the `current_invocation' array.  */
465 #define CODING_ISO_INVOCATION(coding, plane) \
466 ((coding)->spec.iso_2022.current_invocation[plane])
/* The `single_shifting' field.  */
467 #define CODING_ISO_SINGLE_SHIFTING(coding) \
468 ((coding)->spec.iso_2022.single_shifting)
/* The `bol' field.  */
469 #define CODING_ISO_BOL(coding) \
470 ((coding)->spec.iso_2022.bol)
/* The designation entry indexed by whatever is currently invoked for
   PLANE, i.e. current_designation[current_invocation[PLANE]].  */
471 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
472 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474 /* Control characters of ISO2022. */
475 /* code */ /* function */
476 #define ISO_CODE_LF 0x0A /* line-feed */
477 #define ISO_CODE_CR 0x0D /* carriage-return */
478 #define ISO_CODE_SO 0x0E /* shift-out */
479 #define ISO_CODE_SI 0x0F /* shift-in */
480 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
481 #define ISO_CODE_ESC 0x1B /* escape */
482 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
483 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
484 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486 /* All code (1-byte) of ISO2022 is classified into one of the
/* Classes into which a single ISO2022 byte is sorted; the comment on
   each enumerator gives the byte value(s) it stands for.  */
488 enum iso_code_class_type
490 ISO_control_0
, /* Control codes in the range
491 0x00..0x1F and 0x7F, except for the
492 following 5 codes. */
493 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
494 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
498 ISO_control_1
, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517 /* If set, reset graphic planes and registers at end-of-line to the
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
544 /* If set, assume designation states are reset at beginning of line on
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548 /* If set, designation sequence should be placed at beginning of line
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552 /* If set, do not encode unsafe characters on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
555 /* If set, extra latin codes (128..159) are accepted as a valid code
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
/* Accessors for the `spec.utf_16' member of the coding system pointed
   to by CODING.  */
/* The `bom' field.  */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
/* The `endian' field.  */
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
/* The `surrogate' field (presumably a pending surrogate code unit
   kept between calls -- confirm against the UTF-16 decoder).  */
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
590 /* Index for each coding category in `coding_category_table' */
594 coding_category_iso_7
,
595 coding_category_iso_7_tight
,
596 coding_category_iso_8_1
,
597 coding_category_iso_8_2
,
598 coding_category_iso_7_else
,
599 coding_category_iso_8_else
,
600 coding_category_utf_8
,
601 coding_category_utf_16_auto
,
602 coding_category_utf_16_be
,
603 coding_category_utf_16_le
,
604 coding_category_utf_16_be_nosig
,
605 coding_category_utf_16_le_nosig
,
606 coding_category_charset
,
607 coding_category_sjis
,
608 coding_category_big5
,
610 coding_category_emacs_mule
,
611 /* All above are targets of code detection. */
612 coding_category_raw_text
,
613 coding_category_undecided
,
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
635 /* This value is returned if detect_coding_mask () finds nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list
;
687 /* Table of coding categories (Lisp symbols). This variable is for
689 static Lisp_Object Vcoding_category_table
;
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities
[coding_category_max
];
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories
[coding_category_max
];
698 static int detected_mask
[coding_category_raw_text
] =
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_UTF_16
,
708 CATEGORY_MASK_UTF_16
,
709 CATEGORY_MASK_UTF_16
,
710 CATEGORY_MASK_UTF_16
,
711 CATEGORY_MASK_CHARSET
,
715 CATEGORY_MASK_EMACS_MULE
718 /*** Commonly used macros and functions ***/
/* Smaller of A and B.  The selected argument is evaluated twice, so
   do not pass expressions with side effects.  */
721 #define min(a, b) ((a) < (b) ? (a) : (b))
/* Larger of A and B.  The same double-evaluation caveat applies.  */
724 #define max(a, b) ((a) > (b) ? (a) : (b))
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
744 #define ONE_MORE_BYTE(c) \
746 if (src == src_end) \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
766 if (multibytep && (c & 0x80)) \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
775 /* Store a byte C in the place pointed by DST and increment DST to the
776 next free point, and increment PRODUCED_CHARS. The caller should
777 assure that C is 0..127, and declare and set the variable `dst'
778 appropriately in advance.
782 #define EMIT_ONE_ASCII_BYTE(c) \
789 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
791 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 produced_chars += 2; \
794 *dst++ = (c1), *dst++ = (c2); \
798 /* Store a byte C in the place pointed to by DST and increment DST to
799 the next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
800 nonzero, store in an appropriate multibyte form. The caller should
801 declare and set the variables `dst' and `multibytep' appropriately
804 #define EMIT_ONE_BYTE(c) \
811 ch = BYTE8_TO_CHAR (ch); \
812 CHAR_STRING_ADVANCE (ch, dst); \
819 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821 #define EMIT_TWO_BYTES(c1, c2) \
823 produced_chars += 2; \
826 CHAR_STRING_ADVANCE ((int) (c1), dst); \
827 CHAR_STRING_ADVANCE ((int) (c2), dst); \
837 #define EMIT_THREE_BYTES(c1, c2, c3) \
839 EMIT_ONE_BYTE (c1); \
840 EMIT_TWO_BYTES (c2, c3); \
844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
846 EMIT_TWO_BYTES (c1, c2); \
847 EMIT_TWO_BYTES (c3, c4); \
851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
853 charset_map_loaded = 0; \
854 c = DECODE_CHAR (charset, code); \
855 if (charset_map_loaded) \
857 unsigned char *orig = coding->source; \
860 coding_set_source (coding); \
861 offset = coding->source - orig; \
863 src_base += offset; \
869 #define ASSURE_DESTINATION(bytes) \
871 if (dst + (bytes) >= dst_end) \
873 int more_bytes = charbuf_end - charbuf + (bytes); \
875 dst = alloc_destination (coding, more_bytes, dst); \
876 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source, the address of the first source byte,
   from CODING->src_object and CODING->src_pos_byte.  Buffers and Lisp
   strings are handled here; any other source is left alone.  */
883 coding_set_source (coding
)
884 struct coding_system
*coding
;
886 if (BUFFERP (coding
->src_object
))
/* A negative src_pos selects addressing relative to GAP_END_ADDR.  */
888 if (coding
->src_pos
< 0)
889 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
/* The range [src_pos, src_pos + src_chars] reaches across GPT: move
   the gap to src_pos first (presumably so that the source bytes are
   contiguous in memory -- confirm).  */
892 if (coding
->src_pos
< GPT
893 && coding
->src_pos
+ coding
->src_chars
>= GPT
)
894 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
895 coding
->source
= BYTE_POS_ADDR (coding
->src_pos_byte
);
898 else if (STRINGP (coding
->src_object
))
/* String source: the string's data plus the byte offset.  */
900 coding
->source
= (XSTRING (coding
->src_object
)->data
901 + coding
->src_pos_byte
);
904 /* Otherwise, the source is a C string and is never relocated
905 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when the
   destination object is a buffer.  The destination address is
   BUF_BEG_ADDR of that buffer plus dst_pos_byte - 1.  When src_pos is
   negative, the room runs up to GAP_END_ADDR minus the source bytes
   not yet consumed (src_bytes - consumed); the other visible
   assignment measures the room up to BUF_GAP_END_ADDR of the
   destination buffer.  */
910 coding_set_destination (coding
)
911 struct coding_system
*coding
;
913 if (BUFFERP (coding
->dst_object
))
915 /* We are sure that coding->dst_pos_byte is before the gap of the
917 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
918 + coding
->dst_pos_byte
- 1);
919 if (coding
->src_pos
< 0)
920 /* The source and destination are in the same buffer. */
921 coding
->dst_bytes
= (GAP_END_ADDR
922 - (coding
->src_bytes
- coding
->consumed
)
923 - coding
->destination
);
/* NOTE(review): presumably the alternative branch of the test above;
   the line introducing it is not visible in this excerpt.  */
925 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
926 - coding
->destination
);
929 /* Otherwise, the destination is a C string and is never relocated
930 automatically. Thus we don't have to update anything. */
/* Enlarge the destination area of CODING by BYTES bytes: resize
   CODING->destination with xrealloc to dst_bytes + BYTES and add BYTES
   to CODING->dst_bytes.  CODING->destination may change as a result,
   so pointers into the old area must be recomputed by the caller.  */
936 coding_alloc_by_realloc (coding
, bytes
)
937 struct coding_system
*coding
;
940 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
941 coding
->dst_bytes
+ bytes
);
942 coding
->dst_bytes
+= bytes
;
/* Make room for BYTES more bytes when the destination of CODING is a
   buffer.  Two cases are distinguished by comparing the current buffer
   with CODING->dst_object.
   NOTE(review): the statement that actually enlarges the gap is not
   visible in this excerpt; presumably it sits between the paired
   adjustments below and between the two set_buffer_internal calls.  */
946 coding_alloc_by_making_gap (coding
, bytes
)
947 struct coding_system
*coding
;
950 Lisp_Object this_buffer
;
952 this_buffer
= Fcurrent_buffer ();
953 if (EQ (this_buffer
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed.  */
955 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily count those ADD bytes as buffer text instead of gap...  */
957 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* ...and then undo exactly the same adjustment.  */
959 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
/* Destination is another buffer: switch to it, then switch back to
   the buffer that was current on entry.  */
963 set_buffer_internal (XBUFFER (coding
->dst_object
));
965 set_buffer_internal (XBUFFER (this_buffer
));
/* Get NBYTES more bytes of destination space for CODING and keep DST
   valid across the operation.  DST points into the current destination
   area; its offset from CODING->destination is saved first and DST is
   rebuilt from the refreshed CODING->destination afterwards.  Also
   sets CODING->result to CODING_RESULT_SUCCESS.  */
970 static unsigned char *
971 alloc_destination (coding
, nbytes
, dst
)
972 struct coding_system
*coding
;
/* Saved as an offset because the destination may be relocated below.  */
976 EMACS_INT offset
= dst
- coding
->destination
;
/* Buffer destinations grow via the gap; the realloc call below is
   presumably the alternative for non-buffer destinations (the line
   introducing it is not visible in this excerpt).  */
978 if (BUFFERP (coding
->dst_object
))
979 coding_alloc_by_making_gap (coding
, nbytes
);
981 coding_alloc_by_realloc (coding
, nbytes
);
982 coding
->result
= CODING_RESULT_SUCCESS
;
/* Refresh CODING->destination and CODING->dst_bytes, then rebuild DST.  */
983 coding_set_destination (coding
);
984 dst
= coding
->destination
+ offset
;
989 /*** 2. Emacs' internal format (emacs-utf-8) ***/
996 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
997 Check if a text is encoded in UTF-8. If it is, return
998 CATEGORY_MASK_UTF_8, else return 0. */
/* Predicates on a byte value C that classify it by its high-order
   bits.  */
/* C is below 0x80 (high bit clear): a complete one-octet sequence.  */
1000 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* Top two bits are 10 (0x80..0xBF): a trailing octet.  */
1001 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* Top three bits are 110 (0xC0..0xDF): leads a 2-octet sequence.  */
1002 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* Top four bits are 1110 (0xE0..0xEF): leads a 3-octet sequence.  */
1003 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* Top five bits are 11110 (0xF0..0xF7): leads a 4-octet sequence.  */
1004 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* Top six bits are 111110 (0xF8..0xFB): leads a 5-octet sequence.  */
1005 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1008 detect_coding_utf_8 (coding
, mask
)
1009 struct coding_system
*coding
;
1012 unsigned char *src
= coding
->source
, *src_base
= src
;
1013 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1014 int multibytep
= coding
->src_multibyte
;
1015 int consumed_chars
= 0;
1018 /* A coding system of this category is always ASCII compatible. */
1019 src
+= coding
->head_ascii
;
1023 int c
, c1
, c2
, c3
, c4
;
1026 if (UTF_8_1_OCTET_P (c
))
1029 if (! UTF_8_EXTRA_OCTET_P (c1
))
1031 if (UTF_8_2_OCTET_LEADING_P (c
))
1037 if (! UTF_8_EXTRA_OCTET_P (c2
))
1039 if (UTF_8_3_OCTET_LEADING_P (c
))
1045 if (! UTF_8_EXTRA_OCTET_P (c3
))
1047 if (UTF_8_4_OCTET_LEADING_P (c
))
1053 if (! UTF_8_EXTRA_OCTET_P (c4
))
1055 if (UTF_8_5_OCTET_LEADING_P (c
))
1062 *mask
&= ~CATEGORY_MASK_UTF_8
;
1068 *mask
&= CATEGORY_MASK_UTF_8
;
1074 decode_coding_utf_8 (coding
)
1075 struct coding_system
*coding
;
1077 unsigned char *src
= coding
->source
+ coding
->consumed
;
1078 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1079 unsigned char *src_base
;
1080 int *charbuf
= coding
->charbuf
;
1081 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1082 int consumed_chars
= 0, consumed_chars_base
;
1083 int multibytep
= coding
->src_multibyte
;
1084 Lisp_Object attr
, eol_type
, charset_list
;
1086 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1090 int c
, c1
, c2
, c3
, c4
, c5
;
1093 consumed_chars_base
= consumed_chars
;
1095 if (charbuf
>= charbuf_end
)
1099 if (UTF_8_1_OCTET_P(c1
))
1104 if (EQ (eol_type
, Qdos
))
1107 goto no_more_source
;
1111 else if (EQ (eol_type
, Qmac
))
1118 if (! UTF_8_EXTRA_OCTET_P (c2
))
1120 if (UTF_8_2_OCTET_LEADING_P (c1
))
1121 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1125 if (! UTF_8_EXTRA_OCTET_P (c3
))
1127 if (UTF_8_3_OCTET_LEADING_P (c1
))
1128 c
= (((c1
& 0xF) << 12)
1129 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1133 if (! UTF_8_EXTRA_OCTET_P (c4
))
1135 if (UTF_8_4_OCTET_LEADING_P (c1
))
1136 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1137 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1141 if (! UTF_8_EXTRA_OCTET_P (c5
))
1143 if (UTF_8_5_OCTET_LEADING_P (c1
))
1145 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1146 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1163 consumed_chars
= consumed_chars_base
;
1165 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1170 coding
->consumed_char
+= consumed_chars_base
;
1171 coding
->consumed
= src_base
- coding
->source
;
1172 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1177 encode_coding_utf_8 (coding
)
1178 struct coding_system
*coding
;
1180 int multibytep
= coding
->dst_multibyte
;
1181 int *charbuf
= coding
->charbuf
;
1182 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1183 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1184 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1190 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1192 while (charbuf
< charbuf_end
)
1194 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1196 ASSURE_DESTINATION (safe_room
);
1198 CHAR_STRING_ADVANCE (c
, pend
);
1199 for (p
= str
; p
< pend
; p
++)
1205 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1207 while (charbuf
< charbuf_end
)
1209 ASSURE_DESTINATION (safe_room
);
1211 dst
+= CHAR_STRING (c
, dst
);
1215 coding
->result
= CODING_RESULT_SUCCESS
;
1216 coding
->produced_char
+= produced_chars
;
1217 coding
->produced
= dst
- coding
->destination
;
1222 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1223 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1224 Little Endian (otherwise). If it is, return
1225 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
/* Nonzero if bits 10..15 of VAL are 110110, i.e. the low 16 bits of
   VAL lie in 0xD800..0xDBFF.  Bits above bit 15 are not examined.  */
1228 #define UTF_16_HIGH_SURROGATE_P(val) \
1229 (((val) & 0xFC00) == 0xD800)
/* Nonzero if bits 10..15 of VAL are 110111, i.e. the low 16 bits of
   VAL lie in 0xDC00..0xDFFF.  Bits above bit 15 are not examined.  */
1231 #define UTF_16_LOW_SURROGATE_P(val) \
1232 (((val) & 0xFC00) == 0xDC00)
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  VAL is evaluated up to three times, so it
   must not have side effects.  */
1234 #define UTF_16_INVALID_P(val) \
1235 (((val) == 0xFFFE) \
1236 || ((val) == 0xFFFF) \
1237 || UTF_16_LOW_SURROGATE_P (val))
1241 detect_coding_utf_16 (coding
, mask
)
1242 struct coding_system
*coding
;
1245 unsigned char *src
= coding
->source
, *src_base
= src
;
1246 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1247 int multibytep
= coding
->src_multibyte
;
1248 int consumed_chars
= 0;
1254 if ((c1
== 0xFF) && (c2
== 0xFE))
1256 *mask
&= CATEGORY_MASK_UTF_16_LE
;
1259 else if ((c1
== 0xFE) && (c2
== 0xFF))
1261 *mask
&= CATEGORY_MASK_UTF_16_BE
;
1269 decode_coding_utf_16 (coding
)
1270 struct coding_system
*coding
;
1272 unsigned char *src
= coding
->source
+ coding
->consumed
;
1273 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1274 unsigned char *src_base
, *surrogate_high_base
;
1275 int *charbuf
= coding
->charbuf
;
1276 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1277 int consumed_chars
= 0, consumed_chars_base
;
1278 int multibytep
= coding
->src_multibyte
;
1279 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1280 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1281 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1282 Lisp_Object attr
, eol_type
, charset_list
;
1284 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1286 if (bom
!= utf_16_without_bom
)
1293 c
= (c1
<< 16) | c2
;
1294 if (bom
== utf_16_with_bom
)
1296 if (endian
== utf_16_big_endian
1297 ? c
!= 0xFFFE : c
!= 0xFEFF)
1299 /* We are sure that there's enough room at CHARBUF. */
1308 CODING_UTF_16_ENDIAN (coding
)
1309 = endian
= utf_16_big_endian
;
1310 else if (c
== 0xFEFF)
1311 CODING_UTF_16_ENDIAN (coding
)
1312 = endian
= utf_16_little_endian
;
1315 CODING_UTF_16_ENDIAN (coding
)
1316 = endian
= utf_16_big_endian
;
1320 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1328 consumed_chars_base
= consumed_chars
;
1330 if (charbuf
+ 2 >= charbuf_end
)
1335 c
= (endian
== utf_16_big_endian
1336 ? ((c1
<< 16) | c2
) : ((c2
<< 16) | c1
));
1339 if (! UTF_16_LOW_SURROGATE_P (c
))
1341 if (endian
== utf_16_big_endian
)
1342 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1344 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1348 if (UTF_16_HIGH_SURROGATE_P (c
))
1349 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1355 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1356 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1362 if (UTF_16_HIGH_SURROGATE_P (c
))
1363 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1370 coding
->consumed_char
+= consumed_chars_base
;
1371 coding
->consumed
= src_base
- coding
->source
;
1372 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1376 encode_coding_utf_16 (coding
)
1377 struct coding_system
*coding
;
1379 int multibytep
= coding
->dst_multibyte
;
1380 int *charbuf
= coding
->charbuf
;
1381 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1382 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1383 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1385 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1386 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1387 int produced_chars
= 0;
1388 Lisp_Object attrs
, eol_type
, charset_list
;
1391 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1393 if (bom
== utf_16_with_bom
)
1395 ASSURE_DESTINATION (safe_room
);
1397 EMIT_TWO_BYTES (0xFF, 0xFE);
1399 EMIT_TWO_BYTES (0xFE, 0xFF);
1400 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1403 while (charbuf
< charbuf_end
)
1405 ASSURE_DESTINATION (safe_room
);
1413 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1415 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1422 c1
= (c
>> 10) + 0xD800;
1423 c2
= (c
& 0x3FF) + 0xDC00;
1425 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1427 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1430 coding
->result
= CODING_RESULT_SUCCESS
;
1431 coding
->produced
= dst
- coding
->destination
;
1432 coding
->produced_char
+= produced_chars
;
1437 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1439 /* Emacs' internal format for representation of multiple character
1440 sets is a kind of multi-byte encoding, i.e. characters are
1441 represented by variable-length sequences of one-byte codes.
1443 ASCII characters and control characters (e.g. `tab', `newline') are
1444 represented by one-byte sequences which are their ASCII codes, in
1445 the range 0x00 through 0x7F.
1447 8-bit characters of the range 0x80..0x9F are represented by
1448 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1451 8-bit characters of the range 0xA0..0xFF are represented by
1452 one-byte sequences which are their 8-bit code.
1454 The other characters are represented by a sequence of `base
1455 leading-code', optional `extended leading-code', and one or two
1456 `position-code's. The length of the sequence is determined by the
1457 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1458 whereas extended leading-code and position-code take the range 0xA0
1459 through 0xFF. See `charset.h' for more details about leading-code
1462 --- CODE RANGE of Emacs' internal format ---
1466 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1467 eight-bit-graphic 0xA0..0xFF
1468 ELSE 0x81..0x9D + [0xA0..0xFF]+
1469 ---------------------------------------------
1471 As this is the internal character representation, the format is
1472 usually not used externally (i.e. in a file or in a data sent to a
1473 process). But, it is possible to have a text externally in this
1474 format (i.e. by encoding by the coding system `emacs-mule').
1476 In that case, a sequence of one-byte codes has a slightly different
1479 At first, all characters in eight-bit-control are represented by
1480 one-byte sequences which are their 8-bit code.
1482 Next, character composition data are represented by the byte
1483 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1485 METHOD is 0xF0 plus one of composition method (enum
1486 composition_method),
1488 BYTES is 0xA0 plus a byte length of this composition data,
1490 CHARS is 0xA0 plus a number of characters composed by this
1493 COMPONENTs are characters of multibyte form or composition
1494 rules encoded by two-byte of ASCII codes.
1496 In addition, for backward compatibility, the following formats are
1497 also recognized as composition data on decoding.
1500 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1503 MSEQ is a multibyte form but in one of these special formats:
1504 ASCII: 0xA0 ASCII_CODE+0x80,
1505 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1506 RULE is a one byte code of the range 0xA0..0xF0 that
1507 represents a composition rule.
/* Table with one entry per byte value.  The code below switches on
   the entry for a leading byte and compares it with the number of
   bytes consumed for one sequence, so each entry appears to be the
   byte length of the emacs-mule sequence introduced by that byte.
   NOTE(review): the initialization is not visible here -- confirm.  */
1510 char emacs_mule_bytes
[256];
1512 /* Leading-codes that are followed by an extended leading-code.
   EMACS_MULE_LEADING_CODES emits one of these four values as the
   first byte and the charset's emacs-mule ID as the second.  */
1513 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1514 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1515 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1516 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1520 emacs_mule_char (coding
, composition
, nbytes
, nchars
)
1521 struct coding_system
*coding
;
1523 int *nbytes
, *nchars
;
1525 unsigned char *src
= coding
->source
+ coding
->consumed
;
1526 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1527 int multibytep
= coding
->src_multibyte
;
1528 unsigned char *src_base
= src
;
1529 struct charset
*charset
;
1532 int consumed_chars
= 0;
1543 *nbytes
= src
- src_base
;
1544 *nchars
= consumed_chars
;
1549 switch (emacs_mule_bytes
[c
])
1552 if (! (charset
= emacs_mule_charset
[c
]))
1559 if (c
== LEADING_CODE_PRIVATE_11
1560 || c
== LEADING_CODE_PRIVATE_12
)
1563 if (! (charset
= emacs_mule_charset
[c
]))
1570 if (! (charset
= emacs_mule_charset
[c
]))
1573 code
= (c
& 0x7F) << 7;
1580 if (! (charset
= emacs_mule_charset
[c
]))
1583 code
= (c
& 0x7F) << 7;
1590 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
) ? charset_ascii
1591 : code
< 0xA0 ? charset_8_bit_control
1592 : charset_8_bit_graphic
);
1598 c
= DECODE_CHAR (charset
, code
);
1601 *nbytes
= src
- src_base
;
1602 *nchars
= consumed_chars
;
1613 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1614 Check if a text is encoded in `emacs-mule'. */
1617 detect_coding_emacs_mule (coding
, mask
)
1618 struct coding_system
*coding
;
1621 unsigned char *src
= coding
->source
, *src_base
= src
;
1622 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1623 int multibytep
= coding
->src_multibyte
;
1624 int consumed_chars
= 0;
1628 /* A coding system of this category is always ASCII compatible. */
1629 src
+= coding
->head_ascii
;
1637 /* Perhaps the start of a composite character. We simply skip
1638 it because analyzing it is too heavy for detecting. But,
1639 at least, we check that the composite character
1640 consists of more than 4 bytes. */
1641 unsigned char *src_base
;
1651 if (src
- src_base
<= 4)
1661 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1666 unsigned char *src_base
= src
- 1;
1673 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1678 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1684 *mask
&= CATEGORY_MASK_EMACS_MULE
;
1689 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1691 /* Decode a character represented as a component of composition
1692 sequence of Emacs 20/21 style at SRC. Set C to that character and
1693 update SRC to the head of next character (or an encoded composition
1694 rule). If SRC doesn't point to a composition component, set C to -1.
1695 If SRC points to an invalid byte sequence, global exit by a return
1698 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1702 int nbytes, nchars; \
1704 if (src == src_end) \
1706 c = emacs_mule_char (coding, 1, &nbytes, &nchars); \
1711 goto invalid_code; \
1715 consumed_chars += nchars; \
1720 /* Decode a composition rule represented as a component of composition
1721 sequence of Emacs 20 style at SRC. Set C to the rule. If SRC
1722 points to an invalid byte sequence, set C to -1. */
1724 #define DECODE_EMACS_MULE_COMPOSITION_RULE(buf) \
1726 int c, gref, nref; \
1728 if (src < src_end) \
1729 goto invalid_code; \
1730 ONE_MORE_BYTE_NO_CHECK (c); \
1732 if (c < 0 || c >= 81) \
1733 goto invalid_code; \
1735 gref = c / 9, nref = c % 9; \
1736 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1740 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1743 *buf++ = coding->produced_char + char_offset; \
1744 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1750 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1752 /* Emacs 21 style format. The first three bytes at SRC are \
1753 (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1754 the byte length of this composition information, CHARS is the \
1755 number of characters composed by this composition. */ \
1756 enum composition_method method = c - 0xF0; \
1757 int consumed_chars_limit; \
1758 int nbytes, nchars; \
1760 ONE_MORE_BYTE (c); \
1761 nbytes = c - 0xA0; \
1763 goto invalid_code; \
1764 ONE_MORE_BYTE (c); \
1765 nchars = c - 0xA0; \
1766 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1767 consumed_chars_limit = consumed_chars_base + nbytes; \
1768 if (method != COMPOSITION_RELATIVE) \
1771 while (consumed_chars < consumed_chars_limit) \
1773 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1774 DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \
1776 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1778 if (consumed_chars < consumed_chars_limit) \
1779 goto invalid_code; \
1784 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1786 /* Emacs 20 style format for relative composition. */ \
1787 /* Store multibyte form of characters to be composed. */ \
1788 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1789 int *buf = components; \
1793 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1794 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1795 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1797 goto invalid_code; \
1798 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1799 for (j = 0; j < i; j++) \
1800 *charbuf++ = components[j]; \
1804 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1806 /* Emacs 20 style format for rule-base composition. */ \
1807 /* Store multibyte form of characters to be composed. */ \
1808 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1809 int *buf = components; \
1812 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1813 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1815 DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \
1816 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1818 if (i < 1 || (buf - components) % 2 == 0) \
1819 goto invalid_code; \
1820 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1821 goto no_more_source; \
1822 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1823 for (j = 0; j < i; j++) \
1824 *charbuf++ = components[j]; \
1825 for (j = 0; j < i; j += 2) \
1826 *charbuf++ = components[j]; \
1831 decode_coding_emacs_mule (coding
)
1832 struct coding_system
*coding
;
1834 unsigned char *src
= coding
->source
+ coding
->consumed
;
1835 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1836 unsigned char *src_base
;
1837 int *charbuf
= coding
->charbuf
;
1838 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1839 int consumed_chars
= 0, consumed_chars_base
;
1840 int char_offset
= 0;
1841 int multibytep
= coding
->src_multibyte
;
1842 Lisp_Object attrs
, eol_type
, charset_list
;
1844 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1851 consumed_chars_base
= consumed_chars
;
1853 if (charbuf
>= charbuf_end
)
1862 if (EQ (eol_type
, Qdos
))
1865 goto no_more_source
;
1869 else if (EQ (eol_type
, Qmac
))
1877 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
1880 if (c
- 0xF0 >= COMPOSITION_RELATIVE
1881 && c
- 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1882 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1884 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1886 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1890 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1894 c
= emacs_mule_char (coding
, 0, &nbytes
, &nchars
);
1908 consumed_chars
= consumed_chars_base
;
1910 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1915 coding
->consumed_char
+= consumed_chars_base
;
1916 coding
->consumed
= src_base
- coding
->source
;
1917 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1921 #define EMACS_MULE_LEADING_CODES(id, codes) \
1924 codes[0] = id, codes[1] = 0; \
1925 else if (id < 0xE0) \
1926 codes[0] = 0x9A, codes[1] = id; \
1927 else if (id < 0xF0) \
1928 codes[0] = 0x9B, codes[1] = id; \
1929 else if (id < 0xF5) \
1930 codes[0] = 0x9C, codes[1] = id; \
1932 codes[0] = 0x9D, codes[1] = id; \
1937 encode_coding_emacs_mule (coding
)
1938 struct coding_system
*coding
;
1940 int multibytep
= coding
->dst_multibyte
;
1941 int *charbuf
= coding
->charbuf
;
1942 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1943 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1944 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1946 unsigned char *adjusted_dst_end
=dst_end
- 8;
1947 int produced_chars
= 0;
1948 Lisp_Object attrs
, eol_type
, charset_list
;
1951 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1953 while (charbuf
< charbuf_end
)
1955 ASSURE_DESTINATION (safe_room
);
1957 if (ASCII_CHAR_P (c
))
1958 EMIT_ONE_ASCII_BYTE (c
);
1961 struct charset
*charset
;
1965 unsigned char leading_codes
[2];
1967 charset
= char_charset (c
, charset_list
, &code
);
1970 c
= coding
->default_char
;
1971 if (ASCII_CHAR_P (c
))
1973 EMIT_ONE_ASCII_BYTE (c
);
1976 charset
= char_charset (c
, charset_list
, &code
);
1978 dimension
= CHARSET_DIMENSION (charset
);
1979 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
1980 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
1981 EMIT_ONE_BYTE (leading_codes
[0]);
1982 if (leading_codes
[1])
1983 EMIT_ONE_BYTE (leading_codes
[1]);
1985 EMIT_ONE_BYTE (code
);
1988 EMIT_ONE_BYTE (code
>> 8);
1989 EMIT_ONE_BYTE (code
& 0xFF);
1993 coding
->result
= CODING_RESULT_SUCCESS
;
1994 coding
->produced_char
+= produced_chars
;
1995 coding
->produced
= dst
- coding
->destination
;
2000 /*** 7. ISO2022 handlers ***/
2002 /* The following note describes the coding system ISO2022 briefly.
2003 Since the intention of this note is to help understand the
2004 functions in this file, some parts are NOT ACCURATE or OVERLY
2005 SIMPLIFIED. For thorough understanding, please refer to the
2006 original document of ISO2022.
2008 ISO2022 provides many mechanisms to encode several character sets
2009 in 7-bit and 8-bit environments. For 7-bit environments, all text
2010 is encoded using bytes less than 128. This may make the encoded
2011 text a little bit longer, but the text passes more easily through
2012 several gateways, some of which strip off MSB (Most Significant Bit).
2014 There are two kinds of character sets: control character set and
2015 graphic character set. The former contains control characters such
2016 as `newline' and `escape' to provide control functions (control
2017 functions are also provided by escape sequences). The latter
2018 contains graphic characters such as 'A' and '-'. Emacs recognizes
2019 two control character sets and many graphic character sets.
2021 Graphic character sets are classified into one of the following
2022 four classes, according to the number of bytes (DIMENSION) and
2023 number of characters in one dimension (CHARS) of the set:
2024 - DIMENSION1_CHARS94
2025 - DIMENSION1_CHARS96
2026 - DIMENSION2_CHARS94
2027 - DIMENSION2_CHARS96
2029 In addition, each character set is assigned an identification tag,
2030 unique for each set, called "final character" (denoted as <F>
2031 hereafter). The <F> of each character set is decided by ECMA(*)
2032 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2033 (0x30..0x3F are for private use only).
2035 Note (*): ECMA = European Computer Manufacturers Association
2037 Here are examples of graphic character set [NAME(<F>)]:
2038 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2039 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2040 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2041 o DIMENSION2_CHARS96 -- none for the moment
2043 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2044 C0 [0x00..0x1F] -- control character plane 0
2045 GL [0x20..0x7F] -- graphic character plane 0
2046 C1 [0x80..0x9F] -- control character plane 1
2047 GR [0xA0..0xFF] -- graphic character plane 1
2049 A control character set is directly designated and invoked to C0 or
2050 C1 by an escape sequence. The most common case is that:
2051 - ISO646's control character set is designated/invoked to C0, and
2052 - ISO6429's control character set is designated/invoked to C1,
2053 and usually these designations/invocations are omitted in encoded
2054 text. In a 7-bit environment, only C0 can be used, and a control
2055 character for C1 is encoded by an appropriate escape sequence to
2056 fit into the environment. All control characters for C1 are
2057 defined to have corresponding escape sequences.
2059 A graphic character set is at first designated to one of four
2060 graphic registers (G0 through G3), then these graphic registers are
2061 invoked to GL or GR. These designations and invocations can be
2062 done independently. The most common case is that G0 is invoked to
2063 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2064 these invocations and designations are omitted in encoded text.
2065 In a 7-bit environment, only GL can be used.
2067 When a graphic character set of CHARS94 is invoked to GL, codes
2068 0x20 and 0x7F of the GL area work as control characters SPACE and
2069 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2072 There are two ways of invocation: locking-shift and single-shift.
2073 With locking-shift, the invocation lasts until the next different
2074 invocation, whereas with single-shift, the invocation affects the
2075 following character only and doesn't affect the locking-shift
2076 state. Invocations are done by the following control characters or
2079 ----------------------------------------------------------------------
2080 abbrev function cntrl escape seq description
2081 ----------------------------------------------------------------------
2082 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2083 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2084 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2085 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2086 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2087 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2088 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2089 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2090 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2091 ----------------------------------------------------------------------
2092 (*) These are not used by any known coding system.
2094 Control characters for these functions are defined by macros
2095 ISO_CODE_XXX in `coding.h'.
2097 Designations are done by the following escape sequences:
2098 ----------------------------------------------------------------------
2099 escape sequence description
2100 ----------------------------------------------------------------------
2101 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2102 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2103 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2104 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2105 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2106 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2107 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2108 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2109 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2110 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2111 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2112 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2113 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2114 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2115 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2116 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2117 ----------------------------------------------------------------------
2119 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2120 of dimension 1, chars 94, and final character <F>, etc...
2122 Note (*): Although these designations are not allowed in ISO2022,
2123 Emacs accepts them on decoding, and produces them on encoding
2124 CHARS96 character sets in a coding system which is characterized as
2125 7-bit environment, non-locking-shift, and non-single-shift.
2127 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2128 '(' must be omitted. We refer to this as "short-form" hereafter.
2130 Now you may notice that there are a lot of ways for encoding the
2131 same multilingual text in ISO2022. Actually, there exist many
2132 coding systems such as Compound Text (used in X11's inter client
2133 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2134 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2135 localized platforms), and all of these are variants of ISO2022.
2137 In addition to the above, Emacs handles two more kinds of escape
2138 sequences: ISO6429's direction specification and Emacs' private
2139 sequence for specifying character composition.
2141 ISO6429's direction specification takes the following form:
2142 o CSI ']' -- end of the current direction
2143 o CSI '0' ']' -- end of the current direction
2144 o CSI '1' ']' -- start of left-to-right text
2145 o CSI '2' ']' -- start of right-to-left text
2146 The control character CSI (0x9B: control sequence introducer) is
2147 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2149 Character composition specification takes the following form:
2150 o ESC '0' -- start relative composition
2151 o ESC '1' -- end composition
2152 o ESC '2' -- start rule-base composition (*)
2153 o ESC '3' -- start relative composition with alternate chars (**)
2154 o ESC '4' -- start rule-base composition with alternate chars (**)
2155 Since these are not standard escape sequences of any ISO standard,
2156 the use of them for these meaning is restricted to Emacs only.
2158 (*) This form is used only in Emacs 20.5 and the older versions,
2159 but the newer versions can safely decode it.
2160 (**) This form is used only in Emacs 21.1 and the newer versions,
2161 and the older versions can't decode it.
2163 Here's a list of example usages of these composition escape
2164 sequences (categorized by `enum composition_method').
2166 COMPOSITION_RELATIVE:
2167 ESC 0 CHAR [ CHAR ] ESC 1
2168 COMPOSITION_WITH_RULE:
2169 ESC 2 CHAR [ RULE CHAR ] ESC 1
2170 COMPOSITION_WITH_ALTCHARS:
2171 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2172 COMPOSITION_WITH_RULE_ALTCHARS:
2173 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
/* Table with one entry per byte value (256 entries).
   NOTE(review): presumably gives the ISO 2022 code class of each
   byte; it is not referenced in the part of the file visible here --
   confirm against its users and initialization.  */
2175 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if the charset whose ID is ID is safe for CODING: ID must
   lie in 0..CODING->max_charset_id and the entry for ID in
   CODING->safe_charsets must be non-negative.

   CODING->safe_charsets is accessed as an array of plain `char'.
   Plain `char' is unsigned on some platforms, where a bare `>= 0'
   test would succeed for every entry, so the entry is converted to
   `signed char' before the comparison.  A negative ID is rejected
   rather than used to index before the start of the array.

   ID is evaluated more than once; it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)                      \
  ((id) >= 0                                            \
   && (id) <= (coding)->max_charset_id                  \
   && (signed char) (coding)->safe_charsets[id] >= 0)
/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   stored in coding_categories[CATEGORY] is non-negative.
   detect_coding_iso_2022 tests this before treating a byte as a
   locking shift-out.
   NOTE(review): presumably a negative value means that no charset is
   initially designated to G1 -- confirm against CODING_ISO_INITIAL.  */
2182 #define SHIFT_OUT_OK(category) \
2183 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2186 setup_iso_safe_charsets (Lisp_Object attrs
)
2188 Lisp_Object charset_list
, safe_charsets
;
2189 Lisp_Object request
;
2190 Lisp_Object reg_usage
;
2193 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2196 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2197 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2198 && ! EQ (charset_list
, Viso_2022_charset_list
))
2200 CODING_ATTR_CHARSET_LIST (attrs
)
2201 = charset_list
= Viso_2022_charset_list
;
2202 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2205 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2209 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2211 int id
= XINT (XCAR (tail
));
2212 if (max_charset_id
< id
)
2213 max_charset_id
= id
;
2216 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2218 request
= AREF (attrs
, coding_attr_iso_request
);
2219 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2220 reg94
= XINT (XCAR (reg_usage
));
2221 reg96
= XINT (XCDR (reg_usage
));
2223 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2227 struct charset
*charset
;
2230 charset
= CHARSET_FROM_ID (XINT (id
));
2231 reg
= Fcdr (Fassq (request
, id
));
2233 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2234 else if (charset
->iso_chars_96
)
2237 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2242 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2245 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2249 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2250 Check if a text is encoded in ISO2022. If it is, returns an
2251 integer in which appropriate flag bits any of:
2253 CATEGORY_MASK_ISO_7_TIGHT
2254 CATEGORY_MASK_ISO_8_1
2255 CATEGORY_MASK_ISO_8_2
2256 CATEGORY_MASK_ISO_7_ELSE
2257 CATEGORY_MASK_ISO_8_ELSE
2258 are set. If a code which should never appear in ISO2022 is found,
2262 detect_coding_iso_2022 (coding
, mask
)
2263 struct coding_system
*coding
;
2266 unsigned char *src
= coding
->source
, *src_base
= src
;
2267 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2268 int multibytep
= coding
->src_multibyte
;
2269 int mask_iso
= CATEGORY_MASK_ISO
;
2270 int mask_found
= 0, mask_8bit_found
= 0;
2271 int reg
[4], shift_out
= 0, single_shifting
= 0;
2274 int consumed_chars
= 0;
2277 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2279 struct coding_system
*this = &(coding_categories
[i
]);
2280 Lisp_Object attrs
, val
;
2282 attrs
= CODING_ID_ATTRS (this->id
);
2283 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2284 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2285 setup_iso_safe_charsets (attrs
);
2286 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2287 this->max_charset_id
= XSTRING (val
)->size
- 1;
2288 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2291 /* A coding system of this category is always ASCII compatible. */
2292 src
+= coding
->head_ascii
;
2294 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2295 while (mask_iso
&& src
< src_end
)
2301 if (inhibit_iso_escape_detection
)
2303 single_shifting
= 0;
2305 if (c
>= '(' && c
<= '/')
2307 /* Designation sequence for a charset of dimension 1. */
2309 if (c1
< ' ' || c1
>= 0x80
2310 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2311 /* Invalid designation sequence. Just ignore. */
2313 reg
[(c
- '(') % 4] = id
;
2317 /* Designation sequence for a charset of dimension 2. */
2319 if (c
>= '@' && c
<= 'B')
2320 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2321 reg
[0] = id
= iso_charset_table
[1][0][c
];
2322 else if (c
>= '(' && c
<= '/')
2325 if (c1
< ' ' || c1
>= 0x80
2326 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2327 /* Invalid designation sequence. Just ignore. */
2329 reg
[(c
- '(') % 4] = id
;
2332 /* Invalid designation sequence. Just ignore. */
2335 else if (c
== 'N' || c
== 'O')
2337 /* ESC <Fe> for SS2 or SS3. */
2338 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2341 else if (c
>= '0' && c
<= '4')
2343 /* ESC <Fp> for start/end composition. */
2344 mask_found
|= CATEGORY_MASK_ISO
;
2349 /* Invalid escape sequence. */
2350 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2354 /* We found a valid designation sequence for CHARSET. */
2355 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2356 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2358 mask_found
|= CATEGORY_MASK_ISO_7
;
2360 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2361 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2363 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2365 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2366 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2368 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2370 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2371 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2373 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2375 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2379 if (inhibit_iso_escape_detection
)
2381 single_shifting
= 0;
2384 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2385 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2387 /* Locking shift out. */
2388 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2389 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2394 if (inhibit_iso_escape_detection
)
2396 single_shifting
= 0;
2399 /* Locking shift in. */
2400 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2401 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2406 single_shifting
= 0;
2410 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2412 if (inhibit_iso_escape_detection
)
2414 if (c
!= ISO_CODE_CSI
)
2416 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2417 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2418 newmask
|= CATEGORY_MASK_ISO_8_1
;
2419 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2420 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2421 newmask
|= CATEGORY_MASK_ISO_8_2
;
2422 single_shifting
= 1;
2424 if (VECTORP (Vlatin_extra_code_table
)
2425 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2427 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2428 & CODING_ISO_FLAG_LATIN_EXTRA
)
2429 newmask
|= CATEGORY_MASK_ISO_8_1
;
2430 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2431 & CODING_ISO_FLAG_LATIN_EXTRA
)
2432 newmask
|= CATEGORY_MASK_ISO_8_2
;
2434 mask_iso
&= newmask
;
2435 mask_found
|= newmask
;
2442 single_shifting
= 0;
2447 single_shifting
= 0;
2448 mask_8bit_found
= 1;
2449 if (VECTORP (Vlatin_extra_code_table
)
2450 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2454 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2455 & CODING_ISO_FLAG_LATIN_EXTRA
)
2456 newmask
|= CATEGORY_MASK_ISO_8_1
;
2457 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2458 & CODING_ISO_FLAG_LATIN_EXTRA
)
2459 newmask
|= CATEGORY_MASK_ISO_8_2
;
2460 mask_iso
&= newmask
;
2461 mask_found
|= newmask
;
2468 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2469 | CATEGORY_MASK_ISO_7_ELSE
);
2470 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2471 mask_8bit_found
= 1;
2472 /* Check the length of succeeding codes of the range
2473 0xA0..0xFF. If the byte length is odd, we exclude
2474 CATEGORY_MASK_ISO_8_2. We can check this only
2475 when we are not single shifting. */
2476 if (!single_shifting
2477 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2480 while (src
< src_end
)
2488 if (i
& 1 && src
< src_end
)
2489 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2491 mask_found
|= CATEGORY_MASK_ISO_8_2
;
2500 *mask
&= ~CATEGORY_MASK_ISO
;
2505 *mask
&= mask_iso
& mask_found
;
2506 if (! mask_8bit_found
)
2507 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2512 /* Set designation state into CODING. */
2513 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2517 if (final < '0' || final >= 128 \
2518 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2519 || !SAFE_CHARSET_P (coding, id)) \
2521 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2522 goto invalid_code; \
2524 prev = CODING_ISO_DESIGNATION (coding, reg); \
2525 CODING_ISO_DESIGNATION (coding, reg) = id; \
2526 /* If there was an invalid designation to REG previously, and this \
2527 designation is ASCII to REG, we should keep this designation \
2529 if (prev == -2 && id == charset_ascii) \
2530 goto invalid_code; \
2534 #define MAYBE_FINISH_COMPOSITION() \
2537 if (composition_state == COMPOSING_NO) \
2539 /* It is assured that we have enough room for producing \
2540 characters stored in the table `components'. */ \
2541 if (charbuf + component_idx > charbuf_end) \
2542 goto no_more_source; \
2543 composition_state = COMPOSING_NO; \
2544 if (method == COMPOSITION_RELATIVE \
2545 || method == COMPOSITION_WITH_ALTCHARS) \
2547 for (i = 0; i < component_idx; i++) \
2548 *charbuf++ = components[i]; \
2549 char_offset += component_idx; \
2553 for (i = 0; i < component_idx; i += 2) \
2554 *charbuf++ = components[i]; \
2555 char_offset += (component_idx / 2) + 1; \
2560 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2561 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2562 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2563 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2564 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2567 #define DECODE_COMPOSITION_START(c1) \
2570 && composition_state == COMPOSING_COMPONENT_CHAR) \
2572 component_len = component_idx; \
2573 composition_state = COMPOSING_CHAR; \
2579 MAYBE_FINISH_COMPOSITION (); \
2580 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2581 goto no_more_source; \
2582 for (p = src; p < src_end - 1; p++) \
2583 if (*p == ISO_CODE_ESC && p[1] == '1') \
2585 if (p == src_end - 1) \
2587 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2588 goto invalid_code; \
2589 goto no_more_source; \
2592 /* This is surely the start of a composition. */ \
2593 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2594 : c1 == '2' ? COMPOSITION_WITH_RULE \
2595 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2596 : COMPOSITION_WITH_RULE_ALTCHARS); \
2597 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2598 : COMPOSING_COMPONENT_CHAR); \
2599 component_idx = component_len = 0; \
2604 /* Handle composition end sequence ESC 1. */
2606 #define DECODE_COMPOSITION_END() \
2608 int nchars = (component_len > 0 ? component_idx - component_len \
2609 : method == COMPOSITION_RELATIVE ? component_idx \
2610 : (component_idx + 1) / 2); \
2612 int *saved_charbuf = charbuf; \
2614 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2615 if (method != COMPOSITION_RELATIVE) \
2617 if (component_len == 0) \
2618 for (i = 0; i < component_idx; i++) \
2619 *charbuf++ = components[i]; \
2621 for (i = 0; i < component_len; i++) \
2622 *charbuf++ = components[i]; \
2623 *saved_charbuf = saved_charbuf - charbuf; \
2625 if (method == COMPOSITION_WITH_RULE) \
2626 for (i = 0; i < component_idx; i += 2, char_offset++) \
2627 *charbuf++ = components[i]; \
2629 for (i = component_len; i < component_idx; i++, char_offset++) \
2630 *charbuf++ = components[i]; \
2631 coding->annotated = 1; \
2632 composition_state = COMPOSING_NO; \
2636 /* Decode a composition rule from the byte C1 (and maybe one more byte
2637 from SRC) and store one encoded composition rule in
2638 coding->cmp_data. */
2640 #define DECODE_COMPOSITION_RULE(c1) \
2643 if (c1 < 81) /* old format (before ver.21) */ \
2645 int gref = (c1) / 9; \
2646 int nref = (c1) % 9; \
2647 if (gref == 4) gref = 10; \
2648 if (nref == 4) nref = 10; \
2649 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2651 else if (c1 < 93) /* new format (after ver.21) */ \
2653 ONE_MORE_BYTE (c2); \
2654 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2661 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2664 decode_coding_iso_2022 (coding
)
2665 struct coding_system
*coding
;
2667 unsigned char *src
= coding
->source
+ coding
->consumed
;
2668 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2669 unsigned char *src_base
;
2670 int *charbuf
= coding
->charbuf
;
2671 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2672 int consumed_chars
= 0, consumed_chars_base
;
2673 int char_offset
= 0;
2674 int multibytep
= coding
->src_multibyte
;
2675 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2676 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2677 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2678 struct charset
*charset
;
2680 /* For handling composition sequence. */
2681 #define COMPOSING_NO 0
2682 #define COMPOSING_CHAR 1
2683 #define COMPOSING_RULE 2
2684 #define COMPOSING_COMPONENT_CHAR 3
2685 #define COMPOSING_COMPONENT_RULE 4
2687 int composition_state
= COMPOSING_NO
;
2688 enum composition_method method
;
2689 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2692 Lisp_Object attrs
, eol_type
, charset_list
;
2694 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2695 setup_iso_safe_charsets (attrs
);
2702 consumed_chars_base
= consumed_chars
;
2704 if (charbuf
>= charbuf_end
)
2709 /* We produce no character or one character. */
2710 switch (iso_code_class
[c1
])
2712 case ISO_0x20_or_0x7F
:
2713 if (composition_state
!= COMPOSING_NO
)
2715 if (composition_state
== COMPOSING_RULE
2716 || composition_state
== COMPOSING_COMPONENT_RULE
)
2718 DECODE_COMPOSITION_RULE (c1
);
2719 components
[component_idx
++] = c1
;
2720 composition_state
--;
2723 else if (method
== COMPOSITION_WITH_RULE
)
2724 composition_state
= COMPOSING_RULE
;
2725 else if (method
== COMPOSITION_WITH_RULE_ALTCHARS
2726 && composition_state
== COMPOSING_COMPONENT_CHAR
)
2727 composition_state
= COMPOSING_COMPONENT_CHAR
;
2729 if (charset_id_0
< 0
2730 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2732 /* This is SPACE or DEL. */
2733 charset
= CHARSET_FROM_ID (charset_ascii
);
2736 /* This is a graphic character, we fall down ... */
2738 case ISO_graphic_plane_0
:
2739 if (composition_state
== COMPOSING_RULE
)
2741 DECODE_COMPOSITION_RULE (c1
);
2742 components
[component_idx
++] = c1
;
2743 composition_state
= COMPOSING_CHAR
;
2745 charset
= CHARSET_FROM_ID (charset_id_0
);
2748 case ISO_0xA0_or_0xFF
:
2749 if (charset_id_1
< 0
2750 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2751 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2753 /* This is a graphic character, we fall down ... */
2755 case ISO_graphic_plane_1
:
2756 if (charset_id_1
< 0)
2758 charset
= CHARSET_FROM_ID (charset_id_1
);
2761 case ISO_carriage_return
:
2764 if (EQ (eol_type
, Qdos
))
2767 goto no_more_source
;
2771 else if (EQ (eol_type
, Qmac
))
2777 MAYBE_FINISH_COMPOSITION ();
2778 charset
= CHARSET_FROM_ID (charset_ascii
);
2782 MAYBE_FINISH_COMPOSITION ();
2786 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2787 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2789 CODING_ISO_INVOCATION (coding
, 0) = 1;
2790 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2794 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2796 CODING_ISO_INVOCATION (coding
, 0) = 0;
2797 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2800 case ISO_single_shift_2_7
:
2801 case ISO_single_shift_2
:
2802 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2804 /* SS2 is handled as an escape sequence of ESC 'N' */
2806 goto label_escape_sequence
;
2808 case ISO_single_shift_3
:
2809 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2811 /* SS3 is handled as an escape sequence of ESC 'O' */
2813 goto label_escape_sequence
;
2815 case ISO_control_sequence_introducer
:
2816 /* CSI is handled as an escape sequence of ESC '[' ... */
2818 goto label_escape_sequence
;
2822 label_escape_sequence
:
2823 /* Escape sequences handled here are invocation,
2824 designation, direction specification, and character
2825 composition specification. */
2828 case '&': /* revision of following character set */
2830 if (!(c1
>= '@' && c1
<= '~'))
2833 if (c1
!= ISO_CODE_ESC
)
2836 goto label_escape_sequence
;
2838 case '$': /* designation of 2-byte character set */
2839 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2842 if (c1
>= '@' && c1
<= 'B')
2843 { /* designation of JISX0208.1978, GB2312.1980,
2845 DECODE_DESIGNATION (0, 2, 0, c1
);
2847 else if (c1
>= 0x28 && c1
<= 0x2B)
2848 { /* designation of DIMENSION2_CHARS94 character set */
2850 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2852 else if (c1
>= 0x2C && c1
<= 0x2F)
2853 { /* designation of DIMENSION2_CHARS96 character set */
2855 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2859 /* We must update these variables now. */
2860 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2861 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2864 case 'n': /* invocation of locking-shift-2 */
2865 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2866 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2868 CODING_ISO_INVOCATION (coding
, 0) = 2;
2869 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2872 case 'o': /* invocation of locking-shift-3 */
2873 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2874 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2876 CODING_ISO_INVOCATION (coding
, 0) = 3;
2877 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2880 case 'N': /* invocation of single-shift-2 */
2881 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2882 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2884 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2886 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2890 case 'O': /* invocation of single-shift-3 */
2891 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2892 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2894 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2896 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2900 case '0': case '2': case '3': case '4': /* start composition */
2901 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2903 DECODE_COMPOSITION_START (c1
);
2906 case '1': /* end composition */
2907 if (composition_state
== COMPOSING_NO
)
2909 DECODE_COMPOSITION_END ();
2912 case '[': /* specification of direction */
2913 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
2915 /* For the moment, nested direction is not supported.
2916 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2917 left-to-right, and nonzero means right-to-left. */
2921 case ']': /* end of the current direction */
2922 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2924 case '0': /* end of the current direction */
2925 case '1': /* start of left-to-right direction */
2928 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2933 case '2': /* start of right-to-left direction */
2936 coding
->mode
|= CODING_MODE_DIRECTION
;
2947 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2949 if (c1
>= 0x28 && c1
<= 0x2B)
2950 { /* designation of DIMENSION1_CHARS94 character set */
2952 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
2954 else if (c1
>= 0x2C && c1
<= 0x2F)
2955 { /* designation of DIMENSION1_CHARS96 character set */
2957 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
2961 /* We must update these variables now. */
2962 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2963 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2968 /* Now we know CHARSET and 1st position code C1 of a character.
2969 Produce a decoded character while getting 2nd position code
2972 if (CHARSET_DIMENSION (charset
) > 1)
2975 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
2976 /* C2 is not in a valid range. */
2978 c1
= (c1
<< 8) | (c2
& 0x7F);
2979 if (CHARSET_DIMENSION (charset
) > 2)
2982 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
2983 /* C2 is not in a valid range. */
2985 c1
= (c1
<< 8) | (c2
& 0x7F);
2989 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
2992 MAYBE_FINISH_COMPOSITION ();
2993 for (; src_base
< src
; src_base
++, char_offset
++)
2995 if (ASCII_BYTE_P (*src_base
))
2996 *charbuf
++ = *src_base
;
2998 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3001 else if (composition_state
== COMPOSING_NO
)
3007 components
[component_idx
++] = c
;
3011 MAYBE_FINISH_COMPOSITION ();
3013 consumed_chars
= consumed_chars_base
;
3015 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3020 coding
->consumed_char
+= consumed_chars_base
;
3021 coding
->consumed
= src_base
- coding
->source
;
3022 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3026 /* ISO2022 encoding stuff. */
3029 It is not enough to say just "ISO2022" on encoding, we have to
3030 specify more details. In Emacs, each coding system of ISO2022
3031 variant has the following specifications:
3032 1. Initial designation to G0 thru G3.
3033 2. Allows short-form designation?
3034 3. ASCII should be designated to G0 before control characters?
3035 4. ASCII should be designated to G0 at end of line?
3036 5. 7-bit environment or 8-bit environment?
3037 6. Use locking-shift?
3038 7. Use Single-shift?
3039 And the following two are only for Japanese:
3040 8. Use ASCII in place of JIS0201-1976-Roman?
3041 9. Use JISX0208-1983 in place of JISX0208-1978?
3042 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3043 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3047 /* Produce codes (escape sequence) for designating CHARSET to graphic
3048 register REG at DST, and increment DST. If <final-char> of CHARSET is
3049 '@', 'A', or 'B' and the coding system CODING allows, produce
3050 designation sequence of short-form. */
3052 #define ENCODE_DESIGNATION(charset, reg, coding) \
3054 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3055 char *intermediate_char_94 = "()*+"; \
3056 char *intermediate_char_96 = ",-./"; \
3057 int revision = -1; \
3060 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3061 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3063 if (revision >= 0) \
3065 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3066 EMIT_ONE_BYTE ('@' + revision); \
3068 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3069 if (CHARSET_DIMENSION (charset) == 1) \
3071 if (! CHARSET_ISO_CHARS_96 (charset)) \
3072 c = intermediate_char_94[reg]; \
3074 c = intermediate_char_96[reg]; \
3075 EMIT_ONE_ASCII_BYTE (c); \
3079 EMIT_ONE_ASCII_BYTE ('$'); \
3080 if (! CHARSET_ISO_CHARS_96 (charset)) \
3082 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3084 || final_char < '@' || final_char > 'B') \
3085 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3088 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3090 EMIT_ONE_ASCII_BYTE (final_char); \
3092 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3096 /* The following two macros produce codes (control character or escape
3097 sequence) for ISO2022 single-shift functions (single-shift-2 and
3100 #define ENCODE_SINGLE_SHIFT_2 \
3102 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3103 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3105 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3106 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3110 #define ENCODE_SINGLE_SHIFT_3 \
3112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3113 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3115 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3116 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3120 /* The following four macros produce codes (control character or
3121 escape sequence) for ISO2022 locking-shift functions (shift-in,
3122 shift-out, locking-shift-2, and locking-shift-3). */
3124 #define ENCODE_SHIFT_IN \
3126 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3127 CODING_ISO_INVOCATION (coding, 0) = 0; \
3131 #define ENCODE_SHIFT_OUT \
3133 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3134 CODING_ISO_INVOCATION (coding, 0) = 1; \
3138 #define ENCODE_LOCKING_SHIFT_2 \
3140 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3141 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit the locking-shift-3 escape sequence, ESC 'o', and record that
   graphic register 3 is now invoked to graphic plane 0.  The final
   byte must be 'o': ESC 'n' is what ENCODE_LOCKING_SHIFT_2 emits, and
   the decoder treats ESC 'o' as the invocation of locking-shift-3.
   Expects `coding' (and whatever EMIT_TWO_ASCII_BYTES uses) to be in
   scope at the point of use.  */
#define ENCODE_LOCKING_SHIFT_3			\
  do {						\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');	\
    CODING_ISO_INVOCATION (coding, 0) = 3;	\
  } while (0)
3152 /* Produce codes for a DIMENSION1 character whose character set is
3153 CHARSET and whose position-code is C1. Designation and invocation
3154 sequences are also produced in advance if necessary. */
3156 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3158 int id = CHARSET_ID (charset); \
3159 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3161 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3162 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3164 EMIT_ONE_BYTE (c1 | 0x80); \
3165 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3168 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3170 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3173 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3175 EMIT_ONE_BYTE (c1 | 0x80); \
3179 /* Since CHARSET is not yet invoked to any graphic planes, we \
3180 must invoke it, or, at first, designate it to some graphic \
3181 register. Then repeat the loop to actually produce the \
3183 dst = encode_invocation_designation (charset, coding, dst, \
3188 /* Produce codes for a DIMENSION2 character whose character set is
3189 CHARSET and whose position-codes are C1 and C2. Designation and
3190 invocation codes are also produced in advance if necessary. */
3192 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3194 int id = CHARSET_ID (charset); \
3195 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3197 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3198 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3200 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3201 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3204 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3206 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3209 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3211 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3215 /* Since CHARSET is not yet invoked to any graphic planes, we \
3216 must invoke it, or, at first, designate it to some graphic \
3217 register. Then repeat the loop to actually produce the \
3219 dst = encode_invocation_designation (charset, coding, dst, \
3224 #define ENCODE_ISO_CHARACTER(charset, c) \
3226 int code = ENCODE_CHAR ((charset),(c)); \
3228 if (CHARSET_DIMENSION (charset) == 1) \
3229 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3231 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3235 /* Produce designation and invocation codes at a place pointed by DST
3236 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3240 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3241 struct charset
*charset
;
3242 struct coding_system
*coding
;
3246 int multibytep
= coding
->dst_multibyte
;
3247 int produced_chars
= *p_nchars
;
3248 int reg
; /* graphic register number */
3249 int id
= CHARSET_ID (charset
);
3251 /* At first, check designations. */
3252 for (reg
= 0; reg
< 4; reg
++)
3253 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3258 /* CHARSET is not yet designated to any graphic registers. */
3259 /* At first check the requested designation. */
3260 reg
= CODING_ISO_REQUEST (coding
, id
);
3262 /* Since CHARSET requests no special designation, designate it
3263 to graphic register 0. */
3266 ENCODE_DESIGNATION (charset
, reg
, coding
);
3269 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3270 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3272 /* Since the graphic register REG is not invoked to any graphic
3273 planes, invoke it to graphic plane 0. */
3276 case 0: /* graphic register 0 */
3280 case 1: /* graphic register 1 */
3284 case 2: /* graphic register 2 */
3285 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3286 ENCODE_SINGLE_SHIFT_2
;
3288 ENCODE_LOCKING_SHIFT_2
;
3291 case 3: /* graphic register 3 */
3292 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3293 ENCODE_SINGLE_SHIFT_3
;
3295 ENCODE_LOCKING_SHIFT_3
;
3300 *p_nchars
= produced_chars
;
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: the two-byte form ESC '[' when
   the coding system has CODING_ISO_FLAG_SEVEN_BITS set, otherwise the
   single byte ISO_CODE_CSI.  The flag is tested with `&', as in
   ENCODE_SINGLE_SHIFT_2; comparing the whole flag word with `=='
   matched only when SEVEN_BITS was the sole flag set.  Expects
   `coding' to be in scope at the point of use.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Emit a control sequence introducer followed by "2]"; the decoder
   treats '2' after the introducer as the start of right-to-left
   direction.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   macro, so it is used here without an argument list.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Emit a control sequence introducer followed by "0]"; the decoder
   treats '0' after the introducer as the end of the current
   direction.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   macro, so it is used here without an argument list.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3329 /* Produce codes for designation and invocation to reset the graphic
3330 planes and registers to initial state. */
3331 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3334 struct charset *charset; \
3336 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3338 for (reg = 0; reg < 4; reg++) \
3339 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3340 && (CODING_ISO_DESIGNATION (coding, reg) \
3341 != CODING_ISO_INITIAL (coding, reg))) \
3343 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3344 ENCODE_DESIGNATION (charset, reg, coding); \
3349 /* Produce designation sequences of charsets in the line started from
3350 SRC to a place pointed by DST, and return updated DST.
3352 If the current block ends before any end-of-line, we may fail to
3353 find all the necessary designations. */
3355 static unsigned char *
3356 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3357 struct coding_system
*coding
;
3358 int *charbuf
, *charbuf_end
;
3361 struct charset
*charset
;
3362 /* Table of charsets to be designated to each graphic register. */
3364 int c
, found
= 0, reg
;
3365 int produced_chars
= 0;
3366 int multibytep
= coding
->dst_multibyte
;
3368 Lisp_Object charset_list
;
3370 attrs
= CODING_ID_ATTRS (coding
->id
);
3371 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3372 if (EQ (charset_list
, Qiso_2022
))
3373 charset_list
= Viso_2022_charset_list
;
3375 for (reg
= 0; reg
< 4; reg
++)
3385 charset
= char_charset (c
, charset_list
, NULL
);
3386 id
= CHARSET_ID (charset
);
3387 reg
= CODING_ISO_REQUEST (coding
, id
);
3388 if (reg
>= 0 && r
[reg
] < 0)
3397 for (reg
= 0; reg
< 4; reg
++)
3399 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3400 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3406 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3409 encode_coding_iso_2022 (coding
)
3410 struct coding_system
*coding
;
3412 int multibytep
= coding
->dst_multibyte
;
3413 int *charbuf
= coding
->charbuf
;
3414 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3415 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3416 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3419 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3420 && CODING_ISO_BOL (coding
));
3421 int produced_chars
= 0;
3422 Lisp_Object attrs
, eol_type
, charset_list
;
3423 int ascii_compatible
;
3426 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3428 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3430 while (charbuf
< charbuf_end
)
3432 ASSURE_DESTINATION (safe_room
);
3434 if (bol_designation
)
3436 unsigned char *dst_prev
= dst
;
3438 /* We have to produce designation sequences if any now. */
3439 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3440 bol_designation
= 0;
3441 /* We are sure that designation sequences are all ASCII bytes. */
3442 produced_chars
+= dst
- dst_prev
;
3447 /* Now encode the character C. */
3448 if (c
< 0x20 || c
== 0x7F)
3451 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3453 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3454 ENCODE_RESET_PLANE_AND_REGISTER ();
3455 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3459 for (i
= 0; i
< 4; i
++)
3460 CODING_ISO_DESIGNATION (coding
, i
)
3461 = CODING_ISO_INITIAL (coding
, i
);
3464 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3466 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3467 ENCODE_RESET_PLANE_AND_REGISTER ();
3468 EMIT_ONE_ASCII_BYTE (c
);
3470 else if (ASCII_CHAR_P (c
))
3472 if (ascii_compatible
)
3473 EMIT_ONE_ASCII_BYTE (c
);
3475 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii
), c
);
3479 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3483 c
= coding
->default_char
;
3484 charset
= char_charset (c
, charset_list
, NULL
);
3486 ENCODE_ISO_CHARACTER (charset
, c
);
3490 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3491 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3493 ASSURE_DESTINATION (safe_room
);
3494 ENCODE_RESET_PLANE_AND_REGISTER ();
3496 coding
->result
= CODING_RESULT_SUCCESS
;
3497 CODING_ISO_BOL (coding
) = bol_designation
;
3498 coding
->produced_char
+= produced_chars
;
3499 coding
->produced
= dst
- coding
->destination
;
3504 /*** 8,9. SJIS and BIG5 handlers ***/
3506 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3507 quite widely. So, for the moment, Emacs supports them in the bare
3508 C code. But, in the future, they may be supported only by CCL. */
3510 /* SJIS is a coding system encoding three character sets: ASCII, right
3511 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3512 as is. A character of charset katakana-jisx0201 is encoded by
3513 "position-code + 0x80". A character of charset japanese-jisx0208
3514 is encoded in two bytes, but the two position-codes are divided and shifted
3515 so that they fit in the range below.
3517 --- CODE RANGE of SJIS ---
3518 (character set) (range)
3520 KATAKANA-JISX0201 0xA0 .. 0xDF
3521 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3522 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3523 -------------------------------
3527 /* BIG5 is a coding system encoding two character sets: ASCII and
3528 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3529 character set and is encoded in two bytes.
3531 --- CODE RANGE of BIG5 ---
3532 (character set) (range)
3534 Big5 (1st byte) 0xA1 .. 0xFE
3535 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3536 --------------------------
3540 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3541 Check if a text is encoded in SJIS. If it is, return
3542 CATEGORY_MASK_SJIS, else return 0. */
3545 detect_coding_sjis (coding
, mask
)
3546 struct coding_system
*coding
;
3549 unsigned char *src
= coding
->source
, *src_base
= src
;
3550 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3551 int multibytep
= coding
->src_multibyte
;
3552 int consumed_chars
= 0;
3556 /* A coding system of this category is always ASCII compatible. */
3557 src
+= coding
->head_ascii
;
3564 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3567 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3571 else if (c
>= 0xA0 && c
< 0xE0)
3576 *mask
&= ~CATEGORY_MASK_SJIS
;
3582 *mask
&= CATEGORY_MASK_SJIS
;
3586 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3587 Check if a text is encoded in BIG5. If it is, return
3588 CATEGORY_MASK_BIG5, else return 0. */
3591 detect_coding_big5 (coding
, mask
)
3592 struct coding_system
*coding
;
3595 unsigned char *src
= coding
->source
, *src_base
= src
;
3596 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3597 int multibytep
= coding
->src_multibyte
;
3598 int consumed_chars
= 0;
3602 /* A coding system of this category is always ASCII compatible. */
3603 src
+= coding
->head_ascii
;
3613 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3620 *mask
&= ~CATEGORY_MASK_BIG5
;
3626 *mask
&= CATEGORY_MASK_BIG5
;
3630 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3631 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode the Shift-JIS bytes of CODING, starting at
   CODING->source + CODING->consumed, into CODING->charbuf.
   Bytes in the range 0xA0..0xDF are decoded with the kana charset;
   other non-ASCII lead bytes take a second byte and are decoded with
   the kanji charset.  On exit CODING->consumed_char, CODING->consumed
   and CODING->charbuf_used are updated.  */
3634 decode_coding_sjis (coding
)
3635 struct coding_system
*coding
;
3637 unsigned char *src
= coding
->source
+ coding
->consumed
;
3638 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3639 unsigned char *src_base
;
3640 int *charbuf
= coding
->charbuf
;
3641 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3642 int consumed_chars
= 0, consumed_chars_base
;
3643 int multibytep
= coding
->src_multibyte
;
3644 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3645 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3647 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
/* Read the charsets in the order roman, kana, kanji, which is the
   order encode_coding_sjis uses for the same list.  Reading kanji
   before kana here made the decoder use the wrong charset for both.  */
3650 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3651 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3652 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3659 consumed_chars_base
= consumed_chars
;
3661 if (charbuf
>= charbuf_end
)
3668 if (EQ (eol_type
, Qdos
))
3671 goto no_more_source
;
3675 else if (EQ (eol_type
, Qmac
))
3680 struct charset
*charset
;
3683 charset
= charset_roman
;
3688 if (c
< 0xA0 || c
>= 0xE0)
3690 /* SJIS -> JISX0208 */
3692 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3696 charset
= charset_kanji
;
3699 /* SJIS -> JISX0201-Kana */
3700 charset
= charset_kana
;
3702 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3709 consumed_chars
= consumed_chars_base
;
3711 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3716 coding
->consumed_char
+= consumed_chars_base
;
3717 coding
->consumed
= src_base
- coding
->source
;
3718 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3722 decode_coding_big5 (coding
)
3723 struct coding_system
*coding
;
3725 unsigned char *src
= coding
->source
+ coding
->consumed
;
3726 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3727 unsigned char *src_base
;
3728 int *charbuf
= coding
->charbuf
;
3729 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3730 int consumed_chars
= 0, consumed_chars_base
;
3731 int multibytep
= coding
->src_multibyte
;
3732 struct charset
*charset_roman
, *charset_big5
;
3733 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3735 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3737 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3738 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3745 consumed_chars_base
= consumed_chars
;
3747 if (charbuf
>= charbuf_end
)
3754 if (EQ (eol_type
, Qdos
))
3757 goto no_more_source
;
3761 else if (EQ (eol_type
, Qmac
))
3766 struct charset
*charset
;
3768 charset
= charset_roman
;
3772 if (c
< 0xA1 || c
> 0xFE)
3775 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3778 charset
= charset_big5
;
3780 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3788 consumed_chars
= consumed_chars_base
;
3790 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3795 coding
->consumed_char
+= consumed_chars_base
;
3796 coding
->consumed
= src_base
- coding
->source
;
3797 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3800 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3801 This function can encode charsets `ascii', `katakana-jisx0201',
3802 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3803 are sure that all these charsets are registered as official charset
3804 (i.e. do not have extended leading-codes). Characters of other
3805 charsets are produced without any encoding. If SJIS_P is 1, encode
3806 SJIS text, else encode BIG5 text. */
3809 encode_coding_sjis (coding
)
3810 struct coding_system
*coding
;
3812 int multibytep
= coding
->dst_multibyte
;
3813 int *charbuf
= coding
->charbuf
;
3814 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3815 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3816 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3818 int produced_chars
= 0;
3819 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3820 int ascii_compatible
;
3821 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3824 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3826 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3827 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3828 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3830 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3832 while (charbuf
< charbuf_end
)
3834 ASSURE_DESTINATION (safe_room
);
3836 /* Now encode the character C. */
3837 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3838 EMIT_ONE_ASCII_BYTE (c
);
3842 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3846 c
= coding
->default_char
;
3847 charset
= char_charset (c
, charset_list
, &code
);
3849 if (code
== CHARSET_INVALID_CODE (charset
))
3851 if (charset
== charset_kanji
)
3855 c1
= code
>> 8, c2
= code
& 0xFF;
3856 EMIT_TWO_BYTES (c1
, c2
);
3858 else if (charset
== charset_kana
)
3859 EMIT_ONE_BYTE (code
| 0x80);
3861 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3864 coding
->result
= CODING_RESULT_SUCCESS
;
3865 coding
->produced_char
+= produced_chars
;
3866 coding
->produced
= dst
- coding
->destination
;
3871 encode_coding_big5 (coding
)
3872 struct coding_system
*coding
;
3874 int multibytep
= coding
->dst_multibyte
;
3875 int *charbuf
= coding
->charbuf
;
3876 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3877 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3878 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3880 int produced_chars
= 0;
3881 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3882 int ascii_compatible
;
3883 struct charset
*charset_roman
, *charset_big5
;
3886 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3888 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3889 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3890 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3892 while (charbuf
< charbuf_end
)
3894 ASSURE_DESTINATION (safe_room
);
3896 /* Now encode the character C. */
3897 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3898 EMIT_ONE_ASCII_BYTE (c
);
3902 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3906 c
= coding
->default_char
;
3907 charset
= char_charset (c
, charset_list
, &code
);
3909 if (code
== CHARSET_INVALID_CODE (charset
))
3911 if (charset
== charset_big5
)
3915 c1
= code
>> 8, c2
= code
& 0xFF;
3916 EMIT_TWO_BYTES (c1
, c2
);
3919 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3922 coding
->result
= CODING_RESULT_SUCCESS
;
3923 coding
->produced_char
+= produced_chars
;
3924 coding
->produced
= dst
- coding
->destination
;
3929 /*** 9. CCL handlers ***/
3931 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3932 Check if a text is encoded in a coding system of which
3933 encoder/decoder are written in CCL program. If it is, return
3934 CATEGORY_MASK_CCL, else return 0. */
3937 detect_coding_ccl (coding
, mask
)
3938 struct coding_system
*coding
;
3941 unsigned char *src
= coding
->source
, *src_base
= src
;
3942 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3943 int multibytep
= coding
->src_multibyte
;
3944 int consumed_chars
= 0;
3946 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
3947 int head_ascii
= coding
->head_ascii
;
3950 coding
= &coding_categories
[coding_category_ccl
];
3951 attrs
= CODING_ID_ATTRS (coding
->id
);
3952 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
3961 if (!found
&& valids
[c
] > 1)
3964 *mask
&= ~CATEGORY_MASK_CCL
;
3970 *mask
&= CATEGORY_MASK_CCL
;
3975 decode_coding_ccl (coding
)
3976 struct coding_system
*coding
;
3978 unsigned char *src
= coding
->source
+ coding
->consumed
;
3979 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3980 int *charbuf
= coding
->charbuf
;
3981 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3982 int consumed_chars
= 0;
3983 int multibytep
= coding
->src_multibyte
;
3984 struct ccl_program ccl
;
3985 int source_charbuf
[1024];
3986 int source_byteidx
[1024];
3988 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
3990 while (src
< src_end
)
3992 unsigned char *p
= src
;
3993 int *source
, *source_end
;
3997 while (i
< 1024 && p
< src_end
)
3999 source_byteidx
[i
] = p
- src
;
4000 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4003 while (i
< 1024 && p
< src_end
)
4004 source_charbuf
[i
++] = *p
++;
4006 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4009 source
= source_charbuf
;
4010 source_end
= source
+ i
;
4011 while (source
< source_end
)
4013 ccl_driver (&ccl
, source
, charbuf
,
4014 source_end
- source
, charbuf_end
- charbuf
);
4015 source
+= ccl
.consumed
;
4016 charbuf
+= ccl
.produced
;
4017 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4020 if (source
< source_end
)
4021 src
+= source_byteidx
[source
- source_charbuf
];
4024 consumed_chars
+= source
- source_charbuf
;
4026 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4027 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4033 case CCL_STAT_SUSPEND_BY_SRC
:
4034 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4036 case CCL_STAT_SUSPEND_BY_DST
:
4039 case CCL_STAT_INVALID_CMD
:
4040 coding
->result
= CODING_RESULT_INTERRUPT
;
4043 coding
->result
= CODING_RESULT_SUCCESS
;
4046 coding
->consumed_char
+= consumed_chars
;
4047 coding
->consumed
= src
- coding
->source
;
4048 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4052 encode_coding_ccl (coding
)
4053 struct coding_system
*coding
;
4055 struct ccl_program ccl
;
4056 int multibytep
= coding
->dst_multibyte
;
4057 int *charbuf
= coding
->charbuf
;
4058 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4059 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4060 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4061 unsigned char *adjusted_dst_end
= dst_end
- 1;
4062 int destination_charbuf
[1024];
4063 int i
, produced_chars
= 0;
4065 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4067 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4068 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4070 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4072 int dst_bytes
= dst_end
- dst
;
4073 if (dst_bytes
> 1024)
4076 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4077 charbuf_end
- charbuf
, dst_bytes
);
4078 charbuf
+= ccl
.consumed
;
4080 for (i
= 0; i
< ccl
.produced
; i
++)
4081 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4084 for (i
= 0; i
< ccl
.produced
; i
++)
4085 *dst
++ = destination_charbuf
[i
] & 0xFF;
4086 produced_chars
+= ccl
.produced
;
4092 case CCL_STAT_SUSPEND_BY_SRC
:
4093 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4095 case CCL_STAT_SUSPEND_BY_DST
:
4096 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4099 case CCL_STAT_INVALID_CMD
:
4100 coding
->result
= CODING_RESULT_INTERRUPT
;
4103 coding
->result
= CODING_RESULT_SUCCESS
;
4107 coding
->produced_char
+= produced_chars
;
4108 coding
->produced
= dst
- coding
->destination
;
4114 /*** 10, 11. no-conversion handlers ***/
4116 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4119 decode_coding_raw_text (coding
)
4120 struct coding_system
*coding
;
4122 coding
->chars_at_source
= 1;
4123 coding
->consumed_char
= coding
->src_chars
;
4124 coding
->consumed
= coding
->src_bytes
;
4125 coding
->result
= CODING_RESULT_SUCCESS
;
/* Encode the characters held in CODING->charbuf as raw text, writing
   bytes to CODING->destination starting at offset CODING->produced.
   Separate paths handle the four combinations of multibyte/unibyte
   destination and multibyte/unibyte source.  On exit CODING->result is
   CODING_RESULT_SUCCESS, CODING->produced_char is advanced by the
   number of characters produced, and CODING->produced is the new
   offset into the destination.  */
4129 encode_coding_raw_text (coding
)
4130 struct coding_system
*coding
;
4132 int multibytep
= coding
->dst_multibyte
;
4133 int *charbuf
= coding
->charbuf
;
4134 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4135 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4136 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4137 int produced_chars
= 0;
4142 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4144 if (coding
->src_multibyte
)
4145 while (charbuf
< charbuf_end
)
4147 ASSURE_DESTINATION (safe_room
);
4149 if (ASCII_CHAR_P (c
))
4150 EMIT_ONE_ASCII_BYTE (c
);
4151 else if (CHAR_BYTE8_P (c
))
4153 c
= CHAR_TO_BYTE8 (c
);
4158 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4160 CHAR_STRING_ADVANCE (c
, p1
);
4162 EMIT_ONE_BYTE (*p0
);
4166 while (charbuf
< charbuf_end
)
4168 ASSURE_DESTINATION (safe_room
);
4175 if (coding
->src_multibyte
)
4177 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4179 while (charbuf
< charbuf_end
)
4181 ASSURE_DESTINATION (safe_room
);
4183 if (ASCII_CHAR_P (c
))
4185 else if (CHAR_BYTE8_P (c
))
4186 *dst
++ = CHAR_TO_BYTE8 (c
);
4188 CHAR_STRING_ADVANCE (c
, dst
);
4194 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4195 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4196 *dst
++ = *charbuf
++;
/* Count the bytes written by this call: DST minus the position it
   started from (destination + produced).  Subtracting
   destination + dst_bytes would give DST - DST_END, which is never
   positive.  */
4197 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
4200 coding
->result
= CODING_RESULT_SUCCESS
;
4201 coding
->produced_char
+= produced_chars
;
4202 coding
->produced
= dst
- coding
->destination
;
4207 detect_coding_charset (coding
, mask
)
4208 struct coding_system
*coding
;
4211 unsigned char *src
= coding
->source
, *src_base
= src
;
4212 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4213 int multibytep
= coding
->src_multibyte
;
4214 int consumed_chars
= 0;
4215 Lisp_Object attrs
, valids
;
4217 coding
= &coding_categories
[coding_category_charset
];
4218 attrs
= CODING_ID_ATTRS (coding
->id
);
4219 valids
= AREF (attrs
, coding_attr_charset_valids
);
4221 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4222 src
+= coding
->head_ascii
;
4229 if (NILP (AREF (valids
, c
)))
4232 *mask
&= ~CATEGORY_MASK_CHARSET
;
4236 *mask
&= CATEGORY_MASK_CHARSET
;
4241 decode_coding_charset (coding
)
4242 struct coding_system
*coding
;
4244 unsigned char *src
= coding
->source
+ coding
->consumed
;
4245 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4246 unsigned char *src_base
;
4247 int *charbuf
= coding
->charbuf
;
4248 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4249 int consumed_chars
= 0, consumed_chars_base
;
4250 int multibytep
= coding
->src_multibyte
;
4251 struct charset
*charset
;
4252 Lisp_Object attrs
, eol_type
, charset_list
;
4254 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4255 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
4262 consumed_chars_base
= consumed_chars
;
4264 if (charbuf
>= charbuf_end
)
4270 if (EQ (eol_type
, Qdos
))
4273 goto no_more_source
;
4277 else if (EQ (eol_type
, Qmac
))
4282 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
4291 consumed_chars
= consumed_chars_base
;
4293 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4298 coding
->consumed_char
+= consumed_chars_base
;
4299 coding
->consumed
= src_base
- coding
->source
;
4300 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4304 encode_coding_charset (coding
)
4305 struct coding_system
*coding
;
4307 int multibytep
= coding
->dst_multibyte
;
4308 int *charbuf
= coding
->charbuf
;
4309 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4310 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4311 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4312 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4313 int produced_chars
= 0;
4314 struct charset
*charset
;
4315 Lisp_Object attrs
, eol_type
, charset_list
;
4316 int ascii_compatible
;
4319 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4320 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
4321 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4323 while (charbuf
< charbuf_end
)
4327 ASSURE_DESTINATION (safe_room
);
4329 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4330 EMIT_ONE_ASCII_BYTE (c
);
4331 else if ((code
= ENCODE_CHAR (charset
, c
))
4332 != CHARSET_INVALID_CODE (charset
))
4333 EMIT_ONE_BYTE (code
);
4335 EMIT_ONE_BYTE (coding
->default_char
);
4338 coding
->result
= CODING_RESULT_SUCCESS
;
4339 coding
->produced_char
+= produced_chars
;
4340 coding
->produced
= dst
- coding
->destination
;
4345 /*** 10. C library functions ***/
4347 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4348 has a property `coding-system'. The value of this property is a
4349 vector of length 5 (called as coding-vector). Among elements of
4350 this vector, the first (element[0]) and the fifth (element[4])
4351 carry important information for decoding/encoding. Before
4352 decoding/encoding, this information should be set in fields of a
4353 structure of type `coding_system'.
4355 A value of property `coding-system' can be a symbol of another
4356 subsidiary coding-system. In that case, Emacs gets coding-vector
4359 `element[0]' contains information to be set in `coding->type'. The
4360 value and its meaning is as follows:
4362 0 -- coding_type_emacs_mule
4363 1 -- coding_type_sjis
4364 2 -- coding_type_iso_2022
4365 3 -- coding_type_big5
4366 4 -- coding_type_ccl encoder/decoder written in CCL
4367 nil -- coding_type_no_conversion
4368 t -- coding_type_undecided (automatic conversion on decoding,
4369 no-conversion on encoding)
4371 `element[4]' contains information to be set in `coding->flags' and
4372 `coding->spec'. The meaning varies by `coding->type'.
4374 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4375 of length 32 (of which the first 13 sub-elements are used now).
4376 Meanings of these sub-elements are:
4378 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4379 If the value is an integer of valid charset, the charset is
4380 assumed to be designated to graphic register N initially.
4382 If the value is minus, it is a minus value of charset which
4383 reserves graphic register N, which means that the charset is
4384 not designated initially but should be designated to graphic
4385 register N just before encoding a character in that charset.
4387 If the value is nil, graphic register N is never used on
4390 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4391 Each value takes t or nil. See the section ISO2022 of
4392 `coding.h' for more information.
4394 If `coding->type' is `coding_type_big5', element[4] is t to denote
4395 BIG5-ETen or nil to denote BIG5-HKU.
4397 If `coding->type' takes the other value, element[4] is ignored.
4399 Emacs Lisp's coding system also carries information about format of
4400 end-of-line in a value of property `eol-type'. If the value is
4401 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4402 it is not integer, it should be a vector of subsidiary coding
4403 systems of which property `eol-type' has one of above values.
4407 /* Setup coding context CODING from information about CODING_SYSTEM.
4408 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4409 CODING_SYSTEM is invalid, signal an error. */
4412 setup_coding_system (coding_system
, coding
)
4413 Lisp_Object coding_system
;
4414 struct coding_system
*coding
;
4418 Lisp_Object eol_type
;
4419 Lisp_Object coding_type
;
4422 if (NILP (coding_system
))
4423 coding_system
= Qno_conversion
;
4425 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4427 attrs
= CODING_ID_ATTRS (coding
->id
);
4428 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4431 coding
->head_ascii
= -1;
4432 coding
->common_flags
4433 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4435 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4436 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4437 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4438 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4440 coding_type
= CODING_ATTR_TYPE (attrs
);
4441 if (EQ (coding_type
, Qundecided
))
4443 coding
->detector
= NULL
;
4444 coding
->decoder
= decode_coding_raw_text
;
4445 coding
->encoder
= encode_coding_raw_text
;
4446 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4448 else if (EQ (coding_type
, Qiso_2022
))
4451 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4453 /* Invoke graphic register 0 to plane 0. */
4454 CODING_ISO_INVOCATION (coding
, 0) = 0;
4455 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4456 CODING_ISO_INVOCATION (coding
, 1)
4457 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4458 /* Setup the initial status of designation. */
4459 for (i
= 0; i
< 4; i
++)
4460 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4461 /* Not single shifting initially. */
4462 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4463 /* Beginning of buffer should also be regarded as bol. */
4464 CODING_ISO_BOL (coding
) = 1;
4465 coding
->detector
= detect_coding_iso_2022
;
4466 coding
->decoder
= decode_coding_iso_2022
;
4467 coding
->encoder
= encode_coding_iso_2022
;
4468 if (flags
& CODING_ISO_FLAG_SAFE
)
4469 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4470 coding
->common_flags
4471 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4472 | CODING_REQUIRE_FLUSHING_MASK
);
4473 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4474 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4475 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4477 setup_iso_safe_charsets (attrs
);
4478 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4479 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4480 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4482 CODING_ISO_FLAGS (coding
) = flags
;
4484 else if (EQ (coding_type
, Qcharset
))
4486 coding
->detector
= detect_coding_charset
;
4487 coding
->decoder
= decode_coding_charset
;
4488 coding
->encoder
= encode_coding_charset
;
4489 coding
->common_flags
4490 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4492 else if (EQ (coding_type
, Qutf_8
))
4494 coding
->detector
= detect_coding_utf_8
;
4495 coding
->decoder
= decode_coding_utf_8
;
4496 coding
->encoder
= encode_coding_utf_8
;
4497 coding
->common_flags
4498 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4500 else if (EQ (coding_type
, Qutf_16
))
4502 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4503 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4504 : EQ (val
, Qt
) ? utf_16_with_bom
4505 : utf_16_without_bom
);
4506 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4507 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4508 : utf_16_little_endian
);
4509 coding
->detector
= detect_coding_utf_16
;
4510 coding
->decoder
= decode_coding_utf_16
;
4511 coding
->encoder
= encode_coding_utf_16
;
4512 coding
->common_flags
4513 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4515 else if (EQ (coding_type
, Qccl
))
4517 coding
->detector
= detect_coding_ccl
;
4518 coding
->decoder
= decode_coding_ccl
;
4519 coding
->encoder
= encode_coding_ccl
;
4520 coding
->common_flags
4521 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4522 | CODING_REQUIRE_FLUSHING_MASK
);
4524 else if (EQ (coding_type
, Qemacs_mule
))
4526 coding
->detector
= detect_coding_emacs_mule
;
4527 coding
->decoder
= decode_coding_emacs_mule
;
4528 coding
->encoder
= encode_coding_emacs_mule
;
4529 coding
->common_flags
4530 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4531 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4532 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4534 Lisp_Object tail
, safe_charsets
;
4535 int max_charset_id
= 0;
4537 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4539 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4540 max_charset_id
= XFASTINT (XCAR (tail
));
4541 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4543 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4545 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4546 coding
->max_charset_id
= max_charset_id
;
4547 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4550 else if (EQ (coding_type
, Qshift_jis
))
4552 coding
->detector
= detect_coding_sjis
;
4553 coding
->decoder
= decode_coding_sjis
;
4554 coding
->encoder
= encode_coding_sjis
;
4555 coding
->common_flags
4556 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4558 else if (EQ (coding_type
, Qbig5
))
4560 coding
->detector
= detect_coding_big5
;
4561 coding
->decoder
= decode_coding_big5
;
4562 coding
->encoder
= encode_coding_big5
;
4563 coding
->common_flags
4564 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4566 else /* EQ (coding_type, Qraw_text) */
4568 coding
->detector
= NULL
;
4569 coding
->decoder
= decode_coding_raw_text
;
4570 coding
->encoder
= encode_coding_raw_text
;
4571 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4577 /* Return raw-text or one of its subsidiaries that has the same
4578 eol_type as CODING-SYSTEM. */
4581 raw_text_coding_system (coding_system
)
4582 Lisp_Object coding_system
;
4584 Lisp_Object spec
, attrs
, coding_type
;
4585 Lisp_Object eol_type
, raw_text_eol_type
;
4587 spec
= CODING_SYSTEM_SPEC (coding_system
);
4588 attrs
= AREF (spec
, 0);
4590 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4591 return coding_system
;
4593 eol_type
= AREF (spec
, 2);
4594 if (VECTORP (eol_type
))
4596 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4597 raw_text_eol_type
= AREF (spec
, 2);
4598 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4599 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4600 : AREF (raw_text_eol_type
, 2));
4604 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4605 does, return one of the subsidiary that has the same eol-spec as
4606 PARENT. Otherwise, return CODING_SYSTEM. */
4609 coding_inherit_eol_type (coding_system
, parent
)
4611 Lisp_Object spec
, attrs
, eol_type
;
4613 spec
= CODING_SYSTEM_SPEC (coding_system
);
4614 attrs
= AREF (spec
, 0);
4615 eol_type
= AREF (spec
, 2);
4616 if (VECTORP (eol_type
))
4618 Lisp_Object parent_spec
;
4619 Lisp_Object parent_attrs
;
4620 Lisp_Object parent_eol_type
;
4623 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4624 parent_eol_type
= AREF (parent_spec
, 2);
4625 if (EQ (parent_eol_type
, Qunix
))
4626 coding_system
= AREF (eol_type
, 0);
4627 else if (EQ (parent_eol_type
, Qdos
))
4628 coding_system
= AREF (eol_type
, 1);
4629 else if (EQ (parent_eol_type
, Qmac
))
4630 coding_system
= AREF (eol_type
, 2);
4632 return coding_system
;
4635 /* Emacs has a mechanism to automatically detect a coding system if it
4636 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4637 it's impossible to distinguish some coding systems accurately
4638 because they use the same range of codes. So, at first, coding
4639 systems are categorized into the following categories:
4641 o coding-category-emacs-mule
4643 The category for a coding system which has the same code range
4644 as Emacs' internal format. Assigned the coding-system (Lisp
4645 symbol) `emacs-mule' by default.
4647 o coding-category-sjis
4649 The category for a coding system which has the same code range
4650 as SJIS. Assigned the coding-system (Lisp
4651 symbol) `japanese-shift-jis' by default.
4653 o coding-category-iso-7
4655 The category for a coding system which has the same code range
4656 as ISO2022 of 7-bit environment. This doesn't use any locking
4657 shift and single shift functions. This can encode/decode all
4658 charsets. Assigned the coding-system (Lisp symbol)
4659 `iso-2022-7bit' by default.
4661 o coding-category-iso-7-tight
4663 Same as coding-category-iso-7 except that this can
4664 encode/decode only the specified charsets.
4666 o coding-category-iso-8-1
4668 The category for a coding system which has the same code range
4669 as ISO2022 of 8-bit environment and graphic plane 1 used only
4670 for DIMENSION1 charset. This doesn't use any locking shift
4671 and single shift functions. Assigned the coding-system (Lisp
4672 symbol) `iso-latin-1' by default.
4674 o coding-category-iso-8-2
4676 The category for a coding system which has the same code range
4677 as ISO2022 of 8-bit environment and graphic plane 1 used only
4678 for DIMENSION2 charset. This doesn't use any locking shift
4679 and single shift functions. Assigned the coding-system (Lisp
4680 symbol) `japanese-iso-8bit' by default.
4682 o coding-category-iso-7-else
4684 The category for a coding system which has the same code range
4685 as ISO2022 of 7-bit environment but uses locking shift or
4686 single shift functions. Assigned the coding-system (Lisp
4687 symbol) `iso-2022-7bit-lock' by default.
4689 o coding-category-iso-8-else
4691 The category for a coding system which has the same code range
4692 as ISO2022 of 8-bit environment but uses locking shift or
4693 single shift functions. Assigned the coding-system (Lisp
4694 symbol) `iso-2022-8bit-ss2' by default.
4696 o coding-category-big5
4698 The category for a coding system which has the same code range
4699 as BIG5. Assigned the coding-system (Lisp symbol)
4700 `cn-big5' by default.
4702 o coding-category-utf-8
4704 The category for a coding system which has the same code range
4705 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4706 symbol) `utf-8' by default.
4708 o coding-category-utf-16-be
4710 The category for a coding system in which a text has an
4711 Unicode signature (cf. Unicode Standard) in the order of BIG
4712 endian at the head. Assigned the coding-system (Lisp symbol)
4713 `utf-16-be' by default.
4715 o coding-category-utf-16-le
4717 The category for a coding system in which a text has an
4718 Unicode signature (cf. Unicode Standard) in the order of
4719 LITTLE endian at the head. Assigned the coding-system (Lisp
4720 symbol) `utf-16-le' by default.
4722 o coding-category-ccl
4724 The category for a coding system of which encoder/decoder is
4725 written in CCL programs. The default value is nil, i.e., no
4726 coding system is assigned.
4728 o coding-category-binary
4730 The category for a coding system not categorized in any of the
4731 above. Assigned the coding-system (Lisp symbol)
4732 `no-conversion' by default.
4734 Each of them is a Lisp symbol and the value is an actual
4735 `coding-system's (this is also a Lisp symbol) assigned by a user.
4736 What Emacs does actually is to detect a category of coding system.
4737 Then, it uses a `coding-system' assigned to it. If Emacs can't
4738 decide only one possible category, it selects a category of the
4739 highest priority. Priorities of categories are also specified by a
4740 user in a Lisp variable `coding-category-list'.
4744 #define EOL_SEEN_NONE 0
4745 #define EOL_SEEN_LF 1
4746 #define EOL_SEEN_CR 2
4747 #define EOL_SEEN_CRLF 4
4749 /* Detect how end-of-line of a text of length CODING->src_bytes
4750 pointed by CODING->source is encoded. Return one of
4753 #define MAX_EOL_CHECK_COUNT 3
4756 detect_eol (coding
, source
, src_bytes
)
4757 struct coding_system
*coding
;
4758 unsigned char *source
;
4759 EMACS_INT src_bytes
;
4761 Lisp_Object attrs
, coding_type
;
4762 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4765 int eol_seen
= EOL_SEEN_NONE
;
4768 attrs
= CODING_ID_ATTRS (coding
->id
);
4769 coding_type
= CODING_ATTR_TYPE (attrs
);
4771 if (EQ (coding_type
, Qccl
))
4775 msb
= coding
->spec
.utf_16
.endian
== utf_16_little_endian
;
4778 while (src
+ 1 < src_end
)
4781 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4786 this_eol
= EOL_SEEN_LF
;
4787 else if (src
+ 3 >= src_end
4788 || src
[msb
+ 2] != 0
4789 || src
[lsb
+ 2] != '\n')
4790 this_eol
= EOL_SEEN_CR
;
4792 this_eol
= EOL_SEEN_CRLF
;
4794 if (eol_seen
== EOL_SEEN_NONE
)
4795 /* This is the first end-of-line. */
4796 eol_seen
= this_eol
;
4797 else if (eol_seen
!= this_eol
)
4799 /* The found type is different from what was found before. */
4800 eol_seen
= EOL_SEEN_LF
;
4803 if (++total
== MAX_EOL_CHECK_COUNT
)
4811 while (src
< src_end
)
4814 if (c
== '\n' || c
== '\r')
4819 this_eol
= EOL_SEEN_LF
;
4820 else if (src
>= src_end
|| *src
!= '\n')
4821 this_eol
= EOL_SEEN_CR
;
4823 this_eol
= EOL_SEEN_CRLF
, src
++;
4825 if (eol_seen
== EOL_SEEN_NONE
)
4826 /* This is the first end-of-line. */
4827 eol_seen
= this_eol
;
4828 else if (eol_seen
!= this_eol
)
4830 /* The found type is different from what was found before. */
4831 eol_seen
= EOL_SEEN_LF
;
4834 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Switch CODING->id to the subsidiary coding system that matches the
   end-of-line format recorded in EOL_SEEN, a combination of the
   EOL_SEEN_LF, EOL_SEEN_CRLF and EOL_SEEN_CR bits.  The subsidiaries
   are taken from the value of CODING_ID_EOL_TYPE (CODING->id):
   element 0 for LF, element 1 for CRLF, element 2 for CR.  LF is
   checked first.  If none of the bits is set, CODING->id is left
   unchanged.  */
4844 adjust_coding_eol_type (coding
, eol_seen
)
4845 struct coding_system
*coding
;
4848 Lisp_Object eol_type
, coding_system
;
4850 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4851 if (eol_seen
& EOL_SEEN_LF
)
4852 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
/* Test the EOL_SEEN mask here and below; EOL_TYPE is the Lisp object
   that is indexed with AREF, not a bit mask.  */
4853 else if (eol_seen
& EOL_SEEN_CRLF
)
4854 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
4855 else if (eol_seen
& EOL_SEEN_CR
)
4856 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
4859 /* Detect how a text specified in CODING is encoded. If a coding
4860 system is detected, update fields of CODING by the detected coding
4864 detect_coding (coding
)
4865 struct coding_system
*coding
;
4867 unsigned char *src
, *src_end
;
4868 Lisp_Object attrs
, coding_type
;
4870 coding
->consumed
= coding
->consumed_char
= 0;
4871 coding
->produced
= coding
->produced_char
= 0;
4872 coding_set_source (coding
);
4874 src_end
= coding
->source
+ coding
->src_bytes
;
4876 /* If we have not yet decided the text encoding type, detect it
now. */
4878 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
4880 int mask
= CATEGORY_MASK_ANY
;
4883 for (src
= coding
->source
; src
< src_end
; src
++)
4886 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
4888 || c
== ISO_CODE_SO
)))
4891 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
4893 if (coding
->head_ascii
< coding
->src_bytes
)
4897 for (i
= 0; i
< coding_category_raw_text
; i
++)
4899 enum coding_category category
= coding_priorities
[i
];
4900 struct coding_system
*this = coding_categories
+ category
;
4902 if (category
>= coding_category_raw_text
4903 || detected
& (1 << category
))
4908 /* No coding system of this category is defined. */
4909 mask
&= ~(1 << category
);
4913 detected
|= detected_mask
[category
];
4914 if ((*(this->detector
)) (coding
, &mask
))
4919 setup_coding_system (Qraw_text
, coding
);
4920 else if (mask
!= CATEGORY_MASK_ANY
)
4921 for (i
= 0; i
< coding_category_raw_text
; i
++)
4923 enum coding_category category
= coding_priorities
[i
];
4924 struct coding_system
*this = coding_categories
+ category
;
4926 if (mask
& (1 << category
))
4928 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
4935 attrs
= CODING_ID_ATTRS (coding
->id
);
4936 coding_type
= CODING_ATTR_TYPE (attrs
);
4938 /* If we have not yet decided the EOL type, detect it now. But, the
4939 detection is impossible for a CCL based coding system, in which
4940 case, we detect the EOL type after decoding. */
4941 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
4942 && ! EQ (coding_type
, Qccl
))
4944 int eol_seen
= detect_eol (coding
, coding
->source
, coding
->src_bytes
);
4946 if (eol_seen
!= EOL_SEEN_NONE
)
4947 adjust_coding_eol_type (coding
, eol_seen
);
4954 struct coding_system
*coding
;
4956 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
4958 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
4959 unsigned char *pend
= p
+ coding
->produced
;
4960 int eol_seen
= EOL_SEEN_NONE
;
4962 for (; p
< pend
; p
++)
4965 eol_seen
|= EOL_SEEN_LF
;
4966 else if (*p
== '\r')
4968 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
4970 eol_seen
|= EOL_SEEN_CRLF
;
4974 eol_seen
|= EOL_SEEN_CR
;
4977 if (eol_seen
!= EOL_SEEN_NONE
)
4978 adjust_coding_eol_type (coding
, eol_seen
);
4981 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
4983 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
4984 unsigned char *pend
= p
+ coding
->produced
;
4986 for (; p
< pend
; p
++)
4990 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
4992 unsigned char *p
, *pbeg
, *pend
;
4993 Lisp_Object undo_list
;
4995 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
4996 coding
->dst_pos_byte
+ coding
->produced
);
4997 undo_list
= current_buffer
->undo_list
;
4998 current_buffer
->undo_list
= Qt
;
4999 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, Qnil
);
5000 current_buffer
->undo_list
= undo_list
;
5002 pend
= pbeg
+ coding
->produced
;
5004 for (p
= pend
- 1; p
>= pbeg
; p
--)
5007 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5010 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5011 coding
->produced
= pend
- pbeg
;
5012 insert_from_gap (coding
->produced_char
, coding
->produced
);
5017 translate_chars (coding
, table
)
5018 struct coding_system
*coding
;
5021 int *charbuf
= coding
->charbuf
;
5022 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5025 if (coding
->chars_at_source
)
5028 while (charbuf
< charbuf_end
)
5034 *charbuf
++ = translate_char (table
, c
);
5039 produce_chars (coding
)
5040 struct coding_system
*coding
;
5042 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5043 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5045 int produced_chars
= 0;
5047 if (! coding
->chars_at_source
)
5049 /* Characters are in coding->charbuf. */
5050 int *buf
= coding
->charbuf
;
5051 int *buf_end
= buf
+ coding
->charbuf_used
;
5052 unsigned char *adjusted_dst_end
;
5054 if (BUFFERP (coding
->src_object
)
5055 && EQ (coding
->src_object
, coding
->dst_object
))
5056 dst_end
= coding
->source
+ coding
->consumed
;
5057 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5059 while (buf
< buf_end
)
5063 if (dst
>= adjusted_dst_end
)
5065 dst
= alloc_destination (coding
,
5066 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5068 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5069 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5073 if (coding
->dst_multibyte
5074 || ! CHAR_BYTE8_P (c
))
5075 CHAR_STRING_ADVANCE (c
, dst
);
5077 *dst
++ = CHAR_TO_BYTE8 (c
);
5081 /* This is annotation data. */
5087 int multibytep
= coding
->src_multibyte
;
5088 unsigned char *src
= coding
->source
;
5089 unsigned char *src_end
= src
+ coding
->src_bytes
;
5090 Lisp_Object eol_type
;
5092 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5094 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5096 if (coding
->src_multibyte
)
5102 unsigned char *src_base
= src
;
5108 if (EQ (eol_type
, Qdos
))
5114 else if (EQ (eol_type
, Qmac
))
5119 EMACS_INT offset
= src
- coding
->source
;
5121 dst
= alloc_destination (coding
, src_end
- src
+ 1, dst
);
5122 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5123 coding_set_source (coding
);
5124 src
= coding
->source
+ offset
;
5125 src_end
= coding
->source
+ coding
->src_bytes
;
5134 while (src
< src_end
)
5140 if (EQ (eol_type
, Qdos
))
5146 else if (EQ (eol_type
, Qmac
))
5149 if (dst
>= dst_end
- 1)
5151 EMACS_INT offset
= src
- coding
->source
;
5153 dst
= alloc_destination (coding
, src_end
- src
+ 2, dst
);
5154 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5155 coding_set_source (coding
);
5156 src
= coding
->source
+ offset
;
5157 src_end
= coding
->source
+ coding
->src_bytes
;
5164 if (!EQ (coding
->src_object
, coding
->dst_object
))
5166 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5170 EMACS_INT offset
= src
- coding
->source
;
5172 dst
= alloc_destination (coding
, require
, dst
);
5173 coding_set_source (coding
);
5174 src
= coding
->source
+ offset
;
5175 src_end
= coding
->source
+ coding
->src_bytes
;
5178 produced_chars
= coding
->src_chars
;
5179 while (src
< src_end
)
5185 if (EQ (eol_type
, Qdos
))
5192 else if (EQ (eol_type
, Qmac
))
5200 produced
= dst
- (coding
->destination
+ coding
->produced
);
5201 if (BUFFERP (coding
->dst_object
))
5202 insert_from_gap (produced_chars
, produced
);
5203 coding
->produced
+= produced
;
5204 coding
->produced_char
+= produced_chars
;
5205 return produced_chars
;
5208 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5210 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5214 produce_composition (coding
, charbuf
)
5215 struct coding_system
*coding
;
5221 enum composition_method method
;
5223 Lisp_Object components
;
5225 buffer
= coding
->dst_object
;
5227 pos
= coding
->dst_pos
+ charbuf
[1];
5228 method
= (enum composition_method
) (charbuf
[3]);
5229 cmp_len
= charbuf
[4];
5231 if (method
== COMPOSITION_RELATIVE
)
5235 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5240 for (i
= 0; i
< len
; i
++)
5241 args
[i
] = make_number (charbuf
[i
]);
5242 components
= (method
== COMPOSITION_WITH_ALTCHARS
5243 ? Fstring (len
, args
) : Fvector (len
, args
));
5245 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
5249 save_composition_data (buf
, buf_end
, prop
)
5253 enum composition_method method
= COMPOSITION_METHOD (prop
);
5254 int cmp_len
= COMPOSITION_LENGTH (prop
);
5256 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5259 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5263 if (method
== COMPOSITION_RELATIVE
)
5267 Lisp_Object components
;
5270 components
= COMPOSITION_COMPONENTS (prop
);
5271 if (VECTORP (components
))
5273 len
= XVECTOR (components
)->size
;
5274 for (i
= 0; i
< len
; i
++)
5275 buf
[4 + i
] = XINT (AREF (components
, i
));
5277 else if (STRINGP (components
))
5281 len
= XSTRING (components
)->size
;
5284 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5286 else if (INTEGERP (components
))
5289 buf
[4] = XINT (components
);
5291 else if (CONSP (components
))
5293 for (len
= 0; CONSP (components
);
5294 len
++, components
= XCDR (components
))
5295 buf
[4 + len
] = XINT (XCAR (components
));
5301 return (buf
+ buf
[0]);
5304 #define CHARBUF_SIZE 0x4000
5306 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5308 int size = CHARBUF_SIZE;; \
5310 coding->charbuf = NULL; \
5311 while (size > 1024) \
5313 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5314 if (coding->charbuf) \
5318 if (! coding->charbuf) \
5320 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5321 return coding->result; \
5323 coding->charbuf_size = size; \
5328 produce_annotation (coding
)
5329 struct coding_system
*coding
;
5331 int *charbuf
= coding
->charbuf
;
5332 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5334 while (charbuf
< charbuf_end
)
5340 int len
= -*charbuf
;
5343 case CODING_ANNOTATE_COMPOSITION_MASK
:
5344 produce_composition (coding
, charbuf
);
5354 /* Decode the data at CODING->src_object into CODING->dst_object.
5355 CODING->src_object is a buffer, a string, or nil.
5356 CODING->dst_object is a buffer.
5358 If CODING->src_object is a buffer, it must be the current buffer.
5359 In this case, if CODING->src_pos is positive, it is a position of
5360 the source text in the buffer, otherwise, the source text is in the
5361 gap area of the buffer, and CODING->src_pos specifies the offset of
5362 the text from GPT (which must be the same as PT). If this is the
5363 same buffer as CODING->dst_object, CODING->src_pos must be
negative.
5366 If CODING->src_object is a string, CODING->src_pos is an index to
that string.
5369 If CODING->src_object is nil, CODING->source must already point to
5370 the non-relocatable memory area. In this case, CODING->src_pos is
5371 an offset from CODING->source.
5373 The decoded data is inserted at the current point of the buffer
CODING->dst_object. */
/* Run the decoding loop for CODING and return CODING->result.
   Each round calls the decoder, passes the characters in
   CODING->charbuf through the decode translation table when one is
   set, and hands them to produce_chars.  Source bytes left over after
   the loop are either flushed as binary characters (last block) or
   saved in CODING->carryover.  */
5378 decode_coding (coding
)
5379 struct coding_system
*coding
;
/* If the source region straddles the gap, move the gap to its start. */
5383 if (BUFFERP (coding
->src_object
)
5384 && coding
->src_pos
> 0
5385 && coding
->src_pos
< GPT
5386 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5387 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5389 if (BUFFERP (coding
->dst_object
))
5391 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5392 set_buffer_internal (XBUFFER (coding
->dst_object
));
5394 move_gap_both (PT
, PT_BYTE
);
/* Reset the progress counters and the result code. */
5397 coding
->consumed
= coding
->consumed_char
= 0;
5398 coding
->produced
= coding
->produced_char
= 0;
5399 coding
->chars_at_source
= 0;
5400 coding
->result
= CODING_RESULT_SUCCESS
;
5403 ALLOC_CONVERSION_WORK_AREA (coding
);
5405 attrs
= CODING_ID_ATTRS (coding
->id
);
5409 coding_set_source (coding
);
5410 coding
->annotated
= 0;
5411 (*(coding
->decoder
)) (coding
);
5412 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
/* translate_chars is defined as (coding, table): the coding context
   comes first, the translation table second. */
5413 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5414 coding_set_destination (coding
);
5415 produce_chars (coding
);
5416 if (coding
->annotated
)
5417 produce_annotation (coding
);
/* Loop while source bytes remain and no result code has been set. */
5419 while (coding
->consumed
< coding
->src_bytes
5420 && ! coding
->result
);
/* For CCL-based coding systems, a decided non-unix EOL format is
   converted only now, after decoding. */
5422 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5423 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5424 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5425 decode_eol (coding
);
5427 coding
->carryover_bytes
= 0;
5428 if (coding
->consumed
< coding
->src_bytes
)
5430 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5433 coding_set_source (coding
);
5434 coding_set_destination (coding
);
5435 src
= coding
->source
+ coding
->consumed
;
5437 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5439 /* Flush out unprocessed data as binary chars. We are sure
5440 that the number of data is less than the size of
coding->charbuf. */
5442 int *charbuf
= coding
->charbuf
;
5444 while (nbytes
-- > 0)
5447 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5449 produce_chars (coding
);
5453 /* Record unprocessed bytes in coding->carryover. We are
5454 sure that the number of data is less than the size of
5455 coding->carryover. */
5456 unsigned char *p
= coding
->carryover
;
5458 coding
->carryover_bytes
= nbytes
;
5459 while (nbytes
-- > 0)
5462 coding
->consumed
= coding
->src_bytes
;
5465 if (BUFFERP (coding
->dst_object
))
5467 record_insert (coding
->dst_pos
, coding
->produced_char
);
5470 return coding
->result
;
5474 consume_chars (coding
)
5475 struct coding_system
*coding
;
5477 int *buf
= coding
->charbuf
;
5478 /* -1 is to compensate for CRLF. */
5479 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5480 unsigned char *src
= coding
->source
+ coding
->consumed
;
5481 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5482 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5483 int multibytep
= coding
->src_multibyte
;
5484 Lisp_Object eol_type
;
5486 int start
, end
, stop
;
5487 Lisp_Object object
, prop
;
5489 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5490 if (VECTORP (eol_type
))
5493 object
= coding
->src_object
;
5495 /* Note: composition handling is not yet implemented. */
5496 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5498 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5499 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5502 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5503 && end
<= end_pos
)))
5508 while (buf
< buf_end
)
5516 p
= save_composition_data (buf
, buf_end
, prop
);
5520 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5530 c
= STRING_CHAR_ADVANCE (src
);
5531 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5533 if (! EQ (eol_type
, Qunix
))
5537 if (EQ (eol_type
, Qdos
))
5547 coding
->consumed
= src
- coding
->source
;
5548 coding
->consumed_char
= pos
- coding
->src_pos
;
5549 coding
->charbuf_used
= buf
- coding
->charbuf
;
5550 coding
->chars_at_source
= 0;
5554 /* Encode the text at CODING->src_object into CODING->dst_object.
5555 CODING->src_object is a buffer or a string.
5556 CODING->dst_object is a buffer or nil.
5558 If CODING->src_object is a buffer, it must be the current buffer.
5559 In this case, if CODING->src_pos is positive, it is a position of
5560 the source text in the buffer, otherwise, the source text is in the
5561 gap area of the buffer, and coding->src_pos specifies the offset of
5562 the text from GPT (which must be the same as PT). If this is the
5563 same buffer as CODING->dst_object, CODING->src_pos must be
5564 negative and CODING should not have `pre-write-conversion'.
5566 If CODING->src_object is a string, CODING should not have
5567 `pre-write-conversion'.
5569 If CODING->dst_object is a buffer, the encoded data is inserted at
5570 the current point of that buffer.
5572 If CODING->dst_object is nil, the encoded data is placed at the
5573 memory area specified by CODING->destination. */
/* Run the encoding loop for CODING and return CODING->result.
   Each round lets consume_chars load source characters into
   CODING->charbuf, passes them through the encode translation table
   when one is set, and calls the encoder.  The loop repeats until all
   CODING->src_chars source characters have been consumed.  When the
   destination is a buffer, the produced bytes are then inserted from
   the gap.  */
5576 encode_coding (coding
)
5577 struct coding_system
*coding
;
5582 attrs
= CODING_ID_ATTRS (coding
->id
);
/* A buffer destination decides the multibyteness of the output. */
5584 if (BUFFERP (coding
->dst_object
))
5586 set_buffer_internal (XBUFFER (coding
->dst_object
));
5587 coding
->dst_multibyte
5588 = ! NILP (current_buffer
->enable_multibyte_characters
);
/* Reset the progress counters and the result code. */
5591 coding
->consumed
= coding
->consumed_char
= 0;
5592 coding
->produced
= coding
->produced_char
= 0;
5593 coding
->result
= CODING_RESULT_SUCCESS
;
5596 ALLOC_CONVERSION_WORK_AREA (coding
);
5599 coding_set_source (coding
);
5600 consume_chars (coding
);
5602 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
/* translate_chars is defined as (coding, table): the coding context
   comes first, the translation table second. */
5603 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
5605 coding_set_destination (coding
);
5606 (*(coding
->encoder
)) (coding
);
5607 } while (coding
->consumed_char
< coding
->src_chars
);
5609 if (BUFFERP (coding
->dst_object
))
5610 insert_from_gap (coding
->produced_char
, coding
->produced
);
5612 return (coding
->result
);
5617 /* List of currently used working buffers. */
5618 Lisp_Object Vcode_conversion_work_buf_list
;
5620 /* A working buffer used by the top level conversion. */
5621 Lisp_Object Vcode_conversion_reused_work_buf
;
5624 /* Return a working buffer that can be freely used by the following
5625 code conversion. MULTIBYTEP specifies the multibyteness of the
buffer. */
/* Push a work buffer onto Vcode_conversion_work_buf_list and prepare
   it: undo recording is turned off and its multibyteness is set from
   MULTIBYTEP.  The outermost conversion reuses the buffer
   " *code-conversion-work*"; nested conversions get a buffer whose
   name carries the current nesting depth.  The previously current
   buffer is made current again before returning.  */
5629 make_conversion_work_buffer (multibytep
)
5632 struct buffer
*current
= current_buffer
;
5635 if (NILP (Vcode_conversion_work_buf_list
))
5637 if (NILP (Vcode_conversion_reused_work_buf
))
5638 Vcode_conversion_reused_work_buf
5639 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5640 Vcode_conversion_work_buf_list
5641 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
/* Flength returns a Lisp integer; extract the C int with XINT before
   using it as a number. */
5645 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5648 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5649 Vcode_conversion_work_buf_list
5650 = Fcons (Fget_buffer_create (build_string (str
)),
5651 Vcode_conversion_work_buf_list
);
5654 buf
= XCAR (Vcode_conversion_work_buf_list
);
5655 set_buffer_internal (XBUFFER (buf
));
/* Do not record undo information in the work buffer. */
5656 current_buffer
->undo_list
= Qt
;
5658 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
);
5659 set_buffer_internal (current
);
5663 static struct coding_system
*saved_coding
;
/* Unwind handler for the code conversion entry points.  Pops the
   innermost work buffer off Vcode_conversion_work_buf_list, frees the
   destination memory of SAVED_CODING when its dst_object is Qt and a
   destination is allocated, and finally restores the excursion
   recorded in INFO.  */
5666 code_conversion_restore (info
)
/* Flength returns a Lisp integer; extract the C int with XINT before
   comparing it as a number. */
5669 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5674 buf
= XCAR (Vcode_conversion_work_buf_list
);
5675 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5676 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
/* Lisp objects are compared with EQ, not with the C `==' operator. */
5680 if (EQ (saved_coding
->dst_object
, Qt
)
5681 && saved_coding
->destination
)
5682 xfree (saved_coding
->destination
);
5684 return save_excursion_restore (info
);
5689 decode_coding_gap (coding
, chars
, bytes
)
5690 struct coding_system
*coding
;
5691 EMACS_INT chars
, bytes
;
5693 int count
= specpdl_ptr
- specpdl
;
5695 saved_coding
= coding
;
5696 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5698 coding
->src_object
= Fcurrent_buffer ();
5699 coding
->src_chars
= chars
;
5700 coding
->src_bytes
= bytes
;
5701 coding
->src_pos
= -chars
;
5702 coding
->src_pos_byte
= -bytes
;
5703 coding
->src_multibyte
= chars
< bytes
;
5704 coding
->dst_object
= coding
->src_object
;
5705 coding
->dst_pos
= PT
;
5706 coding
->dst_pos_byte
= PT_BYTE
;
5708 if (CODING_REQUIRE_DETECTION (coding
))
5709 detect_coding (coding
);
5711 decode_coding (coding
);
5713 unbind_to (count
, Qnil
);
5714 return coding
->result
;
5718 encode_coding_gap (coding
, chars
, bytes
)
5719 struct coding_system
*coding
;
5720 EMACS_INT chars
, bytes
;
5722 int count
= specpdl_ptr
- specpdl
;
5725 saved_coding
= coding
;
5726 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5728 buffer
= Fcurrent_buffer ();
5729 coding
->src_object
= buffer
;
5730 coding
->src_chars
= chars
;
5731 coding
->src_bytes
= bytes
;
5732 coding
->src_pos
= -chars
;
5733 coding
->src_pos_byte
= -bytes
;
5734 coding
->src_multibyte
= chars
< bytes
;
5735 coding
->dst_object
= coding
->src_object
;
5736 coding
->dst_pos
= PT
;
5737 coding
->dst_pos_byte
= PT_BYTE
;
5739 encode_coding (coding
);
5741 unbind_to (count
, Qnil
);
5742 return coding
->result
;
5746 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5747 SRC_OBJECT into DST_OBJECT by coding context CODING.
5749 SRC_OBJECT is a buffer, a string, or Qnil.
5751 If it is a buffer, the text is at point of the buffer. FROM and TO
5752 are positions in the buffer.
5754 If it is a string, the text is at the beginning of the string.
5755 FROM and TO are indices to the string.
5757 If it is nil, the text is at coding->source. FROM and TO are
5758 indices to coding->source.
5760 DST_OBJECT is a buffer, Qt, or Qnil.
5762 If it is a buffer, the decoded text is inserted at point of the
5763 buffer. If the buffer is the same as SRC_OBJECT, the source text
5766 If it is Qt, a string is made from the decoded text, and
5767 set in CODING->dst_object.
5769 If it is Qnil, the decoded text is stored at CODING->destination.
5770 The caller must allocate CODING->dst_bytes bytes at
5771 CODING->destination by xmalloc. If the decoded text is longer than
5772 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5776 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5778 struct coding_system
*coding
;
5779 Lisp_Object src_object
;
5780 EMACS_INT from
, from_byte
, to
, to_byte
;
5781 Lisp_Object dst_object
;
5783 int count
= specpdl_ptr
- specpdl
;
5784 unsigned char *destination
;
5785 EMACS_INT dst_bytes
;
5786 EMACS_INT chars
= to
- from
;
5787 EMACS_INT bytes
= to_byte
- from_byte
;
5790 saved_coding
= coding
;
5791 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5793 if (NILP (dst_object
))
5795 destination
= coding
->destination
;
5796 dst_bytes
= coding
->dst_bytes
;
5799 coding
->src_object
= src_object
;
5800 coding
->src_chars
= chars
;
5801 coding
->src_bytes
= bytes
;
5802 coding
->src_multibyte
= chars
< bytes
;
5804 if (STRINGP (src_object
))
5806 coding
->src_pos
= from
;
5807 coding
->src_pos_byte
= from_byte
;
5809 else if (BUFFERP (src_object
))
5811 set_buffer_internal (XBUFFER (src_object
));
5813 move_gap_both (from
, from_byte
);
5814 if (EQ (src_object
, dst_object
))
5816 TEMP_SET_PT_BOTH (from
, from_byte
);
5817 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5818 coding
->src_pos
= -chars
;
5819 coding
->src_pos_byte
= -bytes
;
5823 coding
->src_pos
= from
;
5824 coding
->src_pos_byte
= from_byte
;
5828 if (CODING_REQUIRE_DETECTION (coding
))
5829 detect_coding (coding
);
5830 attrs
= CODING_ID_ATTRS (coding
->id
);
5832 if (! NILP (CODING_ATTR_POST_READ (attrs
))
5833 || EQ (dst_object
, Qt
))
5835 coding
->dst_object
= make_conversion_work_buffer (1);
5836 coding
->dst_pos
= BEG
;
5837 coding
->dst_pos_byte
= BEG_BYTE
;
5838 coding
->dst_multibyte
= 1;
5840 else if (BUFFERP (dst_object
))
5842 coding
->dst_object
= dst_object
;
5843 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5844 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5845 coding
->dst_multibyte
5846 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
5850 coding
->dst_object
= Qnil
;
5851 coding
->dst_multibyte
= 1;
5854 decode_coding (coding
);
5856 if (BUFFERP (coding
->dst_object
))
5857 set_buffer_internal (XBUFFER (coding
->dst_object
));
5859 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5861 struct gcpro gcpro1
, gcpro2
;
5862 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
5865 GCPRO2 (coding
->src_object
, coding
->dst_object
);
5866 val
= call1 (CODING_ATTR_POST_READ (attrs
),
5867 make_number (coding
->produced_char
));
5870 coding
->produced_char
+= Z
- prev_Z
;
5871 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
5874 if (EQ (dst_object
, Qt
))
5876 coding
->dst_object
= Fbuffer_string ();
5878 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
5880 set_buffer_internal (XBUFFER (coding
->dst_object
));
5881 if (dst_bytes
< coding
->produced
)
5884 = (unsigned char *) xrealloc (destination
, coding
->produced
);
5887 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
5888 unbind_to (count
, Qnil
);
5891 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
5892 move_gap_both (BEGV
, BEGV_BYTE
);
5893 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
5894 coding
->destination
= destination
;
5898 unbind_to (count
, Qnil
);
5903 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5905 struct coding_system
*coding
;
5906 Lisp_Object src_object
;
5907 EMACS_INT from
, from_byte
, to
, to_byte
;
5908 Lisp_Object dst_object
;
5910 int count
= specpdl_ptr
- specpdl
;
5911 EMACS_INT chars
= to
- from
;
5912 EMACS_INT bytes
= to_byte
- from_byte
;
5915 saved_coding
= coding
;
5916 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5918 coding
->src_object
= src_object
;
5919 coding
->src_chars
= chars
;
5920 coding
->src_bytes
= bytes
;
5921 coding
->src_multibyte
= chars
< bytes
;
5923 attrs
= CODING_ID_ATTRS (coding
->id
);
5925 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5929 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
5930 set_buffer_internal (XBUFFER (coding
->src_object
));
5931 if (STRINGP (src_object
))
5932 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
5933 else if (BUFFERP (src_object
))
5934 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
5936 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
5938 if (EQ (src_object
, dst_object
))
5940 set_buffer_internal (XBUFFER (src_object
));
5941 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5942 set_buffer_internal (XBUFFER (coding
->src_object
));
5945 val
= call2 (CODING_ATTR_PRE_WRITE (attrs
),
5946 make_number (1), make_number (chars
));
5949 move_gap_both (BEG
, BEG_BYTE
);
5950 coding
->src_chars
= Z
- BEG
;
5951 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
5952 coding
->src_pos
= BEG
;
5953 coding
->src_pos_byte
= BEG_BYTE
;
5954 coding
->src_multibyte
= Z
< Z_BYTE
;
5956 else if (STRINGP (src_object
))
5958 coding
->src_pos
= from
;
5959 coding
->src_pos_byte
= from_byte
;
5961 else if (BUFFERP (src_object
))
5963 set_buffer_internal (XBUFFER (src_object
));
5965 move_gap_both (from
, from_byte
);
5966 if (EQ (src_object
, dst_object
))
5968 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5969 coding
->src_pos
= -chars
;
5970 coding
->src_pos_byte
= -bytes
;
5974 coding
->src_pos
= from
;
5975 coding
->src_pos_byte
= from_byte
;
5979 if (BUFFERP (dst_object
))
5981 coding
->dst_object
= dst_object
;
5982 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5983 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5984 coding
->dst_multibyte
5985 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
5987 else if (EQ (dst_object
, Qt
))
5989 coding
->dst_object
= Qnil
;
5990 coding
->destination
= (unsigned char *) xmalloc (coding
->src_chars
);
5991 coding
->dst_bytes
= coding
->src_chars
;
5992 coding
->dst_multibyte
= 0;
5996 coding
->dst_object
= Qnil
;
5997 coding
->dst_multibyte
= 0;
6000 encode_coding (coding
);
6002 if (EQ (dst_object
, Qt
))
6004 if (BUFFERP (coding
->dst_object
))
6005 coding
->dst_object
= Fbuffer_string ();
6009 = make_unibyte_string ((char *) coding
->destination
,
6011 xfree (coding
->destination
);
6015 unbind_to (count
, Qnil
);
6020 preferred_coding_system ()
6022 int id
= coding_categories
[coding_priorities
[0]].id
;
6024 return CODING_ID_NAME (id
);
6029 /*** 8. Emacs Lisp library functions ***/
6031 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6032 doc
: /* Return t if OBJECT is nil or a coding-system.
6033 See the documentation of `define-coding-system' for information
6034 about coding-system objects. */)
6038 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6041 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6042 Sread_non_nil_coding_system
, 1, 1, 0,
6043 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6050 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6051 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6053 while (XSTRING (val
)->size
== 0);
6054 return (Fintern (val
, Qnil
));
6057 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6058 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6059 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6060 (prompt
, default_coding_system
)
6061 Lisp_Object prompt
, default_coding_system
;
6064 if (SYMBOLP (default_coding_system
))
6065 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6066 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6067 Qt
, Qnil
, Qcoding_system_history
,
6068 default_coding_system
, Qnil
);
6069 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6072 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6074 doc
: /* Check validity of CODING-SYSTEM.
6075 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6076 It is valid if it is a symbol with a non-nil `coding-system' property.
6077 The value of property should be a vector of length 5. */)
6079 Lisp_Object coding_system
;
6081 CHECK_SYMBOL (coding_system
);
6082 if (!NILP (Fcoding_system_p (coding_system
)))
6083 return coding_system
;
6085 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6090 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6092 int src_bytes
, highest
;
6094 Lisp_Object coding_system
;
6096 unsigned char *src_end
= src
+ src_bytes
;
6097 int mask
= CATEGORY_MASK_ANY
;
6100 Lisp_Object attrs
, eol_type
;
6102 struct coding_system coding
;
6104 if (NILP (coding_system
))
6105 coding_system
= Qundecided
;
6106 setup_coding_system (coding_system
, &coding
);
6107 attrs
= CODING_ID_ATTRS (coding
.id
);
6108 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6110 coding
.source
= src
;
6111 coding
.src_bytes
= src_bytes
;
6112 coding
.src_multibyte
= multibytep
;
6113 coding
.consumed
= 0;
6115 if (XINT (CODING_ATTR_CATEGORY (attrs
)) != coding_category_undecided
)
6117 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6121 coding_system
= Qnil
;
6122 for (; src
< src_end
; src
++)
6125 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
6127 || c
== ISO_CODE_SO
)))
6130 coding
.head_ascii
= src
- coding
.source
;
6133 for (i
= 0; i
< coding_category_raw_text
; i
++)
6135 enum coding_category category
= coding_priorities
[i
];
6136 struct coding_system
*this = coding_categories
+ category
;
6138 if (category
>= coding_category_raw_text
6139 || detected
& (1 << category
))
6144 /* No coding system of this category is defined. */
6145 mask
&= ~(1 << category
);
6149 detected
|= detected_mask
[category
];
6150 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6153 mask
&= detected_mask
[category
];
6161 val
= Fcons (make_number (coding_category_raw_text
), Qnil
);
6162 else if (mask
== CATEGORY_MASK_ANY
)
6163 val
= Fcons (make_number (coding_category_undecided
), Qnil
);
6166 for (i
= 0; i
< coding_category_raw_text
; i
++)
6167 if (mask
& (1 << coding_priorities
[i
]))
6169 val
= Fcons (make_number (coding_priorities
[i
]), Qnil
);
6176 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6177 if (mask
& (1 << coding_priorities
[i
]))
6178 val
= Fcons (make_number (coding_priorities
[i
]), val
);
6182 int one_byte_eol
= -1, two_byte_eol
= -1;
6185 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6187 struct coding_system
*this
6188 = (NILP (coding_system
) ? coding_categories
+ XINT (XCAR (tail
))
6192 attrs
= CODING_ID_ATTRS (this->id
);
6193 eol_type
= CODING_ID_EOL_TYPE (this->id
);
6194 XSETCAR (tail
, CODING_ID_NAME (this->id
));
6195 if (VECTORP (eol_type
))
6197 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_16
))
6199 if (two_byte_eol
< 0)
6200 two_byte_eol
= detect_eol (this, coding
.source
, src_bytes
);
6201 this_eol
= two_byte_eol
;
6205 if (one_byte_eol
< 0)
6206 one_byte_eol
=detect_eol (this, coding
.source
, src_bytes
);
6207 this_eol
= one_byte_eol
;
6209 if (this_eol
== EOL_SEEN_LF
)
6210 XSETCAR (tail
, AREF (eol_type
, 0));
6211 else if (this_eol
== EOL_SEEN_CRLF
)
6212 XSETCAR (tail
, AREF (eol_type
, 1));
6213 else if (this_eol
== EOL_SEEN_CR
)
6214 XSETCAR (tail
, AREF (eol_type
, 2));
6219 return (highest
? XCAR (val
) : val
);
6223 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6225 doc
: /* Detect coding system of the text in the region between START and END.
6226 Return a list of possible coding systems ordered by priority.
6228 If only ASCII characters are found, it returns a list of single element
6229 `undecided' or its subsidiary coding system according to a detected
6232 If optional argument HIGHEST is non-nil, return the coding system of
6233 highest priority. */)
6234 (start
, end
, highest
)
6235 Lisp_Object start
, end
, highest
;
6238 int from_byte
, to_byte
;
6240 CHECK_NUMBER_COERCE_MARKER (start
);
6241 CHECK_NUMBER_COERCE_MARKER (end
);
6243 validate_region (&start
, &end
);
6244 from
= XINT (start
), to
= XINT (end
);
6245 from_byte
= CHAR_TO_BYTE (from
);
6246 to_byte
= CHAR_TO_BYTE (to
);
6248 if (from
< GPT
&& to
>= GPT
)
6249 move_gap_both (to
, to_byte
);
6251 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6252 to_byte
- from_byte
,
6254 !NILP (current_buffer
6255 ->enable_multibyte_characters
),
6259 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6261 doc
: /* Detect coding system of the text in STRING.
6262 Return a list of possible coding systems ordered by priority.
6264 If only ASCII characters are found, it returns a list of single element
6265 `undecided' or its subsidiary coding system according to a detected
6268 If optional argument HIGHEST is non-nil, return the coding system of
6269 highest priority. */)
6271 Lisp_Object string
, highest
;
6273 CHECK_STRING (string
);
6275 return detect_coding_system (XSTRING (string
)->data
,
6276 STRING_BYTES (XSTRING (string
)),
6278 STRING_MULTIBYTE (string
),
6284 char_encodable_p (c
, attrs
)
6290 struct charset
*charset
;
6292 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6293 CONSP (tail
); tail
= XCDR (tail
))
6295 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6296 if (CHAR_CHARSET_P (c
, charset
))
6299 return (! NILP (tail
));
6303 /* Return a list of coding systems that safely encode the text between
6304 START and END. If EXCLUDE is non-nil, it is a list of coding
6305 systems not to check. The returned list doesn't contain any such
6306 coding systems. In any case, If the text contains only ASCII or is
6307 unibyte, return t. */
6309 DEFUN ("find-coding-systems-region-internal",
6310 Ffind_coding_systems_region_internal
,
6311 Sfind_coding_systems_region_internal
, 2, 3, 0,
6312 doc
: /* Internal use only. */)
6313 (start
, end
, exclude
)
6314 Lisp_Object start
, end
, exclude
;
6316 Lisp_Object coding_attrs_list
, safe_codings
;
6317 EMACS_INT start_byte
, end_byte
;
6318 unsigned char *p
, *pbeg
, *pend
;
6320 Lisp_Object tail
, elt
;
6322 if (STRINGP (start
))
6324 if (!STRING_MULTIBYTE (start
)
6325 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6328 end_byte
= STRING_BYTES (XSTRING (start
));
6332 CHECK_NUMBER_COERCE_MARKER (start
);
6333 CHECK_NUMBER_COERCE_MARKER (end
);
6334 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6335 args_out_of_range (start
, end
);
6336 if (NILP (current_buffer
->enable_multibyte_characters
))
6338 start_byte
= CHAR_TO_BYTE (XINT (start
));
6339 end_byte
= CHAR_TO_BYTE (XINT (end
));
6340 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6343 if (start
< GPT
&& end
> GPT
)
6345 if ((GPT
- start
) < (end
- GPT
))
6346 move_gap_both (start
, start_byte
);
6348 move_gap_both (end
, end_byte
);
6352 coding_attrs_list
= Qnil
;
6353 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6355 || NILP (Fmemq (XCAR (tail
), exclude
)))
6359 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
6360 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6361 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6362 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6365 if (STRINGP (start
))
6366 p
= pbeg
= XSTRING (start
)->data
;
6368 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6369 pend
= p
+ (end_byte
- start_byte
);
6371 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6372 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6376 if (ASCII_BYTE_P (*p
))
6380 c
= STRING_CHAR_ADVANCE (p
);
6382 charset_map_loaded
= 0;
6383 for (tail
= coding_attrs_list
; CONSP (tail
);)
6388 else if (char_encodable_p (c
, elt
))
6390 else if (CONSP (XCDR (tail
)))
6392 XSETCAR (tail
, XCAR (XCDR (tail
)));
6393 XSETCDR (tail
, XCDR (XCDR (tail
)));
6397 XSETCAR (tail
, Qnil
);
6401 if (charset_map_loaded
)
6403 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6405 if (STRINGP (start
))
6406 pbeg
= XSTRING (start
)->data
;
6408 pbeg
= BYTE_POS_ADDR (start_byte
);
6409 p
= pbeg
+ p_offset
;
6410 pend
= pbeg
+ pend_offset
;
6415 safe_codings
= Qnil
;
6416 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6417 if (! NILP (XCAR (tail
)))
6418 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6420 return safe_codings
;
6424 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6425 Scheck_coding_systems_region
, 3, 3, 0,
6426 doc
: /* Check if the region is encodable by coding systems.
6428 START and END are buffer positions specifying the region.
6429 CODING-SYSTEM-LIST is a list of coding systems to check.
6431 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6432 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6433 whole region, POS0, POS1, ... are buffer positions where non-encodable
6434 characters are found.
6436 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6439 START may be a string. In that case, check if the string is
6440 encodable, and the value contains indices to the string instead of
6441 buffer positions. END is ignored. */)
6442 (start
, end
, coding_system_list
)
6443 Lisp_Object start
, end
, coding_system_list
;
6446 EMACS_INT start_byte
, end_byte
;
6448 unsigned char *p
, *pbeg
, *pend
;
6450 Lisp_Object tail
, elt
;
6452 if (STRINGP (start
))
6454 if (!STRING_MULTIBYTE (start
)
6455 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6458 end_byte
= STRING_BYTES (XSTRING (start
));
6463 CHECK_NUMBER_COERCE_MARKER (start
);
6464 CHECK_NUMBER_COERCE_MARKER (end
);
6465 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6466 args_out_of_range (start
, end
);
6467 if (NILP (current_buffer
->enable_multibyte_characters
))
6469 start_byte
= CHAR_TO_BYTE (XINT (start
));
6470 end_byte
= CHAR_TO_BYTE (XINT (end
));
6471 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6474 if (start
< GPT
&& end
> GPT
)
6476 if ((GPT
- start
) < (end
- GPT
))
6477 move_gap_both (start
, start_byte
);
6479 move_gap_both (end
, end_byte
);
6485 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6488 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6493 if (STRINGP (start
))
6494 p
= pbeg
= XSTRING (start
)->data
;
6496 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6497 pend
= p
+ (end_byte
- start_byte
);
6499 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6500 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6504 if (ASCII_BYTE_P (*p
))
6508 c
= STRING_CHAR_ADVANCE (p
);
6510 charset_map_loaded
= 0;
6511 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6513 elt
= XCDR (XCAR (tail
));
6514 if (! char_encodable_p (c
, XCAR (elt
)))
6515 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
6517 if (charset_map_loaded
)
6519 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6521 if (STRINGP (start
))
6522 pbeg
= XSTRING (start
)->data
;
6524 pbeg
= BYTE_POS_ADDR (start_byte
);
6525 p
= pbeg
+ p_offset
;
6526 pend
= pbeg
+ pend_offset
;
6534 for (; CONSP (tail
); tail
= XCDR (tail
))
6537 if (CONSP (XCDR (XCDR (elt
))))
6538 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
6548 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6549 Lisp_Object start
, end
, coding_system
, dst_object
;
6550 int encodep
, norecord
;
6552 struct coding_system coding
;
6553 EMACS_INT from
, from_byte
, to
, to_byte
;
6554 Lisp_Object src_object
;
6556 CHECK_NUMBER_COERCE_MARKER (start
);
6557 CHECK_NUMBER_COERCE_MARKER (end
);
6558 if (NILP (coding_system
))
6559 coding_system
= Qno_conversion
;
6561 CHECK_CODING_SYSTEM (coding_system
);
6562 src_object
= Fcurrent_buffer ();
6563 if (NILP (dst_object
))
6564 dst_object
= src_object
;
6565 else if (! EQ (dst_object
, Qt
))
6566 CHECK_BUFFER (dst_object
);
6568 validate_region (&start
, &end
);
6569 from
= XFASTINT (start
);
6570 from_byte
= CHAR_TO_BYTE (from
);
6571 to
= XFASTINT (end
);
6572 to_byte
= CHAR_TO_BYTE (to
);
6574 setup_coding_system (coding_system
, &coding
);
6575 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6578 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6581 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6584 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6586 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6587 error ("Code conversion error: %d", coding
.result
);
6589 return (BUFFERP (dst_object
)
6590 ? make_number (coding
.produced_char
)
6591 : coding
.dst_object
);
6595 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6596 3, 4, "r\nzCoding system: ",
6597 doc
: /* Decode the current region from the specified coding system.
6598 When called from a program, takes four arguments:
6599 START, END, CODING-SYSTEM, and DESTINATION.
6600 START and END are buffer positions.
6602 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6603 If nil, the region between START and END is replace by the decoded text.
6604 If buffer, the decoded text is inserted in the buffer.
6605 If t, the decoded text is returned.
6607 This function sets `last-coding-system-used' to the precise coding system
6608 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6609 not fully specified.)
6610 It returns the length of the decoded text. */)
6611 (start
, end
, coding_system
, destination
)
6612 Lisp_Object start
, end
, coding_system
, destination
;
6614 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
6617 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6618 3, 4, "r\nzCoding system: ",
6619 doc
: /* Encode the current region by specified coding system.
6620 When called from a program, takes three arguments:
6621 START, END, and CODING-SYSTEM. START and END are buffer positions.
6623 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6624 If nil, the region between START and END is replace by the encoded text.
6625 If buffer, the encoded text is inserted in the buffer.
6626 If t, the encoded text is returned.
6628 This function sets `last-coding-system-used' to the precise coding system
6629 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6630 not fully specified.)
6631 It returns the length of the encoded text. */)
6632 (start
, end
, coding_system
, destination
)
6633 Lisp_Object start
, end
, coding_system
, destination
;
6635 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
6639 code_convert_string (string
, coding_system
, dst_object
,
6640 encodep
, nocopy
, norecord
)
6641 Lisp_Object string
, coding_system
, dst_object
;
6642 int encodep
, nocopy
, norecord
;
6644 struct coding_system coding
;
6645 EMACS_INT chars
, bytes
;
6647 CHECK_STRING (string
);
6648 if (NILP (coding_system
))
6651 Vlast_coding_system_used
= Qno_conversion
;
6652 if (NILP (dst_object
))
6653 return (nocopy
? Fcopy_sequence (string
) : string
);
6656 if (NILP (coding_system
))
6657 coding_system
= Qno_conversion
;
6659 CHECK_CODING_SYSTEM (coding_system
);
6660 if (NILP (dst_object
))
6662 else if (! EQ (dst_object
, Qt
))
6663 CHECK_BUFFER (dst_object
);
6665 setup_coding_system (coding_system
, &coding
);
6666 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6667 chars
= XSTRING (string
)->size
;
6668 bytes
= STRING_BYTES (XSTRING (string
));
6670 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6672 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6674 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6676 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6677 error ("Code conversion error: %d", coding
.result
);
6679 return (BUFFERP (dst_object
)
6680 ? make_number (coding
.produced_char
)
6681 : coding
.dst_object
);
6685 /* Encode or decode STRING according to CODING_SYSTEM.
6686 Do not set Vlast_coding_system_used.
6688 This function is called only from macros DECODE_FILE and
6689 ENCODE_FILE, thus we ignore character composition. */
6692 code_convert_string_norecord (string
, coding_system
, encodep
)
6693 Lisp_Object string
, coding_system
;
6696 code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
6700 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6702 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6704 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6705 if the decoding operation is trivial.
6707 Optional fourth arg BUFFER non-nil meant that the decoded text is
6708 inserted in BUFFER instead of returned as a astring. In this case,
6709 the return value is BUFFER.
6711 This function sets `last-coding-system-used' to the precise coding system
6712 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6713 not fully specified. */)
6714 (string
, coding_system
, nocopy
, buffer
)
6715 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6717 return code_convert_string (string
, coding_system
, buffer
,
6718 0, ! NILP (nocopy
), 0);
6721 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6723 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6725 Optional third arg NOCOPY non-nil means it is OK to return STRING
6726 itself if the encoding operation is trivial.
6728 Optional fourth arg BUFFER non-nil meant that the encoded text is
6729 inserted in BUFFER instead of returned as a astring. In this case,
6730 the return value is BUFFER.
6732 This function sets `last-coding-system-used' to the precise coding system
6733 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6734 not fully specified.) */)
6735 (string
, coding_system
, nocopy
, buffer
)
6736 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6738 return code_convert_string (string
, coding_system
, buffer
,
6739 nocopy
, ! NILP (nocopy
), 1);
6743 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6744 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6745 Return the corresponding character. */)
6749 Lisp_Object spec
, attrs
, val
;
6750 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6753 CHECK_NATNUM (code
);
6754 c
= XFASTINT (code
);
6755 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6756 attrs
= AREF (spec
, 0);
6758 if (ASCII_BYTE_P (c
)
6759 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6762 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6763 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6764 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6765 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6768 charset
= charset_roman
;
6769 else if (c
>= 0xA0 && c
< 0xDF)
6771 charset
= charset_kana
;
6776 int s1
= c
>> 8, s2
= c
& 0x7F;
6778 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
6779 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
6780 error ("Invalid code: %d", code
);
6782 charset
= charset_kanji
;
6784 c
= DECODE_CHAR (charset
, c
);
6786 error ("Invalid code: %d", code
);
6787 return make_number (c
);
6791 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
6792 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
6793 Return the corresponding code in SJIS. */)
6797 Lisp_Object spec
, attrs
, charset_list
;
6799 struct charset
*charset
;
6802 CHECK_CHARACTER (ch
);
6804 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6805 attrs
= AREF (spec
, 0);
6807 if (ASCII_CHAR_P (c
)
6808 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6811 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
6812 charset
= char_charset (c
, charset_list
, &code
);
6813 if (code
== CHARSET_INVALID_CODE (charset
))
6814 error ("Can't encode by shift_jis encoding: %d", c
);
6817 return make_number (code
);
6820 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
6821 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
6822 Return the corresponding character. */)
6826 Lisp_Object spec
, attrs
, val
;
6827 struct charset
*charset_roman
, *charset_big5
, *charset
;
6830 CHECK_NATNUM (code
);
6831 c
= XFASTINT (code
);
6832 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6833 attrs
= AREF (spec
, 0);
6835 if (ASCII_BYTE_P (c
)
6836 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6839 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6840 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6841 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6844 charset
= charset_roman
;
6847 int b1
= c
>> 8, b2
= c
& 0x7F;
6848 if (b1
< 0xA1 || b1
> 0xFE
6849 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
6850 error ("Invalid code: %d", code
);
6851 charset
= charset_big5
;
6853 c
= DECODE_CHAR (charset
, (unsigned )c
);
6855 error ("Invalid code: %d", code
);
6856 return make_number (c
);
6859 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
6860 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
6861 Return the corresponding character code in Big5. */)
6865 Lisp_Object spec
, attrs
, charset_list
;
6866 struct charset
*charset
;
6870 CHECK_CHARACTER (ch
);
6872 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6873 attrs
= AREF (spec
, 0);
6874 if (ASCII_CHAR_P (c
)
6875 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6878 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
6879 charset
= char_charset (c
, charset_list
, &code
);
6880 if (code
== CHARSET_INVALID_CODE (charset
))
6881 error ("Can't encode by Big5 encoding: %d", c
);
6883 return make_number (code
);
6887 DEFUN ("set-terminal-coding-system-internal",
6888 Fset_terminal_coding_system_internal
,
6889 Sset_terminal_coding_system_internal
, 1, 1, 0,
6890 doc
: /* Internal use only. */)
6893 CHECK_SYMBOL (coding_system
);
6894 setup_coding_system (Fcheck_coding_system (coding_system
),
6897 /* We had better not send unsafe characters to terminal. */
6898 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
6899 /* Characer composition should be disabled. */
6900 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6901 terminal_coding
.src_multibyte
= 1;
6902 terminal_coding
.dst_multibyte
= 0;
6906 DEFUN ("set-safe-terminal-coding-system-internal",
6907 Fset_safe_terminal_coding_system_internal
,
6908 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
6909 doc
: /* Internal use only. */)
6912 CHECK_SYMBOL (coding_system
);
6913 setup_coding_system (Fcheck_coding_system (coding_system
),
6914 &safe_terminal_coding
);
6915 /* Characer composition should be disabled. */
6916 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6917 safe_terminal_coding
.src_multibyte
= 1;
6918 safe_terminal_coding
.dst_multibyte
= 0;
6922 DEFUN ("terminal-coding-system",
6923 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
6924 doc
: /* Return coding system specified for terminal output. */)
6927 return CODING_ID_NAME (terminal_coding
.id
);
6930 DEFUN ("set-keyboard-coding-system-internal",
6931 Fset_keyboard_coding_system_internal
,
6932 Sset_keyboard_coding_system_internal
, 1, 1, 0,
6933 doc
: /* Internal use only. */)
6935 Lisp_Object coding_system
;
6937 CHECK_SYMBOL (coding_system
);
6938 setup_coding_system (Fcheck_coding_system (coding_system
),
6940 /* Characer composition should be disabled. */
6941 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6945 DEFUN ("keyboard-coding-system",
6946 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
6947 doc
: /* Return coding system specified for decoding keyboard input. */)
6950 return CODING_ID_NAME (keyboard_coding
.id
);
6954 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
6955 Sfind_operation_coding_system
, 1, MANY
, 0,
6956 doc
: /* Choose a coding system for an operation based on the target name.
6957 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6958 DECODING-SYSTEM is the coding system to use for decoding
6959 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6960 for encoding (in case OPERATION does encoding).
6962 The first argument OPERATION specifies an I/O primitive:
6963 For file I/O, `insert-file-contents' or `write-region'.
6964 For process I/O, `call-process', `call-process-region', or `start-process'.
6965 For network I/O, `open-network-stream'.
6967 The remaining arguments should be the same arguments that were passed
6968 to the primitive. Depending on which primitive, one of those arguments
6969 is selected as the TARGET. For example, if OPERATION does file I/O,
6970 whichever argument specifies the file name is TARGET.
6972 TARGET has a meaning which depends on OPERATION:
6973 For file I/O, TARGET is a file name.
6974 For process I/O, TARGET is a process name.
6975 For network I/O, TARGET is a service name or a port number
6977 This function looks up what specified for TARGET in,
6978 `file-coding-system-alist', `process-coding-system-alist',
6979 or `network-coding-system-alist' depending on OPERATION.
6980 They may specify a coding system, a cons of coding systems,
6981 or a function symbol to call.
6982 In the last case, we call the function with one argument,
6983 which is a list of all the arguments given to this function.
6985 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
6990 Lisp_Object operation
, target_idx
, target
, val
;
6991 register Lisp_Object chain
;
6994 error ("Too few arguments");
6995 operation
= args
[0];
6996 if (!SYMBOLP (operation
)
6997 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
6998 error ("Invalid first arguement");
6999 if (nargs
< 1 + XINT (target_idx
))
7000 error ("Too few arguments for operation: %s",
7001 XSYMBOL (operation
)->name
->data
);
7002 target
= args
[XINT (target_idx
) + 1];
7003 if (!(STRINGP (target
)
7004 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7005 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7007 chain
= ((EQ (operation
, Qinsert_file_contents
)
7008 || EQ (operation
, Qwrite_region
))
7009 ? Vfile_coding_system_alist
7010 : (EQ (operation
, Qopen_network_stream
)
7011 ? Vnetwork_coding_system_alist
7012 : Vprocess_coding_system_alist
));
7016 for (; CONSP (chain
); chain
= XCDR (chain
))
7022 && ((STRINGP (target
)
7023 && STRINGP (XCAR (elt
))
7024 && fast_string_match (XCAR (elt
), target
) >= 0)
7025 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7028 /* Here, if VAL is both a valid coding system and a valid
7029 function symbol, we return VAL as a coding system. */
7032 if (! SYMBOLP (val
))
7034 if (! NILP (Fcoding_system_p (val
)))
7035 return Fcons (val
, val
);
7036 if (! NILP (Ffboundp (val
)))
7038 val
= call1 (val
, Flist (nargs
, args
));
7041 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7042 return Fcons (val
, val
);
7050 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7051 Sset_coding_system_priority
, 1, MANY
, 0,
7052 doc
: /* Put higher priority to coding systems of the arguments. */)
7058 int changed
[coding_category_max
];
7059 enum coding_category priorities
[coding_category_max
];
7061 bzero (changed
, sizeof changed
);
7063 for (i
= j
= 0; i
< nargs
; i
++)
7065 enum coding_category category
;
7066 Lisp_Object spec
, attrs
;
7068 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7069 attrs
= AREF (spec
, 0);
7070 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7071 if (changed
[category
])
7072 /* Ignore this coding system because a coding system of the
7073 same category already had a higher priority. */
7075 changed
[category
] = 1;
7076 priorities
[j
++] = category
;
7077 if (coding_categories
[category
].id
>= 0
7078 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7079 setup_coding_system (args
[i
], &coding_categories
[category
]);
7082 /* Now we have decided top J priorities. Reflect the order of the
7083 original priorities to the remaining priorities. */
7085 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7087 while (j
< coding_category_max
7088 && changed
[coding_priorities
[j
]])
7090 if (j
== coding_category_max
)
7092 priorities
[i
] = coding_priorities
[j
];
7095 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7099 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7100 Scoding_system_priority_list
, 0, 1, 0,
7101 doc
: /* Return a list of coding systems ordered by their priorities. */)
7103 Lisp_Object highestp
;
7108 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7110 enum coding_category category
= coding_priorities
[i
];
7111 int id
= coding_categories
[category
].id
;
7116 attrs
= CODING_ID_ATTRS (id
);
7117 if (! NILP (highestp
))
7118 return CODING_ATTR_BASE_NAME (attrs
);
7119 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7121 return Fnreverse (val
);
7125 make_subsidiaries (base
)
7128 Lisp_Object subsidiaries
;
7129 char *suffixes
[] = { "-unix", "-dos", "-mac" };
7130 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
7131 char *buf
= (char *) alloca (base_name_len
+ 6);
7134 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7135 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7136 for (i
= 0; i
< 3; i
++)
7138 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7139 ASET (subsidiaries
, i
, intern (buf
));
7141 return subsidiaries
;
7145 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7146 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7147 doc
: /* For internal use only. */)
7153 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7154 Lisp_Object attrs
; /* Vector of attributes. */
7155 Lisp_Object eol_type
;
7156 Lisp_Object aliases
;
7157 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7158 enum coding_category category
;
7159 Lisp_Object tail
, val
;
7160 int max_charset_id
= 0;
7163 if (nargs
< coding_arg_max
)
7166 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7168 name
= args
[coding_arg_name
];
7169 CHECK_SYMBOL (name
);
7170 CODING_ATTR_BASE_NAME (attrs
) = name
;
7172 val
= args
[coding_arg_mnemonic
];
7173 if (! STRINGP (val
))
7174 CHECK_CHARACTER (val
);
7175 CODING_ATTR_MNEMONIC (attrs
) = val
;
7177 coding_type
= args
[coding_arg_coding_type
];
7178 CHECK_SYMBOL (coding_type
);
7179 CODING_ATTR_TYPE (attrs
) = coding_type
;
7181 charset_list
= args
[coding_arg_charset_list
];
7182 if (SYMBOLP (charset_list
))
7184 if (EQ (charset_list
, Qiso_2022
))
7186 if (! EQ (coding_type
, Qiso_2022
))
7187 error ("Invalid charset-list");
7188 charset_list
= Viso_2022_charset_list
;
7190 else if (EQ (charset_list
, Qemacs_mule
))
7192 if (! EQ (coding_type
, Qemacs_mule
))
7193 error ("Invalid charset-list");
7194 charset_list
= Vemacs_mule_charset_list
;
7196 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7197 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7198 max_charset_id
= XFASTINT (XCAR (tail
));
7202 charset_list
= Fcopy_sequence (charset_list
);
7203 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7205 struct charset
*charset
;
7208 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7209 if (EQ (coding_type
, Qiso_2022
)
7210 ? CHARSET_ISO_FINAL (charset
) < 0
7211 : EQ (coding_type
, Qemacs_mule
)
7212 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7214 error ("Can't handle charset `%s'",
7215 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7217 XCAR (tail
) = make_number (charset
->id
);
7218 if (max_charset_id
< charset
->id
)
7219 max_charset_id
= charset
->id
;
7222 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7224 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7226 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7227 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7228 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7230 val
= args
[coding_arg_decode_translation_table
];
7232 CHECK_CHAR_TABLE (val
);
7233 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7235 val
= args
[coding_arg_encode_translation_table
];
7237 CHECK_CHAR_TABLE (val
);
7238 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7240 val
= args
[coding_arg_post_read_conversion
];
7242 CODING_ATTR_POST_READ (attrs
) = val
;
7244 val
= args
[coding_arg_pre_write_conversion
];
7246 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7248 val
= args
[coding_arg_default_char
];
7250 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7253 CHECK_CHARACTER (val
);
7254 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7257 val
= args
[coding_arg_plist
];
7259 CODING_ATTR_PLIST (attrs
) = val
;
7261 if (EQ (coding_type
, Qcharset
))
7263 val
= Fmake_vector (make_number (256), Qnil
);
7265 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7267 struct charset
*charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7269 for (i
= charset
->code_space
[0]; i
<= charset
->code_space
[1]; i
++)
7270 if (NILP (AREF (val
, i
)))
7271 ASET (val
, i
, XCAR (tail
));
7273 ASET (attrs
, coding_attr_charset_valids
, val
);
7274 category
= coding_category_charset
;
7276 else if (EQ (coding_type
, Qccl
))
7280 if (nargs
< coding_arg_ccl_max
)
7283 val
= args
[coding_arg_ccl_decoder
];
7284 CHECK_CCL_PROGRAM (val
);
7286 val
= Fcopy_sequence (val
);
7287 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7289 val
= args
[coding_arg_ccl_encoder
];
7290 CHECK_CCL_PROGRAM (val
);
7292 val
= Fcopy_sequence (val
);
7293 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7295 val
= args
[coding_arg_ccl_valids
];
7296 valids
= Fmake_string (make_number (256), make_number (0));
7297 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7301 ASET (valids
, XINT (val
), 1);
7307 CHECK_NUMBER (XCAR (val
));
7308 CHECK_NUMBER (XCDR (val
));
7309 from
= XINT (XCAR (val
));
7310 to
= XINT (XCDR (val
));
7311 for (i
= from
; i
<= to
; i
++)
7312 ASET (valids
, i
, 1);
7315 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7317 category
= coding_category_ccl
;
7319 else if (EQ (coding_type
, Qutf_16
))
7321 Lisp_Object bom
, endian
;
7323 if (nargs
< coding_arg_utf16_max
)
7326 bom
= args
[coding_arg_utf16_bom
];
7327 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7330 CHECK_CODING_SYSTEM (XCAR (bom
));
7331 CHECK_CODING_SYSTEM (XCDR (bom
));
7333 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7335 endian
= args
[coding_arg_utf16_endian
];
7336 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7338 category
= (CONSP (bom
)
7339 ? coding_category_utf_16_auto
7342 ? coding_category_utf_16_be_nosig
7343 : coding_category_utf_16_le_nosig
)
7345 ? coding_category_utf_16_be
7346 : coding_category_utf_16_le
));
7348 else if (EQ (coding_type
, Qiso_2022
))
7350 Lisp_Object initial
, reg_usage
, request
, flags
;
7351 struct charset
*charset
;
7352 int i
, id
, max_id
= -1;
7354 if (nargs
< coding_arg_iso2022_max
)
7357 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7358 CHECK_VECTOR (initial
);
7359 for (i
= 0; i
< 4; i
++)
7361 val
= Faref (initial
, make_number (i
));
7364 CHECK_CHARSET_GET_ID (val
, id
);
7365 ASET (initial
, i
, make_number (id
));
7368 ASET (initial
, i
, make_number (-1));
7371 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7372 CHECK_CONS (reg_usage
);
7373 CHECK_NATNUM (XCAR (reg_usage
));
7374 CHECK_NATNUM (XCDR (reg_usage
));
7376 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7377 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7383 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7384 CHECK_NATNUM (XCDR (val
));
7385 if (XINT (XCDR (val
)) >= 4)
7386 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7387 XCAR (val
) = make_number (id
);
7390 flags
= args
[coding_arg_iso2022_flags
];
7391 CHECK_NATNUM (flags
);
7393 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7394 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7396 ASET (attrs
, coding_attr_iso_initial
, initial
);
7397 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7398 ASET (attrs
, coding_attr_iso_request
, request
);
7399 ASET (attrs
, coding_attr_iso_flags
, flags
);
7400 setup_iso_safe_charsets (attrs
);
7402 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7403 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7404 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7405 ? coding_category_iso_7_else
7406 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7407 ? coding_category_iso_7
7408 : coding_category_iso_7_tight
);
7411 int id
= XINT (AREF (initial
, 1));
7413 category
= (((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7414 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7415 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7417 ? coding_category_iso_8_else
7418 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7419 ? coding_category_iso_8_1
7420 : coding_category_iso_8_2
);
7423 else if (EQ (coding_type
, Qemacs_mule
))
7425 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7426 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7428 category
= coding_category_emacs_mule
;
7430 else if (EQ (coding_type
, Qshift_jis
))
7433 struct charset
*charset
;
7435 if (XINT (Flength (charset_list
)) != 3)
7436 error ("There should be just three charsets");
7438 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7439 if (CHARSET_DIMENSION (charset
) != 1)
7440 error ("Dimension of charset %s is not one",
7441 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7443 charset_list
= XCDR (charset_list
);
7444 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7445 if (CHARSET_DIMENSION (charset
) != 1)
7446 error ("Dimension of charset %s is not one",
7447 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7449 charset_list
= XCDR (charset_list
);
7450 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7451 if (CHARSET_DIMENSION (charset
) != 2)
7452 error ("Dimension of charset %s is not two",
7453 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7455 category
= coding_category_sjis
;
7456 Vsjis_coding_system
= name
;
7458 else if (EQ (coding_type
, Qbig5
))
7460 struct charset
*charset
;
7462 if (XINT (Flength (charset_list
)) != 2)
7463 error ("There should be just two charsets");
7465 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7466 if (CHARSET_DIMENSION (charset
) != 1)
7467 error ("Dimension of charset %s is not one",
7468 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7470 charset_list
= XCDR (charset_list
);
7471 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7472 if (CHARSET_DIMENSION (charset
) != 2)
7473 error ("Dimension of charset %s is not two",
7474 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7476 category
= coding_category_big5
;
7477 Vbig5_coding_system
= name
;
7479 else if (EQ (coding_type
, Qraw_text
))
7480 category
= coding_category_raw_text
;
7481 else if (EQ (coding_type
, Qutf_8
))
7482 category
= coding_category_utf_8
;
7483 else if (EQ (coding_type
, Qundecided
))
7484 category
= coding_category_undecided
;
7486 error ("Invalid coding system type: %s",
7487 XSYMBOL (coding_type
)->name
->data
);
7489 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7491 eol_type
= args
[coding_arg_eol_type
];
7492 if (! NILP (eol_type
)
7493 && ! EQ (eol_type
, Qunix
)
7494 && ! EQ (eol_type
, Qdos
)
7495 && ! EQ (eol_type
, Qmac
))
7496 error ("Invalid eol-type");
7498 aliases
= Fcons (name
, Qnil
);
7500 if (NILP (eol_type
))
7502 eol_type
= make_subsidiaries (name
);
7503 for (i
= 0; i
< 3; i
++)
7505 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7507 this_name
= AREF (eol_type
, i
);
7508 this_aliases
= Fcons (this_name
, Qnil
);
7509 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7510 this_spec
= Fmake_vector (make_number (3), attrs
);
7511 ASET (this_spec
, 1, this_aliases
);
7512 ASET (this_spec
, 2, this_eol_type
);
7513 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7514 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7515 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7516 Vcoding_system_alist
);
7520 spec_vec
= Fmake_vector (make_number (3), attrs
);
7521 ASET (spec_vec
, 1, aliases
);
7522 ASET (spec_vec
, 2, eol_type
);
7524 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7525 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7526 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7527 Vcoding_system_alist
);
7530 int id
= coding_categories
[category
].id
;
7532 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7533 setup_coding_system (name
, &coding_categories
[category
]);
7539 return Fsignal (Qwrong_number_of_arguments
,
7540 Fcons (intern ("define-coding-system-internal"),
7541 make_number (nargs
)));
7544 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7545 Sdefine_coding_system_alias
, 2, 2, 0,
7546 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7547 (alias
, coding_system
)
7548 Lisp_Object alias
, coding_system
;
7550 Lisp_Object spec
, aliases
, eol_type
;
7552 CHECK_SYMBOL (alias
);
7553 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7554 aliases
= AREF (spec
, 1);
7555 while (!NILP (XCDR (aliases
)))
7556 aliases
= XCDR (aliases
);
7557 XCDR (aliases
) = Fcons (alias
, Qnil
);
7559 eol_type
= AREF (spec
, 2);
7560 if (VECTORP (eol_type
))
7562 Lisp_Object subsidiaries
;
7565 subsidiaries
= make_subsidiaries (alias
);
7566 for (i
= 0; i
< 3; i
++)
7567 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7568 AREF (eol_type
, i
));
7570 ASET (spec
, 2, subsidiaries
);
7573 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7578 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7580 doc
: /* Return the base of CODING-SYSTEM.
7581 Any alias or subsidiary coding systems are not base coding system. */)
7583 Lisp_Object coding_system
;
7585 Lisp_Object spec
, attrs
;
7587 if (NILP (coding_system
))
7588 return (Qno_conversion
);
7589 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7590 attrs
= AREF (spec
, 0);
7591 return CODING_ATTR_BASE_NAME (attrs
);
7594 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7596 doc
: "Return the property list of CODING-SYSTEM.")
7598 Lisp_Object coding_system
;
7600 Lisp_Object spec
, attrs
;
7602 if (NILP (coding_system
))
7603 coding_system
= Qno_conversion
;
7604 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7605 attrs
= AREF (spec
, 0);
7606 return CODING_ATTR_PLIST (attrs
);
7610 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7612 doc
: /* Return the list of aliases of CODING-SYSTEM.
7613 A base coding system is what made by `define-coding-system'.
7614 Any alias nor subsidiary coding systems are not base coding system. */)
7616 Lisp_Object coding_system
;
7620 if (NILP (coding_system
))
7621 coding_system
= Qno_conversion
;
7622 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7623 return AREF (spec
, 2);
7626 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7627 Scoding_system_eol_type
, 1, 1, 0,
7628 doc
: /* Return eol-type of CODING-SYSTEM.
7629 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7631 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7632 and CR respectively.
7634 A vector value indicates that a format of end-of-line should be
7635 detected automatically. Nth element of the vector is the subsidiary
7636 coding system whose eol-type is N. */)
7638 Lisp_Object coding_system
;
7640 Lisp_Object spec
, eol_type
;
7643 if (NILP (coding_system
))
7644 coding_system
= Qno_conversion
;
7645 if (! CODING_SYSTEM_P (coding_system
))
7647 spec
= CODING_SYSTEM_SPEC (coding_system
);
7648 eol_type
= AREF (spec
, 2);
7649 if (VECTORP (eol_type
))
7650 return Fcopy_sequence (eol_type
);
7651 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7652 return make_number (n
);
7658 /*** 9. Post-amble ***/
7665 for (i
= 0; i
< coding_category_max
; i
++)
7667 coding_categories
[i
].id
= -1;
7668 coding_priorities
[i
] = i
;
7671 /* ISO2022 specific initialize routine. */
7672 for (i
= 0; i
< 0x20; i
++)
7673 iso_code_class
[i
] = ISO_control_0
;
7674 for (i
= 0x21; i
< 0x7F; i
++)
7675 iso_code_class
[i
] = ISO_graphic_plane_0
;
7676 for (i
= 0x80; i
< 0xA0; i
++)
7677 iso_code_class
[i
] = ISO_control_1
;
7678 for (i
= 0xA1; i
< 0xFF; i
++)
7679 iso_code_class
[i
] = ISO_graphic_plane_1
;
7680 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7681 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
7682 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7683 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7684 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7685 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7686 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7687 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7688 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7689 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
7691 inhibit_pre_post_conversion
= 0;
7693 for (i
= 0; i
< 256; i
++)
7695 emacs_mule_bytes
[i
] = 1;
7704 staticpro (&Vcoding_system_hash_table
);
7705 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7707 staticpro (&Vsjis_coding_system
);
7708 Vsjis_coding_system
= Qnil
;
7710 staticpro (&Vbig5_coding_system
);
7711 Vbig5_coding_system
= Qnil
;
7713 staticpro (&Vcode_conversion_work_buf_list
);
7714 Vcode_conversion_work_buf_list
= Qnil
;
7716 staticpro (&Vcode_conversion_reused_work_buf
);
7717 Vcode_conversion_reused_work_buf
= Qnil
;
7719 DEFSYM (Qcharset
, "charset");
7720 DEFSYM (Qtarget_idx
, "target-idx");
7721 DEFSYM (Qcoding_system_history
, "coding-system-history");
7722 Fset (Qcoding_system_history
, Qnil
);
7724 /* Target FILENAME is the first argument. */
7725 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
7726 /* Target FILENAME is the third argument. */
7727 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
7729 DEFSYM (Qcall_process
, "call-process");
7730 /* Target PROGRAM is the first argument. */
7731 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
7733 DEFSYM (Qcall_process_region
, "call-process-region");
7734 /* Target PROGRAM is the third argument. */
7735 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
7737 DEFSYM (Qstart_process
, "start-process");
7738 /* Target PROGRAM is the third argument. */
7739 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
7741 DEFSYM (Qopen_network_stream
, "open-network-stream");
7742 /* Target SERVICE is the fourth argument. */
7743 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
7745 DEFSYM (Qcoding_system
, "coding-system");
7746 DEFSYM (Qcoding_aliases
, "coding-aliases");
7748 DEFSYM (Qeol_type
, "eol-type");
7749 DEFSYM (Qunix
, "unix");
7750 DEFSYM (Qdos
, "dos");
7751 DEFSYM (Qmac
, "mac");
7753 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
7754 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
7755 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
7756 DEFSYM (Qdefault_char
, "default-char");
7757 DEFSYM (Qundecided
, "undecided");
7758 DEFSYM (Qno_conversion
, "no-conversion");
7759 DEFSYM (Qraw_text
, "raw-text");
7761 DEFSYM (Qiso_2022
, "iso-2022");
7763 DEFSYM (Qutf_8
, "utf-8");
7765 DEFSYM (Qutf_16
, "utf-16");
7766 DEFSYM (Qutf_16_be
, "utf-16-be");
7767 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
7768 DEFSYM (Qutf_16_le
, "utf-16-l3");
7769 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
7770 DEFSYM (Qsignature
, "signature");
7771 DEFSYM (Qendian
, "endian");
7772 DEFSYM (Qbig
, "big");
7773 DEFSYM (Qlittle
, "little");
7775 DEFSYM (Qshift_jis
, "shift-jis");
7776 DEFSYM (Qbig5
, "big5");
7778 DEFSYM (Qcoding_system_p
, "coding-system-p");
7780 DEFSYM (Qcoding_system_error
, "coding-system-error");
7781 Fput (Qcoding_system_error
, Qerror_conditions
,
7782 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
7783 Fput (Qcoding_system_error
, Qerror_message
,
7784 build_string ("Invalid coding system"));
7786 /* Intern this now in case it isn't already done.
7787 Setting this variable twice is harmless.
7788 But don't staticpro it here--that is done in alloc.c. */
7789 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
7791 DEFSYM (Qtranslation_table
, "translation-table");
7792 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
7793 DEFSYM (Qtranslation_table_id
, "translation-table-id");
7794 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
7795 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
7797 DEFSYM (Qchar_coding_system
, "char-coding-system");
7799 Fput (Qchar_coding_system
, Qchar_table_extra_slots
, make_number (2));
7801 DEFSYM (Qvalid_codes
, "valid-codes");
7803 DEFSYM (Qemacs_mule
, "emacs-mule");
7805 Vcoding_category_table
7806 = Fmake_vector (make_number (coding_category_max
), Qnil
);
7807 staticpro (&Vcoding_category_table
);
7808 /* Followings are target of code detection. */
7809 ASET (Vcoding_category_table
, coding_category_iso_7
,
7810 intern ("coding-category-iso-7"));
7811 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
7812 intern ("coding-category-iso-7-tight"));
7813 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
7814 intern ("coding-category-iso-8-1"));
7815 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
7816 intern ("coding-category-iso-8-2"));
7817 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
7818 intern ("coding-category-iso-7-else"));
7819 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
7820 intern ("coding-category-iso-8-else"));
7821 ASET (Vcoding_category_table
, coding_category_utf_8
,
7822 intern ("coding-category-utf-8"));
7823 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
7824 intern ("coding-category-utf-16-be"));
7825 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
7826 intern ("coding-category-utf-16-le"));
7827 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
7828 intern ("coding-category-utf-16-be-nosig"));
7829 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
7830 intern ("coding-category-utf-16-le-nosig"));
7831 ASET (Vcoding_category_table
, coding_category_charset
,
7832 intern ("coding-category-charset"));
7833 ASET (Vcoding_category_table
, coding_category_sjis
,
7834 intern ("coding-category-sjis"));
7835 ASET (Vcoding_category_table
, coding_category_big5
,
7836 intern ("coding-category-big5"));
7837 ASET (Vcoding_category_table
, coding_category_ccl
,
7838 intern ("coding-category-ccl"));
7839 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
7840 intern ("coding-category-emacs-mule"));
7841 /* Followings are NOT target of code detection. */
7842 ASET (Vcoding_category_table
, coding_category_raw_text
,
7843 intern ("coding-category-raw-text"));
7844 ASET (Vcoding_category_table
, coding_category_undecided
,
7845 intern ("coding-category-undecided"));
7848 Lisp_Object args
[coding_arg_max
];
7849 Lisp_Object plist
[14];
7852 for (i
= 0; i
< coding_arg_max
; i
++)
7855 plist
[0] = intern (":name");
7856 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
7857 plist
[2] = intern (":mnemonic");
7858 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
7859 plist
[4] = intern (":coding-type");
7860 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
7861 plist
[6] = intern (":ascii-compatible-p");
7862 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
7863 plist
[8] = intern (":default-char");
7864 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
7865 plist
[10] = intern (":docstring");
7866 plist
[11] = build_string ("Do no conversion.\n\
7868 When you visit a file with this coding, the file is read into a\n\
7869 unibyte buffer as is, thus each byte of a file is treated as a\n\
7871 plist
[12] = intern (":eol-type");
7872 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
7873 args
[coding_arg_plist
] = Flist (14, plist
);
7874 Fdefine_coding_system_internal (coding_arg_max
, args
);
7877 setup_coding_system (Qno_conversion
, &keyboard_coding
);
7878 setup_coding_system (Qno_conversion
, &terminal_coding
);
7879 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
7881 defsubr (&Scoding_system_p
);
7882 defsubr (&Sread_coding_system
);
7883 defsubr (&Sread_non_nil_coding_system
);
7884 defsubr (&Scheck_coding_system
);
7885 defsubr (&Sdetect_coding_region
);
7886 defsubr (&Sdetect_coding_string
);
7887 defsubr (&Sfind_coding_systems_region_internal
);
7888 defsubr (&Scheck_coding_systems_region
);
7889 defsubr (&Sdecode_coding_region
);
7890 defsubr (&Sencode_coding_region
);
7891 defsubr (&Sdecode_coding_string
);
7892 defsubr (&Sencode_coding_string
);
7893 defsubr (&Sdecode_sjis_char
);
7894 defsubr (&Sencode_sjis_char
);
7895 defsubr (&Sdecode_big5_char
);
7896 defsubr (&Sencode_big5_char
);
7897 defsubr (&Sset_terminal_coding_system_internal
);
7898 defsubr (&Sset_safe_terminal_coding_system_internal
);
7899 defsubr (&Sterminal_coding_system
);
7900 defsubr (&Sset_keyboard_coding_system_internal
);
7901 defsubr (&Skeyboard_coding_system
);
7902 defsubr (&Sfind_operation_coding_system
);
7903 defsubr (&Sset_coding_system_priority
);
7904 defsubr (&Sdefine_coding_system_internal
);
7905 defsubr (&Sdefine_coding_system_alias
);
7906 defsubr (&Scoding_system_base
);
7907 defsubr (&Scoding_system_plist
);
7908 defsubr (&Scoding_system_aliases
);
7909 defsubr (&Scoding_system_eol_type
);
7910 defsubr (&Scoding_system_priority_list
);
7912 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
7913 doc
: /* List of coding systems.
7915 Do not alter the value of this variable manually. This variable should be
7916 updated by the functions `define-coding-system' and
7917 `define-coding-system-alias'. */);
7918 Vcoding_system_list
= Qnil
;
7920 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
7921 doc
: /* Alist of coding system names.
7922 Each element is one element list of coding system name.
7923 This variable is given to `completing-read' as TABLE argument.
7925 Do not alter the value of this variable manually. This variable should be
7926 updated by the functions `define-coding-system' and
7927 `define-coding-system-alias'. */);
7928 Vcoding_system_alist
= Qnil
;
7930 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
7931 doc
: /* List of coding-categories (symbols) ordered by priority.
7933 On detecting a coding system, Emacs tries code detection algorithms
7934 associated with each coding-category one by one in this order. When
7935 one algorithm agrees with a byte sequence of source text, the coding
7936 system bound to the corresponding coding-category is selected. */);
7940 Vcoding_category_list
= Qnil
;
7941 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7942 Vcoding_category_list
7943 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
7944 Vcoding_category_list
);
7947 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
7948 doc
: /* Specify the coding system for read operations.
7949 It is useful to bind this variable with `let', but do not set it globally.
7950 If the value is a coding system, it is used for decoding on read operation.
7951 If not, an appropriate element is used from one of the coding system alists:
7952 There are three such tables, `file-coding-system-alist',
7953 `process-coding-system-alist', and `network-coding-system-alist'. */);
7954 Vcoding_system_for_read
= Qnil
;
7956 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
7957 doc
: /* Specify the coding system for write operations.
7958 Programs bind this variable with `let', but you should not set it globally.
7959 If the value is a coding system, it is used for encoding of output,
7960 when writing it to a file and when sending it to a file or subprocess.
7962 If this does not specify a coding system, an appropriate element
7963 is used from one of the coding system alists:
7964 There are three such tables, `file-coding-system-alist',
7965 `process-coding-system-alist', and `network-coding-system-alist'.
7966 For output to files, if the above procedure does not specify a coding system,
7967 the value of `buffer-file-coding-system' is used. */);
7968 Vcoding_system_for_write
= Qnil
;
7970 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
7972 Coding system used in the latest file or process I/O. */);
7973 Vlast_coding_system_used
= Qnil
;
7975 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
7977 *Non-nil means always inhibit code conversion of end-of-line format.
7978 See info node `Coding Systems' and info node `Text and Binary' concerning
7979 such conversion. */);
7980 inhibit_eol_conversion
= 0;
7982 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
7984 Non-nil means process buffer inherits coding system of process output.
7985 Bind it to t if the process output is to be treated as if it were a file
7986 read from some filesystem. */);
7987 inherit_process_coding_system
= 0;
7989 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
7991 Alist to decide a coding system to use for a file I/O operation.
7992 The format is ((PATTERN . VAL) ...),
7993 where PATTERN is a regular expression matching a file name,
7994 VAL is a coding system, a cons of coding systems, or a function symbol.
7995 If VAL is a coding system, it is used for both decoding and encoding
7997 If VAL is a cons of coding systems, the car part is used for decoding,
7998 and the cdr part is used for encoding.
7999 If VAL is a function symbol, the function must return a coding system
8000 or a cons of coding systems which are used as above. The function gets
8001 the arguments with which `find-operation-coding-system' was called.
8003 See also the function `find-operation-coding-system'
8004 and the variable `auto-coding-alist'. */);
8005 Vfile_coding_system_alist
= Qnil
;
8007 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8009 Alist to decide a coding system to use for a process I/O operation.
8010 The format is ((PATTERN . VAL) ...),
8011 where PATTERN is a regular expression matching a program name,
8012 VAL is a coding system, a cons of coding systems, or a function symbol.
8013 If VAL is a coding system, it is used for both decoding what received
8014 from the program and encoding what sent to the program.
8015 If VAL is a cons of coding systems, the car part is used for decoding,
8016 and the cdr part is used for encoding.
8017 If VAL is a function symbol, the function must return a coding system
8018 or a cons of coding systems which are used as above.
8020 See also the function `find-operation-coding-system'. */);
8021 Vprocess_coding_system_alist
= Qnil
;
8023 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8025 Alist to decide a coding system to use for a network I/O operation.
8026 The format is ((PATTERN . VAL) ...),
8027 where PATTERN is a regular expression matching a network service name
8028 or is a port number to connect to,
8029 VAL is a coding system, a cons of coding systems, or a function symbol.
8030 If VAL is a coding system, it is used for both decoding what received
8031 from the network stream and encoding what sent to the network stream.
8032 If VAL is a cons of coding systems, the car part is used for decoding,
8033 and the cdr part is used for encoding.
8034 If VAL is a function symbol, the function must return a coding system
8035 or a cons of coding systems which are used as above.
8037 See also the function `find-operation-coding-system'. */);
8038 Vnetwork_coding_system_alist
= Qnil
;
8040 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8041 doc
: /* Coding system to use with system messages.
8042 Also used for decoding keyboard input on X Window system. */);
8043 Vlocale_coding_system
= Qnil
;
8045 /* The eol mnemonics are reset in startup.el system-dependently. */
8046 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8048 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8049 eol_mnemonic_unix
= build_string (":");
8051 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8053 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8054 eol_mnemonic_dos
= build_string ("\\");
8056 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8058 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8059 eol_mnemonic_mac
= build_string ("/");
8061 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8063 *String displayed in mode line when end-of-line format is not yet determined. */);
8064 eol_mnemonic_undecided
= build_string (":");
8066 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8068 *Non-nil enables character translation while encoding and decoding. */);
8069 Venable_character_translation
= Qt
;
8071 DEFVAR_LISP ("standard-translation-table-for-decode",
8072 &Vstandard_translation_table_for_decode
,
8073 doc
: /* Table for translating characters while decoding. */);
8074 Vstandard_translation_table_for_decode
= Qnil
;
8076 DEFVAR_LISP ("standard-translation-table-for-encode",
8077 &Vstandard_translation_table_for_encode
,
8078 doc
: /* Table for translating characters while encoding. */);
8079 Vstandard_translation_table_for_encode
= Qnil
;
8081 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8082 doc
: /* Alist of charsets vs revision numbers.
8083 While encoding, if a charset (car part of an element) is found,
8084 designate it with the escape sequence identifying revision (cdr part
8085 of the element). */);
8086 Vcharset_revision_table
= Qnil
;
8088 DEFVAR_LISP ("default-process-coding-system",
8089 &Vdefault_process_coding_system
,
8090 doc
: /* Cons of coding systems used for process I/O by default.
8091 The car part is used for decoding a process output,
8092 the cdr part is used for encoding a text to be sent to a process. */);
8093 Vdefault_process_coding_system
= Qnil
;
8095 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8097 Table of extra Latin codes in the range 128..159 (inclusive).
8098 This is a vector of length 256.
8099 If Nth element is non-nil, the existence of code N in a file
8100 \(or output of subprocess) doesn't prevent it to be detected as
8101 a coding system of ISO 2022 variant which has a flag
8102 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8103 or reading output of a subprocess.
8104 Only 128th through 159th elements have a meaning. */);
8105 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8107 DEFVAR_LISP ("select-safe-coding-system-function",
8108 &Vselect_safe_coding_system_function
,
8110 Function to call to select safe coding system for encoding a text.
8112 If set, this function is called to force a user to select a proper
8113 coding system which can encode the text in the case that a default
8114 coding system used in each operation can't encode the text.
8116 The default value is `select-safe-coding-system' (which see). */);
8117 Vselect_safe_coding_system_function
= Qnil
;
8119 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table
,
8121 Char-table containing safe coding systems of each characters.
8122 Each element doesn't include such generic coding systems that can
8123 encode any characters. They are in the first extra slot. */);
8124 Vchar_coding_system_table
= Fmake_char_table (Qchar_coding_system
, Qnil
);
8126 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8127 &inhibit_iso_escape_detection
,
8129 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8131 By default, on reading a file, Emacs tries to detect how the text is
8132 encoded. This code detection is sensitive to escape sequences. If
8133 the sequence is valid as ISO2022, the code is determined as one of
8134 the ISO2022 encodings, and the file is decoded by the corresponding
8135 coding system (e.g. `iso-2022-7bit').
8137 However, there may be a case that you want to read escape sequences in
8138 a file as is. In such a case, you can set this variable to non-nil.
8139 Then, as the code detection ignores any escape sequences, no file is
8140 detected as encoded in some ISO2022 encoding. The result is that all
8141 escape sequences become visible in a buffer.
8143 The default value is nil, and it is strongly recommended not to change
8144 it. That is because many Emacs Lisp source files that contain
8145 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8146 in Emacs's distribution, and they won't be decoded correctly on
8147 reading if you suppress escape sequence detection.
8149 The other way to read escape sequences in a file without decoding is
8150 to explicitly specify some coding system that doesn't use ISO2022's
8151 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8152 inhibit_iso_escape_detection
= 0;
8156 emacs_strerror (error_number
)
8161 synchronize_system_messages_locale ();
8162 str
= strerror (error_number
);
8164 if (! NILP (Vlocale_coding_system
))
8166 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8167 Vlocale_coding_system
,
8169 str
= (char *) XSTRING (dec
)->data
;