1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. At
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 a coding system symbol to its attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How the end of a text line is encoded depends on the operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
/* Template for a detector. The bytes to examine come from
   CODING->source (CODING->src_bytes of them); the verdict is
   accumulated as CATEGORY_MASK_XXX bits in DETECT_INFO->found and
   DETECT_INFO->rejected. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
/* SRC starts at the head of the source; SRC_END is one past its last
   byte. */
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
/* MULTIBYTEP mirrors CODING->src_multibyte; the ONE_MORE_BYTE macro
   defined later in this file reads it. */
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
/* NOTE(review): the negation below looks inverted -- detect_coding_utf_8
   sets FOUND when a byte positively matches (e.g. a leading octet).
   Confirm before copying this template. */
175 if (! __C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
/* Template for a decoder. Reading resumes at CODING->source +
   CODING->consumed; decoded characters are stored as ints in
   CODING->charbuf. On exit CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
/* SRC resumes where the previous call stopped; SRC_END is one past
   the last source byte. */
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
/* NOTE(review): this condition is true while there IS room, which
   contradicts the comment that follows it; decode_coding_utf_8 tests
   `charbuf >= charbuf_end' at the same point. Presumably `>=' is
   intended here -- confirm. */
222 if (charbuf
< charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
/* NOTE(review): the statement below stores the same byte offset in
   both members, so a multibyte source would need extra handling that
   this template does not show. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that the source area and the destination area
257 overlap, which means that we can produce encoded text until it
258 reaches the head of the not-yet-encoded source text.
260 Below is a template of these functions. */
/* Template for an encoder. Characters are read from CODING->charbuf
   (CODING->charbuf_used of them) and bytes are written starting at
   CODING->destination + CODING->produced. On exit
   CODING->produced_char and CODING->produced are updated. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
/* NOTE(review): CHARBUF is an `int *', so `charbuf->charbuf' cannot be
   valid C. The decoder template computes its end pointer as
   `charbuf + coding->charbuf_size'; presumably
   `charbuf + coding->charbuf_used' is intended here -- confirm. */
268 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
/* DST resumes after the bytes already produced; DST_END is one past
   the end of the destination area. */
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
/* Stop the loop early enough that one more iteration, producing at
   most _MAX_BYTES_PRODUCED_IN_LOOP_ bytes, still fits before DST_END. */
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 Lisp_Object Vselect_safe_coding_system_function
;
323 /* Mnemonic string for each format of end-of-line. */
324 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
325 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
327 Lisp_Object eol_mnemonic_undecided
;
331 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
333 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
335 /* Coding system emacs-mule and raw-text are for converting only
336 end-of-line format. */
337 Lisp_Object Qemacs_mule
, Qraw_text
;
339 /* Coding-systems are handed between Emacs Lisp programs and C internal
340 routines by the following three variables. */
341 /* Coding-system for reading files and receiving data from process. */
342 Lisp_Object Vcoding_system_for_read
;
343 /* Coding-system for writing files and sending data to process. */
344 Lisp_Object Vcoding_system_for_write
;
345 /* Coding-system actually used in the latest I/O. */
346 Lisp_Object Vlast_coding_system_used
;
348 /* A vector of length 256 which contains information about special
349 Latin codes (especially for dealing with Microsoft codes). */
350 Lisp_Object Vlatin_extra_code_table
;
352 /* Flag to inhibit code conversion of end-of-line format. */
353 int inhibit_eol_conversion
;
355 /* Flag to inhibit ISO2022 escape sequence detection. */
356 int inhibit_iso_escape_detection
;
358 /* Flag to make buffer-file-coding-system inherit from process-coding. */
359 int inherit_process_coding_system
;
361 /* Coding system to be used to encode text for terminal display. */
362 struct coding_system terminal_coding
;
364 /* Coding system to be used to encode text for terminal display when
365 terminal coding system is nil. */
366 struct coding_system safe_terminal_coding
;
368 /* Coding system of what is sent from terminal keyboard. */
369 struct coding_system keyboard_coding
;
371 Lisp_Object Vfile_coding_system_alist
;
372 Lisp_Object Vprocess_coding_system_alist
;
373 Lisp_Object Vnetwork_coding_system_alist
;
375 Lisp_Object Vlocale_coding_system
;
379 /* Flag to tell if we look up translation table on character code
381 Lisp_Object Venable_character_translation
;
382 /* Standard translation table to look up on decoding (reading). */
383 Lisp_Object Vstandard_translation_table_for_decode
;
384 /* Standard translation table to look up on encoding (writing). */
385 Lisp_Object Vstandard_translation_table_for_encode
;
387 Lisp_Object Qtranslation_table
;
388 Lisp_Object Qtranslation_table_id
;
389 Lisp_Object Qtranslation_table_for_decode
;
390 Lisp_Object Qtranslation_table_for_encode
;
392 /* Alist of charsets vs revision number. */
393 static Lisp_Object Vcharset_revision_table
;
395 /* Default coding systems used for process I/O. */
396 Lisp_Object Vdefault_process_coding_system
;
398 /* Global flag to tell that we can't call post-read-conversion and
399 pre-write-conversion functions. Usually the value is zero, but it
400 is set to 1 temporarily while such functions are running. This is
401 to avoid infinite recursive call. */
402 static int inhibit_pre_post_conversion
;
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system
;
406 Lisp_Object Vbig5_coding_system
;
409 static int detect_coding_utf_8
P_ ((struct coding_system
*,
410 struct coding_detection_info
*info
));
411 static void decode_coding_utf_8
P_ ((struct coding_system
*));
412 static int encode_coding_utf_8
P_ ((struct coding_system
*));
414 static int detect_coding_utf_16
P_ ((struct coding_system
*,
415 struct coding_detection_info
*info
));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
420 struct coding_detection_info
*info
));
421 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
422 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
424 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
425 struct coding_detection_info
*info
));
426 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
429 static int detect_coding_sjis
P_ ((struct coding_system
*,
430 struct coding_detection_info
*info
));
431 static void decode_coding_sjis
P_ ((struct coding_system
*));
432 static int encode_coding_sjis
P_ ((struct coding_system
*));
434 static int detect_coding_big5
P_ ((struct coding_system
*,
435 struct coding_detection_info
*info
));
436 static void decode_coding_big5
P_ ((struct coding_system
*));
437 static int encode_coding_big5
P_ ((struct coding_system
*));
439 static int detect_coding_ccl
P_ ((struct coding_system
*,
440 struct coding_detection_info
*info
));
441 static void decode_coding_ccl
P_ ((struct coding_system
*));
442 static int encode_coding_ccl
P_ ((struct coding_system
*));
444 static void decode_coding_raw_text
P_ ((struct coding_system
*));
445 static int encode_coding_raw_text
P_ ((struct coding_system
*));
448 /* ISO2022 section */
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
487 /* Each (1-byte) code of ISO2022 is classified into one of the following classes. */
489 enum iso_code_class_type
491 ISO_control_0
, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
499 ISO_control_1
, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
511 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
518 /* If set, reset graphic planes and registers at end-of-line to the initial state. */
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
545 /* If set, assume designation states are reset at beginning of line on
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
549 /* If set, designation sequence should be placed at beginning of line
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
553 /* If set, do not encode unsafe characters on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
556 /* If set, extra latin codes (128..159) are accepted as a valid code
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
595 /* Index for each coding category in `coding_categories' */
599 coding_category_iso_7
,
600 coding_category_iso_7_tight
,
601 coding_category_iso_8_1
,
602 coding_category_iso_8_2
,
603 coding_category_iso_7_else
,
604 coding_category_iso_8_else
,
605 coding_category_utf_8
,
606 coding_category_utf_16_auto
,
607 coding_category_utf_16_be
,
608 coding_category_utf_16_le
,
609 coding_category_utf_16_be_nosig
,
610 coding_category_utf_16_le_nosig
,
611 coding_category_charset
,
612 coding_category_sjis
,
613 coding_category_big5
,
615 coding_category_emacs_mule
,
616 /* All above are targets of code detection. */
617 coding_category_raw_text
,
618 coding_category_undecided
,
622 /* Definitions of flag bits used in detect_coding_XXXX. */
623 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
624 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
625 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
626 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
627 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
628 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
629 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
630 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
631 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
632 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
633 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
634 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
635 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
636 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
637 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
638 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
639 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
640 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
642 /* This value is returned if detect_coding_mask () finds nothing other
643 than ASCII characters. */
644 #define CATEGORY_MASK_ANY \
645 (CATEGORY_MASK_ISO_7 \
646 | CATEGORY_MASK_ISO_7_TIGHT \
647 | CATEGORY_MASK_ISO_8_1 \
648 | CATEGORY_MASK_ISO_8_2 \
649 | CATEGORY_MASK_ISO_7_ELSE \
650 | CATEGORY_MASK_ISO_8_ELSE \
651 | CATEGORY_MASK_UTF_8 \
652 | CATEGORY_MASK_UTF_16_BE \
653 | CATEGORY_MASK_UTF_16_LE \
654 | CATEGORY_MASK_UTF_16_BE_NOSIG \
655 | CATEGORY_MASK_UTF_16_LE_NOSIG \
656 | CATEGORY_MASK_CHARSET \
657 | CATEGORY_MASK_SJIS \
658 | CATEGORY_MASK_BIG5 \
659 | CATEGORY_MASK_CCL \
660 | CATEGORY_MASK_EMACS_MULE)
663 #define CATEGORY_MASK_ISO_7BIT \
664 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
666 #define CATEGORY_MASK_ISO_8BIT \
667 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
669 #define CATEGORY_MASK_ISO_ELSE \
670 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
672 #define CATEGORY_MASK_ISO_ESCAPE \
673 (CATEGORY_MASK_ISO_7 \
674 | CATEGORY_MASK_ISO_7_TIGHT \
675 | CATEGORY_MASK_ISO_7_ELSE \
676 | CATEGORY_MASK_ISO_8_ELSE)
678 #define CATEGORY_MASK_ISO \
679 ( CATEGORY_MASK_ISO_7BIT \
680 | CATEGORY_MASK_ISO_8BIT \
681 | CATEGORY_MASK_ISO_ELSE)
683 #define CATEGORY_MASK_UTF_16 \
684 (CATEGORY_MASK_UTF_16_BE \
685 | CATEGORY_MASK_UTF_16_LE \
686 | CATEGORY_MASK_UTF_16_BE_NOSIG \
687 | CATEGORY_MASK_UTF_16_LE_NOSIG)
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list
;
694 /* Table of coding categories (Lisp symbols). This variable is for
696 static Lisp_Object Vcoding_category_table
;
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities
[coding_category_max
];
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories
[coding_category_max
];
705 /*** Commonly used macros and functions ***/
708 #define min(a, b) ((a) < (b) ? (a) : (b))
711 #define max(a, b) ((a) > (b) ? (a) : (b))
714 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
716 attrs = CODING_ID_ATTRS (coding->id); \
717 eol_type = CODING_ID_EOL_TYPE (coding->id); \
718 if (VECTORP (eol_type)) \
720 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. The caller
727 should declare and set these variables appropriately in advance:
728 src, src_end, multibytep
731 #define ONE_MORE_BYTE(c) \
733 if (src == src_end) \
735 if (src_base < src) \
736 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
737 goto no_more_source; \
740 if (multibytep && (c & 0x80)) \
742 if ((c & 0xFE) != 0xC0) \
743 error ("Undecodable char found"); \
744 c = ((c & 1) << 6) | *src++; \
750 #define ONE_MORE_BYTE_NO_CHECK(c) \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 /* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
770 #define EMIT_ONE_ASCII_BYTE(c) \
777 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
779 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
786 /* Store a byte C in the place pointed by DST and increment DST to the
787 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
788 nonzero, store in an appropriate multibyte form. The caller should
789 declare and set the variables `dst' and `multibytep' appropriately
792 #define EMIT_ONE_BYTE(c) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
809 #define EMIT_TWO_BYTES(c1, c2) \
811 produced_chars += 2; \
818 ch = BYTE8_TO_CHAR (ch); \
819 CHAR_STRING_ADVANCE (ch, dst); \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
833 #define EMIT_THREE_BYTES(c1, c2, c3) \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
840 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
842 EMIT_TWO_BYTES (c1, c2); \
843 EMIT_TWO_BYTES (c3, c4); \
847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
849 charset_map_loaded = 0; \
850 c = DECODE_CHAR (charset, code); \
851 if (charset_map_loaded) \
853 unsigned char *orig = coding->source; \
856 coding_set_source (coding); \
857 offset = coding->source - orig; \
859 src_base += offset; \
865 #define ASSURE_DESTINATION(bytes) \
867 if (dst + (bytes) >= dst_end) \
869 int more_bytes = charbuf_end - charbuf + (bytes); \
871 dst = alloc_destination (coding, more_bytes, dst); \
872 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte. CODING_DECODE_CHAR calls this after a charset
   map has been loaded and then shifts its SRC_BASE by the amount
   CODING->source moved. */
879 coding_set_source (coding
)
880 struct coding_system
*coding
;
/* Buffer source: the address is taken from the buffer text. */
882 if (BUFFERP (coding
->src_object
))
884 struct buffer
*buf
= XBUFFER (coding
->src_object
);
/* A negative SRC_POS selects addressing relative to the end of the
   buffer's gap; SRC_POS_BYTE is added to that address. */
886 if (coding
->src_pos
< 0)
887 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
/* Ordinary case: SRC_POS_BYTE is a byte position in BUF. */
889 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
/* Lisp string source: point into the string data at SRC_POS_BYTE. */
891 else if (STRINGP (coding
->src_object
))
893 coding
->source
= (XSTRING (coding
->src_object
)->data
894 + coding
->src_pos_byte
);
897 /* Otherwise, the source is C string and is never relocated
898 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when the
   destination is a buffer. alloc_destination calls this after growing
   the destination. */
903 coding_set_destination (coding
)
904 struct coding_system
*coding
;
906 if (BUFFERP (coding
->dst_object
))
/* Note that the test is on SRC_POS, not on a destination member.
   coding_set_source addresses the source relative to the gap end when
   SRC_POS is negative. */
908 if (coding
->src_pos
< 0)
/* DST_POS_BYTE is 1-based, hence the `- 1'. */
910 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
/* The usable room ends (SRC_BYTES - CONSUMED) bytes before the gap
   end; presumably that region holds the not-yet-consumed source
   -- confirm. */
911 coding
->dst_bytes
= (GAP_END_ADDR
912 - (coding
->src_bytes
- coding
->consumed
)
913 - coding
->destination
);
917 /* We are sure that coding->dst_pos_byte is before the gap
919 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
920 + coding
->dst_pos_byte
- 1);
921 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
922 - coding
->destination
);
926 /* Otherwise, the destination is C string and is never relocated
927 automatically. Thus we don't have to update anything. */
/* Grow the destination of CODING by BYTES bytes using xrealloc.
   alloc_destination uses this when CODING->dst_object is not a
   buffer. */
933 coding_alloc_by_realloc (coding
, bytes
)
934 struct coding_system
*coding
;
/* xrealloc may move the block, so CODING->destination is reassigned
   from its result. */
937 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
938 coding
->dst_bytes
+ bytes
);
/* Record the new capacity. */
939 coding
->dst_bytes
+= bytes
;
/* Make more room for a buffer destination. alloc_destination uses
   this when CODING->dst_object is a buffer. The statement that
   actually enlarges the gap is not visible in this excerpt; the
   comments below describe only the bookkeeping around it. */
943 coding_alloc_by_making_gap (coding
, bytes
)
944 struct coding_system
*coding
;
/* Case 1: source and destination are the same buffer. */
947 if (BUFFERP (coding
->dst_object
)
948 && EQ (coding
->src_object
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed. */
950 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily shrink the gap by ADD and extend the buffer's text
   bounds by the same amount ... */
952 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* ... then undo exactly that adjustment. */
954 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
/* Case 2: remember the current buffer, switch to the destination
   buffer, and switch back afterwards. */
958 Lisp_Object this_buffer
;
960 this_buffer
= Fcurrent_buffer ();
961 set_buffer_internal (XBUFFER (coding
->dst_object
));
963 set_buffer_internal (XBUFFER (this_buffer
));
/* Enlarge the destination of CODING by NBYTES bytes and recompute DST
   relative to the possibly relocated destination. ASSURE_DESTINATION
   assigns this function's result back to its own `dst'. */
968 static unsigned char *
969 alloc_destination (coding
, nbytes
, dst
)
970 struct coding_system
*coding
;
/* Save DST as an offset, because CODING->destination may change
   below. */
974 EMACS_INT offset
= dst
- coding
->destination
;
/* Buffer destinations grow through the gap; others through realloc. */
976 if (BUFFERP (coding
->dst_object
))
977 coding_alloc_by_making_gap (coding
, nbytes
);
979 coding_alloc_by_realloc (coding
, nbytes
);
980 coding
->result
= CODING_RESULT_SUCCESS
;
/* Refresh CODING->destination and CODING->dst_bytes, then rebase
   DST. */
981 coding_set_destination (coding
);
982 dst
= coding
->destination
+ offset
;
986 /** Macros for annotations. */
988 /* Maximum length of annotation data (sum of annotations for
989 composition and charset). */
990 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
992 /* An annotation data is stored in the array coding->charbuf in this
994 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
995 LENGTH is the number of elements in the annotation.
996 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
997 FROM and TO specify the range of text annotated. They are relative
998 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1000 The format of the following elements depends on ANNOTATION_MASK.
1002 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1004 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1005 METHOD is one of enum composition_method.
1006 Optional COMPOSITION-COMPONENTS are characters and composition
1009 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1012 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1014 *(buf)++ = -(len); \
1015 *(buf)++ = (mask); \
1016 *(buf)++ = (from); \
1018 coding->annotated = 1; \
1021 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1023 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1028 #define ADD_CHARSET_DATA(buf, from, to, id) \
1030 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1035 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1043 Check if a text is encoded in UTF-8. If it is, return 1, else return 0. */
1046 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1047 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1048 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1049 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1050 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1051 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* Detector for UTF-8. Marks CATEGORY_MASK_UTF_8 as checked in
   DETECT_INFO, then examines the source bytes of CODING and records
   the verdict in DETECT_INFO->found or DETECT_INFO->rejected. */
1054 detect_coding_utf_8 (coding
, detect_info
)
1055 struct coding_system
*coding
;
1056 struct coding_detection_info
*detect_info
;
1058 unsigned char *src
= coding
->source
, *src_base
= src
;
1059 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1060 int multibytep
= coding
->src_multibyte
;
1061 int consumed_chars
= 0;
1065 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1066 /* A coding system of this category is always ASCII compatible. */
/* So skip the CODING->head_ascii leading bytes. */
1067 src
+= coding
->head_ascii
;
/* C is the first octet of a sequence; C1..C4 are the octets after
   it. */
1071 int c
, c1
, c2
, c3
, c4
;
/* Single-octet character (C < 0x80). */
1075 if (UTF_8_1_OCTET_P (c
))
/* Every following octet must have the form 10xxxxxx. */
1079 if (! UTF_8_EXTRA_OCTET_P (c1
))
/* Leading octet 110xxxxx: two-octet sequence. */
1081 if (UTF_8_2_OCTET_LEADING_P (c
))
1083 found
= CATEGORY_MASK_UTF_8
;
1087 if (! UTF_8_EXTRA_OCTET_P (c2
))
/* Leading octet 1110xxxx: three-octet sequence. */
1089 if (UTF_8_3_OCTET_LEADING_P (c
))
1091 found
= CATEGORY_MASK_UTF_8
;
1095 if (! UTF_8_EXTRA_OCTET_P (c3
))
/* Leading octet 11110xxx: four-octet sequence. */
1097 if (UTF_8_4_OCTET_LEADING_P (c
))
1099 found
= CATEGORY_MASK_UTF_8
;
1103 if (! UTF_8_EXTRA_OCTET_P (c4
))
/* Leading octet 111110xx: five-octet sequence. */
1105 if (UTF_8_5_OCTET_LEADING_P (c
))
1107 found
= CATEGORY_MASK_UTF_8
;
/* Record that the source cannot be UTF-8. */
1112 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
/* INCOMPLETE is set on lines not visible in this excerpt; presumably it
   marks a multi-octet sequence cut off by the end of the source. On
   the last block that is also a rejection. */
1116 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1118 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
/* Publish whatever positive evidence was collected. */
1121 detect_info
->found
|= found
;
1127 decode_coding_utf_8 (coding
)
1128 struct coding_system
*coding
;
1130 unsigned char *src
= coding
->source
+ coding
->consumed
;
1131 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1132 unsigned char *src_base
;
1133 int *charbuf
= coding
->charbuf
;
1134 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1135 int consumed_chars
= 0, consumed_chars_base
;
1136 int multibytep
= coding
->src_multibyte
;
1137 Lisp_Object attr
, eol_type
, charset_list
;
1139 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1143 int c
, c1
, c2
, c3
, c4
, c5
;
1146 consumed_chars_base
= consumed_chars
;
1148 if (charbuf
>= charbuf_end
)
1152 if (UTF_8_1_OCTET_P(c1
))
1157 if (EQ (eol_type
, Qdos
))
1161 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1162 goto no_more_source
;
1167 else if (EQ (eol_type
, Qmac
))
1174 if (! UTF_8_EXTRA_OCTET_P (c2
))
1176 if (UTF_8_2_OCTET_LEADING_P (c1
))
1178 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1179 /* Reject overlong sequences here and below. Encoders
1180 producing them are incorrect, they can be misleading,
1181 and they mess up read/write invariance. */
1188 if (! UTF_8_EXTRA_OCTET_P (c3
))
1190 if (UTF_8_3_OCTET_LEADING_P (c1
))
1192 c
= (((c1
& 0xF) << 12)
1193 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1195 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1201 if (! UTF_8_EXTRA_OCTET_P (c4
))
1203 if (UTF_8_4_OCTET_LEADING_P (c1
))
1205 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1206 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1213 if (! UTF_8_EXTRA_OCTET_P (c5
))
1215 if (UTF_8_5_OCTET_LEADING_P (c1
))
1217 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1218 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1220 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1235 consumed_chars
= consumed_chars_base
;
1237 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1242 coding
->consumed_char
+= consumed_chars_base
;
1243 coding
->consumed
= src_base
- coding
->source
;
1244 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1249 encode_coding_utf_8 (coding
)
1250 struct coding_system
*coding
;
1252 int multibytep
= coding
->dst_multibyte
;
1253 int *charbuf
= coding
->charbuf
;
1254 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1255 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1256 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1257 int produced_chars
= 0;
1262 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1264 while (charbuf
< charbuf_end
)
1266 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1268 ASSURE_DESTINATION (safe_room
);
1270 if (CHAR_BYTE8_P (c
))
1272 c
= CHAR_TO_BYTE8 (c
);
1277 CHAR_STRING_ADVANCE (c
, pend
);
1278 for (p
= str
; p
< pend
; p
++)
1285 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1287 while (charbuf
< charbuf_end
)
1289 ASSURE_DESTINATION (safe_room
);
1291 dst
+= CHAR_STRING (c
, dst
);
1295 coding
->result
= CODING_RESULT_SUCCESS
;
1296 coding
->produced_char
+= produced_chars
;
1297 coding
->produced
= dst
- coding
->destination
;
1302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1303 Check if a text is encoded in one of UTF-16 based coding systems.
1304 If it is, return 1, else return 0. */
1306 #define UTF_16_HIGH_SURROGATE_P(val) \
1307 (((val) & 0xFC00) == 0xD800)
1309 #define UTF_16_LOW_SURROGATE_P(val) \
1310 (((val) & 0xFC00) == 0xDC00)
1312 #define UTF_16_INVALID_P(val) \
1313 (((val) == 0xFFFE) \
1314 || ((val) == 0xFFFF) \
1315 || UTF_16_LOW_SURROGATE_P (val))
1319 detect_coding_utf_16 (coding
, detect_info
)
1320 struct coding_system
*coding
;
1321 struct coding_detection_info
*detect_info
;
1323 unsigned char *src
= coding
->source
, *src_base
= src
;
1324 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1325 int multibytep
= coding
->src_multibyte
;
1326 int consumed_chars
= 0;
1329 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1331 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1332 && (coding
->src_bytes
& 1))
1334 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1340 if ((c1
== 0xFF) && (c2
== 0xFE))
1342 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1343 | CATEGORY_MASK_UTF_16_AUTO
);
1344 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE
;
1346 else if ((c1
== 0xFE) && (c2
== 0xFF))
1348 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1349 | CATEGORY_MASK_UTF_16_AUTO
);
1350 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE
;
1357 decode_coding_utf_16 (coding
)
1358 struct coding_system
*coding
;
1360 unsigned char *src
= coding
->source
+ coding
->consumed
;
1361 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1362 unsigned char *src_base
;
1363 int *charbuf
= coding
->charbuf
;
1364 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1365 int consumed_chars
= 0, consumed_chars_base
;
1366 int multibytep
= coding
->src_multibyte
;
1367 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1368 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1369 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1370 Lisp_Object attr
, eol_type
, charset_list
;
1372 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1374 if (bom
== utf_16_with_bom
)
1383 if (endian
== utf_16_big_endian
1384 ? c
!= 0xFEFF : c
!= 0xFFFE)
1386 /* The first two bytes are not BOM. Treat them as bytes
1387 for a normal character. */
1391 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1393 else if (bom
== utf_16_detect_bom
)
1395 /* We have already tried to detect BOM and failed in
1397 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1405 consumed_chars_base
= consumed_chars
;
1407 if (charbuf
+ 2 >= charbuf_end
)
1412 c
= (endian
== utf_16_big_endian
1413 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1416 if (! UTF_16_LOW_SURROGATE_P (c
))
1418 if (endian
== utf_16_big_endian
)
1419 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1421 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1425 if (UTF_16_HIGH_SURROGATE_P (c
))
1426 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1432 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1433 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1439 if (UTF_16_HIGH_SURROGATE_P (c
))
1440 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1447 coding
->consumed_char
+= consumed_chars_base
;
1448 coding
->consumed
= src_base
- coding
->source
;
1449 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1453 encode_coding_utf_16 (coding
)
1454 struct coding_system
*coding
;
1456 int multibytep
= coding
->dst_multibyte
;
1457 int *charbuf
= coding
->charbuf
;
1458 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1459 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1460 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1462 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1463 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1464 int produced_chars
= 0;
1465 Lisp_Object attrs
, eol_type
, charset_list
;
1468 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1470 if (bom
!= utf_16_without_bom
)
1472 ASSURE_DESTINATION (safe_room
);
1474 EMIT_TWO_BYTES (0xFE, 0xFF);
1476 EMIT_TWO_BYTES (0xFF, 0xFE);
1477 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1480 while (charbuf
< charbuf_end
)
1482 ASSURE_DESTINATION (safe_room
);
1484 if (c
>= MAX_UNICODE_CHAR
)
1485 c
= coding
->default_char
;
1490 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1492 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1499 c1
= (c
>> 10) + 0xD800;
1500 c2
= (c
& 0x3FF) + 0xDC00;
1502 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1504 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1507 coding
->result
= CODING_RESULT_SUCCESS
;
1508 coding
->produced
= dst
- coding
->destination
;
1509 coding
->produced_char
+= produced_chars
;
1514 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1516 /* Emacs' internal format for representation of multiple character
1517 sets is a kind of multi-byte encoding, i.e. characters are
1518 represented by variable-length sequences of one-byte codes.
1520 ASCII characters and control characters (e.g. `tab', `newline') are
1521 represented by one-byte sequences which are their ASCII codes, in
1522 the range 0x00 through 0x7F.
1524 8-bit characters of the range 0x80..0x9F are represented by
1525 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1528 8-bit characters of the range 0xA0..0xFF are represented by
1529 one-byte sequences which are their 8-bit code.
1531 The other characters are represented by a sequence of `base
1532 leading-code', optional `extended leading-code', and one or two
1533 `position-code's. The length of the sequence is determined by the
1534 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1535 whereas extended leading-code and position-code take the range 0xA0
1536 through 0xFF. See `charset.h' for more details about leading-code
1539 --- CODE RANGE of Emacs' internal format ---
1543 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1544 eight-bit-graphic 0xA0..0xBF
1545 ELSE 0x81..0x9D + [0xA0..0xFF]+
1546 ---------------------------------------------
1548 As this is the internal character representation, the format is
1549 usually not used externally (i.e. in a file or in a data sent to a
1550 process). But, it is possible to have a text externally in this
1551 format (i.e. by encoding by the coding system `emacs-mule').
1553 In that case, a sequence of one-byte codes has a slightly different
1556 At first, all characters in eight-bit-control are represented by
1557 one-byte sequences which are their 8-bit code.
1559 Next, character composition data are represented by the byte
1560 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1562 METHOD is 0xF2 plus one of the composition methods (enum
1563 composition_method),
1565 BYTES is 0xA0 plus a byte length of this composition data,
1567 CHARS is 0xA0 plus the number of characters composed by this
1570 COMPONENTs are characters in multibyte form or composition
1571 rules encoded by two bytes of ASCII codes.
1573 In addition, for backward compatibility, the following formats are
1574 also recognized as composition data on decoding.
1577 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1580 MSEQ is a multibyte form but in this special format:
1581 ASCII: 0xA0 ASCII_CODE+0x80,
1582 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1583 RULE is a one byte code of the range 0xA0..0xF0 that
1584 represents a composition rule.
1587 char emacs_mule_bytes
[256];
1590 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1591 struct coding_system
*coding
;
1593 int *nbytes
, *nchars
, *id
;
1595 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1596 int multibytep
= coding
->src_multibyte
;
1597 unsigned char *src_base
= src
;
1598 struct charset
*charset
;
1601 int consumed_chars
= 0;
1604 switch (emacs_mule_bytes
[c
])
1607 if (! (charset
= emacs_mule_charset
[c
]))
1614 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1615 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1618 if (! (charset
= emacs_mule_charset
[c
]))
1625 if (! (charset
= emacs_mule_charset
[c
]))
1628 code
= (c
& 0x7F) << 8;
1636 if (! (charset
= emacs_mule_charset
[c
]))
1639 code
= (c
& 0x7F) << 8;
1646 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1647 ? charset_ascii
: charset_eight_bit
);
1653 c
= DECODE_CHAR (charset
, code
);
1656 *nbytes
= src
- src_base
;
1657 *nchars
= consumed_chars
;
1670 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1671 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1675 detect_coding_emacs_mule (coding
, detect_info
)
1676 struct coding_system
*coding
;
1677 struct coding_detection_info
*detect_info
;
1679 unsigned char *src
= coding
->source
, *src_base
= src
;
1680 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1681 int multibytep
= coding
->src_multibyte
;
1682 int consumed_chars
= 0;
1687 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1688 /* A coding system of this category is always ASCII compatible. */
1689 src
+= coding
->head_ascii
;
1699 /* Perhaps the start of a composite character. We simply skip
1700 it because analyzing it is too heavy for detecting. But,
1701 at least, we check that the composite character
1702 consists of more than 4 bytes. */
1703 unsigned char *src_base
;
1713 if (src
- src_base
<= 4)
1715 found
= CATEGORY_MASK_EMACS_MULE
;
1723 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1728 unsigned char *src_base
= src
- 1;
1735 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1737 found
= CATEGORY_MASK_EMACS_MULE
;
1740 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1744 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1746 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1749 detect_info
->found
|= found
;
1754 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1756 /* Decode a character represented as a component of composition
1757 sequence of Emacs 20/21 style at SRC. Set C to that character and
1758 update SRC to the head of next character (or an encoded composition
1759 rule). If SRC doesn't point to a composition component, set C to -1.
1760 If SRC points to an invalid byte sequence, global exit by a return
1763 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1767 int nbytes, nchars; \
1769 if (src == src_end) \
1771 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776 goto invalid_code; \
1780 consumed_chars += nchars; \
1785 /* Decode a composition rule represented as a component of composition
1786 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1787 and increment BUF. If SRC points to an invalid byte sequence, set C
1790 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1792 int c, gref, nref; \
1794 if (src >= src_end) \
1795 goto invalid_code; \
1796 ONE_MORE_BYTE_NO_CHECK (c); \
1798 if (c < 0 || c >= 81) \
1799 goto invalid_code; \
1801 gref = c / 9, nref = c % 9; \
1802 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1806 /* Decode a composition rule represented as a component of composition
1807 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1808 and increment BUF. If SRC points to an invalid byte sequence, set C
1811 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1815 if (src + 1>= src_end) \
1816 goto invalid_code; \
1817 ONE_MORE_BYTE_NO_CHECK (gref); \
1819 ONE_MORE_BYTE_NO_CHECK (nref); \
1821 if (gref < 0 || gref >= 81 \
1822 || nref < 0 || nref >= 81) \
1823 goto invalid_code; \
1824 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1828 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1830 /* Emacs 21 style format. The first three bytes at SRC are \
1831 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1832 the byte length of this composition information, CHARS is the \
1833 number of characters composed by this composition. */ \
1834 enum composition_method method = c - 0xF2; \
1835 int *charbuf_base = charbuf; \
1837 int consumed_chars_limit; \
1838 int nbytes, nchars; \
1840 ONE_MORE_BYTE (c); \
1841 nbytes = c - 0xA0; \
1843 goto invalid_code; \
1844 ONE_MORE_BYTE (c); \
1845 nchars = c - 0xA0; \
1846 from = coding->produced + char_offset; \
1847 to = from + nchars; \
1848 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1849 consumed_chars_limit = consumed_chars_base + nbytes; \
1850 if (method != COMPOSITION_RELATIVE) \
1853 while (consumed_chars < consumed_chars_limit) \
1855 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1856 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1858 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1861 if (consumed_chars < consumed_chars_limit) \
1862 goto invalid_code; \
1863 charbuf_base[0] -= i; \
1868 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1870 /* Emacs 20 style format for relative composition. */ \
1871 /* Store multibyte form of characters to be composed. */ \
1872 enum composition_method method = COMPOSITION_RELATIVE; \
1873 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1874 int *buf = components; \
1879 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1880 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1881 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1883 goto invalid_code; \
1884 from = coding->produced_char + char_offset; \
1886 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1887 for (j = 0; j < i; j++) \
1888 *charbuf++ = components[j]; \
1892 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1894 /* Emacs 20 style format for rule-base composition. */ \
1895 /* Store multibyte form of characters to be composed. */ \
1896 enum composition_method method = COMPOSITION_WITH_RULE; \
1897 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1898 int *buf = components; \
1902 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1903 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1905 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1908 if (i < 1 || (buf - components) % 2 == 0) \
1909 goto invalid_code; \
1910 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1911 goto no_more_source; \
1912 from = coding->produced_char + char_offset; \
1914 ADD_COMPOSITION_DATA (buf, from, to, method); \
1915 for (j = 0; j < i; j++) \
1916 *charbuf++ = components[j]; \
1917 for (j = 0; j < i; j += 2) \
1918 *charbuf++ = components[j]; \
1923 decode_coding_emacs_mule (coding
)
1924 struct coding_system
*coding
;
1926 unsigned char *src
= coding
->source
+ coding
->consumed
;
1927 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1928 unsigned char *src_base
;
1929 int *charbuf
= coding
->charbuf
;
1930 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1931 int consumed_chars
= 0, consumed_chars_base
;
1932 int multibytep
= coding
->src_multibyte
;
1933 Lisp_Object attrs
, eol_type
, charset_list
;
1934 int char_offset
= coding
->produced_char
;
1935 int last_offset
= char_offset
;
1936 int last_id
= charset_ascii
;
1938 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1945 consumed_chars_base
= consumed_chars
;
1947 if (charbuf
>= charbuf_end
)
1956 if (EQ (eol_type
, Qdos
))
1960 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1961 goto no_more_source
;
1966 else if (EQ (eol_type
, Qmac
))
1975 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1976 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1977 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1979 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1981 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1985 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1991 consumed_chars
= consumed_chars_base
;
1992 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2001 if (last_id
!= charset_ascii
)
2002 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2004 last_offset
= char_offset
;
2008 consumed_chars
+= nchars
;
2015 consumed_chars
= consumed_chars_base
;
2017 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2023 if (last_id
!= charset_ascii
)
2024 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2025 coding
->consumed_char
+= consumed_chars_base
;
2026 coding
->consumed
= src_base
- coding
->source
;
2027 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2031 #define EMACS_MULE_LEADING_CODES(id, codes) \
2034 codes[0] = id, codes[1] = 0; \
2035 else if (id < 0xE0) \
2036 codes[0] = 0x9A, codes[1] = id; \
2037 else if (id < 0xF0) \
2038 codes[0] = 0x9B, codes[1] = id; \
2039 else if (id < 0xF5) \
2040 codes[0] = 0x9C, codes[1] = id; \
2042 codes[0] = 0x9D, codes[1] = id; \
2047 encode_coding_emacs_mule (coding
)
2048 struct coding_system
*coding
;
2050 int multibytep
= coding
->dst_multibyte
;
2051 int *charbuf
= coding
->charbuf
;
2052 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2053 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2054 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2056 int produced_chars
= 0;
2057 Lisp_Object attrs
, eol_type
, charset_list
;
2059 int preferred_charset_id
= -1;
2061 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2063 while (charbuf
< charbuf_end
)
2065 ASSURE_DESTINATION (safe_room
);
2070 /* Handle an annotation. */
2073 case CODING_ANNOTATE_COMPOSITION_MASK
:
2074 /* Not yet implemented. */
2076 case CODING_ANNOTATE_CHARSET_MASK
:
2077 preferred_charset_id
= charbuf
[3];
2078 if (preferred_charset_id
>= 0
2079 && NILP (Fmemq (make_number (preferred_charset_id
),
2081 preferred_charset_id
= -1;
2090 if (ASCII_CHAR_P (c
))
2091 EMIT_ONE_ASCII_BYTE (c
);
2092 else if (CHAR_BYTE8_P (c
))
2094 c
= CHAR_TO_BYTE8 (c
);
2099 struct charset
*charset
;
2103 unsigned char leading_codes
[2];
2105 if (preferred_charset_id
>= 0)
2107 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2108 if (! CHAR_CHARSET_P (c
, charset
))
2109 charset
= char_charset (c
, charset_list
, NULL
);
2112 charset
= char_charset (c
, charset_list
, &code
);
2115 c
= coding
->default_char
;
2116 if (ASCII_CHAR_P (c
))
2118 EMIT_ONE_ASCII_BYTE (c
);
2121 charset
= char_charset (c
, charset_list
, &code
);
2123 dimension
= CHARSET_DIMENSION (charset
);
2124 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2125 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2126 EMIT_ONE_BYTE (leading_codes
[0]);
2127 if (leading_codes
[1])
2128 EMIT_ONE_BYTE (leading_codes
[1]);
2130 EMIT_ONE_BYTE (code
);
2133 EMIT_ONE_BYTE (code
>> 8);
2134 EMIT_ONE_BYTE (code
& 0xFF);
2138 coding
->result
= CODING_RESULT_SUCCESS
;
2139 coding
->produced_char
+= produced_chars
;
2140 coding
->produced
= dst
- coding
->destination
;
2145 /*** 7. ISO2022 handlers ***/
2147 /* The following note describes the coding system ISO2022 briefly.
2148 Since the intention of this note is to help understand the
2149 functions in this file, some parts are NOT ACCURATE or are OVERLY
2150 SIMPLIFIED. For thorough understanding, please refer to the
2151 original document of ISO2022. This is equivalent to the standard
2152 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2154 ISO2022 provides many mechanisms to encode several character sets
2155 in 7-bit and 8-bit environments. For 7-bit environments, all text
2156 is encoded using bytes less than 128. This may make the encoded
2157 text a little bit longer, but the text passes more easily through
2158 several types of gateway, some of which strip off the MSB (Most
2161 There are two kinds of character sets: control character sets and
2162 graphic character sets. The former contain control characters such
2163 as `newline' and `escape' to provide control functions (control
2164 functions are also provided by escape sequences). The latter
2165 contain graphic characters such as 'A' and '-'. Emacs recognizes
2166 two control character sets and many graphic character sets.
2168 Graphic character sets are classified into one of the following
2169 four classes, according to the number of bytes (DIMENSION) and
2170 number of characters in one dimension (CHARS) of the set:
2171 - DIMENSION1_CHARS94
2172 - DIMENSION1_CHARS96
2173 - DIMENSION2_CHARS94
2174 - DIMENSION2_CHARS96
2176 In addition, each character set is assigned an identification tag,
2177 unique for each set, called the "final character" (denoted as <F>
2178 hereafter). The <F> of each character set is decided by ECMA(*)
2179 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2180 (0x30..0x3F are for private use only).
2182 Note (*): ECMA = European Computer Manufacturers Association
2184 Here are examples of graphic character sets [NAME(<F>)]:
2185 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2186 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2187 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2188 o DIMENSION2_CHARS96 -- none for the moment
2190 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2191 C0 [0x00..0x1F] -- control character plane 0
2192 GL [0x20..0x7F] -- graphic character plane 0
2193 C1 [0x80..0x9F] -- control character plane 1
2194 GR [0xA0..0xFF] -- graphic character plane 1
2196 A control character set is directly designated and invoked to C0 or
2197 C1 by an escape sequence. The most common case is that:
2198 - ISO646's control character set is designated/invoked to C0, and
2199 - ISO6429's control character set is designated/invoked to C1,
2200 and usually these designations/invocations are omitted in encoded
2201 text. In a 7-bit environment, only C0 can be used, and a control
2202 character for C1 is encoded by an appropriate escape sequence to
2203 fit into the environment. All control characters for C1 are
2204 defined to have corresponding escape sequences.
2206 A graphic character set is at first designated to one of four
2207 graphic registers (G0 through G3), then these graphic registers are
2208 invoked to GL or GR. These designations and invocations can be
2209 done independently. The most common case is that G0 is invoked to
2210 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2211 these invocations and designations are omitted in encoded text.
2212 In a 7-bit environment, only GL can be used.
2214 When a graphic character set of CHARS94 is invoked to GL, codes
2215 0x20 and 0x7F of the GL area work as control characters SPACE and
2216 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2219 There are two ways of invocation: locking-shift and single-shift.
2220 With locking-shift, the invocation lasts until the next different
2221 invocation, whereas with single-shift, the invocation affects the
2222 following character only and doesn't affect the locking-shift
2223 state. Invocations are done by the following control characters or
2226 ----------------------------------------------------------------------
2227 abbrev function cntrl escape seq description
2228 ----------------------------------------------------------------------
2229 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2230 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2231 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2232 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2233 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2234 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2235 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2236 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2237 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2238 ----------------------------------------------------------------------
2239 (*) These are not used by any known coding system.
2241 Control characters for these functions are defined by macros
2242 ISO_CODE_XXX in `coding.h'.
2244 Designations are done by the following escape sequences:
2245 ----------------------------------------------------------------------
2246 escape sequence description
2247 ----------------------------------------------------------------------
2248 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2249 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2250 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2251 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2252 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2253 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2254 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2255 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2256 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2257 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2258 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2259 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2260 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2261 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2262 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2263 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2264 ----------------------------------------------------------------------
2266 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2267 of dimension 1, chars 94, and final character <F>, etc...
2269 Note (*): Although these designations are not allowed in ISO2022,
2270 Emacs accepts them on decoding, and produces them on encoding
2271 CHARS96 character sets in a coding system which is characterized as
2272 7-bit environment, non-locking-shift, and non-single-shift.
2274 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2275 '(' must be omitted. We refer to this as "short-form" hereafter.
2277 Now you may notice that there are a lot of ways of encoding the
2278 same multilingual text in ISO2022. Actually, there exist many
2279 coding systems such as Compound Text (used in X11's inter-client
2280 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2281 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2282 localized platforms), and all of these are variants of ISO2022.
2284 In addition to the above, Emacs handles two more kinds of escape
2285 sequences: ISO6429's direction specification and Emacs' private
2286 sequence for specifying character composition.
2288 ISO6429's direction specification takes the following form:
2289 o CSI ']' -- end of the current direction
2290 o CSI '0' ']' -- end of the current direction
2291 o CSI '1' ']' -- start of left-to-right text
2292 o CSI '2' ']' -- start of right-to-left text
2293 The control character CSI (0x9B: control sequence introducer) is
2294 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2296 Character composition specification takes the following form:
2297 o ESC '0' -- start relative composition
2298 o ESC '1' -- end composition
2299 o ESC '2' -- start rule-base composition (*)
2300 o ESC '3' -- start relative composition with alternate chars (**)
2301 o ESC '4' -- start rule-base composition with alternate chars (**)
2302 Since these are not standard escape sequences of any ISO standard,
2303 the use of them with these meanings is restricted to Emacs only.
2305 (*) This form is used only in Emacs 20.7 and older versions,
2306 but newer versions can safely decode it.
2307 (**) This form is used only in Emacs 21.1 and newer versions,
2308 and older versions can't decode it.
2310 Here's a list of example usages of these composition escape
2311 sequences (categorized by `enum composition_method').
2313 COMPOSITION_RELATIVE:
2314 ESC 0 CHAR [ CHAR ] ESC 1
2315 COMPOSITION_WITH_RULE:
2316 ESC 2 CHAR [ RULE CHAR ] ESC 1
2317 COMPOSITION_WITH_ALTCHARS:
2318 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE_ALTCHARS:
2320 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2322 enum iso_code_class_type iso_code_class
[256];
2324 #define SAFE_CHARSET_P(coding, id) \
2325 ((id) <= (coding)->max_charset_id \
2326 && (coding)->safe_charsets[id] >= 0)
2329 #define SHIFT_OUT_OK(category) \
2330 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
/* Compute the safe-charsets string for the coding system whose
   attribute vector is ATTRS and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string has one byte per charset ID, from 0 up to the largest ID
   found in the coding system's charset list.  For each charset in the
   list, the byte at its ID receives a graphic register number: the one
   associated with the charset in the coding_attr_iso_request alist, or
   REG96 / REG94 (the cdr / car of coding_attr_iso_usage).

   NOTE(review): several lines of this definition (braces, some
   declarations, some conditions) are not visible in this view; the
   comments below describe only what is shown.  */
2333 setup_iso_safe_charsets (attrs
)
2336 Lisp_Object charset_list
, safe_charsets
;
2337 Lisp_Object request
;
2338 Lisp_Object reg_usage
;
2341 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2344 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* A full-support coding system uses Viso_2022_charset_list.  If ATTRS
   holds a different list, install that one and clear the stored
   safe-charsets value so that it is computed again below.  */
2345 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2346 && ! EQ (charset_list
, Viso_2022_charset_list
))
2348 CODING_ATTR_CHARSET_LIST (attrs
)
2349 = charset_list
= Viso_2022_charset_list
;
2350 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
/* NOTE(review): the statement controlled by this test is not visible;
   presumably the function returns early when a string is already
   stored -- confirm against the full source.  */
2353 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
/* First pass: find the largest charset ID in the list.  */
2357 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2359 int id
= XINT (XCAR (tail
));
2360 if (max_charset_id
< id
)
2361 max_charset_id
= id
;
/* One byte per charset ID, 0 .. max_charset_id.  The fill value is the
   second argument of Fmake_string, on a line not visible here.  */
2364 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2366 request
= AREF (attrs
, coding_attr_iso_request
);
2367 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2368 reg94
= XINT (XCAR (reg_usage
));
2369 reg96
= XINT (XCDR (reg_usage
));
/* Second pass: record a register number for every charset in the list.
   NOTE(review): the conditions selecting between the three stores below
   are only partly visible (`else if (charset->iso_chars_96)' is);
   confirm the others against the full source.  */
2371 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2375 struct charset
*charset
;
2378 charset
= CHARSET_FROM_ID (XINT (id
));
2379 reg
= Fcdr (Fassq (id
, request
));
2381 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2382 else if (charset
->iso_chars_96
)
2385 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2390 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
/* Publish the finished string in ATTRS.  */
2393 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2398 Check if a text is encoded in one of ISO-2022 based coding systems.
2399 If it is, return 1, else return 0. */
/* Scan the source bytes of CODING (coding->source, coding->src_bytes
   long, starting after coding->head_ascii bytes) and accumulate in the
   locals `found' and `rejected' the ISO-2022 detection categories
   (CATEGORY_MASK_ISO_*) that the text matches or rules out.  The
   results are merged into DETECT_INFO->found and DETECT_INFO->rejected
   at the end, and CATEGORY_MASK_ISO is added to DETECT_INFO->checked.

   NOTE(review): many lines of this definition (braces, `case' labels,
   ONE_MORE_BYTE calls, some declarations) are not visible in this view;
   the comments below describe only what is shown.  */
2402 detect_coding_iso_2022 (coding
, detect_info
)
2403 struct coding_system
*coding
;
2404 struct coding_detection_info
*detect_info
;
2406 unsigned char *src
= coding
->source
, *src_base
= src
;
2407 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2408 int multibytep
= coding
->src_multibyte
;
2409 int single_shifting
= 0;
2412 int consumed_chars
= 0;
2417 detect_info
->checked
|= CATEGORY_MASK_ISO
;
/* Refresh max_charset_id and safe_charsets of every ISO category from
   the safe-charsets string stored in its attribute vector.  */
2419 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2421 struct coding_system
*this = &(coding_categories
[i
]);
2422 Lisp_Object attrs
, val
;
2424 attrs
= CODING_ID_ATTRS (this->id
);
/* NOTE(review): this compares the safe-charsets slot with
   Viso_2022_charset_list, whereas setup_iso_safe_charsets compares the
   charset list with it.  Comparing CODING_ATTR_CHARSET_LIST (attrs)
   looks like the intent -- confirm before changing, since skipping the
   call must not leave the safe-charsets slot without a string.  */
2425 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2426 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2427 setup_iso_safe_charsets (attrs
);
2428 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2429 this->max_charset_id
= XSTRING (val
)->size
- 1;
2430 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2433 /* A coding system of this category is always ASCII compatible. */
2434 src
+= coding
->head_ascii
;
/* Keep scanning until every ISO category has been rejected.  */
2436 while (rejected
!= CATEGORY_MASK_ISO
)
2442 if (inhibit_iso_escape_detection
)
2444 single_shifting
= 0;
2446 if (c
>= '(' && c
<= '/')
2448 /* Designation sequence for a charset of dimension 1. */
2450 if (c1
< ' ' || c1
>= 0x80
2451 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2452 /* Invalid designation sequence. Just ignore. */
2457 /* Designation sequence for a charset of dimension 2. */
2459 if (c
>= '@' && c
<= 'B')
2460 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2461 id
= iso_charset_table
[1][0][c
];
2462 else if (c
>= '(' && c
<= '/')
2465 if (c1
< ' ' || c1
>= 0x80
2466 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2467 /* Invalid designation sequence. Just ignore. */
2471 /* Invalid designation sequence. Just ignore it. */
2474 else if (c
== 'N' || c
== 'O')
2476 /* ESC <Fe> for SS2 or SS3. */
2477 single_shifting
= 1;
2478 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2481 else if (c
>= '0' && c
<= '4')
2483 /* ESC <Fp> for start/end composition. */
2484 found
|= CATEGORY_MASK_ISO
;
2489 /* Invalid escape sequence. Just ignore it. */
2493 /* We found a valid designation sequence for CHARSET. */
2494 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2495 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2497 found
|= CATEGORY_MASK_ISO_7
;
2499 rejected
|= CATEGORY_MASK_ISO_7
;
2500 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2502 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2504 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2505 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2507 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2509 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2510 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2512 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2514 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2519 /* Locking shift out/in. */
2520 if (inhibit_iso_escape_detection
)
2522 single_shifting
= 0;
2523 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2524 found
|= CATEGORY_MASK_ISO_ELSE
;
2528 /* Control sequence introducer. */
2529 single_shifting
= 0;
2530 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2531 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2532 goto check_extra_latin
;
2538 if (inhibit_iso_escape_detection
)
2540 single_shifting
= 1;
2541 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2542 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2543 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2544 found
|= CATEGORY_MASK_ISO_8_1
;
2545 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2546 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2547 found
|= CATEGORY_MASK_ISO_8_2
;
2548 goto check_extra_latin
;
2553 single_shifting
= 0;
2558 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2559 found
|= CATEGORY_MASK_ISO_8_1
;
2560 /* Check the length of succeeding codes of the range
2561 0xA0..0xFF. If the byte length is even, we include
2562 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2563 only when we are not single shifting. */
2564 if (! single_shifting
2565 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2568 while (src
< src_end
)
2576 if (i
& 1 && src
< src_end
)
2577 rejected
|= CATEGORY_MASK_ISO_8_2
;
2579 found
|= CATEGORY_MASK_ISO_8_2
;
2584 single_shifting
= 0;
/* A byte with no entry in Vlatin_extra_code_table (or no table at all)
   rejects every ISO category.  */
2585 if (! VECTORP (Vlatin_extra_code_table
)
2586 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2588 rejected
= CATEGORY_MASK_ISO
;
2591 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2592 & CODING_ISO_FLAG_LATIN_EXTRA
)
2593 found
|= CATEGORY_MASK_ISO_8_1
;
2595 rejected
|= CATEGORY_MASK_ISO_8_1
;
2596 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2597 & CODING_ISO_FLAG_LATIN_EXTRA
)
2598 found
|= CATEGORY_MASK_ISO_8_2
;
2600 rejected
|= CATEGORY_MASK_ISO_8_2
;
2603 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
/* Merge the results: a category counts as found only if it was not also
   rejected.  */
2607 detect_info
->rejected
|= rejected
;
2608 detect_info
->found
|= (found
& ~rejected
);
2613 /* Set designation state into CODING. */
/* Designate to graphic register REG of `coding' the charset looked up
   with ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL).

   If FINAL is below '0' or not below 128, if the table yields a
   negative ID, or if the charset is not safe for `coding'
   (SAFE_CHARSET_P), the designation of REG is set to -2 and control
   jumps to `invalid_code'.  charset_jisx0201_roman is replaced by
   charset_ascii when CODING_ISO_FLAG_USE_ROMAN is set, and
   charset_jisx0208_1978 by charset_jisx0208 when
   CODING_ISO_FLAG_USE_OLDJIS is set.  A designation of ASCII to a
   register whose previous designation was -2 is recorded but still
   jumps to `invalid_code'.

   Uses the caller's `coding', and `id' and `prev' (their declarations
   are on lines not visible in this view).  */
2614 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2618 if (final < '0' || final >= 128 \
2619 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2620 || !SAFE_CHARSET_P (coding, id)) \
2622 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2623 goto invalid_code; \
2625 prev = CODING_ISO_DESIGNATION (coding, reg); \
2626 if (id == charset_jisx0201_roman) \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2629 id = charset_ascii; \
2631 else if (id == charset_jisx0208_1978) \
2633 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2634 id = charset_jisx0208; \
2636 CODING_ISO_DESIGNATION (coding, reg) = id; \
2637 /* If there was an invalid designation to REG previously, and this \
2638 designation is ASCII to REG, we should keep this designation \
2640 if (prev == -2 && id == charset_ascii) \
2641 goto invalid_code; \
/* Flush a composition that is still in progress: copy the characters
   collected in `components' to `charbuf', advance `char_offset', and
   reset `composition_state' to COMPOSING_NO.  Jumps to `no_more_source'
   if `charbuf' lacks room for `component_idx' more elements.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every entry of
   `components' is copied; for the other methods only every second entry
   is (indices 0, 2, 4, ...).

   NOTE(review): the statement controlled by the first test
   (composition_state == COMPOSING_NO) is not visible in this view;
   presumably it leaves the macro without doing anything -- confirm.  */
2645 #define MAYBE_FINISH_COMPOSITION() \
2648 if (composition_state == COMPOSING_NO) \
2650 /* It is assured that we have enough room for producing \
2651 characters stored in the table `components'. */ \
2652 if (charbuf + component_idx > charbuf_end) \
2653 goto no_more_source; \
2654 composition_state = COMPOSING_NO; \
2655 if (method == COMPOSITION_RELATIVE \
2656 || method == COMPOSITION_WITH_ALTCHARS) \
2658 for (i = 0; i < component_idx; i++) \
2659 *charbuf++ = components[i]; \
2660 char_offset += component_idx; \
2664 for (i = 0; i < component_idx; i += 2) \
2665 *charbuf++ = components[i]; \
2666 char_offset += (component_idx / 2) + 1; \
2671 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2672 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2673 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2674 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2675 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
/* Process a composition start sequence whose final byte is C1
   ('0', '2', '3' or '4').

   Visible behaviour: in one branch (its full condition is only partly
   visible; it requires composition_state == COMPOSING_COMPONENT_RULE)
   the number of components collected so far is saved in
   `component_len' and the state becomes COMPOSING_CHAR.  Otherwise any
   pending composition is flushed, room for MAX_COMPOSITION_COMPONENTS
   elements in `charbuf' is required, and the source is scanned ahead
   for the end sequence ESC '1'.  If the scan reaches src_end - 1
   without finding it, control jumps to `invalid_code' when this is the
   last block (CODING_MODE_LAST_BLOCK) and to `no_more_source'
   otherwise.  Finally `method' and `composition_state' are set from C1
   and the component counters are cleared.  */
2678 #define DECODE_COMPOSITION_START(c1) \
2681 && composition_state == COMPOSING_COMPONENT_RULE) \
2683 component_len = component_idx; \
2684 composition_state = COMPOSING_CHAR; \
2690 MAYBE_FINISH_COMPOSITION (); \
2691 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2692 goto no_more_source; \
2693 for (p = src; p < src_end - 1; p++) \
2694 if (*p == ISO_CODE_ESC && p[1] == '1') \
2696 if (p == src_end - 1) \
2698 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2699 goto invalid_code; \
2700 goto no_more_source; \
2703 /* This is surely the start of a composition. */ \
2704 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2705 : c1 == '2' ? COMPOSITION_WITH_RULE \
2706 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2707 : COMPOSITION_WITH_RULE_ALTCHARS); \
2708 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2709 : COMPOSING_COMPONENT_CHAR); \
2710 component_idx = component_len = 0; \
2715 /* Handle composition end sequence ESC 1. */
/* Finish the composition in progress: emit a composition annotation
   into `charbuf' with ADD_COMPOSITION_DATA, covering NCHARS characters
   starting at coding->produced_char + char_offset, followed by the
   composed characters themselves.

   NCHARS is component_idx - component_len when alternate components
   were recorded (component_len > 0), component_idx for
   COMPOSITION_RELATIVE, and (component_idx + 1) / 2 otherwise.  For
   methods other than COMPOSITION_RELATIVE the component entries are
   appended after the annotation and the difference
   `saved_charbuf - charbuf' is stored at *saved_charbuf.  Sets
   coding->annotated and resets `composition_state' to COMPOSING_NO.

   NOTE(review): the `else' lines and braces separating the alternative
   loops below are not visible in this view.  */
2717 #define DECODE_COMPOSITION_END() \
2719 int nchars = (component_len > 0 ? component_idx - component_len \
2720 : method == COMPOSITION_RELATIVE ? component_idx \
2721 : (component_idx + 1) / 2); \
2723 int *saved_charbuf = charbuf; \
2724 int from = coding->produced_char + char_offset; \
2725 int to = from + nchars; \
2727 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2728 if (method != COMPOSITION_RELATIVE) \
2730 if (component_len == 0) \
2731 for (i = 0; i < component_idx; i++) \
2732 *charbuf++ = components[i]; \
2734 for (i = 0; i < component_len; i++) \
2735 *charbuf++ = components[i]; \
2736 *saved_charbuf = saved_charbuf - charbuf; \
2738 if (method == COMPOSITION_WITH_RULE) \
2739 for (i = 0; i < component_idx; i += 2, char_offset++) \
2740 *charbuf++ = components[i]; \
2742 for (i = component_len; i < component_idx; i++, char_offset++) \
2743 *charbuf++ = components[i]; \
2744 coding->annotated = 1; \
2745 composition_state = COMPOSING_NO; \
2749 /* Decode a composition rule from the byte C1 (and maybe one more byte
2750 from SRC) and store one encoded composition rule in
2751 coding->cmp_data. */
/* Convert the composition rule byte held in C1 into the value produced
   by COMPOSITION_ENCODE_RULE and leave the result in C1 (C1 must be an
   lvalue).

   Values below 81 use the old one-byte format: C1 is split into
   gref = C1 / 9 and nref = C1 % 9, with 4 mapped to 10 in each.  Values
   below 93 use the newer format, which reads one more byte C2 with
   ONE_MORE_BYTE and encodes (C1 - 81, C2 - 32).

   NOTE(review): lines before the first test and after the last branch
   are not visible in this view (e.g. any adjustment of C1 before the
   comparison, or the handling of values >= 93).  */
2753 #define DECODE_COMPOSITION_RULE(c1) \
2756 if (c1 < 81) /* old format (before ver.21) */ \
2758 int gref = (c1) / 9; \
2759 int nref = (c1) % 9; \
2760 if (gref == 4) gref = 10; \
2761 if (nref == 4) nref = 10; \
2762 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2764 else if (c1 < 93) /* new format (after ver.21) */ \
2766 ONE_MORE_BYTE (c2); \
2767 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode the ISO-2022 byte sequence of CODING, starting at
   coding->source + coding->consumed, into the integer buffer
   coding->charbuf.  On exit coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated.

   Each source byte is classified with iso_code_class[] and handled as
   a graphic character of the charset invoked to plane 0 or 1, a
   control code, a shift function, or an escape sequence (designation,
   invocation, direction, composition, CTEXT extensions).  Charset
   changes are recorded with ADD_CHARSET_DATA and compositions with the
   DECODE_COMPOSITION_* macros.

   The direction case ESC '[' tests CODING_ISO_FLAG_DIRECTION with the
   flag test parenthesised, like every other flag test in this
   function.  The earlier spelling `! CODING_ISO_FLAGS (coding) & FLAG'
   applied `!' to the whole flag word before masking.

   NOTE(review): many lines of this definition (braces, `case' labels,
   ONE_MORE_BYTE calls, `goto invalid_code' statements, some
   declarations) are not visible in this view; the text below is kept
   exactly as shown apart from the change described above and comment
   corrections.  */
2777 decode_coding_iso_2022 (coding
)
2778 struct coding_system
*coding
;
2780 unsigned char *src
= coding
->source
+ coding
->consumed
;
2781 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2782 unsigned char *src_base
;
2783 int *charbuf
= coding
->charbuf
;
2785 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2786 int consumed_chars
= 0, consumed_chars_base
;
2787 int multibytep
= coding
->src_multibyte
;
2788 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2789 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2790 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2791 struct charset
*charset
;
2793 /* For handling composition sequence. */
2794 #define COMPOSING_NO 0
2795 #define COMPOSING_CHAR 1
2796 #define COMPOSING_RULE 2
2797 #define COMPOSING_COMPONENT_CHAR 3
2798 #define COMPOSING_COMPONENT_RULE 4
2800 int composition_state
= COMPOSING_NO
;
2801 enum composition_method method
;
2802 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2805 Lisp_Object attrs
, eol_type
, charset_list
;
2806 int char_offset
= coding
->produced_char
;
2807 int last_offset
= char_offset
;
2808 int last_id
= charset_ascii
;
2810 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2811 setup_iso_safe_charsets (attrs
);
2818 consumed_chars_base
= consumed_chars
;
2820 if (charbuf
>= charbuf_end
)
2825 /* We produce at most one character. */
2826 switch (iso_code_class
[c1
])
2828 case ISO_0x20_or_0x7F
:
2829 if (composition_state
!= COMPOSING_NO
)
2831 if (composition_state
== COMPOSING_RULE
2832 || composition_state
== COMPOSING_COMPONENT_RULE
)
2834 DECODE_COMPOSITION_RULE (c1
);
2835 components
[component_idx
++] = c1
;
2836 composition_state
--;
2840 if (charset_id_0
< 0
2841 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2842 /* This is SPACE or DEL. */
2843 charset
= CHARSET_FROM_ID (charset_ascii
);
2845 charset
= CHARSET_FROM_ID (charset_id_0
);
2848 case ISO_graphic_plane_0
:
2849 if (composition_state
!= COMPOSING_NO
)
2851 if (composition_state
== COMPOSING_RULE
2852 || composition_state
== COMPOSING_COMPONENT_RULE
)
2854 DECODE_COMPOSITION_RULE (c1
);
2855 components
[component_idx
++] = c1
;
2856 composition_state
--;
2860 charset
= CHARSET_FROM_ID (charset_id_0
);
2863 case ISO_0xA0_or_0xFF
:
2864 if (charset_id_1
< 0
2865 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2866 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2868 /* This is a graphic character, we fall down ... */
2870 case ISO_graphic_plane_1
:
2871 if (charset_id_1
< 0)
2873 charset
= CHARSET_FROM_ID (charset_id_1
);
2876 case ISO_carriage_return
:
2879 if (EQ (eol_type
, Qdos
))
2883 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
2884 goto no_more_source
;
2889 else if (EQ (eol_type
, Qmac
))
2895 MAYBE_FINISH_COMPOSITION ();
2896 charset
= CHARSET_FROM_ID (charset_ascii
);
2900 MAYBE_FINISH_COMPOSITION ();
2904 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2905 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2907 CODING_ISO_INVOCATION (coding
, 0) = 1;
2908 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2912 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2914 CODING_ISO_INVOCATION (coding
, 0) = 0;
2915 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2918 case ISO_single_shift_2_7
:
2919 case ISO_single_shift_2
:
2920 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2922 /* SS2 is handled as an escape sequence of ESC 'N' */
2924 goto label_escape_sequence
;
2926 case ISO_single_shift_3
:
2927 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2929 /* SS3 is handled as an escape sequence of ESC 'O' */
2931 goto label_escape_sequence
;
2933 case ISO_control_sequence_introducer
:
2934 /* CSI is handled as an escape sequence of ESC '[' ... */
2936 goto label_escape_sequence
;
2940 label_escape_sequence
:
2941 /* Escape sequences handled here are invocation,
2942 designation, direction specification, and character
2943 composition specification. */
2946 case '&': /* revision of following character set */
2948 if (!(c1
>= '@' && c1
<= '~'))
2951 if (c1
!= ISO_CODE_ESC
)
2954 goto label_escape_sequence
;
2956 case '$': /* designation of 2-byte character set */
2957 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2960 if (c1
>= '@' && c1
<= 'B')
2961 { /* designation of JISX0208.1978, GB2312.1980,
2963 DECODE_DESIGNATION (0, 2, 0, c1
);
2965 else if (c1
>= 0x28 && c1
<= 0x2B)
2966 { /* designation of DIMENSION2_CHARS94 character set */
2968 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2970 else if (c1
>= 0x2C && c1
<= 0x2F)
2971 { /* designation of DIMENSION2_CHARS96 character set */
2973 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2977 /* We must update these variables now. */
2978 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2979 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2982 case 'n': /* invocation of locking-shift-2 */
2983 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2984 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2986 CODING_ISO_INVOCATION (coding
, 0) = 2;
2987 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2990 case 'o': /* invocation of locking-shift-3 */
2991 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2992 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2994 CODING_ISO_INVOCATION (coding
, 0) = 3;
2995 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2998 case 'N': /* invocation of single-shift-2 */
2999 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3000 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3002 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3004 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3008 case 'O': /* invocation of single-shift-3 */
3009 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3010 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3012 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3014 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3018 case '0': case '2': case '3': case '4': /* start composition */
3019 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3021 DECODE_COMPOSITION_START (c1
);
3024 case '1': /* end composition */
3025 if (composition_state
== COMPOSING_NO
)
3027 DECODE_COMPOSITION_END ();
3030 case '[': /* specification of direction */
/* The mask must be applied before the negation, hence the inner
   parentheses.  */
3031 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3033 /* For the moment, nested direction is not supported.
3034 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3035 left-to-right, and nonzero means right-to-left. */
3039 case ']': /* end of the current direction */
3040 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3042 case '0': /* end of the current direction */
3043 case '1': /* start of left-to-right direction */
3046 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3051 case '2': /* start of right-to-left direction */
3054 coding
->mode
|= CODING_MODE_DIRECTION
;
3068 /* CTEXT extended segment:
3069 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3070 We keep these bytes as is for the moment.
3071 They may be decoded by post-read-conversion. */
3075 ONE_MORE_BYTE (dim
);
3078 size
= ((M
- 128) * 128) + (L
- 128);
3079 if (charbuf
+ 8 + size
> charbuf_end
)
3081 *charbuf
++ = ISO_CODE_ESC
;
3085 *charbuf
++ = BYTE8_TO_CHAR (M
);
3086 *charbuf
++ = BYTE8_TO_CHAR (L
);
3090 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3095 /* XFree86 extension for embedding UTF-8 in CTEXT:
3096 ESC % G --UTF-8-BYTES-- ESC % @
3097 We keep these bytes as is for the moment.
3098 They may be decoded by post-read-conversion. */
3101 if (p
+ 6 > charbuf_end
)
3103 *p
++ = ISO_CODE_ESC
;
3106 while (p
< charbuf_end
)
3109 if (c1
== ISO_CODE_ESC
3110 && src
+ 1 < src_end
3114 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3116 if (p
+ 3 > charbuf_end
)
3118 *p
++ = ISO_CODE_ESC
;
3129 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3131 if (c1
>= 0x28 && c1
<= 0x2B)
3132 { /* designation of DIMENSION1_CHARS94 character set */
3134 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3136 else if (c1
>= 0x2C && c1
<= 0x2F)
3137 { /* designation of DIMENSION1_CHARS96 character set */
3139 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3143 /* We must update these variables now. */
3144 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3145 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
/* Record a charset annotation whenever the charset of the produced
   characters changes to a non-ASCII charset.  */
3150 if (charset
->id
!= charset_ascii
3151 && last_id
!= charset
->id
)
3153 if (last_id
!= charset_ascii
)
3154 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3155 last_id
= charset
->id
;
3156 last_offset
= char_offset
;
3159 /* Now we know CHARSET and 1st position code C1 of a character.
3160 Produce a decoded character while getting 2nd position code
3163 if (CHARSET_DIMENSION (charset
) > 1)
3166 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3167 /* C2 is not in a valid range. */
3169 c1
= (c1
<< 8) | (c2
& 0x7F);
3170 if (CHARSET_DIMENSION (charset
) > 2)
3173 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3174 /* C2 is not in a valid range. */
3176 c1
= (c1
<< 8) | (c2
& 0x7F);
3180 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3183 MAYBE_FINISH_COMPOSITION ();
3184 for (; src_base
< src
; src_base
++, char_offset
++)
3186 if (ASCII_BYTE_P (*src_base
))
3187 *charbuf
++ = *src_base
;
3189 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3192 else if (composition_state
== COMPOSING_NO
)
3199 components
[component_idx
++] = c
;
3200 if (method
== COMPOSITION_WITH_RULE
3201 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3203 composition_state
++;
3208 MAYBE_FINISH_COMPOSITION ();
3210 consumed_chars
= consumed_chars_base
;
3212 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Close the last charset run and report how much was consumed and
   produced.  */
3222 if (last_id
!= charset_ascii
)
3223 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3224 coding
->consumed_char
+= consumed_chars_base
;
3225 coding
->consumed
= src_base
- coding
->source
;
3226 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3230 /* ISO2022 encoding stuff. */
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JIS0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, then record CHARSET's ID as the designation of REG in CODING.

   When CODING_ISO_FLAG_REVISION is set and CHARSET has a non-negative
   revision, ESC '&' and '@' + revision are emitted first.  The
   designation itself is ESC, an intermediate byte taken from "()*+"
   (94-character sets) or ",-./" (96-character sets) indexed by REG,
   and CHARSET's final byte; charsets whose dimension is not 1 get '$'
   after ESC.  For those, the intermediate byte of a 94-character set
   is emitted only under the condition shown (CODING_ISO_FLAG_LONG_FORM,
   or a final byte outside '@'..'B', plus a term on a line not visible
   here).

   NOTE(review): braces, `else' lines and the declaration of `c' are
   not visible in this view.  */
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3267 if (revision >= 0) \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
/* Emit single-shift-2 for the caller's `coding': the two bytes ESC 'N'
   when CODING_ISO_FLAG_SEVEN_BITS is set, the single byte ISO_CODE_SS2
   otherwise (the `else' line is not visible in this view).  Then mark
   `coding' as single-shifting.  */
3304 #define ENCODE_SINGLE_SHIFT_2 \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* Emit single-shift-3 for the caller's `coding': the two bytes ESC 'O'
   when CODING_ISO_FLAG_SEVEN_BITS is set, the single byte ISO_CODE_SS3
   otherwise (the `else' line is not visible in this view).  Then mark
   `coding' as single-shifting.  */
3314 #define ENCODE_SINGLE_SHIFT_3 \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3324 /* The following four macros produce codes (control character or
3325 escape sequence) for ISO2022 locking-shift functions (shift-in,
3326 shift-out, locking-shift-2, and locking-shift-3). */
/* Emit the shift-in control ISO_CODE_SI and record that graphic
   register 0 is now invoked to graphic plane 0 of `coding'.  */
3328 #define ENCODE_SHIFT_IN \
3330 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3331 CODING_ISO_INVOCATION (coding, 0) = 0; \
/* Emit the shift-out control ISO_CODE_SO and record that graphic
   register 1 is now invoked to graphic plane 0 of `coding'.  */
3335 #define ENCODE_SHIFT_OUT \
3337 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3338 CODING_ISO_INVOCATION (coding, 0) = 1; \
/* Emit locking-shift-2 (ESC 'n') and record that graphic register 2 is
   now invoked to graphic plane 0 of `coding'.  */
3342 #define ENCODE_LOCKING_SHIFT_2 \
3344 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3345 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit locking-shift-3 and record that graphic register 3 is now
   invoked to graphic plane 0 of `coding'.

   The sequence is ESC 'o'.  ESC 'n' is locking-shift-2; emitting it
   here would make a decoder invoke register 2 while the encoder's own
   state says register 3.  */
#define ENCODE_LOCKING_SHIFT_3			\
  do {						\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');	\
    CODING_ISO_INVOCATION (coding, 0) = 3;	\
  } while (0)
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
/* Emit the one-byte position code C1 of a character of CHARSET for the
   caller's `coding'.

   With CODING_ISO_FLAG_USE_ROMAN, ASCII is replaced by
   charset_jisx0201_roman first.  While single-shifting, C1 is emitted
   with the high bit cleared (seven-bit flag set) or set, and
   single-shifting is turned off.  Otherwise C1 is emitted with the high
   bit cleared if CHARSET is invoked to plane 0, or with the high bit
   set if it is invoked to plane 1.  If neither holds,
   encode_invocation_designation is called to invoke or designate
   CHARSET.

   NOTE(review): the enclosing loop, braces and `break' statements are
   not visible in this view.  */
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3362 int id = CHARSET_ID (charset); \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
/* Emit the two-byte position code C1, C2 of a character of CHARSET for
   the caller's `coding'.

   With CODING_ISO_FLAG_USE_OLDJIS, charset_jisx0208 is replaced by
   charset_jisx0208_1978 first.  While single-shifting, both bytes are
   emitted with the high bit cleared (seven-bit flag set) or set, and
   single-shifting is turned off.  Otherwise both bytes are emitted
   with the high bit cleared if CHARSET is invoked to plane 0, or with
   the high bit set if it is invoked to plane 1.  If neither holds,
   encode_invocation_designation is called to invoke or designate
   CHARSET.

   NOTE(review): the enclosing loop, braces and `break' statements are
   not visible in this view.  */
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3406 int id = CHARSET_ID (charset); \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3439 dst = encode_invocation_designation (charset, coding, dst, \
/* Encode character C in CHARSET: obtain its code with ENCODE_CHAR and
   emit it with ENCODE_ISO_CHARACTER_DIMENSION1 when CHARSET has
   dimension 1, otherwise with ENCODE_ISO_CHARACTER_DIMENSION2, passing
   the code's bits above the low eight and its low eight bits as the
   two position codes (the `else' line is not visible in this view).  */
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
/* Make CHARSET usable in CODING by emitting, at DST, a designation
   sequence if CHARSET is not designated to any of the four graphic
   registers, and an invocation (shift) sequence if the register that
   holds it is not invoked to graphic plane 0 or 1.  *P_NCHARS is read
   into `produced_chars' on entry and written back on exit.

   NOTE(review): braces, some declarations, the `return' statement and
   the bodies of `case 0' and `case 1' are not visible in this view.  */
3460 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3461 struct charset
*charset
;
3462 struct coding_system
*coding
;
3466 int multibytep
= coding
->dst_multibyte
;
3467 int produced_chars
= *p_nchars
;
3468 int reg
; /* graphic register number */
3469 int id
= CHARSET_ID (charset
);
3471 /* At first, check designations. */
3472 for (reg
= 0; reg
< 4; reg
++)
3473 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg
= CODING_ISO_REQUEST (coding
, id
);
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3486 ENCODE_DESIGNATION (charset
, reg
, coding
);
3489 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3490 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3496 case 0: /* graphic register 0 */
3500 case 1: /* graphic register 1 */
/* Registers 2 and 3 use a single shift when the coding system allows
   it, and a locking shift otherwise.  */
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3506 ENCODE_SINGLE_SHIFT_2
;
3508 ENCODE_LOCKING_SHIFT_2
;
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3513 ENCODE_SINGLE_SHIFT_3
;
3515 ENCODE_LOCKING_SHIFT_3
;
3520 *p_nchars
= produced_chars
;
3524 /* The following three macros produce codes for indicating direction
/* Emit a control sequence introducer for the caller's `coding': the
   two bytes ESC '[' when CODING_ISO_FLAG_SEVEN_BITS is set, the single
   byte ISO_CODE_CSI otherwise.

   The flag is tested with `&', like every other flag test on
   CODING_ISO_FLAGS.  Comparing the whole flag word with `==' would
   select the 7-bit form only when no other flag happened to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)		\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)
/* Emit the sequence that starts right-to-left text: a control sequence
   introducer followed by '2' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list; a trailing `(dst)' would not expand
   to valid C.  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)
/* Emit the sequence that ends the current text direction: a control
   sequence introducer followed by '0' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list; a trailing `(dst)' would not expand
   to valid C.  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
/* Bring the caller's `coding' back to its initial ISO-2022 state.
   When graphic plane 0 does not have register 0 invoked, a statement on
   a line not visible here is executed (presumably a shift-in --
   confirm).  Then, for each of the four graphic registers that has an
   initial charset (CODING_ISO_INITIAL >= 0) different from its current
   designation, that initial charset is designated again with
   ENCODE_DESIGNATION.  */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3554 struct charset *charset; \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
/* Emit at DST the designation sequences needed by the characters in
   CHARBUF .. CHARBUF_END for the coding system CODING.

   For each character examined, its charset is found with char_charset
   in the coding system's charset list (Viso_2022_charset_list when the
   list is Qiso_2022) and the register it requests is looked up with
   CODING_ISO_REQUEST.  The table `r' collects, per register, a charset
   to designate; registers whose entry differs from the current
   designation are then designated with ENCODE_DESIGNATION.

   NOTE(review): the declaration and initialisation of `r', the loop
   over the characters, its termination condition and the `return'
   statement are on lines not visible in this view.  */
3575 static unsigned char *
3576 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3577 struct coding_system
*coding
;
3578 int *charbuf
, *charbuf_end
;
3581 struct charset
*charset
;
3582 /* Table of charsets to be designated to each graphic register. */
3584 int c
, found
= 0, reg
;
3585 int produced_chars
= 0;
3586 int multibytep
= coding
->dst_multibyte
;
3588 Lisp_Object charset_list
;
3590 attrs
= CODING_ID_ATTRS (coding
->id
);
3591 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3592 if (EQ (charset_list
, Qiso_2022
))
3593 charset_list
= Viso_2022_charset_list
;
3595 for (reg
= 0; reg
< 4; reg
++)
3605 charset
= char_charset (c
, charset_list
, NULL
);
3606 id
= CHARSET_ID (charset
);
3607 reg
= CODING_ISO_REQUEST (coding
, id
);
/* Only the first charset that requests a given register is noted.  */
3608 if (reg
>= 0 && r
[reg
] < 0)
3617 for (reg
= 0; reg
< 4; reg
++)
3619 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3629 encode_coding_iso_2022 (coding
)
3630 struct coding_system
*coding
;
3632 int multibytep
= coding
->dst_multibyte
;
3633 int *charbuf
= coding
->charbuf
;
3634 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3635 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3636 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3639 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding
));
3641 int produced_chars
= 0;
3642 Lisp_Object attrs
, eol_type
, charset_list
;
3643 int ascii_compatible
;
3645 int preferred_charset_id
= -1;
3647 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3648 setup_iso_safe_charsets (attrs
);
3649 /* Charset list may have been changed. */
3650 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3651 coding
->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3654 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3656 while (charbuf
< charbuf_end
)
3658 ASSURE_DESTINATION (safe_room
);
3660 if (bol_designation
)
3662 unsigned char *dst_prev
= dst
;
3664 /* We have to produce designation sequences if any now. */
3665 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3666 bol_designation
= 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars
+= dst
- dst_prev
;
3675 /* Handle an annotation. */
3678 case CODING_ANNOTATE_COMPOSITION_MASK
:
3679 /* Not yet implemented. */
3681 case CODING_ANNOTATE_CHARSET_MASK
:
3682 preferred_charset_id
= charbuf
[3];
3683 if (preferred_charset_id
>= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id
),
3686 preferred_charset_id
= -1;
3695 /* Now encode the character C. */
3696 if (c
< 0x20 || c
== 0x7F)
3699 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3701 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3707 for (i
= 0; i
< 4; i
++)
3708 CODING_ISO_DESIGNATION (coding
, i
)
3709 = CODING_ISO_INITIAL (coding
, i
);
3712 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3714 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c
);
3718 else if (ASCII_CHAR_P (c
))
3720 if (ascii_compatible
)
3721 EMIT_ONE_ASCII_BYTE (c
);
3724 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3725 ENCODE_ISO_CHARACTER (charset
, c
);
3728 else if (CHAR_BYTE8_P (c
))
3730 c
= CHAR_TO_BYTE8 (c
);
3735 struct charset
*charset
;
3737 if (preferred_charset_id
>= 0)
3739 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3740 if (! CHAR_CHARSET_P (c
, charset
))
3741 charset
= char_charset (c
, charset_list
, NULL
);
3744 charset
= char_charset (c
, charset_list
, NULL
);
3747 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3749 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3750 charset
= CHARSET_FROM_ID (charset_ascii
);
3754 c
= coding
->default_char
;
3755 charset
= char_charset (c
, charset_list
, NULL
);
3758 ENCODE_ISO_CHARACTER (charset
, c
);
3762 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3765 ASSURE_DESTINATION (safe_room
);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3768 coding
->result
= CODING_RESULT_SUCCESS
;
3769 CODING_ISO_BOL (coding
) = bol_designation
;
3770 coding
->produced_char
+= produced_chars
;
3771 coding
->produced
= dst
- coding
->destination
;
3776 /*** 8. Shift-JIS and BIG5 handlers ***/
3778 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3779 quite widely. So, for the moment, Emacs supports them directly in
3780 C code. But, in the future, they may be supported only by CCL. */
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in two bytes, but the two position-codes are divided and
3787 shifted so that they fit in the ranges below.
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and is encoded in two bytes.
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3817 detect_coding_sjis (coding
, detect_info
)
3818 struct coding_system
*coding
;
3819 struct coding_detection_info
*detect_info
;
3821 unsigned char *src
= coding
->source
, *src_base
= src
;
3822 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3823 int multibytep
= coding
->src_multibyte
;
3824 int consumed_chars
= 0;
3829 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src
+= coding
->head_ascii
;
3840 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3843 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3845 found
= CATEGORY_MASK_SJIS
;
3847 else if (c
>= 0xA0 && c
< 0xE0)
3848 found
= CATEGORY_MASK_SJIS
;
3852 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3856 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3858 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3861 detect_info
->found
|= found
;
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3870 detect_coding_big5 (coding
, detect_info
)
3871 struct coding_system
*coding
;
3872 struct coding_detection_info
*detect_info
;
3874 unsigned char *src
= coding
->source
, *src_base
= src
;
3875 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3876 int multibytep
= coding
->src_multibyte
;
3877 int consumed_chars
= 0;
3882 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src
+= coding
->head_ascii
;
3896 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3898 found
= CATEGORY_MASK_BIG5
;
3903 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3907 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3909 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3912 detect_info
->found
|= found
;
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 decode_coding_sjis decodes SJIS text; decode_coding_big5 decodes BIG5 text. */
3920 decode_coding_sjis (coding
)
3921 struct coding_system
*coding
;
3923 unsigned char *src
= coding
->source
+ coding
->consumed
;
3924 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3925 unsigned char *src_base
;
3926 int *charbuf
= coding
->charbuf
;
3927 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3928 int consumed_chars
= 0, consumed_chars_base
;
3929 int multibytep
= coding
->src_multibyte
;
3930 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3931 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3932 int char_offset
= coding
->produced_char
;
3933 int last_offset
= char_offset
;
3934 int last_id
= charset_ascii
;
3936 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3939 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3940 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3941 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3948 consumed_chars_base
= consumed_chars
;
3950 if (charbuf
>= charbuf_end
)
3957 if (EQ (eol_type
, Qdos
))
3961 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
3962 goto no_more_source
;
3967 else if (EQ (eol_type
, Qmac
))
3972 struct charset
*charset
;
3975 charset
= charset_roman
;
3980 if (c
< 0xA0 || c
>= 0xE0)
3982 /* SJIS -> JISX0208 */
3984 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3988 charset
= charset_kanji
;
3992 /* SJIS -> JISX0201-Kana */
3994 charset
= charset_kana
;
3997 if (charset
->id
!= charset_ascii
3998 && last_id
!= charset
->id
)
4000 if (last_id
!= charset_ascii
)
4001 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4002 last_id
= charset
->id
;
4003 last_offset
= char_offset
;
4005 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4013 consumed_chars
= consumed_chars_base
;
4015 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4021 if (last_id
!= charset_ascii
)
4022 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4023 coding
->consumed_char
+= consumed_chars_base
;
4024 coding
->consumed
= src_base
- coding
->source
;
4025 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4029 decode_coding_big5 (coding
)
4030 struct coding_system
*coding
;
4032 unsigned char *src
= coding
->source
+ coding
->consumed
;
4033 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4034 unsigned char *src_base
;
4035 int *charbuf
= coding
->charbuf
;
4036 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4037 int consumed_chars
= 0, consumed_chars_base
;
4038 int multibytep
= coding
->src_multibyte
;
4039 struct charset
*charset_roman
, *charset_big5
;
4040 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4041 int char_offset
= coding
->produced_char
;
4042 int last_offset
= char_offset
;
4043 int last_id
= charset_ascii
;
4045 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4047 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4048 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4055 consumed_chars_base
= consumed_chars
;
4057 if (charbuf
>= charbuf_end
)
4064 if (EQ (eol_type
, Qdos
))
4068 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4069 goto no_more_source
;
4074 else if (EQ (eol_type
, Qmac
))
4079 struct charset
*charset
;
4081 charset
= charset_roman
;
4085 if (c
< 0xA1 || c
> 0xFE)
4088 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4091 charset
= charset_big5
;
4093 if (charset
->id
!= charset_ascii
4094 && last_id
!= charset
->id
)
4096 if (last_id
!= charset_ascii
)
4097 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4098 last_id
= charset
->id
;
4099 last_offset
= char_offset
;
4101 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4110 consumed_chars
= consumed_chars_base
;
4112 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4118 if (last_id
!= charset_ascii
)
4119 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4120 coding
->consumed_char
+= consumed_chars_base
;
4121 coding
->consumed
= src_base
- coding
->source
;
4122 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4125 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4126 This function can encode charsets `ascii', `katakana-jisx0201',
4127 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4128 are sure that all these charsets are registered as official charset
4129 (i.e. do not have extended leading-codes). Characters of other
4130 charsets are produced without any encoding. encode_coding_sjis
4131 encodes SJIS text and encode_coding_big5 encodes BIG5 text. */
4134 encode_coding_sjis (coding
)
4135 struct coding_system
*coding
;
4137 int multibytep
= coding
->dst_multibyte
;
4138 int *charbuf
= coding
->charbuf
;
4139 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4140 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4141 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4143 int produced_chars
= 0;
4144 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4145 int ascii_compatible
;
4146 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4149 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4151 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4152 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4153 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4155 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4157 while (charbuf
< charbuf_end
)
4159 ASSURE_DESTINATION (safe_room
);
4161 /* Now encode the character C. */
4162 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4163 EMIT_ONE_ASCII_BYTE (c
);
4164 else if (CHAR_BYTE8_P (c
))
4166 c
= CHAR_TO_BYTE8 (c
);
4172 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4176 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4178 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4179 charset
= CHARSET_FROM_ID (charset_ascii
);
4183 c
= coding
->default_char
;
4184 charset
= char_charset (c
, charset_list
, &code
);
4187 if (code
== CHARSET_INVALID_CODE (charset
))
4189 if (charset
== charset_kanji
)
4193 c1
= code
>> 8, c2
= code
& 0xFF;
4194 EMIT_TWO_BYTES (c1
, c2
);
4196 else if (charset
== charset_kana
)
4197 EMIT_ONE_BYTE (code
| 0x80);
4199 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4202 coding
->result
= CODING_RESULT_SUCCESS
;
4203 coding
->produced_char
+= produced_chars
;
4204 coding
->produced
= dst
- coding
->destination
;
4209 encode_coding_big5 (coding
)
4210 struct coding_system
*coding
;
4212 int multibytep
= coding
->dst_multibyte
;
4213 int *charbuf
= coding
->charbuf
;
4214 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4215 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4216 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4218 int produced_chars
= 0;
4219 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4220 int ascii_compatible
;
4221 struct charset
*charset_roman
, *charset_big5
;
4224 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4226 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4227 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4228 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4230 while (charbuf
< charbuf_end
)
4232 ASSURE_DESTINATION (safe_room
);
4234 /* Now encode the character C. */
4235 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4236 EMIT_ONE_ASCII_BYTE (c
);
4237 else if (CHAR_BYTE8_P (c
))
4239 c
= CHAR_TO_BYTE8 (c
);
4245 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4249 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4251 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4252 charset
= CHARSET_FROM_ID (charset_ascii
);
4256 c
= coding
->default_char
;
4257 charset
= char_charset (c
, charset_list
, &code
);
4260 if (code
== CHARSET_INVALID_CODE (charset
))
4262 if (charset
== charset_big5
)
4266 c1
= code
>> 8, c2
= code
& 0xFF;
4267 EMIT_TWO_BYTES (c1
, c2
);
4270 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4273 coding
->result
= CODING_RESULT_SUCCESS
;
4274 coding
->produced_char
+= produced_chars
;
4275 coding
->produced
= dst
- coding
->destination
;
4280 /*** 9. CCL handlers ***/
4282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4283 Check if a text is encoded in a coding system whose
4284 encoder/decoder are written as CCL programs. If it is, return
4285 CATEGORY_MASK_CCL, else return 0. */
4288 detect_coding_ccl (coding
, detect_info
)
4289 struct coding_system
*coding
;
4290 struct coding_detection_info
*detect_info
;
4292 unsigned char *src
= coding
->source
, *src_base
= src
;
4293 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4294 int multibytep
= coding
->src_multibyte
;
4295 int consumed_chars
= 0;
4297 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4298 int head_ascii
= coding
->head_ascii
;
4301 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4303 coding
= &coding_categories
[coding_category_ccl
];
4304 attrs
= CODING_ID_ATTRS (coding
->id
);
4305 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4314 if ((valids
[c
] > 1))
4315 found
= CATEGORY_MASK_CCL
;
4317 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4321 detect_info
->found
|= found
;
/* Decode the source of CODING by running its CCL decoder program
   (CODING_CCL_DECODER) through ccl_driver.  The source is fed to the
   driver in chunks of at most 1024 elements (the size of
   source_charbuf), and the driver writes into coding->charbuf.  On
   exit coding->consumed_char, coding->consumed and
   coding->charbuf_used are updated, and coding->result is derived
   from ccl.status.  */
4326 decode_coding_ccl (coding
)
4327 struct coding_system
*coding
;
4329 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4330 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4331 int *charbuf
= coding
->charbuf
;
4332 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4333 int consumed_chars
= 0;
4334 int multibytep
= coding
->src_multibyte
;
4335 struct ccl_program ccl
;
/* One chunk of source elements handed to ccl_driver, and for each
   element the byte offset (relative to SRC) at which it starts.  */
4336 int source_charbuf
[1024];
4337 int source_byteidx
[1024];
4338 Lisp_Object attrs
, eol_type
, charset_list
;
4340 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4341 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4343 while (src
< src_end
)
4345 const unsigned char *p
= src
;
4346 int *source
, *source_end
;
/* Fill the chunk character by character, remembering where each
   character starts so that SRC can later be advanced in bytes.  */
4350 while (i
< 1024 && p
< src_end
)
4352 source_byteidx
[i
] = p
- src
;
4353 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
/* Fill the chunk byte by byte.  Note that source_byteidx is not
   written by this loop.  */
4356 while (i
< 1024 && p
< src_end
)
4357 source_charbuf
[i
++] = *p
++;
4359 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4362 source
= source_charbuf
;
4363 source_end
= source
+ i
;
/* Run the CCL program over the chunk, advancing by whatever the
   driver reports as consumed and produced.  */
4364 while (source
< source_end
)
4366 ccl_driver (&ccl
, source
, charbuf
,
4367 source_end
- source
, charbuf_end
- charbuf
,
4369 source
+= ccl
.consumed
;
4370 charbuf
+= ccl
.produced
;
4371 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
/* NOTE(review): source_byteidx is only filled by the
   STRING_CHAR_ADVANCE loop above; presumably this lookup is reached
   only on that path -- confirm.  */
4374 if (source
< source_end
)
4375 src
+= source_byteidx
[source
- source_charbuf
];
4378 consumed_chars
+= source
- source_charbuf
;
/* NOTE(review): the second operand compares ccl.status with
   CODING_RESULT_INSUFFICIENT_SRC, a constant that is otherwise only
   assigned to coding->result in this function, while every other
   test of ccl.status uses a CCL_STAT_* constant.  Confirm which
   CCL_STAT_* value was intended here.  */
4380 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4381 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4387 case CCL_STAT_SUSPEND_BY_SRC
:
4388 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4390 case CCL_STAT_SUSPEND_BY_DST
:
4393 case CCL_STAT_INVALID_CMD
:
4394 coding
->result
= CODING_RESULT_INTERRUPT
;
4397 coding
->result
= CODING_RESULT_SUCCESS
;
/* Publish the progress made back into CODING.  */
4400 coding
->consumed_char
+= consumed_chars
;
4401 coding
->consumed
= src
- coding
->source
;
4402 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4406 encode_coding_ccl (coding
)
4407 struct coding_system
*coding
;
4409 struct ccl_program ccl
;
4410 int multibytep
= coding
->dst_multibyte
;
4411 int *charbuf
= coding
->charbuf
;
4412 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4413 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4414 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4415 unsigned char *adjusted_dst_end
= dst_end
- 1;
4416 int destination_charbuf
[1024];
4417 int i
, produced_chars
= 0;
4418 Lisp_Object attrs
, eol_type
, charset_list
;
4420 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4421 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4423 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4424 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4426 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4428 int dst_bytes
= dst_end
- dst
;
4429 if (dst_bytes
> 1024)
4432 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4433 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4434 charbuf
+= ccl
.consumed
;
4436 for (i
= 0; i
< ccl
.produced
; i
++)
4437 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4440 for (i
= 0; i
< ccl
.produced
; i
++)
4441 *dst
++ = destination_charbuf
[i
] & 0xFF;
4442 produced_chars
+= ccl
.produced
;
4448 case CCL_STAT_SUSPEND_BY_SRC
:
4449 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4451 case CCL_STAT_SUSPEND_BY_DST
:
4452 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4455 case CCL_STAT_INVALID_CMD
:
4456 coding
->result
= CODING_RESULT_INTERRUPT
;
4459 coding
->result
= CODING_RESULT_SUCCESS
;
4463 coding
->produced_char
+= produced_chars
;
4464 coding
->produced
= dst
- coding
->destination
;
4470 /*** 10, 11. no-conversion handlers ***/
4472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decoder entry for raw text.  It converts nothing itself: it sets
   coding->chars_at_source to 1, resets the consumed counters to 0 and
   reports CODING_RESULT_SUCCESS.  Presumably chars_at_source tells
   the caller to take the characters directly from the source --
   confirm against the code that reads this flag.  */
4475 decode_coding_raw_text (coding
)
4476 struct coding_system
*coding
;
4478 coding
->chars_at_source
= 1;
4479 coding
->consumed_char
= 0;
4480 coding
->consumed
= 0;
4481 coding
->result
= CODING_RESULT_SUCCESS
;
/* Encoder entry for raw text.  Copies the characters in
   coding->charbuf (coding->charbuf_used elements) to the destination
   buffer starting at coding->destination + coding->produced, then
   updates coding->produced_char and coding->produced and sets
   coding->result to CODING_RESULT_SUCCESS.  The visible code
   distinguishes cases on coding->src_multibyte; ASCII characters,
   raw 8-bit characters (CHAR_BYTE8_P) and other characters are
   emitted differently.  */
4485 encode_coding_raw_text (coding
)
4486 struct coding_system
*coding
;
4488 int multibytep
= coding
->dst_multibyte
;
4489 int *charbuf
= coding
->charbuf
;
4490 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
/* DST starts just after the bytes already produced; DST_END marks
   the end of the destination buffer.  */
4491 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4492 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4493 int produced_chars
= 0;
4498 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4500 if (coding
->src_multibyte
)
4501 while (charbuf
< charbuf_end
)
4503 ASSURE_DESTINATION (safe_room
);
4505 if (ASCII_CHAR_P (c
))
4506 EMIT_ONE_ASCII_BYTE (c
);
4507 else if (CHAR_BYTE8_P (c
))
4509 c
= CHAR_TO_BYTE8 (c
);
/* Any other character: build its multibyte form in STR and emit it
   one byte at a time.  */
4514 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4516 CHAR_STRING_ADVANCE (c
, p1
);
4519 EMIT_ONE_BYTE (*p0
);
4525 while (charbuf
< charbuf_end
)
4527 ASSURE_DESTINATION (safe_room
);
4534 if (coding
->src_multibyte
)
4536 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4538 while (charbuf
< charbuf_end
)
4540 ASSURE_DESTINATION (safe_room
);
4542 if (ASCII_CHAR_P (c
))
4544 else if (CHAR_BYTE8_P (c
))
4545 *dst
++ = CHAR_TO_BYTE8 (c
);
4547 CHAR_STRING_ADVANCE (c
, dst
);
/* Plain element-by-element copy, bounded by both the source and the
   destination buffer.  */
4553 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4554 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4555 *dst
++ = *charbuf
++;
/* NOTE(review): coding->destination + coding->dst_bytes is how
   dst_end is initialised above, so this looks like dst - dst_end,
   which cannot be positive after a loop bounded by dst < dst_end.
   DST itself starts at coding->destination + coding->produced, so
   the number of bytes written here would be measured from that
   point.  Looks like a `dst_bytes' / `produced' mix-up -- confirm
   and fix.  */
4556 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4559 coding
->result
= CODING_RESULT_SUCCESS
;
4560 coding
->produced_char
+= produced_chars
;
4561 coding
->produced
= dst
- coding
->destination
;
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Check if a text is encoded in a charset-based coding system. If it
4567 is, return 1, else return 0. */
4570 detect_coding_charset (coding
, detect_info
)
4571 struct coding_system
*coding
;
4572 struct coding_detection_info
*detect_info
;
4574 unsigned char *src
= coding
->source
, *src_base
= src
;
4575 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4576 int multibytep
= coding
->src_multibyte
;
4577 int consumed_chars
= 0;
4578 Lisp_Object attrs
, valids
;
4581 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4583 coding
= &coding_categories
[coding_category_charset
];
4584 attrs
= CODING_ID_ATTRS (coding
->id
);
4585 valids
= AREF (attrs
, coding_attr_charset_valids
);
4587 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4588 src
+= coding
->head_ascii
;
4595 if (NILP (AREF (valids
, c
)))
4598 found
= CATEGORY_MASK_CHARSET
;
4600 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4604 detect_info
->found
|= found
;
4609 decode_coding_charset (coding
)
4610 struct coding_system
*coding
;
4612 unsigned char *src
= coding
->source
+ coding
->consumed
;
4613 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4614 unsigned char *src_base
;
4615 int *charbuf
= coding
->charbuf
;
4616 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4617 int consumed_chars
= 0, consumed_chars_base
;
4618 int multibytep
= coding
->src_multibyte
;
4619 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4620 int char_offset
= coding
->produced_char
;
4621 int last_offset
= char_offset
;
4622 int last_id
= charset_ascii
;
4624 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4625 valids
= AREF (attrs
, coding_attr_charset_valids
);
4632 consumed_chars_base
= consumed_chars
;
4634 if (charbuf
>= charbuf_end
)
4640 /* Here we assume that no charset maps '\r' to something
4642 if (EQ (eol_type
, Qdos
))
4646 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4647 goto no_more_source
;
4652 else if (EQ (eol_type
, Qmac
))
4658 struct charset
*charset
;
4663 val
= AREF (valids
, c
);
4668 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4669 dim
= CHARSET_DIMENSION (charset
);
4673 code
= (code
<< 8) | c
;
4676 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4681 /* VAL is a list of charset IDs. It is assured that the
4682 list is sorted by charset dimensions (smaller one
4686 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4687 dim
= CHARSET_DIMENSION (charset
);
4691 code
= (code
<< 8) | c
;
4694 CODING_DECODE_CHAR (coding
, src
, src_base
,
4695 src_end
, charset
, code
, c
);
4703 if (charset
->id
!= charset_ascii
4704 && last_id
!= charset
->id
)
4706 if (last_id
!= charset_ascii
)
4707 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4708 last_id
= charset
->id
;
4709 last_offset
= char_offset
;
4718 consumed_chars
= consumed_chars_base
;
4720 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4726 if (last_id
!= charset_ascii
)
4727 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4728 coding
->consumed_char
+= consumed_chars_base
;
4729 coding
->consumed
= src_base
- coding
->source
;
4730 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4734 encode_coding_charset (coding
)
4735 struct coding_system
*coding
;
4737 int multibytep
= coding
->dst_multibyte
;
4738 int *charbuf
= coding
->charbuf
;
4739 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4740 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4741 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4742 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4743 int produced_chars
= 0;
4744 Lisp_Object attrs
, eol_type
, charset_list
;
4745 int ascii_compatible
;
4748 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4749 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4751 while (charbuf
< charbuf_end
)
4753 struct charset
*charset
;
4756 ASSURE_DESTINATION (safe_room
);
4758 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4759 EMIT_ONE_ASCII_BYTE (c
);
4760 else if (CHAR_BYTE8_P (c
))
4762 c
= CHAR_TO_BYTE8 (c
);
4767 charset
= char_charset (c
, charset_list
, &code
);
4770 if (CHARSET_DIMENSION (charset
) == 1)
4771 EMIT_ONE_BYTE (code
);
4772 else if (CHARSET_DIMENSION (charset
) == 2)
4773 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4774 else if (CHARSET_DIMENSION (charset
) == 3)
4775 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4777 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4778 (code
>> 8) & 0xFF, code
& 0xFF);
4782 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4783 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4785 c
= coding
->default_char
;
4791 coding
->result
= CODING_RESULT_SUCCESS
;
4792 coding
->produced_char
+= produced_chars
;
4793 coding
->produced
= dst
- coding
->destination
;
4798 /*** 10. C library functions ***/
4800 /* Setup coding context CODING from information about CODING_SYSTEM.
4801 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4802 CODING_SYSTEM is invalid, signal an error. */
4805 setup_coding_system (coding_system
, coding
)
4806 Lisp_Object coding_system
;
4807 struct coding_system
*coding
;
4810 Lisp_Object eol_type
;
4811 Lisp_Object coding_type
;
4814 if (NILP (coding_system
))
4815 coding_system
= Qno_conversion
;
4817 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4819 attrs
= CODING_ID_ATTRS (coding
->id
);
4820 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4823 coding
->head_ascii
= -1;
4824 coding
->common_flags
4825 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4826 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4827 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4828 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4829 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4831 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4832 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4833 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4834 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4836 coding_type
= CODING_ATTR_TYPE (attrs
);
4837 if (EQ (coding_type
, Qundecided
))
4839 coding
->detector
= NULL
;
4840 coding
->decoder
= decode_coding_raw_text
;
4841 coding
->encoder
= encode_coding_raw_text
;
4842 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4844 else if (EQ (coding_type
, Qiso_2022
))
4847 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4849 /* Invoke graphic register 0 to plane 0. */
4850 CODING_ISO_INVOCATION (coding
, 0) = 0;
4851 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4852 CODING_ISO_INVOCATION (coding
, 1)
4853 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4854 /* Setup the initial status of designation. */
4855 for (i
= 0; i
< 4; i
++)
4856 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4857 /* Not single shifting initially. */
4858 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4859 /* Beginning of buffer should also be regarded as bol. */
4860 CODING_ISO_BOL (coding
) = 1;
4861 coding
->detector
= detect_coding_iso_2022
;
4862 coding
->decoder
= decode_coding_iso_2022
;
4863 coding
->encoder
= encode_coding_iso_2022
;
4864 if (flags
& CODING_ISO_FLAG_SAFE
)
4865 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4866 coding
->common_flags
4867 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4868 | CODING_REQUIRE_FLUSHING_MASK
);
4869 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4870 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4871 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4872 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4873 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4875 setup_iso_safe_charsets (attrs
);
4876 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4877 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4878 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4880 CODING_ISO_FLAGS (coding
) = flags
;
4882 else if (EQ (coding_type
, Qcharset
))
4884 coding
->detector
= detect_coding_charset
;
4885 coding
->decoder
= decode_coding_charset
;
4886 coding
->encoder
= encode_coding_charset
;
4887 coding
->common_flags
4888 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4890 else if (EQ (coding_type
, Qutf_8
))
4892 coding
->detector
= detect_coding_utf_8
;
4893 coding
->decoder
= decode_coding_utf_8
;
4894 coding
->encoder
= encode_coding_utf_8
;
4895 coding
->common_flags
4896 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4898 else if (EQ (coding_type
, Qutf_16
))
4900 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4901 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4902 : EQ (val
, Qt
) ? utf_16_with_bom
4903 : utf_16_without_bom
);
4904 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4905 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4906 : utf_16_little_endian
);
4907 CODING_UTF_16_SURROGATE (coding
) = 0;
4908 coding
->detector
= detect_coding_utf_16
;
4909 coding
->decoder
= decode_coding_utf_16
;
4910 coding
->encoder
= encode_coding_utf_16
;
4911 coding
->common_flags
4912 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4913 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4914 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4916 else if (EQ (coding_type
, Qccl
))
4918 coding
->detector
= detect_coding_ccl
;
4919 coding
->decoder
= decode_coding_ccl
;
4920 coding
->encoder
= encode_coding_ccl
;
4921 coding
->common_flags
4922 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4923 | CODING_REQUIRE_FLUSHING_MASK
);
4925 else if (EQ (coding_type
, Qemacs_mule
))
4927 coding
->detector
= detect_coding_emacs_mule
;
4928 coding
->decoder
= decode_coding_emacs_mule
;
4929 coding
->encoder
= encode_coding_emacs_mule
;
4930 coding
->common_flags
4931 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4932 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4933 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4935 Lisp_Object tail
, safe_charsets
;
4936 int max_charset_id
= 0;
4938 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4940 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4941 max_charset_id
= XFASTINT (XCAR (tail
));
4942 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4944 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4946 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4947 coding
->max_charset_id
= max_charset_id
;
4948 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4951 else if (EQ (coding_type
, Qshift_jis
))
4953 coding
->detector
= detect_coding_sjis
;
4954 coding
->decoder
= decode_coding_sjis
;
4955 coding
->encoder
= encode_coding_sjis
;
4956 coding
->common_flags
4957 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4959 else if (EQ (coding_type
, Qbig5
))
4961 coding
->detector
= detect_coding_big5
;
4962 coding
->decoder
= decode_coding_big5
;
4963 coding
->encoder
= encode_coding_big5
;
4964 coding
->common_flags
4965 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4967 else /* EQ (coding_type, Qraw_text) */
4969 coding
->detector
= NULL
;
4970 coding
->decoder
= decode_coding_raw_text
;
4971 coding
->encoder
= encode_coding_raw_text
;
4972 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4978 /* Return raw-text or one of its subsidiaries that has the same
4979 eol_type as CODING-SYSTEM.

   If the type attribute of CODING-SYSTEM is already `raw-text',
   CODING-SYSTEM itself is returned.  Otherwise the result is read
   from the eol-type vector of `raw-text' (element 2 of its spec):
   element 0 when the eol_type of CODING-SYSTEM is `unix', element 1
   when it is `dos', and element 2 in every other case.

   NOTE(review): the statements guarded by the NILP and VECTORP tests
   below are not visible in this listing; confirm what is returned
   for a nil CODING-SYSTEM and for a still-undecided (vector)
   eol_type.  */
4982 raw_text_coding_system (coding_system
)
4983 Lisp_Object coding_system
;
4985 Lisp_Object spec
, attrs
;
4986 Lisp_Object eol_type
, raw_text_eol_type
;
4988 if (NILP (coding_system
))
4990 spec
= CODING_SYSTEM_SPEC (coding_system
);
4991 attrs
= AREF (spec
, 0);
4993 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4994 return coding_system
;
4996 eol_type
= AREF (spec
, 2);
4997 if (VECTORP (eol_type
))
4999 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5000 raw_text_eol_type
= AREF (spec
, 2);
5001 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5002 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5003 : AREF (raw_text_eol_type
, 2));
5007 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5008 does, return one of the subsidiary that has the same eol-spec as
5009 PARENT. Otherwise, return CODING_SYSTEM. */
5012 coding_inherit_eol_type (coding_system
, parent
)
5013 Lisp_Object coding_system
, parent
;
5015 Lisp_Object spec
, attrs
, eol_type
;
5017 if (NILP (coding_system
))
5018 coding_system
= Qraw_text
;
5019 spec
= CODING_SYSTEM_SPEC (coding_system
);
5020 attrs
= AREF (spec
, 0);
5021 eol_type
= AREF (spec
, 2);
5022 if (VECTORP (eol_type
)
5025 Lisp_Object parent_spec
;
5026 Lisp_Object parent_eol_type
;
5029 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5030 parent_eol_type
= AREF (parent_spec
, 2);
5031 if (EQ (parent_eol_type
, Qunix
))
5032 coding_system
= AREF (eol_type
, 0);
5033 else if (EQ (parent_eol_type
, Qdos
))
5034 coding_system
= AREF (eol_type
, 1);
5035 else if (EQ (parent_eol_type
, Qmac
))
5036 coding_system
= AREF (eol_type
, 2);
5038 return coding_system
;
5041 /* Emacs has a mechanism to automatically detect a coding system if it
5042 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5043 it's impossible to distinguish some coding systems accurately
5044 because they use the same range of codes. So, at first, coding
5045 systems are classified into the following categories:
5047 o coding-category-emacs-mule
5049 The category for a coding system which has the same code range
5050 as Emacs' internal format. Assigned the coding-system (Lisp
5051 symbol) `emacs-mule' by default.
5053 o coding-category-sjis
5055 The category for a coding system which has the same code range
5056 as SJIS. Assigned the coding-system (Lisp
5057 symbol) `japanese-shift-jis' by default.
5059 o coding-category-iso-7
5061 The category for a coding system which has the same code range
5062 as ISO2022 of 7-bit environment. This doesn't use any locking
5063 shift and single shift functions. This can encode/decode all
5064 charsets. Assigned the coding-system (Lisp symbol)
5065 `iso-2022-7bit' by default.
5067 o coding-category-iso-7-tight
5069 Same as coding-category-iso-7 except that this can
5070 encode/decode only the specified charsets.
5072 o coding-category-iso-8-1
5074 The category for a coding system which has the same code range
5075 as ISO2022 of 8-bit environment and graphic plane 1 used only
5076 for DIMENSION1 charset. This doesn't use any locking shift
5077 and single shift functions. Assigned the coding-system (Lisp
5078 symbol) `iso-latin-1' by default.
5080 o coding-category-iso-8-2
5082 The category for a coding system which has the same code range
5083 as ISO2022 of 8-bit environment and graphic plane 1 used only
5084 for DIMENSION2 charset. This doesn't use any locking shift
5085 and single shift functions. Assigned the coding-system (Lisp
5086 symbol) `japanese-iso-8bit' by default.
5088 o coding-category-iso-7-else
5090 The category for a coding system which has the same code range
5091 as ISO2022 of 7-bit environment but uses locking shift or
5092 single shift functions. Assigned the coding-system (Lisp
5093 symbol) `iso-2022-7bit-lock' by default.
5095 o coding-category-iso-8-else
5097 The category for a coding system which has the same code range
5098 as ISO2022 of 8-bit environment but uses locking shift or
5099 single shift functions. Assigned the coding-system (Lisp
5100 symbol) `iso-2022-8bit-ss2' by default.
5102 o coding-category-big5
5104 The category for a coding system which has the same code range
5105 as BIG5. Assigned the coding-system (Lisp symbol)
5106 `cn-big5' by default.
5108 o coding-category-utf-8
5110 The category for a coding system which has the same code range
5111 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5112 symbol) `utf-8' by default.
5114 o coding-category-utf-16-be
5116 The category for a coding system in which a text has an
5117 Unicode signature (cf. Unicode Standard) in the order of BIG
5118 endian at the head. Assigned the coding-system (Lisp symbol)
5119 `utf-16-be' by default.
5121 o coding-category-utf-16-le
5123 The category for a coding system in which a text has an
5124 Unicode signature (cf. Unicode Standard) in the order of
5125 LITTLE endian at the head. Assigned the coding-system (Lisp
5126 symbol) `utf-16-le' by default.
5128 o coding-category-ccl
5130 The category for a coding system of which encoder/decoder is
5131 written in CCL programs. The default value is nil, i.e., no
5132 coding system is assigned.
5134 o coding-category-binary
5136 The category for a coding system not categorized in any of the
5137 above. Assigned the coding-system (Lisp symbol)
5138 `no-conversion' by default.
5140 Each of them is a Lisp symbol and the value is an actual
5141 `coding-system' (this is also a Lisp symbol) assigned by a user.
5142 What Emacs actually does is to detect a category of coding system.
5143 Then, it uses the `coding-system' assigned to that category. If Emacs
5144 can't narrow the text down to a single category, it selects the category of the
5145 highest priority. Priorities of categories are also specified by a
5146 user in a Lisp variable `coding-category-list'.
5150 #define EOL_SEEN_NONE 0
5151 #define EOL_SEEN_LF 1
5152 #define EOL_SEEN_CR 2
5153 #define EOL_SEEN_CRLF 4
5155 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5156 SOURCE is encoded. If CATEGORY is one of
5157 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5158 two-byte, else they are encoded by one-byte.
5160 Return one of EOL_SEEN_XXX. */
5162 #define MAX_EOL_CHECK_COUNT 3
5165 detect_eol (source
, src_bytes
, category
)
5166 unsigned char *source
;
5167 EMACS_INT src_bytes
;
5168 enum coding_category category
;
5170 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5173 int eol_seen
= EOL_SEEN_NONE
;
5175 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5179 msb
= category
== (coding_category_utf_16_le
5180 | coding_category_utf_16_le_nosig
);
5183 while (src
+ 1 < src_end
)
5186 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5191 this_eol
= EOL_SEEN_LF
;
5192 else if (src
+ 3 >= src_end
5193 || src
[msb
+ 2] != 0
5194 || src
[lsb
+ 2] != '\n')
5195 this_eol
= EOL_SEEN_CR
;
5197 this_eol
= EOL_SEEN_CRLF
;
5199 if (eol_seen
== EOL_SEEN_NONE
)
5200 /* This is the first end-of-line. */
5201 eol_seen
= this_eol
;
5202 else if (eol_seen
!= this_eol
)
5204 /* The found type is different from what found before. */
5205 eol_seen
= EOL_SEEN_LF
;
5208 if (++total
== MAX_EOL_CHECK_COUNT
)
5216 while (src
< src_end
)
5219 if (c
== '\n' || c
== '\r')
5224 this_eol
= EOL_SEEN_LF
;
5225 else if (src
>= src_end
|| *src
!= '\n')
5226 this_eol
= EOL_SEEN_CR
;
5228 this_eol
= EOL_SEEN_CRLF
, src
++;
5230 if (eol_seen
== EOL_SEEN_NONE
)
5231 /* This is the first end-of-line. */
5232 eol_seen
= this_eol
;
5233 else if (eol_seen
!= this_eol
)
5235 /* The found type is different from what found before. */
5236 eol_seen
= EOL_SEEN_LF
;
5239 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Replace CODING->id by the id of the subsidiary coding system that
   matches EOL_SEEN, a mask of EOL_SEEN_XXX bits.  The eol-type of the
   current CODING->id is indexed as a vector: element 0 is used for
   LF, element 1 for CRLF and element 2 for CR.  The bits are tested
   in that order, so when several are set the first match wins; when
   none is set CODING->id is left untouched.

   NOTE(review): this indexes the eol-type with AREF without checking
   it; the callers visible in this file test VECTORP before calling --
   confirm for any others.  */
5249 adjust_coding_eol_type (coding
, eol_seen
)
5250 struct coding_system
*coding
;
5253 Lisp_Object eol_type
;
5255 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5256 if (eol_seen
& EOL_SEEN_LF
)
5257 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5258 else if (eol_seen
& EOL_SEEN_CRLF
)
5259 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5260 else if (eol_seen
& EOL_SEEN_CR
)
5261 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5264 /* Detect how a text specified in CODING is encoded. If a coding
5265 system is detected, update fields of CODING by the detected coding
5269 detect_coding (coding
)
5270 struct coding_system
*coding
;
5272 unsigned char *src
, *src_end
;
5273 Lisp_Object attrs
, coding_type
;
5275 coding
->consumed
= coding
->consumed_char
= 0;
5276 coding
->produced
= coding
->produced_char
= 0;
5277 coding_set_source (coding
);
5279 src_end
= coding
->source
+ coding
->src_bytes
;
5281 /* If we have not yet decided the text encoding type, detect it
5283 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5287 for (src
= coding
->source
; src
< src_end
; src
++)
5290 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5292 || c
== ISO_CODE_SO
)))
5295 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5297 if (coding
->head_ascii
< coding
->src_bytes
)
5299 struct coding_detection_info detect_info
;
5300 enum coding_category category
;
5301 struct coding_system
*this;
5303 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5304 for (i
= 0; i
< coding_category_raw_text
; i
++)
5306 category
= coding_priorities
[i
];
5307 this = coding_categories
+ category
;
5310 /* No coding system of this category is defined. */
5311 detect_info
.rejected
|= (1 << category
);
5313 else if (category
>= coding_category_raw_text
)
5315 else if (detect_info
.checked
& (1 << category
))
5317 if (detect_info
.found
& (1 << category
))
5320 else if ((*(this->detector
)) (coding
, &detect_info
)
5321 && detect_info
.found
& (1 << category
))
5324 if (i
< coding_category_raw_text
)
5325 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5326 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5327 setup_coding_system (Qraw_text
, coding
);
5328 else if (detect_info
.rejected
)
5329 for (i
= 0; i
< coding_category_raw_text
; i
++)
5330 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5332 this = coding_categories
+ coding_priorities
[i
];
5333 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5338 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qutf_16
))
5340 Lisp_Object coding_systems
;
5341 struct coding_detection_info detect_info
;
5344 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5345 detect_info
.found
= detect_info
.rejected
= 0;
5346 if (CONSP (coding_systems
)
5347 && detect_coding_utf_16 (coding
, &detect_info
)
5348 && (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
5349 | CATEGORY_MASK_UTF_16_BE
)))
5351 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5352 setup_coding_system (XCAR (coding_systems
), coding
);
5354 setup_coding_system (XCDR (coding_systems
), coding
);
5358 attrs
= CODING_ID_ATTRS (coding
->id
);
5359 coding_type
= CODING_ATTR_TYPE (attrs
);
5361 /* If we have not yet decided the EOL type, detect it now. But, the
5362 detection is impossible for a CCL based coding system, in which
5363 case, we detect the EOL type after decoding. */
5364 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5365 && ! EQ (coding_type
, Qccl
))
5367 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5368 XINT (CODING_ATTR_CATEGORY (attrs
)));
5370 if (eol_seen
!= EOL_SEEN_NONE
)
5371 adjust_coding_eol_type (coding
, eol_seen
);
5378 struct coding_system
*coding
;
5380 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5382 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5383 unsigned char *pend
= p
+ coding
->produced
;
5384 int eol_seen
= EOL_SEEN_NONE
;
5386 for (; p
< pend
; p
++)
5389 eol_seen
|= EOL_SEEN_LF
;
5390 else if (*p
== '\r')
5392 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5394 eol_seen
|= EOL_SEEN_CRLF
;
5398 eol_seen
|= EOL_SEEN_CR
;
5401 if (eol_seen
!= EOL_SEEN_NONE
)
5402 adjust_coding_eol_type (coding
, eol_seen
);
5405 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5407 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5408 unsigned char *pend
= p
+ coding
->produced
;
5410 for (; p
< pend
; p
++)
5414 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5416 unsigned char *p
, *pbeg
, *pend
;
5417 Lisp_Object undo_list
;
5419 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5420 coding
->dst_pos_byte
+ coding
->produced
);
5421 undo_list
= current_buffer
->undo_list
;
5422 current_buffer
->undo_list
= Qt
;
5423 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5424 current_buffer
->undo_list
= undo_list
;
5426 pend
= pbeg
+ coding
->produced
;
5428 for (p
= pend
- 1; p
>= pbeg
; p
--)
5431 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5434 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5435 coding
->produced
= pend
- pbeg
;
5436 insert_from_gap (coding
->produced_char
, coding
->produced
);
5441 translate_chars (coding
, table
)
5442 struct coding_system
*coding
;
5445 int *charbuf
= coding
->charbuf
;
5446 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5449 if (coding
->chars_at_source
)
5452 while (charbuf
< charbuf_end
)
5458 *charbuf
++ = translate_char (table
, c
);
5463 produce_chars (coding
)
5464 struct coding_system
*coding
;
5466 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5467 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5469 int produced_chars
= 0;
5471 if (! coding
->chars_at_source
)
5473 /* Characters are in coding->charbuf. */
5474 int *buf
= coding
->charbuf
;
5475 int *buf_end
= buf
+ coding
->charbuf_used
;
5476 unsigned char *adjusted_dst_end
;
5478 if (BUFFERP (coding
->src_object
)
5479 && EQ (coding
->src_object
, coding
->dst_object
))
5480 dst_end
= coding
->source
+ coding
->consumed
;
5481 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5483 while (buf
< buf_end
)
5487 if (dst
>= adjusted_dst_end
)
5489 dst
= alloc_destination (coding
,
5490 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5492 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5493 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5497 if (coding
->dst_multibyte
5498 || ! CHAR_BYTE8_P (c
))
5499 CHAR_STRING_ADVANCE (c
, dst
);
5501 *dst
++ = CHAR_TO_BYTE8 (c
);
5505 /* This is an annotation datum. (-C) is the length of
5512 unsigned char *src
= coding
->source
;
5513 unsigned char *src_end
= src
+ coding
->src_bytes
;
5514 Lisp_Object eol_type
;
5516 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5518 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5520 if (coding
->src_multibyte
)
5527 unsigned char *src_base
= src
;
5533 if (EQ (eol_type
, Qdos
))
5537 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5538 goto no_more_source
;
5543 else if (EQ (eol_type
, Qmac
))
5548 coding
->consumed
= src
- coding
->source
;
5550 if (EQ (coding
->src_object
, coding
->dst_object
))
5554 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5556 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5557 coding_set_source (coding
);
5558 src
= coding
->source
+ coding
->consumed
;
5559 src_end
= coding
->source
+ coding
->src_bytes
;
5569 while (src
< src_end
)
5576 if (EQ (eol_type
, Qdos
))
5582 else if (EQ (eol_type
, Qmac
))
5585 if (dst
>= dst_end
- 1)
5587 coding
->consumed
= src
- coding
->source
;
5589 if (EQ (coding
->src_object
, coding
->dst_object
))
5591 if (dst
>= dst_end
- 1)
5593 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5595 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5596 coding_set_source (coding
);
5597 src
= coding
->source
+ coding
->consumed
;
5598 src_end
= coding
->source
+ coding
->src_bytes
;
5606 if (!EQ (coding
->src_object
, coding
->dst_object
))
5608 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5612 EMACS_INT offset
= src
- coding
->source
;
5614 dst
= alloc_destination (coding
, require
, dst
);
5615 coding_set_source (coding
);
5616 src
= coding
->source
+ offset
;
5617 src_end
= coding
->source
+ coding
->src_bytes
;
5620 produced_chars
= coding
->src_chars
;
5621 while (src
< src_end
)
5627 if (EQ (eol_type
, Qdos
))
5634 else if (EQ (eol_type
, Qmac
))
5640 coding
->consumed
= coding
->src_bytes
;
5641 coding
->consumed_char
= coding
->src_chars
;
5644 produced
= dst
- (coding
->destination
+ coding
->produced
);
5645 if (BUFFERP (coding
->dst_object
))
5646 insert_from_gap (produced_chars
, produced
);
5647 coding
->produced
+= produced
;
5648 coding
->produced_char
+= produced_chars
;
5649 return produced_chars
;
5652 /* Compose text in CODING->dst_object according to the annotation data at
5653 CHARBUF. CHARBUF is an array:
5654 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5658 produce_composition (coding
, charbuf
)
5659 struct coding_system
*coding
;
5664 enum composition_method method
;
5665 Lisp_Object components
;
5668 from
= coding
->dst_pos
+ charbuf
[2];
5669 to
= coding
->dst_pos
+ charbuf
[3];
5670 method
= (enum composition_method
) (charbuf
[4]);
5672 if (method
== COMPOSITION_RELATIVE
)
5676 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5681 for (i
= 0; i
< len
; i
++)
5682 args
[i
] = make_number (charbuf
[i
]);
5683 components
= (method
== COMPOSITION_WITH_ALTCHARS
5684 ? Fstring (len
, args
) : Fvector (len
, args
));
5686 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5690 /* Put `charset' property on text in CODING->dst_object according to
5691 the annotation data at CHARBUF. CHARBUF is an array:
5692 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5696 produce_charset (coding
, charbuf
)
5697 struct coding_system
*coding
;
5700 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5701 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5702 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5704 Fput_text_property (make_number (from
), make_number (to
),
5705 Qcharset
, CHARSET_NAME (charset
),
5706 coding
->dst_object
);
5710 #define CHARBUF_SIZE 0x4000
5712 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5714 int size = CHARBUF_SIZE;; \
5716 coding->charbuf = NULL; \
5717 while (size > 1024) \
5719 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5720 if (coding->charbuf) \
5724 if (! coding->charbuf) \
5726 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5727 return coding->result; \
5729 coding->charbuf_size = size; \
5734 produce_annotation (coding
)
5735 struct coding_system
*coding
;
5737 int *charbuf
= coding
->charbuf
;
5738 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5740 if (NILP (coding
->dst_object
))
5743 while (charbuf
< charbuf_end
)
5749 int len
= -*charbuf
;
5752 case CODING_ANNOTATE_COMPOSITION_MASK
:
5753 produce_composition (coding
, charbuf
);
5755 case CODING_ANNOTATE_CHARSET_MASK
:
5756 produce_charset (coding
, charbuf
);
5766 /* Decode the data at CODING->src_object into CODING->dst_object.
5767 CODING->src_object is a buffer, a string, or nil.
5768 CODING->dst_object is a buffer.
5770 If CODING->src_object is a buffer, it must be the current buffer.
5771 In this case, if CODING->src_pos is positive, it is a position of
5772 the source text in the buffer, otherwise, the source text is in the
5773 gap area of the buffer, and CODING->src_pos specifies the offset of
5774 the text from GPT (which must be the same as PT). If this is the
5775 same buffer as CODING->dst_object, CODING->src_pos must be
5778 If CODING->src_object is a string, CODING->src_pos is an index to
5781 If CODING->src_object is nil, CODING->source must already point to
5782 the non-relocatable memory area. In this case, CODING->src_pos is
5783 an offset from CODING->source.
5785 The decoded data is inserted at the current point of the buffer
5790 decode_coding (coding
)
5791 struct coding_system
*coding
;
5795 if (BUFFERP (coding
->src_object
)
5796 && coding
->src_pos
> 0
5797 && coding
->src_pos
< GPT
5798 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5799 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5801 if (BUFFERP (coding
->dst_object
))
5803 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5804 set_buffer_internal (XBUFFER (coding
->dst_object
));
5806 move_gap_both (PT
, PT_BYTE
);
5809 coding
->consumed
= coding
->consumed_char
= 0;
5810 coding
->produced
= coding
->produced_char
= 0;
5811 coding
->chars_at_source
= 0;
5812 coding
->result
= CODING_RESULT_SUCCESS
;
5815 ALLOC_CONVERSION_WORK_AREA (coding
);
5817 attrs
= CODING_ID_ATTRS (coding
->id
);
5821 coding_set_source (coding
);
5822 coding
->annotated
= 0;
5823 (*(coding
->decoder
)) (coding
);
5824 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5825 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5826 else if (!NILP (Vstandard_translation_table_for_decode
))
5827 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5828 coding_set_destination (coding
);
5829 produce_chars (coding
);
5830 if (coding
->annotated
)
5831 produce_annotation (coding
);
5833 while (coding
->consumed
< coding
->src_bytes
5834 && ! coding
->result
);
5836 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5837 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5838 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5839 decode_eol (coding
);
5841 coding
->carryover_bytes
= 0;
5842 if (coding
->consumed
< coding
->src_bytes
)
5844 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5847 coding_set_source (coding
);
5848 coding_set_destination (coding
);
5849 src
= coding
->source
+ coding
->consumed
;
5851 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5853 /* Flush out unprocessed data as binary chars. We are sure
5854 that the number of data is less than the size of
5856 while (nbytes
-- > 0)
5860 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5862 produce_chars (coding
);
5866 /* Record unprocessed bytes in coding->carryover. We are
5867 sure that the number of data is less than the size of
5868 coding->carryover. */
5869 unsigned char *p
= coding
->carryover
;
5871 coding
->carryover_bytes
= nbytes
;
5872 while (nbytes
-- > 0)
5875 coding
->consumed
= coding
->src_bytes
;
5878 return coding
->result
;
5882 /* Extract an annotation datum from a composition starting at POS and
5883 ending before LIMIT of CODING->src_object (buffer or string), store
5884 the data in BUF, set *STOP to a starting position of the next
5885 composition (if any) or to LIMIT, and return the address of the
5886 next element of BUF.
5888 If such an annotation is not found, set *STOP to a starting
5889 position of a composition after POS (if any) or to LIMIT, and
5893 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5894 EMACS_INT pos
, limit
;
5895 struct coding_system
*coding
;
5899 EMACS_INT start
, end
;
5902 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5905 else if (start
> pos
)
5911 /* We found a composition. Store the corresponding
5912 annotation data in BUF. */
5914 enum composition_method method
= COMPOSITION_METHOD (prop
);
5915 int nchars
= COMPOSITION_LENGTH (prop
);
5917 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5918 if (method
!= COMPOSITION_RELATIVE
)
5920 Lisp_Object components
;
5923 components
= COMPOSITION_COMPONENTS (prop
);
5924 if (VECTORP (components
))
5926 len
= XVECTOR (components
)->size
;
5927 for (i
= 0; i
< len
; i
++)
5928 *buf
++ = XINT (AREF (components
, i
));
5930 else if (STRINGP (components
))
5932 len
= XSTRING (components
)->size
;
5936 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5940 else if (INTEGERP (components
))
5943 *buf
++ = XINT (components
);
5945 else if (CONSP (components
))
5947 for (len
= 0; CONSP (components
);
5948 len
++, components
= XCDR (components
))
5949 *buf
++ = XINT (XCAR (components
));
5957 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5968 /* Extract an annotation datum from a text property `charset' at POS of
5969 CODING->src_object (buffer or string), store the data in BUF, set
5970 *STOP to the position where the value of `charset' property changes
5971 (limiting by LIMIT), and return the address of the next element of
5974 If the property value is nil, set *STOP to the position where the
5975 property value is non-nil (limiting by LIMIT), and return BUF. */
5978 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5979 EMACS_INT pos
, limit
;
5980 struct coding_system
*coding
;
5984 Lisp_Object val
, next
;
5987 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
5988 if (! NILP (val
) && CHARSETP (val
))
5989 id
= XINT (CHARSET_SYMBOL_ID (val
));
5992 ADD_CHARSET_DATA (buf
, 0, 0, id
);
5993 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
5995 make_number (limit
));
5996 *stop
= XINT (next
);
6002 consume_chars (coding
)
6003 struct coding_system
*coding
;
6005 int *buf
= coding
->charbuf
;
6006 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6007 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6008 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6009 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6010 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6011 int multibytep
= coding
->src_multibyte
;
6012 Lisp_Object eol_type
;
6014 EMACS_INT stop
, stop_composition
, stop_charset
;
6016 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6017 if (VECTORP (eol_type
))
6020 /* Note: composition handling is not yet implemented. */
6021 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6023 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6024 stop
= stop_composition
= pos
;
6026 stop
= stop_composition
= end_pos
;
6027 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6028 stop
= stop_charset
= pos
;
6030 stop_charset
= end_pos
;
6032 /* Compensate for CRLF and annotation. */
6033 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6034 while (buf
< buf_end
)
6040 if (pos
== stop_composition
)
6041 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6042 buf
, &stop_composition
);
6043 if (pos
== stop_charset
)
6044 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6045 buf
, &stop_charset
);
6046 stop
= (stop_composition
< stop_charset
6047 ? stop_composition
: stop_charset
);
6054 if (! CODING_FOR_UNIBYTE (coding
)
6055 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6056 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6061 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6062 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6064 if (! EQ (eol_type
, Qunix
))
6068 if (EQ (eol_type
, Qdos
))
6077 coding
->consumed
= src
- coding
->source
;
6078 coding
->consumed_char
= pos
- coding
->src_pos
;
6079 coding
->charbuf_used
= buf
- coding
->charbuf
;
6080 coding
->chars_at_source
= 0;
6084 /* Encode the text at CODING->src_object into CODING->dst_object.
6085 CODING->src_object is a buffer or a string.
6086 CODING->dst_object is a buffer or nil.
6088 If CODING->src_object is a buffer, it must be the current buffer.
6089 In this case, if CODING->src_pos is positive, it is a position of
6090 the source text in the buffer, otherwise, the source text is in the
6091 gap area of the buffer, and coding->src_pos specifies the offset of
6092 the text from GPT (which must be the same as PT). If this is the
6093 same buffer as CODING->dst_object, CODING->src_pos must be
6094 negative and CODING should not have `pre-write-conversion'.
6096 If CODING->src_object is a string, CODING should not have
6097 `pre-write-conversion'.
6099 If CODING->dst_object is a buffer, the encoded data is inserted at
6100 the current point of that buffer.
6102 If CODING->dst_object is nil, the encoded data is placed at the
6103 memory area specified by CODING->destination. */
6106 encode_coding (coding
)
6107 struct coding_system
*coding
;
6111 attrs
= CODING_ID_ATTRS (coding
->id
);
6113 if (BUFFERP (coding
->dst_object
))
6115 set_buffer_internal (XBUFFER (coding
->dst_object
));
6116 coding
->dst_multibyte
6117 = ! NILP (current_buffer
->enable_multibyte_characters
);
6120 coding
->consumed
= coding
->consumed_char
= 0;
6121 coding
->produced
= coding
->produced_char
= 0;
6122 coding
->result
= CODING_RESULT_SUCCESS
;
6125 ALLOC_CONVERSION_WORK_AREA (coding
);
6128 coding_set_source (coding
);
6129 consume_chars (coding
);
6131 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6132 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6133 else if (!NILP (Vstandard_translation_table_for_encode
))
6134 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6136 coding_set_destination (coding
);
6137 (*(coding
->encoder
)) (coding
);
6138 } while (coding
->consumed_char
< coding
->src_chars
);
6140 if (BUFFERP (coding
->dst_object
))
6141 insert_from_gap (coding
->produced_char
, coding
->produced
);
6143 return (coding
->result
);
6147 /* Stack of working buffers used in code conversion. An nil element
6148 means that the code conversion of that level is not using a working
6150 Lisp_Object Vcode_conversion_work_buf_list
;
6152 /* A working buffer used by the top level conversion. */
6153 Lisp_Object Vcode_conversion_reused_work_buf
;
6156 /* Return a working buffer that can be freely used by the following
6157 code conversion. MULTIBYTEP specifies the multibyteness of the
6161 make_conversion_work_buffer (multibytep
, depth
)
6162 int multibytep
, depth
;
6164 struct buffer
*current
= current_buffer
;
6165 Lisp_Object buf
, name
;
6169 if (NILP (Vcode_conversion_reused_work_buf
))
6170 Vcode_conversion_reused_work_buf
6171 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6172 buf
= Vcode_conversion_reused_work_buf
;
6178 name
= build_string (" *code-converting-work*");
6179 name
= Fgenerate_new_buffer_name (name
, Qnil
);
6185 sprintf (str
, " *code-converting-work*<%d>", depth
);
6186 name
= build_string (str
);
6188 buf
= Fget_buffer_create (name
);
6190 set_buffer_internal (XBUFFER (buf
));
6191 current_buffer
->undo_list
= Qt
;
6193 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
, Qnil
);
6194 set_buffer_internal (current
);
6199 code_conversion_restore (buffer
)
6202 Lisp_Object workbuf
;
6204 workbuf
= XCAR (Vcode_conversion_work_buf_list
);
6205 if (! NILP (workbuf
)
6206 && ! EQ (workbuf
, Vcode_conversion_reused_work_buf
)
6207 && ! NILP (Fbuffer_live_p (workbuf
)))
6208 Fkill_buffer (workbuf
);
6209 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
6210 set_buffer_internal (XBUFFER (buffer
));
6215 code_conversion_save (buffer
, with_work_buf
, multibyte
)
6217 int with_work_buf
, multibyte
;
6219 Lisp_Object workbuf
;
6223 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
6225 workbuf
= make_conversion_work_buffer (multibyte
, depth
);
6229 Vcode_conversion_work_buf_list
6230 = Fcons (workbuf
, Vcode_conversion_work_buf_list
);
6231 record_unwind_protect (code_conversion_restore
, buffer
);
6236 decode_coding_gap (coding
, chars
, bytes
)
6237 struct coding_system
*coding
;
6238 EMACS_INT chars
, bytes
;
6240 int count
= specpdl_ptr
- specpdl
;
6244 buffer
= Fcurrent_buffer ();
6245 code_conversion_save (buffer
, 0, 0);
6247 coding
->src_object
= buffer
;
6248 coding
->src_chars
= chars
;
6249 coding
->src_bytes
= bytes
;
6250 coding
->src_pos
= -chars
;
6251 coding
->src_pos_byte
= -bytes
;
6252 coding
->src_multibyte
= chars
< bytes
;
6253 coding
->dst_object
= buffer
;
6254 coding
->dst_pos
= PT
;
6255 coding
->dst_pos_byte
= PT_BYTE
;
6256 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6257 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
6259 if (CODING_REQUIRE_DETECTION (coding
))
6260 detect_coding (coding
);
6262 decode_coding (coding
);
6264 attrs
= CODING_ID_ATTRS (coding
->id
);
6265 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6267 struct gcpro gcpro1
;
6268 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6271 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6273 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6274 make_number (coding
->produced_char
));
6277 coding
->produced_char
+= Z
- prev_Z
;
6278 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6281 unbind_to (count
, Qnil
);
6282 return coding
->result
;
6286 encode_coding_gap (coding
, chars
, bytes
)
6287 struct coding_system
*coding
;
6288 EMACS_INT chars
, bytes
;
6290 int count
= specpdl_ptr
- specpdl
;
6293 buffer
= Fcurrent_buffer ();
6294 code_conversion_save (buffer
, 0, 0);
6296 coding
->src_object
= buffer
;
6297 coding
->src_chars
= chars
;
6298 coding
->src_bytes
= bytes
;
6299 coding
->src_pos
= -chars
;
6300 coding
->src_pos_byte
= -bytes
;
6301 coding
->src_multibyte
= chars
< bytes
;
6302 coding
->dst_object
= coding
->src_object
;
6303 coding
->dst_pos
= PT
;
6304 coding
->dst_pos_byte
= PT_BYTE
;
6306 encode_coding (coding
);
6308 unbind_to (count
, Qnil
);
6309 return coding
->result
;
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.  */
6343 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6345 struct coding_system
*coding
;
6346 Lisp_Object src_object
;
6347 EMACS_INT from
, from_byte
, to
, to_byte
;
6348 Lisp_Object dst_object
;
6350 int count
= specpdl_ptr
- specpdl
;
6351 unsigned char *destination
;
6352 EMACS_INT dst_bytes
;
6353 EMACS_INT chars
= to
- from
;
6354 EMACS_INT bytes
= to_byte
- from_byte
;
6357 int saved_pt
= -1, saved_pt_byte
;
6359 buffer
= Fcurrent_buffer ();
6361 if (NILP (dst_object
))
6363 destination
= coding
->destination
;
6364 dst_bytes
= coding
->dst_bytes
;
6367 coding
->src_object
= src_object
;
6368 coding
->src_chars
= chars
;
6369 coding
->src_bytes
= bytes
;
6370 coding
->src_multibyte
= chars
< bytes
;
6372 if (STRINGP (src_object
))
6374 coding
->src_pos
= from
;
6375 coding
->src_pos_byte
= from_byte
;
6377 else if (BUFFERP (src_object
))
6379 set_buffer_internal (XBUFFER (src_object
));
6381 move_gap_both (from
, from_byte
);
6382 if (EQ (src_object
, dst_object
))
6384 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6385 TEMP_SET_PT_BOTH (from
, from_byte
);
6386 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6387 coding
->src_pos
= -chars
;
6388 coding
->src_pos_byte
= -bytes
;
6392 coding
->src_pos
= from
;
6393 coding
->src_pos_byte
= from_byte
;
6397 if (CODING_REQUIRE_DETECTION (coding
))
6398 detect_coding (coding
);
6399 attrs
= CODING_ID_ATTRS (coding
->id
);
6401 if (EQ (dst_object
, Qt
)
6402 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6403 && NILP (dst_object
)))
6405 coding
->dst_object
= code_conversion_save (buffer
, 1, 1);
6406 coding
->dst_pos
= BEG
;
6407 coding
->dst_pos_byte
= BEG_BYTE
;
6408 coding
->dst_multibyte
= 1;
6410 else if (BUFFERP (dst_object
))
6412 code_conversion_save (buffer
, 0, 0);
6413 coding
->dst_object
= dst_object
;
6414 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6415 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6416 coding
->dst_multibyte
6417 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6421 code_conversion_save (buffer
, 0, 0);
6422 coding
->dst_object
= Qnil
;
6423 coding
->dst_multibyte
= 1;
6426 decode_coding (coding
);
6428 if (BUFFERP (coding
->dst_object
))
6429 set_buffer_internal (XBUFFER (coding
->dst_object
));
6431 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6433 struct gcpro gcpro1
, gcpro2
;
6434 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6437 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6438 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6439 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6440 make_number (coding
->produced_char
));
6443 coding
->produced_char
+= Z
- prev_Z
;
6444 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6447 if (EQ (dst_object
, Qt
))
6449 coding
->dst_object
= Fbuffer_string ();
6451 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6453 set_buffer_internal (XBUFFER (coding
->dst_object
));
6454 if (dst_bytes
< coding
->produced
)
6457 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6460 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6461 unbind_to (count
, Qnil
);
6464 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6465 move_gap_both (BEGV
, BEGV_BYTE
);
6466 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6467 coding
->destination
= destination
;
6473 /* This is the case of:
6474 (BUFFERP (src_object) && EQ (src_object, dst_object))
6475 As we have moved PT while replacing the original buffer
6476 contents, we must recover it now. */
6477 set_buffer_internal (XBUFFER (src_object
));
6478 if (saved_pt
< from
)
6479 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6480 else if (saved_pt
< from
+ chars
)
6481 TEMP_SET_PT_BOTH (from
, from_byte
);
6482 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6483 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6484 saved_pt_byte
+ (coding
->produced
- bytes
));
6486 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6487 saved_pt_byte
+ (coding
->produced
- bytes
));
6490 unbind_to (count
, Qnil
);
6495 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6497 struct coding_system
*coding
;
6498 Lisp_Object src_object
;
6499 EMACS_INT from
, from_byte
, to
, to_byte
;
6500 Lisp_Object dst_object
;
6502 int count
= specpdl_ptr
- specpdl
;
6503 EMACS_INT chars
= to
- from
;
6504 EMACS_INT bytes
= to_byte
- from_byte
;
6507 int saved_pt
= -1, saved_pt_byte
;
6509 buffer
= Fcurrent_buffer ();
6511 coding
->src_object
= src_object
;
6512 coding
->src_chars
= chars
;
6513 coding
->src_bytes
= bytes
;
6514 coding
->src_multibyte
= chars
< bytes
;
6516 attrs
= CODING_ID_ATTRS (coding
->id
);
6518 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6520 coding
->src_object
= code_conversion_save (buffer
, 1,
6521 coding
->src_multibyte
);
6522 set_buffer_internal (XBUFFER (coding
->src_object
));
6523 if (STRINGP (src_object
))
6524 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6525 else if (BUFFERP (src_object
))
6526 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6528 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6530 if (EQ (src_object
, dst_object
))
6532 set_buffer_internal (XBUFFER (src_object
));
6533 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6534 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6535 set_buffer_internal (XBUFFER (coding
->src_object
));
6538 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6539 make_number (BEG
), make_number (Z
));
6540 coding
->src_object
= Fcurrent_buffer ();
6542 move_gap_both (BEG
, BEG_BYTE
);
6543 coding
->src_chars
= Z
- BEG
;
6544 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6545 coding
->src_pos
= BEG
;
6546 coding
->src_pos_byte
= BEG_BYTE
;
6547 coding
->src_multibyte
= Z
< Z_BYTE
;
6549 else if (STRINGP (src_object
))
6551 code_conversion_save (buffer
, 0, 0);
6552 coding
->src_pos
= from
;
6553 coding
->src_pos_byte
= from_byte
;
6555 else if (BUFFERP (src_object
))
6557 code_conversion_save (buffer
, 0, 0);
6558 set_buffer_internal (XBUFFER (src_object
));
6559 if (EQ (src_object
, dst_object
))
6561 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6562 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6563 coding
->src_pos
= 0;
6564 coding
->src_pos_byte
= 0;
6568 if (from
< GPT
&& to
>= GPT
)
6569 move_gap_both (from
, from_byte
);
6570 coding
->src_pos
= from
;
6571 coding
->src_pos_byte
= from_byte
;
6575 code_conversion_save (buffer
, 0, 0);
6577 if (BUFFERP (dst_object
))
6579 coding
->dst_object
= dst_object
;
6580 if (EQ (src_object
, dst_object
))
6582 coding
->dst_pos
= from
;
6583 coding
->dst_pos_byte
= from_byte
;
6587 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6588 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6590 coding
->dst_multibyte
6591 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6593 else if (EQ (dst_object
, Qt
))
6595 coding
->dst_object
= Qnil
;
6596 coding
->dst_bytes
= coding
->src_chars
;
6597 if (coding
->dst_bytes
== 0)
6598 coding
->dst_bytes
= 1;
6599 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6600 coding
->dst_multibyte
= 0;
6604 coding
->dst_object
= Qnil
;
6605 coding
->dst_multibyte
= 0;
6608 encode_coding (coding
);
6610 if (EQ (dst_object
, Qt
))
6612 if (BUFFERP (coding
->dst_object
))
6613 coding
->dst_object
= Fbuffer_string ();
6617 = make_unibyte_string ((char *) coding
->destination
,
6619 xfree (coding
->destination
);
6625 /* This is the case of:
6626 (BUFFERP (src_object) && EQ (src_object, dst_object))
6627 As we have moved PT while replacing the original buffer
6628 contents, we must recover it now. */
6629 set_buffer_internal (XBUFFER (src_object
));
6630 if (saved_pt
< from
)
6631 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
6632 else if (saved_pt
< from
+ chars
)
6633 TEMP_SET_PT_BOTH (from
, from_byte
);
6634 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6635 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6636 saved_pt_byte
+ (coding
->produced
- bytes
));
6638 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6639 saved_pt_byte
+ (coding
->produced
- bytes
));
6642 unbind_to (count
, Qnil
);
6647 preferred_coding_system ()
6649 int id
= coding_categories
[coding_priorities
[0]].id
;
6651 return CODING_ID_NAME (id
);
6656 /*** 8. Emacs Lisp library functions ***/
6658 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6659 doc
: /* Return t if OBJECT is nil or a coding-system.
6660 See the documentation of `define-coding-system' for information
6661 about coding-system objects. */)
6665 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6668 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6669 Sread_non_nil_coding_system
, 1, 1, 0,
6670 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6677 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6678 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6680 while (XSTRING (val
)->size
== 0);
6681 return (Fintern (val
, Qnil
));
6684 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6685 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6686 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6687 (prompt
, default_coding_system
)
6688 Lisp_Object prompt
, default_coding_system
;
6691 if (SYMBOLP (default_coding_system
))
6692 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6693 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6694 Qt
, Qnil
, Qcoding_system_history
,
6695 default_coding_system
, Qnil
);
6696 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6699 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6701 doc
: /* Check validity of CODING-SYSTEM.
6702 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6704 Lisp_Object coding_system
;
6706 CHECK_SYMBOL (coding_system
);
6707 if (!NILP (Fcoding_system_p (coding_system
)))
6708 return coding_system
;
6710 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */
6731 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6733 int src_bytes
, highest
;
6735 Lisp_Object coding_system
;
6737 unsigned char *src_end
= src
+ src_bytes
;
6738 Lisp_Object attrs
, eol_type
;
6740 struct coding_system coding
;
6742 struct coding_detection_info detect_info
;
6744 if (NILP (coding_system
))
6745 coding_system
= Qundecided
;
6746 setup_coding_system (coding_system
, &coding
);
6747 attrs
= CODING_ID_ATTRS (coding
.id
);
6748 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6749 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6751 coding
.source
= src
;
6752 coding
.src_bytes
= src_bytes
;
6753 coding
.src_multibyte
= multibytep
;
6754 coding
.consumed
= 0;
6755 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6757 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6759 /* At first, detect text-format if necessary. */
6760 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6762 enum coding_category category
;
6763 struct coding_system
*this;
6766 for (; src
< src_end
; src
++)
6770 || (c
< 0x20 && (c
== ISO_CODE_ESC
6772 || c
== ISO_CODE_SO
)))
6775 coding
.head_ascii
= src
- coding
.source
;
6778 for (i
= 0; i
< coding_category_raw_text
; i
++)
6780 category
= coding_priorities
[i
];
6781 this = coding_categories
+ category
;
6785 /* No coding system of this category is defined. */
6786 detect_info
.rejected
|= (1 << category
);
6788 else if (category
>= coding_category_raw_text
)
6790 else if (detect_info
.checked
& (1 << category
))
6793 && (detect_info
.found
& (1 << category
)))
6798 if ((*(this->detector
)) (&coding
, &detect_info
)
6800 && (detect_info
.found
& (1 << category
)))
6806 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6808 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6809 id
= coding_categories
[coding_category_raw_text
].id
;
6810 val
= Fcons (make_number (id
), Qnil
);
6812 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6814 detect_info
.found
= CATEGORY_MASK_ANY
;
6815 id
= coding_categories
[coding_category_undecided
].id
;
6816 val
= Fcons (make_number (id
), Qnil
);
6820 if (detect_info
.found
)
6822 detect_info
.found
= 1 << category
;
6823 val
= Fcons (make_number (this->id
), Qnil
);
6826 for (i
= 0; i
< coding_category_raw_text
; i
++)
6827 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6829 detect_info
.found
= 1 << coding_priorities
[i
];
6830 id
= coding_categories
[coding_priorities
[i
]].id
;
6831 val
= Fcons (make_number (id
), Qnil
);
6837 int mask
= detect_info
.rejected
| detect_info
.found
;
6841 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6843 category
= coding_priorities
[i
];
6844 if (! (mask
& (1 << category
)))
6846 found
|= 1 << category
;
6847 id
= coding_categories
[category
].id
;
6848 val
= Fcons (make_number (id
), val
);
6851 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6853 category
= coding_priorities
[i
];
6854 if (detect_info
.found
& (1 << category
))
6856 id
= coding_categories
[category
].id
;
6857 val
= Fcons (make_number (id
), val
);
6860 detect_info
.found
|= found
;
6865 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6866 val
= Fcons (make_number (coding
.id
), Qnil
);
6869 /* Then, detect eol-format if necessary. */
6871 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6874 if (VECTORP (eol_type
))
6876 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6877 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6878 coding_category_raw_text
);
6879 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6880 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6881 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6882 coding_category_utf_16_be
);
6883 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6884 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6885 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6886 coding_category_utf_16_le
);
6890 if (EQ (eol_type
, Qunix
))
6891 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6892 else if (EQ (eol_type
, Qdos
))
6893 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6895 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6898 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6900 enum coding_category category
;
6903 id
= XINT (XCAR (tail
));
6904 attrs
= CODING_ID_ATTRS (id
);
6905 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6906 eol_type
= CODING_ID_EOL_TYPE (id
);
6907 if (VECTORP (eol_type
))
6909 if (category
== coding_category_utf_16_be
6910 || category
== coding_category_utf_16_be_nosig
)
6911 this_eol
= utf_16_be_eol
;
6912 else if (category
== coding_category_utf_16_le
6913 || category
== coding_category_utf_16_le_nosig
)
6914 this_eol
= utf_16_le_eol
;
6916 this_eol
= normal_eol
;
6918 if (this_eol
== EOL_SEEN_LF
)
6919 XSETCAR (tail
, AREF (eol_type
, 0));
6920 else if (this_eol
== EOL_SEEN_CRLF
)
6921 XSETCAR (tail
, AREF (eol_type
, 1));
6922 else if (this_eol
== EOL_SEEN_CR
)
6923 XSETCAR (tail
, AREF (eol_type
, 2));
6925 XSETCAR (tail
, CODING_ID_NAME (id
));
6928 XSETCAR (tail
, CODING_ID_NAME (id
));
6932 return (highest
? XCAR (val
) : val
);
6936 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6938 doc
: /* Detect coding system of the text in the region between START and END.
6939 Return a list of possible coding systems ordered by priority.
6941 If only ASCII characters are found, it returns a list of single element
6942 `undecided' or its subsidiary coding system according to a detected
6945 If optional argument HIGHEST is non-nil, return the coding system of
6946 highest priority. */)
6947 (start
, end
, highest
)
6948 Lisp_Object start
, end
, highest
;
6951 int from_byte
, to_byte
;
6953 CHECK_NUMBER_COERCE_MARKER (start
);
6954 CHECK_NUMBER_COERCE_MARKER (end
);
6956 validate_region (&start
, &end
);
6957 from
= XINT (start
), to
= XINT (end
);
6958 from_byte
= CHAR_TO_BYTE (from
);
6959 to_byte
= CHAR_TO_BYTE (to
);
6961 if (from
< GPT
&& to
>= GPT
)
6962 move_gap_both (to
, to_byte
);
6964 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6965 to_byte
- from_byte
,
6967 !NILP (current_buffer
6968 ->enable_multibyte_characters
),
6972 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6974 doc
: /* Detect coding system of the text in STRING.
6975 Return a list of possible coding systems ordered by priority.
6977 If only ASCII characters are found, it returns a list of single element
6978 `undecided' or its subsidiary coding system according to a detected
6981 If optional argument HIGHEST is non-nil, return the coding system of
6982 highest priority. */)
6984 Lisp_Object string
, highest
;
6986 CHECK_STRING (string
);
6988 return detect_coding_system (XSTRING (string
)->data
,
6989 STRING_BYTES (XSTRING (string
)),
6991 STRING_MULTIBYTE (string
),
6997 char_encodable_p (c
, attrs
)
7002 struct charset
*charset
;
7004 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7005 CONSP (tail
); tail
= XCDR (tail
))
7007 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7008 if (CHAR_CHARSET_P (c
, charset
))
7011 return (! NILP (tail
));
7015 /* Return a list of coding systems that safely encode the text between
7016 START and END. If EXCLUDE is non-nil, it is a list of coding
7017 systems not to check. The returned list doesn't contain any such
7018 coding systems. In any case, if the text contains only ASCII or is
7019 unibyte, return t. */
7021 DEFUN ("find-coding-systems-region-internal",
7022 Ffind_coding_systems_region_internal
,
7023 Sfind_coding_systems_region_internal
, 2, 3, 0,
7024 doc
: /* Internal use only. */)
7025 (start
, end
, exclude
)
7026 Lisp_Object start
, end
, exclude
;
7028 Lisp_Object coding_attrs_list
, safe_codings
;
7029 EMACS_INT start_byte
, end_byte
;
7030 const unsigned char *p
, *pbeg
, *pend
;
7032 Lisp_Object tail
, elt
;
7034 if (STRINGP (start
))
7036 if (!STRING_MULTIBYTE (start
)
7037 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
7040 end_byte
= STRING_BYTES (XSTRING (start
));
7044 CHECK_NUMBER_COERCE_MARKER (start
);
7045 CHECK_NUMBER_COERCE_MARKER (end
);
7046 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7047 args_out_of_range (start
, end
);
7048 if (NILP (current_buffer
->enable_multibyte_characters
))
7050 start_byte
= CHAR_TO_BYTE (XINT (start
));
7051 end_byte
= CHAR_TO_BYTE (XINT (end
));
7052 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7055 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7057 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7058 move_gap_both (XINT (start
), start_byte
);
7060 move_gap_both (XINT (end
), end_byte
);
7064 coding_attrs_list
= Qnil
;
7065 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7067 || NILP (Fmemq (XCAR (tail
), exclude
)))
7071 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7072 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7073 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7074 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7077 if (STRINGP (start
))
7078 p
= pbeg
= XSTRING (start
)->data
;
7080 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7081 pend
= p
+ (end_byte
- start_byte
);
7083 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7084 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7088 if (ASCII_BYTE_P (*p
))
7092 c
= STRING_CHAR_ADVANCE (p
);
7094 charset_map_loaded
= 0;
7095 for (tail
= coding_attrs_list
; CONSP (tail
);)
7100 else if (char_encodable_p (c
, elt
))
7102 else if (CONSP (XCDR (tail
)))
7104 XSETCAR (tail
, XCAR (XCDR (tail
)));
7105 XSETCDR (tail
, XCDR (XCDR (tail
)));
7109 XSETCAR (tail
, Qnil
);
7113 if (charset_map_loaded
)
7115 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7117 if (STRINGP (start
))
7118 pbeg
= XSTRING (start
)->data
;
7120 pbeg
= BYTE_POS_ADDR (start_byte
);
7121 p
= pbeg
+ p_offset
;
7122 pend
= pbeg
+ pend_offset
;
7127 safe_codings
= Qnil
;
7128 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7129 if (! NILP (XCAR (tail
)))
7130 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7132 return safe_codings
;
7136 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7137 Scheck_coding_systems_region
, 3, 3, 0,
7138 doc
: /* Check if the region is encodable by coding systems.

START and END are buffer positions specifying the region.
CODING-SYSTEM-LIST is a list of coding systems to check.

The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
whole region, POS0, POS1, ... are buffer positions where non-encodable
characters are found.

If all coding systems in CODING-SYSTEM-LIST can encode the region, the
value is nil.

START may be a string.  In that case, check if the string is
encodable, and the value contains indices to the string instead of
buffer positions.  END is ignored.  */)
7154 (start
, end
, coding_system_list
)
7155 Lisp_Object start
, end
, coding_system_list
;
7158 EMACS_INT start_byte
, end_byte
;
7160 const unsigned char *p
, *pbeg
, *pend
;
7162 Lisp_Object tail
, elt
;
7164 if (STRINGP (start
))
7166 if (!STRING_MULTIBYTE (start
)
7167 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
7170 end_byte
= STRING_BYTES (XSTRING (start
));
7175 CHECK_NUMBER_COERCE_MARKER (start
);
7176 CHECK_NUMBER_COERCE_MARKER (end
);
7177 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7178 args_out_of_range (start
, end
);
7179 if (NILP (current_buffer
->enable_multibyte_characters
))
7181 start_byte
= CHAR_TO_BYTE (XINT (start
));
7182 end_byte
= CHAR_TO_BYTE (XINT (end
));
7183 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7186 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7188 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7189 move_gap_both (XINT (start
), start_byte
);
7191 move_gap_both (XINT (end
), end_byte
);
7197 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7200 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7205 if (STRINGP (start
))
7206 p
= pbeg
= XSTRING (start
)->data
;
7208 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7209 pend
= p
+ (end_byte
- start_byte
);
7211 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7212 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7216 if (ASCII_BYTE_P (*p
))
7220 c
= STRING_CHAR_ADVANCE (p
);
7222 charset_map_loaded
= 0;
7223 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7225 elt
= XCDR (XCAR (tail
));
7226 if (! char_encodable_p (c
, XCAR (elt
)))
7227 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7229 if (charset_map_loaded
)
7231 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7233 if (STRINGP (start
))
7234 pbeg
= XSTRING (start
)->data
;
7236 pbeg
= BYTE_POS_ADDR (start_byte
);
7237 p
= pbeg
+ p_offset
;
7238 pend
= pbeg
+ pend_offset
;
7246 for (; CONSP (tail
); tail
= XCDR (tail
))
7249 if (CONSP (XCDR (XCDR (elt
))))
7250 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7260 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7261 Lisp_Object start
, end
, coding_system
, dst_object
;
7262 int encodep
, norecord
;
7264 struct coding_system coding
;
7265 EMACS_INT from
, from_byte
, to
, to_byte
;
7266 Lisp_Object src_object
;
7268 CHECK_NUMBER_COERCE_MARKER (start
);
7269 CHECK_NUMBER_COERCE_MARKER (end
);
7270 if (NILP (coding_system
))
7271 coding_system
= Qno_conversion
;
7273 CHECK_CODING_SYSTEM (coding_system
);
7274 src_object
= Fcurrent_buffer ();
7275 if (NILP (dst_object
))
7276 dst_object
= src_object
;
7277 else if (! EQ (dst_object
, Qt
))
7278 CHECK_BUFFER (dst_object
);
7280 validate_region (&start
, &end
);
7281 from
= XFASTINT (start
);
7282 from_byte
= CHAR_TO_BYTE (from
);
7283 to
= XFASTINT (end
);
7284 to_byte
= CHAR_TO_BYTE (to
);
7286 setup_coding_system (coding_system
, &coding
);
7287 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7290 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7293 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7296 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7298 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7299 error ("Code conversion error: %d", coding
.result
);
7301 return (BUFFERP (dst_object
)
7302 ? make_number (coding
.produced_char
)
7303 : coding
.dst_object
);
7307 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7308 3, 4, "r\nzCoding system: ",
7309 doc
: /* Decode the current region from the specified coding system.
7310 When called from a program, takes four arguments:
7311 START, END, CODING-SYSTEM, and DESTINATION.
7312 START and END are buffer positions.
7314 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7315 If nil, the region between START and END is replace by the decoded text.
7316 If buffer, the decoded text is inserted in the buffer.
7317 If t, the decoded text is returned.
7319 This function sets `last-coding-system-used' to the precise coding system
7320 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7321 not fully specified.)
7322 It returns the length of the decoded text. */)
7323 (start
, end
, coding_system
, destination
)
7324 Lisp_Object start
, end
, coding_system
, destination
;
7326 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7329 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7330 3, 4, "r\nzCoding system: ",
7331 doc
: /* Encode the current region by specified coding system.
7332 When called from a program, takes three arguments:
7333 START, END, and CODING-SYSTEM. START and END are buffer positions.
7335 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7336 If nil, the region between START and END is replace by the encoded text.
7337 If buffer, the encoded text is inserted in the buffer.
7338 If t, the encoded text is returned.
7340 This function sets `last-coding-system-used' to the precise coding system
7341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7342 not fully specified.)
7343 It returns the length of the encoded text. */)
7344 (start
, end
, coding_system
, destination
)
7345 Lisp_Object start
, end
, coding_system
, destination
;
7347 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7351 code_convert_string (string
, coding_system
, dst_object
,
7352 encodep
, nocopy
, norecord
)
7353 Lisp_Object string
, coding_system
, dst_object
;
7354 int encodep
, nocopy
, norecord
;
7356 struct coding_system coding
;
7357 EMACS_INT chars
, bytes
;
7359 CHECK_STRING (string
);
7360 if (NILP (coding_system
))
7363 Vlast_coding_system_used
= Qno_conversion
;
7364 if (NILP (dst_object
))
7365 return (nocopy
? Fcopy_sequence (string
) : string
);
7368 if (NILP (coding_system
))
7369 coding_system
= Qno_conversion
;
7371 CHECK_CODING_SYSTEM (coding_system
);
7372 if (NILP (dst_object
))
7374 else if (! EQ (dst_object
, Qt
))
7375 CHECK_BUFFER (dst_object
);
7377 setup_coding_system (coding_system
, &coding
);
7378 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7379 chars
= XSTRING (string
)->size
;
7380 bytes
= STRING_BYTES (XSTRING (string
));
7382 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7384 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7386 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7388 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7389 error ("Code conversion error: %d", coding
.result
);
7391 return (BUFFERP (dst_object
)
7392 ? make_number (coding
.produced_char
)
7393 : coding
.dst_object
);
7397 /* Encode or decode STRING according to CODING_SYSTEM.
7398 Do not set Vlast_coding_system_used.
7400 This function is called only from macros DECODE_FILE and
7401 ENCODE_FILE, thus we ignore character composition. */
7404 code_convert_string_norecord (string
, coding_system
, encodep
)
7405 Lisp_Object string
, coding_system
;
7408 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7412 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7414 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7416 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7417 if the decoding operation is trivial.
7419 Optional fourth arg BUFFER non-nil meant that the decoded text is
7420 inserted in BUFFER instead of returned as a string. In this case,
7421 the return value is BUFFER.
7423 This function sets `last-coding-system-used' to the precise coding system
7424 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7425 not fully specified. */)
7426 (string
, coding_system
, nocopy
, buffer
)
7427 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7429 return code_convert_string (string
, coding_system
, buffer
,
7430 0, ! NILP (nocopy
), 0);
7433 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7435 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7437 Optional third arg NOCOPY non-nil means it is OK to return STRING
7438 itself if the encoding operation is trivial.
7440 Optional fourth arg BUFFER non-nil meant that the encoded text is
7441 inserted in BUFFER instead of returned as a string. In this case,
7442 the return value is BUFFER.
7444 This function sets `last-coding-system-used' to the precise coding system
7445 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7446 not fully specified.) */)
7447 (string
, coding_system
, nocopy
, buffer
)
7448 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7450 return code_convert_string (string
, coding_system
, buffer
,
7451 1, ! NILP (nocopy
), 1);
7455 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7456 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7457 Return the corresponding character. */)
7461 Lisp_Object spec
, attrs
, val
;
7462 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7465 CHECK_NATNUM (code
);
7466 c
= XFASTINT (code
);
7467 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7468 attrs
= AREF (spec
, 0);
7470 if (ASCII_BYTE_P (c
)
7471 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7474 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7475 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7476 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7477 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7480 charset
= charset_roman
;
7481 else if (c
>= 0xA0 && c
< 0xDF)
7483 charset
= charset_kana
;
7488 int s1
= c
>> 8, s2
= c
& 0xFF;
7490 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7491 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7492 error ("Invalid code: %d", code
);
7494 charset
= charset_kanji
;
7496 c
= DECODE_CHAR (charset
, c
);
7498 error ("Invalid code: %d", code
);
7499 return make_number (c
);
7503 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7504 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7505 Return the corresponding code in SJIS. */)
7509 Lisp_Object spec
, attrs
, charset_list
;
7511 struct charset
*charset
;
7514 CHECK_CHARACTER (ch
);
7516 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7517 attrs
= AREF (spec
, 0);
7519 if (ASCII_CHAR_P (c
)
7520 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7523 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7524 charset
= char_charset (c
, charset_list
, &code
);
7525 if (code
== CHARSET_INVALID_CODE (charset
))
7526 error ("Can't encode by shift_jis encoding: %d", c
);
7529 return make_number (code
);
7532 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7533 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7534 Return the corresponding character. */)
7538 Lisp_Object spec
, attrs
, val
;
7539 struct charset
*charset_roman
, *charset_big5
, *charset
;
7542 CHECK_NATNUM (code
);
7543 c
= XFASTINT (code
);
7544 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7545 attrs
= AREF (spec
, 0);
7547 if (ASCII_BYTE_P (c
)
7548 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7551 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7552 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7553 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7556 charset
= charset_roman
;
7559 int b1
= c
>> 8, b2
= c
& 0x7F;
7560 if (b1
< 0xA1 || b1
> 0xFE
7561 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7562 error ("Invalid code: %d", code
);
7563 charset
= charset_big5
;
7565 c
= DECODE_CHAR (charset
, (unsigned )c
);
7567 error ("Invalid code: %d", code
);
7568 return make_number (c
);
7571 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7572 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7573 Return the corresponding character code in Big5. */)
7577 Lisp_Object spec
, attrs
, charset_list
;
7578 struct charset
*charset
;
7582 CHECK_CHARACTER (ch
);
7584 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7585 attrs
= AREF (spec
, 0);
7586 if (ASCII_CHAR_P (c
)
7587 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7590 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7591 charset
= char_charset (c
, charset_list
, &code
);
7592 if (code
== CHARSET_INVALID_CODE (charset
))
7593 error ("Can't encode by Big5 encoding: %d", c
);
7595 return make_number (code
);
7599 DEFUN ("set-terminal-coding-system-internal",
7600 Fset_terminal_coding_system_internal
,
7601 Sset_terminal_coding_system_internal
, 1, 1, 0,
7602 doc
: /* Internal use only. */)
7604 Lisp_Object coding_system
;
7606 CHECK_SYMBOL (coding_system
);
7607 setup_coding_system (Fcheck_coding_system (coding_system
),
7610 /* We had better not send unsafe characters to terminal. */
7611 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7612 /* Characer composition should be disabled. */
7613 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7614 terminal_coding
.src_multibyte
= 1;
7615 terminal_coding
.dst_multibyte
= 0;
7619 DEFUN ("set-safe-terminal-coding-system-internal",
7620 Fset_safe_terminal_coding_system_internal
,
7621 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7622 doc
: /* Internal use only. */)
7624 Lisp_Object coding_system
;
7626 CHECK_SYMBOL (coding_system
);
7627 setup_coding_system (Fcheck_coding_system (coding_system
),
7628 &safe_terminal_coding
);
7629 /* Characer composition should be disabled. */
7630 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7631 safe_terminal_coding
.src_multibyte
= 1;
7632 safe_terminal_coding
.dst_multibyte
= 0;
7636 DEFUN ("terminal-coding-system",
7637 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7638 doc
: /* Return coding system specified for terminal output. */)
7641 return CODING_ID_NAME (terminal_coding
.id
);
7644 DEFUN ("set-keyboard-coding-system-internal",
7645 Fset_keyboard_coding_system_internal
,
7646 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7647 doc
: /* Internal use only. */)
7649 Lisp_Object coding_system
;
7651 CHECK_SYMBOL (coding_system
);
7652 setup_coding_system (Fcheck_coding_system (coding_system
),
7654 /* Characer composition should be disabled. */
7655 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7659 DEFUN ("keyboard-coding-system",
7660 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7661 doc
: /* Return coding system specified for decoding keyboard input. */)
7664 return CODING_ID_NAME (keyboard_coding
.id
);
7668 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7669 Sfind_operation_coding_system
, 1, MANY
, 0,
7670 doc
: /* Choose a coding system for an operation based on the target name.
7671 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7672 DECODING-SYSTEM is the coding system to use for decoding
7673 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7674 for encoding (in case OPERATION does encoding).
7676 The first argument OPERATION specifies an I/O primitive:
7677 For file I/O, `insert-file-contents' or `write-region'.
7678 For process I/O, `call-process', `call-process-region', or `start-process'.
7679 For network I/O, `open-network-stream'.
7681 The remaining arguments should be the same arguments that were passed
7682 to the primitive. Depending on which primitive, one of those arguments
7683 is selected as the TARGET. For example, if OPERATION does file I/O,
7684 whichever argument specifies the file name is TARGET.
7686 TARGET has a meaning which depends on OPERATION:
7687 For file I/O, TARGET is a file name.
7688 For process I/O, TARGET is a process name.
7689 For network I/O, TARGET is a service name or a port number
7691 This function looks up what specified for TARGET in,
7692 `file-coding-system-alist', `process-coding-system-alist',
7693 or `network-coding-system-alist' depending on OPERATION.
7694 They may specify a coding system, a cons of coding systems,
7695 or a function symbol to call.
7696 In the last case, we call the function with one argument,
7697 which is a list of all the arguments given to this function.
7699 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7704 Lisp_Object operation
, target_idx
, target
, val
;
7705 register Lisp_Object chain
;
7708 error ("Too few arguments");
7709 operation
= args
[0];
7710 if (!SYMBOLP (operation
)
7711 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7712 error ("Invalid first arguement");
7713 if (nargs
< 1 + XINT (target_idx
))
7714 error ("Too few arguments for operation: %s",
7715 XSYMBOL (operation
)->name
->data
);
7716 target
= args
[XINT (target_idx
) + 1];
7717 if (!(STRINGP (target
)
7718 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7719 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7721 chain
= ((EQ (operation
, Qinsert_file_contents
)
7722 || EQ (operation
, Qwrite_region
))
7723 ? Vfile_coding_system_alist
7724 : (EQ (operation
, Qopen_network_stream
)
7725 ? Vnetwork_coding_system_alist
7726 : Vprocess_coding_system_alist
));
7730 for (; CONSP (chain
); chain
= XCDR (chain
))
7736 && ((STRINGP (target
)
7737 && STRINGP (XCAR (elt
))
7738 && fast_string_match (XCAR (elt
), target
) >= 0)
7739 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7742 /* Here, if VAL is both a valid coding system and a valid
7743 function symbol, we return VAL as a coding system. */
7746 if (! SYMBOLP (val
))
7748 if (! NILP (Fcoding_system_p (val
)))
7749 return Fcons (val
, val
);
7750 if (! NILP (Ffboundp (val
)))
7752 val
= call1 (val
, Flist (nargs
, args
));
7755 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7756 return Fcons (val
, val
);
7764 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7765 Sset_coding_system_priority
, 0, MANY
, 0,
7766 doc
: /* Assign higher priority to the coding systems given as arguments.
7767 If multiple coding systems belongs to the same category,
7768 all but the first one are ignored. */)
7774 int changed
[coding_category_max
];
7775 enum coding_category priorities
[coding_category_max
];
7777 bzero (changed
, sizeof changed
);
7779 for (i
= j
= 0; i
< nargs
; i
++)
7781 enum coding_category category
;
7782 Lisp_Object spec
, attrs
;
7784 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7785 attrs
= AREF (spec
, 0);
7786 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7787 if (changed
[category
])
7788 /* Ignore this coding system because a coding system of the
7789 same category already had a higher priority. */
7791 changed
[category
] = 1;
7792 priorities
[j
++] = category
;
7793 if (coding_categories
[category
].id
>= 0
7794 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7795 setup_coding_system (args
[i
], &coding_categories
[category
]);
7796 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7799 /* Now we have decided top J priorities. Reflect the order of the
7800 original priorities to the remaining priorities. */
7802 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7804 while (j
< coding_category_max
7805 && changed
[coding_priorities
[j
]])
7807 if (j
== coding_category_max
)
7809 priorities
[i
] = coding_priorities
[j
];
7812 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7814 /* Update `coding-category-list'. */
7815 Vcoding_category_list
= Qnil
;
7816 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7817 Vcoding_category_list
7818 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7819 Vcoding_category_list
);
7824 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7825 Scoding_system_priority_list
, 0, 1, 0,
7826 doc
: /* Return a list of coding systems ordered by their priorities.
7827 HIGHESTP non-nil means just return the highest priority one. */)
7829 Lisp_Object highestp
;
7834 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7836 enum coding_category category
= coding_priorities
[i
];
7837 int id
= coding_categories
[category
].id
;
7842 attrs
= CODING_ID_ATTRS (id
);
7843 if (! NILP (highestp
))
7844 return CODING_ATTR_BASE_NAME (attrs
);
7845 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7847 return Fnreverse (val
);
7850 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7853 make_subsidiaries (base
)
7856 Lisp_Object subsidiaries
;
7857 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
7858 char *buf
= (char *) alloca (base_name_len
+ 6);
7861 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7862 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7863 for (i
= 0; i
< 3; i
++)
7865 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7866 ASET (subsidiaries
, i
, intern (buf
));
7868 return subsidiaries
;
7872 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7873 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7874 doc
: /* For internal use only.
7875 usage: (define-coding-system-internal ...) */)
7881 Lisp_Object spec_vec
; /* [ ATTRS ALIASES EOL_TYPE ] */
7882 Lisp_Object attrs
; /* Vector of attributes. */
7883 Lisp_Object eol_type
;
7884 Lisp_Object aliases
;
7885 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7886 enum coding_category category
;
7887 Lisp_Object tail
, val
;
7888 int max_charset_id
= 0;
7891 if (nargs
< coding_arg_max
)
7894 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7896 name
= args
[coding_arg_name
];
7897 CHECK_SYMBOL (name
);
7898 CODING_ATTR_BASE_NAME (attrs
) = name
;
7900 val
= args
[coding_arg_mnemonic
];
7901 if (! STRINGP (val
))
7902 CHECK_CHARACTER (val
);
7903 CODING_ATTR_MNEMONIC (attrs
) = val
;
7905 coding_type
= args
[coding_arg_coding_type
];
7906 CHECK_SYMBOL (coding_type
);
7907 CODING_ATTR_TYPE (attrs
) = coding_type
;
7909 charset_list
= args
[coding_arg_charset_list
];
7910 if (SYMBOLP (charset_list
))
7912 if (EQ (charset_list
, Qiso_2022
))
7914 if (! EQ (coding_type
, Qiso_2022
))
7915 error ("Invalid charset-list");
7916 charset_list
= Viso_2022_charset_list
;
7918 else if (EQ (charset_list
, Qemacs_mule
))
7920 if (! EQ (coding_type
, Qemacs_mule
))
7921 error ("Invalid charset-list");
7922 charset_list
= Vemacs_mule_charset_list
;
7924 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7925 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7926 max_charset_id
= XFASTINT (XCAR (tail
));
7930 charset_list
= Fcopy_sequence (charset_list
);
7931 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7933 struct charset
*charset
;
7936 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7937 if (EQ (coding_type
, Qiso_2022
)
7938 ? CHARSET_ISO_FINAL (charset
) < 0
7939 : EQ (coding_type
, Qemacs_mule
)
7940 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7942 error ("Can't handle charset `%s'",
7943 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7945 XCAR (tail
) = make_number (charset
->id
);
7946 if (max_charset_id
< charset
->id
)
7947 max_charset_id
= charset
->id
;
7950 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7952 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7954 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7955 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7956 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7958 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
7960 val
= args
[coding_arg_decode_translation_table
];
7962 CHECK_CHAR_TABLE (val
);
7963 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7965 val
= args
[coding_arg_encode_translation_table
];
7967 CHECK_CHAR_TABLE (val
);
7968 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7970 val
= args
[coding_arg_post_read_conversion
];
7972 CODING_ATTR_POST_READ (attrs
) = val
;
7974 val
= args
[coding_arg_pre_write_conversion
];
7976 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7978 val
= args
[coding_arg_default_char
];
7980 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7983 CHECK_CHARACTER (val
);
7984 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7987 val
= args
[coding_arg_plist
];
7989 CODING_ATTR_PLIST (attrs
) = val
;
7991 if (EQ (coding_type
, Qcharset
))
7994 /* Generate a lisp vector of 256 elements. Each element is nil,
7995 integer, or a list of charset IDs.
7997 If Nth element is nil, the byte code N is invalid in this
8000 If Nth element is a number NUM, N is the first byte of a
8001 charset whose ID is NUM.
8003 If Nth element is a list of charset IDs, N is the first byte
8004 of one of them. The list is sorted by dimensions of the
8005 charsets. A charset of smaller dimension comes first.
8007 for (list
= Qnil
, tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8009 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8011 if (charset
->method
== CHARSET_METHOD_SUPERSET
)
8013 val
= CHARSET_SUPERSET (charset
);
8014 for (; CONSP (val
); val
= XCDR (val
))
8015 list
= Fcons (XCAR (XCAR (val
)), list
);
8018 list
= Fcons (XCAR (tail
), list
);
8021 val
= Fmake_vector (make_number (256), Qnil
);
8023 for (tail
= Fnreverse (list
); CONSP (tail
); tail
= XCDR (tail
))
8025 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8026 int dim
= CHARSET_DIMENSION (charset
);
8027 int idx
= (dim
- 1) * 4;
8029 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8030 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8032 for (i
= charset
->code_space
[idx
];
8033 i
<= charset
->code_space
[idx
+ 1]; i
++)
8035 Lisp_Object tmp
, tmp2
;
8038 tmp
= AREF (val
, i
);
8041 else if (NUMBERP (tmp
))
8043 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8045 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8047 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8051 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8053 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8058 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8061 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8062 XSETCAR (tmp2
, XCAR (tail
));
8068 ASET (attrs
, coding_attr_charset_valids
, val
);
8069 category
= coding_category_charset
;
8071 else if (EQ (coding_type
, Qccl
))
8075 if (nargs
< coding_arg_ccl_max
)
8078 val
= args
[coding_arg_ccl_decoder
];
8079 CHECK_CCL_PROGRAM (val
);
8081 val
= Fcopy_sequence (val
);
8082 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8084 val
= args
[coding_arg_ccl_encoder
];
8085 CHECK_CCL_PROGRAM (val
);
8087 val
= Fcopy_sequence (val
);
8088 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8090 val
= args
[coding_arg_ccl_valids
];
8091 valids
= Fmake_string (make_number (256), make_number (0));
8092 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8099 from
= to
= XINT (val
);
8100 if (from
< 0 || from
> 255)
8101 args_out_of_range_3 (val
, make_number (0), make_number (255));
8106 CHECK_NUMBER (XCAR (val
));
8107 CHECK_NUMBER (XCDR (val
));
8108 from
= XINT (XCAR (val
));
8109 if (from
< 0 || from
> 255)
8110 args_out_of_range_3 (XCAR (val
),
8111 make_number (0), make_number (255));
8112 to
= XINT (XCDR (val
));
8113 if (to
< from
|| to
> 255)
8114 args_out_of_range_3 (XCDR (val
),
8115 XCAR (val
), make_number (255));
8117 for (i
= from
; i
<= to
; i
++)
8118 XSTRING (valids
)->data
[i
] = 1;
8120 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8122 category
= coding_category_ccl
;
8124 else if (EQ (coding_type
, Qutf_16
))
8126 Lisp_Object bom
, endian
;
8128 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8130 if (nargs
< coding_arg_utf16_max
)
8133 bom
= args
[coding_arg_utf16_bom
];
8134 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8137 CHECK_CODING_SYSTEM (XCAR (bom
));
8138 CHECK_CODING_SYSTEM (XCDR (bom
));
8140 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8142 endian
= args
[coding_arg_utf16_endian
];
8143 CHECK_SYMBOL (endian
);
8146 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8147 error ("Invalid endian: %s", XSYMBOL (endian
)->name
->data
);
8148 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8150 category
= (CONSP (bom
)
8151 ? coding_category_utf_16_auto
8153 ? (EQ (endian
, Qbig
)
8154 ? coding_category_utf_16_be_nosig
8155 : coding_category_utf_16_le_nosig
)
8156 : (EQ (endian
, Qbig
)
8157 ? coding_category_utf_16_be
8158 : coding_category_utf_16_le
));
8160 else if (EQ (coding_type
, Qiso_2022
))
8162 Lisp_Object initial
, reg_usage
, request
, flags
;
8165 if (nargs
< coding_arg_iso2022_max
)
8168 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8169 CHECK_VECTOR (initial
);
8170 for (i
= 0; i
< 4; i
++)
8172 val
= Faref (initial
, make_number (i
));
8175 struct charset
*charset
;
8177 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8178 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8179 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8180 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8183 ASET (initial
, i
, make_number (-1));
8186 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8187 CHECK_CONS (reg_usage
);
8188 CHECK_NATNUM (XCAR (reg_usage
));
8189 CHECK_NATNUM (XCDR (reg_usage
));
8191 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8192 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8198 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
8199 CHECK_NATNUM (XCDR (val
));
8200 if (XINT (XCDR (val
)) >= 4)
8201 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8202 XCAR (val
) = make_number (id
);
8205 flags
= args
[coding_arg_iso2022_flags
];
8206 CHECK_NATNUM (flags
);
8208 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8209 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8211 ASET (attrs
, coding_attr_iso_initial
, initial
);
8212 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8213 ASET (attrs
, coding_attr_iso_request
, request
);
8214 ASET (attrs
, coding_attr_iso_flags
, flags
);
8215 setup_iso_safe_charsets (attrs
);
8217 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8218 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8219 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8220 ? coding_category_iso_7_else
8221 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8222 ? coding_category_iso_7
8223 : coding_category_iso_7_tight
);
8226 int id
= XINT (AREF (initial
, 1));
8228 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8229 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8231 ? coding_category_iso_8_else
8232 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8233 ? coding_category_iso_8_1
8234 : coding_category_iso_8_2
);
8236 if (category
!= coding_category_iso_8_1
8237 && category
!= coding_category_iso_8_2
)
8238 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8240 else if (EQ (coding_type
, Qemacs_mule
))
8242 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8243 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8244 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8245 category
= coding_category_emacs_mule
;
8247 else if (EQ (coding_type
, Qshift_jis
))
8250 struct charset
*charset
;
8252 if (XINT (Flength (charset_list
)) != 3)
8253 error ("There should be just three charsets");
8255 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8256 if (CHARSET_DIMENSION (charset
) != 1)
8257 error ("Dimension of charset %s is not one",
8258 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8259 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8260 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8262 charset_list
= XCDR (charset_list
);
8263 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8264 if (CHARSET_DIMENSION (charset
) != 1)
8265 error ("Dimension of charset %s is not one",
8266 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8268 charset_list
= XCDR (charset_list
);
8269 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8270 if (CHARSET_DIMENSION (charset
) != 2)
8271 error ("Dimension of charset %s is not two",
8272 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8274 category
= coding_category_sjis
;
8275 Vsjis_coding_system
= name
;
8277 else if (EQ (coding_type
, Qbig5
))
8279 struct charset
*charset
;
8281 if (XINT (Flength (charset_list
)) != 2)
8282 error ("There should be just two charsets");
8284 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8285 if (CHARSET_DIMENSION (charset
) != 1)
8286 error ("Dimension of charset %s is not one",
8287 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8288 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8289 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8291 charset_list
= XCDR (charset_list
);
8292 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8293 if (CHARSET_DIMENSION (charset
) != 2)
8294 error ("Dimension of charset %s is not two",
8295 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8297 category
= coding_category_big5
;
8298 Vbig5_coding_system
= name
;
8300 else if (EQ (coding_type
, Qraw_text
))
8302 category
= coding_category_raw_text
;
8303 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8305 else if (EQ (coding_type
, Qutf_8
))
8307 category
= coding_category_utf_8
;
8308 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8310 else if (EQ (coding_type
, Qundecided
))
8311 category
= coding_category_undecided
;
8313 error ("Invalid coding system type: %s",
8314 XSYMBOL (coding_type
)->name
->data
);
8316 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8317 CODING_ATTR_PLIST (attrs
)
8318 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8319 CODING_ATTR_PLIST (attrs
)));
8321 eol_type
= args
[coding_arg_eol_type
];
8322 if (! NILP (eol_type
)
8323 && ! EQ (eol_type
, Qunix
)
8324 && ! EQ (eol_type
, Qdos
)
8325 && ! EQ (eol_type
, Qmac
))
8326 error ("Invalid eol-type");
8328 aliases
= Fcons (name
, Qnil
);
8330 if (NILP (eol_type
))
8332 eol_type
= make_subsidiaries (name
);
8333 for (i
= 0; i
< 3; i
++)
8335 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8337 this_name
= AREF (eol_type
, i
);
8338 this_aliases
= Fcons (this_name
, Qnil
);
8339 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8340 this_spec
= Fmake_vector (make_number (3), attrs
);
8341 ASET (this_spec
, 1, this_aliases
);
8342 ASET (this_spec
, 2, this_eol_type
);
8343 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8344 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8345 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8346 Vcoding_system_alist
);
8350 spec_vec
= Fmake_vector (make_number (3), attrs
);
8351 ASET (spec_vec
, 1, aliases
);
8352 ASET (spec_vec
, 2, eol_type
);
8354 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8355 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8356 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8357 Vcoding_system_alist
);
8360 int id
= coding_categories
[category
].id
;
8362 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8363 setup_coding_system (name
, &coding_categories
[category
]);
8369 return Fsignal (Qwrong_number_of_arguments
,
8370 Fcons (intern ("define-coding-system-internal"),
8371 make_number (nargs
)));
8374 /* Fixme: should this record the alias relationships for
8375 diagnostics? Should it update coding-system-list? */
8376 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8377 Sdefine_coding_system_alias
, 2, 2, 0,
8378 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8379 (alias
, coding_system
)
8380 Lisp_Object alias
, coding_system
;
8382 Lisp_Object spec
, aliases
, eol_type
;
8384 CHECK_SYMBOL (alias
);
8385 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8386 aliases
= AREF (spec
, 1);
8387 while (!NILP (XCDR (aliases
)))
8388 aliases
= XCDR (aliases
);
8389 XCDR (aliases
) = Fcons (alias
, Qnil
);
8391 eol_type
= AREF (spec
, 2);
8392 if (VECTORP (eol_type
))
8394 Lisp_Object subsidiaries
;
8397 subsidiaries
= make_subsidiaries (alias
);
8398 for (i
= 0; i
< 3; i
++)
8399 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8400 AREF (eol_type
, i
));
8402 ASET (spec
, 2, subsidiaries
);
8405 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8406 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8407 Vcoding_system_alist
);
8412 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8414 doc
: /* Return the base of CODING-SYSTEM.
8415 Any alias or subsidiary coding system is not a base coding system. */)
8417 Lisp_Object coding_system
;
8419 Lisp_Object spec
, attrs
;
8421 if (NILP (coding_system
))
8422 return (Qno_conversion
);
8423 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8424 attrs
= AREF (spec
, 0);
8425 return CODING_ATTR_BASE_NAME (attrs
);
8428 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8430 doc
: "Return the property list of CODING-SYSTEM.")
8432 Lisp_Object coding_system
;
8434 Lisp_Object spec
, attrs
;
8436 if (NILP (coding_system
))
8437 coding_system
= Qno_conversion
;
8438 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8439 attrs
= AREF (spec
, 0);
8440 return CODING_ATTR_PLIST (attrs
);
8444 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8446 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8448 Lisp_Object coding_system
;
8452 if (NILP (coding_system
))
8453 coding_system
= Qno_conversion
;
8454 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8455 return AREF (spec
, 1);
8458 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8459 Scoding_system_eol_type
, 1, 1, 0,
8460 doc
: /* Return eol-type of CODING-SYSTEM.
8461 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8463 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8464 and CR respectively.
8466 A vector value indicates that a format of end-of-line should be
8467 detected automatically. Nth element of the vector is the subsidiary
8468 coding system whose eol-type is N. */)
8470 Lisp_Object coding_system
;
8472 Lisp_Object spec
, eol_type
;
8475 if (NILP (coding_system
))
8476 coding_system
= Qno_conversion
;
8477 if (! CODING_SYSTEM_P (coding_system
))
8479 spec
= CODING_SYSTEM_SPEC (coding_system
);
8480 eol_type
= AREF (spec
, 2);
8481 if (VECTORP (eol_type
))
8482 return Fcopy_sequence (eol_type
);
8483 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8484 return make_number (n
);
8490 /*** 9. Post-amble ***/
/* Put the detection tables into their initial state: every coding
   category gets id -1, and priority slot I holds category I.
   NOTE(review): no braces are visible around the two assignments below,
   and the numbering skips from 8497 to 8499; lines appear to be missing
   from this text, so confirm that both assignments belong to the loop.  */
8497 for (i
= 0; i
< coding_category_max
; i
++)
8499 coding_categories
[i
].id
= -1;
8500 coding_priorities
[i
] = i
;
8503 /* ISO 2022 specific initialization: classify every byte value.
   The four range loops cover 0x00-0x1F, 0x21-0x7E, 0x80-0x9F and
   0xA1-0xFE; the boundary bytes 0x20, 0x7F, 0xA0 and 0xFF are filled
   in separately right after the loops.  */
8504 for (i
= 0; i
< 0x20; i
++)
8505 iso_code_class
[i
] = ISO_control_0
;
8506 for (i
= 0x21; i
< 0x7F; i
++)
8507 iso_code_class
[i
] = ISO_graphic_plane_0
;
8508 for (i
= 0x80; i
< 0xA0; i
++)
8509 iso_code_class
[i
] = ISO_control_1
;
8510 for (i
= 0xA1; i
< 0xFF; i
++)
8511 iso_code_class
[i
] = ISO_graphic_plane_1
;
/* Boundary bytes of each plane share a dedicated class.  */
8512 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8513 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
/* Individual control codes get their own class.  Presumably these codes
   lie inside the ranges filled above, which would make it essential
   that these assignments come after the loops -- TODO confirm.  */
8514 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
8515 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8516 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8517 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8518 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8519 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8520 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8521 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
/* Start with the inhibit flag cleared.  */
8523 inhibit_pre_post_conversion
= 0;
/* Every entry of emacs_mule_bytes defaults to 1; the four private
   leading codes are then overridden with 3 or 4.  Presumably each value
   is the byte length of a sequence that starts with that byte -- TODO
   confirm.
   NOTE(review): the numbering skips from 8527 to 8529, so part of this
   table setup may be missing from this text.  */
8525 for (i
= 0; i
< 256; i
++)
8527 emacs_mule_bytes
[i
] = 1;
8529 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8530 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8531 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8532 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
/* Lisp objects private to this file.  Each one is first registered with
   staticpro and then given its initial value: the coding system hash
   table is created with Qeq as the argument to Fmakehash, and the other
   four objects start out as nil.  */
8540 staticpro (&Vcoding_system_hash_table
);
8541 Vcoding_system_hash_table
= Fmakehash (Qeq
);
8543 staticpro (&Vsjis_coding_system
);
8544 Vsjis_coding_system
= Qnil
;
8546 staticpro (&Vbig5_coding_system
);
8547 Vbig5_coding_system
= Qnil
;
8549 staticpro (&Vcode_conversion_work_buf_list
);
8550 Vcode_conversion_work_buf_list
= Qnil
;
8552 staticpro (&Vcode_conversion_reused_work_buf
);
8553 Vcode_conversion_reused_work_buf
= Qnil
;
/* Define the Lisp symbols used by this file and attach their properties.
   For each I/O primitive below, the target-idx property is set to the
   zero-based position of the argument that names the target (a file
   name, a program, or a network service), as the per-call comments
   spell out.  */
8555 DEFSYM (Qcharset
, "charset");
8556 DEFSYM (Qtarget_idx
, "target-idx");
8557 DEFSYM (Qcoding_system_history
, "coding-system-history");
/* The history variable starts out empty.  */
8558 Fset (Qcoding_system_history
, Qnil
);
8560 /* Target FILENAME is the first argument (index 0). */
8561 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8562 /* Target FILENAME is the third argument (index 2). */
8563 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8565 DEFSYM (Qcall_process
, "call-process");
8566 /* Target PROGRAM is the first argument (index 0). */
8567 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8569 DEFSYM (Qcall_process_region
, "call-process-region");
8570 /* Target PROGRAM is the third argument (index 2). */
8571 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8573 DEFSYM (Qstart_process
, "start-process");
8574 /* Target PROGRAM is the third argument (index 2). */
8575 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8577 DEFSYM (Qopen_network_stream
, "open-network-stream");
8578 /* Target SERVICE is the fourth argument (index 3). */
8579 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8581 DEFSYM (Qcoding_system
, "coding-system");
8582 DEFSYM (Qcoding_aliases
, "coding-aliases");
8584 DEFSYM (Qeol_type
, "eol-type");
8585 DEFSYM (Qunix
, "unix");
8586 DEFSYM (Qdos
, "dos");
8588 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8589 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8590 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8591 DEFSYM (Qdefault_char
, "default-char");
8592 DEFSYM (Qundecided
, "undecided");
8593 DEFSYM (Qno_conversion
, "no-conversion");
8594 DEFSYM (Qraw_text
, "raw-text");
8596 DEFSYM (Qiso_2022
, "iso-2022");
8598 DEFSYM (Qutf_8
, "utf-8");
8600 DEFSYM (Qutf_16
, "utf-16");
8601 DEFSYM (Qbig
, "big");
8602 DEFSYM (Qlittle
, "little");
8604 DEFSYM (Qshift_jis
, "shift-jis");
8605 DEFSYM (Qbig5
, "big5");
8607 DEFSYM (Qcoding_system_p
, "coding-system-p");
/* Set up coding-system-error as an error symbol: its error-conditions
   list is (coding-system-error error), and its error-message is the
   string given below.  */
8609 DEFSYM (Qcoding_system_error
, "coding-system-error");
8610 Fput (Qcoding_system_error
, Qerror_conditions
,
8611 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8612 Fput (Qcoding_system_error
, Qerror_message
,
8613 build_string ("Invalid coding system"));
8615 /* Intern this now in case it isn't already done.
8616 Setting this variable twice is harmless.
8617 But don't staticpro it here--that is done in alloc.c. */
8618 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
/* The translation-table symbol gets a char-table-extra-slots property
   of 1.  */
8620 DEFSYM (Qtranslation_table
, "translation-table");
8621 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8622 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8623 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8624 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8626 DEFSYM (Qvalid_codes
, "valid-codes");
8628 DEFSYM (Qemacs_mule
, "emacs-mule");
8630 DEFSYM (QCcategory
, ":category");
/* Build Vcoding_category_table: a vector with coding_category_max slots,
   initially all nil, in which the slot for each coding category index
   is filled with the interned symbol named after that category.  The
   vector is registered with staticpro right after it is created.  */
8632 Vcoding_category_table
8633 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8634 staticpro (&Vcoding_category_table
);
8635 /* The following categories are targets of code detection. */
8636 ASET (Vcoding_category_table
, coding_category_iso_7
,
8637 intern ("coding-category-iso-7"));
8638 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8639 intern ("coding-category-iso-7-tight"));
8640 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8641 intern ("coding-category-iso-8-1"));
8642 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8643 intern ("coding-category-iso-8-2"));
8644 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8645 intern ("coding-category-iso-7-else"));
8646 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8647 intern ("coding-category-iso-8-else"));
8648 ASET (Vcoding_category_table
, coding_category_utf_8
,
8649 intern ("coding-category-utf-8"));
8650 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8651 intern ("coding-category-utf-16-be"));
8652 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8653 intern ("coding-category-utf-16-auto"));
8654 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8655 intern ("coding-category-utf-16-le"));
8656 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8657 intern ("coding-category-utf-16-be-nosig"));
8658 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8659 intern ("coding-category-utf-16-le-nosig"));
8660 ASET (Vcoding_category_table
, coding_category_charset
,
8661 intern ("coding-category-charset"));
8662 ASET (Vcoding_category_table
, coding_category_sjis
,
8663 intern ("coding-category-sjis"));
8664 ASET (Vcoding_category_table
, coding_category_big5
,
8665 intern ("coding-category-big5"));
8666 ASET (Vcoding_category_table
, coding_category_ccl
,
8667 intern ("coding-category-ccl"));
8668 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8669 intern ("coding-category-emacs-mule"));
8670 /* The following categories are NOT targets of code detection. */
8671 ASET (Vcoding_category_table
, coding_category_raw_text
,
8672 intern ("coding-category-raw-text"));
8673 ASET (Vcoding_category_table
, coding_category_undecided
,
8674 intern ("coding-category-undecided"));
/* Hand every subr object of this file to defsubr, one call per
   primitive.  The list covers predicates and readers, detection,
   region and string conversion, the SJIS and BIG5 character helpers,
   terminal and keyboard coding accessors, and the coding system
   definition and query primitives.  */
8676 defsubr (&Scoding_system_p
);
8677 defsubr (&Sread_coding_system
);
8678 defsubr (&Sread_non_nil_coding_system
);
8679 defsubr (&Scheck_coding_system
);
8680 defsubr (&Sdetect_coding_region
);
8681 defsubr (&Sdetect_coding_string
);
8682 defsubr (&Sfind_coding_systems_region_internal
);
8683 defsubr (&Scheck_coding_systems_region
);
8684 defsubr (&Sdecode_coding_region
);
8685 defsubr (&Sencode_coding_region
);
8686 defsubr (&Sdecode_coding_string
);
8687 defsubr (&Sencode_coding_string
);
8688 defsubr (&Sdecode_sjis_char
);
8689 defsubr (&Sencode_sjis_char
);
8690 defsubr (&Sdecode_big5_char
);
8691 defsubr (&Sencode_big5_char
);
8692 defsubr (&Sset_terminal_coding_system_internal
);
8693 defsubr (&Sset_safe_terminal_coding_system_internal
);
8694 defsubr (&Sterminal_coding_system
);
8695 defsubr (&Sset_keyboard_coding_system_internal
);
8696 defsubr (&Skeyboard_coding_system
);
8697 defsubr (&Sfind_operation_coding_system
);
8698 defsubr (&Sset_coding_system_priority
);
8699 defsubr (&Sdefine_coding_system_internal
);
8700 defsubr (&Sdefine_coding_system_alias
);
8701 defsubr (&Scoding_system_base
);
8702 defsubr (&Scoding_system_plist
);
8703 defsubr (&Scoding_system_aliases
);
8704 defsubr (&Scoding_system_eol_type
);
8705 defsubr (&Scoding_system_priority_list
);
/* Lisp variables that list the known coding systems and categories, and
   the two override variables for read and write operations.  Every
   variable in this group starts out as nil, except
   coding-category-list, which is filled from Vcoding_category_table.  */
8707 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8708 doc
: /* List of coding systems.
8710 Do not alter the value of this variable manually. This variable should be
8711 updated by the functions `define-coding-system' and
8712 `define-coding-system-alias'. */);
8713 Vcoding_system_list
= Qnil
;
/* NOTE(review): the doc below names make-coding-system as the updating
   function, while the doc of coding-system-list names
   define-coding-system; confirm which function maintains this alist.  */
8715 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8716 doc
: /* Alist of coding system names.
8717 Each element is a one-element list containing a coding system name.
8718 This variable is given to `completing-read' as TABLE argument.
8720 Do not alter the value of this variable manually. This variable should be
8721 updated by the functions `make-coding-system' and
8722 `define-coding-system-alias'. */);
8723 Vcoding_system_alist
= Qnil
;
8725 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8726 doc
: /* List of coding-categories (symbols) ordered by priority.
8728 On detecting a coding system, Emacs tries code detection algorithms
8729 associated with each coding-category one by one in this order. When
8730 one algorithm agrees with a byte sequence of source text, the coding
8731 system bound to the corresponding coding-category is selected. */);
/* Walk the category table from its last index down to 0 and cons each
   symbol onto the front, so the finished list is in ascending index
   order.  */
8735 Vcoding_category_list
= Qnil
;
8736 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8737 Vcoding_category_list
8738 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8739 Vcoding_category_list
);
8742 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8743 doc
: /* Specify the coding system for read operations.
8744 It is useful to bind this variable with `let', but do not set it globally.
8745 If the value is a coding system, it is used for decoding on read operation.
8746 If not, an appropriate element is used from one of the coding system alists:
8747 There are three such tables, `file-coding-system-alist',
8748 `process-coding-system-alist', and `network-coding-system-alist'. */);
8749 Vcoding_system_for_read
= Qnil
;
8751 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8752 doc
: /* Specify the coding system for write operations.
8753 Programs bind this variable with `let', but you should not set it globally.
8754 If the value is a coding system, it is used for encoding of output,
8755 when writing it to a file and when sending it to a file or subprocess.
8757 If this does not specify a coding system, an appropriate element
8758 is used from one of the coding system alists:
8759 There are three such tables, `file-coding-system-alist',
8760 `process-coding-system-alist', and `network-coding-system-alist'.
8761 For output to files, if the above procedure does not specify a coding system,
8762 the value of `buffer-file-coding-system' is used. */);
8763 Vcoding_system_for_write
= Qnil
;
/* Lisp variables for the most recently used coding system, for
   end-of-line conversion control, for process coding inheritance, and
   for the three per-operation coding system alists (file, process,
   network).  The Lisp variables start out as nil and both boolean
   flags start out as 0.

   NOTE(review): none of the documentation texts in this group shows an
   opening doc comment marker, only the closing one; the numbering gaps
   (for example 8765 to 8767) suggest the opening lines are missing from
   this text.
   NOTE(review): the file-coding-system-alist text names
   find-operation-coding-systems (plural) in one place and
   find-operation-coding-system (singular) in another; the singular
   form matches the other cross references in this group -- confirm and
   fix the plural.  */
8765 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8767 Coding system used in the latest file or process I/O. */);
8768 Vlast_coding_system_used
= Qnil
;
8770 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8772 *Non-nil means always inhibit code conversion of end-of-line format.
8773 See info node `Coding Systems' and info node `Text and Binary' concerning
8774 such conversion. */);
8775 inhibit_eol_conversion
= 0;
8777 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8779 Non-nil means process buffer inherits coding system of process output.
8780 Bind it to t if the process output is to be treated as if it were a file
8781 read from some filesystem. */);
8782 inherit_process_coding_system
= 0;
8784 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8786 Alist to decide a coding system to use for a file I/O operation.
8787 The format is ((PATTERN . VAL) ...),
8788 where PATTERN is a regular expression matching a file name,
8789 VAL is a coding system, a cons of coding systems, or a function symbol.
8790 If VAL is a coding system, it is used for both decoding and encoding
8792 If VAL is a cons of coding systems, the car part is used for decoding,
8793 and the cdr part is used for encoding.
8794 If VAL is a function symbol, the function must return a coding system
8795 or a cons of coding systems which are used as above. The function gets
8796 the arguments with which `find-operation-coding-systems' was called.
8798 See also the function `find-operation-coding-system'
8799 and the variable `auto-coding-alist'. */);
8800 Vfile_coding_system_alist
= Qnil
;
8802 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8804 Alist to decide a coding system to use for a process I/O operation.
8805 The format is ((PATTERN . VAL) ...),
8806 where PATTERN is a regular expression matching a program name,
8807 VAL is a coding system, a cons of coding systems, or a function symbol.
8808 If VAL is a coding system, it is used for both decoding what received
8809 from the program and encoding what sent to the program.
8810 If VAL is a cons of coding systems, the car part is used for decoding,
8811 and the cdr part is used for encoding.
8812 If VAL is a function symbol, the function must return a coding system
8813 or a cons of coding systems which are used as above.
8815 See also the function `find-operation-coding-system'. */);
8816 Vprocess_coding_system_alist
= Qnil
;
8818 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8820 Alist to decide a coding system to use for a network I/O operation.
8821 The format is ((PATTERN . VAL) ...),
8822 where PATTERN is a regular expression matching a network service name
8823 or is a port number to connect to,
8824 VAL is a coding system, a cons of coding systems, or a function symbol.
8825 If VAL is a coding system, it is used for both decoding what received
8826 from the network stream and encoding what sent to the network stream.
8827 If VAL is a cons of coding systems, the car part is used for decoding,
8828 and the cdr part is used for encoding.
8829 If VAL is a function symbol, the function must return a coding system
8830 or a cons of coding systems which are used as above.
8832 See also the function `find-operation-coding-system'. */);
8833 Vnetwork_coding_system_alist
= Qnil
;
/* Lisp variables for the locale coding system, the mode-line
   end-of-line mnemonics, and character translation.  The default
   mnemonics are a colon for unix, a backslash for dos, a slash for mac,
   and a colon again for undecided, so unix and undecided look the same
   until the defaults are replaced.  Character translation is enabled
   by default (Qt), while both standard translation tables and the
   charset revision table start out as nil.
   NOTE(review): several documentation texts below show no opening doc
   comment marker; the numbering gaps (for example 8841 to 8843) suggest
   those lines are missing from this text.  */
8835 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8836 doc
: /* Coding system to use with system messages.
8837 Also used for decoding keyboard input on X Window system. */);
8838 Vlocale_coding_system
= Qnil
;
8840 /* These are only defaults; startup.el resets the eol mnemonics per system. */
8841 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8843 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8844 eol_mnemonic_unix
= build_string (":");
8846 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8848 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8849 eol_mnemonic_dos
= build_string ("\\");
8851 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8853 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8854 eol_mnemonic_mac
= build_string ("/");
8856 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8858 *String displayed in mode line when end-of-line format is not yet determined. */);
8859 eol_mnemonic_undecided
= build_string (":");
8861 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8863 *Non-nil enables character translation while encoding and decoding. */);
8864 Venable_character_translation
= Qt
;
8866 DEFVAR_LISP ("standard-translation-table-for-decode",
8867 &Vstandard_translation_table_for_decode
,
8868 doc
: /* Table for translating characters while decoding. */);
8869 Vstandard_translation_table_for_decode
= Qnil
;
8871 DEFVAR_LISP ("standard-translation-table-for-encode",
8872 &Vstandard_translation_table_for_encode
,
8873 doc
: /* Table for translating characters while encoding. */);
8874 Vstandard_translation_table_for_encode
= Qnil
;
8876 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8877 doc
: /* Alist of charsets vs revision numbers.
8878 While encoding, if a charset (car part of an element) is found,
8879 designate it with the escape sequence identifying revision (cdr part
8880 of the element). */);
8881 Vcharset_revision_table
= Qnil
;
/* Remaining Lisp variables: the default process coding system pair, the
   table of extra Latin codes, the hook for selecting a safe coding
   system, and the switch that disables ISO 2022 escape detection.
   latin-extra-code-table is created as a vector of 256 nil elements,
   the boolean flag starts out as 0, and the other two variables start
   out as nil.

   NOTE(review): the text for select-safe-coding-system-function names a
   default function, yet the value assigned here is nil; presumably the
   documented default is installed later from Lisp -- confirm.
   NOTE(review): three of the documentation texts below show no opening
   doc comment marker; the numbering gaps (for example 8890 to 8892)
   suggest those lines are missing from this text.  */
8883 DEFVAR_LISP ("default-process-coding-system",
8884 &Vdefault_process_coding_system
,
8885 doc
: /* Cons of coding systems used for process I/O by default.
8886 The car part is used for decoding a process output,
8887 the cdr part is used for encoding a text to be sent to a process. */);
8888 Vdefault_process_coding_system
= Qnil
;
8890 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8892 Table of extra Latin codes in the range 128..159 (inclusive).
8893 This is a vector of length 256.
8894 If Nth element is non-nil, the existence of code N in a file
8895 \(or output of subprocess) doesn't prevent it to be detected as
8896 a coding system of ISO 2022 variant which has a flag
8897 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8898 or reading output of a subprocess.
8899 Only 128th through 159th elements has a meaning. */);
8900 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8902 DEFVAR_LISP ("select-safe-coding-system-function",
8903 &Vselect_safe_coding_system_function
,
8905 Function to call to select safe coding system for encoding a text.
8907 If set, this function is called to force a user to select a proper
8908 coding system which can encode the text in the case that a default
8909 coding system used in each operation can't encode the text.
8911 The default value is `select-safe-coding-system' (which see). */);
8912 Vselect_safe_coding_system_function
= Qnil
;
8914 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8915 &inhibit_iso_escape_detection
,
8917 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8919 By default, on reading a file, Emacs tries to detect how the text is
8920 encoded. This code detection is sensitive to escape sequences. If
8921 the sequence is valid as ISO2022, the code is determined as one of
8922 the ISO2022 encodings, and the file is decoded by the corresponding
8923 coding system (e.g. `iso-2022-7bit').
8925 However, there may be a case that you want to read escape sequences in
8926 a file as is. In such a case, you can set this variable to non-nil.
8927 Then, as the code detection ignores any escape sequences, no file is
8928 detected as encoded in some ISO2022 encoding. The result is that all
8929 escape sequences become visible in a buffer.
8931 The default value is nil, and it is strongly recommended not to change
8932 it. That is because many Emacs Lisp source files that contain
8933 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8934 in Emacs's distribution, and they won't be decoded correctly on
8935 reading if you suppress escape sequence detection.
8937 The other way to read escape sequences in a file without decoding is
8938 to explicitly specify some coding system that doesn't use ISO2022's
8939 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8940 inhibit_iso_escape_detection
= 0;
/* Define the built-in no-conversion coding system and install it as the
   initial value everywhere below.

   plist holds 14 entries, that is seven keyword and value pairs (:name,
   :mnemonic, :coding-type, :ascii-compatible-p, :default-char,
   :docstring, :eol-type).  Except for the docstring, each value is also
   stored in the matching args[coding_arg_*] slot.  The finished list
   goes into args[coding_arg_plist], and the whole args array is then
   handed to Fdefine_coding_system_internal.

   Afterwards keyboard_coding, terminal_coding and safe_terminal_coding
   are set up from no-conversion, and every category symbol held in
   Vcoding_category_table is set to no-conversion.

   NOTE(review): the first loop over coding_arg_max has no visible body,
   and the docstring literal has no visible closing quote; the numbering
   gaps (8947 to 8950, 8964 to 8966) suggest lines are missing from this
   text -- confirm against the pristine file.  */
8943 Lisp_Object args
[coding_arg_max
];
8944 Lisp_Object plist
[14];
8947 for (i
= 0; i
< coding_arg_max
; i
++)
8950 plist
[0] = intern (":name");
8951 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8952 plist
[2] = intern (":mnemonic");
8953 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8954 plist
[4] = intern (":coding-type");
8955 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8956 plist
[6] = intern (":ascii-compatible-p");
8957 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8958 plist
[8] = intern (":default-char");
8959 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8960 plist
[10] = intern (":docstring");
8961 plist
[11] = build_string ("Do no conversion.\n\
8963 When you visit a file with this coding, the file is read into a\n\
8964 unibyte buffer as is, thus each byte of a file is treated as a\n\
8966 plist
[12] = intern (":eol-type");
8967 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8968 args
[coding_arg_plist
] = Flist (14, plist
);
8969 Fdefine_coding_system_internal (coding_arg_max
, args
);
8972 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8973 setup_coding_system (Qno_conversion
, &terminal_coding
);
8974 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8979 for (i
= 0; i
< coding_category_max
; i
++)
8980 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
8985 emacs_strerror (error_number
)
8990 synchronize_system_messages_locale ();
8991 str
= strerror (error_number
);
8993 if (! NILP (Vlocale_coding_system
))
8995 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8996 Vlocale_coding_system
,
8998 str
= (char *) XSTRING (dec
)->data
;