1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2011 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (struct coding_system
*coding
,
157 struct coding_detection_info
*detect_info
)
159 const unsigned char *src
= coding
->source
;
160 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
161 int multibytep
= coding
->src_multibyte
;
162 int consumed_chars
= 0;
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
172 if (! __C_conforms_to_XXX___ (c
))
174 if (! __C_strongly_suggests_XXX__ (c
))
175 found
= CATEGORY_MASK_XXX
;
177 /* The byte sequence is invalid for XXX. */
178 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
182 /* The source exhausted successfully. */
183 detect_info
->found
|= found
;
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
200 Below is the template of these functions. */
204 decode_coding_XXXX (struct coding_system
*coding
)
206 const unsigned char *src
= coding
->source
+ coding
->consumed
;
207 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base
;
212 /* A buffer to produce decoded characters. */
213 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
214 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
215 int multibytep
= coding
->src_multibyte
;
220 if (charbuf
< charbuf_end
)
221 /* No more room to produce a decoded character. */
228 if (src_base
< src_end
229 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base
< src_end
&& charbuf
< charbuf_end
)
233 *charbuf
++ = *src_base
++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
237 /* Remember how many characters we produced. */
238 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
258 Below is a template of these functions. */
261 encode_coding_XXX (struct coding_system
*coding
)
263 int multibytep
= coding
->dst_multibyte
;
264 int *charbuf
= coding
->charbuf
;
265 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
266 unsigned char *dst
= coding
->destination
+ coding
->produced
;
267 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
268 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
269 int produced_chars
= 0;
271 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
274 /* Encode C into DST, and increment DST. */
276 label_no_more_destination
:
277 /* How many chars and bytes we produced. */
278 coding
->produced_char
+= produced_chars
;
279 coding
->produced
= dst
- coding
->destination
;
284 /*** 1. Preamble ***/
292 #include "character.h"
295 #include "composite.h"
299 #include "termhooks.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 Lisp_Object Qbuffer_file_coding_system
;
306 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
307 Lisp_Object Qdefault_char
;
308 Lisp_Object Qno_conversion
, Qundecided
;
309 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
310 Lisp_Object Qbig
, Qlittle
;
311 Lisp_Object Qcoding_system_history
;
312 Lisp_Object Qvalid_codes
;
313 Lisp_Object QCcategory
, QCmnemonic
, QCdefault_char
;
314 Lisp_Object QCdecode_translation_table
, QCencode_translation_table
;
315 Lisp_Object QCpost_read_conversion
, QCpre_write_conversion
;
316 Lisp_Object QCascii_compatible_p
;
318 Lisp_Object Qcall_process
, Qcall_process_region
;
319 Lisp_Object Qstart_process
, Qopen_network_stream
;
320 Lisp_Object Qtarget_idx
;
322 Lisp_Object Qinsufficient_source
, Qinconsistent_eol
, Qinvalid_source
;
323 Lisp_Object Qinterrupted
, Qinsufficient_memory
;
325 /* If a symbol has this property, evaluate the value to define the
326 symbol as a coding system. */
327 static Lisp_Object Qcoding_system_define_form
;
329 /* Format of end-of-line decided by system. This is Qunix on
330 Unix and Mac, Qdos on DOS/Windows.
331 This has an effect only for external encoding (i.e. for output to
332 file and process), not for in-buffer or Lisp string encoding. */
333 static Lisp_Object system_eol_type
;
337 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
339 /* Coding system emacs-mule and raw-text are for converting only
340 end-of-line format. */
341 Lisp_Object Qemacs_mule
, Qraw_text
;
342 Lisp_Object Qutf_8_emacs
;
344 /* Coding-systems are handed between Emacs Lisp programs and C internal
345 routines by the following three variables. */
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding
;
352 Lisp_Object Qtranslation_table
;
353 Lisp_Object Qtranslation_table_id
;
354 Lisp_Object Qtranslation_table_for_decode
;
355 Lisp_Object Qtranslation_table_for_encode
;
357 /* Two special coding systems. */
358 Lisp_Object Vsjis_coding_system
;
359 Lisp_Object Vbig5_coding_system
;
361 /* ISO2022 section */
363 #define CODING_ISO_INITIAL(coding, reg) \
364 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
365 coding_attr_iso_initial), \
369 #define CODING_ISO_REQUEST(coding, charset_id) \
370 (((charset_id) <= (coding)->max_charset_id \
371 ? ((coding)->safe_charsets[charset_id] != 255 \
372 ? (coding)->safe_charsets[charset_id] \
377 #define CODING_ISO_FLAGS(coding) \
378 ((coding)->spec.iso_2022.flags)
379 #define CODING_ISO_DESIGNATION(coding, reg) \
380 ((coding)->spec.iso_2022.current_designation[reg])
381 #define CODING_ISO_INVOCATION(coding, plane) \
382 ((coding)->spec.iso_2022.current_invocation[plane])
383 #define CODING_ISO_SINGLE_SHIFTING(coding) \
384 ((coding)->spec.iso_2022.single_shifting)
385 #define CODING_ISO_BOL(coding) \
386 ((coding)->spec.iso_2022.bol)
387 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
388 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
389 #define CODING_ISO_CMP_STATUS(coding) \
390 (&(coding)->spec.iso_2022.cmp_status)
391 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
392 ((coding)->spec.iso_2022.ctext_extended_segment_len)
393 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
394 ((coding)->spec.iso_2022.embedded_utf_8)
396 /* Control characters of ISO2022. */
397 /* code */ /* function */
398 #define ISO_CODE_LF 0x0A /* line-feed */
399 #define ISO_CODE_CR 0x0D /* carriage-return */
400 #define ISO_CODE_SO 0x0E /* shift-out */
401 #define ISO_CODE_SI 0x0F /* shift-in */
402 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
403 #define ISO_CODE_ESC 0x1B /* escape */
404 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
405 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
406 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
408 /* All code (1-byte) of ISO2022 is classified into one of the
410 enum iso_code_class_type
412 ISO_control_0
, /* Control codes in the range
413 0x00..0x1F and 0x7F, except for the
414 following 5 codes. */
415 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
416 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
417 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
418 ISO_escape
, /* ISO_CODE_SO (0x1B) */
419 ISO_control_1
, /* Control codes in the range
420 0x80..0x9F, except for the
421 following 3 codes. */
422 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
423 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
424 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
425 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
426 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
427 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
428 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
431 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
432 `iso-flags' attribute of an iso2022 coding system. */
434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
435 instead of the correct short-form sequence (e.g. ESC $ A). */
436 #define CODING_ISO_FLAG_LONG_FORM 0x0001
438 /* If set, reset graphic planes and registers at end-of-line to the
440 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
442 /* If set, reset graphic planes and registers before any control
443 characters to the initial state. */
444 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
446 /* If set, encode by 7-bit environment. */
447 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
449 /* If set, use locking-shift function. */
450 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
452 /* If set, use single-shift function. Overwrite
453 CODING_ISO_FLAG_LOCKING_SHIFT. */
454 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
456 /* If set, use designation escape sequence. */
457 #define CODING_ISO_FLAG_DESIGNATION 0x0040
459 /* If set, produce revision number sequence. */
460 #define CODING_ISO_FLAG_REVISION 0x0080
462 /* If set, produce ISO6429's direction specifying sequence. */
463 #define CODING_ISO_FLAG_DIRECTION 0x0100
465 /* If set, assume designation states are reset at beginning of line on
467 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
469 /* If set, designation sequence should be placed at beginning of line
471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
473 /* If set, do not encode unsafe characters on output. */
474 #define CODING_ISO_FLAG_SAFE 0x0800
476 /* If set, extra latin codes (128..159) are accepted as a valid code
478 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
480 #define CODING_ISO_FLAG_COMPOSITION 0x2000
482 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
484 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
486 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
488 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
490 /* A character to be produced on output if encoding of the original
491 character is prohibited by CODING_ISO_FLAG_SAFE. */
492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
495 #define CODING_UTF_8_BOM(coding) \
496 ((coding)->spec.utf_8_bom)
499 #define CODING_UTF_16_BOM(coding) \
500 ((coding)->spec.utf_16.bom)
502 #define CODING_UTF_16_ENDIAN(coding) \
503 ((coding)->spec.utf_16.endian)
505 #define CODING_UTF_16_SURROGATE(coding) \
506 ((coding)->spec.utf_16.surrogate)
510 #define CODING_CCL_DECODER(coding) \
511 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
512 #define CODING_CCL_ENCODER(coding) \
513 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
514 #define CODING_CCL_VALIDS(coding) \
515 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
517 /* Index for each coding category in `coding_categories' */
521 coding_category_iso_7
,
522 coding_category_iso_7_tight
,
523 coding_category_iso_8_1
,
524 coding_category_iso_8_2
,
525 coding_category_iso_7_else
,
526 coding_category_iso_8_else
,
527 coding_category_utf_8_auto
,
528 coding_category_utf_8_nosig
,
529 coding_category_utf_8_sig
,
530 coding_category_utf_16_auto
,
531 coding_category_utf_16_be
,
532 coding_category_utf_16_le
,
533 coding_category_utf_16_be_nosig
,
534 coding_category_utf_16_le_nosig
,
535 coding_category_charset
,
536 coding_category_sjis
,
537 coding_category_big5
,
539 coding_category_emacs_mule
,
540 /* All above are targets of code detection. */
541 coding_category_raw_text
,
542 coding_category_undecided
,
546 /* Definitions of flag bits used in detect_coding_XXXX. */
547 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
548 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
549 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
550 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
551 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
552 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
553 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
554 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
555 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
556 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
557 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
558 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
559 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
560 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
561 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
562 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
563 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
564 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
565 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
566 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
568 /* This value is returned if detect_coding_mask () find nothing other
569 than ASCII characters. */
570 #define CATEGORY_MASK_ANY \
571 (CATEGORY_MASK_ISO_7 \
572 | CATEGORY_MASK_ISO_7_TIGHT \
573 | CATEGORY_MASK_ISO_8_1 \
574 | CATEGORY_MASK_ISO_8_2 \
575 | CATEGORY_MASK_ISO_7_ELSE \
576 | CATEGORY_MASK_ISO_8_ELSE \
577 | CATEGORY_MASK_UTF_8_AUTO \
578 | CATEGORY_MASK_UTF_8_NOSIG \
579 | CATEGORY_MASK_UTF_8_SIG \
580 | CATEGORY_MASK_UTF_16_AUTO \
581 | CATEGORY_MASK_UTF_16_BE \
582 | CATEGORY_MASK_UTF_16_LE \
583 | CATEGORY_MASK_UTF_16_BE_NOSIG \
584 | CATEGORY_MASK_UTF_16_LE_NOSIG \
585 | CATEGORY_MASK_CHARSET \
586 | CATEGORY_MASK_SJIS \
587 | CATEGORY_MASK_BIG5 \
588 | CATEGORY_MASK_CCL \
589 | CATEGORY_MASK_EMACS_MULE)
592 #define CATEGORY_MASK_ISO_7BIT \
593 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
595 #define CATEGORY_MASK_ISO_8BIT \
596 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
598 #define CATEGORY_MASK_ISO_ELSE \
599 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
601 #define CATEGORY_MASK_ISO_ESCAPE \
602 (CATEGORY_MASK_ISO_7 \
603 | CATEGORY_MASK_ISO_7_TIGHT \
604 | CATEGORY_MASK_ISO_7_ELSE \
605 | CATEGORY_MASK_ISO_8_ELSE)
607 #define CATEGORY_MASK_ISO \
608 ( CATEGORY_MASK_ISO_7BIT \
609 | CATEGORY_MASK_ISO_8BIT \
610 | CATEGORY_MASK_ISO_ELSE)
612 #define CATEGORY_MASK_UTF_16 \
613 (CATEGORY_MASK_UTF_16_AUTO \
614 | CATEGORY_MASK_UTF_16_BE \
615 | CATEGORY_MASK_UTF_16_LE \
616 | CATEGORY_MASK_UTF_16_BE_NOSIG \
617 | CATEGORY_MASK_UTF_16_LE_NOSIG)
619 #define CATEGORY_MASK_UTF_8 \
620 (CATEGORY_MASK_UTF_8_AUTO \
621 | CATEGORY_MASK_UTF_8_NOSIG \
622 | CATEGORY_MASK_UTF_8_SIG)
624 /* Table of coding categories (Lisp symbols). This variable is for
625 internal use only. */
626 static Lisp_Object Vcoding_category_table
;
628 /* Table of coding-categories ordered by priority. */
629 static enum coding_category coding_priorities
[coding_category_max
];
631 /* Nth element is a coding context for the coding system bound to the
632 Nth coding category. */
633 static struct coding_system coding_categories
[coding_category_max
];
635 /*** Commonly used macros and functions ***/
638 #define min(a, b) ((a) < (b) ? (a) : (b))
641 #define max(a, b) ((a) > (b) ? (a) : (b))
644 #define CODING_GET_INFO(coding, attrs, charset_list) \
646 (attrs) = CODING_ID_ATTRS ((coding)->id); \
647 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
651 /* Safely get one byte from the source text pointed by SRC which ends
652 at SRC_END, and set C to that byte. If there are not enough bytes
653 in the source, it jumps to `no_more_source'. If multibytep is
654 nonzero, and a multibyte character is found at SRC, set C to the
655 negative value of the character code. The caller should declare
656 and set these variables appropriately in advance:
657 src, src_end, multibytep */
659 #define ONE_MORE_BYTE(c) \
661 if (src == src_end) \
663 if (src_base < src) \
664 record_conversion_result \
665 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
666 goto no_more_source; \
669 if (multibytep && (c & 0x80)) \
671 if ((c & 0xFE) == 0xC0) \
672 c = ((c & 1) << 6) | *src++; \
676 c = - string_char (src, &src, NULL); \
677 record_conversion_result \
678 (coding, CODING_RESULT_INVALID_SRC); \
684 /* Safely get two bytes from the source text pointed by SRC which ends
685 at SRC_END, and set C1 and C2 to those bytes while skipping the
686 heading multibyte characters. If there are not enough bytes in the
687 source, it jumps to `no_more_source'. If multibytep is nonzero and
688 a multibyte character is found for C2, set C2 to the negative value
689 of the character code. The caller should declare and set these
690 variables appropriately in advance:
691 src, src_end, multibytep
692 It is intended that this macro is used in detect_coding_utf_16. */
694 #define TWO_MORE_BYTES(c1, c2) \
697 if (src == src_end) \
698 goto no_more_source; \
700 if (multibytep && (c1 & 0x80)) \
702 if ((c1 & 0xFE) == 0xC0) \
703 c1 = ((c1 & 1) << 6) | *src++; \
706 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
711 if (src == src_end) \
712 goto no_more_source; \
714 if (multibytep && (c2 & 0x80)) \
716 if ((c2 & 0xFE) == 0xC0) \
717 c2 = ((c2 & 1) << 6) | *src++; \
724 #define ONE_MORE_BYTE_NO_CHECK(c) \
727 if (multibytep && (c & 0x80)) \
729 if ((c & 0xFE) == 0xC0) \
730 c = ((c & 1) << 6) | *src++; \
734 c = - string_char (src, &src, NULL); \
735 record_conversion_result \
736 (coding, CODING_RESULT_INVALID_SRC); \
743 /* Store a byte C in the place pointed by DST and increment DST to the
744 next free point, and increment PRODUCED_CHARS. The caller should
745 assure that C is 0..127, and declare and set the variable `dst'
746 appropriately in advance.
750 #define EMIT_ONE_ASCII_BYTE(c) \
757 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
759 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
761 produced_chars += 2; \
762 *dst++ = (c1), *dst++ = (c2); \
766 /* Store a byte C in the place pointed by DST and increment DST to the
767 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
768 nonzero, store in an appropriate multibyte from. The caller should
769 declare and set the variables `dst' and `multibytep' appropriately
772 #define EMIT_ONE_BYTE(c) \
779 ch = BYTE8_TO_CHAR (ch); \
780 CHAR_STRING_ADVANCE (ch, dst); \
787 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
789 #define EMIT_TWO_BYTES(c1, c2) \
791 produced_chars += 2; \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
802 ch = BYTE8_TO_CHAR (ch); \
803 CHAR_STRING_ADVANCE (ch, dst); \
813 #define EMIT_THREE_BYTES(c1, c2, c3) \
815 EMIT_ONE_BYTE (c1); \
816 EMIT_TWO_BYTES (c2, c3); \
820 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
822 EMIT_TWO_BYTES (c1, c2); \
823 EMIT_TWO_BYTES (c3, c4); \
827 /* Prototypes for static functions. */
828 static void record_conversion_result (struct coding_system
*coding
,
829 enum coding_result_code result
);
830 static int detect_coding_utf_8 (struct coding_system
*,
831 struct coding_detection_info
*info
);
832 static void decode_coding_utf_8 (struct coding_system
*);
833 static int encode_coding_utf_8 (struct coding_system
*);
835 static int detect_coding_utf_16 (struct coding_system
*,
836 struct coding_detection_info
*info
);
837 static void decode_coding_utf_16 (struct coding_system
*);
838 static int encode_coding_utf_16 (struct coding_system
*);
840 static int detect_coding_iso_2022 (struct coding_system
*,
841 struct coding_detection_info
*info
);
842 static void decode_coding_iso_2022 (struct coding_system
*);
843 static int encode_coding_iso_2022 (struct coding_system
*);
845 static int detect_coding_emacs_mule (struct coding_system
*,
846 struct coding_detection_info
*info
);
847 static void decode_coding_emacs_mule (struct coding_system
*);
848 static int encode_coding_emacs_mule (struct coding_system
*);
850 static int detect_coding_sjis (struct coding_system
*,
851 struct coding_detection_info
*info
);
852 static void decode_coding_sjis (struct coding_system
*);
853 static int encode_coding_sjis (struct coding_system
*);
855 static int detect_coding_big5 (struct coding_system
*,
856 struct coding_detection_info
*info
);
857 static void decode_coding_big5 (struct coding_system
*);
858 static int encode_coding_big5 (struct coding_system
*);
860 static int detect_coding_ccl (struct coding_system
*,
861 struct coding_detection_info
*info
);
862 static void decode_coding_ccl (struct coding_system
*);
863 static int encode_coding_ccl (struct coding_system
*);
865 static void decode_coding_raw_text (struct coding_system
*);
866 static int encode_coding_raw_text (struct coding_system
*);
868 static void coding_set_source (struct coding_system
*);
869 static void coding_set_destination (struct coding_system
*);
870 static void coding_alloc_by_realloc (struct coding_system
*, EMACS_INT
);
871 static void coding_alloc_by_making_gap (struct coding_system
*,
872 EMACS_INT
, EMACS_INT
);
873 static unsigned char *alloc_destination (struct coding_system
*,
874 EMACS_INT
, unsigned char *);
875 static void setup_iso_safe_charsets (Lisp_Object
);
876 static unsigned char *encode_designation_at_bol (struct coding_system
*,
879 static int detect_eol (const unsigned char *,
880 EMACS_INT
, enum coding_category
);
881 static Lisp_Object
adjust_coding_eol_type (struct coding_system
*, int);
882 static void decode_eol (struct coding_system
*);
883 static Lisp_Object
get_translation_table (Lisp_Object
, int, int *);
884 static Lisp_Object
get_translation (Lisp_Object
, int *, int *);
885 static int produce_chars (struct coding_system
*, Lisp_Object
, int);
886 static INLINE
void produce_charset (struct coding_system
*, int *,
888 static void produce_annotation (struct coding_system
*, EMACS_INT
);
889 static int decode_coding (struct coding_system
*);
890 static INLINE
int *handle_composition_annotation (EMACS_INT
, EMACS_INT
,
891 struct coding_system
*,
893 static INLINE
int *handle_charset_annotation (EMACS_INT
, EMACS_INT
,
894 struct coding_system
*,
896 static void consume_chars (struct coding_system
*, Lisp_Object
, int);
897 static int encode_coding (struct coding_system
*);
898 static Lisp_Object
make_conversion_work_buffer (int);
899 static Lisp_Object
code_conversion_restore (Lisp_Object
);
900 static INLINE
int char_encodable_p (int, Lisp_Object
);
901 static Lisp_Object
make_subsidiaries (Lisp_Object
);
904 record_conversion_result (struct coding_system
*coding
,
905 enum coding_result_code result
)
907 coding
->result
= result
;
910 case CODING_RESULT_INSUFFICIENT_SRC
:
911 Vlast_code_conversion_error
= Qinsufficient_source
;
913 case CODING_RESULT_INCONSISTENT_EOL
:
914 Vlast_code_conversion_error
= Qinconsistent_eol
;
916 case CODING_RESULT_INVALID_SRC
:
917 Vlast_code_conversion_error
= Qinvalid_source
;
919 case CODING_RESULT_INTERRUPT
:
920 Vlast_code_conversion_error
= Qinterrupted
;
922 case CODING_RESULT_INSUFFICIENT_MEM
:
923 Vlast_code_conversion_error
= Qinsufficient_memory
;
925 case CODING_RESULT_INSUFFICIENT_DST
:
926 /* Don't record this error in Vlast_code_conversion_error
927 because it happens just temporarily and is resolved when the
928 whole conversion is finished. */
930 case CODING_RESULT_SUCCESS
:
933 Vlast_code_conversion_error
= intern ("Unknown error");
937 /* This wrapper macro is used to preserve validity of pointers into
938 buffer text across calls to decode_char, which could cause
939 relocation of buffers if it loads a charset map, because loading a
940 charset map allocates large structures. */
941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
943 charset_map_loaded = 0; \
944 c = DECODE_CHAR (charset, code); \
945 if (charset_map_loaded) \
947 const unsigned char *orig = coding->source; \
950 coding_set_source (coding); \
951 offset = coding->source - orig; \
953 src_base += offset; \
959 /* If there are at least BYTES length of room at dst, allocate memory
960 for coding->destination and update dst and dst_end. We don't have
961 to take care of coding->source which will be relocated. It is
962 handled by calling coding_set_source in encode_coding. */
964 #define ASSURE_DESTINATION(bytes) \
966 if (dst + (bytes) >= dst_end) \
968 int more_bytes = charbuf_end - charbuf + (bytes); \
970 dst = alloc_destination (coding, more_bytes, dst); \
971 dst_end = coding->destination + coding->dst_bytes; \
976 /* Store multibyte form of the character C in P, and advance P to the
977 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
978 never calls MAYBE_UNIFY_CHAR. */
980 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
982 if ((c) <= MAX_1_BYTE_CHAR) \
984 else if ((c) <= MAX_2_BYTE_CHAR) \
985 *(p)++ = (0xC0 | ((c) >> 6)), \
986 *(p)++ = (0x80 | ((c) & 0x3F)); \
987 else if ((c) <= MAX_3_BYTE_CHAR) \
988 *(p)++ = (0xE0 | ((c) >> 12)), \
989 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
990 *(p)++ = (0x80 | ((c) & 0x3F)); \
991 else if ((c) <= MAX_4_BYTE_CHAR) \
992 *(p)++ = (0xF0 | (c >> 18)), \
993 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
994 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
995 *(p)++ = (0x80 | (c & 0x3F)); \
996 else if ((c) <= MAX_5_BYTE_CHAR) \
998 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
999 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1000 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1001 *(p)++ = (0x80 | (c & 0x3F)); \
1003 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1007 /* Return the character code of character whose multibyte form is at
1008 P, and advance P to the end of the multibyte form. This is like
1009 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1011 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1014 : ! ((p)[0] & 0x20) \
1016 ((((p)[-2] & 0x1F) << 6) \
1017 | ((p)[-1] & 0x3F) \
1018 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1019 : ! ((p)[0] & 0x10) \
1021 ((((p)[-3] & 0x0F) << 12) \
1022 | (((p)[-2] & 0x3F) << 6) \
1023 | ((p)[-1] & 0x3F))) \
1024 : ! ((p)[0] & 0x08) \
1026 ((((p)[-4] & 0xF) << 18) \
1027 | (((p)[-3] & 0x3F) << 12) \
1028 | (((p)[-2] & 0x3F) << 6) \
1029 | ((p)[-1] & 0x3F))) \
1031 ((((p)[-4] & 0x3F) << 18) \
1032 | (((p)[-3] & 0x3F) << 12) \
1033 | (((p)[-2] & 0x3F) << 6) \
1034 | ((p)[-1] & 0x3F))))
1038 coding_set_source (struct coding_system
*coding
)
1040 if (BUFFERP (coding
->src_object
))
1042 struct buffer
*buf
= XBUFFER (coding
->src_object
);
1044 if (coding
->src_pos
< 0)
1045 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
1047 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
1049 else if (STRINGP (coding
->src_object
))
1051 coding
->source
= SDATA (coding
->src_object
) + coding
->src_pos_byte
;
1055 /* Otherwise, the source is C string and is never relocated
1056 automatically. Thus we don't have to update anything. */
1061 coding_set_destination (struct coding_system
*coding
)
1063 if (BUFFERP (coding
->dst_object
))
1065 if (coding
->src_pos
< 0)
1067 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- BEG_BYTE
;
1068 coding
->dst_bytes
= (GAP_END_ADDR
1069 - (coding
->src_bytes
- coding
->consumed
)
1070 - coding
->destination
);
1074 /* We are sure that coding->dst_pos_byte is before the gap
1076 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
1077 + coding
->dst_pos_byte
- BEG_BYTE
);
1078 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
1079 - coding
->destination
);
1084 /* Otherwise, the destination is C string and is never relocated
1085 automatically. Thus we don't have to update anything. */
1091 coding_alloc_by_realloc (struct coding_system
*coding
, EMACS_INT bytes
)
1093 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
1094 coding
->dst_bytes
+ bytes
);
1095 coding
->dst_bytes
+= bytes
;
1099 coding_alloc_by_making_gap (struct coding_system
*coding
,
1100 EMACS_INT gap_head_used
, EMACS_INT bytes
)
1102 if (EQ (coding
->src_object
, coding
->dst_object
))
1104 /* The gap may contain the produced data at the head and not-yet
1105 consumed data at the tail. To preserve those data, we at
1106 first make the gap size to zero, then increase the gap
1108 EMACS_INT add
= GAP_SIZE
;
1110 GPT
+= gap_head_used
, GPT_BYTE
+= gap_head_used
;
1111 GAP_SIZE
= 0; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
1113 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
1114 GPT
-= gap_head_used
, GPT_BYTE
-= gap_head_used
;
1118 Lisp_Object this_buffer
;
1120 this_buffer
= Fcurrent_buffer ();
1121 set_buffer_internal (XBUFFER (coding
->dst_object
));
1123 set_buffer_internal (XBUFFER (this_buffer
));
1128 static unsigned char *
1129 alloc_destination (struct coding_system
*coding
, EMACS_INT nbytes
,
1132 EMACS_INT offset
= dst
- coding
->destination
;
1134 if (BUFFERP (coding
->dst_object
))
1136 struct buffer
*buf
= XBUFFER (coding
->dst_object
);
1138 coding_alloc_by_making_gap (coding
, dst
- BUF_GPT_ADDR (buf
), nbytes
);
1141 coding_alloc_by_realloc (coding
, nbytes
);
1142 coding_set_destination (coding
);
1143 dst
= coding
->destination
+ offset
;
1147 /** Macros for annotations. */
1149 /* An annotation data is stored in the array coding->charbuf in this
1151 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1152 LENGTH is the number of elements in the annotation.
1153 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1154 NCHARS is the number of characters in the text annotated.
1156 The format of the following elements depend on ANNOTATION_MASK.
1158 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1160 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1162 NBYTES is the number of bytes specified in the header part of
1163 old-style emacs-mule encoding, or 0 for the other kind of
1166 METHOD is one of enum composition_method.
1168 Optional COMPOSITION-COMPONENTS are characters and composition
1171 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1174 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1175 recover from an invalid annotation, and should be skipped by
1176 produce_annotation. */
1178 /* Maximum length of the header of annotation data. */
1179 #define MAX_ANNOTATION_LENGTH 5
1181 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1183 *(buf)++ = -(len); \
1184 *(buf)++ = (mask); \
1185 *(buf)++ = (nchars); \
1186 coding->annotated = 1; \
1189 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1191 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1197 #define ADD_CHARSET_DATA(buf, nchars, id) \
1199 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1204 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1211 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1212 Check if a text is encoded in UTF-8. If it is, return 1, else
1215 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1216 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1217 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1218 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1219 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1220 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1222 #define UTF_BOM 0xFEFF
1223 #define UTF_8_BOM_1 0xEF
1224 #define UTF_8_BOM_2 0xBB
1225 #define UTF_8_BOM_3 0xBF
1228 detect_coding_utf_8 (struct coding_system
*coding
,
1229 struct coding_detection_info
*detect_info
)
1231 const unsigned char *src
= coding
->source
, *src_base
;
1232 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1233 int multibytep
= coding
->src_multibyte
;
1234 int consumed_chars
= 0;
1238 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1239 /* A coding system of this category is always ASCII compatible. */
1240 src
+= coding
->head_ascii
;
1244 int c
, c1
, c2
, c3
, c4
;
1248 if (c
< 0 || UTF_8_1_OCTET_P (c
))
1251 if (c1
< 0 || ! UTF_8_EXTRA_OCTET_P (c1
))
1253 if (UTF_8_2_OCTET_LEADING_P (c
))
1259 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1261 if (UTF_8_3_OCTET_LEADING_P (c
))
1264 if (src_base
== coding
->source
1265 && c
== UTF_8_BOM_1
&& c1
== UTF_8_BOM_2
&& c2
== UTF_8_BOM_3
)
1270 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1272 if (UTF_8_4_OCTET_LEADING_P (c
))
1278 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1280 if (UTF_8_5_OCTET_LEADING_P (c
))
1287 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1291 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1293 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1298 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1299 detect_info
->found
|= CATEGORY_MASK_UTF_8_SIG
| CATEGORY_MASK_UTF_8_NOSIG
;
1303 detect_info
->rejected
|= CATEGORY_MASK_UTF_8_SIG
;
1305 detect_info
->found
|= CATEGORY_MASK_UTF_8_NOSIG
;
1312 decode_coding_utf_8 (struct coding_system
*coding
)
1314 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1315 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1316 const unsigned char *src_base
;
1317 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1318 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
1319 int consumed_chars
= 0, consumed_chars_base
= 0;
1320 int multibytep
= coding
->src_multibyte
;
1321 enum utf_bom_type bom
= CODING_UTF_8_BOM (coding
);
1322 Lisp_Object attr
, charset_list
;
1324 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1325 int byte_after_cr
= -1;
1327 CODING_GET_INFO (coding
, attr
, charset_list
);
1329 if (bom
!= utf_without_bom
)
1335 if (! UTF_8_3_OCTET_LEADING_P (c1
))
1340 if (! UTF_8_EXTRA_OCTET_P (c2
))
1345 if (! UTF_8_EXTRA_OCTET_P (c3
))
1349 if ((c1
!= UTF_8_BOM_1
)
1350 || (c2
!= UTF_8_BOM_2
) || (c3
!= UTF_8_BOM_3
))
1353 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1358 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1362 int c
, c1
, c2
, c3
, c4
, c5
;
1365 consumed_chars_base
= consumed_chars
;
1367 if (charbuf
>= charbuf_end
)
1369 if (byte_after_cr
>= 0)
1374 if (byte_after_cr
>= 0)
1375 c1
= byte_after_cr
, byte_after_cr
= -1;
1382 else if (UTF_8_1_OCTET_P (c1
))
1384 if (eol_crlf
&& c1
== '\r')
1385 ONE_MORE_BYTE (byte_after_cr
);
1391 if (c2
< 0 || ! UTF_8_EXTRA_OCTET_P (c2
))
1393 if (UTF_8_2_OCTET_LEADING_P (c1
))
1395 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1396 /* Reject overlong sequences here and below. Encoders
1397 producing them are incorrect, they can be misleading,
1398 and they mess up read/write invariance. */
1405 if (c3
< 0 || ! UTF_8_EXTRA_OCTET_P (c3
))
1407 if (UTF_8_3_OCTET_LEADING_P (c1
))
1409 c
= (((c1
& 0xF) << 12)
1410 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1412 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1418 if (c4
< 0 || ! UTF_8_EXTRA_OCTET_P (c4
))
1420 if (UTF_8_4_OCTET_LEADING_P (c1
))
1422 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1423 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1430 if (c5
< 0 || ! UTF_8_EXTRA_OCTET_P (c5
))
1432 if (UTF_8_5_OCTET_LEADING_P (c1
))
1434 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1435 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1437 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1452 consumed_chars
= consumed_chars_base
;
1454 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1459 coding
->consumed_char
+= consumed_chars_base
;
1460 coding
->consumed
= src_base
- coding
->source
;
1461 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1466 encode_coding_utf_8 (struct coding_system
*coding
)
1468 int multibytep
= coding
->dst_multibyte
;
1469 int *charbuf
= coding
->charbuf
;
1470 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1471 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1472 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1473 int produced_chars
= 0;
1476 if (CODING_UTF_8_BOM (coding
) == utf_with_bom
)
1478 ASSURE_DESTINATION (3);
1479 EMIT_THREE_BYTES (UTF_8_BOM_1
, UTF_8_BOM_2
, UTF_8_BOM_3
);
1480 CODING_UTF_8_BOM (coding
) = utf_without_bom
;
1485 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1487 while (charbuf
< charbuf_end
)
1489 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1491 ASSURE_DESTINATION (safe_room
);
1493 if (CHAR_BYTE8_P (c
))
1495 c
= CHAR_TO_BYTE8 (c
);
1500 CHAR_STRING_ADVANCE_NO_UNIFY (c
, pend
);
1501 for (p
= str
; p
< pend
; p
++)
1508 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1510 while (charbuf
< charbuf_end
)
1512 ASSURE_DESTINATION (safe_room
);
1514 if (CHAR_BYTE8_P (c
))
1515 *dst
++ = CHAR_TO_BYTE8 (c
);
1517 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
1521 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1522 coding
->produced_char
+= produced_chars
;
1523 coding
->produced
= dst
- coding
->destination
;
1528 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1529 Check if a text is encoded in one of UTF-16 based coding systems.
1530 If it is, return 1, else return 0. */
1532 #define UTF_16_HIGH_SURROGATE_P(val) \
1533 (((val) & 0xFC00) == 0xD800)
1535 #define UTF_16_LOW_SURROGATE_P(val) \
1536 (((val) & 0xFC00) == 0xDC00)
1538 #define UTF_16_INVALID_P(val) \
1539 (((val) == 0xFFFE) \
1540 || ((val) == 0xFFFF) \
1541 || UTF_16_LOW_SURROGATE_P (val))
1545 detect_coding_utf_16 (struct coding_system
*coding
,
1546 struct coding_detection_info
*detect_info
)
1548 const unsigned char *src
= coding
->source
;
1549 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1550 int multibytep
= coding
->src_multibyte
;
1553 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1554 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1555 && (coding
->src_chars
& 1))
1557 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1561 TWO_MORE_BYTES (c1
, c2
);
1562 if ((c1
== 0xFF) && (c2
== 0xFE))
1564 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1565 | CATEGORY_MASK_UTF_16_AUTO
);
1566 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_BE
1567 | CATEGORY_MASK_UTF_16_BE_NOSIG
1568 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1570 else if ((c1
== 0xFE) && (c2
== 0xFF))
1572 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1573 | CATEGORY_MASK_UTF_16_AUTO
);
1574 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_LE
1575 | CATEGORY_MASK_UTF_16_BE_NOSIG
1576 | CATEGORY_MASK_UTF_16_LE_NOSIG
);
1580 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1585 /* We check the dispersion of Eth and Oth bytes where E is even and
1586 O is odd. If both are high, we assume binary data.*/
1587 unsigned char e
[256], o
[256];
1588 unsigned e_num
= 1, o_num
= 1;
1595 detect_info
->rejected
|= (CATEGORY_MASK_UTF_16_AUTO
1596 |CATEGORY_MASK_UTF_16_BE
1597 | CATEGORY_MASK_UTF_16_LE
);
1599 while ((detect_info
->rejected
& CATEGORY_MASK_UTF_16
)
1600 != CATEGORY_MASK_UTF_16
)
1602 TWO_MORE_BYTES (c1
, c2
);
1610 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE_NOSIG
;
1617 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE_NOSIG
;
1628 decode_coding_utf_16 (struct coding_system
*coding
)
1630 const unsigned char *src
= coding
->source
+ coding
->consumed
;
1631 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1632 const unsigned char *src_base
;
1633 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
1634 /* We may produces at most 3 chars in one loop. */
1635 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
- 2;
1636 int consumed_chars
= 0, consumed_chars_base
= 0;
1637 int multibytep
= coding
->src_multibyte
;
1638 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1639 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1640 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1641 Lisp_Object attr
, charset_list
;
1643 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
1644 int byte_after_cr1
= -1, byte_after_cr2
= -1;
1646 CODING_GET_INFO (coding
, attr
, charset_list
);
1648 if (bom
== utf_with_bom
)
1657 if (endian
== utf_16_big_endian
1658 ? c
!= 0xFEFF : c
!= 0xFFFE)
1660 /* The first two bytes are not BOM. Treat them as bytes
1661 for a normal character. */
1665 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1667 else if (bom
== utf_detect_bom
)
1669 /* We have already tried to detect BOM and failed in
1671 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1679 consumed_chars_base
= consumed_chars
;
1681 if (charbuf
>= charbuf_end
)
1683 if (byte_after_cr1
>= 0)
1688 if (byte_after_cr1
>= 0)
1689 c1
= byte_after_cr1
, byte_after_cr1
= -1;
1697 if (byte_after_cr2
>= 0)
1698 c2
= byte_after_cr2
, byte_after_cr2
= -1;
1703 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
1707 c
= (endian
== utf_16_big_endian
1708 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1712 if (! UTF_16_LOW_SURROGATE_P (c
))
1714 if (endian
== utf_16_big_endian
)
1715 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1717 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1721 if (UTF_16_HIGH_SURROGATE_P (c
))
1722 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1728 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1729 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1730 *charbuf
++ = 0x10000 + c
;
1735 if (UTF_16_HIGH_SURROGATE_P (c
))
1736 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1739 if (eol_crlf
&& c
== '\r')
1741 ONE_MORE_BYTE (byte_after_cr1
);
1742 ONE_MORE_BYTE (byte_after_cr2
);
1750 coding
->consumed_char
+= consumed_chars_base
;
1751 coding
->consumed
= src_base
- coding
->source
;
1752 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1756 encode_coding_utf_16 (struct coding_system
*coding
)
1758 int multibytep
= coding
->dst_multibyte
;
1759 int *charbuf
= coding
->charbuf
;
1760 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1761 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1762 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1764 enum utf_bom_type bom
= CODING_UTF_16_BOM (coding
);
1765 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1766 int produced_chars
= 0;
1767 Lisp_Object attrs
, charset_list
;
1770 CODING_GET_INFO (coding
, attrs
, charset_list
);
1772 if (bom
!= utf_without_bom
)
1774 ASSURE_DESTINATION (safe_room
);
1776 EMIT_TWO_BYTES (0xFE, 0xFF);
1778 EMIT_TWO_BYTES (0xFF, 0xFE);
1779 CODING_UTF_16_BOM (coding
) = utf_without_bom
;
1782 while (charbuf
< charbuf_end
)
1784 ASSURE_DESTINATION (safe_room
);
1786 if (c
> MAX_UNICODE_CHAR
)
1787 c
= coding
->default_char
;
1792 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1794 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1801 c1
= (c
>> 10) + 0xD800;
1802 c2
= (c
& 0x3FF) + 0xDC00;
1804 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1806 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1809 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
1810 coding
->produced
= dst
- coding
->destination
;
1811 coding
->produced_char
+= produced_chars
;
1816 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1818 /* Emacs' internal format for representation of multiple character
1819 sets is a kind of multi-byte encoding, i.e. characters are
1820 represented by variable-length sequences of one-byte codes.
1822 ASCII characters and control characters (e.g. `tab', `newline') are
1823 represented by one-byte sequences which are their ASCII codes, in
1824 the range 0x00 through 0x7F.
1826 8-bit characters of the range 0x80..0x9F are represented by
1827 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1830 8-bit characters of the range 0xA0..0xFF are represented by
1831 one-byte sequences which are their 8-bit code.
1833 The other characters are represented by a sequence of `base
1834 leading-code', optional `extended leading-code', and one or two
1835 `position-code's. The length of the sequence is determined by the
1836 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1837 whereas extended leading-code and position-code take the range 0xA0
1838 through 0xFF. See `charset.h' for more details about leading-code
1841 --- CODE RANGE of Emacs' internal format ---
1845 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846 eight-bit-graphic 0xA0..0xBF
1847 ELSE 0x81..0x9D + [0xA0..0xFF]+
1848 ---------------------------------------------
1850 As this is the internal character representation, the format is
1851 usually not used externally (i.e. in a file or in a data sent to a
1852 process). But, it is possible to have a text externally in this
1853 format (i.e. by encoding by the coding system `emacs-mule').
1855 In that case, a sequence of one-byte codes has a slightly different
1858 At first, all characters in eight-bit-control are represented by
1859 one-byte sequences which are their 8-bit code.
1861 Next, character composition data are represented by the byte
1862 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1864 METHOD is 0xF2 plus one of composition method (enum
1865 composition_method),
1867 BYTES is 0xA0 plus a byte length of this composition data,
1869 CHARS is 0xA0 plus a number of characters composed by this
1872 COMPONENTs are characters of multibyte form or composition
1873 rules encoded by two-byte of ASCII codes.
1875 In addition, for backward compatibility, the following formats are
1876 also recognized as composition data on decoding.
1879 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1882 MSEQ is a multibyte form but in these special format:
1883 ASCII: 0xA0 ASCII_CODE+0x80,
1884 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885 RULE is a one byte code of the range 0xA0..0xF0 that
1886 represents a composition rule.
1889 char emacs_mule_bytes
[256];
1892 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1893 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1897 detect_coding_emacs_mule (struct coding_system
*coding
,
1898 struct coding_detection_info
*detect_info
)
1900 const unsigned char *src
= coding
->source
, *src_base
;
1901 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1902 int multibytep
= coding
->src_multibyte
;
1903 int consumed_chars
= 0;
1907 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1908 /* A coding system of this category is always ASCII compatible. */
1909 src
+= coding
->head_ascii
;
1919 /* Perhaps the start of composite character. We simply skip
1920 it because analyzing it is too heavy for detecting. But,
1921 at least, we check that the composite character
1922 constitutes of more than 4 bytes. */
1923 const unsigned char *src_base
;
1933 if (src
- src_base
<= 4)
1935 found
= CATEGORY_MASK_EMACS_MULE
;
1943 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1948 int more_bytes
= emacs_mule_bytes
[c
] - 1;
1950 while (more_bytes
> 0)
1955 src
--; /* Unread the last byte. */
1960 if (more_bytes
!= 0)
1962 found
= CATEGORY_MASK_EMACS_MULE
;
1965 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1969 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1971 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1974 detect_info
->found
|= found
;
1979 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1980 character. If CMP_STATUS indicates that we must expect MSEQ or
1981 RULE described above, decode it and return the negative value of
1982 the decoded character or rule. If an invalid byte is found, return
1983 -1. If SRC is too short, return -2. */
1986 emacs_mule_char (struct coding_system
*coding
, const unsigned char *src
,
1987 int *nbytes
, int *nchars
, int *id
,
1988 struct composition_status
*cmp_status
)
1990 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1991 const unsigned char *src_base
= src
;
1992 int multibytep
= coding
->src_multibyte
;
1996 int consumed_chars
= 0;
2003 charset_id
= emacs_mule_charset
[0];
2009 if (cmp_status
->state
!= COMPOSING_NO
2010 && cmp_status
->old_form
)
2012 if (cmp_status
->state
== COMPOSING_CHAR
)
2027 *nbytes
= src
- src_base
;
2028 *nchars
= consumed_chars
;
2036 switch (emacs_mule_bytes
[c
])
2039 if ((charset_id
= emacs_mule_charset
[c
]) < 0)
2048 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
2049 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
2052 if (c
< 0xA0 || (charset_id
= emacs_mule_charset
[c
]) < 0)
2061 if ((charset_id
= emacs_mule_charset
[c
]) < 0)
2066 code
= (c
& 0x7F) << 8;
2076 if (c
< 0 || (charset_id
= emacs_mule_charset
[c
]) < 0)
2081 code
= (c
& 0x7F) << 8;
2090 charset_id
= ASCII_BYTE_P (code
) ? charset_ascii
: charset_eight_bit
;
2096 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
2097 CHARSET_FROM_ID (charset_id
), code
, c
);
2101 *nbytes
= src
- src_base
;
2102 *nchars
= consumed_chars
;
2105 return (mseq_found
? -c
: c
);
2115 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2117 /* Handle these composition sequence ('|': the end of header elements,
2118 BYTES and CHARS >= 0xA0):
2120 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2121 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2122 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2126 (4) relative composition: 0x80 | MSEQ ... MSEQ
2127 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2129 When the starter 0x80 and the following header elements are found,
2130 this annotation header is produced.
2132 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2134 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2135 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2137 Then, upon reading the following elements, these codes are produced
2138 until the composition end is found:
2141 (2) ALT ... ALT CHAR ... CHAR
2142 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2144 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2146 When the composition end is found, LENGTH and NCHARS in the
2147 annotation header is updated as below:
2149 (1) LENGTH: unchanged, NCHARS: unchanged
2150 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2151 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2152 (4) LENGTH: unchanged, NCHARS: number of CHARs
2153 (5) LENGTH: unchanged, NCHARS: number of CHARs
2155 If an error is found while composing, the annotation header is
2156 changed to the original composition header (plus filler -1s) as
2159 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2160 (5) [ 0x80 0xFF -1 -1- -1 ]
2162 and the sequence [ -2 DECODED-RULE ] is changed to the original
2163 byte sequence as below:
2164 o the original byte sequence is B: [ B -1 ]
2165 o the original byte sequence is B1 B2: [ B1 B2 ]
2167 Most of the routines are implemented by macros because many
2168 variables and labels in the caller decode_coding_emacs_mule must be
2169 accessible, and they are usually called just once (thus doesn't
2170 increase the size of compiled object). */
2172 /* Decode a composition rule represented by C as a component of
2173 composition sequence of Emacs 20 style. Set RULE to the decoded
2176 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2181 if (c < 0 || c >= 81) \
2182 goto invalid_code; \
2183 gref = c / 9, nref = c % 9; \
2184 if (gref == 4) gref = 10; \
2185 if (nref == 4) nref = 10; \
2186 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2190 /* Decode a composition rule represented by C and the following byte
2191 at SRC as a component of composition sequence of Emacs 21 style.
2192 Set RULE to the decoded rule. */
2194 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2199 if (gref < 0 || gref >= 81) \
2200 goto invalid_code; \
2201 ONE_MORE_BYTE (c); \
2203 if (nref < 0 || nref >= 81) \
2204 goto invalid_code; \
2205 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2209 /* Start of Emacs 21 style format. The first three bytes at SRC are
2210 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2211 byte length of this composition information, CHARS is the number of
2212 characters composed by this composition. */
2214 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2216 enum composition_method method = c - 0xF2; \
2217 int nbytes, nchars; \
2219 ONE_MORE_BYTE (c); \
2221 goto invalid_code; \
2222 nbytes = c - 0xA0; \
2223 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2224 goto invalid_code; \
2225 ONE_MORE_BYTE (c); \
2226 nchars = c - 0xA0; \
2227 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2228 goto invalid_code; \
2229 cmp_status->old_form = 0; \
2230 cmp_status->method = method; \
2231 if (method == COMPOSITION_RELATIVE) \
2232 cmp_status->state = COMPOSING_CHAR; \
2234 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2235 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2236 cmp_status->nchars = nchars; \
2237 cmp_status->ncomps = nbytes - 4; \
2238 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2242 /* Start of Emacs 20 style format for relative composition. */
2244 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2246 cmp_status->old_form = 1; \
2247 cmp_status->method = COMPOSITION_RELATIVE; \
2248 cmp_status->state = COMPOSING_CHAR; \
2249 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2250 cmp_status->nchars = cmp_status->ncomps = 0; \
2251 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2255 /* Start of Emacs 20 style format for rule-base composition. */
2257 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2259 cmp_status->old_form = 1; \
2260 cmp_status->method = COMPOSITION_WITH_RULE; \
2261 cmp_status->state = COMPOSING_CHAR; \
2262 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2263 cmp_status->nchars = cmp_status->ncomps = 0; \
2264 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2268 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2270 const unsigned char *current_src = src; \
2272 ONE_MORE_BYTE (c); \
2274 goto invalid_code; \
2275 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2276 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2277 DECODE_EMACS_MULE_21_COMPOSITION (); \
2278 else if (c < 0xA0) \
2279 goto invalid_code; \
2280 else if (c < 0xC0) \
2282 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2283 /* Re-read C as a composition component. */ \
2284 src = current_src; \
2286 else if (c == 0xFF) \
2287 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2289 goto invalid_code; \
2292 #define EMACS_MULE_COMPOSITION_END() \
2294 int idx = - cmp_status->length; \
2296 if (cmp_status->old_form) \
2297 charbuf[idx + 2] = cmp_status->nchars; \
2298 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2299 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2300 cmp_status->state = COMPOSING_NO; \
2305 emacs_mule_finish_composition (int *charbuf
,
2306 struct composition_status
*cmp_status
)
2308 int idx
= - cmp_status
->length
;
2311 if (cmp_status
->old_form
&& cmp_status
->nchars
> 0)
2313 charbuf
[idx
+ 2] = cmp_status
->nchars
;
2315 if (cmp_status
->method
== COMPOSITION_WITH_RULE
2316 && cmp_status
->state
== COMPOSING_CHAR
)
2318 /* The last rule was invalid. */
2319 int rule
= charbuf
[-1] + 0xA0;
2321 charbuf
[-2] = BYTE8_TO_CHAR (rule
);
2328 charbuf
[idx
++] = BYTE8_TO_CHAR (0x80);
2330 if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2332 charbuf
[idx
++] = BYTE8_TO_CHAR (0xFF);
2333 charbuf
[idx
++] = -3;
2339 int nchars
= charbuf
[idx
+ 1] + 0xA0;
2340 int nbytes
= charbuf
[idx
+ 2] + 0xA0;
2342 charbuf
[idx
++] = BYTE8_TO_CHAR (0xF2 + cmp_status
->method
);
2343 charbuf
[idx
++] = BYTE8_TO_CHAR (nbytes
);
2344 charbuf
[idx
++] = BYTE8_TO_CHAR (nchars
);
2345 charbuf
[idx
++] = -1;
2349 cmp_status
->state
= COMPOSING_NO
;
2353 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2355 if (cmp_status->state != COMPOSING_NO) \
2356 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2361 decode_coding_emacs_mule (struct coding_system
*coding
)
2363 const unsigned char *src
= coding
->source
+ coding
->consumed
;
2364 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2365 const unsigned char *src_base
;
2366 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
2367 /* We may produce two annotations (charset and composition) in one
2368 loop and one more charset annotation at the end. */
2370 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3);
2371 int consumed_chars
= 0, consumed_chars_base
;
2372 int multibytep
= coding
->src_multibyte
;
2373 Lisp_Object attrs
, charset_list
;
2374 int char_offset
= coding
->produced_char
;
2375 int last_offset
= char_offset
;
2376 int last_id
= charset_ascii
;
2378 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
2379 int byte_after_cr
= -1;
2380 struct composition_status
*cmp_status
= &coding
->spec
.emacs_mule
.cmp_status
;
2382 CODING_GET_INFO (coding
, attrs
, charset_list
);
2384 if (cmp_status
->state
!= COMPOSING_NO
)
2388 for (i
= 0; i
< cmp_status
->length
; i
++)
2389 *charbuf
++ = cmp_status
->carryover
[i
];
2390 coding
->annotated
= 1;
2398 consumed_chars_base
= consumed_chars
;
2400 if (charbuf
>= charbuf_end
)
2402 if (byte_after_cr
>= 0)
2407 if (byte_after_cr
>= 0)
2408 c
= byte_after_cr
, byte_after_cr
= -1;
2412 if (c
< 0 || c
== 0x80)
2414 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2421 DECODE_EMACS_MULE_COMPOSITION_START ();
2427 if (eol_crlf
&& c
== '\r')
2428 ONE_MORE_BYTE (byte_after_cr
);
2430 if (cmp_status
->state
!= COMPOSING_NO
)
2432 if (cmp_status
->old_form
)
2433 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2434 else if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2435 cmp_status
->ncomps
--;
2441 /* emacs_mule_char can load a charset map from a file, which
2442 allocates a large structure and might cause buffer text
2443 to be relocated as result. Thus, we need to remember the
2444 original pointer to buffer text, and fix up all related
2445 pointers after the call. */
2446 const unsigned char *orig
= coding
->source
;
2449 c
= emacs_mule_char (coding
, src_base
, &nbytes
, &nchars
, &id
,
2451 offset
= coding
->source
- orig
;
2465 src
= src_base
+ nbytes
;
2466 consumed_chars
= consumed_chars_base
+ nchars
;
2467 if (cmp_status
->state
>= COMPOSING_COMPONENT_CHAR
)
2468 cmp_status
->ncomps
-= nchars
;
2471 /* Now if C >= 0, we found a normally encoded character, if C <
2472 0, we found an old-style composition component character or
2475 if (cmp_status
->state
== COMPOSING_NO
)
2479 if (last_id
!= charset_ascii
)
2480 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
,
2483 last_offset
= char_offset
;
2488 else if (cmp_status
->state
== COMPOSING_CHAR
)
2490 if (cmp_status
->old_form
)
2494 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2501 cmp_status
->nchars
++;
2502 cmp_status
->length
++;
2503 if (cmp_status
->nchars
== MAX_COMPOSITION_COMPONENTS
)
2504 EMACS_MULE_COMPOSITION_END ();
2505 else if (cmp_status
->method
== COMPOSITION_WITH_RULE
)
2506 cmp_status
->state
= COMPOSING_RULE
;
2512 cmp_status
->length
++;
2513 cmp_status
->nchars
--;
2514 if (cmp_status
->nchars
== 0)
2515 EMACS_MULE_COMPOSITION_END ();
2518 else if (cmp_status
->state
== COMPOSING_RULE
)
2524 EMACS_MULE_COMPOSITION_END ();
2531 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c
, rule
);
2536 cmp_status
->length
+= 2;
2537 cmp_status
->state
= COMPOSING_CHAR
;
2540 else if (cmp_status
->state
== COMPOSING_COMPONENT_CHAR
)
2543 cmp_status
->length
++;
2544 if (cmp_status
->ncomps
== 0)
2545 cmp_status
->state
= COMPOSING_CHAR
;
2546 else if (cmp_status
->ncomps
> 0)
2548 if (cmp_status
->method
== COMPOSITION_WITH_RULE_ALTCHARS
)
2549 cmp_status
->state
= COMPOSING_COMPONENT_RULE
;
2552 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2554 else /* COMPOSING_COMPONENT_RULE */
2558 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c
, rule
);
2563 cmp_status
->length
+= 2;
2564 cmp_status
->ncomps
--;
2565 if (cmp_status
->ncomps
> 0)
2566 cmp_status
->state
= COMPOSING_COMPONENT_CHAR
;
2568 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2573 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2575 consumed_chars
= consumed_chars_base
;
2577 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2583 if (cmp_status
->state
!= COMPOSING_NO
)
2585 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2586 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2591 charbuf
-= cmp_status
->length
;
2592 for (i
= 0; i
< cmp_status
->length
; i
++)
2593 cmp_status
->carryover
[i
] = charbuf
[i
];
2596 if (last_id
!= charset_ascii
)
2597 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
2598 coding
->consumed_char
+= consumed_chars_base
;
2599 coding
->consumed
= src_base
- coding
->source
;
2600 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2604 #define EMACS_MULE_LEADING_CODES(id, codes) \
2607 codes[0] = id, codes[1] = 0; \
2608 else if (id < 0xE0) \
2609 codes[0] = 0x9A, codes[1] = id; \
2610 else if (id < 0xF0) \
2611 codes[0] = 0x9B, codes[1] = id; \
2612 else if (id < 0xF5) \
2613 codes[0] = 0x9C, codes[1] = id; \
2615 codes[0] = 0x9D, codes[1] = id; \
2620 encode_coding_emacs_mule (struct coding_system
*coding
)
2622 int multibytep
= coding
->dst_multibyte
;
2623 int *charbuf
= coding
->charbuf
;
2624 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2625 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2626 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2628 int produced_chars
= 0;
2629 Lisp_Object attrs
, charset_list
;
2631 int preferred_charset_id
= -1;
2633 CODING_GET_INFO (coding
, attrs
, charset_list
);
2634 if (! EQ (charset_list
, Vemacs_mule_charset_list
))
2636 CODING_ATTR_CHARSET_LIST (attrs
)
2637 = charset_list
= Vemacs_mule_charset_list
;
2640 while (charbuf
< charbuf_end
)
2642 ASSURE_DESTINATION (safe_room
);
2647 /* Handle an annotation. */
2650 case CODING_ANNOTATE_COMPOSITION_MASK
:
2651 /* Not yet implemented. */
2653 case CODING_ANNOTATE_CHARSET_MASK
:
2654 preferred_charset_id
= charbuf
[3];
2655 if (preferred_charset_id
>= 0
2656 && NILP (Fmemq (make_number (preferred_charset_id
),
2658 preferred_charset_id
= -1;
2667 if (ASCII_CHAR_P (c
))
2668 EMIT_ONE_ASCII_BYTE (c
);
2669 else if (CHAR_BYTE8_P (c
))
2671 c
= CHAR_TO_BYTE8 (c
);
2676 struct charset
*charset
;
2680 unsigned char leading_codes
[2];
2682 if (preferred_charset_id
>= 0)
2684 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2685 if (CHAR_CHARSET_P (c
, charset
))
2686 code
= ENCODE_CHAR (charset
, c
);
2688 charset
= char_charset (c
, charset_list
, &code
);
2691 charset
= char_charset (c
, charset_list
, &code
);
2694 c
= coding
->default_char
;
2695 if (ASCII_CHAR_P (c
))
2697 EMIT_ONE_ASCII_BYTE (c
);
2700 charset
= char_charset (c
, charset_list
, &code
);
2702 dimension
= CHARSET_DIMENSION (charset
);
2703 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2704 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2705 EMIT_ONE_BYTE (leading_codes
[0]);
2706 if (leading_codes
[1])
2707 EMIT_ONE_BYTE (leading_codes
[1]);
2709 EMIT_ONE_BYTE (code
| 0x80);
2713 EMIT_ONE_BYTE (code
>> 8);
2714 EMIT_ONE_BYTE (code
& 0xFF);
2718 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
2719 coding
->produced_char
+= produced_chars
;
2720 coding
->produced
= dst
- coding
->destination
;
2725 /*** 7. ISO2022 handlers ***/
2727 /* The following note describes the coding system ISO2022 briefly.
2728 Since the intention of this note is to help understand the
2729 functions in this file, some parts are NOT ACCURATE or are OVERLY
2730 SIMPLIFIED. For thorough understanding, please refer to the
2731 original document of ISO2022. This is equivalent to the standard
2732 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2734 ISO2022 provides many mechanisms to encode several character sets
2735 in 7-bit and 8-bit environments. For 7-bit environments, all text
2736 is encoded using bytes less than 128. This may make the encoded
2737 text a little bit longer, but the text passes more easily through
2738 several types of gateway, some of which strip off the MSB (Most
2741 There are two kinds of character sets: control character sets and
2742 graphic character sets. The former contain control characters such
2743 as `newline' and `escape' to provide control functions (control
2744 functions are also provided by escape sequences). The latter
2745 contain graphic characters such as 'A' and '-'. Emacs recognizes
2746 two control character sets and many graphic character sets.
2748 Graphic character sets are classified into one of the following
2749 four classes, according to the number of bytes (DIMENSION) and
2750 number of characters in one dimension (CHARS) of the set:
2751 - DIMENSION1_CHARS94
2752 - DIMENSION1_CHARS96
2753 - DIMENSION2_CHARS94
2754 - DIMENSION2_CHARS96
2756 In addition, each character set is assigned an identification tag,
2757 unique for each set, called the "final character" (denoted as <F>
2758 hereafter). The <F> of each character set is decided by ECMA(*)
2759 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2760 (0x30..0x3F are for private use only).
2762 Note (*): ECMA = European Computer Manufacturers Association
2764 Here are examples of graphic character sets [NAME(<F>)]:
2765 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2766 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2767 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2768 o DIMENSION2_CHARS96 -- none for the moment
2770 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2771 C0 [0x00..0x1F] -- control character plane 0
2772 GL [0x20..0x7F] -- graphic character plane 0
2773 C1 [0x80..0x9F] -- control character plane 1
2774 GR [0xA0..0xFF] -- graphic character plane 1
2776 A control character set is directly designated and invoked to C0 or
2777 C1 by an escape sequence. The most common case is that:
2778 - ISO646's control character set is designated/invoked to C0, and
2779 - ISO6429's control character set is designated/invoked to C1,
2780 and usually these designations/invocations are omitted in encoded
2781 text. In a 7-bit environment, only C0 can be used, and a control
2782 character for C1 is encoded by an appropriate escape sequence to
2783 fit into the environment. All control characters for C1 are
2784 defined to have corresponding escape sequences.
2786 A graphic character set is at first designated to one of four
2787 graphic registers (G0 through G3), then these graphic registers are
2788 invoked to GL or GR. These designations and invocations can be
2789 done independently. The most common case is that G0 is invoked to
2790 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2791 these invocations and designations are omitted in encoded text.
2792 In a 7-bit environment, only GL can be used.
2794 When a graphic character set of CHARS94 is invoked to GL, codes
2795 0x20 and 0x7F of the GL area work as control characters SPACE and
2796 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2799 There are two ways of invocation: locking-shift and single-shift.
2800 With locking-shift, the invocation lasts until the next different
2801 invocation, whereas with single-shift, the invocation affects the
2802 following character only and doesn't affect the locking-shift
2803 state. Invocations are done by the following control characters or
2806 ----------------------------------------------------------------------
2807 abbrev function cntrl escape seq description
2808 ----------------------------------------------------------------------
2809 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2810 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2811 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2812 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2813 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2814 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2815 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2816 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2817 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2818 ----------------------------------------------------------------------
2819 (*) These are not used by any known coding system.
2821 Control characters for these functions are defined by macros
2822 ISO_CODE_XXX in `coding.h'.
2824 Designations are done by the following escape sequences:
2825 ----------------------------------------------------------------------
2826 escape sequence description
2827 ----------------------------------------------------------------------
2828 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2829 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2830 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2831 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2832 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2833 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2834 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2835 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2836 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2837 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2838 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2839 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2840 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2841 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2842 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2843 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2844 ----------------------------------------------------------------------
2846 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2847 of dimension 1, chars 94, and final character <F>, etc...
2849 Note (*): Although these designations are not allowed in ISO2022,
2850 Emacs accepts them on decoding, and produces them on encoding
2851 CHARS96 character sets in a coding system which is characterized as
2852 7-bit environment, non-locking-shift, and non-single-shift.
2854 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2855 '(' must be omitted. We refer to this as "short-form" hereafter.
2857 Now you may notice that there are a lot of ways of encoding the
2858 same multilingual text in ISO2022. Actually, there exist many
2859 coding systems such as Compound Text (used in X11's inter client
2860 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2861 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2862 localized platforms), and all of these are variants of ISO2022.
2864 In addition to the above, Emacs handles two more kinds of escape
2865 sequences: ISO6429's direction specification and Emacs' private
2866 sequence for specifying character composition.
2868 ISO6429's direction specification takes the following form:
2869 o CSI ']' -- end of the current direction
2870 o CSI '0' ']' -- end of the current direction
2871 o CSI '1' ']' -- start of left-to-right text
2872 o CSI '2' ']' -- start of right-to-left text
2873 The control character CSI (0x9B: control sequence introducer) is
2874 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2876 Character composition specification takes the following form:
2877 o ESC '0' -- start relative composition
2878 o ESC '1' -- end composition
2879 o ESC '2' -- start rule-base composition (*)
2880 o ESC '3' -- start relative composition with alternate chars (**)
2881 o ESC '4' -- start rule-base composition with alternate chars (**)
2882 Since these are not standard escape sequences of any ISO standard,
2883 the use of them with these meanings is restricted to Emacs only.
2885 (*) This form is used only in Emacs 20.7 and older versions,
2886 but newer versions can safely decode it.
2887 (**) This form is used only in Emacs 21.1 and newer versions,
2888 and older versions can't decode it.
2890 Here's a list of example usages of these composition escape
2891 sequences (categorized by `enum composition_method').
2893 COMPOSITION_RELATIVE:
2894 ESC 0 CHAR [ CHAR ] ESC 1
2895 COMPOSITION_WITH_RULE:
2896 ESC 2 CHAR [ RULE CHAR ] ESC 1
2897 COMPOSITION_WITH_ALTCHARS:
2898 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2899 COMPOSITION_WITH_RULE_ALTCHARS:
2900 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2902 enum iso_code_class_type iso_code_class
[256];
2904 #define SAFE_CHARSET_P(coding, id) \
2905 ((id) <= (coding)->max_charset_id \
2906 && (coding)->safe_charsets[id] != 255)
2909 #define SHIFT_OUT_OK(category) \
2910 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2913 setup_iso_safe_charsets (Lisp_Object attrs
)
2915 Lisp_Object charset_list
, safe_charsets
;
2916 Lisp_Object request
;
2917 Lisp_Object reg_usage
;
2920 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2923 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2924 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2925 && ! EQ (charset_list
, Viso_2022_charset_list
))
2927 CODING_ATTR_CHARSET_LIST (attrs
)
2928 = charset_list
= Viso_2022_charset_list
;
2929 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2932 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2936 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2938 int id
= XINT (XCAR (tail
));
2939 if (max_charset_id
< id
)
2940 max_charset_id
= id
;
2943 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
2944 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
2945 request
= AREF (attrs
, coding_attr_iso_request
);
2946 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2947 reg94
= XINT (XCAR (reg_usage
));
2948 reg96
= XINT (XCDR (reg_usage
));
2950 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2954 struct charset
*charset
;
2957 charset
= CHARSET_FROM_ID (XINT (id
));
2958 reg
= Fcdr (Fassq (id
, request
));
2960 SSET (safe_charsets
, XINT (id
), XINT (reg
));
2961 else if (charset
->iso_chars_96
)
2964 SSET (safe_charsets
, XINT (id
), reg96
);
2969 SSET (safe_charsets
, XINT (id
), reg94
);
2972 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2976 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2977 Check if a text is encoded in one of ISO-2022 based coding systems.
2978 If it is, return 1, else return 0. */
2981 detect_coding_iso_2022 (struct coding_system
*coding
,
2982 struct coding_detection_info
*detect_info
)
2984 const unsigned char *src
= coding
->source
, *src_base
= src
;
2985 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2986 int multibytep
= coding
->src_multibyte
;
2987 int single_shifting
= 0;
2990 int consumed_chars
= 0;
2994 int composition_count
= -1;
2996 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2998 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
3000 struct coding_system
*this = &(coding_categories
[i
]);
3001 Lisp_Object attrs
, val
;
3005 attrs
= CODING_ID_ATTRS (this->id
);
3006 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3007 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Viso_2022_charset_list
))
3008 setup_iso_safe_charsets (attrs
);
3009 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
3010 this->max_charset_id
= SCHARS (val
) - 1;
3011 this->safe_charsets
= SDATA (val
);
3014 /* A coding system of this category is always ASCII compatible. */
3015 src
+= coding
->head_ascii
;
3017 while (rejected
!= CATEGORY_MASK_ISO
)
3024 if (inhibit_iso_escape_detection
)
3026 single_shifting
= 0;
3028 if (c
>= '(' && c
<= '/')
3030 /* Designation sequence for a charset of dimension 1. */
3032 if (c1
< ' ' || c1
>= 0x80
3033 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
3034 /* Invalid designation sequence. Just ignore. */
3039 /* Designation sequence for a charset of dimension 2. */
3041 if (c
>= '@' && c
<= 'B')
3042 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3043 id
= iso_charset_table
[1][0][c
];
3044 else if (c
>= '(' && c
<= '/')
3047 if (c1
< ' ' || c1
>= 0x80
3048 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
3049 /* Invalid designation sequence. Just ignore. */
3053 /* Invalid designation sequence. Just ignore it. */
3056 else if (c
== 'N' || c
== 'O')
3058 /* ESC <Fe> for SS2 or SS3. */
3059 single_shifting
= 1;
3060 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
3065 /* End of composition. */
3066 if (composition_count
< 0
3067 || composition_count
> MAX_COMPOSITION_COMPONENTS
)
3070 composition_count
= -1;
3071 found
|= CATEGORY_MASK_ISO
;
3073 else if (c
>= '0' && c
<= '4')
3075 /* ESC <Fp> for start/end composition. */
3076 composition_count
= 0;
3081 /* Invalid escape sequence. Just ignore it. */
3085 /* We found a valid designation sequence for CHARSET. */
3086 rejected
|= CATEGORY_MASK_ISO_8BIT
;
3087 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
3089 found
|= CATEGORY_MASK_ISO_7
;
3091 rejected
|= CATEGORY_MASK_ISO_7
;
3092 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
3094 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
3096 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
3097 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
3099 found
|= CATEGORY_MASK_ISO_7_ELSE
;
3101 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
3102 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
3104 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3106 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
3111 /* Locking shift out/in. */
3112 if (inhibit_iso_escape_detection
)
3114 single_shifting
= 0;
3115 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
3119 /* Control sequence introducer. */
3120 single_shifting
= 0;
3121 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3122 found
|= CATEGORY_MASK_ISO_8_ELSE
;
3123 goto check_extra_latin
;
3128 if (inhibit_iso_escape_detection
)
3130 single_shifting
= 0;
3131 rejected
|= CATEGORY_MASK_ISO_7BIT
;
3132 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3133 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3134 found
|= CATEGORY_MASK_ISO_8_1
, single_shifting
= 1;
3135 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
3136 & CODING_ISO_FLAG_SINGLE_SHIFT
)
3137 found
|= CATEGORY_MASK_ISO_8_2
, single_shifting
= 1;
3138 if (single_shifting
)
3140 goto check_extra_latin
;
3147 if (composition_count
>= 0)
3148 composition_count
++;
3149 single_shifting
= 0;
3154 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
3155 found
|= CATEGORY_MASK_ISO_8_1
;
3156 /* Check the length of succeeding codes of the range
3157 0xA0..0FF. If the byte length is even, we include
3158 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3159 only when we are not single shifting. */
3160 if (! single_shifting
3161 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
3164 while (src
< src_end
)
3176 if (i
& 1 && src
< src_end
)
3178 rejected
|= CATEGORY_MASK_ISO_8_2
;
3179 if (composition_count
>= 0)
3180 composition_count
+= i
;
3184 found
|= CATEGORY_MASK_ISO_8_2
;
3185 if (composition_count
>= 0)
3186 composition_count
+= i
/ 2;
3192 single_shifting
= 0;
3193 if (! VECTORP (Vlatin_extra_code_table
)
3194 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3196 rejected
= CATEGORY_MASK_ISO
;
3199 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
3200 & CODING_ISO_FLAG_LATIN_EXTRA
)
3201 found
|= CATEGORY_MASK_ISO_8_1
;
3203 rejected
|= CATEGORY_MASK_ISO_8_1
;
3204 rejected
|= CATEGORY_MASK_ISO_8_2
;
3207 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
3211 detect_info
->rejected
|= rejected
;
3212 detect_info
->found
|= (found
& ~rejected
);
3217 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3218 escape sequence should be kept. */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3223 if (final < '0' || final >= 128 \
3224 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3225 || !SAFE_CHARSET_P (coding, id)) \
3227 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3231 prev = CODING_ISO_DESIGNATION (coding, reg); \
3232 if (id == charset_jisx0201_roman) \
3234 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3235 id = charset_ascii; \
3237 else if (id == charset_jisx0208_1978) \
3239 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3240 id = charset_jisx0208; \
3242 CODING_ISO_DESIGNATION (coding, reg) = id; \
3243 /* If there was an invalid designation to REG previously, and this \
3244 designation is ASCII to REG, we should keep this designation \
3246 if (prev == -2 && id == charset_ascii) \
3251 /* Handle these composition sequence (ALT: alternate char):
3253 (1) relative composition: ESC 0 CHAR ... ESC 1
3254 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3258 When the start sequence (ESC 0/2/3/4) is found, this annotation
3261 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3263 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264 produced until the end sequence (ESC 1) is found:
3267 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3271 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272 annotation header is updated as below:
3274 (1) LENGTH: unchanged, NCHARS: number of CHARs
3275 (2) LENGTH: unchanged, NCHARS: number of CHARs
3276 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3277 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3279 If an error is found while composing, the annotation header is
3282 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3284 and the sequence [ -2 DECODED-RULE ] is changed to the original
3285 byte sequence as below:
3286 o the original byte sequence is B: [ B -1 ]
3287 o the original byte sequence is B1 B2: [ B1 B2 ]
3288 and the sequence [ -1 -1 ] is changed to the original byte
3293 /* Decode a composition rule C1 and maybe one more byte from the
3294 source, and set RULE to the encoded composition rule, NBYTES to the
3295 length of the composition rule. If the rule is invalid, set RULE
3296 to some negative value. */
3298 #define DECODE_COMPOSITION_RULE(rule, nbytes) \
3303 if (rule < 81) /* old format (before ver.21) */ \
3305 int gref = (rule) / 9; \
3306 int nref = (rule) % 9; \
3307 if (gref == 4) gref = 10; \
3308 if (nref == 4) nref = 10; \
3309 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3312 else /* new format (after ver.21) */ \
3316 ONE_MORE_BYTE (c); \
3317 rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
3319 rule += 0x100; /* to destinguish it from the old format */ \
3324 #define ENCODE_COMPOSITION_RULE(rule) \
3326 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3328 if (rule < 0x100) /* old format */ \
3330 if (gref == 10) gref = 4; \
3331 if (nref == 10) nref = 4; \
3332 charbuf[idx] = 32 + gref * 9 + nref; \
3333 charbuf[idx + 1] = -1; \
3336 else /* new format */ \
3338 charbuf[idx] = 32 + 81 + gref; \
3339 charbuf[idx + 1] = 32 + nref; \
3344 /* Finish the current composition as invalid. */
3346 static int finish_composition (int *, struct composition_status
*);
3349 finish_composition (int *charbuf
, struct composition_status
*cmp_status
)
3351 int idx
= - cmp_status
->length
;
3354 /* Recover the original ESC sequence */
3355 charbuf
[idx
++] = ISO_CODE_ESC
;
3356 charbuf
[idx
++] = (cmp_status
->method
== COMPOSITION_RELATIVE
? '0'
3357 : cmp_status
->method
== COMPOSITION_WITH_RULE
? '2'
3358 : cmp_status
->method
== COMPOSITION_WITH_ALTCHARS
? '3'
3359 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3361 charbuf
[idx
++] = -2;
3363 charbuf
[idx
++] = -1;
3364 new_chars
= cmp_status
->nchars
;
3365 if (cmp_status
->method
>= COMPOSITION_WITH_RULE
)
3366 for (; idx
< 0; idx
++)
3368 int elt
= charbuf
[idx
];
3372 ENCODE_COMPOSITION_RULE (charbuf
[idx
+ 1]);
3377 charbuf
[idx
++] = ISO_CODE_ESC
;
3382 cmp_status
->state
= COMPOSING_NO
;
3386 /* If characters are under composition, finish the composition. */
3387 #define MAYBE_FINISH_COMPOSITION() \
3389 if (cmp_status->state != COMPOSING_NO) \
3390 char_offset += finish_composition (charbuf, cmp_status); \
3393 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3395 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3396 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3397 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3398 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3400 Produce this annotation sequence now:
3402 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3405 #define DECODE_COMPOSITION_START(c1) \
3408 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3409 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3410 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3411 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3415 cmp_status->state = COMPOSING_CHAR; \
3416 cmp_status->length += 2; \
3420 MAYBE_FINISH_COMPOSITION (); \
3421 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3422 : c1 == '2' ? COMPOSITION_WITH_RULE \
3423 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3424 : COMPOSITION_WITH_RULE_ALTCHARS); \
3426 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3427 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3428 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3429 cmp_status->nchars = cmp_status->ncomps = 0; \
3430 coding->annotated = 1; \
3435 /* Handle composition end sequence ESC 1. */
3437 #define DECODE_COMPOSITION_END() \
3439 if (cmp_status->nchars == 0 \
3440 || ((cmp_status->state == COMPOSING_CHAR) \
3441 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3443 MAYBE_FINISH_COMPOSITION (); \
3444 goto invalid_code; \
3446 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3447 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3448 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3449 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3450 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3451 char_offset += cmp_status->nchars; \
3452 cmp_status->state = COMPOSING_NO; \
3455 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3457 #define STORE_COMPOSITION_RULE(rule) \
3460 *charbuf++ = rule; \
3461 cmp_status->length += 2; \
3462 cmp_status->state--; \
3465 /* Store a composed char or a component char C in charbuf, and update
3468 #define STORE_COMPOSITION_CHAR(c) \
3471 cmp_status->length++; \
3472 if (cmp_status->state == COMPOSING_CHAR) \
3473 cmp_status->nchars++; \
3475 cmp_status->ncomps++; \
3476 if (cmp_status->method == COMPOSITION_WITH_RULE \
3477 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3478 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3479 cmp_status->state++; \
3483 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3486 decode_coding_iso_2022 (struct coding_system
*coding
)
3488 const unsigned char *src
= coding
->source
+ coding
->consumed
;
3489 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3490 const unsigned char *src_base
;
3491 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
3492 /* We may produce two annotations (charset and composition) in one
3493 loop and one more charset annotation at the end. */
3495 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 3);
3496 int consumed_chars
= 0, consumed_chars_base
;
3497 int multibytep
= coding
->src_multibyte
;
3498 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3499 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3500 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3501 int charset_id_2
, charset_id_3
;
3502 struct charset
*charset
;
3504 struct composition_status
*cmp_status
= CODING_ISO_CMP_STATUS (coding
);
3505 Lisp_Object attrs
, charset_list
;
3506 int char_offset
= coding
->produced_char
;
3507 int last_offset
= char_offset
;
3508 int last_id
= charset_ascii
;
3510 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
3511 int byte_after_cr
= -1;
3514 CODING_GET_INFO (coding
, attrs
, charset_list
);
3515 setup_iso_safe_charsets (attrs
);
3516 /* Charset list may have been changed. */
3517 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3518 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
3520 if (cmp_status
->state
!= COMPOSING_NO
)
3522 for (i
= 0; i
< cmp_status
->length
; i
++)
3523 *charbuf
++ = cmp_status
->carryover
[i
];
3524 coding
->annotated
= 1;
3532 consumed_chars_base
= consumed_chars
;
3534 if (charbuf
>= charbuf_end
)
3536 if (byte_after_cr
>= 0)
3541 if (byte_after_cr
>= 0)
3542 c1
= byte_after_cr
, byte_after_cr
= -1;
3548 if (CODING_ISO_EXTSEGMENT_LEN (coding
) > 0)
3550 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3552 CODING_ISO_EXTSEGMENT_LEN (coding
)--;
3556 if (CODING_ISO_EMBEDDED_UTF_8 (coding
))
3558 if (c1
== ISO_CODE_ESC
)
3560 if (src
+ 1 >= src_end
)
3561 goto no_more_source
;
3562 *charbuf
++ = ISO_CODE_ESC
;
3564 if (src
[0] == '%' && src
[1] == '@')
3567 consumed_chars
+= 2;
3569 /* We are sure charbuf can contain two more chars. */
3572 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
3577 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3583 if ((cmp_status
->state
== COMPOSING_RULE
3584 || cmp_status
->state
== COMPOSING_COMPONENT_RULE
)
3585 && c1
!= ISO_CODE_ESC
)
3589 DECODE_COMPOSITION_RULE (rule
, nbytes
);
3592 STORE_COMPOSITION_RULE (rule
);
3596 /* We produce at most one character. */
3597 switch (iso_code_class
[c1
])
3599 case ISO_0x20_or_0x7F
:
3600 if (charset_id_0
< 0
3601 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
3602 /* This is SPACE or DEL. */
3603 charset
= CHARSET_FROM_ID (charset_ascii
);
3605 charset
= CHARSET_FROM_ID (charset_id_0
);
3608 case ISO_graphic_plane_0
:
3609 if (charset_id_0
< 0)
3610 charset
= CHARSET_FROM_ID (charset_ascii
);
3612 charset
= CHARSET_FROM_ID (charset_id_0
);
3615 case ISO_0xA0_or_0xFF
:
3616 if (charset_id_1
< 0
3617 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
3618 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
3620 /* This is a graphic character, we fall down ... */
3622 case ISO_graphic_plane_1
:
3623 if (charset_id_1
< 0)
3625 charset
= CHARSET_FROM_ID (charset_id_1
);
3629 if (eol_crlf
&& c1
== '\r')
3630 ONE_MORE_BYTE (byte_after_cr
);
3631 MAYBE_FINISH_COMPOSITION ();
3632 charset
= CHARSET_FROM_ID (charset_ascii
);
3639 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3640 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
3642 CODING_ISO_INVOCATION (coding
, 0) = 1;
3643 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3647 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
3649 CODING_ISO_INVOCATION (coding
, 0) = 0;
3650 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3653 case ISO_single_shift_2_7
:
3654 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
))
3656 case ISO_single_shift_2
:
3657 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3659 /* SS2 is handled as an escape sequence of ESC 'N' */
3661 goto label_escape_sequence
;
3663 case ISO_single_shift_3
:
3664 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
3666 /* SS2 is handled as an escape sequence of ESC 'O' */
3668 goto label_escape_sequence
;
3670 case ISO_control_sequence_introducer
:
3671 /* CSI is handled as an escape sequence of ESC '[' ... */
3673 goto label_escape_sequence
;
3677 label_escape_sequence
:
3678 /* Escape sequences handled here are invocation,
3679 designation, direction specification, and character
3680 composition specification. */
3683 case '&': /* revision of following character set */
3685 if (!(c1
>= '@' && c1
<= '~'))
3688 if (c1
!= ISO_CODE_ESC
)
3691 goto label_escape_sequence
;
3693 case '$': /* designation of 2-byte character set */
3694 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3700 if (c1
>= '@' && c1
<= 'B')
3701 { /* designation of JISX0208.1978, GB2312.1980,
3703 reg
= 0, chars96
= 0;
3705 else if (c1
>= 0x28 && c1
<= 0x2B)
3706 { /* designation of DIMENSION2_CHARS94 character set */
3707 reg
= c1
- 0x28, chars96
= 0;
3710 else if (c1
>= 0x2C && c1
<= 0x2F)
3711 { /* designation of DIMENSION2_CHARS96 character set */
3712 reg
= c1
- 0x2C, chars96
= 1;
3717 DECODE_DESIGNATION (reg
, 2, chars96
, c1
);
3718 /* We must update these variables now. */
3720 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3722 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3728 case 'n': /* invocation of locking-shift-2 */
3729 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3730 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3732 CODING_ISO_INVOCATION (coding
, 0) = 2;
3733 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3736 case 'o': /* invocation of locking-shift-3 */
3737 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
3738 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3740 CODING_ISO_INVOCATION (coding
, 0) = 3;
3741 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3744 case 'N': /* invocation of single-shift-2 */
3745 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3746 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3748 charset_id_2
= CODING_ISO_DESIGNATION (coding
, 2);
3749 if (charset_id_2
< 0)
3750 charset
= CHARSET_FROM_ID (charset_ascii
);
3752 charset
= CHARSET_FROM_ID (charset_id_2
);
3754 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3758 case 'O': /* invocation of single-shift-3 */
3759 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3760 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3762 charset_id_3
= CODING_ISO_DESIGNATION (coding
, 3);
3763 if (charset_id_3
< 0)
3764 charset
= CHARSET_FROM_ID (charset_ascii
);
3766 charset
= CHARSET_FROM_ID (charset_id_3
);
3768 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3772 case '0': case '2': case '3': case '4': /* start composition */
3773 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3775 if (last_id
!= charset_ascii
)
3777 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3778 last_id
= charset_ascii
;
3779 last_offset
= char_offset
;
3781 DECODE_COMPOSITION_START (c1
);
3784 case '1': /* end composition */
3785 if (cmp_status
->state
== COMPOSING_NO
)
3787 DECODE_COMPOSITION_END ();
3790 case '[': /* specification of direction */
3791 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
3793 /* For the moment, nested direction is not supported.
3794 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3795 left-to-right, and nonzero means right-to-left. */
3799 case ']': /* end of the current direction */
3800 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3802 case '0': /* end of the current direction */
3803 case '1': /* start of left-to-right direction */
3806 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3811 case '2': /* start of right-to-left direction */
3814 coding
->mode
|= CODING_MODE_DIRECTION
;
3828 /* CTEXT extended segment:
3829 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3830 We keep these bytes as is for the moment.
3831 They may be decoded by post-read-conversion. */
3835 ONE_MORE_BYTE (dim
);
3836 if (dim
< '0' || dim
> '4')
3844 size
= ((M
- 128) * 128) + (L
- 128);
3845 if (charbuf
+ 6 > charbuf_end
)
3847 *charbuf
++ = ISO_CODE_ESC
;
3851 *charbuf
++ = BYTE8_TO_CHAR (M
);
3852 *charbuf
++ = BYTE8_TO_CHAR (L
);
3853 CODING_ISO_EXTSEGMENT_LEN (coding
) = size
;
3857 /* XFree86 extension for embedding UTF-8 in CTEXT:
3858 ESC % G --UTF-8-BYTES-- ESC % @
3859 We keep these bytes as is for the moment.
3860 They may be decoded by post-read-conversion. */
3861 if (charbuf
+ 3 > charbuf_end
)
3863 *charbuf
++ = ISO_CODE_ESC
;
3866 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 1;
3874 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3879 if (c1
>= 0x28 && c1
<= 0x2B)
3880 { /* designation of DIMENSION1_CHARS94 character set */
3881 reg
= c1
- 0x28, chars96
= 0;
3884 else if (c1
>= 0x2C && c1
<= 0x2F)
3885 { /* designation of DIMENSION1_CHARS96 character set */
3886 reg
= c1
- 0x2C, chars96
= 1;
3891 DECODE_DESIGNATION (reg
, 1, chars96
, c1
);
3892 /* We must update these variables now. */
3894 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3896 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3904 if (cmp_status
->state
== COMPOSING_NO
3905 && charset
->id
!= charset_ascii
3906 && last_id
!= charset
->id
)
3908 if (last_id
!= charset_ascii
)
3909 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3910 last_id
= charset
->id
;
3911 last_offset
= char_offset
;
3914 /* Now we know CHARSET and 1st position code C1 of a character.
3915 Produce a decoded character while getting 2nd and 3rd
3916 position codes C2, C3 if necessary. */
3917 if (CHARSET_DIMENSION (charset
) > 1)
3920 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0)
3921 || ((c1
& 0x80) != (c2
& 0x80)))
3922 /* C2 is not in a valid range. */
3924 if (CHARSET_DIMENSION (charset
) == 2)
3925 c1
= (c1
<< 8) | c2
;
3929 if (c3
< 0x20 || (c3
>= 0x80 && c3
< 0xA0)
3930 || ((c1
& 0x80) != (c3
& 0x80)))
3931 /* C3 is not in a valid range. */
3933 c1
= (c1
<< 16) | (c2
<< 8) | c2
;
3937 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3940 MAYBE_FINISH_COMPOSITION ();
3941 for (; src_base
< src
; src_base
++, char_offset
++)
3943 if (ASCII_BYTE_P (*src_base
))
3944 *charbuf
++ = *src_base
;
3946 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3949 else if (cmp_status
->state
== COMPOSING_NO
)
3954 else if ((cmp_status
->state
== COMPOSING_CHAR
3955 ? cmp_status
->nchars
3956 : cmp_status
->ncomps
)
3957 >= MAX_COMPOSITION_COMPONENTS
)
3959 /* Too long composition. */
3960 MAYBE_FINISH_COMPOSITION ();
3965 STORE_COMPOSITION_CHAR (c
);
3969 MAYBE_FINISH_COMPOSITION ();
3971 consumed_chars
= consumed_chars_base
;
3973 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3983 if (cmp_status
->state
!= COMPOSING_NO
)
3985 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
3986 MAYBE_FINISH_COMPOSITION ();
3989 charbuf
-= cmp_status
->length
;
3990 for (i
= 0; i
< cmp_status
->length
; i
++)
3991 cmp_status
->carryover
[i
] = charbuf
[i
];
3994 else if (last_id
!= charset_ascii
)
3995 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
3996 coding
->consumed_char
+= consumed_chars_base
;
3997 coding
->consumed
= src_base
- coding
->source
;
3998 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4002 /* ISO2022 encoding stuff. */
4005 It is not enough to say just "ISO2022" on encoding, we have to
4006 specify more details. In Emacs, each coding system of ISO2022
4007 variant has the following specifications:
4008 1. Initial designation to G0 thru G3.
4009 2. Allows short-form designation?
4010 3. ASCII should be designated to G0 before control characters?
4011 4. ASCII should be designated to G0 at end of line?
4012 5. 7-bit environment or 8-bit environment?
4013 6. Use locking-shift?
4014 7. Use Single-shift?
4015 And the following two are only for Japanese:
4016 8. Use ASCII in place of JIS0201-1976-Roman?
4017 9. Use JISX0208-1983 in place of JISX0208-1978?
4018 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4019 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4023 /* Produce codes (escape sequence) for designating CHARSET to graphic
4024 register REG at DST, and increment DST. If <final-char> of CHARSET is
4025 '@', 'A', or 'B' and the coding system CODING allows, produce
4026 designation sequence of short-form. */
4028 #define ENCODE_DESIGNATION(charset, reg, coding) \
4030 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4031 const char *intermediate_char_94 = "()*+"; \
4032 const char *intermediate_char_96 = ",-./"; \
4033 int revision = -1; \
4036 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4037 revision = CHARSET_ISO_REVISION (charset); \
4039 if (revision >= 0) \
4041 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4042 EMIT_ONE_BYTE ('@' + revision); \
4044 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4045 if (CHARSET_DIMENSION (charset) == 1) \
4047 if (! CHARSET_ISO_CHARS_96 (charset)) \
4048 c = intermediate_char_94[reg]; \
4050 c = intermediate_char_96[reg]; \
4051 EMIT_ONE_ASCII_BYTE (c); \
4055 EMIT_ONE_ASCII_BYTE ('$'); \
4056 if (! CHARSET_ISO_CHARS_96 (charset)) \
4058 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4060 || final_char < '@' || final_char > 'B') \
4061 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4064 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4066 EMIT_ONE_ASCII_BYTE (final_char); \
4068 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4072 /* The following two macros produce codes (control character or escape
4073 sequence) for ISO2022 single-shift functions (single-shift-2 and
4076 #define ENCODE_SINGLE_SHIFT_2 \
4078 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4079 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4081 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4082 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4086 #define ENCODE_SINGLE_SHIFT_3 \
4088 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4089 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4091 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4092 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4096 /* The following four macros produce codes (control character or
4097 escape sequence) for ISO2022 locking-shift functions (shift-in,
4098 shift-out, locking-shift-2, and locking-shift-3). */
4100 #define ENCODE_SHIFT_IN \
4102 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4103 CODING_ISO_INVOCATION (coding, 0) = 0; \
4107 #define ENCODE_SHIFT_OUT \
4109 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4110 CODING_ISO_INVOCATION (coding, 0) = 1; \
4114 #define ENCODE_LOCKING_SHIFT_2 \
4116 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4117 CODING_ISO_INVOCATION (coding, 0) = 2; \
4121 #define ENCODE_LOCKING_SHIFT_3 \
4123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4124 CODING_ISO_INVOCATION (coding, 0) = 3; \
4128 /* Produce codes for a DIMENSION1 character whose character set is
4129 CHARSET and whose position-code is C1. Designation and invocation
4130 sequences are also produced in advance if necessary. */
4132 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4134 int id = CHARSET_ID (charset); \
4136 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4137 && id == charset_ascii) \
4139 id = charset_jisx0201_roman; \
4140 charset = CHARSET_FROM_ID (id); \
4143 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4145 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4146 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4148 EMIT_ONE_BYTE (c1 | 0x80); \
4149 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4152 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4154 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4157 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4159 EMIT_ONE_BYTE (c1 | 0x80); \
4163 /* Since CHARSET is not yet invoked to any graphic planes, we \
4164 must invoke it, or, at first, designate it to some graphic \
4165 register. Then repeat the loop to actually produce the \
4167 dst = encode_invocation_designation (charset, coding, dst, \
4172 /* Produce codes for a DIMENSION2 character whose character set is
4173 CHARSET and whose position-codes are C1 and C2. Designation and
4174 invocation codes are also produced in advance if necessary. */
4176 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4178 int id = CHARSET_ID (charset); \
4180 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4181 && id == charset_jisx0208) \
4183 id = charset_jisx0208_1978; \
4184 charset = CHARSET_FROM_ID (id); \
4187 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4189 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4190 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4192 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4193 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4196 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4198 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4201 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4203 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4207 /* Since CHARSET is not yet invoked to any graphic planes, we \
4208 must invoke it, or, at first, designate it to some graphic \
4209 register. Then repeat the loop to actually produce the \
4211 dst = encode_invocation_designation (charset, coding, dst, \
4216 #define ENCODE_ISO_CHARACTER(charset, c) \
4218 int code = ENCODE_CHAR ((charset), (c)); \
4220 if (CHARSET_DIMENSION (charset) == 1) \
4221 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4223 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4227 /* Produce designation and invocation codes at a place pointed by DST
4228 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4232 encode_invocation_designation (struct charset
*charset
,
4233 struct coding_system
*coding
,
4234 unsigned char *dst
, int *p_nchars
)
4236 int multibytep
= coding
->dst_multibyte
;
4237 int produced_chars
= *p_nchars
;
4238 int reg
; /* graphic register number */
4239 int id
= CHARSET_ID (charset
);
4241 /* At first, check designations. */
4242 for (reg
= 0; reg
< 4; reg
++)
4243 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
4248 /* CHARSET is not yet designated to any graphic registers. */
4249 /* At first check the requested designation. */
4250 reg
= CODING_ISO_REQUEST (coding
, id
);
4252 /* Since CHARSET requests no special designation, designate it
4253 to graphic register 0. */
4256 ENCODE_DESIGNATION (charset
, reg
, coding
);
4259 if (CODING_ISO_INVOCATION (coding
, 0) != reg
4260 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
4262 /* Since the graphic register REG is not invoked to any graphic
4263 planes, invoke it to graphic plane 0. */
4266 case 0: /* graphic register 0 */
4270 case 1: /* graphic register 1 */
4274 case 2: /* graphic register 2 */
4275 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4276 ENCODE_SINGLE_SHIFT_2
;
4278 ENCODE_LOCKING_SHIFT_2
;
4281 case 3: /* graphic register 3 */
4282 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
4283 ENCODE_SINGLE_SHIFT_3
;
4285 ENCODE_LOCKING_SHIFT_3
;
4290 *p_nchars
= produced_chars
;
4294 /* The following three macros produce codes for indicating direction
4296 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
4298 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
4299 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
4301 EMIT_ONE_BYTE (ISO_CODE_CSI); \
4305 #define ENCODE_DIRECTION_R2L() \
4307 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4308 EMIT_TWO_ASCII_BYTES ('2', ']'); \
4312 #define ENCODE_DIRECTION_L2R() \
4314 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4315 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4319 /* Produce codes for designation and invocation to reset the graphic
4320 planes and registers to initial state. */
4321 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4324 struct charset *charset; \
4326 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4328 for (reg = 0; reg < 4; reg++) \
4329 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4330 && (CODING_ISO_DESIGNATION (coding, reg) \
4331 != CODING_ISO_INITIAL (coding, reg))) \
4333 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4334 ENCODE_DESIGNATION (charset, reg, coding); \
4339 /* Produce designation sequences of charsets in the line started from
4340 SRC to a place pointed by DST, and return updated DST.
4342 If the current block ends before any end-of-line, we may fail to
4343 find all the necessary designations. */
4345 static unsigned char *
4346 encode_designation_at_bol (struct coding_system
*coding
, int *charbuf
,
4347 int *charbuf_end
, unsigned char *dst
)
4349 struct charset
*charset
;
4350 /* Table of charsets to be designated to each graphic register. */
4352 int c
, found
= 0, reg
;
4353 int produced_chars
= 0;
4354 int multibytep
= coding
->dst_multibyte
;
4356 Lisp_Object charset_list
;
4358 attrs
= CODING_ID_ATTRS (coding
->id
);
4359 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4360 if (EQ (charset_list
, Qiso_2022
))
4361 charset_list
= Viso_2022_charset_list
;
4363 for (reg
= 0; reg
< 4; reg
++)
4373 charset
= char_charset (c
, charset_list
, NULL
);
4374 id
= CHARSET_ID (charset
);
4375 reg
= CODING_ISO_REQUEST (coding
, id
);
4376 if (reg
>= 0 && r
[reg
] < 0)
4385 for (reg
= 0; reg
< 4; reg
++)
4387 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
4388 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
4394 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4397 encode_coding_iso_2022 (struct coding_system
*coding
)
4399 int multibytep
= coding
->dst_multibyte
;
4400 int *charbuf
= coding
->charbuf
;
4401 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4402 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4403 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4406 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4407 && CODING_ISO_BOL (coding
));
4408 int produced_chars
= 0;
4409 Lisp_Object attrs
, eol_type
, charset_list
;
4410 int ascii_compatible
;
4412 int preferred_charset_id
= -1;
4414 CODING_GET_INFO (coding
, attrs
, charset_list
);
4415 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
4416 if (VECTORP (eol_type
))
4419 setup_iso_safe_charsets (attrs
);
4420 /* Charset list may have been changed. */
4421 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
4422 coding
->safe_charsets
= SDATA (CODING_ATTR_SAFE_CHARSETS (attrs
));
4425 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
))
4426 && ! (CODING_ISO_FLAGS (coding
) & (CODING_ISO_FLAG_DESIGNATION
4427 | CODING_ISO_FLAG_LOCKING_SHIFT
)));
4429 while (charbuf
< charbuf_end
)
4431 ASSURE_DESTINATION (safe_room
);
4433 if (bol_designation
)
4435 unsigned char *dst_prev
= dst
;
4437 /* We have to produce designation sequences if any now. */
4438 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
4439 bol_designation
= 0;
4440 /* We are sure that designation sequences are all ASCII bytes. */
4441 produced_chars
+= dst
- dst_prev
;
4448 /* Handle an annotation. */
4451 case CODING_ANNOTATE_COMPOSITION_MASK
:
4452 /* Not yet implemented. */
4454 case CODING_ANNOTATE_CHARSET_MASK
:
4455 preferred_charset_id
= charbuf
[2];
4456 if (preferred_charset_id
>= 0
4457 && NILP (Fmemq (make_number (preferred_charset_id
),
4459 preferred_charset_id
= -1;
4468 /* Now encode the character C. */
4469 if (c
< 0x20 || c
== 0x7F)
4472 || (c
== '\r' && EQ (eol_type
, Qmac
)))
4474 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4475 ENCODE_RESET_PLANE_AND_REGISTER ();
4476 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
4480 for (i
= 0; i
< 4; i
++)
4481 CODING_ISO_DESIGNATION (coding
, i
)
4482 = CODING_ISO_INITIAL (coding
, i
);
4485 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
4487 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
4488 ENCODE_RESET_PLANE_AND_REGISTER ();
4489 EMIT_ONE_ASCII_BYTE (c
);
4491 else if (ASCII_CHAR_P (c
))
4493 if (ascii_compatible
)
4494 EMIT_ONE_ASCII_BYTE (c
);
4497 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
4498 ENCODE_ISO_CHARACTER (charset
, c
);
4501 else if (CHAR_BYTE8_P (c
))
4503 c
= CHAR_TO_BYTE8 (c
);
4508 struct charset
*charset
;
4510 if (preferred_charset_id
>= 0)
4512 charset
= CHARSET_FROM_ID (preferred_charset_id
);
4513 if (! CHAR_CHARSET_P (c
, charset
))
4514 charset
= char_charset (c
, charset_list
, NULL
);
4517 charset
= char_charset (c
, charset_list
, NULL
);
4520 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4522 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4523 charset
= CHARSET_FROM_ID (charset_ascii
);
4527 c
= coding
->default_char
;
4528 charset
= char_charset (c
, charset_list
, NULL
);
4531 ENCODE_ISO_CHARACTER (charset
, c
);
4535 if (coding
->mode
& CODING_MODE_LAST_BLOCK
4536 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
4538 ASSURE_DESTINATION (safe_room
);
4539 ENCODE_RESET_PLANE_AND_REGISTER ();
4541 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
4542 CODING_ISO_BOL (coding
) = bol_designation
;
4543 coding
->produced_char
+= produced_chars
;
4544 coding
->produced
= dst
- coding
->destination
;
4549 /*** 8,9. SJIS and BIG5 handlers ***/
4551 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4552 quite widely. So, for the moment, Emacs supports them in the bare
4553 C code. But, in the future, they may be supported only by CCL. */
4555 /* SJIS is a coding system encoding three character sets: ASCII, right
4556 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4557 as is. A character of charset katakana-jisx0201 is encoded by
4558 "position-code + 0x80". A character of charset japanese-jisx0208
4559 is encoded in 2-byte but two position-codes are divided and shifted
4560 so that it fit in the range below.
4562 --- CODE RANGE of SJIS ---
4563 (character set) (range)
4565 KATAKANA-JISX0201 0xA0 .. 0xDF
4566 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4567 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4568 -------------------------------
4572 /* BIG5 is a coding system encoding two character sets: ASCII and
4573 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4574 character set and is encoded in two-byte.
4576 --- CODE RANGE of BIG5 ---
4577 (character set) (range)
4579 Big5 (1st byte) 0xA1 .. 0xFE
4580 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4581 --------------------------
4585 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4586 Check if a text is encoded in SJIS. If it is, return
4587 CATEGORY_MASK_SJIS, else return 0. */
4590 detect_coding_sjis (struct coding_system
*coding
,
4591 struct coding_detection_info
*detect_info
)
4593 const unsigned char *src
= coding
->source
, *src_base
;
4594 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4595 int multibytep
= coding
->src_multibyte
;
4596 int consumed_chars
= 0;
4599 Lisp_Object attrs
, charset_list
;
4600 int max_first_byte_of_2_byte_code
;
4602 CODING_GET_INFO (coding
, attrs
, charset_list
);
4603 max_first_byte_of_2_byte_code
4604 = (XINT (Flength (charset_list
)) > 3 ? 0xFC : 0xEF);
4606 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
4607 /* A coding system of this category is always ASCII compatible. */
4608 src
+= coding
->head_ascii
;
4616 if ((c
>= 0x81 && c
<= 0x9F)
4617 || (c
>= 0xE0 && c
<= max_first_byte_of_2_byte_code
))
4620 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
4622 found
= CATEGORY_MASK_SJIS
;
4624 else if (c
>= 0xA0 && c
< 0xE0)
4625 found
= CATEGORY_MASK_SJIS
;
4629 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4633 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4635 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
4638 detect_info
->found
|= found
;
4642 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4643 Check if a text is encoded in BIG5. If it is, return
4644 CATEGORY_MASK_BIG5, else return 0. */
4647 detect_coding_big5 (struct coding_system
*coding
,
4648 struct coding_detection_info
*detect_info
)
4650 const unsigned char *src
= coding
->source
, *src_base
;
4651 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4652 int multibytep
= coding
->src_multibyte
;
4653 int consumed_chars
= 0;
4657 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
4658 /* A coding system of this category is always ASCII compatible. */
4659 src
+= coding
->head_ascii
;
4670 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
4672 found
= CATEGORY_MASK_BIG5
;
4677 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4681 if (src_base
< src
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4683 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
4686 detect_info
->found
|= found
;
4690 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4691 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4694 decode_coding_sjis (struct coding_system
*coding
)
4696 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4697 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4698 const unsigned char *src_base
;
4699 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4700 /* We may produce one charset annotation in one loop and one more at
4703 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4704 int consumed_chars
= 0, consumed_chars_base
;
4705 int multibytep
= coding
->src_multibyte
;
4706 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4707 struct charset
*charset_kanji2
;
4708 Lisp_Object attrs
, charset_list
, val
;
4709 int char_offset
= coding
->produced_char
;
4710 int last_offset
= char_offset
;
4711 int last_id
= charset_ascii
;
4713 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4714 int byte_after_cr
= -1;
4716 CODING_GET_INFO (coding
, attrs
, charset_list
);
4719 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4720 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4721 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4722 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4727 struct charset
*charset
;
4730 consumed_chars_base
= consumed_chars
;
4732 if (charbuf
>= charbuf_end
)
4734 if (byte_after_cr
>= 0)
4739 if (byte_after_cr
>= 0)
4740 c
= byte_after_cr
, byte_after_cr
= -1;
4747 if (eol_crlf
&& c
== '\r')
4748 ONE_MORE_BYTE (byte_after_cr
);
4749 charset
= charset_roman
;
4751 else if (c
== 0x80 || c
== 0xA0)
4753 else if (c
>= 0xA1 && c
<= 0xDF)
4755 /* SJIS -> JISX0201-Kana */
4757 charset
= charset_kana
;
4761 /* SJIS -> JISX0208 */
4763 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4767 charset
= charset_kanji
;
4769 else if (c
<= 0xFC && charset_kanji2
)
4771 /* SJIS -> JISX0213-2 */
4773 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
4777 charset
= charset_kanji2
;
4781 if (charset
->id
!= charset_ascii
4782 && last_id
!= charset
->id
)
4784 if (last_id
!= charset_ascii
)
4785 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4786 last_id
= charset
->id
;
4787 last_offset
= char_offset
;
4789 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4796 consumed_chars
= consumed_chars_base
;
4798 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4804 if (last_id
!= charset_ascii
)
4805 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4806 coding
->consumed_char
+= consumed_chars_base
;
4807 coding
->consumed
= src_base
- coding
->source
;
4808 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4812 decode_coding_big5 (struct coding_system
*coding
)
4814 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4815 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4816 const unsigned char *src_base
;
4817 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
4818 /* We may produce one charset annotation in one loop and one more at
4821 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
4822 int consumed_chars
= 0, consumed_chars_base
;
4823 int multibytep
= coding
->src_multibyte
;
4824 struct charset
*charset_roman
, *charset_big5
;
4825 Lisp_Object attrs
, charset_list
, val
;
4826 int char_offset
= coding
->produced_char
;
4827 int last_offset
= char_offset
;
4828 int last_id
= charset_ascii
;
4830 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
4831 int byte_after_cr
= -1;
4833 CODING_GET_INFO (coding
, attrs
, charset_list
);
4835 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4836 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4841 struct charset
*charset
;
4844 consumed_chars_base
= consumed_chars
;
4846 if (charbuf
>= charbuf_end
)
4848 if (byte_after_cr
>= 0)
4853 if (byte_after_cr
>= 0)
4854 c
= byte_after_cr
, byte_after_cr
= -1;
4862 if (eol_crlf
&& c
== '\r')
4863 ONE_MORE_BYTE (byte_after_cr
);
4864 charset
= charset_roman
;
4869 if (c
< 0xA1 || c
> 0xFE)
4872 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4875 charset
= charset_big5
;
4877 if (charset
->id
!= charset_ascii
4878 && last_id
!= charset
->id
)
4880 if (last_id
!= charset_ascii
)
4881 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4882 last_id
= charset
->id
;
4883 last_offset
= char_offset
;
4885 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4892 consumed_chars
= consumed_chars_base
;
4894 *charbuf
++ = c
< 0 ? -c
: BYTE8_TO_CHAR (c
);
4900 if (last_id
!= charset_ascii
)
4901 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
4902 coding
->consumed_char
+= consumed_chars_base
;
4903 coding
->consumed
= src_base
- coding
->source
;
4904 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4907 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4908 This function can encode charsets `ascii', `katakana-jisx0201',
4909 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4910 are sure that all these charsets are registered as official charset
4911 (i.e. do not have extended leading-codes). Characters of other
4912 charsets are produced without any encoding. If SJIS_P is 1, encode
4913 SJIS text, else encode BIG5 text. */
4916 encode_coding_sjis (struct coding_system
*coding
)
4918 int multibytep
= coding
->dst_multibyte
;
4919 int *charbuf
= coding
->charbuf
;
4920 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4921 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4922 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4924 int produced_chars
= 0;
4925 Lisp_Object attrs
, charset_list
, val
;
4926 int ascii_compatible
;
4927 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4928 struct charset
*charset_kanji2
;
4931 CODING_GET_INFO (coding
, attrs
, charset_list
);
4933 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4934 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4935 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4936 charset_kanji2
= NILP (val
) ? NULL
: CHARSET_FROM_ID (XINT (XCAR (val
)));
4938 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4940 while (charbuf
< charbuf_end
)
4942 ASSURE_DESTINATION (safe_room
);
4944 /* Now encode the character C. */
4945 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4946 EMIT_ONE_ASCII_BYTE (c
);
4947 else if (CHAR_BYTE8_P (c
))
4949 c
= CHAR_TO_BYTE8 (c
);
4955 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4959 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4961 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4962 charset
= CHARSET_FROM_ID (charset_ascii
);
4966 c
= coding
->default_char
;
4967 charset
= char_charset (c
, charset_list
, &code
);
4970 if (code
== CHARSET_INVALID_CODE (charset
))
4972 if (charset
== charset_kanji
)
4976 c1
= code
>> 8, c2
= code
& 0xFF;
4977 EMIT_TWO_BYTES (c1
, c2
);
4979 else if (charset
== charset_kana
)
4980 EMIT_ONE_BYTE (code
| 0x80);
4981 else if (charset_kanji2
&& charset
== charset_kanji2
)
4986 if (c1
== 0x21 || (c1
>= 0x23 && c1
<= 0x25)
4988 || (c1
>= 0x2C && c1
<= 0x2F) || c1
>= 0x6E)
4990 JIS_TO_SJIS2 (code
);
4991 c1
= code
>> 8, c2
= code
& 0xFF;
4992 EMIT_TWO_BYTES (c1
, c2
);
4995 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4998 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
5001 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5002 coding
->produced_char
+= produced_chars
;
5003 coding
->produced
= dst
- coding
->destination
;
5008 encode_coding_big5 (struct coding_system
*coding
)
5010 int multibytep
= coding
->dst_multibyte
;
5011 int *charbuf
= coding
->charbuf
;
5012 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5013 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5014 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5016 int produced_chars
= 0;
5017 Lisp_Object attrs
, charset_list
, val
;
5018 int ascii_compatible
;
5019 struct charset
*charset_roman
, *charset_big5
;
5022 CODING_GET_INFO (coding
, attrs
, charset_list
);
5024 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
5025 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
5026 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
5028 while (charbuf
< charbuf_end
)
5030 ASSURE_DESTINATION (safe_room
);
5032 /* Now encode the character C. */
5033 if (ASCII_CHAR_P (c
) && ascii_compatible
)
5034 EMIT_ONE_ASCII_BYTE (c
);
5035 else if (CHAR_BYTE8_P (c
))
5037 c
= CHAR_TO_BYTE8 (c
);
5043 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
5047 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5049 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5050 charset
= CHARSET_FROM_ID (charset_ascii
);
5054 c
= coding
->default_char
;
5055 charset
= char_charset (c
, charset_list
, &code
);
5058 if (code
== CHARSET_INVALID_CODE (charset
))
5060 if (charset
== charset_big5
)
5064 c1
= code
>> 8, c2
= code
& 0xFF;
5065 EMIT_TWO_BYTES (c1
, c2
);
5068 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
5071 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5072 coding
->produced_char
+= produced_chars
;
5073 coding
->produced
= dst
- coding
->destination
;
5078 /*** 10. CCL handlers ***/
5080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5081 Check if a text is encoded in a coding system of which
5082 encoder/decoder are written in CCL program. If it is, return
5083 CATEGORY_MASK_CCL, else return 0. */
5086 detect_coding_ccl (struct coding_system
*coding
,
5087 struct coding_detection_info
*detect_info
)
5089 const unsigned char *src
= coding
->source
, *src_base
;
5090 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5091 int multibytep
= coding
->src_multibyte
;
5092 int consumed_chars
= 0;
5094 unsigned char *valids
;
5095 int head_ascii
= coding
->head_ascii
;
5098 detect_info
->checked
|= CATEGORY_MASK_CCL
;
5100 coding
= &coding_categories
[coding_category_ccl
];
5101 valids
= CODING_CCL_VALIDS (coding
);
5102 attrs
= CODING_ID_ATTRS (coding
->id
);
5103 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
5112 if (c
< 0 || ! valids
[c
])
5114 if ((valids
[c
] > 1))
5115 found
= CATEGORY_MASK_CCL
;
5117 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
5121 detect_info
->found
|= found
;
5126 decode_coding_ccl (struct coding_system
*coding
)
5128 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5129 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5130 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5131 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_size
;
5132 int consumed_chars
= 0;
5133 int multibytep
= coding
->src_multibyte
;
5134 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
5135 int source_charbuf
[1024];
5136 int source_byteidx
[1025];
5137 Lisp_Object attrs
, charset_list
;
5139 CODING_GET_INFO (coding
, attrs
, charset_list
);
5143 const unsigned char *p
= src
;
5148 while (i
< 1024 && p
< src_end
)
5150 source_byteidx
[i
] = p
- src
;
5151 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
5153 source_byteidx
[i
] = p
- src
;
5156 while (i
< 1024 && p
< src_end
)
5157 source_charbuf
[i
++] = *p
++;
5159 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
5160 ccl
->last_block
= 1;
5161 ccl_driver (ccl
, source_charbuf
, charbuf
, i
, charbuf_end
- charbuf
,
5163 charbuf
+= ccl
->produced
;
5165 src
+= source_byteidx
[ccl
->consumed
];
5167 src
+= ccl
->consumed
;
5168 consumed_chars
+= ccl
->consumed
;
5169 if (p
== src_end
|| ccl
->status
!= CCL_STAT_SUSPEND_BY_SRC
)
5173 switch (ccl
->status
)
5175 case CCL_STAT_SUSPEND_BY_SRC
:
5176 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5178 case CCL_STAT_SUSPEND_BY_DST
:
5179 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5182 case CCL_STAT_INVALID_CMD
:
5183 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5186 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5189 coding
->consumed_char
+= consumed_chars
;
5190 coding
->consumed
= src
- coding
->source
;
5191 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
5195 encode_coding_ccl (struct coding_system
*coding
)
5197 struct ccl_program
*ccl
= &coding
->spec
.ccl
->ccl
;
5198 int multibytep
= coding
->dst_multibyte
;
5199 int *charbuf
= coding
->charbuf
;
5200 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5201 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5202 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5203 int destination_charbuf
[1024];
5204 int i
, produced_chars
= 0;
5205 Lisp_Object attrs
, charset_list
;
5207 CODING_GET_INFO (coding
, attrs
, charset_list
);
5208 if (coding
->consumed_char
== coding
->src_chars
5209 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
5210 ccl
->last_block
= 1;
5212 while (charbuf
< charbuf_end
)
5214 ccl_driver (ccl
, charbuf
, destination_charbuf
,
5215 charbuf_end
- charbuf
, 1024, charset_list
);
5218 ASSURE_DESTINATION (ccl
->produced
* 2);
5219 for (i
= 0; i
< ccl
->produced
; i
++)
5220 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
5224 ASSURE_DESTINATION (ccl
->produced
);
5225 for (i
= 0; i
< ccl
->produced
; i
++)
5226 *dst
++ = destination_charbuf
[i
] & 0xFF;
5227 produced_chars
+= ccl
->produced
;
5229 charbuf
+= ccl
->consumed
;
5230 if (ccl
->status
== CCL_STAT_QUIT
5231 || ccl
->status
== CCL_STAT_INVALID_CMD
)
5235 switch (ccl
->status
)
5237 case CCL_STAT_SUSPEND_BY_SRC
:
5238 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5240 case CCL_STAT_SUSPEND_BY_DST
:
5241 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_DST
);
5244 case CCL_STAT_INVALID_CMD
:
5245 record_conversion_result (coding
, CODING_RESULT_INTERRUPT
);
5248 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5252 coding
->produced_char
+= produced_chars
;
5253 coding
->produced
= dst
- coding
->destination
;
5259 /*** 10, 11. no-conversion handlers ***/
5261 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5264 decode_coding_raw_text (struct coding_system
*coding
)
5267 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5269 coding
->chars_at_source
= 1;
5270 coding
->consumed_char
= coding
->src_chars
;
5271 coding
->consumed
= coding
->src_bytes
;
5272 if (eol_crlf
&& coding
->source
[coding
->src_bytes
- 1] == '\r')
5274 coding
->consumed_char
--;
5276 record_conversion_result (coding
, CODING_RESULT_INSUFFICIENT_SRC
);
5279 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5283 encode_coding_raw_text (struct coding_system
*coding
)
5285 int multibytep
= coding
->dst_multibyte
;
5286 int *charbuf
= coding
->charbuf
;
5287 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
5288 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5289 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5290 int produced_chars
= 0;
5295 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
5297 if (coding
->src_multibyte
)
5298 while (charbuf
< charbuf_end
)
5300 ASSURE_DESTINATION (safe_room
);
5302 if (ASCII_CHAR_P (c
))
5303 EMIT_ONE_ASCII_BYTE (c
);
5304 else if (CHAR_BYTE8_P (c
))
5306 c
= CHAR_TO_BYTE8 (c
);
5311 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
5313 CHAR_STRING_ADVANCE (c
, p1
);
5316 EMIT_ONE_BYTE (*p0
);
5322 while (charbuf
< charbuf_end
)
5324 ASSURE_DESTINATION (safe_room
);
5331 if (coding
->src_multibyte
)
5333 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5335 while (charbuf
< charbuf_end
)
5337 ASSURE_DESTINATION (safe_room
);
5339 if (ASCII_CHAR_P (c
))
5341 else if (CHAR_BYTE8_P (c
))
5342 *dst
++ = CHAR_TO_BYTE8 (c
);
5344 CHAR_STRING_ADVANCE (c
, dst
);
5349 ASSURE_DESTINATION (charbuf_end
- charbuf
);
5350 while (charbuf
< charbuf_end
&& dst
< dst_end
)
5351 *dst
++ = *charbuf
++;
5353 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
5355 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5356 coding
->produced_char
+= produced_chars
;
5357 coding
->produced
= dst
- coding
->destination
;
5361 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5362 Check if a text is encoded in a charset-based coding system. If it
5363 is, return 1, else return 0. */
5366 detect_coding_charset (struct coding_system
*coding
,
5367 struct coding_detection_info
*detect_info
)
5369 const unsigned char *src
= coding
->source
, *src_base
;
5370 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5371 int multibytep
= coding
->src_multibyte
;
5372 int consumed_chars
= 0;
5373 Lisp_Object attrs
, valids
, name
;
5375 int head_ascii
= coding
->head_ascii
;
5376 int check_latin_extra
= 0;
5378 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
5380 coding
= &coding_categories
[coding_category_charset
];
5381 attrs
= CODING_ID_ATTRS (coding
->id
);
5382 valids
= AREF (attrs
, coding_attr_charset_valids
);
5383 name
= CODING_ID_NAME (coding
->id
);
5384 if (strncmp (SSDATA (SYMBOL_NAME (name
)),
5385 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5386 || strncmp (SSDATA (SYMBOL_NAME (name
)),
5387 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5388 check_latin_extra
= 1;
5390 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
5397 struct charset
*charset
;
5404 val
= AREF (valids
, c
);
5410 && check_latin_extra
5411 && (!VECTORP (Vlatin_extra_code_table
)
5412 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
])))
5414 found
= CATEGORY_MASK_CHARSET
;
5418 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5419 dim
= CHARSET_DIMENSION (charset
);
5420 for (idx
= 1; idx
< dim
; idx
++)
5425 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 2]
5426 || c
> charset
->code_space
[(dim
- 1 - idx
) * 2 + 1])
5435 for (; CONSP (val
); val
= XCDR (val
))
5437 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5438 dim
= CHARSET_DIMENSION (charset
);
5444 if (c
< charset
->code_space
[(dim
- 1 - idx
) * 4]
5445 || c
> charset
->code_space
[(dim
- 1 - idx
) * 4 + 1])
5460 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
5464 detect_info
->found
|= found
;
5469 decode_coding_charset (struct coding_system
*coding
)
5471 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5472 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
5473 const unsigned char *src_base
;
5474 int *charbuf
= coding
->charbuf
+ coding
->charbuf_used
;
5475 /* We may produce one charset annotation in one loop and one more at
5478 = coding
->charbuf
+ coding
->charbuf_size
- (MAX_ANNOTATION_LENGTH
* 2);
5479 int consumed_chars
= 0, consumed_chars_base
;
5480 int multibytep
= coding
->src_multibyte
;
5481 Lisp_Object attrs
, charset_list
, valids
;
5482 int char_offset
= coding
->produced_char
;
5483 int last_offset
= char_offset
;
5484 int last_id
= charset_ascii
;
5486 !inhibit_eol_conversion
&& EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
);
5487 int byte_after_cr
= -1;
5489 CODING_GET_INFO (coding
, attrs
, charset_list
);
5490 valids
= AREF (attrs
, coding_attr_charset_valids
);
5496 struct charset
*charset
;
5502 consumed_chars_base
= consumed_chars
;
5504 if (charbuf
>= charbuf_end
)
5506 if (byte_after_cr
>= 0)
5511 if (byte_after_cr
>= 0)
5519 if (eol_crlf
&& c
== '\r')
5520 ONE_MORE_BYTE (byte_after_cr
);
5526 val
= AREF (valids
, c
);
5527 if (! INTEGERP (val
) && ! CONSP (val
))
5531 charset
= CHARSET_FROM_ID (XFASTINT (val
));
5532 dim
= CHARSET_DIMENSION (charset
);
5536 code
= (code
<< 8) | c
;
5539 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
5544 /* VAL is a list of charset IDs. It is assured that the
5545 list is sorted by charset dimensions (smaller one
5549 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
5550 dim
= CHARSET_DIMENSION (charset
);
5554 code
= (code
<< 8) | c
;
5557 CODING_DECODE_CHAR (coding
, src
, src_base
,
5558 src_end
, charset
, code
, c
);
5566 if (charset
->id
!= charset_ascii
5567 && last_id
!= charset
->id
)
5569 if (last_id
!= charset_ascii
)
5570 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5571 last_id
= charset
->id
;
5572 last_offset
= char_offset
;
5581 consumed_chars
= consumed_chars_base
;
5583 *charbuf
++ = c
< 0 ? -c
: ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
5589 if (last_id
!= charset_ascii
)
5590 ADD_CHARSET_DATA (charbuf
, char_offset
- last_offset
, last_id
);
5591 coding
->consumed_char
+= consumed_chars_base
;
5592 coding
->consumed
= src_base
- coding
->source
;
5593 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
5597 encode_coding_charset (struct coding_system
*coding
)
5599 int multibytep
= coding
->dst_multibyte
;
5600 int *charbuf
= coding
->charbuf
;
5601 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5602 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5603 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5604 int safe_room
= MAX_MULTIBYTE_LENGTH
;
5605 int produced_chars
= 0;
5606 Lisp_Object attrs
, charset_list
;
5607 int ascii_compatible
;
5610 CODING_GET_INFO (coding
, attrs
, charset_list
);
5611 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
5613 while (charbuf
< charbuf_end
)
5615 struct charset
*charset
;
5618 ASSURE_DESTINATION (safe_room
);
5620 if (ascii_compatible
&& ASCII_CHAR_P (c
))
5621 EMIT_ONE_ASCII_BYTE (c
);
5622 else if (CHAR_BYTE8_P (c
))
5624 c
= CHAR_TO_BYTE8 (c
);
5629 charset
= char_charset (c
, charset_list
, &code
);
5632 if (CHARSET_DIMENSION (charset
) == 1)
5633 EMIT_ONE_BYTE (code
);
5634 else if (CHARSET_DIMENSION (charset
) == 2)
5635 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
5636 else if (CHARSET_DIMENSION (charset
) == 3)
5637 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
5639 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
5640 (code
>> 8) & 0xFF, code
& 0xFF);
5644 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
5645 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
5647 c
= coding
->default_char
;
5653 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
5654 coding
->produced_char
+= produced_chars
;
5655 coding
->produced
= dst
- coding
->destination
;
5660 /*** 7. C library functions ***/
5662 /* Setup coding context CODING from information about CODING_SYSTEM.
5663 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5664 CODING_SYSTEM is invalid, signal an error. */
5667 setup_coding_system (Lisp_Object coding_system
, struct coding_system
*coding
)
5670 Lisp_Object eol_type
;
5671 Lisp_Object coding_type
;
5674 if (NILP (coding_system
))
5675 coding_system
= Qundecided
;
5677 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
5679 attrs
= CODING_ID_ATTRS (coding
->id
);
5680 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
5683 coding
->head_ascii
= -1;
5684 if (VECTORP (eol_type
))
5685 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5686 | CODING_REQUIRE_DETECTION_MASK
);
5687 else if (! EQ (eol_type
, Qunix
))
5688 coding
->common_flags
= (CODING_REQUIRE_DECODING_MASK
5689 | CODING_REQUIRE_ENCODING_MASK
);
5691 coding
->common_flags
= 0;
5692 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5693 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5694 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
5695 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5696 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs
)))
5697 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
5699 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5700 coding
->max_charset_id
= SCHARS (val
) - 1;
5701 coding
->safe_charsets
= SDATA (val
);
5702 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
5703 coding
->carryover_bytes
= 0;
5705 coding_type
= CODING_ATTR_TYPE (attrs
);
5706 if (EQ (coding_type
, Qundecided
))
5708 coding
->detector
= NULL
;
5709 coding
->decoder
= decode_coding_raw_text
;
5710 coding
->encoder
= encode_coding_raw_text
;
5711 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5713 else if (EQ (coding_type
, Qiso_2022
))
5716 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5718 /* Invoke graphic register 0 to plane 0. */
5719 CODING_ISO_INVOCATION (coding
, 0) = 0;
5720 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5721 CODING_ISO_INVOCATION (coding
, 1)
5722 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
5723 /* Setup the initial status of designation. */
5724 for (i
= 0; i
< 4; i
++)
5725 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
5726 /* Not single shifting initially. */
5727 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
5728 /* Beginning of buffer should also be regarded as bol. */
5729 CODING_ISO_BOL (coding
) = 1;
5730 coding
->detector
= detect_coding_iso_2022
;
5731 coding
->decoder
= decode_coding_iso_2022
;
5732 coding
->encoder
= encode_coding_iso_2022
;
5733 if (flags
& CODING_ISO_FLAG_SAFE
)
5734 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
5735 coding
->common_flags
5736 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5737 | CODING_REQUIRE_FLUSHING_MASK
);
5738 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
5739 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
5740 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
5741 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
5742 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5744 setup_iso_safe_charsets (attrs
);
5745 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
5746 coding
->max_charset_id
= SCHARS (val
) - 1;
5747 coding
->safe_charsets
= SDATA (val
);
5749 CODING_ISO_FLAGS (coding
) = flags
;
5750 CODING_ISO_CMP_STATUS (coding
)->state
= COMPOSING_NO
;
5751 CODING_ISO_CMP_STATUS (coding
)->method
= COMPOSITION_NO
;
5752 CODING_ISO_EXTSEGMENT_LEN (coding
) = 0;
5753 CODING_ISO_EMBEDDED_UTF_8 (coding
) = 0;
5755 else if (EQ (coding_type
, Qcharset
))
5757 coding
->detector
= detect_coding_charset
;
5758 coding
->decoder
= decode_coding_charset
;
5759 coding
->encoder
= encode_coding_charset
;
5760 coding
->common_flags
5761 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5763 else if (EQ (coding_type
, Qutf_8
))
5765 val
= AREF (attrs
, coding_attr_utf_bom
);
5766 CODING_UTF_8_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5767 : EQ (val
, Qt
) ? utf_with_bom
5769 coding
->detector
= detect_coding_utf_8
;
5770 coding
->decoder
= decode_coding_utf_8
;
5771 coding
->encoder
= encode_coding_utf_8
;
5772 coding
->common_flags
5773 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5774 if (CODING_UTF_8_BOM (coding
) == utf_detect_bom
)
5775 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5777 else if (EQ (coding_type
, Qutf_16
))
5779 val
= AREF (attrs
, coding_attr_utf_bom
);
5780 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_detect_bom
5781 : EQ (val
, Qt
) ? utf_with_bom
5783 val
= AREF (attrs
, coding_attr_utf_16_endian
);
5784 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
5785 : utf_16_little_endian
);
5786 CODING_UTF_16_SURROGATE (coding
) = 0;
5787 coding
->detector
= detect_coding_utf_16
;
5788 coding
->decoder
= decode_coding_utf_16
;
5789 coding
->encoder
= encode_coding_utf_16
;
5790 coding
->common_flags
5791 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5792 if (CODING_UTF_16_BOM (coding
) == utf_detect_bom
)
5793 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
5795 else if (EQ (coding_type
, Qccl
))
5797 coding
->detector
= detect_coding_ccl
;
5798 coding
->decoder
= decode_coding_ccl
;
5799 coding
->encoder
= encode_coding_ccl
;
5800 coding
->common_flags
5801 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
5802 | CODING_REQUIRE_FLUSHING_MASK
);
5804 else if (EQ (coding_type
, Qemacs_mule
))
5806 coding
->detector
= detect_coding_emacs_mule
;
5807 coding
->decoder
= decode_coding_emacs_mule
;
5808 coding
->encoder
= encode_coding_emacs_mule
;
5809 coding
->common_flags
5810 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5811 coding
->spec
.emacs_mule
.full_support
= 1;
5812 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
5813 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
5815 Lisp_Object tail
, safe_charsets
;
5816 int max_charset_id
= 0;
5818 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5820 if (max_charset_id
< XFASTINT (XCAR (tail
)))
5821 max_charset_id
= XFASTINT (XCAR (tail
));
5822 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
5823 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
5824 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
5826 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
5827 coding
->max_charset_id
= max_charset_id
;
5828 coding
->safe_charsets
= SDATA (safe_charsets
);
5829 coding
->spec
.emacs_mule
.full_support
= 1;
5831 coding
->spec
.emacs_mule
.cmp_status
.state
= COMPOSING_NO
;
5832 coding
->spec
.emacs_mule
.cmp_status
.method
= COMPOSITION_NO
;
5834 else if (EQ (coding_type
, Qshift_jis
))
5836 coding
->detector
= detect_coding_sjis
;
5837 coding
->decoder
= decode_coding_sjis
;
5838 coding
->encoder
= encode_coding_sjis
;
5839 coding
->common_flags
5840 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5842 else if (EQ (coding_type
, Qbig5
))
5844 coding
->detector
= detect_coding_big5
;
5845 coding
->decoder
= decode_coding_big5
;
5846 coding
->encoder
= encode_coding_big5
;
5847 coding
->common_flags
5848 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
5850 else /* EQ (coding_type, Qraw_text) */
5852 coding
->detector
= NULL
;
5853 coding
->decoder
= decode_coding_raw_text
;
5854 coding
->encoder
= encode_coding_raw_text
;
5855 if (! EQ (eol_type
, Qunix
))
5857 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
5858 if (! VECTORP (eol_type
))
5859 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
5867 /* Return a list of charsets supported by CODING. */
5870 coding_charset_list (struct coding_system
*coding
)
5872 Lisp_Object attrs
, charset_list
;
5874 CODING_GET_INFO (coding
, attrs
, charset_list
);
5875 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5877 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5879 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5880 charset_list
= Viso_2022_charset_list
;
5882 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5884 charset_list
= Vemacs_mule_charset_list
;
5886 return charset_list
;
5890 /* Return a list of charsets supported by CODING-SYSTEM. */
5893 coding_system_charset_list (Lisp_Object coding_system
)
5896 Lisp_Object attrs
, charset_list
;
5898 CHECK_CODING_SYSTEM_GET_ID (coding_system
, id
);
5899 attrs
= CODING_ID_ATTRS (id
);
5901 if (EQ (CODING_ATTR_TYPE (attrs
), Qiso_2022
))
5903 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
5905 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
5906 charset_list
= Viso_2022_charset_list
;
5908 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
5910 else if (EQ (CODING_ATTR_TYPE (attrs
), Qemacs_mule
))
5912 charset_list
= Vemacs_mule_charset_list
;
5916 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
5918 return charset_list
;
5922 /* Return raw-text or one of its subsidiaries that has the same
5923 eol_type as CODING-SYSTEM. */
5926 raw_text_coding_system (Lisp_Object coding_system
)
5928 Lisp_Object spec
, attrs
;
5929 Lisp_Object eol_type
, raw_text_eol_type
;
5931 if (NILP (coding_system
))
5933 spec
= CODING_SYSTEM_SPEC (coding_system
);
5934 attrs
= AREF (spec
, 0);
5936 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
5937 return coding_system
;
5939 eol_type
= AREF (spec
, 2);
5940 if (VECTORP (eol_type
))
5942 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5943 raw_text_eol_type
= AREF (spec
, 2);
5944 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5945 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5946 : AREF (raw_text_eol_type
, 2));
5950 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5951 the subsidiary that has the same eol-spec as PARENT (if it is not
5952 nil and specifies end-of-line format) or the system's setting
5953 (system_eol_type). */
5956 coding_inherit_eol_type (Lisp_Object coding_system
, Lisp_Object parent
)
5958 Lisp_Object spec
, eol_type
;
5960 if (NILP (coding_system
))
5961 coding_system
= Qraw_text
;
5962 spec
= CODING_SYSTEM_SPEC (coding_system
);
5963 eol_type
= AREF (spec
, 2);
5964 if (VECTORP (eol_type
))
5966 Lisp_Object parent_eol_type
;
5968 if (! NILP (parent
))
5970 Lisp_Object parent_spec
;
5972 parent_spec
= CODING_SYSTEM_SPEC (parent
);
5973 parent_eol_type
= AREF (parent_spec
, 2);
5974 if (VECTORP (parent_eol_type
))
5975 parent_eol_type
= system_eol_type
;
5978 parent_eol_type
= system_eol_type
;
5979 if (EQ (parent_eol_type
, Qunix
))
5980 coding_system
= AREF (eol_type
, 0);
5981 else if (EQ (parent_eol_type
, Qdos
))
5982 coding_system
= AREF (eol_type
, 1);
5983 else if (EQ (parent_eol_type
, Qmac
))
5984 coding_system
= AREF (eol_type
, 2);
5986 return coding_system
;
5990 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5991 decided for writing to a process. If not, complement them, and
5992 return a new coding system. */
5995 complement_process_encoding_system (Lisp_Object coding_system
)
5997 Lisp_Object coding_base
= Qnil
, eol_base
= Qnil
;
5998 Lisp_Object spec
, attrs
;
6001 for (i
= 0; i
< 3; i
++)
6004 coding_system
= CDR_SAFE (Vdefault_process_coding_system
);
6006 coding_system
= preferred_coding_system ();
6007 spec
= CODING_SYSTEM_SPEC (coding_system
);
6010 attrs
= AREF (spec
, 0);
6011 if (NILP (coding_base
) && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6012 coding_base
= CODING_ATTR_BASE_NAME (attrs
);
6013 if (NILP (eol_base
) && ! VECTORP (AREF (spec
, 2)))
6014 eol_base
= coding_system
;
6015 if (! NILP (coding_base
) && ! NILP (eol_base
))
6020 /* The original CODING_SYSTEM didn't specify text-conversion or
6021 eol-conversion. Be sure that we return a fully complemented
6023 coding_system
= coding_inherit_eol_type (coding_base
, eol_base
);
6024 return coding_system
;
6028 /* Emacs has a mechanism to automatically detect a coding system if it
6029 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6030 it's impossible to distinguish some coding systems accurately
6031 because they use the same range of codes. So, at first, coding
6032 systems are categorized into 7, those are:
6034 o coding-category-emacs-mule
6036 The category for a coding system which has the same code range
6037 as Emacs' internal format. Assigned the coding-system (Lisp
6038 symbol) `emacs-mule' by default.
6040 o coding-category-sjis
6042 The category for a coding system which has the same code range
6043 as SJIS. Assigned the coding-system (Lisp
6044 symbol) `japanese-shift-jis' by default.
6046 o coding-category-iso-7
6048 The category for a coding system which has the same code range
6049 as ISO2022 of 7-bit environment. This doesn't use any locking
6050 shift and single shift functions. This can encode/decode all
6051 charsets. Assigned the coding-system (Lisp symbol)
6052 `iso-2022-7bit' by default.
6054 o coding-category-iso-7-tight
6056 Same as coding-category-iso-7 except that this can
6057 encode/decode only the specified charsets.
6059 o coding-category-iso-8-1
6061 The category for a coding system which has the same code range
6062 as ISO2022 of 8-bit environment and graphic plane 1 used only
6063 for DIMENSION1 charset. This doesn't use any locking shift
6064 and single shift functions. Assigned the coding-system (Lisp
6065 symbol) `iso-latin-1' by default.
6067 o coding-category-iso-8-2
6069 The category for a coding system which has the same code range
6070 as ISO2022 of 8-bit environment and graphic plane 1 used only
6071 for DIMENSION2 charset. This doesn't use any locking shift
6072 and single shift functions. Assigned the coding-system (Lisp
6073 symbol) `japanese-iso-8bit' by default.
6075 o coding-category-iso-7-else
6077 The category for a coding system which has the same code range
6078 as ISO2022 of 7-bit environment but uses locking shift or
6079 single shift functions. Assigned the coding-system (Lisp
6080 symbol) `iso-2022-7bit-lock' by default.
6082 o coding-category-iso-8-else
6084 The category for a coding system which has the same code range
6085 as ISO2022 of 8-bit environment but uses locking shift or
6086 single shift functions. Assigned the coding-system (Lisp
6087 symbol) `iso-2022-8bit-ss2' by default.
6089 o coding-category-big5
6091 The category for a coding system which has the same code range
6092 as BIG5. Assigned the coding-system (Lisp symbol)
6093 `cn-big5' by default.
6095 o coding-category-utf-8
6097 The category for a coding system which has the same code range
6098 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6099 symbol) `utf-8' by default.
6101 o coding-category-utf-16-be
6103 The category for a coding system in which a text has an
6104 Unicode signature (cf. Unicode Standard) in the order of BIG
6105 endian at the head. Assigned the coding-system (Lisp symbol)
6106 `utf-16-be' by default.
6108 o coding-category-utf-16-le
6110 The category for a coding system in which a text has an
6111 Unicode signature (cf. Unicode Standard) in the order of
6112 LITTLE endian at the head. Assigned the coding-system (Lisp
6113 symbol) `utf-16-le' by default.
6115 o coding-category-ccl
6117 The category for a coding system of which encoder/decoder is
6118 written in CCL programs. The default value is nil, i.e., no
6119 coding system is assigned.
6121 o coding-category-binary
6123 The category for a coding system not categorized in any of the
6124 above. Assigned the coding-system (Lisp symbol)
6125 `no-conversion' by default.
6127 Each of them is a Lisp symbol and the value is an actual
6128 `coding-system's (this is also a Lisp symbol) assigned by a user.
6129 What Emacs does actually is to detect a category of coding system.
6130 Then, it uses a `coding-system' assigned to it. If Emacs can't
6131 decide only one possible category, it selects a category of the
6132 highest priority. Priorities of categories are also specified by a
6133 user in a Lisp variable `coding-category-list'.
6137 #define EOL_SEEN_NONE 0
6138 #define EOL_SEEN_LF 1
6139 #define EOL_SEEN_CR 2
6140 #define EOL_SEEN_CRLF 4
6142 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6143 SOURCE is encoded. If CATEGORY is one of
6144 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6145 two-byte, else they are encoded by one-byte.
6147 Return one of EOL_SEEN_XXX. */
6149 #define MAX_EOL_CHECK_COUNT 3
6152 detect_eol (const unsigned char *source
, EMACS_INT src_bytes
,
6153 enum coding_category category
)
6155 const unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
6158 int eol_seen
= EOL_SEEN_NONE
;
6160 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
6164 msb
= category
== (coding_category_utf_16_le
6165 | coding_category_utf_16_le_nosig
);
6168 while (src
+ 1 < src_end
)
6171 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
6176 this_eol
= EOL_SEEN_LF
;
6177 else if (src
+ 3 >= src_end
6178 || src
[msb
+ 2] != 0
6179 || src
[lsb
+ 2] != '\n')
6180 this_eol
= EOL_SEEN_CR
;
6183 this_eol
= EOL_SEEN_CRLF
;
6187 if (eol_seen
== EOL_SEEN_NONE
)
6188 /* This is the first end-of-line. */
6189 eol_seen
= this_eol
;
6190 else if (eol_seen
!= this_eol
)
6192 /* The found type is different from what found before.
6193 Allow for stray ^M characters in DOS EOL files. */
6194 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6195 || (eol_seen
== EOL_SEEN_CRLF
6196 && this_eol
== EOL_SEEN_CR
))
6197 eol_seen
= EOL_SEEN_CRLF
;
6200 eol_seen
= EOL_SEEN_LF
;
6204 if (++total
== MAX_EOL_CHECK_COUNT
)
6211 while (src
< src_end
)
6214 if (c
== '\n' || c
== '\r')
6219 this_eol
= EOL_SEEN_LF
;
6220 else if (src
>= src_end
|| *src
!= '\n')
6221 this_eol
= EOL_SEEN_CR
;
6223 this_eol
= EOL_SEEN_CRLF
, src
++;
6225 if (eol_seen
== EOL_SEEN_NONE
)
6226 /* This is the first end-of-line. */
6227 eol_seen
= this_eol
;
6228 else if (eol_seen
!= this_eol
)
6230 /* The found type is different from what found before.
6231 Allow for stray ^M characters in DOS EOL files. */
6232 if ((eol_seen
== EOL_SEEN_CR
&& this_eol
== EOL_SEEN_CRLF
)
6233 || (eol_seen
== EOL_SEEN_CRLF
&& this_eol
== EOL_SEEN_CR
))
6234 eol_seen
= EOL_SEEN_CRLF
;
6237 eol_seen
= EOL_SEEN_LF
;
6241 if (++total
== MAX_EOL_CHECK_COUNT
)
6250 adjust_coding_eol_type (struct coding_system
*coding
, int eol_seen
)
6252 Lisp_Object eol_type
;
6254 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6255 if (eol_seen
& EOL_SEEN_LF
)
6257 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
6260 else if (eol_seen
& EOL_SEEN_CRLF
)
6262 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
6265 else if (eol_seen
& EOL_SEEN_CR
)
6267 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
6273 /* Detect how a text specified in CODING is encoded. If a coding
6274 system is detected, update fields of CODING by the detected coding
6278 detect_coding (struct coding_system
*coding
)
6280 const unsigned char *src
, *src_end
;
6281 int saved_mode
= coding
->mode
;
6283 coding
->consumed
= coding
->consumed_char
= 0;
6284 coding
->produced
= coding
->produced_char
= 0;
6285 coding_set_source (coding
);
6287 src_end
= coding
->source
+ coding
->src_bytes
;
6288 coding
->head_ascii
= 0;
6290 /* If we have not yet decided the text encoding type, detect it
6292 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
6295 struct coding_detection_info detect_info
;
6296 int null_byte_found
= 0, eight_bit_found
= 0;
6298 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6299 for (src
= coding
->source
; src
< src_end
; src
++)
6304 eight_bit_found
= 1;
6305 if (null_byte_found
)
6310 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
6311 && ! inhibit_iso_escape_detection
6312 && ! detect_info
.checked
)
6314 if (detect_coding_iso_2022 (coding
, &detect_info
))
6316 /* We have scanned the whole data. */
6317 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
6319 /* We didn't find an 8-bit code. We may
6320 have found a null-byte, but it's very
6321 rare that a binary file conforms to
6324 coding
->head_ascii
= src
- coding
->source
;
6326 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
6330 else if (! c
&& !inhibit_null_byte_detection
)
6332 null_byte_found
= 1;
6333 if (eight_bit_found
)
6336 if (! eight_bit_found
)
6337 coding
->head_ascii
++;
6339 else if (! eight_bit_found
)
6340 coding
->head_ascii
++;
6343 if (null_byte_found
|| eight_bit_found
6344 || coding
->head_ascii
< coding
->src_bytes
6345 || detect_info
.found
)
6347 enum coding_category category
;
6348 struct coding_system
*this;
6350 if (coding
->head_ascii
== coding
->src_bytes
)
6351 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6352 for (i
= 0; i
< coding_category_raw_text
; i
++)
6354 category
= coding_priorities
[i
];
6355 this = coding_categories
+ category
;
6356 if (detect_info
.found
& (1 << category
))
6361 if (null_byte_found
)
6363 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
6364 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
6366 for (i
= 0; i
< coding_category_raw_text
; i
++)
6368 category
= coding_priorities
[i
];
6369 this = coding_categories
+ category
;
6372 /* No coding system of this category is defined. */
6373 detect_info
.rejected
|= (1 << category
);
6375 else if (category
>= coding_category_raw_text
)
6377 else if (detect_info
.checked
& (1 << category
))
6379 if (detect_info
.found
& (1 << category
))
6382 else if ((*(this->detector
)) (coding
, &detect_info
)
6383 && detect_info
.found
& (1 << category
))
6385 if (category
== coding_category_utf_16_auto
)
6387 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6388 category
= coding_category_utf_16_le
;
6390 category
= coding_category_utf_16_be
;
6397 if (i
< coding_category_raw_text
)
6398 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
6399 else if (null_byte_found
)
6400 setup_coding_system (Qno_conversion
, coding
);
6401 else if ((detect_info
.rejected
& CATEGORY_MASK_ANY
)
6402 == CATEGORY_MASK_ANY
)
6403 setup_coding_system (Qraw_text
, coding
);
6404 else if (detect_info
.rejected
)
6405 for (i
= 0; i
< coding_category_raw_text
; i
++)
6406 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6408 this = coding_categories
+ coding_priorities
[i
];
6409 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
6414 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6415 == coding_category_utf_8_auto
)
6417 Lisp_Object coding_systems
;
6418 struct coding_detection_info detect_info
;
6421 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6422 detect_info
.found
= detect_info
.rejected
= 0;
6423 coding
->head_ascii
= 0;
6424 if (CONSP (coding_systems
)
6425 && detect_coding_utf_8 (coding
, &detect_info
))
6427 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
6428 setup_coding_system (XCAR (coding_systems
), coding
);
6430 setup_coding_system (XCDR (coding_systems
), coding
);
6433 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding
->id
)))
6434 == coding_category_utf_16_auto
)
6436 Lisp_Object coding_systems
;
6437 struct coding_detection_info detect_info
;
6440 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_bom
);
6441 detect_info
.found
= detect_info
.rejected
= 0;
6442 coding
->head_ascii
= 0;
6443 if (CONSP (coding_systems
)
6444 && detect_coding_utf_16 (coding
, &detect_info
))
6446 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
6447 setup_coding_system (XCAR (coding_systems
), coding
);
6448 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
6449 setup_coding_system (XCDR (coding_systems
), coding
);
6452 coding
->mode
= saved_mode
;
6457 decode_eol (struct coding_system
*coding
)
6459 Lisp_Object eol_type
;
6460 unsigned char *p
, *pbeg
, *pend
;
6462 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6463 if (EQ (eol_type
, Qunix
) || inhibit_eol_conversion
)
6466 if (NILP (coding
->dst_object
))
6467 pbeg
= coding
->destination
;
6469 pbeg
= BYTE_POS_ADDR (coding
->dst_pos_byte
);
6470 pend
= pbeg
+ coding
->produced
;
6472 if (VECTORP (eol_type
))
6474 int eol_seen
= EOL_SEEN_NONE
;
6476 for (p
= pbeg
; p
< pend
; p
++)
6479 eol_seen
|= EOL_SEEN_LF
;
6480 else if (*p
== '\r')
6482 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
6484 eol_seen
|= EOL_SEEN_CRLF
;
6488 eol_seen
|= EOL_SEEN_CR
;
6491 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6492 if ((eol_seen
& EOL_SEEN_CRLF
) != 0
6493 && (eol_seen
& EOL_SEEN_CR
) != 0
6494 && (eol_seen
& EOL_SEEN_LF
) == 0)
6495 eol_seen
= EOL_SEEN_CRLF
;
6496 else if (eol_seen
!= EOL_SEEN_NONE
6497 && eol_seen
!= EOL_SEEN_LF
6498 && eol_seen
!= EOL_SEEN_CRLF
6499 && eol_seen
!= EOL_SEEN_CR
)
6500 eol_seen
= EOL_SEEN_LF
;
6501 if (eol_seen
!= EOL_SEEN_NONE
)
6502 eol_type
= adjust_coding_eol_type (coding
, eol_seen
);
6505 if (EQ (eol_type
, Qmac
))
6507 for (p
= pbeg
; p
< pend
; p
++)
6511 else if (EQ (eol_type
, Qdos
))
6515 if (NILP (coding
->dst_object
))
6517 /* Start deleting '\r' from the tail to minimize the memory
6519 for (p
= pend
- 2; p
>= pbeg
; p
--)
6522 memmove (p
, p
+ 1, pend
-- - p
- 1);
6528 int pos_byte
= coding
->dst_pos_byte
;
6529 int pos
= coding
->dst_pos
;
6530 int pos_end
= pos
+ coding
->produced_char
- 1;
6532 while (pos
< pos_end
)
6534 p
= BYTE_POS_ADDR (pos_byte
);
6535 if (*p
== '\r' && p
[1] == '\n')
6537 del_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ 1, 0);
6542 if (coding
->dst_multibyte
)
6543 pos_byte
+= BYTES_BY_CHAR_HEAD (*p
);
6548 coding
->produced
-= n
;
6549 coding
->produced_char
-= n
;
6554 /* Return a translation table (or list of them) from coding system
6555 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6556 decoding (ENCODEP is zero). */
6559 get_translation_table (Lisp_Object attrs
, int encodep
, int *max_lookup
)
6561 Lisp_Object standard
, translation_table
;
6564 if (NILP (Venable_character_translation
))
6571 translation_table
= CODING_ATTR_ENCODE_TBL (attrs
),
6572 standard
= Vstandard_translation_table_for_encode
;
6574 translation_table
= CODING_ATTR_DECODE_TBL (attrs
),
6575 standard
= Vstandard_translation_table_for_decode
;
6576 if (NILP (translation_table
))
6577 translation_table
= standard
;
6580 if (SYMBOLP (translation_table
))
6581 translation_table
= Fget (translation_table
, Qtranslation_table
);
6582 else if (CONSP (translation_table
))
6584 translation_table
= Fcopy_sequence (translation_table
);
6585 for (val
= translation_table
; CONSP (val
); val
= XCDR (val
))
6586 if (SYMBOLP (XCAR (val
)))
6587 XSETCAR (val
, Fget (XCAR (val
), Qtranslation_table
));
6589 if (CHAR_TABLE_P (standard
))
6591 if (CONSP (translation_table
))
6592 translation_table
= nconc2 (translation_table
,
6593 Fcons (standard
, Qnil
));
6595 translation_table
= Fcons (translation_table
,
6596 Fcons (standard
, Qnil
));
6603 if (CHAR_TABLE_P (translation_table
)
6604 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table
)) > 1)
6606 val
= XCHAR_TABLE (translation_table
)->extras
[1];
6607 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
6608 *max_lookup
= XFASTINT (val
);
6610 else if (CONSP (translation_table
))
6612 Lisp_Object tail
, val
;
6614 for (tail
= translation_table
; CONSP (tail
); tail
= XCDR (tail
))
6615 if (CHAR_TABLE_P (XCAR (tail
))
6616 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail
))) > 1)
6618 val
= XCHAR_TABLE (XCAR (tail
))->extras
[1];
6619 if (NATNUMP (val
) && *max_lookup
< XFASTINT (val
))
6620 *max_lookup
= XFASTINT (val
);
6624 return translation_table
;
6627 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6630 if (CHAR_TABLE_P (table)) \
6632 trans = CHAR_TABLE_REF (table, c); \
6633 if (CHARACTERP (trans)) \
6634 c = XFASTINT (trans), trans = Qnil; \
6636 else if (CONSP (table)) \
6640 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6641 if (CHAR_TABLE_P (XCAR (tail))) \
6643 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6644 if (CHARACTERP (trans)) \
6645 c = XFASTINT (trans), trans = Qnil; \
6646 else if (! NILP (trans)) \
6653 /* Return a translation of character(s) at BUF according to TRANS.
6654 TRANS is TO-CHAR or ((FROM . TO) ...) where
6655 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6656 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6657 translation is found, and Qnil if not found..
6658 If BUF is too short to lookup characters in FROM, return Qt. */
6661 get_translation (Lisp_Object trans
, int *buf
, int *buf_end
)
6664 if (INTEGERP (trans
))
6666 for (; CONSP (trans
); trans
= XCDR (trans
))
6668 Lisp_Object val
= XCAR (trans
);
6669 Lisp_Object from
= XCAR (val
);
6670 int len
= ASIZE (from
);
6673 for (i
= 0; i
< len
; i
++)
6675 if (buf
+ i
== buf_end
)
6677 if (XINT (AREF (from
, i
)) != buf
[i
])
6688 produce_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
6691 unsigned char *dst
= coding
->destination
+ coding
->produced
;
6692 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
6694 EMACS_INT produced_chars
= 0;
6697 if (! coding
->chars_at_source
)
6699 /* Source characters are in coding->charbuf. */
6700 int *buf
= coding
->charbuf
;
6701 int *buf_end
= buf
+ coding
->charbuf_used
;
6703 if (EQ (coding
->src_object
, coding
->dst_object
))
6705 coding_set_source (coding
);
6706 dst_end
= ((unsigned char *) coding
->source
) + coding
->consumed
;
6709 while (buf
< buf_end
)
6715 int from_nchars
= 1, to_nchars
= 1;
6716 Lisp_Object trans
= Qnil
;
6718 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
6721 trans
= get_translation (trans
, buf
, buf_end
);
6722 if (INTEGERP (trans
))
6724 else if (CONSP (trans
))
6726 from_nchars
= ASIZE (XCAR (trans
));
6727 trans
= XCDR (trans
);
6728 if (INTEGERP (trans
))
6732 to_nchars
= ASIZE (trans
);
6733 c
= XINT (AREF (trans
, 0));
6736 else if (EQ (trans
, Qt
) && ! last_block
)
6740 if (dst
+ MAX_MULTIBYTE_LENGTH
* to_nchars
> dst_end
)
6742 dst
= alloc_destination (coding
,
6744 + MAX_MULTIBYTE_LENGTH
* to_nchars
,
6746 if (EQ (coding
->src_object
, coding
->dst_object
))
6748 coding_set_source (coding
);
6749 dst_end
= (((unsigned char *) coding
->source
)
6750 + coding
->consumed
);
6753 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6756 for (i
= 0; i
< to_nchars
; i
++)
6759 c
= XINT (AREF (trans
, i
));
6760 if (coding
->dst_multibyte
6761 || ! CHAR_BYTE8_P (c
))
6762 CHAR_STRING_ADVANCE_NO_UNIFY (c
, dst
);
6764 *dst
++ = CHAR_TO_BYTE8 (c
);
6766 produced_chars
+= to_nchars
;
6770 /* This is an annotation datum. (-C) is the length. */
6773 carryover
= buf_end
- buf
;
6777 /* Source characters are at coding->source. */
6778 const unsigned char *src
= coding
->source
;
6779 const unsigned char *src_end
= src
+ coding
->consumed
;
6781 if (EQ (coding
->dst_object
, coding
->src_object
))
6782 dst_end
= (unsigned char *) src
;
6783 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
6785 if (coding
->src_multibyte
)
6788 EMACS_INT consumed_chars
= 0;
6792 const unsigned char *src_base
= src
;
6798 if (EQ (coding
->src_object
, coding
->dst_object
))
6799 dst_end
= (unsigned char *) src
;
6802 EMACS_INT offset
= src
- coding
->source
;
6804 dst
= alloc_destination (coding
, src_end
- src
+ 1,
6806 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6807 coding_set_source (coding
);
6808 src
= coding
->source
+ offset
;
6809 src_end
= coding
->source
+ coding
->src_bytes
;
6810 if (EQ (coding
->src_object
, coding
->dst_object
))
6811 dst_end
= (unsigned char *) src
;
6821 while (src
< src_end
)
6826 if (dst
>= dst_end
- 1)
6828 if (EQ (coding
->src_object
, coding
->dst_object
))
6829 dst_end
= (unsigned char *) src
;
6830 if (dst
>= dst_end
- 1)
6832 EMACS_INT offset
= src
- coding
->source
;
6833 EMACS_INT more_bytes
;
6835 if (EQ (coding
->src_object
, coding
->dst_object
))
6836 more_bytes
= ((src_end
- src
) / 2) + 2;
6838 more_bytes
= src_end
- src
+ 2;
6839 dst
= alloc_destination (coding
, more_bytes
, dst
);
6840 dst_end
= coding
->destination
+ coding
->dst_bytes
;
6841 coding_set_source (coding
);
6842 src
= coding
->source
+ offset
;
6843 src_end
= coding
->source
+ coding
->src_bytes
;
6844 if (EQ (coding
->src_object
, coding
->dst_object
))
6845 dst_end
= (unsigned char *) src
;
6853 if (!EQ (coding
->src_object
, coding
->dst_object
))
6855 EMACS_INT require
= coding
->src_bytes
- coding
->dst_bytes
;
6859 EMACS_INT offset
= src
- coding
->source
;
6861 dst
= alloc_destination (coding
, require
, dst
);
6862 coding_set_source (coding
);
6863 src
= coding
->source
+ offset
;
6864 src_end
= coding
->source
+ coding
->src_bytes
;
6867 produced_chars
= coding
->consumed_char
;
6868 while (src
< src_end
)
6873 produced
= dst
- (coding
->destination
+ coding
->produced
);
6874 if (BUFFERP (coding
->dst_object
) && produced_chars
> 0)
6875 insert_from_gap (produced_chars
, produced
);
6876 coding
->produced
+= produced
;
6877 coding
->produced_char
+= produced_chars
;
6881 /* Compose text in CODING->object according to the annotation data at
6882 CHARBUF. CHARBUF is an array:
6883 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6887 produce_composition (struct coding_system
*coding
, int *charbuf
, EMACS_INT pos
)
6891 enum composition_method method
;
6892 Lisp_Object components
;
6894 len
= -charbuf
[0] - MAX_ANNOTATION_LENGTH
;
6895 to
= pos
+ charbuf
[2];
6896 method
= (enum composition_method
) (charbuf
[4]);
6898 if (method
== COMPOSITION_RELATIVE
)
6902 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
6905 if (method
== COMPOSITION_WITH_RULE
)
6906 len
= charbuf
[2] * 3 - 2;
6907 charbuf
+= MAX_ANNOTATION_LENGTH
;
6908 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6909 for (i
= j
= 0; i
< len
&& charbuf
[i
] != -1; i
++, j
++)
6911 if (charbuf
[i
] >= 0)
6912 args
[j
] = make_number (charbuf
[i
]);
6916 args
[j
] = make_number (charbuf
[i
] % 0x100);
6919 components
= (i
== j
? Fstring (j
, args
) : Fvector (j
, args
));
6921 compose_text (pos
, to
, components
, Qnil
, coding
->dst_object
);
6925 /* Put `charset' property on text in CODING->object according to
6926 the annotation data at CHARBUF. CHARBUF is an array:
6927 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6931 produce_charset (struct coding_system
*coding
, int *charbuf
, EMACS_INT pos
)
6933 EMACS_INT from
= pos
- charbuf
[2];
6934 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[3]);
6936 Fput_text_property (make_number (from
), make_number (pos
),
6937 Qcharset
, CHARSET_NAME (charset
),
6938 coding
->dst_object
);
6942 #define CHARBUF_SIZE 0x4000
6944 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6946 int size = CHARBUF_SIZE; \
6948 coding->charbuf = NULL; \
6949 while (size > 1024) \
6951 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6952 if (coding->charbuf) \
6956 if (! coding->charbuf) \
6958 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6959 return coding->result; \
6961 coding->charbuf_size = size; \
6966 produce_annotation (struct coding_system
*coding
, EMACS_INT pos
)
6968 int *charbuf
= coding
->charbuf
;
6969 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
6971 if (NILP (coding
->dst_object
))
6974 while (charbuf
< charbuf_end
)
6980 int len
= -*charbuf
;
6985 case CODING_ANNOTATE_COMPOSITION_MASK
:
6986 produce_composition (coding
, charbuf
, pos
);
6988 case CODING_ANNOTATE_CHARSET_MASK
:
6989 produce_charset (coding
, charbuf
, pos
);
6997 /* Decode the data at CODING->src_object into CODING->dst_object.
6998 CODING->src_object is a buffer, a string, or nil.
6999 CODING->dst_object is a buffer.
7001 If CODING->src_object is a buffer, it must be the current buffer.
7002 In this case, if CODING->src_pos is positive, it is a position of
7003 the source text in the buffer, otherwise, the source text is in the
7004 gap area of the buffer, and CODING->src_pos specifies the offset of
7005 the text from GPT (which must be the same as PT). If this is the
7006 same buffer as CODING->dst_object, CODING->src_pos must be
7009 If CODING->src_object is a string, CODING->src_pos is an index to
7012 If CODING->src_object is nil, CODING->source must already point to
7013 the non-relocatable memory area. In this case, CODING->src_pos is
7014 an offset from CODING->source.
7016 The decoded data is inserted at the current point of the buffer
7021 decode_coding (struct coding_system
*coding
)
7024 Lisp_Object undo_list
;
7025 Lisp_Object translation_table
;
7026 struct ccl_spec cclspec
;
7030 if (BUFFERP (coding
->src_object
)
7031 && coding
->src_pos
> 0
7032 && coding
->src_pos
< GPT
7033 && coding
->src_pos
+ coding
->src_chars
> GPT
)
7034 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
7037 if (BUFFERP (coding
->dst_object
))
7039 if (current_buffer
!= XBUFFER (coding
->dst_object
))
7040 set_buffer_internal (XBUFFER (coding
->dst_object
));
7042 move_gap_both (PT
, PT_BYTE
);
7043 undo_list
= BVAR (current_buffer
, undo_list
);
7044 BVAR (current_buffer
, undo_list
) = Qt
;
7047 coding
->consumed
= coding
->consumed_char
= 0;
7048 coding
->produced
= coding
->produced_char
= 0;
7049 coding
->chars_at_source
= 0;
7050 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7053 ALLOC_CONVERSION_WORK_AREA (coding
);
7055 attrs
= CODING_ID_ATTRS (coding
->id
);
7056 translation_table
= get_translation_table (attrs
, 0, NULL
);
7059 if (coding
->decoder
== decode_coding_ccl
)
7061 coding
->spec
.ccl
= &cclspec
;
7062 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_DECODER (coding
));
7066 EMACS_INT pos
= coding
->dst_pos
+ coding
->produced_char
;
7068 coding_set_source (coding
);
7069 coding
->annotated
= 0;
7070 coding
->charbuf_used
= carryover
;
7071 (*(coding
->decoder
)) (coding
);
7072 coding_set_destination (coding
);
7073 carryover
= produce_chars (coding
, translation_table
, 0);
7074 if (coding
->annotated
)
7075 produce_annotation (coding
, pos
);
7076 for (i
= 0; i
< carryover
; i
++)
7078 = coding
->charbuf
[coding
->charbuf_used
- carryover
+ i
];
7080 while (coding
->result
== CODING_RESULT_INSUFFICIENT_DST
7081 || (coding
->consumed
< coding
->src_bytes
7082 && (coding
->result
== CODING_RESULT_SUCCESS
7083 || coding
->result
== CODING_RESULT_INVALID_SRC
)));
7087 coding_set_destination (coding
);
7088 coding
->charbuf_used
= carryover
;
7089 produce_chars (coding
, translation_table
, 1);
7092 coding
->carryover_bytes
= 0;
7093 if (coding
->consumed
< coding
->src_bytes
)
7095 int nbytes
= coding
->src_bytes
- coding
->consumed
;
7096 const unsigned char *src
;
7098 coding_set_source (coding
);
7099 coding_set_destination (coding
);
7100 src
= coding
->source
+ coding
->consumed
;
7102 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
7104 /* Flush out unprocessed data as binary chars. We are sure
7105 that the number of data is less than the size of
7107 coding
->charbuf_used
= 0;
7108 coding
->chars_at_source
= 0;
7110 while (nbytes
-- > 0)
7115 c
= BYTE8_TO_CHAR (c
);
7116 coding
->charbuf
[coding
->charbuf_used
++] = c
;
7118 produce_chars (coding
, Qnil
, 1);
7122 /* Record unprocessed bytes in coding->carryover. We are
7123 sure that the number of data is less than the size of
7124 coding->carryover. */
7125 unsigned char *p
= coding
->carryover
;
7127 if (nbytes
> sizeof coding
->carryover
)
7128 nbytes
= sizeof coding
->carryover
;
7129 coding
->carryover_bytes
= nbytes
;
7130 while (nbytes
-- > 0)
7133 coding
->consumed
= coding
->src_bytes
;
7136 if (! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
)
7137 && !inhibit_eol_conversion
)
7138 decode_eol (coding
);
7139 if (BUFFERP (coding
->dst_object
))
7141 BVAR (current_buffer
, undo_list
) = undo_list
;
7142 record_insert (coding
->dst_pos
, coding
->produced_char
);
7144 return coding
->result
;
7148 /* Extract an annotation datum from a composition starting at POS and
7149 ending before LIMIT of CODING->src_object (buffer or string), store
7150 the data in BUF, set *STOP to a starting position of the next
7151 composition (if any) or to LIMIT, and return the address of the
7152 next element of BUF.
7154 If such an annotation is not found, set *STOP to a starting
7155 position of a composition after POS (if any) or to LIMIT, and
7159 handle_composition_annotation (EMACS_INT pos
, EMACS_INT limit
,
7160 struct coding_system
*coding
, int *buf
,
7163 EMACS_INT start
, end
;
7166 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
7169 else if (start
> pos
)
7175 /* We found a composition. Store the corresponding
7176 annotation data in BUF. */
7178 enum composition_method method
= COMPOSITION_METHOD (prop
);
7179 int nchars
= COMPOSITION_LENGTH (prop
);
7181 ADD_COMPOSITION_DATA (buf
, nchars
, 0, method
);
7182 if (method
!= COMPOSITION_RELATIVE
)
7184 Lisp_Object components
;
7187 components
= COMPOSITION_COMPONENTS (prop
);
7188 if (VECTORP (components
))
7190 len
= XVECTOR (components
)->size
;
7191 for (i
= 0; i
< len
; i
++)
7192 *buf
++ = XINT (AREF (components
, i
));
7194 else if (STRINGP (components
))
7196 len
= SCHARS (components
);
7200 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
7204 else if (INTEGERP (components
))
7207 *buf
++ = XINT (components
);
7209 else if (CONSP (components
))
7211 for (len
= 0; CONSP (components
);
7212 len
++, components
= XCDR (components
))
7213 *buf
++ = XINT (XCAR (components
));
7221 if (find_composition (end
, limit
, &start
, &end
, &prop
,
7232 /* Extract an annotation datum from a text property `charset' at POS of
7233 CODING->src_object (buffer of string), store the data in BUF, set
7234 *STOP to the position where the value of `charset' property changes
7235 (limiting by LIMIT), and return the address of the next element of
7238 If the property value is nil, set *STOP to the position where the
7239 property value is non-nil (limiting by LIMIT), and return BUF. */
7242 handle_charset_annotation (EMACS_INT pos
, EMACS_INT limit
,
7243 struct coding_system
*coding
, int *buf
,
7246 Lisp_Object val
, next
;
7249 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
7250 if (! NILP (val
) && CHARSETP (val
))
7251 id
= XINT (CHARSET_SYMBOL_ID (val
));
7254 ADD_CHARSET_DATA (buf
, 0, id
);
7255 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
7257 make_number (limit
));
7258 *stop
= XINT (next
);
7264 consume_chars (struct coding_system
*coding
, Lisp_Object translation_table
,
7267 int *buf
= coding
->charbuf
;
7268 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
7269 const unsigned char *src
= coding
->source
+ coding
->consumed
;
7270 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
7271 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
7272 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
7273 int multibytep
= coding
->src_multibyte
;
7274 Lisp_Object eol_type
;
7276 EMACS_INT stop
, stop_composition
, stop_charset
;
7277 int *lookup_buf
= NULL
;
7279 if (! NILP (translation_table
))
7280 lookup_buf
= alloca (sizeof (int) * max_lookup
);
7282 eol_type
= inhibit_eol_conversion
? Qunix
: CODING_ID_EOL_TYPE (coding
->id
);
7283 if (VECTORP (eol_type
))
7286 /* Note: composition handling is not yet implemented. */
7287 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7289 if (NILP (coding
->src_object
))
7290 stop
= stop_composition
= stop_charset
= end_pos
;
7293 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
7294 stop
= stop_composition
= pos
;
7296 stop
= stop_composition
= end_pos
;
7297 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
7298 stop
= stop_charset
= pos
;
7300 stop_charset
= end_pos
;
7303 /* Compensate for CRLF and conversion. */
7304 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
7305 while (buf
< buf_end
)
7313 if (pos
== stop_composition
)
7314 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
7315 buf
, &stop_composition
);
7316 if (pos
== stop_charset
)
7317 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
7318 buf
, &stop_charset
);
7319 stop
= (stop_composition
< stop_charset
7320 ? stop_composition
: stop_charset
);
7327 if (coding
->encoder
== encode_coding_raw_text
7328 || coding
->encoder
== encode_coding_ccl
)
7330 else if ((bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
7331 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
+= bytes
;
7333 c
= BYTE8_TO_CHAR (*src
), src
++, pos
++;
7336 c
= STRING_CHAR_ADVANCE_NO_UNIFY (src
), pos
++;
7337 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
7339 if (! EQ (eol_type
, Qunix
))
7343 if (EQ (eol_type
, Qdos
))
7351 LOOKUP_TRANSLATION_TABLE (translation_table
, c
, trans
);
7356 int from_nchars
= 1, to_nchars
= 1;
7357 int *lookup_buf_end
;
7358 const unsigned char *p
= src
;
7362 for (i
= 1; i
< max_lookup
&& p
< src_end
; i
++)
7363 lookup_buf
[i
] = STRING_CHAR_ADVANCE (p
);
7364 lookup_buf_end
= lookup_buf
+ i
;
7365 trans
= get_translation (trans
, lookup_buf
, lookup_buf_end
);
7366 if (INTEGERP (trans
))
7368 else if (CONSP (trans
))
7370 from_nchars
= ASIZE (XCAR (trans
));
7371 trans
= XCDR (trans
);
7372 if (INTEGERP (trans
))
7376 to_nchars
= ASIZE (trans
);
7377 if (buf
+ to_nchars
> buf_end
)
7379 c
= XINT (AREF (trans
, 0));
7385 for (i
= 1; i
< to_nchars
; i
++)
7386 *buf
++ = XINT (AREF (trans
, i
));
7387 for (i
= 1; i
< from_nchars
; i
++, pos
++)
7388 src
+= MULTIBYTE_LENGTH_NO_CHECK (src
);
7392 coding
->consumed
= src
- coding
->source
;
7393 coding
->consumed_char
= pos
- coding
->src_pos
;
7394 coding
->charbuf_used
= buf
- coding
->charbuf
;
7395 coding
->chars_at_source
= 0;
7399 /* Encode the text at CODING->src_object into CODING->dst_object.
7400 CODING->src_object is a buffer or a string.
7401 CODING->dst_object is a buffer or nil.
7403 If CODING->src_object is a buffer, it must be the current buffer.
7404 In this case, if CODING->src_pos is positive, it is a position of
7405 the source text in the buffer, otherwise. the source text is in the
7406 gap area of the buffer, and coding->src_pos specifies the offset of
7407 the text from GPT (which must be the same as PT). If this is the
7408 same buffer as CODING->dst_object, CODING->src_pos must be
7409 negative and CODING should not have `pre-write-conversion'.
7411 If CODING->src_object is a string, CODING should not have
7412 `pre-write-conversion'.
7414 If CODING->dst_object is a buffer, the encoded data is inserted at
7415 the current point of that buffer.
7417 If CODING->dst_object is nil, the encoded data is placed at the
7418 memory area specified by CODING->destination. */
7421 encode_coding (struct coding_system
*coding
)
7424 Lisp_Object translation_table
;
7426 struct ccl_spec cclspec
;
7428 attrs
= CODING_ID_ATTRS (coding
->id
);
7429 if (coding
->encoder
== encode_coding_raw_text
)
7430 translation_table
= Qnil
, max_lookup
= 0;
7432 translation_table
= get_translation_table (attrs
, 1, &max_lookup
);
7434 if (BUFFERP (coding
->dst_object
))
7436 set_buffer_internal (XBUFFER (coding
->dst_object
));
7437 coding
->dst_multibyte
7438 = ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
7441 coding
->consumed
= coding
->consumed_char
= 0;
7442 coding
->produced
= coding
->produced_char
= 0;
7443 record_conversion_result (coding
, CODING_RESULT_SUCCESS
);
7446 ALLOC_CONVERSION_WORK_AREA (coding
);
7448 if (coding
->encoder
== encode_coding_ccl
)
7450 coding
->spec
.ccl
= &cclspec
;
7451 setup_ccl_program (&cclspec
.ccl
, CODING_CCL_ENCODER (coding
));
7454 coding_set_source (coding
);
7455 consume_chars (coding
, translation_table
, max_lookup
);
7456 coding_set_destination (coding
);
7457 (*(coding
->encoder
)) (coding
);
7458 } while (coding
->consumed_char
< coding
->src_chars
);
7460 if (BUFFERP (coding
->dst_object
) && coding
->produced_char
> 0)
7461 insert_from_gap (coding
->produced_char
, coding
->produced
);
7463 return (coding
->result
);
7467 /* Name (or base name) of work buffer for code conversion. */
7468 static Lisp_Object Vcode_conversion_workbuf_name
;
7470 /* A working buffer used by the top level conversion. Once it is
7471 created, it is never destroyed. It has the name
7472 Vcode_conversion_workbuf_name. The other working buffers are
7473 destroyed after the use is finished, and their names are modified
7474 versions of Vcode_conversion_workbuf_name. */
7475 static Lisp_Object Vcode_conversion_reused_workbuf
;
7477 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7478 static int reused_workbuf_in_use
;
7481 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7482 multibyteness of returning buffer. */
7485 make_conversion_work_buffer (int multibyte
)
7487 Lisp_Object name
, workbuf
;
7488 struct buffer
*current
;
7490 if (reused_workbuf_in_use
++)
7492 name
= Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name
, Qnil
);
7493 workbuf
= Fget_buffer_create (name
);
7497 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf
)))
7498 Vcode_conversion_reused_workbuf
7499 = Fget_buffer_create (Vcode_conversion_workbuf_name
);
7500 workbuf
= Vcode_conversion_reused_workbuf
;
7502 current
= current_buffer
;
7503 set_buffer_internal (XBUFFER (workbuf
));
7504 /* We can't allow modification hooks to run in the work buffer. For
7505 instance, directory_files_internal assumes that file decoding
7506 doesn't compile new regexps. */
7507 Fset (Fmake_local_variable (Qinhibit_modification_hooks
), Qt
);
7509 BVAR (current_buffer
, undo_list
) = Qt
;
7510 BVAR (current_buffer
, enable_multibyte_characters
) = multibyte
? Qt
: Qnil
;
7511 set_buffer_internal (current
);
7517 code_conversion_restore (Lisp_Object arg
)
7519 Lisp_Object current
, workbuf
;
7520 struct gcpro gcpro1
;
7523 current
= XCAR (arg
);
7524 workbuf
= XCDR (arg
);
7525 if (! NILP (workbuf
))
7527 if (EQ (workbuf
, Vcode_conversion_reused_workbuf
))
7528 reused_workbuf_in_use
= 0;
7529 else if (! NILP (Fbuffer_live_p (workbuf
)))
7530 Fkill_buffer (workbuf
);
7532 set_buffer_internal (XBUFFER (current
));
7538 code_conversion_save (int with_work_buf
, int multibyte
)
7540 Lisp_Object workbuf
= Qnil
;
7543 workbuf
= make_conversion_work_buffer (multibyte
);
7544 record_unwind_protect (code_conversion_restore
,
7545 Fcons (Fcurrent_buffer (), workbuf
));
7550 decode_coding_gap (struct coding_system
*coding
,
7551 EMACS_INT chars
, EMACS_INT bytes
)
7553 int count
= SPECPDL_INDEX ();
7556 code_conversion_save (0, 0);
7558 coding
->src_object
= Fcurrent_buffer ();
7559 coding
->src_chars
= chars
;
7560 coding
->src_bytes
= bytes
;
7561 coding
->src_pos
= -chars
;
7562 coding
->src_pos_byte
= -bytes
;
7563 coding
->src_multibyte
= chars
< bytes
;
7564 coding
->dst_object
= coding
->src_object
;
7565 coding
->dst_pos
= PT
;
7566 coding
->dst_pos_byte
= PT_BYTE
;
7567 coding
->dst_multibyte
= ! NILP (BVAR (current_buffer
, enable_multibyte_characters
));
7569 if (CODING_REQUIRE_DETECTION (coding
))
7570 detect_coding (coding
);
7572 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
7573 current_buffer
->text
->inhibit_shrinking
= 1;
7574 decode_coding (coding
);
7575 current_buffer
->text
->inhibit_shrinking
= 0;
7577 attrs
= CODING_ID_ATTRS (coding
->id
);
7578 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
7580 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
7583 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
7584 val
= call1 (CODING_ATTR_POST_READ (attrs
),
7585 make_number (coding
->produced_char
));
7587 coding
->produced_char
+= Z
- prev_Z
;
7588 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
7591 unbind_to (count
, Qnil
);
7592 return coding
->result
;
7596 encode_coding_gap (struct coding_system
*coding
,
7597 EMACS_INT chars
, EMACS_INT bytes
)
7599 int count
= SPECPDL_INDEX ();
7601 code_conversion_save (0, 0);
7603 coding
->src_object
= Fcurrent_buffer ();
7604 coding
->src_chars
= chars
;
7605 coding
->src_bytes
= bytes
;
7606 coding
->src_pos
= -chars
;
7607 coding
->src_pos_byte
= -bytes
;
7608 coding
->src_multibyte
= chars
< bytes
;
7609 coding
->dst_object
= coding
->src_object
;
7610 coding
->dst_pos
= PT
;
7611 coding
->dst_pos_byte
= PT_BYTE
;
7613 encode_coding (coding
);
7615 unbind_to (count
, Qnil
);
7616 return coding
->result
;
7620 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7621 SRC_OBJECT into DST_OBJECT by coding context CODING.
7623 SRC_OBJECT is a buffer, a string, or Qnil.
7625 If it is a buffer, the text is at point of the buffer. FROM and TO
7626 are positions in the buffer.
7628 If it is a string, the text is at the beginning of the string.
7629 FROM and TO are indices to the string.
7631 If it is nil, the text is at coding->source. FROM and TO are
7632 indices to coding->source.
7634 DST_OBJECT is a buffer, Qt, or Qnil.
7636 If it is a buffer, the decoded text is inserted at point of the
7637 buffer. If the buffer is the same as SRC_OBJECT, the source text
7640 If it is Qt, a string is made from the decoded text, and
7641 set in CODING->dst_object.
7643 If it is Qnil, the decoded text is stored at CODING->destination.
7644 The caller must allocate CODING->dst_bytes bytes at
7645 CODING->destination by xmalloc. If the decoded text is longer than
7646 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7650 decode_coding_object (struct coding_system
*coding
,
7651 Lisp_Object src_object
,
7652 EMACS_INT from
, EMACS_INT from_byte
,
7653 EMACS_INT to
, EMACS_INT to_byte
,
7654 Lisp_Object dst_object
)
7656 int count
= SPECPDL_INDEX ();
7657 unsigned char *destination
;
7658 EMACS_INT dst_bytes
;
7659 EMACS_INT chars
= to
- from
;
7660 EMACS_INT bytes
= to_byte
- from_byte
;
7662 int saved_pt
= -1, saved_pt_byte
;
7663 int need_marker_adjustment
= 0;
7664 Lisp_Object old_deactivate_mark
;
7666 old_deactivate_mark
= Vdeactivate_mark
;
7668 if (NILP (dst_object
))
7670 destination
= coding
->destination
;
7671 dst_bytes
= coding
->dst_bytes
;
7674 coding
->src_object
= src_object
;
7675 coding
->src_chars
= chars
;
7676 coding
->src_bytes
= bytes
;
7677 coding
->src_multibyte
= chars
< bytes
;
7679 if (STRINGP (src_object
))
7681 coding
->src_pos
= from
;
7682 coding
->src_pos_byte
= from_byte
;
7684 else if (BUFFERP (src_object
))
7686 set_buffer_internal (XBUFFER (src_object
));
7688 move_gap_both (from
, from_byte
);
7689 if (EQ (src_object
, dst_object
))
7691 struct Lisp_Marker
*tail
;
7693 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7695 tail
->need_adjustment
7696 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
7697 need_marker_adjustment
|= tail
->need_adjustment
;
7699 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7700 TEMP_SET_PT_BOTH (from
, from_byte
);
7701 current_buffer
->text
->inhibit_shrinking
= 1;
7702 del_range_both (from
, from_byte
, to
, to_byte
, 1);
7703 coding
->src_pos
= -chars
;
7704 coding
->src_pos_byte
= -bytes
;
7708 coding
->src_pos
= from
;
7709 coding
->src_pos_byte
= from_byte
;
7713 if (CODING_REQUIRE_DETECTION (coding
))
7714 detect_coding (coding
);
7715 attrs
= CODING_ID_ATTRS (coding
->id
);
7717 if (EQ (dst_object
, Qt
)
7718 || (! NILP (CODING_ATTR_POST_READ (attrs
))
7719 && NILP (dst_object
)))
7721 coding
->dst_multibyte
= !CODING_FOR_UNIBYTE (coding
);
7722 coding
->dst_object
= code_conversion_save (1, coding
->dst_multibyte
);
7723 coding
->dst_pos
= BEG
;
7724 coding
->dst_pos_byte
= BEG_BYTE
;
7726 else if (BUFFERP (dst_object
))
7728 code_conversion_save (0, 0);
7729 coding
->dst_object
= dst_object
;
7730 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
7731 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
7732 coding
->dst_multibyte
7733 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
7737 code_conversion_save (0, 0);
7738 coding
->dst_object
= Qnil
;
7739 /* Most callers presume this will return a multibyte result, and they
7740 won't use `binary' or `raw-text' anyway, so let's not worry about
7741 CODING_FOR_UNIBYTE. */
7742 coding
->dst_multibyte
= 1;
7745 decode_coding (coding
);
7747 if (BUFFERP (coding
->dst_object
))
7748 set_buffer_internal (XBUFFER (coding
->dst_object
));
7750 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
7752 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
7753 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
7756 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
7757 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
7758 old_deactivate_mark
);
7759 val
= safe_call1 (CODING_ATTR_POST_READ (attrs
),
7760 make_number (coding
->produced_char
));
7763 coding
->produced_char
+= Z
- prev_Z
;
7764 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
7767 if (EQ (dst_object
, Qt
))
7769 coding
->dst_object
= Fbuffer_string ();
7771 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
7773 set_buffer_internal (XBUFFER (coding
->dst_object
));
7774 if (dst_bytes
< coding
->produced
)
7776 destination
= xrealloc (destination
, coding
->produced
);
7779 record_conversion_result (coding
,
7780 CODING_RESULT_INSUFFICIENT_MEM
);
7781 unbind_to (count
, Qnil
);
7784 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
7785 move_gap_both (BEGV
, BEGV_BYTE
);
7786 memcpy (destination
, BEGV_ADDR
, coding
->produced
);
7787 coding
->destination
= destination
;
7793 /* This is the case of:
7794 (BUFFERP (src_object) && EQ (src_object, dst_object))
7795 As we have moved PT while replacing the original buffer
7796 contents, we must recover it now. */
7797 set_buffer_internal (XBUFFER (src_object
));
7798 current_buffer
->text
->inhibit_shrinking
= 0;
7799 if (saved_pt
< from
)
7800 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
7801 else if (saved_pt
< from
+ chars
)
7802 TEMP_SET_PT_BOTH (from
, from_byte
);
7803 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
7804 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
7805 saved_pt_byte
+ (coding
->produced
- bytes
));
7807 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
7808 saved_pt_byte
+ (coding
->produced
- bytes
));
7810 if (need_marker_adjustment
)
7812 struct Lisp_Marker
*tail
;
7814 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7815 if (tail
->need_adjustment
)
7817 tail
->need_adjustment
= 0;
7818 if (tail
->insertion_type
)
7820 tail
->bytepos
= from_byte
;
7821 tail
->charpos
= from
;
7825 tail
->bytepos
= from_byte
+ coding
->produced
;
7827 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
7828 ? tail
->bytepos
: from
+ coding
->produced_char
);
7834 Vdeactivate_mark
= old_deactivate_mark
;
7835 unbind_to (count
, coding
->dst_object
);
7840 encode_coding_object (struct coding_system
*coding
,
7841 Lisp_Object src_object
,
7842 EMACS_INT from
, EMACS_INT from_byte
,
7843 EMACS_INT to
, EMACS_INT to_byte
,
7844 Lisp_Object dst_object
)
7846 int count
= SPECPDL_INDEX ();
7847 EMACS_INT chars
= to
- from
;
7848 EMACS_INT bytes
= to_byte
- from_byte
;
7850 int saved_pt
= -1, saved_pt_byte
;
7851 int need_marker_adjustment
= 0;
7852 int kill_src_buffer
= 0;
7853 Lisp_Object old_deactivate_mark
;
7855 old_deactivate_mark
= Vdeactivate_mark
;
7857 coding
->src_object
= src_object
;
7858 coding
->src_chars
= chars
;
7859 coding
->src_bytes
= bytes
;
7860 coding
->src_multibyte
= chars
< bytes
;
7862 attrs
= CODING_ID_ATTRS (coding
->id
);
7864 if (EQ (src_object
, dst_object
))
7866 struct Lisp_Marker
*tail
;
7868 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
7870 tail
->need_adjustment
7871 = tail
->charpos
== (tail
->insertion_type
? from
: to
);
7872 need_marker_adjustment
|= tail
->need_adjustment
;
7876 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
7878 coding
->src_object
= code_conversion_save (1, coding
->src_multibyte
);
7879 set_buffer_internal (XBUFFER (coding
->src_object
));
7880 if (STRINGP (src_object
))
7881 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
7882 else if (BUFFERP (src_object
))
7883 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
7885 insert_1_both ((char *) coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
7887 if (EQ (src_object
, dst_object
))
7889 set_buffer_internal (XBUFFER (src_object
));
7890 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7891 del_range_both (from
, from_byte
, to
, to_byte
, 1);
7892 set_buffer_internal (XBUFFER (coding
->src_object
));
7896 Lisp_Object args
[3];
7897 struct gcpro gcpro1
, gcpro2
, gcpro3
, gcpro4
, gcpro5
;
7899 GCPRO5 (coding
->src_object
, coding
->dst_object
, src_object
, dst_object
,
7900 old_deactivate_mark
);
7901 args
[0] = CODING_ATTR_PRE_WRITE (attrs
);
7902 args
[1] = make_number (BEG
);
7903 args
[2] = make_number (Z
);
7904 safe_call (3, args
);
7907 if (XBUFFER (coding
->src_object
) != current_buffer
)
7908 kill_src_buffer
= 1;
7909 coding
->src_object
= Fcurrent_buffer ();
7911 move_gap_both (BEG
, BEG_BYTE
);
7912 coding
->src_chars
= Z
- BEG
;
7913 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
7914 coding
->src_pos
= BEG
;
7915 coding
->src_pos_byte
= BEG_BYTE
;
7916 coding
->src_multibyte
= Z
< Z_BYTE
;
7918 else if (STRINGP (src_object
))
7920 code_conversion_save (0, 0);
7921 coding
->src_pos
= from
;
7922 coding
->src_pos_byte
= from_byte
;
7924 else if (BUFFERP (src_object
))
7926 code_conversion_save (0, 0);
7927 set_buffer_internal (XBUFFER (src_object
));
7928 if (EQ (src_object
, dst_object
))
7930 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
7931 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
7932 coding
->src_pos
= 0;
7933 coding
->src_pos_byte
= 0;
7937 if (from
< GPT
&& to
>= GPT
)
7938 move_gap_both (from
, from_byte
);
7939 coding
->src_pos
= from
;
7940 coding
->src_pos_byte
= from_byte
;
7944 code_conversion_save (0, 0);
7946 if (BUFFERP (dst_object
))
7948 coding
->dst_object
= dst_object
;
7949 if (EQ (src_object
, dst_object
))
7951 coding
->dst_pos
= from
;
7952 coding
->dst_pos_byte
= from_byte
;
7956 struct buffer
*current
= current_buffer
;
7958 set_buffer_temp (XBUFFER (dst_object
));
7959 coding
->dst_pos
= PT
;
7960 coding
->dst_pos_byte
= PT_BYTE
;
7961 move_gap_both (coding
->dst_pos
, coding
->dst_pos_byte
);
7962 set_buffer_temp (current
);
7964 coding
->dst_multibyte
7965 = ! NILP (BVAR (XBUFFER (dst_object
), enable_multibyte_characters
));
7967 else if (EQ (dst_object
, Qt
))
7969 coding
->dst_object
= Qnil
;
7970 coding
->dst_bytes
= coding
->src_chars
;
7971 if (coding
->dst_bytes
== 0)
7972 coding
->dst_bytes
= 1;
7973 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
7974 coding
->dst_multibyte
= 0;
7978 coding
->dst_object
= Qnil
;
7979 coding
->dst_multibyte
= 0;
7982 encode_coding (coding
);
7984 if (EQ (dst_object
, Qt
))
7986 if (BUFFERP (coding
->dst_object
))
7987 coding
->dst_object
= Fbuffer_string ();
7991 = make_unibyte_string ((char *) coding
->destination
,
7993 xfree (coding
->destination
);
7999 /* This is the case of:
8000 (BUFFERP (src_object) && EQ (src_object, dst_object))
8001 As we have moved PT while replacing the original buffer
8002 contents, we must recover it now. */
8003 set_buffer_internal (XBUFFER (src_object
));
8004 if (saved_pt
< from
)
8005 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
8006 else if (saved_pt
< from
+ chars
)
8007 TEMP_SET_PT_BOTH (from
, from_byte
);
8008 else if (! NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8009 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
8010 saved_pt_byte
+ (coding
->produced
- bytes
));
8012 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
8013 saved_pt_byte
+ (coding
->produced
- bytes
));
8015 if (need_marker_adjustment
)
8017 struct Lisp_Marker
*tail
;
8019 for (tail
= BUF_MARKERS (current_buffer
); tail
; tail
= tail
->next
)
8020 if (tail
->need_adjustment
)
8022 tail
->need_adjustment
= 0;
8023 if (tail
->insertion_type
)
8025 tail
->bytepos
= from_byte
;
8026 tail
->charpos
= from
;
8030 tail
->bytepos
= from_byte
+ coding
->produced
;
8032 = (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
8033 ? tail
->bytepos
: from
+ coding
->produced_char
);
8039 if (kill_src_buffer
)
8040 Fkill_buffer (coding
->src_object
);
8042 Vdeactivate_mark
= old_deactivate_mark
;
8043 unbind_to (count
, Qnil
);
8048 preferred_coding_system (void)
8050 int id
= coding_categories
[coding_priorities
[0]].id
;
8052 return CODING_ID_NAME (id
);
8057 /*** 8. Emacs Lisp library functions ***/
8059 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
8060 doc
: /* Return t if OBJECT is nil or a coding-system.
8061 See the documentation of `define-coding-system' for information
8062 about coding-system objects. */)
8063 (Lisp_Object object
)
8066 || CODING_SYSTEM_ID (object
) >= 0)
8068 if (! SYMBOLP (object
)
8069 || NILP (Fget (object
, Qcoding_system_define_form
)))
8074 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
8075 Sread_non_nil_coding_system
, 1, 1, 0,
8076 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8077 (Lisp_Object prompt
)
8082 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8083 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
8085 while (SCHARS (val
) == 0);
8086 return (Fintern (val
, Qnil
));
8089 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
8090 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8091 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8092 Ignores case when completing coding systems (all Emacs coding systems
8093 are lower-case). */)
8094 (Lisp_Object prompt
, Lisp_Object default_coding_system
)
8097 int count
= SPECPDL_INDEX ();
8099 if (SYMBOLP (default_coding_system
))
8100 default_coding_system
= SYMBOL_NAME (default_coding_system
);
8101 specbind (Qcompletion_ignore_case
, Qt
);
8102 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
8103 Qt
, Qnil
, Qcoding_system_history
,
8104 default_coding_system
, Qnil
);
8105 unbind_to (count
, Qnil
);
8106 return (SCHARS (val
) == 0 ? Qnil
: Fintern (val
, Qnil
));
8109 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
8111 doc
: /* Check validity of CODING-SYSTEM.
8112 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8113 It is valid if it is nil or a symbol defined as a coding system by the
8114 function `define-coding-system'. */)
8115 (Lisp_Object coding_system
)
8117 Lisp_Object define_form
;
8119 define_form
= Fget (coding_system
, Qcoding_system_define_form
);
8120 if (! NILP (define_form
))
8122 Fput (coding_system
, Qcoding_system_define_form
, Qnil
);
8123 safe_eval (define_form
);
8125 if (!NILP (Fcoding_system_p (coding_system
)))
8126 return coding_system
;
8127 xsignal1 (Qcoding_system_error
, coding_system
);
8131 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8132 HIGHEST is nonzero, return the coding system of the highest
8133 priority among the detected coding systems. Otherwise return a
8134 list of detected coding systems sorted by their priorities. If
8135 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8136 multibyte form but contains only ASCII and eight-bit chars.
8137 Otherwise, the bytes are raw bytes.
8139 CODING-SYSTEM controls the detection as below:
8141 If it is nil, detect both text-format and eol-format. If the
8142 text-format part of CODING-SYSTEM is already specified
8143 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8144 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8145 detect only text-format. */
8148 detect_coding_system (const unsigned char *src
,
8149 EMACS_INT src_chars
, EMACS_INT src_bytes
,
8150 int highest
, int multibytep
,
8151 Lisp_Object coding_system
)
8153 const unsigned char *src_end
= src
+ src_bytes
;
8154 Lisp_Object attrs
, eol_type
;
8155 Lisp_Object val
= Qnil
;
8156 struct coding_system coding
;
8158 struct coding_detection_info detect_info
;
8159 enum coding_category base_category
;
8160 int null_byte_found
= 0, eight_bit_found
= 0;
8162 if (NILP (coding_system
))
8163 coding_system
= Qundecided
;
8164 setup_coding_system (coding_system
, &coding
);
8165 attrs
= CODING_ID_ATTRS (coding
.id
);
8166 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
8167 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
8169 coding
.source
= src
;
8170 coding
.src_chars
= src_chars
;
8171 coding
.src_bytes
= src_bytes
;
8172 coding
.src_multibyte
= multibytep
;
8173 coding
.consumed
= 0;
8174 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8175 coding
.head_ascii
= 0;
8177 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
8179 /* At first, detect text-format if necessary. */
8180 base_category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8181 if (base_category
== coding_category_undecided
)
8183 enum coding_category category
;
8184 struct coding_system
*this;
8187 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8188 for (; src
< src_end
; src
++)
8193 eight_bit_found
= 1;
8194 if (null_byte_found
)
8199 if ((c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
8200 && ! inhibit_iso_escape_detection
8201 && ! detect_info
.checked
)
8203 if (detect_coding_iso_2022 (&coding
, &detect_info
))
8205 /* We have scanned the whole data. */
8206 if (! (detect_info
.rejected
& CATEGORY_MASK_ISO_7_ELSE
))
8208 /* We didn't find an 8-bit code. We may
8209 have found a null-byte, but it's very
8210 rare that a binary file confirm to
8213 coding
.head_ascii
= src
- coding
.source
;
8215 detect_info
.rejected
|= ~CATEGORY_MASK_ISO_ESCAPE
;
8219 else if (! c
&& !inhibit_null_byte_detection
)
8221 null_byte_found
= 1;
8222 if (eight_bit_found
)
8225 if (! eight_bit_found
)
8226 coding
.head_ascii
++;
8228 else if (! eight_bit_found
)
8229 coding
.head_ascii
++;
8232 if (null_byte_found
|| eight_bit_found
8233 || coding
.head_ascii
< coding
.src_bytes
8234 || detect_info
.found
)
8236 if (coding
.head_ascii
== coding
.src_bytes
)
8237 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8238 for (i
= 0; i
< coding_category_raw_text
; i
++)
8240 category
= coding_priorities
[i
];
8241 this = coding_categories
+ category
;
8242 if (detect_info
.found
& (1 << category
))
8247 if (null_byte_found
)
8249 detect_info
.checked
|= ~CATEGORY_MASK_UTF_16
;
8250 detect_info
.rejected
|= ~CATEGORY_MASK_UTF_16
;
8252 for (i
= 0; i
< coding_category_raw_text
; i
++)
8254 category
= coding_priorities
[i
];
8255 this = coding_categories
+ category
;
8259 /* No coding system of this category is defined. */
8260 detect_info
.rejected
|= (1 << category
);
8262 else if (category
>= coding_category_raw_text
)
8264 else if (detect_info
.checked
& (1 << category
))
8267 && (detect_info
.found
& (1 << category
)))
8270 else if ((*(this->detector
)) (&coding
, &detect_info
)
8272 && (detect_info
.found
& (1 << category
)))
8274 if (category
== coding_category_utf_16_auto
)
8276 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8277 category
= coding_category_utf_16_le
;
8279 category
= coding_category_utf_16_be
;
8287 if ((detect_info
.rejected
& CATEGORY_MASK_ANY
) == CATEGORY_MASK_ANY
8290 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
8291 id
= CODING_SYSTEM_ID (Qno_conversion
);
8292 val
= Fcons (make_number (id
), Qnil
);
8294 else if (! detect_info
.rejected
&& ! detect_info
.found
)
8296 detect_info
.found
= CATEGORY_MASK_ANY
;
8297 id
= coding_categories
[coding_category_undecided
].id
;
8298 val
= Fcons (make_number (id
), Qnil
);
8302 if (detect_info
.found
)
8304 detect_info
.found
= 1 << category
;
8305 val
= Fcons (make_number (this->id
), Qnil
);
8308 for (i
= 0; i
< coding_category_raw_text
; i
++)
8309 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
8311 detect_info
.found
= 1 << coding_priorities
[i
];
8312 id
= coding_categories
[coding_priorities
[i
]].id
;
8313 val
= Fcons (make_number (id
), Qnil
);
8319 int mask
= detect_info
.rejected
| detect_info
.found
;
8322 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8324 category
= coding_priorities
[i
];
8325 if (! (mask
& (1 << category
)))
8327 found
|= 1 << category
;
8328 id
= coding_categories
[category
].id
;
8330 val
= Fcons (make_number (id
), val
);
8333 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
8335 category
= coding_priorities
[i
];
8336 if (detect_info
.found
& (1 << category
))
8338 id
= coding_categories
[category
].id
;
8339 val
= Fcons (make_number (id
), val
);
8342 detect_info
.found
|= found
;
8345 else if (base_category
== coding_category_utf_8_auto
)
8347 if (detect_coding_utf_8 (&coding
, &detect_info
))
8349 struct coding_system
*this;
8351 if (detect_info
.found
& CATEGORY_MASK_UTF_8_SIG
)
8352 this = coding_categories
+ coding_category_utf_8_sig
;
8354 this = coding_categories
+ coding_category_utf_8_nosig
;
8355 val
= Fcons (make_number (this->id
), Qnil
);
8358 else if (base_category
== coding_category_utf_16_auto
)
8360 if (detect_coding_utf_16 (&coding
, &detect_info
))
8362 struct coding_system
*this;
8364 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
8365 this = coding_categories
+ coding_category_utf_16_le
;
8366 else if (detect_info
.found
& CATEGORY_MASK_UTF_16_BE
)
8367 this = coding_categories
+ coding_category_utf_16_be
;
8368 else if (detect_info
.rejected
& CATEGORY_MASK_UTF_16_LE_NOSIG
)
8369 this = coding_categories
+ coding_category_utf_16_be_nosig
;
8371 this = coding_categories
+ coding_category_utf_16_le_nosig
;
8372 val
= Fcons (make_number (this->id
), Qnil
);
8377 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
8378 val
= Fcons (make_number (coding
.id
), Qnil
);
8381 /* Then, detect eol-format if necessary. */
8383 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
= -1;
8386 if (VECTORP (eol_type
))
8388 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
8390 if (null_byte_found
)
8391 normal_eol
= EOL_SEEN_LF
;
8393 normal_eol
= detect_eol (coding
.source
, src_bytes
,
8394 coding_category_raw_text
);
8396 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
8397 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
8398 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
8399 coding_category_utf_16_be
);
8400 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
8401 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
8402 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
8403 coding_category_utf_16_le
);
8407 if (EQ (eol_type
, Qunix
))
8408 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
8409 else if (EQ (eol_type
, Qdos
))
8410 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
8412 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
8415 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
8417 enum coding_category category
;
8420 id
= XINT (XCAR (tail
));
8421 attrs
= CODING_ID_ATTRS (id
);
8422 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
8423 eol_type
= CODING_ID_EOL_TYPE (id
);
8424 if (VECTORP (eol_type
))
8426 if (category
== coding_category_utf_16_be
8427 || category
== coding_category_utf_16_be_nosig
)
8428 this_eol
= utf_16_be_eol
;
8429 else if (category
== coding_category_utf_16_le
8430 || category
== coding_category_utf_16_le_nosig
)
8431 this_eol
= utf_16_le_eol
;
8433 this_eol
= normal_eol
;
8435 if (this_eol
== EOL_SEEN_LF
)
8436 XSETCAR (tail
, AREF (eol_type
, 0));
8437 else if (this_eol
== EOL_SEEN_CRLF
)
8438 XSETCAR (tail
, AREF (eol_type
, 1));
8439 else if (this_eol
== EOL_SEEN_CR
)
8440 XSETCAR (tail
, AREF (eol_type
, 2));
8442 XSETCAR (tail
, CODING_ID_NAME (id
));
8445 XSETCAR (tail
, CODING_ID_NAME (id
));
8449 return (highest
? (CONSP (val
) ? XCAR (val
) : Qnil
) : val
);
8453 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
8455 doc
: /* Detect coding system of the text in the region between START and END.
8456 Return a list of possible coding systems ordered by priority.
8457 The coding systems to try and their priorities follows what
8458 the function `coding-system-priority-list' (which see) returns.
8460 If only ASCII characters are found (except for such ISO-2022 control
8461 characters as ESC), it returns a list of single element `undecided'
8462 or its subsidiary coding system according to a detected end-of-line
8465 If optional argument HIGHEST is non-nil, return the coding system of
8466 highest priority. */)
8467 (Lisp_Object start
, Lisp_Object end
, Lisp_Object highest
)
8470 int from_byte
, to_byte
;
8472 CHECK_NUMBER_COERCE_MARKER (start
);
8473 CHECK_NUMBER_COERCE_MARKER (end
);
8475 validate_region (&start
, &end
);
8476 from
= XINT (start
), to
= XINT (end
);
8477 from_byte
= CHAR_TO_BYTE (from
);
8478 to_byte
= CHAR_TO_BYTE (to
);
8480 if (from
< GPT
&& to
>= GPT
)
8481 move_gap_both (to
, to_byte
);
8483 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
8484 to
- from
, to_byte
- from_byte
,
8486 !NILP (BVAR (current_buffer
8487 , enable_multibyte_characters
)),
8491 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
8493 doc
: /* Detect coding system of the text in STRING.
8494 Return a list of possible coding systems ordered by priority.
8495 The coding systems to try and their priorities follows what
8496 the function `coding-system-priority-list' (which see) returns.
8498 If only ASCII characters are found (except for such ISO-2022 control
8499 characters as ESC), it returns a list of single element `undecided'
8500 or its subsidiary coding system according to a detected end-of-line
8503 If optional argument HIGHEST is non-nil, return the coding system of
8504 highest priority. */)
8505 (Lisp_Object string
, Lisp_Object highest
)
8507 CHECK_STRING (string
);
8509 return detect_coding_system (SDATA (string
),
8510 SCHARS (string
), SBYTES (string
),
8511 !NILP (highest
), STRING_MULTIBYTE (string
),
8517 char_encodable_p (int c
, Lisp_Object attrs
)
8520 struct charset
*charset
;
8521 Lisp_Object translation_table
;
8523 translation_table
= CODING_ATTR_TRANS_TBL (attrs
);
8524 if (! NILP (translation_table
))
8525 c
= translate_char (translation_table
, c
);
8526 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
8527 CONSP (tail
); tail
= XCDR (tail
))
8529 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
8530 if (CHAR_CHARSET_P (c
, charset
))
8533 return (! NILP (tail
));
8537 /* Return a list of coding systems that safely encode the text between
8538 START and END. If EXCLUDE is non-nil, it is a list of coding
8539 systems not to check. The returned list doesn't contain any such
8540 coding systems. In any case, if the text contains only ASCII or is
8541 unibyte, return t. */
8543 DEFUN ("find-coding-systems-region-internal",
8544 Ffind_coding_systems_region_internal
,
8545 Sfind_coding_systems_region_internal
, 2, 3, 0,
8546 doc
: /* Internal use only. */)
8547 (Lisp_Object start
, Lisp_Object end
, Lisp_Object exclude
)
8549 Lisp_Object coding_attrs_list
, safe_codings
;
8550 EMACS_INT start_byte
, end_byte
;
8551 const unsigned char *p
, *pbeg
, *pend
;
8553 Lisp_Object tail
, elt
, work_table
;
8555 if (STRINGP (start
))
8557 if (!STRING_MULTIBYTE (start
)
8558 || SCHARS (start
) == SBYTES (start
))
8561 end_byte
= SBYTES (start
);
8565 CHECK_NUMBER_COERCE_MARKER (start
);
8566 CHECK_NUMBER_COERCE_MARKER (end
);
8567 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
8568 args_out_of_range (start
, end
);
8569 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8571 start_byte
= CHAR_TO_BYTE (XINT (start
));
8572 end_byte
= CHAR_TO_BYTE (XINT (end
));
8573 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
8576 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
8578 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
8579 move_gap_both (XINT (start
), start_byte
);
8581 move_gap_both (XINT (end
), end_byte
);
8585 coding_attrs_list
= Qnil
;
8586 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
8588 || NILP (Fmemq (XCAR (tail
), exclude
)))
8592 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
8593 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
8594 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
8596 ASET (attrs
, coding_attr_trans_tbl
,
8597 get_translation_table (attrs
, 1, NULL
));
8598 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
8602 if (STRINGP (start
))
8603 p
= pbeg
= SDATA (start
);
8605 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
8606 pend
= p
+ (end_byte
- start_byte
);
8608 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
8609 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
8611 work_table
= Fmake_char_table (Qnil
, Qnil
);
8614 if (ASCII_BYTE_P (*p
))
8618 c
= STRING_CHAR_ADVANCE (p
);
8619 if (!NILP (char_table_ref (work_table
, c
)))
8620 /* This character was already checked. Ignore it. */
8623 charset_map_loaded
= 0;
8624 for (tail
= coding_attrs_list
; CONSP (tail
);)
8629 else if (char_encodable_p (c
, elt
))
8631 else if (CONSP (XCDR (tail
)))
8633 XSETCAR (tail
, XCAR (XCDR (tail
)));
8634 XSETCDR (tail
, XCDR (XCDR (tail
)));
8638 XSETCAR (tail
, Qnil
);
8642 if (charset_map_loaded
)
8644 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
8646 if (STRINGP (start
))
8647 pbeg
= SDATA (start
);
8649 pbeg
= BYTE_POS_ADDR (start_byte
);
8650 p
= pbeg
+ p_offset
;
8651 pend
= pbeg
+ pend_offset
;
8653 char_table_set (work_table
, c
, Qt
);
8657 safe_codings
= list2 (Qraw_text
, Qno_conversion
);
8658 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
8659 if (! NILP (XCAR (tail
)))
8660 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
8662 return safe_codings
;
8666 DEFUN ("unencodable-char-position", Funencodable_char_position
,
8667 Sunencodable_char_position
, 3, 5, 0,
8669 Return position of first un-encodable character in a region.
8670 START and END specify the region and CODING-SYSTEM specifies the
8671 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8673 If optional 4th argument COUNT is non-nil, it specifies at most how
8674 many un-encodable characters to search. In this case, the value is a
8677 If optional 5th argument STRING is non-nil, it is a string to search
8678 for un-encodable characters. In that case, START and END are indexes
8680 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object count
, Lisp_Object string
)
8683 struct coding_system coding
;
8684 Lisp_Object attrs
, charset_list
, translation_table
;
8685 Lisp_Object positions
;
8687 const unsigned char *p
, *stop
, *pend
;
8688 int ascii_compatible
;
8690 setup_coding_system (Fcheck_coding_system (coding_system
), &coding
);
8691 attrs
= CODING_ID_ATTRS (coding
.id
);
8692 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
8694 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
8695 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
8696 translation_table
= get_translation_table (attrs
, 1, NULL
);
8700 validate_region (&start
, &end
);
8701 from
= XINT (start
);
8703 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
))
8704 || (ascii_compatible
8705 && (to
- from
) == (CHAR_TO_BYTE (to
) - (CHAR_TO_BYTE (from
)))))
8707 p
= CHAR_POS_ADDR (from
);
8708 pend
= CHAR_POS_ADDR (to
);
8709 if (from
< GPT
&& to
>= GPT
)
8716 CHECK_STRING (string
);
8717 CHECK_NATNUM (start
);
8719 from
= XINT (start
);
8722 || to
> SCHARS (string
))
8723 args_out_of_range_3 (string
, start
, end
);
8724 if (! STRING_MULTIBYTE (string
))
8726 p
= SDATA (string
) + string_char_to_byte (string
, from
);
8727 stop
= pend
= SDATA (string
) + string_char_to_byte (string
, to
);
8728 if (ascii_compatible
&& (to
- from
) == (pend
- p
))
8736 CHECK_NATNUM (count
);
8745 if (ascii_compatible
)
8746 while (p
< stop
&& ASCII_BYTE_P (*p
))
8756 c
= STRING_CHAR_ADVANCE (p
);
8757 if (! (ASCII_CHAR_P (c
) && ascii_compatible
)
8758 && ! char_charset (translate_char (translation_table
, c
),
8759 charset_list
, NULL
))
8761 positions
= Fcons (make_number (from
), positions
);
8770 return (NILP (count
) ? Fcar (positions
) : Fnreverse (positions
));
8774 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
8775 Scheck_coding_systems_region
, 3, 3, 0,
8776 doc
: /* Check if the region is encodable by coding systems.
8778 START and END are buffer positions specifying the region.
8779 CODING-SYSTEM-LIST is a list of coding systems to check.
8781 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8782 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8783 whole region, POS0, POS1, ... are buffer positions where non-encodable
8784 characters are found.
8786 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8789 START may be a string. In that case, check if the string is
8790 encodable, and the value contains indices to the string instead of
8791 buffer positions. END is ignored.
8793 If the current buffer (or START if it is a string) is unibyte, the value
8795 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system_list
)
8798 EMACS_INT start_byte
, end_byte
;
8800 const unsigned char *p
, *pbeg
, *pend
;
8802 Lisp_Object tail
, elt
, attrs
;
8804 if (STRINGP (start
))
8806 if (!STRING_MULTIBYTE (start
)
8807 || SCHARS (start
) == SBYTES (start
))
8810 end_byte
= SBYTES (start
);
8815 CHECK_NUMBER_COERCE_MARKER (start
);
8816 CHECK_NUMBER_COERCE_MARKER (end
);
8817 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
8818 args_out_of_range (start
, end
);
8819 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
8821 start_byte
= CHAR_TO_BYTE (XINT (start
));
8822 end_byte
= CHAR_TO_BYTE (XINT (end
));
8823 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
8826 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
8828 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
8829 move_gap_both (XINT (start
), start_byte
);
8831 move_gap_both (XINT (end
), end_byte
);
8837 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
8840 attrs
= AREF (CODING_SYSTEM_SPEC (elt
), 0);
8841 ASET (attrs
, coding_attr_trans_tbl
,
8842 get_translation_table (attrs
, 1, NULL
));
8843 list
= Fcons (Fcons (elt
, Fcons (attrs
, Qnil
)), list
);
8846 if (STRINGP (start
))
8847 p
= pbeg
= SDATA (start
);
8849 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
8850 pend
= p
+ (end_byte
- start_byte
);
8852 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
8853 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
8857 if (ASCII_BYTE_P (*p
))
8861 c
= STRING_CHAR_ADVANCE (p
);
8863 charset_map_loaded
= 0;
8864 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
8866 elt
= XCDR (XCAR (tail
));
8867 if (! char_encodable_p (c
, XCAR (elt
)))
8868 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
8870 if (charset_map_loaded
)
8872 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
8874 if (STRINGP (start
))
8875 pbeg
= SDATA (start
);
8877 pbeg
= BYTE_POS_ADDR (start_byte
);
8878 p
= pbeg
+ p_offset
;
8879 pend
= pbeg
+ pend_offset
;
8887 for (; CONSP (tail
); tail
= XCDR (tail
))
8890 if (CONSP (XCDR (XCDR (elt
))))
8891 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
8900 code_convert_region (Lisp_Object start
, Lisp_Object end
,
8901 Lisp_Object coding_system
, Lisp_Object dst_object
,
8902 int encodep
, int norecord
)
8904 struct coding_system coding
;
8905 EMACS_INT from
, from_byte
, to
, to_byte
;
8906 Lisp_Object src_object
;
8908 CHECK_NUMBER_COERCE_MARKER (start
);
8909 CHECK_NUMBER_COERCE_MARKER (end
);
8910 if (NILP (coding_system
))
8911 coding_system
= Qno_conversion
;
8913 CHECK_CODING_SYSTEM (coding_system
);
8914 src_object
= Fcurrent_buffer ();
8915 if (NILP (dst_object
))
8916 dst_object
= src_object
;
8917 else if (! EQ (dst_object
, Qt
))
8918 CHECK_BUFFER (dst_object
);
8920 validate_region (&start
, &end
);
8921 from
= XFASTINT (start
);
8922 from_byte
= CHAR_TO_BYTE (from
);
8923 to
= XFASTINT (end
);
8924 to_byte
= CHAR_TO_BYTE (to
);
8926 setup_coding_system (coding_system
, &coding
);
8927 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
8930 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
8933 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
8936 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
8938 return (BUFFERP (dst_object
)
8939 ? make_number (coding
.produced_char
)
8940 : coding
.dst_object
);
8944 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
8945 3, 4, "r\nzCoding system: ",
8946 doc
: /* Decode the current region from the specified coding system.
8947 When called from a program, takes four arguments:
8948 START, END, CODING-SYSTEM, and DESTINATION.
8949 START and END are buffer positions.
8951 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8952 If nil, the region between START and END is replaced by the decoded text.
8953 If buffer, the decoded text is inserted in that buffer after point (point
8955 In those cases, the length of the decoded text is returned.
8956 If DESTINATION is t, the decoded text is returned.
8958 This function sets `last-coding-system-used' to the precise coding system
8959 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8960 not fully specified.) */)
8961 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
8963 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
8966 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
8967 3, 4, "r\nzCoding system: ",
8968 doc
: /* Encode the current region by specified coding system.
8969 When called from a program, takes four arguments:
8970 START, END, CODING-SYSTEM and DESTINATION.
8971 START and END are buffer positions.
8973 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8974 If nil, the region between START and END is replace by the encoded text.
8975 If buffer, the encoded text is inserted in that buffer after point (point
8977 In those cases, the length of the encoded text is returned.
8978 If DESTINATION is t, the encoded text is returned.
8980 This function sets `last-coding-system-used' to the precise coding system
8981 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8982 not fully specified.) */)
8983 (Lisp_Object start
, Lisp_Object end
, Lisp_Object coding_system
, Lisp_Object destination
)
8985 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
8989 code_convert_string (Lisp_Object string
, Lisp_Object coding_system
,
8990 Lisp_Object dst_object
, int encodep
, int nocopy
, int norecord
)
8992 struct coding_system coding
;
8993 EMACS_INT chars
, bytes
;
8995 CHECK_STRING (string
);
8996 if (NILP (coding_system
))
8999 Vlast_coding_system_used
= Qno_conversion
;
9000 if (NILP (dst_object
))
9001 return (nocopy
? Fcopy_sequence (string
) : string
);
9004 if (NILP (coding_system
))
9005 coding_system
= Qno_conversion
;
9007 CHECK_CODING_SYSTEM (coding_system
);
9008 if (NILP (dst_object
))
9010 else if (! EQ (dst_object
, Qt
))
9011 CHECK_BUFFER (dst_object
);
9013 setup_coding_system (coding_system
, &coding
);
9014 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
9015 chars
= SCHARS (string
);
9016 bytes
= SBYTES (string
);
9018 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
9020 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
9022 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
9024 return (BUFFERP (dst_object
)
9025 ? make_number (coding
.produced_char
)
9026 : coding
.dst_object
);
9030 /* Encode or decode STRING according to CODING_SYSTEM.
9031 Do not set Vlast_coding_system_used.
9033 This function is called only from macros DECODE_FILE and
9034 ENCODE_FILE, thus we ignore character composition. */
9037 code_convert_string_norecord (Lisp_Object string
, Lisp_Object coding_system
,
9040 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
9044 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
9046 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9048 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9049 if the decoding operation is trivial.
9051 Optional fourth arg BUFFER non-nil means that the decoded text is
9052 inserted in that buffer after point (point does not move). In this
9053 case, the return value is the length of the decoded text.
9055 This function sets `last-coding-system-used' to the precise coding system
9056 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9057 not fully specified.) */)
9058 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9060 return code_convert_string (string
, coding_system
, buffer
,
9061 0, ! NILP (nocopy
), 0);
9064 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
9066 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
9068 Optional third arg NOCOPY non-nil means it is OK to return STRING
9069 itself if the encoding operation is trivial.
9071 Optional fourth arg BUFFER non-nil means that the encoded text is
9072 inserted in that buffer after point (point does not move). In this
9073 case, the return value is the length of the encoded text.
9075 This function sets `last-coding-system-used' to the precise coding system
9076 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9077 not fully specified.) */)
9078 (Lisp_Object string
, Lisp_Object coding_system
, Lisp_Object nocopy
, Lisp_Object buffer
)
9080 return code_convert_string (string
, coding_system
, buffer
,
9081 1, ! NILP (nocopy
), 1);
9085 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
9086 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
9087 Return the corresponding character. */)
9090 Lisp_Object spec
, attrs
, val
;
9091 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
9094 CHECK_NATNUM (code
);
9095 c
= XFASTINT (code
);
9096 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9097 attrs
= AREF (spec
, 0);
9099 if (ASCII_BYTE_P (c
)
9100 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9103 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9104 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9105 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9106 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9109 charset
= charset_roman
;
9110 else if (c
>= 0xA0 && c
< 0xDF)
9112 charset
= charset_kana
;
9117 int s1
= c
>> 8, s2
= c
& 0xFF;
9119 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
9120 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
9121 error ("Invalid code: %d", code
);
9123 charset
= charset_kanji
;
9125 c
= DECODE_CHAR (charset
, c
);
9127 error ("Invalid code: %d", code
);
9128 return make_number (c
);
9132 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
9133 doc
: /* Encode a Japanese character CH to shift_jis encoding.
9134 Return the corresponding code in SJIS. */)
9137 Lisp_Object spec
, attrs
, charset_list
;
9139 struct charset
*charset
;
9142 CHECK_CHARACTER (ch
);
9144 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
9145 attrs
= AREF (spec
, 0);
9147 if (ASCII_CHAR_P (c
)
9148 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9151 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9152 charset
= char_charset (c
, charset_list
, &code
);
9153 if (code
== CHARSET_INVALID_CODE (charset
))
9154 error ("Can't encode by shift_jis encoding: %d", c
);
9157 return make_number (code
);
9160 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
9161 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
9162 Return the corresponding character. */)
9165 Lisp_Object spec
, attrs
, val
;
9166 struct charset
*charset_roman
, *charset_big5
, *charset
;
9169 CHECK_NATNUM (code
);
9170 c
= XFASTINT (code
);
9171 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9172 attrs
= AREF (spec
, 0);
9174 if (ASCII_BYTE_P (c
)
9175 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9178 val
= CODING_ATTR_CHARSET_LIST (attrs
);
9179 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
9180 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
9183 charset
= charset_roman
;
9186 int b1
= c
>> 8, b2
= c
& 0x7F;
9187 if (b1
< 0xA1 || b1
> 0xFE
9188 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
9189 error ("Invalid code: %d", code
);
9190 charset
= charset_big5
;
9192 c
= DECODE_CHAR (charset
, (unsigned )c
);
9194 error ("Invalid code: %d", code
);
9195 return make_number (c
);
9198 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
9199 doc
: /* Encode the Big5 character CH to BIG5 coding system.
9200 Return the corresponding character code in Big5. */)
9203 Lisp_Object spec
, attrs
, charset_list
;
9204 struct charset
*charset
;
9208 CHECK_CHARACTER (ch
);
9210 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
9211 attrs
= AREF (spec
, 0);
9212 if (ASCII_CHAR_P (c
)
9213 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
9216 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
9217 charset
= char_charset (c
, charset_list
, &code
);
9218 if (code
== CHARSET_INVALID_CODE (charset
))
9219 error ("Can't encode by Big5 encoding: %d", c
);
9221 return make_number (code
);
9225 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal
,
9226 Sset_terminal_coding_system_internal
, 1, 2, 0,
9227 doc
: /* Internal use only. */)
9228 (Lisp_Object coding_system
, Lisp_Object terminal
)
9230 struct terminal
*term
= get_terminal (terminal
, 1);
9231 struct coding_system
*terminal_coding
= TERMINAL_TERMINAL_CODING (term
);
9232 CHECK_SYMBOL (coding_system
);
9233 setup_coding_system (Fcheck_coding_system (coding_system
), terminal_coding
);
9234 /* We had better not send unsafe characters to terminal. */
9235 terminal_coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
9236 /* Character composition should be disabled. */
9237 terminal_coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9238 terminal_coding
->src_multibyte
= 1;
9239 terminal_coding
->dst_multibyte
= 0;
9240 if (terminal_coding
->common_flags
& CODING_REQUIRE_ENCODING_MASK
)
9241 term
->charset_list
= coding_charset_list (terminal_coding
);
9243 term
->charset_list
= Fcons (make_number (charset_ascii
), Qnil
);
9247 DEFUN ("set-safe-terminal-coding-system-internal",
9248 Fset_safe_terminal_coding_system_internal
,
9249 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
9250 doc
: /* Internal use only. */)
9251 (Lisp_Object coding_system
)
9253 CHECK_SYMBOL (coding_system
);
9254 setup_coding_system (Fcheck_coding_system (coding_system
),
9255 &safe_terminal_coding
);
9256 /* Character composition should be disabled. */
9257 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9258 safe_terminal_coding
.src_multibyte
= 1;
9259 safe_terminal_coding
.dst_multibyte
= 0;
9263 DEFUN ("terminal-coding-system", Fterminal_coding_system
,
9264 Sterminal_coding_system
, 0, 1, 0,
9265 doc
: /* Return coding system specified for terminal output on the given terminal.
9266 TERMINAL may be a terminal object, a frame, or nil for the selected
9267 frame's terminal device. */)
9268 (Lisp_Object terminal
)
9270 struct coding_system
*terminal_coding
9271 = TERMINAL_TERMINAL_CODING (get_terminal (terminal
, 1));
9272 Lisp_Object coding_system
= CODING_ID_NAME (terminal_coding
->id
);
9274 /* For backward compatibility, return nil if it is `undecided'. */
9275 return (! EQ (coding_system
, Qundecided
) ? coding_system
: Qnil
);
9278 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal
,
9279 Sset_keyboard_coding_system_internal
, 1, 2, 0,
9280 doc
: /* Internal use only. */)
9281 (Lisp_Object coding_system
, Lisp_Object terminal
)
9283 struct terminal
*t
= get_terminal (terminal
, 1);
9284 CHECK_SYMBOL (coding_system
);
9285 if (NILP (coding_system
))
9286 coding_system
= Qno_conversion
;
9288 Fcheck_coding_system (coding_system
);
9289 setup_coding_system (coding_system
, TERMINAL_KEYBOARD_CODING (t
));
9290 /* Character composition should be disabled. */
9291 TERMINAL_KEYBOARD_CODING (t
)->common_flags
9292 &= ~CODING_ANNOTATE_COMPOSITION_MASK
;
9296 DEFUN ("keyboard-coding-system",
9297 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 1, 0,
9298 doc
: /* Return coding system specified for decoding keyboard input. */)
9299 (Lisp_Object terminal
)
9301 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9302 (get_terminal (terminal
, 1))->id
);
9306 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
9307 Sfind_operation_coding_system
, 1, MANY
, 0,
9308 doc
: /* Choose a coding system for an operation based on the target name.
9309 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9310 DECODING-SYSTEM is the coding system to use for decoding
9311 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9312 for encoding (in case OPERATION does encoding).
9314 The first argument OPERATION specifies an I/O primitive:
9315 For file I/O, `insert-file-contents' or `write-region'.
9316 For process I/O, `call-process', `call-process-region', or `start-process'.
9317 For network I/O, `open-network-stream'.
9319 The remaining arguments should be the same arguments that were passed
9320 to the primitive. Depending on which primitive, one of those arguments
9321 is selected as the TARGET. For example, if OPERATION does file I/O,
9322 whichever argument specifies the file name is TARGET.
9324 TARGET has a meaning which depends on OPERATION:
9325 For file I/O, TARGET is a file name (except for the special case below).
9326 For process I/O, TARGET is a process name.
9327 For network I/O, TARGET is a service name or a port number.
9329 This function looks up what is specified for TARGET in
9330 `file-coding-system-alist', `process-coding-system-alist',
9331 or `network-coding-system-alist' depending on OPERATION.
9332 They may specify a coding system, a cons of coding systems,
9333 or a function symbol to call.
9334 In the last case, we call the function with one argument,
9335 which is a list of all the arguments given to this function.
9336 If the function can't decide a coding system, it can return
9337 `undecided' so that the normal code-detection is performed.
9339 If OPERATION is `insert-file-contents', the argument corresponding to
9340 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9341 file name to look up, and BUFFER is a buffer that contains the file's
9342 contents (not yet decoded). If `file-coding-system-alist' specifies a
9343 function to call for FILENAME, that function should examine the
9344 contents of BUFFER instead of reading the file.
9346 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9347 (int nargs
, Lisp_Object
*args
)
9349 Lisp_Object operation
, target_idx
, target
, val
;
9350 register Lisp_Object chain
;
9353 error ("Too few arguments");
9354 operation
= args
[0];
9355 if (!SYMBOLP (operation
)
9356 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
9357 error ("Invalid first argument");
9358 if (nargs
< 1 + XINT (target_idx
))
9359 error ("Too few arguments for operation: %s",
9360 SDATA (SYMBOL_NAME (operation
)));
9361 target
= args
[XINT (target_idx
) + 1];
9362 if (!(STRINGP (target
)
9363 || (EQ (operation
, Qinsert_file_contents
) && CONSP (target
)
9364 && STRINGP (XCAR (target
)) && BUFFERP (XCDR (target
)))
9365 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
9366 error ("Invalid %dth argument", XINT (target_idx
) + 1);
9368 target
= XCAR (target
);
9370 chain
= ((EQ (operation
, Qinsert_file_contents
)
9371 || EQ (operation
, Qwrite_region
))
9372 ? Vfile_coding_system_alist
9373 : (EQ (operation
, Qopen_network_stream
)
9374 ? Vnetwork_coding_system_alist
9375 : Vprocess_coding_system_alist
));
9379 for (; CONSP (chain
); chain
= XCDR (chain
))
9385 && ((STRINGP (target
)
9386 && STRINGP (XCAR (elt
))
9387 && fast_string_match (XCAR (elt
), target
) >= 0)
9388 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
9391 /* Here, if VAL is both a valid coding system and a valid
9392 function symbol, we return VAL as a coding system. */
9395 if (! SYMBOLP (val
))
9397 if (! NILP (Fcoding_system_p (val
)))
9398 return Fcons (val
, val
);
9399 if (! NILP (Ffboundp (val
)))
9401 /* We use call1 rather than safe_call1
9402 so as to get bug reports about functions called here
9403 which don't handle the current interface. */
9404 val
= call1 (val
, Flist (nargs
, args
));
9407 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
9408 return Fcons (val
, val
);
9416 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
9417 Sset_coding_system_priority
, 0, MANY
, 0,
9418 doc
: /* Assign higher priority to the coding systems given as arguments.
9419 If multiple coding systems belong to the same category,
9420 all but the first one are ignored.
9422 usage: (set-coding-system-priority &rest coding-systems) */)
9423 (int nargs
, Lisp_Object
*args
)
9426 int changed
[coding_category_max
];
9427 enum coding_category priorities
[coding_category_max
];
9429 memset (changed
, 0, sizeof changed
);
9431 for (i
= j
= 0; i
< nargs
; i
++)
9433 enum coding_category category
;
9434 Lisp_Object spec
, attrs
;
9436 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
9437 attrs
= AREF (spec
, 0);
9438 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
9439 if (changed
[category
])
9440 /* Ignore this coding system because a coding system of the
9441 same category already had a higher priority. */
9443 changed
[category
] = 1;
9444 priorities
[j
++] = category
;
9445 if (coding_categories
[category
].id
>= 0
9446 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
9447 setup_coding_system (args
[i
], &coding_categories
[category
]);
9448 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
9451 /* Now we have decided top J priorities. Reflect the order of the
9452 original priorities to the remaining priorities. */
9454 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
9456 while (j
< coding_category_max
9457 && changed
[coding_priorities
[j
]])
9459 if (j
== coding_category_max
)
9461 priorities
[i
] = coding_priorities
[j
];
9464 memcpy (coding_priorities
, priorities
, sizeof priorities
);
9466 /* Update `coding-category-list'. */
9467 Vcoding_category_list
= Qnil
;
9468 for (i
= coding_category_max
- 1; i
>= 0; i
--)
9469 Vcoding_category_list
9470 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
9471 Vcoding_category_list
);
9476 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
9477 Scoding_system_priority_list
, 0, 1, 0,
9478 doc
: /* Return a list of coding systems ordered by their priorities.
9479 The list contains a subset of coding systems; i.e. coding systems
9480 assigned to each coding category (see `coding-category-list').
9482 HIGHESTP non-nil means just return the highest priority one. */)
9483 (Lisp_Object highestp
)
9488 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
9490 enum coding_category category
= coding_priorities
[i
];
9491 int id
= coding_categories
[category
].id
;
9496 attrs
= CODING_ID_ATTRS (id
);
9497 if (! NILP (highestp
))
9498 return CODING_ATTR_BASE_NAME (attrs
);
9499 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
9501 return Fnreverse (val
);
9504 static const char *const suffixes
[] = { "-unix", "-dos", "-mac" };
9507 make_subsidiaries (Lisp_Object base
)
9509 Lisp_Object subsidiaries
;
9510 int base_name_len
= SBYTES (SYMBOL_NAME (base
));
9511 char *buf
= (char *) alloca (base_name_len
+ 6);
9514 memcpy (buf
, SDATA (SYMBOL_NAME (base
)), base_name_len
);
9515 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
9516 for (i
= 0; i
< 3; i
++)
9518 memcpy (buf
+ base_name_len
, suffixes
[i
], strlen (suffixes
[i
]) + 1);
9519 ASET (subsidiaries
, i
, intern (buf
));
9521 return subsidiaries
;
9525 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
9526 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
9527 doc
: /* For internal use only.
9528 usage: (define-coding-system-internal ...) */)
9529 (int nargs
, Lisp_Object
*args
)
9532 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
9533 Lisp_Object attrs
; /* Vector of attributes. */
9534 Lisp_Object eol_type
;
9535 Lisp_Object aliases
;
9536 Lisp_Object coding_type
, charset_list
, safe_charsets
;
9537 enum coding_category category
;
9538 Lisp_Object tail
, val
;
9539 int max_charset_id
= 0;
9542 if (nargs
< coding_arg_max
)
9545 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
9547 name
= args
[coding_arg_name
];
9548 CHECK_SYMBOL (name
);
9549 CODING_ATTR_BASE_NAME (attrs
) = name
;
9551 val
= args
[coding_arg_mnemonic
];
9552 if (! STRINGP (val
))
9553 CHECK_CHARACTER (val
);
9554 CODING_ATTR_MNEMONIC (attrs
) = val
;
9556 coding_type
= args
[coding_arg_coding_type
];
9557 CHECK_SYMBOL (coding_type
);
9558 CODING_ATTR_TYPE (attrs
) = coding_type
;
9560 charset_list
= args
[coding_arg_charset_list
];
9561 if (SYMBOLP (charset_list
))
9563 if (EQ (charset_list
, Qiso_2022
))
9565 if (! EQ (coding_type
, Qiso_2022
))
9566 error ("Invalid charset-list");
9567 charset_list
= Viso_2022_charset_list
;
9569 else if (EQ (charset_list
, Qemacs_mule
))
9571 if (! EQ (coding_type
, Qemacs_mule
))
9572 error ("Invalid charset-list");
9573 charset_list
= Vemacs_mule_charset_list
;
9575 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9576 if (max_charset_id
< XFASTINT (XCAR (tail
)))
9577 max_charset_id
= XFASTINT (XCAR (tail
));
9581 charset_list
= Fcopy_sequence (charset_list
);
9582 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9584 struct charset
*charset
;
9587 CHECK_CHARSET_GET_CHARSET (val
, charset
);
9588 if (EQ (coding_type
, Qiso_2022
)
9589 ? CHARSET_ISO_FINAL (charset
) < 0
9590 : EQ (coding_type
, Qemacs_mule
)
9591 ? CHARSET_EMACS_MULE_ID (charset
) < 0
9593 error ("Can't handle charset `%s'",
9594 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9596 XSETCAR (tail
, make_number (charset
->id
));
9597 if (max_charset_id
< charset
->id
)
9598 max_charset_id
= charset
->id
;
9601 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
9603 safe_charsets
= make_uninit_string (max_charset_id
+ 1);
9604 memset (SDATA (safe_charsets
), 255, max_charset_id
+ 1);
9605 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9606 SSET (safe_charsets
, XFASTINT (XCAR (tail
)), 0);
9607 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
9609 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
9611 val
= args
[coding_arg_decode_translation_table
];
9612 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9614 CODING_ATTR_DECODE_TBL (attrs
) = val
;
9616 val
= args
[coding_arg_encode_translation_table
];
9617 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
9619 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
9621 val
= args
[coding_arg_post_read_conversion
];
9623 CODING_ATTR_POST_READ (attrs
) = val
;
9625 val
= args
[coding_arg_pre_write_conversion
];
9627 CODING_ATTR_PRE_WRITE (attrs
) = val
;
9629 val
= args
[coding_arg_default_char
];
9631 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
9634 CHECK_CHARACTER (val
);
9635 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
9638 val
= args
[coding_arg_for_unibyte
];
9639 CODING_ATTR_FOR_UNIBYTE (attrs
) = NILP (val
) ? Qnil
: Qt
;
9641 val
= args
[coding_arg_plist
];
9643 CODING_ATTR_PLIST (attrs
) = val
;
9645 if (EQ (coding_type
, Qcharset
))
9647 /* Generate a lisp vector of 256 elements. Each element is nil,
9648 integer, or a list of charset IDs.
9650 If Nth element is nil, the byte code N is invalid in this
9653 If Nth element is a number NUM, N is the first byte of a
9654 charset whose ID is NUM.
9656 If Nth element is a list of charset IDs, N is the first byte
9657 of one of them. The list is sorted by dimensions of the
9658 charsets. A charset of smaller dimension comes first. */
9659 val
= Fmake_vector (make_number (256), Qnil
);
9661 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
9663 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
9664 int dim
= CHARSET_DIMENSION (charset
);
9665 int idx
= (dim
- 1) * 4;
9667 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9668 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9670 for (i
= charset
->code_space
[idx
];
9671 i
<= charset
->code_space
[idx
+ 1]; i
++)
9673 Lisp_Object tmp
, tmp2
;
9676 tmp
= AREF (val
, i
);
9679 else if (NUMBERP (tmp
))
9681 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
9683 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
9685 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
9689 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
9691 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
9696 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
9699 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
9700 XSETCAR (tmp2
, XCAR (tail
));
9706 ASET (attrs
, coding_attr_charset_valids
, val
);
9707 category
= coding_category_charset
;
9709 else if (EQ (coding_type
, Qccl
))
9713 if (nargs
< coding_arg_ccl_max
)
9716 val
= args
[coding_arg_ccl_decoder
];
9717 CHECK_CCL_PROGRAM (val
);
9719 val
= Fcopy_sequence (val
);
9720 ASET (attrs
, coding_attr_ccl_decoder
, val
);
9722 val
= args
[coding_arg_ccl_encoder
];
9723 CHECK_CCL_PROGRAM (val
);
9725 val
= Fcopy_sequence (val
);
9726 ASET (attrs
, coding_attr_ccl_encoder
, val
);
9728 val
= args
[coding_arg_ccl_valids
];
9729 valids
= Fmake_string (make_number (256), make_number (0));
9730 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
9737 from
= to
= XINT (val
);
9738 if (from
< 0 || from
> 255)
9739 args_out_of_range_3 (val
, make_number (0), make_number (255));
9744 CHECK_NATNUM_CAR (val
);
9745 CHECK_NATNUM_CDR (val
);
9746 from
= XINT (XCAR (val
));
9748 args_out_of_range_3 (XCAR (val
),
9749 make_number (0), make_number (255));
9750 to
= XINT (XCDR (val
));
9751 if (to
< from
|| to
> 255)
9752 args_out_of_range_3 (XCDR (val
),
9753 XCAR (val
), make_number (255));
9755 for (i
= from
; i
<= to
; i
++)
9756 SSET (valids
, i
, 1);
9758 ASET (attrs
, coding_attr_ccl_valids
, valids
);
9760 category
= coding_category_ccl
;
9762 else if (EQ (coding_type
, Qutf_16
))
9764 Lisp_Object bom
, endian
;
9766 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
9768 if (nargs
< coding_arg_utf16_max
)
9771 bom
= args
[coding_arg_utf16_bom
];
9772 if (! NILP (bom
) && ! EQ (bom
, Qt
))
9776 CHECK_CODING_SYSTEM (val
);
9778 CHECK_CODING_SYSTEM (val
);
9780 ASET (attrs
, coding_attr_utf_bom
, bom
);
9782 endian
= args
[coding_arg_utf16_endian
];
9783 CHECK_SYMBOL (endian
);
9786 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
9787 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian
)));
9788 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
9790 category
= (CONSP (bom
)
9791 ? coding_category_utf_16_auto
9793 ? (EQ (endian
, Qbig
)
9794 ? coding_category_utf_16_be_nosig
9795 : coding_category_utf_16_le_nosig
)
9796 : (EQ (endian
, Qbig
)
9797 ? coding_category_utf_16_be
9798 : coding_category_utf_16_le
));
9800 else if (EQ (coding_type
, Qiso_2022
))
9802 Lisp_Object initial
, reg_usage
, request
, flags
;
9805 if (nargs
< coding_arg_iso2022_max
)
9808 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
9809 CHECK_VECTOR (initial
);
9810 for (i
= 0; i
< 4; i
++)
9812 val
= Faref (initial
, make_number (i
));
9815 struct charset
*charset
;
9817 CHECK_CHARSET_GET_CHARSET (val
, charset
);
9818 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
9819 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
9820 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9823 ASET (initial
, i
, make_number (-1));
9826 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
9827 CHECK_CONS (reg_usage
);
9828 CHECK_NUMBER_CAR (reg_usage
);
9829 CHECK_NUMBER_CDR (reg_usage
);
9831 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
9832 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
9840 CHECK_CHARSET_GET_ID (tmp
, id
);
9841 CHECK_NATNUM_CDR (val
);
9842 if (XINT (XCDR (val
)) >= 4)
9843 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
9844 XSETCAR (val
, make_number (id
));
9847 flags
= args
[coding_arg_iso2022_flags
];
9848 CHECK_NATNUM (flags
);
9850 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
9851 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
9853 ASET (attrs
, coding_attr_iso_initial
, initial
);
9854 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
9855 ASET (attrs
, coding_attr_iso_request
, request
);
9856 ASET (attrs
, coding_attr_iso_flags
, flags
);
9857 setup_iso_safe_charsets (attrs
);
9859 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
9860 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
9861 | CODING_ISO_FLAG_SINGLE_SHIFT
))
9862 ? coding_category_iso_7_else
9863 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
9864 ? coding_category_iso_7
9865 : coding_category_iso_7_tight
);
9868 int id
= XINT (AREF (initial
, 1));
9870 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
9871 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
9873 ? coding_category_iso_8_else
9874 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
9875 ? coding_category_iso_8_1
9876 : coding_category_iso_8_2
);
9878 if (category
!= coding_category_iso_8_1
9879 && category
!= coding_category_iso_8_2
)
9880 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
9882 else if (EQ (coding_type
, Qemacs_mule
))
9884 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
9885 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
9886 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9887 category
= coding_category_emacs_mule
;
9889 else if (EQ (coding_type
, Qshift_jis
))
9892 struct charset
*charset
;
9894 if (XINT (Flength (charset_list
)) != 3
9895 && XINT (Flength (charset_list
)) != 4)
9896 error ("There should be three or four charsets");
9898 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9899 if (CHARSET_DIMENSION (charset
) != 1)
9900 error ("Dimension of charset %s is not one",
9901 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9902 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9903 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9905 charset_list
= XCDR (charset_list
);
9906 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9907 if (CHARSET_DIMENSION (charset
) != 1)
9908 error ("Dimension of charset %s is not one",
9909 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9911 charset_list
= XCDR (charset_list
);
9912 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9913 if (CHARSET_DIMENSION (charset
) != 2)
9914 error ("Dimension of charset %s is not two",
9915 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9917 charset_list
= XCDR (charset_list
);
9918 if (! NILP (charset_list
))
9920 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9921 if (CHARSET_DIMENSION (charset
) != 2)
9922 error ("Dimension of charset %s is not two",
9923 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9926 category
= coding_category_sjis
;
9927 Vsjis_coding_system
= name
;
9929 else if (EQ (coding_type
, Qbig5
))
9931 struct charset
*charset
;
9933 if (XINT (Flength (charset_list
)) != 2)
9934 error ("There should be just two charsets");
9936 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9937 if (CHARSET_DIMENSION (charset
) != 1)
9938 error ("Dimension of charset %s is not one",
9939 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9940 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
9941 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9943 charset_list
= XCDR (charset_list
);
9944 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
9945 if (CHARSET_DIMENSION (charset
) != 2)
9946 error ("Dimension of charset %s is not two",
9947 SDATA (SYMBOL_NAME (CHARSET_NAME (charset
))));
9949 category
= coding_category_big5
;
9950 Vbig5_coding_system
= name
;
9952 else if (EQ (coding_type
, Qraw_text
))
9954 category
= coding_category_raw_text
;
9955 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9957 else if (EQ (coding_type
, Qutf_8
))
9961 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
9963 if (nargs
< coding_arg_utf8_max
)
9966 bom
= args
[coding_arg_utf8_bom
];
9967 if (! NILP (bom
) && ! EQ (bom
, Qt
))
9971 CHECK_CODING_SYSTEM (val
);
9973 CHECK_CODING_SYSTEM (val
);
9975 ASET (attrs
, coding_attr_utf_bom
, bom
);
9977 category
= (CONSP (bom
) ? coding_category_utf_8_auto
9978 : NILP (bom
) ? coding_category_utf_8_nosig
9979 : coding_category_utf_8_sig
);
9981 else if (EQ (coding_type
, Qundecided
))
9982 category
= coding_category_undecided
;
9984 error ("Invalid coding system type: %s",
9985 SDATA (SYMBOL_NAME (coding_type
)));
9987 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
9988 CODING_ATTR_PLIST (attrs
)
9989 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
9990 CODING_ATTR_PLIST (attrs
)));
9991 CODING_ATTR_PLIST (attrs
)
9992 = Fcons (QCascii_compatible_p
,
9993 Fcons (CODING_ATTR_ASCII_COMPAT (attrs
),
9994 CODING_ATTR_PLIST (attrs
)));
9996 eol_type
= args
[coding_arg_eol_type
];
9997 if (! NILP (eol_type
)
9998 && ! EQ (eol_type
, Qunix
)
9999 && ! EQ (eol_type
, Qdos
)
10000 && ! EQ (eol_type
, Qmac
))
10001 error ("Invalid eol-type");
10003 aliases
= Fcons (name
, Qnil
);
10005 if (NILP (eol_type
))
10007 eol_type
= make_subsidiaries (name
);
10008 for (i
= 0; i
< 3; i
++)
10010 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
10012 this_name
= AREF (eol_type
, i
);
10013 this_aliases
= Fcons (this_name
, Qnil
);
10014 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
10015 this_spec
= Fmake_vector (make_number (3), attrs
);
10016 ASET (this_spec
, 1, this_aliases
);
10017 ASET (this_spec
, 2, this_eol_type
);
10018 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
10019 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
10020 val
= Fassoc (Fsymbol_name (this_name
), Vcoding_system_alist
);
10022 Vcoding_system_alist
10023 = Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
10024 Vcoding_system_alist
);
10028 spec_vec
= Fmake_vector (make_number (3), attrs
);
10029 ASET (spec_vec
, 1, aliases
);
10030 ASET (spec_vec
, 2, eol_type
);
10032 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
10033 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
10034 val
= Fassoc (Fsymbol_name (name
), Vcoding_system_alist
);
10036 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
10037 Vcoding_system_alist
);
10040 int id
= coding_categories
[category
].id
;
10042 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
10043 setup_coding_system (name
, &coding_categories
[category
]);
10049 return Fsignal (Qwrong_number_of_arguments
,
10050 Fcons (intern ("define-coding-system-internal"),
10051 make_number (nargs
)));
10055 DEFUN ("coding-system-put", Fcoding_system_put
, Scoding_system_put
,
10057 doc
: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10058 (Lisp_Object coding_system
, Lisp_Object prop
, Lisp_Object val
)
10060 Lisp_Object spec
, attrs
;
10062 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10063 attrs
= AREF (spec
, 0);
10064 if (EQ (prop
, QCmnemonic
))
10066 if (! STRINGP (val
))
10067 CHECK_CHARACTER (val
);
10068 CODING_ATTR_MNEMONIC (attrs
) = val
;
10070 else if (EQ (prop
, QCdefault_char
))
10073 val
= make_number (' ');
10075 CHECK_CHARACTER (val
);
10076 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
10078 else if (EQ (prop
, QCdecode_translation_table
))
10080 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10081 CHECK_SYMBOL (val
);
10082 CODING_ATTR_DECODE_TBL (attrs
) = val
;
10084 else if (EQ (prop
, QCencode_translation_table
))
10086 if (! CHAR_TABLE_P (val
) && ! CONSP (val
))
10087 CHECK_SYMBOL (val
);
10088 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
10090 else if (EQ (prop
, QCpost_read_conversion
))
10092 CHECK_SYMBOL (val
);
10093 CODING_ATTR_POST_READ (attrs
) = val
;
10095 else if (EQ (prop
, QCpre_write_conversion
))
10097 CHECK_SYMBOL (val
);
10098 CODING_ATTR_PRE_WRITE (attrs
) = val
;
10100 else if (EQ (prop
, QCascii_compatible_p
))
10102 CODING_ATTR_ASCII_COMPAT (attrs
) = val
;
10105 CODING_ATTR_PLIST (attrs
)
10106 = Fplist_put (CODING_ATTR_PLIST (attrs
), prop
, val
);
10111 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
10112 Sdefine_coding_system_alias
, 2, 2, 0,
10113 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10114 (Lisp_Object alias
, Lisp_Object coding_system
)
10116 Lisp_Object spec
, aliases
, eol_type
, val
;
10118 CHECK_SYMBOL (alias
);
10119 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10120 aliases
= AREF (spec
, 1);
10121 /* ALIASES should be a list of length more than zero, and the first
10122 element is a base coding system. Append ALIAS at the tail of the
10124 while (!NILP (XCDR (aliases
)))
10125 aliases
= XCDR (aliases
);
10126 XSETCDR (aliases
, Fcons (alias
, Qnil
));
10128 eol_type
= AREF (spec
, 2);
10129 if (VECTORP (eol_type
))
10131 Lisp_Object subsidiaries
;
10134 subsidiaries
= make_subsidiaries (alias
);
10135 for (i
= 0; i
< 3; i
++)
10136 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
10137 AREF (eol_type
, i
));
10140 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
10141 Vcoding_system_list
= Fcons (alias
, Vcoding_system_list
);
10142 val
= Fassoc (Fsymbol_name (alias
), Vcoding_system_alist
);
10144 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
10145 Vcoding_system_alist
);
10150 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
10152 doc
: /* Return the base of CODING-SYSTEM.
10153 Any alias or subsidiary coding system is not a base coding system. */)
10154 (Lisp_Object coding_system
)
10156 Lisp_Object spec
, attrs
;
10158 if (NILP (coding_system
))
10159 return (Qno_conversion
);
10160 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10161 attrs
= AREF (spec
, 0);
10162 return CODING_ATTR_BASE_NAME (attrs
);
10165 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
10167 doc
: "Return the property list of CODING-SYSTEM.")
10168 (Lisp_Object coding_system
)
10170 Lisp_Object spec
, attrs
;
10172 if (NILP (coding_system
))
10173 coding_system
= Qno_conversion
;
10174 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10175 attrs
= AREF (spec
, 0);
10176 return CODING_ATTR_PLIST (attrs
);
10180 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
10182 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
10183 (Lisp_Object coding_system
)
10187 if (NILP (coding_system
))
10188 coding_system
= Qno_conversion
;
10189 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
10190 return AREF (spec
, 1);
10193 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
10194 Scoding_system_eol_type
, 1, 1, 0,
10195 doc
: /* Return eol-type of CODING-SYSTEM.
10196 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10198 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10199 and CR respectively.
10201 A vector value indicates that a format of end-of-line should be
10202 detected automatically. Nth element of the vector is the subsidiary
10203 coding system whose eol-type is N. */)
10204 (Lisp_Object coding_system
)
10206 Lisp_Object spec
, eol_type
;
10209 if (NILP (coding_system
))
10210 coding_system
= Qno_conversion
;
10211 if (! CODING_SYSTEM_P (coding_system
))
10213 spec
= CODING_SYSTEM_SPEC (coding_system
);
10214 eol_type
= AREF (spec
, 2);
10215 if (VECTORP (eol_type
))
10216 return Fcopy_sequence (eol_type
);
10217 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
10218 return make_number (n
);
10224 /*** 9. Post-amble ***/
10227 init_coding_once (void)
10231 for (i
= 0; i
< coding_category_max
; i
++)
10233 coding_categories
[i
].id
= -1;
10234 coding_priorities
[i
] = i
;
10237 /* ISO2022 specific initialize routine. */
10238 for (i
= 0; i
< 0x20; i
++)
10239 iso_code_class
[i
] = ISO_control_0
;
10240 for (i
= 0x21; i
< 0x7F; i
++)
10241 iso_code_class
[i
] = ISO_graphic_plane_0
;
10242 for (i
= 0x80; i
< 0xA0; i
++)
10243 iso_code_class
[i
] = ISO_control_1
;
10244 for (i
= 0xA1; i
< 0xFF; i
++)
10245 iso_code_class
[i
] = ISO_graphic_plane_1
;
10246 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
10247 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
10248 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
10249 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
10250 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
10251 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
10252 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
10253 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
10254 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
10256 for (i
= 0; i
< 256; i
++)
10258 emacs_mule_bytes
[i
] = 1;
10260 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
10261 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
10262 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
10263 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
10269 syms_of_coding (void)
10271 staticpro (&Vcoding_system_hash_table
);
10273 Lisp_Object args
[2];
10276 Vcoding_system_hash_table
= Fmake_hash_table (2, args
);
10279 staticpro (&Vsjis_coding_system
);
10280 Vsjis_coding_system
= Qnil
;
10282 staticpro (&Vbig5_coding_system
);
10283 Vbig5_coding_system
= Qnil
;
10285 staticpro (&Vcode_conversion_reused_workbuf
);
10286 Vcode_conversion_reused_workbuf
= Qnil
;
10288 staticpro (&Vcode_conversion_workbuf_name
);
10289 Vcode_conversion_workbuf_name
= make_pure_c_string (" *code-conversion-work*");
10291 reused_workbuf_in_use
= 0;
10293 DEFSYM (Qcharset
, "charset");
10294 DEFSYM (Qtarget_idx
, "target-idx");
10295 DEFSYM (Qcoding_system_history
, "coding-system-history");
10296 Fset (Qcoding_system_history
, Qnil
);
10298 /* Target FILENAME is the first argument. */
10299 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
10300 /* Target FILENAME is the third argument. */
10301 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
10303 DEFSYM (Qcall_process
, "call-process");
10304 /* Target PROGRAM is the first argument. */
10305 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
10307 DEFSYM (Qcall_process_region
, "call-process-region");
10308 /* Target PROGRAM is the third argument. */
10309 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
10311 DEFSYM (Qstart_process
, "start-process");
10312 /* Target PROGRAM is the third argument. */
10313 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
10315 DEFSYM (Qopen_network_stream
, "open-network-stream");
10316 /* Target SERVICE is the fourth argument. */
10317 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
10319 DEFSYM (Qcoding_system
, "coding-system");
10320 DEFSYM (Qcoding_aliases
, "coding-aliases");
10322 DEFSYM (Qeol_type
, "eol-type");
10323 DEFSYM (Qunix
, "unix");
10324 DEFSYM (Qdos
, "dos");
10326 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
10327 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
10328 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
10329 DEFSYM (Qdefault_char
, "default-char");
10330 DEFSYM (Qundecided
, "undecided");
10331 DEFSYM (Qno_conversion
, "no-conversion");
10332 DEFSYM (Qraw_text
, "raw-text");
10334 DEFSYM (Qiso_2022
, "iso-2022");
10336 DEFSYM (Qutf_8
, "utf-8");
10337 DEFSYM (Qutf_8_emacs
, "utf-8-emacs");
10339 DEFSYM (Qutf_16
, "utf-16");
10340 DEFSYM (Qbig
, "big");
10341 DEFSYM (Qlittle
, "little");
10343 DEFSYM (Qshift_jis
, "shift-jis");
10344 DEFSYM (Qbig5
, "big5");
10346 DEFSYM (Qcoding_system_p
, "coding-system-p");
10348 DEFSYM (Qcoding_system_error
, "coding-system-error");
10349 Fput (Qcoding_system_error
, Qerror_conditions
,
10350 pure_cons (Qcoding_system_error
, pure_cons (Qerror
, Qnil
)));
10351 Fput (Qcoding_system_error
, Qerror_message
,
10352 make_pure_c_string ("Invalid coding system"));
10354 /* Intern this now in case it isn't already done.
10355 Setting this variable twice is harmless.
10356 But don't staticpro it here--that is done in alloc.c. */
10357 Qchar_table_extra_slots
= intern_c_string ("char-table-extra-slots");
10359 DEFSYM (Qtranslation_table
, "translation-table");
10360 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (2));
10361 DEFSYM (Qtranslation_table_id
, "translation-table-id");
10362 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
10363 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
10365 DEFSYM (Qvalid_codes
, "valid-codes");
10367 DEFSYM (Qemacs_mule
, "emacs-mule");
10369 DEFSYM (QCcategory
, ":category");
10370 DEFSYM (QCmnemonic
, ":mnemonic");
10371 DEFSYM (QCdefault_char
, ":default-char");
10372 DEFSYM (QCdecode_translation_table
, ":decode-translation-table");
10373 DEFSYM (QCencode_translation_table
, ":encode-translation-table");
10374 DEFSYM (QCpost_read_conversion
, ":post-read-conversion");
10375 DEFSYM (QCpre_write_conversion
, ":pre-write-conversion");
10376 DEFSYM (QCascii_compatible_p
, ":ascii-compatible-p");
10378 Vcoding_category_table
10379 = Fmake_vector (make_number (coding_category_max
), Qnil
);
10380 staticpro (&Vcoding_category_table
);
10381 /* Followings are target of code detection. */
10382 ASET (Vcoding_category_table
, coding_category_iso_7
,
10383 intern_c_string ("coding-category-iso-7"));
10384 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
10385 intern_c_string ("coding-category-iso-7-tight"));
10386 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
10387 intern_c_string ("coding-category-iso-8-1"));
10388 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
10389 intern_c_string ("coding-category-iso-8-2"));
10390 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
10391 intern_c_string ("coding-category-iso-7-else"));
10392 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
10393 intern_c_string ("coding-category-iso-8-else"));
10394 ASET (Vcoding_category_table
, coding_category_utf_8_auto
,
10395 intern_c_string ("coding-category-utf-8-auto"));
10396 ASET (Vcoding_category_table
, coding_category_utf_8_nosig
,
10397 intern_c_string ("coding-category-utf-8"));
10398 ASET (Vcoding_category_table
, coding_category_utf_8_sig
,
10399 intern_c_string ("coding-category-utf-8-sig"));
10400 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
10401 intern_c_string ("coding-category-utf-16-be"));
10402 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
10403 intern_c_string ("coding-category-utf-16-auto"));
10404 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
10405 intern_c_string ("coding-category-utf-16-le"));
10406 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
10407 intern_c_string ("coding-category-utf-16-be-nosig"));
10408 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
10409 intern_c_string ("coding-category-utf-16-le-nosig"));
10410 ASET (Vcoding_category_table
, coding_category_charset
,
10411 intern_c_string ("coding-category-charset"));
10412 ASET (Vcoding_category_table
, coding_category_sjis
,
10413 intern_c_string ("coding-category-sjis"));
10414 ASET (Vcoding_category_table
, coding_category_big5
,
10415 intern_c_string ("coding-category-big5"));
10416 ASET (Vcoding_category_table
, coding_category_ccl
,
10417 intern_c_string ("coding-category-ccl"));
10418 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
10419 intern_c_string ("coding-category-emacs-mule"));
10420 /* Followings are NOT target of code detection. */
10421 ASET (Vcoding_category_table
, coding_category_raw_text
,
10422 intern_c_string ("coding-category-raw-text"));
10423 ASET (Vcoding_category_table
, coding_category_undecided
,
10424 intern_c_string ("coding-category-undecided"));
10426 DEFSYM (Qinsufficient_source
, "insufficient-source");
10427 DEFSYM (Qinconsistent_eol
, "inconsistent-eol");
10428 DEFSYM (Qinvalid_source
, "invalid-source");
10429 DEFSYM (Qinterrupted
, "interrupted");
10430 DEFSYM (Qinsufficient_memory
, "insufficient-memory");
10431 DEFSYM (Qcoding_system_define_form
, "coding-system-define-form");
10433 defsubr (&Scoding_system_p
);
10434 defsubr (&Sread_coding_system
);
10435 defsubr (&Sread_non_nil_coding_system
);
10436 defsubr (&Scheck_coding_system
);
10437 defsubr (&Sdetect_coding_region
);
10438 defsubr (&Sdetect_coding_string
);
10439 defsubr (&Sfind_coding_systems_region_internal
);
10440 defsubr (&Sunencodable_char_position
);
10441 defsubr (&Scheck_coding_systems_region
);
10442 defsubr (&Sdecode_coding_region
);
10443 defsubr (&Sencode_coding_region
);
10444 defsubr (&Sdecode_coding_string
);
10445 defsubr (&Sencode_coding_string
);
10446 defsubr (&Sdecode_sjis_char
);
10447 defsubr (&Sencode_sjis_char
);
10448 defsubr (&Sdecode_big5_char
);
10449 defsubr (&Sencode_big5_char
);
10450 defsubr (&Sset_terminal_coding_system_internal
);
10451 defsubr (&Sset_safe_terminal_coding_system_internal
);
10452 defsubr (&Sterminal_coding_system
);
10453 defsubr (&Sset_keyboard_coding_system_internal
);
10454 defsubr (&Skeyboard_coding_system
);
10455 defsubr (&Sfind_operation_coding_system
);
10456 defsubr (&Sset_coding_system_priority
);
10457 defsubr (&Sdefine_coding_system_internal
);
10458 defsubr (&Sdefine_coding_system_alias
);
10459 defsubr (&Scoding_system_put
);
10460 defsubr (&Scoding_system_base
);
10461 defsubr (&Scoding_system_plist
);
10462 defsubr (&Scoding_system_aliases
);
10463 defsubr (&Scoding_system_eol_type
);
10464 defsubr (&Scoding_system_priority_list
);
10466 DEFVAR_LISP ("coding-system-list", Vcoding_system_list
,
10467 doc
: /* List of coding systems.
10469 Do not alter the value of this variable manually. This variable should be
10470 updated by the functions `define-coding-system' and
10471 `define-coding-system-alias'. */);
10472 Vcoding_system_list
= Qnil
;
10474 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist
,
10475 doc
: /* Alist of coding system names.
10476 Each element is one element list of coding system name.
10477 This variable is given to `completing-read' as COLLECTION argument.
10479 Do not alter the value of this variable manually. This variable should be
10480 updated by the functions `make-coding-system' and
10481 `define-coding-system-alias'. */);
10482 Vcoding_system_alist
= Qnil
;
10484 DEFVAR_LISP ("coding-category-list", Vcoding_category_list
,
10485 doc
: /* List of coding-categories (symbols) ordered by priority.
10487 On detecting a coding system, Emacs tries code detection algorithms
10488 associated with each coding-category one by one in this order. When
10489 one algorithm agrees with a byte sequence of source text, the coding
10490 system bound to the corresponding coding-category is selected.
10492 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10496 Vcoding_category_list
= Qnil
;
10497 for (i
= coding_category_max
- 1; i
>= 0; i
--)
10498 Vcoding_category_list
10499 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
10500 Vcoding_category_list
);
10503 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read
,
10504 doc
: /* Specify the coding system for read operations.
10505 It is useful to bind this variable with `let', but do not set it globally.
10506 If the value is a coding system, it is used for decoding on read operation.
10507 If not, an appropriate element is used from one of the coding system alists.
10508 There are three such tables: `file-coding-system-alist',
10509 `process-coding-system-alist', and `network-coding-system-alist'. */);
10510 Vcoding_system_for_read
= Qnil
;
10512 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write
,
10513 doc
: /* Specify the coding system for write operations.
10514 Programs bind this variable with `let', but you should not set it globally.
10515 If the value is a coding system, it is used for encoding of output,
10516 when writing it to a file and when sending it to a file or subprocess.
10518 If this does not specify a coding system, an appropriate element
10519 is used from one of the coding system alists.
10520 There are three such tables: `file-coding-system-alist',
10521 `process-coding-system-alist', and `network-coding-system-alist'.
10522 For output to files, if the above procedure does not specify a coding system,
10523 the value of `buffer-file-coding-system' is used. */);
10524 Vcoding_system_for_write
= Qnil
;
10526 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used
,
10528 Coding system used in the latest file or process I/O. */);
10529 Vlast_coding_system_used
= Qnil
;
10531 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error
,
10533 Error status of the last code conversion.
10535 When an error was detected in the last code conversion, this variable
10536 is set to one of the following symbols.
10537 `insufficient-source'
10541 `insufficient-memory'
10542 When no error was detected, the value doesn't change. So, to check
10543 the error status of a code conversion by this variable, you must
10544 explicitly set this variable to nil before performing code
10546 Vlast_code_conversion_error
= Qnil
;
10548 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion
,
10550 *Non-nil means always inhibit code conversion of end-of-line format.
10551 See info node `Coding Systems' and info node `Text and Binary' concerning
10552 such conversion. */);
10553 inhibit_eol_conversion
= 0;
10555 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system
,
10557 Non-nil means process buffer inherits coding system of process output.
10558 Bind it to t if the process output is to be treated as if it were a file
10559 read from some filesystem. */);
10560 inherit_process_coding_system
= 0;
10562 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist
,
10564 Alist to decide a coding system to use for a file I/O operation.
10565 The format is ((PATTERN . VAL) ...),
10566 where PATTERN is a regular expression matching a file name,
10567 VAL is a coding system, a cons of coding systems, or a function symbol.
10568 If VAL is a coding system, it is used for both decoding and encoding
10570 If VAL is a cons of coding systems, the car part is used for decoding,
10571 and the cdr part is used for encoding.
10572 If VAL is a function symbol, the function must return a coding system
10573 or a cons of coding systems which are used as above. The function is
10574 called with an argument that is a list of the arguments with which
10575 `find-operation-coding-system' was called. If the function can't decide
10576 a coding system, it can return `undecided' so that the normal
10577 code-detection is performed.
10579 See also the function `find-operation-coding-system'
10580 and the variable `auto-coding-alist'. */);
10581 Vfile_coding_system_alist
= Qnil
;
10583 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist
,
10585 Alist to decide a coding system to use for a process I/O operation.
10586 The format is ((PATTERN . VAL) ...),
10587 where PATTERN is a regular expression matching a program name,
10588 VAL is a coding system, a cons of coding systems, or a function symbol.
10589 If VAL is a coding system, it is used for both decoding what received
10590 from the program and encoding what sent to the program.
10591 If VAL is a cons of coding systems, the car part is used for decoding,
10592 and the cdr part is used for encoding.
10593 If VAL is a function symbol, the function must return a coding system
10594 or a cons of coding systems which are used as above.
10596 See also the function `find-operation-coding-system'. */);
10597 Vprocess_coding_system_alist
= Qnil
;
10599 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist
,
10601 Alist to decide a coding system to use for a network I/O operation.
10602 The format is ((PATTERN . VAL) ...),
10603 where PATTERN is a regular expression matching a network service name
10604 or is a port number to connect to,
10605 VAL is a coding system, a cons of coding systems, or a function symbol.
10606 If VAL is a coding system, it is used for both decoding what received
10607 from the network stream and encoding what sent to the network stream.
10608 If VAL is a cons of coding systems, the car part is used for decoding,
10609 and the cdr part is used for encoding.
10610 If VAL is a function symbol, the function must return a coding system
10611 or a cons of coding systems which are used as above.
10613 See also the function `find-operation-coding-system'. */);
10614 Vnetwork_coding_system_alist
= Qnil
;
10616 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system
,
10617 doc
: /* Coding system to use with system messages.
10618 Also used for decoding keyboard input on X Window system. */);
10619 Vlocale_coding_system
= Qnil
;
10621 /* The eol mnemonics are reset in startup.el system-dependently. */
10622 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix
,
10624 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10625 eol_mnemonic_unix
= make_pure_c_string (":");
10627 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos
,
10629 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10630 eol_mnemonic_dos
= make_pure_c_string ("\\");
10632 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac
,
10634 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10635 eol_mnemonic_mac
= make_pure_c_string ("/");
10637 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided
,
10639 *String displayed in mode line when end-of-line format is not yet determined. */);
10640 eol_mnemonic_undecided
= make_pure_c_string (":");
10642 DEFVAR_LISP ("enable-character-translation", Venable_character_translation
,
10644 *Non-nil enables character translation while encoding and decoding. */);
10645 Venable_character_translation
= Qt
;
10647 DEFVAR_LISP ("standard-translation-table-for-decode",
10648 Vstandard_translation_table_for_decode
,
10649 doc
: /* Table for translating characters while decoding. */);
10650 Vstandard_translation_table_for_decode
= Qnil
;
10652 DEFVAR_LISP ("standard-translation-table-for-encode",
10653 Vstandard_translation_table_for_encode
,
10654 doc
: /* Table for translating characters while encoding. */);
10655 Vstandard_translation_table_for_encode
= Qnil
;
10657 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table
,
10658 doc
: /* Alist of charsets vs revision numbers.
10659 While encoding, if a charset (car part of an element) is found,
10660 designate it with the escape sequence identifying revision (cdr part
10661 of the element). */);
10662 Vcharset_revision_table
= Qnil
;
10664 DEFVAR_LISP ("default-process-coding-system",
10665 Vdefault_process_coding_system
,
10666 doc
: /* Cons of coding systems used for process I/O by default.
10667 The car part is used for decoding a process output,
10668 the cdr part is used for encoding a text to be sent to a process. */);
10669 Vdefault_process_coding_system
= Qnil
;
10671 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table
,
10673 Table of extra Latin codes in the range 128..159 (inclusive).
10674 This is a vector of length 256.
10675 If Nth element is non-nil, the existence of code N in a file
10676 \(or output of subprocess) doesn't prevent it to be detected as
10677 a coding system of ISO 2022 variant which has a flag
10678 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10679 or reading output of a subprocess.
10680 Only 128th through 159th elements have a meaning. */);
10681 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
10683 DEFVAR_LISP ("select-safe-coding-system-function",
10684 Vselect_safe_coding_system_function
,
10686 Function to call to select safe coding system for encoding a text.
10688 If set, this function is called to force a user to select a proper
10689 coding system which can encode the text in the case that a default
10690 coding system used in each operation can't encode the text. The
10691 function should take care that the buffer is not modified while
10692 the coding system is being selected.
10694 The default value is `select-safe-coding-system' (which see). */);
10695 Vselect_safe_coding_system_function
= Qnil
;
10697 DEFVAR_BOOL ("coding-system-require-warning",
10698 coding_system_require_warning
,
10699 doc
: /* Internal use only.
10700 If non-nil, on writing a file, `select-safe-coding-system-function' is
10701 called even if `coding-system-for-write' is non-nil. The command
10702 `universal-coding-system-argument' binds this variable to t temporarily. */);
10703 coding_system_require_warning
= 0;
10706 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10707 inhibit_iso_escape_detection
,
10709 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10711 When Emacs reads text, it tries to detect how the text is encoded.
10712 This code detection is sensitive to escape sequences. If Emacs sees
10713 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10714 of the ISO2022 encodings, and decodes text by the corresponding coding
10715 system (e.g. `iso-2022-7bit').
10717 However, there may be a case that you want to read escape sequences in
10718 a file as is. In such a case, you can set this variable to non-nil.
10719 Then the code detection will ignore any escape sequences, and no text is
10720 detected as encoded in some ISO-2022 encoding. The result is that all
10721 escape sequences become visible in a buffer.
10723 The default value is nil, and it is strongly recommended not to change
10724 it. That is because many Emacs Lisp source files that contain
10725 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10726 in Emacs's distribution, and they won't be decoded correctly on
10727 reading if you suppress escape sequence detection.
10729 The other way to read escape sequences in a file without decoding is
10730 to explicitly specify some coding system that doesn't use ISO-2022
10731 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10732 inhibit_iso_escape_detection
= 0;
10734 DEFVAR_BOOL ("inhibit-null-byte-detection",
10735 inhibit_null_byte_detection
,
10736 doc
: /* If non-nil, Emacs ignores null bytes on code detection.
10737 By default, Emacs treats it as binary data, and does not attempt to
10738 decode it. The effect is as if you specified `no-conversion' for
10741 Set this to non-nil when a regular text happens to include null bytes.
10742 Examples are Index nodes of Info files and null-byte delimited output
10743 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10744 decode text as usual. */);
10745 inhibit_null_byte_detection
= 0;
10747 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input
,
10748 doc
: /* Char table for translating self-inserting characters.
10749 This is applied to the result of input methods, not their input.
10750 See also `keyboard-translate-table'.
10752 Use of this variable for character code unification was rendered
10753 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10754 internal character representation. */);
10755 Vtranslation_table_for_input
= Qnil
;
10758 Lisp_Object args
[coding_arg_max
];
10759 Lisp_Object plist
[16];
10762 for (i
= 0; i
< coding_arg_max
; i
++)
10765 plist
[0] = intern_c_string (":name");
10766 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
10767 plist
[2] = intern_c_string (":mnemonic");
10768 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
10769 plist
[4] = intern_c_string (":coding-type");
10770 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
10771 plist
[6] = intern_c_string (":ascii-compatible-p");
10772 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
10773 plist
[8] = intern_c_string (":default-char");
10774 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
10775 plist
[10] = intern_c_string (":for-unibyte");
10776 plist
[11] = args
[coding_arg_for_unibyte
] = Qt
;
10777 plist
[12] = intern_c_string (":docstring");
10778 plist
[13] = make_pure_c_string ("Do no conversion.\n\
10780 When you visit a file with this coding, the file is read into a\n\
10781 unibyte buffer as is, thus each byte of a file is treated as a\n\
10783 plist
[14] = intern_c_string (":eol-type");
10784 plist
[15] = args
[coding_arg_eol_type
] = Qunix
;
10785 args
[coding_arg_plist
] = Flist (16, plist
);
10786 Fdefine_coding_system_internal (coding_arg_max
, args
);
10788 plist
[1] = args
[coding_arg_name
] = Qundecided
;
10789 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('-');
10790 plist
[5] = args
[coding_arg_coding_type
] = Qundecided
;
10791 /* This is already set.
10792 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10793 plist
[8] = intern_c_string (":charset-list");
10794 plist
[9] = args
[coding_arg_charset_list
] = Fcons (Qascii
, Qnil
);
10795 plist
[11] = args
[coding_arg_for_unibyte
] = Qnil
;
10796 plist
[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10797 plist
[15] = args
[coding_arg_eol_type
] = Qnil
;
10798 args
[coding_arg_plist
] = Flist (16, plist
);
10799 Fdefine_coding_system_internal (coding_arg_max
, args
);
10802 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
10807 for (i
= 0; i
< coding_category_max
; i
++)
10808 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
10810 #if defined (DOS_NT)
10811 system_eol_type
= Qdos
;
10813 system_eol_type
= Qunix
;
10815 staticpro (&system_eol_type
);
10819 emacs_strerror (int error_number
)
10823 synchronize_system_messages_locale ();
10824 str
= strerror (error_number
);
10826 if (! NILP (Vlocale_coding_system
))
10828 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
10829 Vlocale_coding_system
,
10831 str
= SSDATA (dec
);