1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. At
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwise (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
152 They also reset some bits of the integer pointed to by MASK. The macros
153 CATEGORY_MASK_XXX specify each bit of this integer.
155 Below is the template of these functions. */
/* Template for a detect_coding_XXX function (illustrative only; XXX
   stands for a coding category).  It reads the source bytes of
   CODING and updates the category bits in the integer pointed to by
   MASK.
   NOTE(review): the general notes call the bit macros
   CATEGORY_MASK_XXX, while this template writes CODING_CATEGORY_XXX;
   presumably the former is meant -- confirm.  */
159 detect_coding_XXX (coding
, mask
)
160 struct coding_system
*coding
;
/* SRC walks the source; SRC_END is one past its last byte.  */
163 unsigned char *src
= coding
->source
;
164 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
165 int multibytep
= coding
->src_multibyte
;
172 /* Get one byte from the source. If the source is exhausted, jump
173 to no_more_source:. */
175 /* Check if it conforms to XXX. If not, break the loop. */
177 /* As the data is invalid for XXX, reset the proper bits. */
/* Clear only the XXX bit; all other category bits are kept.  */
178 *mask
&= ~CODING_CATEGORY_XXX
;
181 /* The source is exhausted. */
183 /* ASCII characters only. */
185 /* Some data should be decoded into non-ASCII characters. */
/* Keep only the XXX bit; every other category bit is cleared.  */
186 *mask
&= CODING_CATEGORY_XXX
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
/* Template for a decode_coding_XXX function (illustrative only).
   It reads bytes from CODING->source starting at offset
   CODING->consumed, stores decoded characters into CODING->charbuf
   (at most CODING->charbuf_size of them), and records its progress in
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used.  */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 unsigned char *src
= coding
->source
+ coding
->consumed
;
211 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
;
218 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
/* The buffer is full once CHARBUF has reached CHARBUF_END; this is
   the same test that decode_coding_utf_8 uses.  */
224 if (charbuf
>= charbuf_end
)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce an encoded text until it
260 reaches the head of the not-yet-encoded source text.
262 Below is a template of these functions. */
/* Template for an encode_coding_XXX function (illustrative only).
   It encodes the characters in CODING->charbuf (CODING->charbuf_used
   of them) into CODING->destination, starting at offset
   CODING->produced, and records its progress in CODING->produced_char
   and CODING->produced.  */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
/* CHARBUF is a plain `int *', so the end pointer is computed from it
   directly, exactly as encode_coding_utf_8 does.  */
270 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
/* Stop early enough that one more loop iteration cannot run past
   DST_END.  */
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
;
307 extern Lisp_Object Qmac
; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
314 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
315 Lisp_Object Qcoding_system_history
;
316 Lisp_Object Qvalid_codes
;
318 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
319 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
320 Lisp_Object Qstart_process
, Qopen_network_stream
;
321 Lisp_Object Qtarget_idx
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read
;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write
;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used
;
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table
;
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion
;
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection
;
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system
;
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding
;
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding
;
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding
;
373 Lisp_Object Vfile_coding_system_alist
;
374 Lisp_Object Vprocess_coding_system_alist
;
375 Lisp_Object Vnetwork_coding_system_alist
;
377 Lisp_Object Vlocale_coding_system
;
381 /* Flag to tell if we look up translation table on character code conversion. */
383 Lisp_Object Venable_character_translation
;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode
;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode
;
389 Lisp_Object Qtranslation_table
;
390 Lisp_Object Qtranslation_table_id
;
391 Lisp_Object Qtranslation_table_for_decode
;
392 Lisp_Object Qtranslation_table_for_encode
;
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table
;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system
;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion
;
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system
;
408 Lisp_Object Vbig5_coding_system
;
/* Forward declarations of the per-coding-system workers defined
   later in this file.  Each detector takes a coding_system and an
   `int *'; each decoder and encoder takes only the coding_system.  */

/* UTF-8.  */
411 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
412 static void decode_coding_utf_8
P_ ((struct coding_system
*));
413 static int encode_coding_utf_8
P_ ((struct coding_system
*));
/* UTF-16.  */
415 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
/* ISO-2022.  */
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
420 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
421 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
/* emacs-mule.  */
423 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
424 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
425 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
/* Shift-JIS.  */
427 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
428 static void decode_coding_sjis
P_ ((struct coding_system
*));
429 static int encode_coding_sjis
P_ ((struct coding_system
*));
/* BIG5.  */
431 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
432 static void decode_coding_big5
P_ ((struct coding_system
*));
433 static int encode_coding_big5
P_ ((struct coding_system
*));
/* CCL.  */
435 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
436 static void decode_coding_ccl
P_ ((struct coding_system
*));
437 static int encode_coding_ccl
P_ ((struct coding_system
*));
/* Raw text: a decoder and an encoder only, no detector.  */
439 static void decode_coding_raw_text
P_ ((struct coding_system
*));
440 static int encode_coding_raw_text
P_ ((struct coding_system
*));
443 /* ISO2022 section */
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
/* Accessors for the ISO-2022 specific state kept in
   CODING->spec.iso_2022.  */

/* The `flags' member.  */
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
/* The element of `current_designation' for register REG.  */
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
/* The element of `current_invocation' for graphic plane PLANE.  */
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
/* The `single_shifting' member.  */
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
/* The `bol' member.  */
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
/* The designation of whichever register is recorded in
   `current_invocation' for PLANE.  */
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
482 /* All code (1-byte) of ISO2022 is classified into one of the following classes. */
484 enum iso_code_class_type
486 ISO_control_0
, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
494 ISO_control_1
, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
506 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
513 /* If set, reset graphic planes and registers at end-of-line to the initial state. */
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
540 /* If set, assume designation states are reset at beginning of line on
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
544 /* If set, designation sequence should be placed at beginning of line
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
548 /* If set, do not encode unsafe characters on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
551 /* If set, extra latin codes (128..159) are accepted as a valid code
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
/* Accessors for the UTF-16 specific state kept in
   CODING->spec.utf_16: the `bom', `endian' and `surrogate'
   members.  */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
/* Fetch the `coding_attr_ccl_decoder' and `coding_attr_ccl_encoder'
   slots of the attribute vector that CODING_ID_ATTRS yields for
   CODING->id.  */
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
590 /* Index for each coding category in `coding_categories' */
594 coding_category_iso_7
,
595 coding_category_iso_7_tight
,
596 coding_category_iso_8_1
,
597 coding_category_iso_8_2
,
598 coding_category_iso_7_else
,
599 coding_category_iso_8_else
,
600 coding_category_utf_8
,
601 coding_category_utf_16_auto
,
602 coding_category_utf_16_be
,
603 coding_category_utf_16_le
,
604 coding_category_utf_16_be_nosig
,
605 coding_category_utf_16_le_nosig
,
606 coding_category_charset
,
607 coding_category_sjis
,
608 coding_category_big5
,
610 coding_category_emacs_mule
,
611 /* All above are targets of code detection. */
612 coding_category_raw_text
,
613 coding_category_undecided
,
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
635 /* This value is returned if detect_coding_mask () finds nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
/* Unions of the single-category masks defined above.  */

/* Both ISO_7 categories (plain and tight).  */
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
/* Both ISO_8 categories (8_1 and 8_2).  */
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
/* Both "else" ISO categories (7_else and 8_else).  */
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
/* ISO_7, ISO_7_TIGHT and both "else" categories; presumably the ISO
   categories that depend on escape sequences -- TODO confirm.  */
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
/* Every ISO category: the 7BIT, 8BIT and ELSE groups together.  */
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
/* The four UTF-16 categories that have a mask: BE and LE, each with
   and without the NOSIG suffix.  */
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list
;
687 /* Table of coding categories (Lisp symbols). This variable is for
689 static Lisp_Object Vcoding_category_table
;
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities
[coding_category_max
];
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories
[coding_category_max
];
698 static int detected_mask
[coding_category_raw_text
] =
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_UTF_16
,
708 CATEGORY_MASK_UTF_16
,
709 CATEGORY_MASK_UTF_16
,
710 CATEGORY_MASK_UTF_16
,
711 CATEGORY_MASK_CHARSET
,
715 CATEGORY_MASK_EMACS_MULE
718 /*** Commonly used macros and functions ***/
/* Smaller and larger of A and B.  Each argument appears twice in the
   expansion, so do not pass expressions with side effects.  */
721 #define min(a, b) ((a) < (b) ? (a) : (b))
724 #define max(a, b) ((a) > (b) ? (a) : (b))
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
744 #define ONE_MORE_BYTE(c) \
746 if (src == src_end) \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
766 if (multibytep && (c & 0x80)) \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
783 #define EMIT_ONE_ASCII_BYTE(c) \
790 /* Like EMIT_ONE_ASCII_BYTE but store two bytes: C1 and C2. */
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte form. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
805 #define EMIT_ONE_BYTE(c) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822 #define EMIT_TWO_BYTES(c1, c2) \
824 produced_chars += 2; \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
866 unsigned char *orig = coding->source; \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
872 src_base += offset; \
878 #define ASSURE_DESTINATION(bytes) \
880 if (dst + (bytes) >= dst_end) \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
/* Recompute CODING->source from CODING->src_object and
   CODING->src_pos_byte.  For a buffer source the address is derived
   from the buffer text (or from GAP_END_ADDR when CODING->src_pos is
   negative); for a string source it points into the string data.
   Any other source is left untouched.  */
892 coding_set_source (coding
)
893 struct coding_system
*coding
;
895 if (BUFFERP (coding
->src_object
))
/* A negative SRC_POS selects addressing relative to GAP_END_ADDR.  */
897 if (coding
->src_pos
< 0)
898 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
901 struct buffer
*buf
= XBUFFER (coding
->src_object
);
902 EMACS_INT gpt_byte
= BUF_GPT_BYTE (buf
);
903 unsigned char *beg_addr
= BUF_BEG_ADDR (buf
);
/* The `- 1' maps the byte position onto an offset from BEG_ADDR
   (presumably because buffer byte positions start at 1 -- confirm).  */
905 coding
->source
= beg_addr
+ coding
->src_pos_byte
- 1;
/* At or beyond the gap position: skip over the gap.  */
906 if (coding
->src_pos_byte
>= gpt_byte
)
907 coding
->source
+= BUF_GAP_SIZE (buf
);
910 else if (STRINGP (coding
->src_object
))
912 coding
->source
= (XSTRING (coding
->src_object
)->data
913 + coding
->src_pos_byte
);
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
/* Recompute CODING->destination and CODING->dst_bytes when
   CODING->dst_object is a buffer.  The destination address is
   BUF_BEG_ADDR of that buffer plus CODING->dst_pos_byte - 1.  The
   room left is measured up to GAP_END_ADDR minus the
   not-yet-consumed source bytes (src_bytes - consumed) when
   CODING->src_pos is negative; the other visible assignment measures
   it up to BUF_GAP_END_ADDR of the destination buffer.  */
922 coding_set_destination (coding
)
923 struct coding_system
*coding
;
925 if (BUFFERP (coding
->dst_object
))
927 /* We are sure that coding->dst_pos_byte is before the gap of the
929 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
930 + coding
->dst_pos_byte
- 1);
931 if (coding
->src_pos
< 0)
932 coding
->dst_bytes
= (GAP_END_ADDR
933 - (coding
->src_bytes
- coding
->consumed
)
934 - coding
->destination
);
936 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
937 - coding
->destination
);
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
/* Grow the destination area of CODING by BYTES bytes using xrealloc.
   CODING->destination may move as a result; CODING->dst_bytes is
   increased by BYTES.  */
947 coding_alloc_by_realloc (coding
, bytes
)
948 struct coding_system
*coding
;
/* Request the current size plus BYTES.  */
951 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
952 coding
->dst_bytes
+ bytes
);
953 coding
->dst_bytes
+= bytes
;
/* Make room for BYTES more bytes when the destination of CODING is a
   buffer.  Two cases are visible: the source and destination are the
   same buffer, or the destination buffer is made current temporarily.
   NOTE(review): the statement that actually enlarges the gap is not
   visible between the paired adjustments below -- confirm against the
   full source.  */
957 coding_alloc_by_making_gap (coding
, bytes
)
958 struct coding_system
*coding
;
961 if (BUFFERP (coding
->dst_object
)
962 && EQ (coding
->src_object
, coding
->dst_object
))
/* ADD is the number of source bytes not yet consumed.  */
964 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
/* Temporarily count those ADD bytes as buffer text instead of gap.  */
966 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
/* Undo the temporary adjustment made above.  */
968 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
972 Lisp_Object this_buffer
;
/* Remember the current buffer, switch to the destination buffer, and
   switch back afterwards.  */
974 this_buffer
= Fcurrent_buffer ();
975 set_buffer_internal (XBUFFER (coding
->dst_object
));
977 set_buffer_internal (XBUFFER (this_buffer
));
/* Enlarge the destination area of CODING by NBYTES bytes and
   recompute DST, a pointer into the old destination, so that it
   points at the same offset in the new one.  A buffer destination is
   grown with coding_alloc_by_making_gap, any other destination with
   coding_alloc_by_realloc.  CODING->result is set to
   CODING_RESULT_SUCCESS.  */
982 static unsigned char *
983 alloc_destination (coding
, nbytes
, dst
)
984 struct coding_system
*coding
;
/* Save DST as an offset, because the destination may move.  */
988 EMACS_INT offset
= dst
- coding
->destination
;
990 if (BUFFERP (coding
->dst_object
))
991 coding_alloc_by_making_gap (coding
, nbytes
);
993 coding_alloc_by_realloc (coding
, nbytes
);
994 coding
->result
= CODING_RESULT_SUCCESS
;
/* Refresh CODING->destination, then rebuild DST from the offset.  */
995 coding_set_destination (coding
);
996 dst
= coding
->destination
+ offset
;
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009 Check if a text is encoded in UTF-8. If it is, return
1010 CATEGORY_MASK_UTF_8, else return 0. */
/* Classify one octet C by its high bits:
   1_OCTET          0xxxxxxx  (below 0x80)
   EXTRA_OCTET      10xxxxxx
   2_OCTET_LEADING  110xxxxx
   3_OCTET_LEADING  1110xxxx
   4_OCTET_LEADING  11110xxx
   5_OCTET_LEADING  111110xx  */
1012 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* Detector for the UTF-8 category.  It scans the source of CODING
   after the first CODING->head_ascii bytes.  For each leading octet
   C it checks the following octets C1..C4 with UTF_8_EXTRA_OCTET_P,
   accepting leading octets for sequences of 2 to 5 octets.  The
   UTF-8 bit in *MASK is cleared in one place and is the only bit
   kept in another.
   NOTE(review): the statements that read the octets and the return
   statements are not visible here.  */
1020 detect_coding_utf_8 (coding
, mask
)
1021 struct coding_system
*coding
;
1024 unsigned char *src
= coding
->source
, *src_base
= src
;
1025 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1026 int multibytep
= coding
->src_multibyte
;
1027 int consumed_chars
= 0;
1030 /* A coding system of this category is always ASCII compatible. */
1031 src
+= coding
->head_ascii
;
/* C is the leading octet; C1..C4 are the octets that follow it.  */
1035 int c
, c1
, c2
, c3
, c4
;
1038 if (UTF_8_1_OCTET_P (c
))
/* Each following octet must have the form 10xxxxxx.  */
1041 if (! UTF_8_EXTRA_OCTET_P (c1
))
1043 if (UTF_8_2_OCTET_LEADING_P (c
))
1049 if (! UTF_8_EXTRA_OCTET_P (c2
))
1051 if (UTF_8_3_OCTET_LEADING_P (c
))
1057 if (! UTF_8_EXTRA_OCTET_P (c3
))
1059 if (UTF_8_4_OCTET_LEADING_P (c
))
1065 if (! UTF_8_EXTRA_OCTET_P (c4
))
1067 if (UTF_8_5_OCTET_LEADING_P (c
))
/* Clear only the UTF-8 bit.  */
1074 *mask
&= ~CATEGORY_MASK_UTF_8
;
/* Keep only the UTF-8 bit.  */
1080 *mask
&= CATEGORY_MASK_UTF_8
;
1085 /* Fixme: deal with surrogates? */
/* Decoder for UTF-8.  It reads octets from CODING->source starting at
   offset CODING->consumed and stores one character code per element
   into CODING->charbuf, stopping when that buffer is full.  Sequences
   of up to five octets are assembled from the payload bits of their
   octets.  On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.
   NOTE(review): the loop header, the statements that read the octets
   and several branch bodies are not visible here.  */
1087 decode_coding_utf_8 (coding
)
1088 struct coding_system
*coding
;
1090 unsigned char *src
= coding
->source
+ coding
->consumed
;
1091 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1092 unsigned char *src_base
;
1093 int *charbuf
= coding
->charbuf
;
1094 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1095 int consumed_chars
= 0, consumed_chars_base
;
1096 int multibytep
= coding
->src_multibyte
;
1097 Lisp_Object attr
, eol_type
, charset_list
;
1099 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
/* C is the assembled character; C1 is the leading octet and C2..C5
   are the octets that follow it.  */
1103 int c
, c1
, c2
, c3
, c4
, c5
;
/* Remember the count at the start of this character.  */
1106 consumed_chars_base
= consumed_chars
;
/* No room left in CHARBUF.  */
1108 if (charbuf
>= charbuf_end
)
1112 if (UTF_8_1_OCTET_P(c1
))
/* End-of-line handling depends on EOL_TYPE (Qdos or Qmac).  */
1117 if (EQ (eol_type
, Qdos
))
1120 goto no_more_source
;
1124 else if (EQ (eol_type
, Qmac
))
1131 if (! UTF_8_EXTRA_OCTET_P (c2
))
1133 if (UTF_8_2_OCTET_LEADING_P (c1
))
/* 2 octets: 5 payload bits from C1, 6 from C2.  */
1135 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1136 /* Reject overlong sequences here and below. Encoders
1137 producing them are incorrect, they can be misleading,
1138 and they mess up read/write invariance. */
1145 if (! UTF_8_EXTRA_OCTET_P (c3
))
1147 if (UTF_8_3_OCTET_LEADING_P (c1
))
/* 3 octets: 4 payload bits from C1, 6 from each following octet.  */
1149 c
= (((c1
& 0xF) << 12)
1150 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1157 if (! UTF_8_EXTRA_OCTET_P (c4
))
1159 if (UTF_8_4_OCTET_LEADING_P (c1
))
/* 4 octets: 3 payload bits from C1, 6 from each following octet.  */
1161 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1162 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1169 if (! UTF_8_EXTRA_OCTET_P (c5
))
1171 if (UTF_8_5_OCTET_LEADING_P (c1
))
/* 5 octets: 2 payload bits from C1, 6 from each following octet.  */
1173 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1174 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
/* A 5-octet result must be at least 0x200000 (smaller values would be
   overlong) and at most MAX_CHAR.  */
1176 if ((c
> MAX_CHAR
) || (c
< 0x200000))
/* Restore the count saved at the start of this character.  */
1191 consumed_chars
= consumed_chars_base
;
/* Store C unchanged if it is an ASCII byte, otherwise the value
   BYTE8_TO_CHAR yields for it (presumably the invalid-sequence
   path -- confirm).  */
1193 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
/* Record the progress made.  */
1198 coding
->consumed_char
+= consumed_chars_base
;
1199 coding
->consumed
= src_base
- coding
->source
;
1200 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1205 encode_coding_utf_8 (coding
)
1206 struct coding_system
*coding
;
1208 int multibytep
= coding
->dst_multibyte
;
1209 int *charbuf
= coding
->charbuf
;
1210 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1211 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1212 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1213 int produced_chars
= 0;
1218 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1220 while (charbuf
< charbuf_end
)
1222 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1224 ASSURE_DESTINATION (safe_room
);
1226 CHAR_STRING_ADVANCE (c
, pend
);
1227 for (p
= str
; p
< pend
; p
++)
1233 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1235 while (charbuf
< charbuf_end
)
1237 ASSURE_DESTINATION (safe_room
);
1239 dst
+= CHAR_STRING (c
, dst
);
1243 coding
->result
= CODING_RESULT_SUCCESS
;
1244 coding
->produced_char
+= produced_chars
;
1245 coding
->produced
= dst
- coding
->destination
;
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252 Little Endian (otherwise). If it is, return
1253 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
/* Nonzero if the 16-bit code unit VAL is in 0xD800..0xDBFF (its top
   six bits are 110110).  */
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257 (((val) & 0xFC00) == 0xD800)
/* Nonzero if the 16-bit code unit VAL is in 0xDC00..0xDFFF (its top
   six bits are 110111).  */
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260 (((val) & 0xFC00) == 0xDC00)
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is
   evaluated up to three times, so pass an expression without side
   effects.  */
1262 #define UTF_16_INVALID_P(val) \
1263 (((val) == 0xFFFE) \
1264 || ((val) == 0xFFFF) \
1265 || UTF_16_LOW_SURROGATE_P (val))
1269 detect_coding_utf_16 (coding
, mask
)
1270 struct coding_system
*coding
;
1273 unsigned char *src
= coding
->source
, *src_base
= src
;
1274 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1275 int multibytep
= coding
->src_multibyte
;
1276 int consumed_chars
= 0;
1282 if ((c1
== 0xFF) && (c2
== 0xFE))
1284 *mask
&= CATEGORY_MASK_UTF_16_LE
;
1287 else if ((c1
== 0xFE) && (c2
== 0xFF))
1289 *mask
&= CATEGORY_MASK_UTF_16_BE
;
1297 decode_coding_utf_16 (coding
)
1298 struct coding_system
*coding
;
1300 unsigned char *src
= coding
->source
+ coding
->consumed
;
1301 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1302 unsigned char *src_base
;
1303 int *charbuf
= coding
->charbuf
;
1304 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1305 int consumed_chars
= 0, consumed_chars_base
;
1306 int multibytep
= coding
->src_multibyte
;
1307 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1308 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1309 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1310 Lisp_Object attr
, eol_type
, charset_list
;
1312 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1314 if (bom
!= utf_16_without_bom
)
1322 if (bom
== utf_16_with_bom
)
1324 if (endian
== utf_16_big_endian
1325 ? c
!= 0xFFFE : c
!= 0xFEFF)
1327 /* We are sure that there's enough room at CHARBUF. */
1336 CODING_UTF_16_ENDIAN (coding
)
1337 = endian
= utf_16_big_endian
;
1338 else if (c
== 0xFEFF)
1339 CODING_UTF_16_ENDIAN (coding
)
1340 = endian
= utf_16_little_endian
;
1343 CODING_UTF_16_ENDIAN (coding
)
1344 = endian
= utf_16_big_endian
;
1348 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1356 consumed_chars_base
= consumed_chars
;
1358 if (charbuf
+ 2 >= charbuf_end
)
1363 c
= (endian
== utf_16_big_endian
1364 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1367 if (! UTF_16_LOW_SURROGATE_P (c
))
1369 if (endian
== utf_16_big_endian
)
1370 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1372 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1376 if (UTF_16_HIGH_SURROGATE_P (c
))
1377 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1383 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1384 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1390 if (UTF_16_HIGH_SURROGATE_P (c
))
1391 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1398 coding
->consumed_char
+= consumed_chars_base
;
1399 coding
->consumed
= src_base
- coding
->source
;
1400 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1404 encode_coding_utf_16 (coding
)
1405 struct coding_system
*coding
;
1407 int multibytep
= coding
->dst_multibyte
;
1408 int *charbuf
= coding
->charbuf
;
1409 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1410 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1411 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1413 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1414 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1415 int produced_chars
= 0;
1416 Lisp_Object attrs
, eol_type
, charset_list
;
1419 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1421 if (bom
== utf_16_with_bom
)
1423 ASSURE_DESTINATION (safe_room
);
1425 EMIT_TWO_BYTES (0xFF, 0xFE);
1427 EMIT_TWO_BYTES (0xFE, 0xFF);
1428 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1431 while (charbuf
< charbuf_end
)
1433 ASSURE_DESTINATION (safe_room
);
1435 if (c
>= MAX_UNICODE_CHAR
)
1436 c
= coding
->default_char
;
1441 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1443 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1450 c1
= (c
>> 10) + 0xD800;
1451 c2
= (c
& 0x3FF) + 0xDC00;
1453 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1455 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1458 coding
->result
= CODING_RESULT_SUCCESS
;
1459 coding
->produced
= dst
- coding
->destination
;
1460 coding
->produced_char
+= produced_chars
;
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1467 /* Emacs' internal format for representation of multiple character
1468 sets is a kind of multi-byte encoding, i.e. characters are
1469 represented by variable-length sequences of one-byte codes.
1471 ASCII characters and control characters (e.g. `tab', `newline') are
1472 represented by one-byte sequences which are their ASCII codes, in
1473 the range 0x00 through 0x7F.
1475 8-bit characters of the range 0x80..0x9F are represented by
1476 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1479 8-bit characters of the range 0xA0..0xFF are represented by
1480 one-byte sequences which are their 8-bit code.
1482 The other characters are represented by a sequence of `base
1483 leading-code', optional `extended leading-code', and one or two
1484 `position-code's. The length of the sequence is determined by the
1485 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1486 whereas extended leading-code and position-code take the range 0xA0
1487 through 0xFF. See `charset.h' for more details about leading-code
1490 --- CODE RANGE of Emacs' internal format ---
1494 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495 eight-bit-graphic 0xA0..0xFF
1496 ELSE 0x81..0x9D + [0xA0..0xFF]+
1497 ---------------------------------------------
1499 As this is the internal character representation, the format is
1500 usually not used externally (i.e. in a file or in a data sent to a
1501 process). But, it is possible to have a text externally in this
1502 format (i.e. by encoding by the coding system `emacs-mule').
1504 In that case, a sequence of one-byte codes has a slightly different
1507 At first, all characters in eight-bit-control are represented by
1508 one-byte sequences which are their 8-bit code.
1510 Next, character composition data are represented by the byte
1511 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1513 METHOD is 0xF2 plus one of the composition methods (enum
1514 composition_method),
1516 BYTES is 0xA0 plus a byte length of this composition data,
1518 CHARS is 0xA0 plus the number of characters composed by this
1521 COMPONENTs are characters in multibyte form or composition
1522 rules encoded by two bytes of ASCII codes.
1524 In addition, for backward compatibility, the following formats are
1525 also recognized as composition data on decoding.
1528 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1531 MSEQ is a multibyte form, but in one of these special formats:
1532 ASCII: 0xA0 ASCII_CODE+0x80,
1533 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534 RULE is a one byte code of the range 0xA0..0xF0 that
1535 represents a composition rule.
1538 char emacs_mule_bytes
[256];
1541 emacs_mule_char (coding
, src
, nbytes
, nchars
)
1542 struct coding_system
*coding
;
1544 int *nbytes
, *nchars
;
1546 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1547 int multibytep
= coding
->src_multibyte
;
1548 unsigned char *src_base
= src
;
1549 struct charset
*charset
;
1552 int consumed_chars
= 0;
1555 switch (emacs_mule_bytes
[c
])
1558 if (! (charset
= emacs_mule_charset
[c
]))
1565 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1566 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1569 if (! (charset
= emacs_mule_charset
[c
]))
1576 if (! (charset
= emacs_mule_charset
[c
]))
1579 code
= (c
& 0x7F) << 8;
1587 if (! (charset
= emacs_mule_charset
[c
]))
1590 code
= (c
& 0x7F) << 8;
1597 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1598 ? charset_ascii
: charset_eight_bit
);
1604 c
= DECODE_CHAR (charset
, code
);
1607 *nbytes
= src
- src_base
;
1608 *nchars
= consumed_chars
;
1619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1620 Check if a text is encoded in `emacs-mule'. */
1623 detect_coding_emacs_mule (coding
, mask
)
1624 struct coding_system
*coding
;
1627 unsigned char *src
= coding
->source
, *src_base
= src
;
1628 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1629 int multibytep
= coding
->src_multibyte
;
1630 int consumed_chars
= 0;
1634 /* A coding system of this category is always ASCII compatible. */
1635 src
+= coding
->head_ascii
;
1643 /* Perhaps the start of a composite character. We simply skip
1644 it because analyzing it is too heavy for detection. But,
1645 at least, we check that the composite character
1646 consists of more than 4 bytes. */
1647 unsigned char *src_base
;
1657 if (src
- src_base
<= 4)
1667 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1672 unsigned char *src_base
= src
- 1;
1679 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1684 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1690 *mask
&= CATEGORY_MASK_EMACS_MULE
;
1695 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1697 /* Decode a character represented as a component of a composition
1698 sequence of Emacs 20/21 style at SRC. Set C to that character and
1699 update SRC to the head of the next character (or an encoded composition
1700 rule). If SRC doesn't point to a composition component, set C to -1.
1701 If SRC points to an invalid byte sequence, exit globally by a return
1704 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1708 int nbytes, nchars; \
1710 if (src == src_end) \
1712 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1717 goto invalid_code; \
1721 consumed_chars += nchars; \
1726 /* Decode a composition rule represented as a component of composition
1727 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1728 and increment BUF. If SRC points to an invalid byte sequence, set C
1731 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1733 int c, gref, nref; \
1735 if (src >= src_end) \
1736 goto invalid_code; \
1737 ONE_MORE_BYTE_NO_CHECK (c); \
1739 if (c < 0 || c >= 81) \
1740 goto invalid_code; \
1742 gref = c / 9, nref = c % 9; \
1743 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1747 /* Decode a composition rule represented as a component of composition
1748 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1749 and increment BUF. If SRC points to an invalid byte sequence, set C
1752 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1756 if (src + 1>= src_end) \
1757 goto invalid_code; \
1758 ONE_MORE_BYTE_NO_CHECK (gref); \
1760 ONE_MORE_BYTE_NO_CHECK (nref); \
1762 if (gref < 0 || gref >= 81 \
1763 || nref < 0 || nref >= 81) \
1764 goto invalid_code; \
1765 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1769 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1772 *buf++ = coding->produced_char + char_offset; \
1773 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1779 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1781 /* Emacs 21 style format. The first three bytes at SRC are \
1782 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1783 the byte length of this composition information, CHARS is the \
1784 number of characters composed by this composition. */ \
1785 enum composition_method method = c - 0xF2; \
1786 int *charbuf_base = charbuf; \
1787 int consumed_chars_limit; \
1788 int nbytes, nchars; \
1790 ONE_MORE_BYTE (c); \
1791 nbytes = c - 0xA0; \
1793 goto invalid_code; \
1794 ONE_MORE_BYTE (c); \
1795 nchars = c - 0xA0; \
1796 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1797 consumed_chars_limit = consumed_chars_base + nbytes; \
1798 if (method != COMPOSITION_RELATIVE) \
1801 while (consumed_chars < consumed_chars_limit) \
1803 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1804 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1806 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1809 if (consumed_chars < consumed_chars_limit) \
1810 goto invalid_code; \
1811 charbuf_base[0] -= i; \
1816 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1818 /* Emacs 20 style format for relative composition. */ \
1819 /* Store multibyte form of characters to be composed. */ \
1820 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1821 int *buf = components; \
1825 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1826 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1827 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1829 goto invalid_code; \
1830 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1831 for (j = 0; j < i; j++) \
1832 *charbuf++ = components[j]; \
1836 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1838 /* Emacs 20 style format for rule-base composition. */ \
1839 /* Store multibyte form of characters to be composed. */ \
1840 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1841 int *buf = components; \
1844 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1845 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1847 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1848 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1850 if (i < 1 || (buf - components) % 2 == 0) \
1851 goto invalid_code; \
1852 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1853 goto no_more_source; \
1854 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1855 for (j = 0; j < i; j++) \
1856 *charbuf++ = components[j]; \
1857 for (j = 0; j < i; j += 2) \
1858 *charbuf++ = components[j]; \
1863 decode_coding_emacs_mule (coding
)
1864 struct coding_system
*coding
;
1866 unsigned char *src
= coding
->source
+ coding
->consumed
;
1867 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1868 unsigned char *src_base
;
1869 int *charbuf
= coding
->charbuf
;
1870 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1871 int consumed_chars
= 0, consumed_chars_base
;
1872 int char_offset
= 0;
1873 int multibytep
= coding
->src_multibyte
;
1874 Lisp_Object attrs
, eol_type
, charset_list
;
1876 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1883 consumed_chars_base
= consumed_chars
;
1885 if (charbuf
>= charbuf_end
)
1894 if (EQ (eol_type
, Qdos
))
1897 goto no_more_source
;
1901 else if (EQ (eol_type
, Qmac
))
1909 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
1912 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1913 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1914 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1916 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1918 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1921 coding
->annotated
= 1;
1923 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1927 consumed_chars
= consumed_chars_base
;
1928 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
);
1937 consumed_chars
+= nchars
;
1944 consumed_chars
= consumed_chars_base
;
1946 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1951 coding
->consumed_char
+= consumed_chars_base
;
1952 coding
->consumed
= src_base
- coding
->source
;
1953 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1957 #define EMACS_MULE_LEADING_CODES(id, codes) \
1960 codes[0] = id, codes[1] = 0; \
1961 else if (id < 0xE0) \
1962 codes[0] = 0x9A, codes[1] = id; \
1963 else if (id < 0xF0) \
1964 codes[0] = 0x9B, codes[1] = id; \
1965 else if (id < 0xF5) \
1966 codes[0] = 0x9C, codes[1] = id; \
1968 codes[0] = 0x9D, codes[1] = id; \
1973 encode_coding_emacs_mule (coding
)
1974 struct coding_system
*coding
;
1976 int multibytep
= coding
->dst_multibyte
;
1977 int *charbuf
= coding
->charbuf
;
1978 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1979 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1980 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1982 int produced_chars
= 0;
1983 Lisp_Object attrs
, eol_type
, charset_list
;
1986 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1988 while (charbuf
< charbuf_end
)
1990 ASSURE_DESTINATION (safe_room
);
1992 if (ASCII_CHAR_P (c
))
1993 EMIT_ONE_ASCII_BYTE (c
);
1994 else if (CHAR_BYTE8_P (c
))
1996 c
= CHAR_TO_BYTE8 (c
);
2001 struct charset
*charset
;
2005 unsigned char leading_codes
[2];
2007 charset
= char_charset (c
, charset_list
, &code
);
2010 c
= coding
->default_char
;
2011 if (ASCII_CHAR_P (c
))
2013 EMIT_ONE_ASCII_BYTE (c
);
2016 charset
= char_charset (c
, charset_list
, &code
);
2018 dimension
= CHARSET_DIMENSION (charset
);
2019 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2020 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2021 EMIT_ONE_BYTE (leading_codes
[0]);
2022 if (leading_codes
[1])
2023 EMIT_ONE_BYTE (leading_codes
[1]);
2025 EMIT_ONE_BYTE (code
);
2028 EMIT_ONE_BYTE (code
>> 8);
2029 EMIT_ONE_BYTE (code
& 0xFF);
2033 coding
->result
= CODING_RESULT_SUCCESS
;
2034 coding
->produced_char
+= produced_chars
;
2035 coding
->produced
= dst
- coding
->destination
;
2040 /*** 7. ISO2022 handlers ***/
2042 /* The following note describes the coding system ISO2022 briefly.
2043 Since the intention of this note is to help understand the
2044 functions in this file, some parts are NOT ACCURATE or are OVERLY
2045 SIMPLIFIED. For thorough understanding, please refer to the
2046 original document of ISO2022. This is equivalent to the standard
2047 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2049 ISO2022 provides many mechanisms to encode several character sets
2050 in 7-bit and 8-bit environments. For 7-bit environments, all text
2051 is encoded using bytes less than 128. This may make the encoded
2052 text a little bit longer, but the text passes more easily through
2053 several types of gateway, some of which strip off the MSB (Most
2056 There are two kinds of character sets: control character sets and
2057 graphic character sets. The former contain control characters such
2058 as `newline' and `escape' to provide control functions (control
2059 functions are also provided by escape sequences). The latter
2060 contain graphic characters such as 'A' and '-'. Emacs recognizes
2061 two control character sets and many graphic character sets.
2063 Graphic character sets are classified into one of the following
2064 four classes, according to the number of bytes (DIMENSION) and
2065 number of characters in one dimension (CHARS) of the set:
2066 - DIMENSION1_CHARS94
2067 - DIMENSION1_CHARS96
2068 - DIMENSION2_CHARS94
2069 - DIMENSION2_CHARS96
2071 In addition, each character set is assigned an identification tag,
2072 unique for each set, called the "final character" (denoted as <F>
2073 hereafter). The <F> of each character set is decided by ECMA(*)
2074 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2075 (0x30..0x3F are for private use only).
2077 Note (*): ECMA = European Computer Manufacturers Association
2079 Here are examples of graphic character sets [NAME(<F>)]:
2080 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2081 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2082 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2083 o DIMENSION2_CHARS96 -- none for the moment
2085 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2086 C0 [0x00..0x1F] -- control character plane 0
2087 GL [0x20..0x7F] -- graphic character plane 0
2088 C1 [0x80..0x9F] -- control character plane 1
2089 GR [0xA0..0xFF] -- graphic character plane 1
2091 A control character set is directly designated and invoked to C0 or
2092 C1 by an escape sequence. The most common case is that:
2093 - ISO646's control character set is designated/invoked to C0, and
2094 - ISO6429's control character set is designated/invoked to C1,
2095 and usually these designations/invocations are omitted in encoded
2096 text. In a 7-bit environment, only C0 can be used, and a control
2097 character for C1 is encoded by an appropriate escape sequence to
2098 fit into the environment. All control characters for C1 are
2099 defined to have corresponding escape sequences.
2101 A graphic character set is at first designated to one of four
2102 graphic registers (G0 through G3), then these graphic registers are
2103 invoked to GL or GR. These designations and invocations can be
2104 done independently. The most common case is that G0 is invoked to
2105 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2106 these invocations and designations are omitted in encoded text.
2107 In a 7-bit environment, only GL can be used.
2109 When a graphic character set of CHARS94 is invoked to GL, codes
2110 0x20 and 0x7F of the GL area work as control characters SPACE and
2111 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2114 There are two ways of invocation: locking-shift and single-shift.
2115 With locking-shift, the invocation lasts until the next different
2116 invocation, whereas with single-shift, the invocation affects the
2117 following character only and doesn't affect the locking-shift
2118 state. Invocations are done by the following control characters or
2121 ----------------------------------------------------------------------
2122 abbrev function cntrl escape seq description
2123 ----------------------------------------------------------------------
2124 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2125 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2126 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2127 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2128 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2129 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2130 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2131 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2132 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2133 ----------------------------------------------------------------------
2134 (*) These are not used by any known coding system.
2136 Control characters for these functions are defined by macros
2137 ISO_CODE_XXX in `coding.h'.
2139 Designations are done by the following escape sequences:
2140 ----------------------------------------------------------------------
2141 escape sequence description
2142 ----------------------------------------------------------------------
2143 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2144 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2145 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2146 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2147 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2148 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2149 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2150 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2151 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2152 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2153 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2154 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2155 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2156 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2157 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2158 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2159 ----------------------------------------------------------------------
2161 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2162 of dimension 1, chars 94, and final character <F>, etc...
2164 Note (*): Although these designations are not allowed in ISO2022,
2165 Emacs accepts them on decoding, and produces them on encoding
2166 CHARS96 character sets in a coding system which is characterized as
2167 7-bit environment, non-locking-shift, and non-single-shift.
2169 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2170 '(' must be omitted. We refer to this as "short-form" hereafter.
2172 Now you may notice that there are a lot of ways of encoding the
2173 same multilingual text in ISO2022. Actually, there exist many
2174 coding systems such as Compound Text (used in X11's inter-client
2175 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2176 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2177 localized platforms), and all of these are variants of ISO2022.
2179 In addition to the above, Emacs handles two more kinds of escape
2180 sequences: ISO6429's direction specification and Emacs' private
2181 sequence for specifying character composition.
2183 ISO6429's direction specification takes the following form:
2184 o CSI ']' -- end of the current direction
2185 o CSI '0' ']' -- end of the current direction
2186 o CSI '1' ']' -- start of left-to-right text
2187 o CSI '2' ']' -- start of right-to-left text
2188 The control character CSI (0x9B: control sequence introducer) is
2189 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2191 Character composition specification takes the following form:
2192 o ESC '0' -- start relative composition
2193 o ESC '1' -- end composition
2194 o ESC '2' -- start rule-base composition (*)
2195 o ESC '3' -- start relative composition with alternate chars (**)
2196 o ESC '4' -- start rule-base composition with alternate chars (**)
2197 Since these are not standard escape sequences of any ISO standard,
2198 the use of them with these meanings is restricted to Emacs only.
2200 (*) This form is used only in Emacs 20.7 and older versions,
2201 but newer versions can safely decode it.
2202 (**) This form is used only in Emacs 21.1 and newer versions,
2203 and older versions can't decode it.
2205 Here's a list of example usages of these composition escape
2206 sequences (categorized by `enum composition_method').
2208 COMPOSITION_RELATIVE:
2209 ESC 0 CHAR [ CHAR ] ESC 1
2210 COMPOSITION_WITH_RULE:
2211 ESC 2 CHAR [ RULE CHAR ] ESC 1
2212 COMPOSITION_WITH_ALTCHARS:
2213 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2214 COMPOSITION_WITH_RULE_ALTCHARS:
2215 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2217 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID is covered by CODING's safe_charsets table
   (ID <= CODING->max_charset_id) and its entry there is non-negative.
   Both CODING and ID are evaluated twice, so pass expressions without
   side effects.
   NOTE(review): only the upper bound of ID is checked here; callers
   are presumably expected to pass a non-negative ID -- confirm.  */
2219 #define SAFE_CHARSET_P(coding, id) \
2220 ((id) <= (coding)->max_charset_id \
2221 && (coding)->safe_charsets[id] >= 0)
/* Nonzero if CODING_ISO_INITIAL, applied to the coding system stored
   in coding_categories[CATEGORY] with index 1, yields a non-negative
   value.  Presumably index 1 is graphic register G1, the register a
   shift-out invokes -- confirm against the CODING_ISO_INITIAL
   definition.  */
2224 #define SHIFT_OUT_OK(category) \
2225 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2228 setup_iso_safe_charsets (attrs
)
2231 Lisp_Object charset_list
, safe_charsets
;
2232 Lisp_Object request
;
2233 Lisp_Object reg_usage
;
2236 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2239 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2240 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2241 && ! EQ (charset_list
, Viso_2022_charset_list
))
2243 CODING_ATTR_CHARSET_LIST (attrs
)
2244 = charset_list
= Viso_2022_charset_list
;
2245 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2248 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2252 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2254 int id
= XINT (XCAR (tail
));
2255 if (max_charset_id
< id
)
2256 max_charset_id
= id
;
2259 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2261 request
= AREF (attrs
, coding_attr_iso_request
);
2262 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2263 reg94
= XINT (XCAR (reg_usage
));
2264 reg96
= XINT (XCDR (reg_usage
));
2266 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2270 struct charset
*charset
;
2273 charset
= CHARSET_FROM_ID (XINT (id
));
2274 reg
= Fcdr (Fassq (id
, request
));
2276 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2277 else if (charset
->iso_chars_96
)
2280 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2285 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2288 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2292 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2293 Check if a text is encoded in ISO2022. If it is, return an
2294 integer in which the appropriate flag bits, any of:
2296 CATEGORY_MASK_ISO_7_TIGHT
2297 CATEGORY_MASK_ISO_8_1
2298 CATEGORY_MASK_ISO_8_2
2299 CATEGORY_MASK_ISO_7_ELSE
2300 CATEGORY_MASK_ISO_8_ELSE
2301 are set. If a code which should never appear in ISO2022 is found,
/* Detect which ISO-2022 coding categories the bytes of CODING->source
   may belong to.  MASK_ISO starts as CATEGORY_MASK_ISO and is narrowed
   while the bytes are scanned; MASK_FOUND accumulates the categories
   for which evidence was seen.  The verdict is folded into *MASK.
   NOTE(review): several lines of this function (braces, byte fetches,
   `break'/`return' statements) are not visible in this excerpt; the
   comments added here describe only the visible statements.  */
2305 detect_coding_iso_2022 (coding
, mask
)
2306 struct coding_system
*coding
;
2309 unsigned char *src
= coding
->source
, *src_base
= src
;
2310 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2311 int multibytep
= coding
->src_multibyte
;
2312 int mask_iso
= CATEGORY_MASK_ISO
;
2313 int mask_found
= 0, mask_8bit_found
= 0;
2314 int reg
[4], shift_out
= 0, single_shifting
= 0;
2317 int consumed_chars
= 0;
/* For every ISO-2022 category, refresh its safe-charset table when
   needed and cache the table and its highest index in the category's
   coding_system.  */
2320 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2322 struct coding_system
*this = &(coding_categories
[i
]);
2323 Lisp_Object attrs
, val
;
2325 attrs
= CODING_ID_ATTRS (this->id
);
2326 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2327 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2328 setup_iso_safe_charsets (attrs
);
2329 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2330 this->max_charset_id
= XSTRING (val
)->size
- 1;
2331 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2334 /* A coding system of this category is always ASCII compatible. */
2335 src
+= coding
->head_ascii
;
/* REG[0] starts as ASCII; REG[1] .. REG[3] start at -1.  */
2337 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2338 while (mask_iso
&& src
< src_end
)
2344 if (inhibit_iso_escape_detection
)
2346 single_shifting
= 0;
2348 if (c
>= '(' && c
<= '/')
2350 /* Designation sequence for a charset of dimension 1. */
2352 if (c1
< ' ' || c1
>= 0x80
2353 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2354 /* Invalid designation sequence. Just ignore. */
2356 reg
[(c
- '(') % 4] = id
;
2360 /* Designation sequence for a charset of dimension 2. */
2362 if (c
>= '@' && c
<= 'B')
2363 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2364 reg
[0] = id
= iso_charset_table
[1][0][c
];
2365 else if (c
>= '(' && c
<= '/')
2368 if (c1
< ' ' || c1
>= 0x80
2369 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2370 /* Invalid designation sequence. Just ignore. */
2372 reg
[(c
- '(') % 4] = id
;
2375 /* Invalid designation sequence. Just ignore. */
2378 else if (c
== 'N' || c
== 'O')
2380 /* ESC <Fe> for SS2 or SS3. */
2381 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2384 else if (c
>= '0' && c
<= '4')
2386 /* ESC <Fp> for start/end composition. */
2387 mask_found
|= CATEGORY_MASK_ISO
;
2392 /* Invalid escape sequence. */
2393 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2397 /* We found a valid designation sequence for CHARSET. */
2398 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2399 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2401 mask_found
|= CATEGORY_MASK_ISO_7
;
2403 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2404 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2406 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2408 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2409 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2411 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2413 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2414 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2416 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2418 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2422 if (inhibit_iso_escape_detection
)
2424 single_shifting
= 0;
2427 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2428 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2430 /* Locking shift out. */
2431 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2432 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2437 if (inhibit_iso_escape_detection
)
2439 single_shifting
= 0;
2442 /* Locking shift in. */
2443 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2444 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2449 single_shifting
= 0;
2453 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2455 if (inhibit_iso_escape_detection
)
2457 if (c
!= ISO_CODE_CSI
)
2459 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2460 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2461 newmask
|= CATEGORY_MASK_ISO_8_1
;
2462 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2463 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2464 newmask
|= CATEGORY_MASK_ISO_8_2
;
2465 single_shifting
= 1;
2467 if (VECTORP (Vlatin_extra_code_table
)
2468 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2470 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2471 & CODING_ISO_FLAG_LATIN_EXTRA
)
2472 newmask
|= CATEGORY_MASK_ISO_8_1
;
2473 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2474 & CODING_ISO_FLAG_LATIN_EXTRA
)
2475 newmask
|= CATEGORY_MASK_ISO_8_2
;
2477 mask_iso
&= newmask
;
2478 mask_found
|= newmask
;
2485 single_shifting
= 0;
2490 single_shifting
= 0;
2491 mask_8bit_found
= 1;
2492 if (VECTORP (Vlatin_extra_code_table
)
2493 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2497 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2498 & CODING_ISO_FLAG_LATIN_EXTRA
)
2499 newmask
|= CATEGORY_MASK_ISO_8_1
;
2500 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2501 & CODING_ISO_FLAG_LATIN_EXTRA
)
2502 newmask
|= CATEGORY_MASK_ISO_8_2
;
2503 mask_iso
&= newmask
;
2504 mask_found
|= newmask
;
2511 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2512 | CATEGORY_MASK_ISO_7_ELSE
);
2513 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2514 mask_8bit_found
= 1;
2515 /* Check the length of succeeding codes of the range
2516 0xA0..0xFF. If the byte length is odd, we exclude
2517 CATEGORY_MASK_ISO_8_2. We can check this only
2518 when we are not single shifting. */
2519 if (!single_shifting
2520 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2523 while (src
< src_end
)
2531 if (i
& 1 && src
< src_end
)
2532 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2534 mask_found
|= CATEGORY_MASK_ISO_8_2
;
/* Drop every ISO-2022 category from *MASK.  */
2543 *mask
&= ~CATEGORY_MASK_ISO
;
/* Keep only the categories that are still possible (MASK_ISO) and
   were actually seen (MASK_FOUND).  */
2548 *mask
&= mask_iso
& mask_found
;
/* When no 8-bit byte was seen, the 8-bit categories are removed.  */
2549 if (! mask_8bit_found
)
2550 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2555 /* Set designation state into CODING.
   Look up the charset selected by (DIM, CHARS_96, FINAL) through
   ISO_CHARSET_TABLE.  When FINAL is below '0' or not below 128, the
   lookup yields a negative id, or the charset is not safe for CODING,
   REG is marked with -2 and control jumps to `invalid_code'.
   Otherwise the previous designation of REG is remembered, the id is
   mapped from jisx0201-roman to ASCII or from jisx0208-1978 to
   jisx0208 when the USE_ROMAN / USE_OLDJIS flag is set, and the id is
   stored as the designation of REG.
   NOTE(review): the `do {' and brace lines of this macro are not
   visible in this excerpt.  */
2556 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2560 if (final < '0' || final >= 128 \
2561 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2562 || !SAFE_CHARSET_P (coding, id)) \
2564 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2565 goto invalid_code; \
2567 prev = CODING_ISO_DESIGNATION (coding, reg); \
2568 if (id == charset_jisx0201_roman) \
2570 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2571 id = charset_ascii; \
2573 else if (id == charset_jisx0208_1978) \
2575 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2576 id = charset_jisx0208; \
2578 CODING_ISO_DESIGNATION (coding, reg) = id; \
2579 /* If there was an invalid designation to REG previously, and this \
2580 designation is ASCII to REG, we should keep this designation \
2582 if (prev == -2 && id == charset_ascii) \
2583 goto invalid_code; \
/* Finish a composition that is still in progress.
   The visible code requires room for COMPONENT_IDX more elements in
   CHARBUF (otherwise it jumps to `no_more_source'), resets
   COMPOSITION_STATE to COMPOSING_NO, and copies the buffered
   COMPONENTS to CHARBUF: every element for COMPOSITION_RELATIVE and
   COMPOSITION_WITH_ALTCHARS, every second element in the other
   branch.  CHAR_OFFSET is advanced to match.
   NOTE(review): the statement guarded by the first `if' is not
   visible here; presumably the macro does nothing when no
   composition is active -- confirm against the full file.  */
2587 #define MAYBE_FINISH_COMPOSITION() \
2590 if (composition_state == COMPOSING_NO) \
2592 /* It is assured that we have enough room for producing \
2593 characters stored in the table `components'. */ \
2594 if (charbuf + component_idx > charbuf_end) \
2595 goto no_more_source; \
2596 composition_state = COMPOSING_NO; \
2597 if (method == COMPOSITION_RELATIVE \
2598 || method == COMPOSITION_WITH_ALTCHARS) \
2600 for (i = 0; i < component_idx; i++) \
2601 *charbuf++ = components[i]; \
2602 char_offset += component_idx; \
2606 for (i = 0; i < component_idx; i += 2) \
2607 *charbuf++ = components[i]; \
2608 char_offset += (component_idx / 2) + 1; \
2613 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2614 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2615 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2616 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2617 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
/* Begin a composition for the start byte C1 ('0', '2', '3' or '4').
   When the state is COMPOSING_COMPONENT_RULE (the rest of that
   condition is not visible in this excerpt), COMPONENT_LEN is
   recorded and the state moves to COMPOSING_CHAR.  Otherwise any
   composition in progress is flushed with MAYBE_FINISH_COMPOSITION,
   room for MAX_COMPOSITION_COMPONENTS elements is required in
   CHARBUF, and the source is scanned ahead for ESC '1'.  If the scan
   reaches SRC_END - 1, the macro jumps to `invalid_code' on the last
   block and to `no_more_source' otherwise.  METHOD and
   COMPOSITION_STATE are then derived from C1 and both component
   counters are reset to 0.  */
2620 #define DECODE_COMPOSITION_START(c1) \
2623 && composition_state == COMPOSING_COMPONENT_RULE) \
2625 component_len = component_idx; \
2626 composition_state = COMPOSING_CHAR; \
2632 MAYBE_FINISH_COMPOSITION (); \
2633 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2634 goto no_more_source; \
2635 for (p = src; p < src_end - 1; p++) \
2636 if (*p == ISO_CODE_ESC && p[1] == '1') \
2638 if (p == src_end - 1) \
2640 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2641 goto invalid_code; \
2642 goto no_more_source; \
2645 /* This is surely the start of a composition. */ \
2646 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2647 : c1 == '2' ? COMPOSITION_WITH_RULE \
2648 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2649 : COMPOSITION_WITH_RULE_ALTCHARS); \
2650 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2651 : COMPOSING_COMPONENT_CHAR); \
2652 component_idx = component_len = 0; \
2657 /* Handle composition end sequence ESC 1.
   NCHARS is COMPONENT_IDX - COMPONENT_LEN when COMPONENT_LEN is
   positive, COMPONENT_IDX for COMPOSITION_RELATIVE, and
   (COMPONENT_IDX + 1) / 2 otherwise.  The annotation is emitted with
   ADD_COMPOSITION_DATA; for every method except COMPOSITION_RELATIVE
   the components follow (all of them when COMPONENT_LEN is 0, else
   the first COMPONENT_LEN), and the slot at SAVED_CHARBUF is set to
   SAVED_CHARBUF - CHARBUF.  The composed characters are then copied
   to CHARBUF while CHAR_OFFSET advances, CODING->annotated is set,
   and the state returns to COMPOSING_NO.  */
2659 #define DECODE_COMPOSITION_END() \
2661 int nchars = (component_len > 0 ? component_idx - component_len \
2662 : method == COMPOSITION_RELATIVE ? component_idx \
2663 : (component_idx + 1) / 2); \
2665 int *saved_charbuf = charbuf; \
2667 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2668 if (method != COMPOSITION_RELATIVE) \
2670 if (component_len == 0) \
2671 for (i = 0; i < component_idx; i++) \
2672 *charbuf++ = components[i]; \
2674 for (i = 0; i < component_len; i++) \
2675 *charbuf++ = components[i]; \
2676 *saved_charbuf = saved_charbuf - charbuf; \
2678 if (method == COMPOSITION_WITH_RULE) \
2679 for (i = 0; i < component_idx; i += 2, char_offset++) \
2680 *charbuf++ = components[i]; \
2682 for (i = component_len; i < component_idx; i++, char_offset++) \
2683 *charbuf++ = components[i]; \
2684 coding->annotated = 1; \
2685 composition_state = COMPOSING_NO; \
2689 /* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and leave the encoded rule in C1.  Values below 81 use
   the old format: GREF is C1 / 9 and NREF is C1 % 9, with 4 remapped
   to 10 in both.  Values 81 .. 92 use the new format and read one
   more byte C2; the rule is built from C1 - 81 and C2 - 32.
   NOTE(review): this comment used to say the rule is stored in
   coding->cmp_data; the visible code only assigns C1.  */
2693 #define DECODE_COMPOSITION_RULE(c1) \
2696 if (c1 < 81) /* old format (before ver.21) */ \
2698 int gref = (c1) / 9; \
2699 int nref = (c1) % 9; \
2700 if (gref == 4) gref = 10; \
2701 if (nref == 4) nref = 10; \
2702 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2704 else if (c1 < 93) /* new format (after ver.21) */ \
2706 ONE_MORE_BYTE (c2); \
2707 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2714 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode ISO-2022 bytes of CODING->source, starting at
   CODING->consumed, into CODING->charbuf.  Each byte is dispatched on
   its iso_code_class: graphic bytes select a charset through the
   charsets invoked to graphic planes 0 and 1, shift and escape
   sequences update the designation/invocation state of CODING, and
   composition sequences are collected in COMPONENTS.  On exit
   CODING->consumed_char, CODING->consumed and CODING->charbuf_used
   are updated.
   NOTE(review): many lines of this function (braces, byte fetches,
   `break'/`continue'/`goto' statements) are not visible in this
   excerpt.  The visible tokens are kept as they are; the only code
   change is the parenthesization of the CODING_ISO_FLAG_DIRECTION
   test in the ESC `[' case.  */
2717 decode_coding_iso_2022 (coding
)
2718 struct coding_system
*coding
;
2720 unsigned char *src
= coding
->source
+ coding
->consumed
;
2721 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2722 unsigned char *src_base
;
2723 int *charbuf
= coding
->charbuf
;
2724 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2725 int consumed_chars
= 0, consumed_chars_base
;
2726 int char_offset
= 0;
2727 int multibytep
= coding
->src_multibyte
;
2728 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2729 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2730 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2731 struct charset
*charset
;
2733 /* For handling composition sequence. */
2734 #define COMPOSING_NO 0
2735 #define COMPOSING_CHAR 1
2736 #define COMPOSING_RULE 2
2737 #define COMPOSING_COMPONENT_CHAR 3
2738 #define COMPOSING_COMPONENT_RULE 4
2740 int composition_state
= COMPOSING_NO
;
2741 enum composition_method method
;
2742 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2745 Lisp_Object attrs
, eol_type
, charset_list
;
2747 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2748 setup_iso_safe_charsets (attrs
);
2755 consumed_chars_base
= consumed_chars
;
2757 if (charbuf
>= charbuf_end
)
2762 /* We produce no character or one character. */
2763 switch (iso_code_class
[c1
])
2765 case ISO_0x20_or_0x7F
:
2766 if (composition_state
!= COMPOSING_NO
)
2768 if (composition_state
== COMPOSING_RULE
2769 || composition_state
== COMPOSING_COMPONENT_RULE
)
2771 DECODE_COMPOSITION_RULE (c1
);
2772 components
[component_idx
++] = c1
;
2773 composition_state
--;
2777 if (charset_id_0
< 0
2778 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2779 /* This is SPACE or DEL. */
2780 charset
= CHARSET_FROM_ID (charset_ascii
);
2782 charset
= CHARSET_FROM_ID (charset_id_0
);
2785 case ISO_graphic_plane_0
:
2786 if (composition_state
!= COMPOSING_NO
)
2788 if (composition_state
== COMPOSING_RULE
2789 || composition_state
== COMPOSING_COMPONENT_RULE
)
2791 DECODE_COMPOSITION_RULE (c1
);
2792 components
[component_idx
++] = c1
;
2793 composition_state
--;
2797 charset
= CHARSET_FROM_ID (charset_id_0
);
2800 case ISO_0xA0_or_0xFF
:
2801 if (charset_id_1
< 0
2802 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2803 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2805 /* This is a graphic character, we fall down ... */
2807 case ISO_graphic_plane_1
:
2808 if (charset_id_1
< 0)
2810 charset
= CHARSET_FROM_ID (charset_id_1
);
2813 case ISO_carriage_return
:
2816 if (EQ (eol_type
, Qdos
))
2819 goto no_more_source
;
2823 else if (EQ (eol_type
, Qmac
))
2829 MAYBE_FINISH_COMPOSITION ();
2830 charset
= CHARSET_FROM_ID (charset_ascii
);
2834 MAYBE_FINISH_COMPOSITION ();
2838 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2839 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2841 CODING_ISO_INVOCATION (coding
, 0) = 1;
2842 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2846 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2848 CODING_ISO_INVOCATION (coding
, 0) = 0;
2849 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2852 case ISO_single_shift_2_7
:
2853 case ISO_single_shift_2
:
2854 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2856 /* SS2 is handled as an escape sequence of ESC 'N' */
2858 goto label_escape_sequence
;
2860 case ISO_single_shift_3
:
2861 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2863 /* SS3 is handled as an escape sequence of ESC 'O' */
2865 goto label_escape_sequence
;
2867 case ISO_control_sequence_introducer
:
2868 /* CSI is handled as an escape sequence of ESC '[' ... */
2870 goto label_escape_sequence
;
2874 label_escape_sequence
:
2875 /* Escape sequences handled here are invocation,
2876 designation, direction specification, and character
2877 composition specification. */
2880 case '&': /* revision of following character set */
2882 if (!(c1
>= '@' && c1
<= '~'))
2885 if (c1
!= ISO_CODE_ESC
)
2888 goto label_escape_sequence
;
2890 case '$': /* designation of 2-byte character set */
2891 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2894 if (c1
>= '@' && c1
<= 'B')
2895 { /* designation of JISX0208.1978, GB2312.1980,
2897 DECODE_DESIGNATION (0, 2, 0, c1
);
2899 else if (c1
>= 0x28 && c1
<= 0x2B)
2900 { /* designation of DIMENSION2_CHARS94 character set */
2902 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2904 else if (c1
>= 0x2C && c1
<= 0x2F)
2905 { /* designation of DIMENSION2_CHARS96 character set */
2907 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2911 /* We must update these variables now. */
2912 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2913 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2916 case 'n': /* invocation of locking-shift-2 */
2917 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2918 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2920 CODING_ISO_INVOCATION (coding
, 0) = 2;
2921 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2924 case 'o': /* invocation of locking-shift-3 */
2925 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2926 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2928 CODING_ISO_INVOCATION (coding
, 0) = 3;
2929 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2932 case 'N': /* invocation of single-shift-2 */
2933 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2934 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2936 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2938 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2942 case 'O': /* invocation of single-shift-3 */
2943 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2944 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2946 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2948 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2952 case '0': case '2': case '3': case '4': /* start composition */
2953 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2955 DECODE_COMPOSITION_START (c1
);
2958 case '1': /* end composition */
2959 if (composition_state
== COMPOSING_NO
)
2961 DECODE_COMPOSITION_END ();
2964 case '[': /* specification of direction */
/* The flag test is parenthesized: `! X & Y' parses as `(! X) & Y',
   which does not test the DIRECTION flag at all.  */
2965 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
))
2967 /* For the moment, nested direction is not supported.
2968 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2969 left-to-right, and nonzero means right-to-left. */
2973 case ']': /* end of the current direction */
2974 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2976 case '0': /* end of the current direction */
2977 case '1': /* start of left-to-right direction */
2980 coding
->mode
&= ~CODING_MODE_DIRECTION
;
2985 case '2': /* start of right-to-left direction */
2988 coding
->mode
|= CODING_MODE_DIRECTION
;
2999 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3001 if (c1
>= 0x28 && c1
<= 0x2B)
3002 { /* designation of DIMENSION1_CHARS94 character set */
3004 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3006 else if (c1
>= 0x2C && c1
<= 0x2F)
3007 { /* designation of DIMENSION1_CHARS96 character set */
3009 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3013 /* We must update these variables now. */
3014 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3015 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3020 /* Now we know CHARSET and 1st position code C1 of a character.
3021 Produce a decoded character while getting 2nd position code
3024 if (CHARSET_DIMENSION (charset
) > 1)
3027 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3028 /* C2 is not in a valid range. */
3030 c1
= (c1
<< 8) | (c2
& 0x7F);
3031 if (CHARSET_DIMENSION (charset
) > 2)
3034 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3035 /* C2 is not in a valid range. */
3037 c1
= (c1
<< 8) | (c2
& 0x7F);
3041 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3044 MAYBE_FINISH_COMPOSITION ();
3045 for (; src_base
< src
; src_base
++, char_offset
++)
3047 if (ASCII_BYTE_P (*src_base
))
3048 *charbuf
++ = *src_base
;
3050 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3053 else if (composition_state
== COMPOSING_NO
)
3060 components
[component_idx
++] = c
;
3061 if (method
== COMPOSITION_WITH_RULE
3062 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3063 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3064 composition_state
++;
3069 MAYBE_FINISH_COMPOSITION ();
3071 consumed_chars
= consumed_chars_base
;
3073 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3078 coding
->consumed_char
+= consumed_chars_base
;
3079 coding
->consumed
= src_base
- coding
->source
;
3080 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3084 /* ISO2022 encoding stuff. */
3087 It is not enough to say just "ISO2022" on encoding, we have to
3088 specify more details. In Emacs, each coding system of ISO2022
3089 variant has the following specifications:
3090 1. Initial designation to G0 thru G3.
3091 2. Allows short-form designation?
3092 3. ASCII should be designated to G0 before control characters?
3093 4. ASCII should be designated to G0 at end of line?
3094 5. 7-bit environment or 8-bit environment?
3095 6. Use locking-shift?
3096 7. Use Single-shift?
3097 And the following two are only for Japanese:
3098 8. Use ASCII in place of JIS0201-1976-Roman?
3099 9. Use JISX0208-1983 in place of JISX0208-1978?
3100 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3101 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3105 /* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.
   When CODING has CODING_ISO_FLAG_REVISION set and the revision of
   CHARSET is not negative, ESC '&' and the byte '@' + revision are
   emitted first.  The intermediate byte is taken from the string
   "()*+" for 94-character sets and from ",-./" for 96-character
   sets, indexed by REG.  Finally the id of CHARSET is recorded as the
   designation of REG in CODING.  */
3110 #define ENCODE_DESIGNATION(charset, reg, coding) \
3112 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3113 char *intermediate_char_94 = "()*+"; \
3114 char *intermediate_char_96 = ",-./"; \
3115 int revision = -1; \
3118 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3119 revision = CHARSET_ISO_REVISION (charset); \
3121 if (revision >= 0) \
3123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3124 EMIT_ONE_BYTE ('@' + revision); \
3126 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3127 if (CHARSET_DIMENSION (charset) == 1) \
3129 if (! CHARSET_ISO_CHARS_96 (charset)) \
3130 c = intermediate_char_94[reg]; \
3132 c = intermediate_char_96[reg]; \
3133 EMIT_ONE_ASCII_BYTE (c); \
3137 EMIT_ONE_ASCII_BYTE ('$'); \
3138 if (! CHARSET_ISO_CHARS_96 (charset)) \
3140 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3142 || final_char < '@' || final_char > 'B') \
3143 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3146 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3148 EMIT_ONE_ASCII_BYTE (final_char); \
3150 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3154 /* The following two macros produce codes (control character or escape
3155 sequence) for ISO2022 single-shift functions (single-shift-2 and
/* Emit single-shift-2: ESC 'N' is tied to CODING_ISO_FLAG_SEVEN_BITS,
   the byte ISO_CODE_SS2 is the other visible form.  CODING is then
   marked as single-shifting.
   NOTE(review): the `else' between the two emit lines is not visible
   in this excerpt.  */
3158 #define ENCODE_SINGLE_SHIFT_2 \
3160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3161 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3163 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3164 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* Emit single-shift-3: ESC 'O' is tied to CODING_ISO_FLAG_SEVEN_BITS,
   the byte ISO_CODE_SS3 is the other visible form.  CODING is then
   marked as single-shifting.
   NOTE(review): the `else' between the two emit lines is not visible
   in this excerpt.  */
3168 #define ENCODE_SINGLE_SHIFT_3 \
3170 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3171 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3173 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3174 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3178 /* The following four macros produce codes (control character or
3179 escape sequence) for ISO2022 locking-shift functions (shift-in,
3180 shift-out, locking-shift-2, and locking-shift-3). */
/* Emit ISO_CODE_SI and record that graphic register 0 is invoked to
   graphic plane 0.  */
3182 #define ENCODE_SHIFT_IN \
3184 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3185 CODING_ISO_INVOCATION (coding, 0) = 0; \
/* Emit ISO_CODE_SO and record that graphic register 1 is invoked to
   graphic plane 0.  */
3189 #define ENCODE_SHIFT_OUT \
3191 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3192 CODING_ISO_INVOCATION (coding, 0) = 1; \
/* Emit locking-shift-2 (ESC 'n') and record that graphic register 2
   is invoked to graphic plane 0.  */
3196 #define ENCODE_LOCKING_SHIFT_2 \
3198 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3199 CODING_ISO_INVOCATION (coding, 0) = 2; \
/* Emit locking-shift-3 (ESC 'o') and record that graphic register 3
   is invoked to graphic plane 0.  The final byte must be 'o': the
   decoder in this file treats ESC 'n' as locking-shift-2 and ESC 'o'
   as locking-shift-3, so emitting 'n' here would make the decoder
   invoke register 2 instead of register 3.  */
#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o'); \
    CODING_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
3210 /* Produce codes for a DIMENSION1 character whose character set is
3211 CHARSET and whose position-code is C1. Designation and invocation
3212 sequences are also produced in advance if necessary. */
/* Emit the one-byte position code C1 of CHARSET.  With
   CODING_ISO_FLAG_USE_ROMAN set, ASCII is replaced by jisx0201-roman
   first.  While single-shifting, one byte is emitted and the
   single-shifting state is cleared.  Otherwise C1 & 0x7F is emitted
   when CHARSET is invoked to plane 0 and C1 | 0x80 when it is invoked
   to plane 1; when it is invoked to neither,
   encode_invocation_designation is called to update DST.
   NOTE(review): the loop/brace lines and the last argument of the
   encode_invocation_designation call are not visible in this
   excerpt.  */
3214 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3216 int id = CHARSET_ID (charset); \
3218 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3219 && id == charset_ascii) \
3221 id = charset_jisx0201_roman; \
3222 charset = CHARSET_FROM_ID (id); \
3225 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3227 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3228 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3230 EMIT_ONE_BYTE (c1 | 0x80); \
3231 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3234 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3236 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3239 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3241 EMIT_ONE_BYTE (c1 | 0x80); \
3245 /* Since CHARSET is not yet invoked to any graphic planes, we \
3246 must invoke it, or, at first, designate it to some graphic \
3247 register. Then repeat the loop to actually produce the \
3249 dst = encode_invocation_designation (charset, coding, dst, \
3254 /* Produce codes for a DIMENSION2 character whose character set is
3255 CHARSET and whose position-codes are C1 and C2. Designation and
3256 invocation codes are also produced in advance if necessary. */
/* Emit the two-byte position code (C1, C2) of CHARSET.  With
   CODING_ISO_FLAG_USE_OLDJIS set, jisx0208 is replaced by
   jisx0208-1978 first.  While single-shifting, two bytes are emitted
   and the single-shifting state is cleared.  Otherwise both bytes are
   emitted masked with 0x7F when CHARSET is invoked to plane 0 and
   with 0x80 set when it is invoked to plane 1; when it is invoked to
   neither, encode_invocation_designation is called to update DST.
   NOTE(review): the loop/brace lines and the last argument of the
   encode_invocation_designation call are not visible in this
   excerpt.  */
3258 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3260 int id = CHARSET_ID (charset); \
3262 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3263 && id == charset_jisx0208) \
3265 id = charset_jisx0208_1978; \
3266 charset = CHARSET_FROM_ID (id); \
3269 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3271 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3272 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3274 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3275 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3278 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3280 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3283 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3285 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3289 /* Since CHARSET is not yet invoked to any graphic planes, we \
3290 must invoke it, or, at first, designate it to some graphic \
3291 register. Then repeat the loop to actually produce the \
3293 dst = encode_invocation_designation (charset, coding, dst, \
/* Encode character C of CHARSET: obtain its code point with
   ENCODE_CHAR, then emit it with the DIMENSION1 macro when CHARSET
   has dimension 1, or with the DIMENSION2 macro using the bytes
   CODE >> 8 and CODE & 0xFF.
   NOTE(review): the `else' before the DIMENSION2 call is not visible
   in this excerpt.  */
3298 #define ENCODE_ISO_CHARACTER(charset, c) \
3300 int code = ENCODE_CHAR ((charset),(c)); \
3302 if (CHARSET_DIMENSION (charset) == 1) \
3303 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3305 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3309 /* Produce designation and invocation codes at a place pointed by DST
3310 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
/* Emit whatever designation and invocation CHARSET needs.  The four
   graphic registers are searched for one already designated with the
   id of CHARSET; when none is found, the register requested through
   CODING_ISO_REQUEST is used and a designation is emitted with
   ENCODE_DESIGNATION.  If that register is invoked to neither plane 0
   nor plane 1, an invocation is emitted according to the register
   number; for registers 2 and 3 a single shift is tied to
   CODING_ISO_FLAG_SINGLE_SHIFT and a locking shift is the other
   visible form.  The running character count is read from *P_NCHARS
   at entry and written back before leaving.
   NOTE(review): parameter declarations for DST and P_NCHARS, the
   bodies for registers 0 and 1, and the return statement are not
   visible in this excerpt.  */
3314 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3315 struct charset
*charset
;
3316 struct coding_system
*coding
;
3320 int multibytep
= coding
->dst_multibyte
;
3321 int produced_chars
= *p_nchars
;
3322 int reg
; /* graphic register number */
3323 int id
= CHARSET_ID (charset
);
3325 /* At first, check designations. */
3326 for (reg
= 0; reg
< 4; reg
++)
3327 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3332 /* CHARSET is not yet designated to any graphic registers. */
3333 /* At first check the requested designation. */
3334 reg
= CODING_ISO_REQUEST (coding
, id
);
3336 /* Since CHARSET requests no special designation, designate it
3337 to graphic register 0. */
3340 ENCODE_DESIGNATION (charset
, reg
, coding
);
3343 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3344 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3346 /* Since the graphic register REG is not invoked to any graphic
3347 planes, invoke it to graphic plane 0. */
3350 case 0: /* graphic register 0 */
3354 case 1: /* graphic register 1 */
3358 case 2: /* graphic register 2 */
3359 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3360 ENCODE_SINGLE_SHIFT_2
;
3362 ENCODE_LOCKING_SHIFT_2
;
3365 case 3: /* graphic register 3 */
3366 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3367 ENCODE_SINGLE_SHIFT_3
;
3369 ENCODE_LOCKING_SHIFT_3
;
3374 *p_nchars
= produced_chars
;
3378 /* The following three macros produce codes for indicating direction
/* Emit a control sequence introducer: ESC '[' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_CSI.  The flag is tested with `&' because
   CODING_ISO_FLAGS holds several flag bits at once; comparing the
   whole word with `==' would pick the 8-bit form whenever any other
   flag is also set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
    else \
      EMIT_ONE_BYTE (ISO_CODE_CSI); \
  } while (0)
/* Emit the sequence that starts right-to-left direction: a control
   sequence introducer followed by the bytes '2' and ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is invoked without an argument list; writing `(dst)' after it
   would expand to a syntax error.  */
#define ENCODE_DIRECTION_R2L() \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    EMIT_TWO_ASCII_BYTES ('2', ']'); \
  } while (0)
/* Emit the sequence that returns to left-to-right direction: a
   control sequence introducer followed by the bytes '0' and ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is invoked without an argument list; writing `(dst)' after it
   would expand to a syntax error.  */
#define ENCODE_DIRECTION_L2R() \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    EMIT_TWO_ASCII_BYTES ('0', ']'); \
  } while (0)
3403 /* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.
   The visible code tests whether graphic plane 0 invokes a register
   other than 0, then walks the four registers and re-designates, with
   ENCODE_DESIGNATION, every register whose initial charset is not
   negative and differs from its current designation.
   NOTE(review): the statement guarded by the invocation test is not
   visible in this excerpt.  */
3405 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3408 struct charset *charset; \
3410 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3412 for (reg = 0; reg < 4; reg++) \
3413 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3414 && (CODING_ISO_DESIGNATION (coding, reg) \
3415 != CODING_ISO_INITIAL (coding, reg))) \
3417 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3418 ENCODE_DESIGNATION (charset, reg, coding); \
3423 /* Produce designation sequences of charsets in the line started from
   SRC to a place pointed by DST, and return updated DST.

   If the current block ends before any end-of-line, we may fail to
   find all the necessary designations.

   The charset list is taken from the attributes of CODING, with
   Qiso_2022 replaced by Viso_2022_charset_list.  For each character
   examined, its charset and the register requested for that charset
   are looked up; a register whose slot in R is still negative can be
   claimed.  At the end, each register whose slot differs from the
   current designation is designated with ENCODE_DESIGNATION.
   NOTE(review): the declaration of R, the character loop header and
   the return statement are not visible in this excerpt.  */
3429 static unsigned char *
3430 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3431 struct coding_system
*coding
;
3432 int *charbuf
, *charbuf_end
;
3435 struct charset
*charset
;
3436 /* Table of charsets to be designated to each graphic register. */
3438 int c
, found
= 0, reg
;
3439 int produced_chars
= 0;
3440 int multibytep
= coding
->dst_multibyte
;
3442 Lisp_Object charset_list
;
3444 attrs
= CODING_ID_ATTRS (coding
->id
);
3445 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3446 if (EQ (charset_list
, Qiso_2022
))
3447 charset_list
= Viso_2022_charset_list
;
3449 for (reg
= 0; reg
< 4; reg
++)
3459 charset
= char_charset (c
, charset_list
, NULL
);
3460 id
= CHARSET_ID (charset
);
3461 reg
= CODING_ISO_REQUEST (coding
, id
);
3462 if (reg
>= 0 && r
[reg
] < 0)
3471 for (reg
= 0; reg
< 4; reg
++)
3473 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3474 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3480 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) as ISO-2022 bytes appended at CODING->destination +
   CODING->produced.  Control characters, ASCII characters, raw 8-bit
   characters and all other characters take separate branches; the
   last group is emitted with ENCODE_ISO_CHARACTER.  On exit the
   result code, the beginning-of-line designation state and the
   produced character and byte counts of CODING are updated.
   NOTE(review): several lines (braces, the fetch of C from CHARBUF,
   some declarations) are not visible in this excerpt.  */
3483 encode_coding_iso_2022 (coding
)
3484 struct coding_system
*coding
;
3486 int multibytep
= coding
->dst_multibyte
;
3487 int *charbuf
= coding
->charbuf
;
3488 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3489 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3490 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3493 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3494 && CODING_ISO_BOL (coding
));
3495 int produced_chars
= 0;
3496 Lisp_Object attrs
, eol_type
, charset_list
;
3497 int ascii_compatible
;
3500 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3501 setup_iso_safe_charsets (attrs
);
3502 coding
->safe_charsets
3503 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3505 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3507 while (charbuf
< charbuf_end
)
3509 ASSURE_DESTINATION (safe_room
);
3511 if (bol_designation
)
3513 unsigned char *dst_prev
= dst
;
3515 /* We have to produce designation sequences if any now. */
3516 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3517 bol_designation
= 0;
3518 /* We are sure that designation sequences are all ASCII bytes. */
3519 produced_chars
+= dst
- dst_prev
;
3524 /* Now encode the character C. */
3525 if (c
< 0x20 || c
== 0x7F)
3528 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3530 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3531 ENCODE_RESET_PLANE_AND_REGISTER ();
3532 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3536 for (i
= 0; i
< 4; i
++)
3537 CODING_ISO_DESIGNATION (coding
, i
)
3538 = CODING_ISO_INITIAL (coding
, i
);
3541 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3543 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3544 ENCODE_RESET_PLANE_AND_REGISTER ();
3545 EMIT_ONE_ASCII_BYTE (c
);
3547 else if (ASCII_CHAR_P (c
))
3549 if (ascii_compatible
)
3550 EMIT_ONE_ASCII_BYTE (c
);
3553 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3554 ENCODE_ISO_CHARACTER (charset
, c
);
3557 else if (CHAR_BYTE8_P (c
))
3559 c
= CHAR_TO_BYTE8 (c
);
3564 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3568 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3570 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3571 charset
= CHARSET_FROM_ID (charset_ascii
);
3575 c
= coding
->default_char
;
3576 charset
= char_charset (c
, charset_list
, NULL
);
3579 ENCODE_ISO_CHARACTER (charset
, c
);
/* On the last block, reset planes and registers once more when the
   RESET_AT_EOL flag is set.  */
3583 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3584 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3586 ASSURE_DESTINATION (safe_room
);
3587 ENCODE_RESET_PLANE_AND_REGISTER ();
3589 coding
->result
= CODING_RESULT_SUCCESS
;
3590 CODING_ISO_BOL (coding
) = bol_designation
;
3591 coding
->produced_char
+= produced_chars
;
3592 coding
->produced
= dst
- coding
->destination
;
3597 /*** 8,9. SJIS and BIG5 handlers ***/
3599 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3600 quite widely. So, for the moment, Emacs supports them in the bare
3601 C code. But, in the future, they may be supported only by CCL. */
3603 /* SJIS is a coding system encoding three character sets: ASCII, right
3604 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3605 as is. A character of charset katakana-jisx0201 is encoded by
3606 "position-code + 0x80". A character of charset japanese-jisx0208
3607 is encoded in 2-byte but two position-codes are divided and shifted
3608 so that it fit in the range below.
3610 --- CODE RANGE of SJIS ---
3611 (character set) (range)
3613 KATAKANA-JISX0201 0xA0 .. 0xDF
3614 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3615 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3616 -------------------------------
3620 /* BIG5 is a coding system encoding two character sets: ASCII and
3621 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3622 character set and is encoded in two-byte.
3624 --- CODE RANGE of BIG5 ---
3625 (character set) (range)
3627 Big5 (1st byte) 0xA1 .. 0xFE
3628 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3629 --------------------------
3633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.  If it is, return
   CATEGORY_MASK_SJIS, else return 0.
   The visible code skips the first CODING->head_ascii bytes, tests a
   byte against the lead-byte ranges 0x81..0x9F and 0xE0..0xEF, tests
   a byte against the out-of-range conditions (below 0x40, equal to
   0x7F, above 0xFC), and tests for 0xA0..0xDF.  The verdict is
   applied to *MASK, either by clearing CATEGORY_MASK_SJIS or by
   masking *MASK down to it.
   NOTE(review): the loop, the byte fetches and the return statements
   are not visible in this excerpt, so the return values stated above
   are unconfirmed.  */
3638 detect_coding_sjis (coding
, mask
)
3639 struct coding_system
*coding
;
3642 unsigned char *src
= coding
->source
, *src_base
= src
;
3643 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3644 int multibytep
= coding
->src_multibyte
;
3645 int consumed_chars
= 0;
3649 /* A coding system of this category is always ASCII compatible. */
3650 src
+= coding
->head_ascii
;
3657 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3660 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3664 else if (c
>= 0xA0 && c
< 0xE0)
3669 *mask
&= ~CATEGORY_MASK_SJIS
;
3675 *mask
&= CATEGORY_MASK_SJIS
;
3679 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3680 Check if a text is encoded in BIG5. If it is, return
3681 CATEGORY_MASK_BIG5, else return 0. */
3684 detect_coding_big5 (coding
, mask
)
3685 struct coding_system
*coding
;
3688 unsigned char *src
= coding
->source
, *src_base
= src
;
3689 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3690 int multibytep
= coding
->src_multibyte
;
3691 int consumed_chars
= 0;
3695 /* A coding system of this category is always ASCII compatible. */
3696 src
+= coding
->head_ascii
;
3706 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3713 *mask
&= ~CATEGORY_MASK_BIG5
;
3719 *mask
&= CATEGORY_MASK_BIG5
;
3723 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3724 decode_coding_sjis decodes SJIS text and decode_coding_big5 decodes BIG5 text. */
3727 decode_coding_sjis (coding
)
3728 struct coding_system
*coding
;
3730 unsigned char *src
= coding
->source
+ coding
->consumed
;
3731 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3732 unsigned char *src_base
;
3733 int *charbuf
= coding
->charbuf
;
3734 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3735 int consumed_chars
= 0, consumed_chars_base
;
3736 int multibytep
= coding
->src_multibyte
;
3737 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3738 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3740 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3743 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3744 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3745 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3752 consumed_chars_base
= consumed_chars
;
3754 if (charbuf
>= charbuf_end
)
3761 if (EQ (eol_type
, Qdos
))
3764 goto no_more_source
;
3768 else if (EQ (eol_type
, Qmac
))
3773 struct charset
*charset
;
3776 charset
= charset_roman
;
3781 if (c
< 0xA0 || c
>= 0xE0)
3783 /* SJIS -> JISX0208 */
3785 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3789 charset
= charset_kanji
;
3792 /* SJIS -> JISX0201-Kana */
3793 charset
= charset_kana
;
3795 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3802 consumed_chars
= consumed_chars_base
;
3804 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3809 coding
->consumed_char
+= consumed_chars_base
;
3810 coding
->consumed
= src_base
- coding
->source
;
3811 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3815 decode_coding_big5 (coding
)
3816 struct coding_system
*coding
;
3818 unsigned char *src
= coding
->source
+ coding
->consumed
;
3819 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3820 unsigned char *src_base
;
3821 int *charbuf
= coding
->charbuf
;
3822 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3823 int consumed_chars
= 0, consumed_chars_base
;
3824 int multibytep
= coding
->src_multibyte
;
3825 struct charset
*charset_roman
, *charset_big5
;
3826 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3828 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3830 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3831 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3838 consumed_chars_base
= consumed_chars
;
3840 if (charbuf
>= charbuf_end
)
3847 if (EQ (eol_type
, Qdos
))
3850 goto no_more_source
;
3854 else if (EQ (eol_type
, Qmac
))
3859 struct charset
*charset
;
3861 charset
= charset_roman
;
3865 if (c
< 0xA1 || c
> 0xFE)
3868 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3871 charset
= charset_big5
;
3873 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3881 consumed_chars
= consumed_chars_base
;
3883 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3888 coding
->consumed_char
+= consumed_chars_base
;
3889 coding
->consumed
= src_base
- coding
->source
;
3890 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3893 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3894 This function can encode charsets `ascii', `katakana-jisx0201',
3895 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3896 are sure that all these charsets are registered as official charset
3897 (i.e. do not have extended leading-codes). Characters of other
3898 charsets are produced without any encoding. encode_coding_sjis
3899 encodes SJIS text and encode_coding_big5 encodes BIG5 text. */
3902 encode_coding_sjis (coding
)
3903 struct coding_system
*coding
;
3905 int multibytep
= coding
->dst_multibyte
;
3906 int *charbuf
= coding
->charbuf
;
3907 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3908 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3909 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3911 int produced_chars
= 0;
3912 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3913 int ascii_compatible
;
3914 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3917 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3919 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3920 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3921 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3923 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3925 while (charbuf
< charbuf_end
)
3927 ASSURE_DESTINATION (safe_room
);
3929 /* Now encode the character C. */
3930 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3931 EMIT_ONE_ASCII_BYTE (c
);
3932 else if (CHAR_BYTE8_P (c
))
3934 c
= CHAR_TO_BYTE8 (c
);
3940 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3944 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3946 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3947 charset
= CHARSET_FROM_ID (charset_ascii
);
3951 c
= coding
->default_char
;
3952 charset
= char_charset (c
, charset_list
, &code
);
3955 if (code
== CHARSET_INVALID_CODE (charset
))
3957 if (charset
== charset_kanji
)
3961 c1
= code
>> 8, c2
= code
& 0xFF;
3962 EMIT_TWO_BYTES (c1
, c2
);
3964 else if (charset
== charset_kana
)
3965 EMIT_ONE_BYTE (code
| 0x80);
3967 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
3970 coding
->result
= CODING_RESULT_SUCCESS
;
3971 coding
->produced_char
+= produced_chars
;
3972 coding
->produced
= dst
- coding
->destination
;
3977 encode_coding_big5 (coding
)
3978 struct coding_system
*coding
;
3980 int multibytep
= coding
->dst_multibyte
;
3981 int *charbuf
= coding
->charbuf
;
3982 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3983 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3984 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3986 int produced_chars
= 0;
3987 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3988 int ascii_compatible
;
3989 struct charset
*charset_roman
, *charset_big5
;
3992 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3994 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3995 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3996 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3998 while (charbuf
< charbuf_end
)
4000 ASSURE_DESTINATION (safe_room
);
4002 /* Now encode the character C. */
4003 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4004 EMIT_ONE_ASCII_BYTE (c
);
4005 else if (CHAR_BYTE8_P (c
))
4007 c
= CHAR_TO_BYTE8 (c
);
4013 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4017 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4019 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4020 charset
= CHARSET_FROM_ID (charset_ascii
);
4024 c
= coding
->default_char
;
4025 charset
= char_charset (c
, charset_list
, &code
);
4028 if (code
== CHARSET_INVALID_CODE (charset
))
4030 if (charset
== charset_big5
)
4034 c1
= code
>> 8, c2
= code
& 0xFF;
4035 EMIT_TWO_BYTES (c1
, c2
);
4038 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4041 coding
->result
= CODING_RESULT_SUCCESS
;
4042 coding
->produced_char
+= produced_chars
;
4043 coding
->produced
= dst
- coding
->destination
;
4048 /*** 9. CCL handlers ***/
4050 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4051 Check if a text is encoded in a coding system of which
4052 encoder/decoder are written in CCL program. If it is, return
4053 CATEGORY_MASK_CCL, else return 0. */
4056 detect_coding_ccl (coding
, mask
)
4057 struct coding_system
*coding
;
4060 unsigned char *src
= coding
->source
, *src_base
= src
;
4061 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4062 int multibytep
= coding
->src_multibyte
;
4063 int consumed_chars
= 0;
4065 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4066 int head_ascii
= coding
->head_ascii
;
4069 coding
= &coding_categories
[coding_category_ccl
];
4070 attrs
= CODING_ID_ATTRS (coding
->id
);
4071 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4080 if (!found
&& valids
[c
] > 1)
4083 *mask
&= ~CATEGORY_MASK_CCL
;
4089 *mask
&= CATEGORY_MASK_CCL
;
4094 decode_coding_ccl (coding
)
4095 struct coding_system
*coding
;
4097 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4098 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4099 int *charbuf
= coding
->charbuf
;
4100 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4101 int consumed_chars
= 0;
4102 int multibytep
= coding
->src_multibyte
;
4103 struct ccl_program ccl
;
4104 int source_charbuf
[1024];
4105 int source_byteidx
[1024];
4107 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4109 while (src
< src_end
)
4111 const unsigned char *p
= src
;
4112 int *source
, *source_end
;
4116 while (i
< 1024 && p
< src_end
)
4118 source_byteidx
[i
] = p
- src
;
4119 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4122 while (i
< 1024 && p
< src_end
)
4123 source_charbuf
[i
++] = *p
++;
4125 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4128 source
= source_charbuf
;
4129 source_end
= source
+ i
;
4130 while (source
< source_end
)
4132 ccl_driver (&ccl
, source
, charbuf
,
4133 source_end
- source
, charbuf_end
- charbuf
);
4134 source
+= ccl
.consumed
;
4135 charbuf
+= ccl
.produced
;
4136 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4139 if (source
< source_end
)
4140 src
+= source_byteidx
[source
- source_charbuf
];
4143 consumed_chars
+= source
- source_charbuf
;
4145 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4146 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4152 case CCL_STAT_SUSPEND_BY_SRC
:
4153 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4155 case CCL_STAT_SUSPEND_BY_DST
:
4158 case CCL_STAT_INVALID_CMD
:
4159 coding
->result
= CODING_RESULT_INTERRUPT
;
4162 coding
->result
= CODING_RESULT_SUCCESS
;
4165 coding
->consumed_char
+= consumed_chars
;
4166 coding
->consumed
= src
- coding
->source
;
4167 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4171 encode_coding_ccl (coding
)
4172 struct coding_system
*coding
;
4174 struct ccl_program ccl
;
4175 int multibytep
= coding
->dst_multibyte
;
4176 int *charbuf
= coding
->charbuf
;
4177 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4178 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4179 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4180 unsigned char *adjusted_dst_end
= dst_end
- 1;
4181 int destination_charbuf
[1024];
4182 int i
, produced_chars
= 0;
4184 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4186 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4187 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4189 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4191 int dst_bytes
= dst_end
- dst
;
4192 if (dst_bytes
> 1024)
4195 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4196 charbuf_end
- charbuf
, dst_bytes
);
4197 charbuf
+= ccl
.consumed
;
4199 for (i
= 0; i
< ccl
.produced
; i
++)
4200 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4203 for (i
= 0; i
< ccl
.produced
; i
++)
4204 *dst
++ = destination_charbuf
[i
] & 0xFF;
4205 produced_chars
+= ccl
.produced
;
4211 case CCL_STAT_SUSPEND_BY_SRC
:
4212 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4214 case CCL_STAT_SUSPEND_BY_DST
:
4215 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4218 case CCL_STAT_INVALID_CMD
:
4219 coding
->result
= CODING_RESULT_INTERRUPT
;
4222 coding
->result
= CODING_RESULT_SUCCESS
;
4226 coding
->produced_char
+= produced_chars
;
4227 coding
->produced
= dst
- coding
->destination
;
4233 /*** 10, 11. no-conversion handlers ***/
4235 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4238 decode_coding_raw_text (coding
)
4239 struct coding_system
*coding
;
4241 coding
->chars_at_source
= 1;
4242 coding
->consumed_char
= 0;
4243 coding
->consumed
= 0;
4244 coding
->result
= CODING_RESULT_SUCCESS
;
4248 encode_coding_raw_text (coding
)
4249 struct coding_system
*coding
;
4251 int multibytep
= coding
->dst_multibyte
;
4252 int *charbuf
= coding
->charbuf
;
4253 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4254 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4255 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4256 int produced_chars
= 0;
4261 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4263 if (coding
->src_multibyte
)
4264 while (charbuf
< charbuf_end
)
4266 ASSURE_DESTINATION (safe_room
);
4268 if (ASCII_CHAR_P (c
))
4269 EMIT_ONE_ASCII_BYTE (c
);
4270 else if (CHAR_BYTE8_P (c
))
4272 c
= CHAR_TO_BYTE8 (c
);
4277 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4279 CHAR_STRING_ADVANCE (c
, p1
);
4282 EMIT_ONE_BYTE (*p0
);
4288 while (charbuf
< charbuf_end
)
4290 ASSURE_DESTINATION (safe_room
);
4297 if (coding
->src_multibyte
)
4299 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4301 while (charbuf
< charbuf_end
)
4303 ASSURE_DESTINATION (safe_room
);
4305 if (ASCII_CHAR_P (c
))
4307 else if (CHAR_BYTE8_P (c
))
4308 *dst
++ = CHAR_TO_BYTE8 (c
);
4310 CHAR_STRING_ADVANCE (c
, dst
);
4316 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4317 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4318 *dst
++ = *charbuf
++;
4319 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4322 coding
->result
= CODING_RESULT_SUCCESS
;
4323 coding
->produced_char
+= produced_chars
;
4324 coding
->produced
= dst
- coding
->destination
;
4329 detect_coding_charset (coding
, mask
)
4330 struct coding_system
*coding
;
4333 unsigned char *src
= coding
->source
, *src_base
= src
;
4334 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4335 int multibytep
= coding
->src_multibyte
;
4336 int consumed_chars
= 0;
4337 Lisp_Object attrs
, valids
;
4339 coding
= &coding_categories
[coding_category_charset
];
4340 attrs
= CODING_ID_ATTRS (coding
->id
);
4341 valids
= AREF (attrs
, coding_attr_charset_valids
);
4343 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4344 src
+= coding
->head_ascii
;
4351 if (NILP (AREF (valids
, c
)))
4354 *mask
&= ~CATEGORY_MASK_CHARSET
;
4358 *mask
&= CATEGORY_MASK_CHARSET
;
4363 decode_coding_charset (coding
)
4364 struct coding_system
*coding
;
4366 unsigned char *src
= coding
->source
+ coding
->consumed
;
4367 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4368 unsigned char *src_base
;
4369 int *charbuf
= coding
->charbuf
;
4370 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4371 int consumed_chars
= 0, consumed_chars_base
;
4372 int multibytep
= coding
->src_multibyte
;
4373 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4375 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4376 valids
= AREF (attrs
, coding_attr_charset_valids
);
4383 consumed_chars_base
= consumed_chars
;
4385 if (charbuf
>= charbuf_end
)
4391 /* Here we assume that no charset maps '\r' to something
4393 if (EQ (eol_type
, Qdos
))
4399 else if (EQ (eol_type
, Qmac
))
4405 struct charset
*charset
;
4410 val
= AREF (valids
, c
);
4415 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4416 dim
= CHARSET_DIMENSION (charset
);
4420 code
= (code
<< 8) | c
;
4423 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4428 /* VAL is a list of charset IDs. It is assured that the
4429 list is sorted by charset dimensions (smaller one
4433 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4434 dim
= CHARSET_DIMENSION (charset
);
4438 code
= (code
<< 8) | c
;
4441 CODING_DECODE_CHAR (coding
, src
, src_base
,
4442 src_end
, charset
, code
, c
);
4456 consumed_chars
= consumed_chars_base
;
4458 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4463 coding
->consumed_char
+= consumed_chars_base
;
4464 coding
->consumed
= src_base
- coding
->source
;
4465 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4469 encode_coding_charset (coding
)
4470 struct coding_system
*coding
;
4472 int multibytep
= coding
->dst_multibyte
;
4473 int *charbuf
= coding
->charbuf
;
4474 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4475 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4476 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4477 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4478 int produced_chars
= 0;
4479 Lisp_Object attrs
, eol_type
, charset_list
;
4480 int ascii_compatible
;
4483 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4484 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4486 while (charbuf
< charbuf_end
)
4488 struct charset
*charset
;
4491 ASSURE_DESTINATION (safe_room
);
4493 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4494 EMIT_ONE_ASCII_BYTE (c
);
4495 else if (CHAR_BYTE8_P (c
))
4497 c
= CHAR_TO_BYTE8 (c
);
4502 charset
= char_charset (c
, charset_list
, &code
);
4505 if (CHARSET_DIMENSION (charset
) == 1)
4506 EMIT_ONE_BYTE (code
);
4507 else if (CHARSET_DIMENSION (charset
) == 2)
4508 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4509 else if (CHARSET_DIMENSION (charset
) == 3)
4510 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4512 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4513 (code
>> 8) & 0xFF, code
& 0xFF);
4517 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4518 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4520 c
= coding
->default_char
;
4526 coding
->result
= CODING_RESULT_SUCCESS
;
4527 coding
->produced_char
+= produced_chars
;
4528 coding
->produced
= dst
- coding
->destination
;
4533 /*** 10. C library functions ***/
4535 /* Setup coding context CODING from information about CODING_SYSTEM.
4536 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4537 CODING_SYSTEM is invalid, signal an error. */
4540 setup_coding_system (coding_system
, coding
)
4541 Lisp_Object coding_system
;
4542 struct coding_system
*coding
;
4545 Lisp_Object eol_type
;
4546 Lisp_Object coding_type
;
4549 if (NILP (coding_system
))
4550 coding_system
= Qno_conversion
;
4552 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4554 attrs
= CODING_ID_ATTRS (coding
->id
);
4555 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4558 coding
->head_ascii
= -1;
4559 coding
->common_flags
4560 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4562 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4563 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4564 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4565 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4567 coding_type
= CODING_ATTR_TYPE (attrs
);
4568 if (EQ (coding_type
, Qundecided
))
4570 coding
->detector
= NULL
;
4571 coding
->decoder
= decode_coding_raw_text
;
4572 coding
->encoder
= encode_coding_raw_text
;
4573 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4575 else if (EQ (coding_type
, Qiso_2022
))
4578 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4580 /* Invoke graphic register 0 to plane 0. */
4581 CODING_ISO_INVOCATION (coding
, 0) = 0;
4582 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4583 CODING_ISO_INVOCATION (coding
, 1)
4584 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4585 /* Setup the initial status of designation. */
4586 for (i
= 0; i
< 4; i
++)
4587 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4588 /* Not single shifting initially. */
4589 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4590 /* Beginning of buffer should also be regarded as bol. */
4591 CODING_ISO_BOL (coding
) = 1;
4592 coding
->detector
= detect_coding_iso_2022
;
4593 coding
->decoder
= decode_coding_iso_2022
;
4594 coding
->encoder
= encode_coding_iso_2022
;
4595 if (flags
& CODING_ISO_FLAG_SAFE
)
4596 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4597 coding
->common_flags
4598 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4599 | CODING_REQUIRE_FLUSHING_MASK
);
4600 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4601 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4602 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4604 setup_iso_safe_charsets (attrs
);
4605 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4606 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4607 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4609 CODING_ISO_FLAGS (coding
) = flags
;
4611 else if (EQ (coding_type
, Qcharset
))
4613 coding
->detector
= detect_coding_charset
;
4614 coding
->decoder
= decode_coding_charset
;
4615 coding
->encoder
= encode_coding_charset
;
4616 coding
->common_flags
4617 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4619 else if (EQ (coding_type
, Qutf_8
))
4621 coding
->detector
= detect_coding_utf_8
;
4622 coding
->decoder
= decode_coding_utf_8
;
4623 coding
->encoder
= encode_coding_utf_8
;
4624 coding
->common_flags
4625 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4627 else if (EQ (coding_type
, Qutf_16
))
4629 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4630 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4631 : EQ (val
, Qt
) ? utf_16_with_bom
4632 : utf_16_without_bom
);
4633 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4634 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4635 : utf_16_little_endian
);
4636 CODING_UTF_16_SURROGATE (coding
) = 0;
4637 coding
->detector
= detect_coding_utf_16
;
4638 coding
->decoder
= decode_coding_utf_16
;
4639 coding
->encoder
= encode_coding_utf_16
;
4640 coding
->common_flags
4641 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4643 else if (EQ (coding_type
, Qccl
))
4645 coding
->detector
= detect_coding_ccl
;
4646 coding
->decoder
= decode_coding_ccl
;
4647 coding
->encoder
= encode_coding_ccl
;
4648 coding
->common_flags
4649 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4650 | CODING_REQUIRE_FLUSHING_MASK
);
4652 else if (EQ (coding_type
, Qemacs_mule
))
4654 coding
->detector
= detect_coding_emacs_mule
;
4655 coding
->decoder
= decode_coding_emacs_mule
;
4656 coding
->encoder
= encode_coding_emacs_mule
;
4657 coding
->common_flags
4658 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4659 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4660 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4662 Lisp_Object tail
, safe_charsets
;
4663 int max_charset_id
= 0;
4665 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4667 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4668 max_charset_id
= XFASTINT (XCAR (tail
));
4669 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4671 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4673 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4674 coding
->max_charset_id
= max_charset_id
;
4675 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4678 else if (EQ (coding_type
, Qshift_jis
))
4680 coding
->detector
= detect_coding_sjis
;
4681 coding
->decoder
= decode_coding_sjis
;
4682 coding
->encoder
= encode_coding_sjis
;
4683 coding
->common_flags
4684 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4686 else if (EQ (coding_type
, Qbig5
))
4688 coding
->detector
= detect_coding_big5
;
4689 coding
->decoder
= decode_coding_big5
;
4690 coding
->encoder
= encode_coding_big5
;
4691 coding
->common_flags
4692 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4694 else /* EQ (coding_type, Qraw_text) */
4696 coding
->detector
= NULL
;
4697 coding
->decoder
= decode_coding_raw_text
;
4698 coding
->encoder
= encode_coding_raw_text
;
4699 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4705 /* Return raw-text or one of its subsidiaries that has the same
4706 eol_type as CODING-SYSTEM. */
4709 raw_text_coding_system (coding_system
)
4710 Lisp_Object coding_system
;
4712 Lisp_Object spec
, attrs
;
4713 Lisp_Object eol_type
, raw_text_eol_type
;
4715 spec
= CODING_SYSTEM_SPEC (coding_system
);
4716 attrs
= AREF (spec
, 0);
4718 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4719 return coding_system
;
4721 eol_type
= AREF (spec
, 2);
4722 if (VECTORP (eol_type
))
4724 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4725 raw_text_eol_type
= AREF (spec
, 2);
4726 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4727 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4728 : AREF (raw_text_eol_type
, 2));
4732 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4733 does, return one of the subsidiary that has the same eol-spec as
4734 PARENT. Otherwise, return CODING_SYSTEM. */
4737 coding_inherit_eol_type (coding_system
, parent
)
4738 Lisp_Object coding_system
, parent
;
4740 Lisp_Object spec
, attrs
, eol_type
;
4742 spec
= CODING_SYSTEM_SPEC (coding_system
);
4743 attrs
= AREF (spec
, 0);
4744 eol_type
= AREF (spec
, 2);
4745 if (VECTORP (eol_type
))
4747 Lisp_Object parent_spec
;
4748 Lisp_Object parent_eol_type
;
4751 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4752 parent_eol_type
= AREF (parent_spec
, 2);
4753 if (EQ (parent_eol_type
, Qunix
))
4754 coding_system
= AREF (eol_type
, 0);
4755 else if (EQ (parent_eol_type
, Qdos
))
4756 coding_system
= AREF (eol_type
, 1);
4757 else if (EQ (parent_eol_type
, Qmac
))
4758 coding_system
= AREF (eol_type
, 2);
4760 return coding_system
;
4763 /* Emacs has a mechanism to automatically detect a coding system if it
4764 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4765 it's impossible to distinguish some coding systems accurately
4766 because they use the same range of codes. So, at first, coding
4767 systems are categorized into the following categories:
4769 o coding-category-emacs-mule
4771 The category for a coding system which has the same code range
4772 as Emacs' internal format. Assigned the coding-system (Lisp
4773 symbol) `emacs-mule' by default.
4775 o coding-category-sjis
4777 The category for a coding system which has the same code range
4778 as SJIS. Assigned the coding-system (Lisp
4779 symbol) `japanese-shift-jis' by default.
4781 o coding-category-iso-7
4783 The category for a coding system which has the same code range
4784 as ISO2022 of 7-bit environment. This doesn't use any locking
4785 shift and single shift functions. This can encode/decode all
4786 charsets. Assigned the coding-system (Lisp symbol)
4787 `iso-2022-7bit' by default.
4789 o coding-category-iso-7-tight
4791 Same as coding-category-iso-7 except that this can
4792 encode/decode only the specified charsets.
4794 o coding-category-iso-8-1
4796 The category for a coding system which has the same code range
4797 as ISO2022 of 8-bit environment and graphic plane 1 used only
4798 for DIMENSION1 charset. This doesn't use any locking shift
4799 and single shift functions. Assigned the coding-system (Lisp
4800 symbol) `iso-latin-1' by default.
4802 o coding-category-iso-8-2
4804 The category for a coding system which has the same code range
4805 as ISO2022 of 8-bit environment and graphic plane 1 used only
4806 for DIMENSION2 charset. This doesn't use any locking shift
4807 and single shift functions. Assigned the coding-system (Lisp
4808 symbol) `japanese-iso-8bit' by default.
4810 o coding-category-iso-7-else
4812 The category for a coding system which has the same code range
4813 as ISO2022 of 7-bit environment but uses locking shift or
4814 single shift functions. Assigned the coding-system (Lisp
4815 symbol) `iso-2022-7bit-lock' by default.
4817 o coding-category-iso-8-else
4819 The category for a coding system which has the same code range
4820 as ISO2022 of 8-bit environment but uses locking shift or
4821 single shift functions. Assigned the coding-system (Lisp
4822 symbol) `iso-2022-8bit-ss2' by default.
4824 o coding-category-big5
4826 The category for a coding system which has the same code range
4827 as BIG5. Assigned the coding-system (Lisp symbol)
4828 `cn-big5' by default.
4830 o coding-category-utf-8
4832 The category for a coding system which has the same code range
4833 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4834 symbol) `utf-8' by default.
4836 o coding-category-utf-16-be
4838 The category for a coding system in which a text has an
4839 Unicode signature (cf. Unicode Standard) in the order of BIG
4840 endian at the head. Assigned the coding-system (Lisp symbol)
4841 `utf-16-be' by default.
4843 o coding-category-utf-16-le
4845 The category for a coding system in which a text has an
4846 Unicode signature (cf. Unicode Standard) in the order of
4847 LITTLE endian at the head. Assigned the coding-system (Lisp
4848 symbol) `utf-16-le' by default.
4850 o coding-category-ccl
4852 The category for a coding system of which encoder/decoder is
4853 written in CCL programs. The default value is nil, i.e., no
4854 coding system is assigned.
4856 o coding-category-binary
4858 The category for a coding system not categorized in any of the
4859 above. Assigned the coding-system (Lisp symbol)
4860 `no-conversion' by default.
4862 Each of them is a Lisp symbol and the value is an actual
4863 `coding-system' (this is also a Lisp symbol) assigned by a user.
4864 What Emacs actually does is to detect a category of coding system.
4865 Then, it uses the `coding-system' assigned to that category. If Emacs
4866 can't narrow the candidates down to a single category, it selects the
4867 category of the highest priority. Priorities of categories are also
4868 specified by a user in a Lisp variable `coding-category-list'.
4872 #define EOL_SEEN_NONE 0
4873 #define EOL_SEEN_LF 1
4874 #define EOL_SEEN_CR 2
4875 #define EOL_SEEN_CRLF 4
4877 /* Detect how end-of-line of a text of length CODING->src_bytes
4878 pointed by CODING->source is encoded. Return one of
4881 #define MAX_EOL_CHECK_COUNT 3
/* Detect the end-of-line format of the SRC_BYTES bytes at SOURCE,
   interpreted according to CODING.  The result is accumulated in
   eol_seen as one of the EOL_SEEN_xxx values; callers in this file
   treat the returned value that way.  UTF-16 text is scanned two
   bytes at a time using the byte order recorded in
   CODING->spec.utf_16; any other text is scanned byte by byte.  When
   two different formats are met, eol_seen falls back to EOL_SEEN_LF.
   The number of line ends examined is counted against
   MAX_EOL_CHECK_COUNT.  */
4884 detect_eol (coding
, source
, src_bytes
)
4885 struct coding_system
*coding
;
4886 unsigned char *source
;
4887 EMACS_INT src_bytes
;
4889 Lisp_Object attrs
, coding_type
;
4890 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4893 int eol_seen
= EOL_SEEN_NONE
;
4895 attrs
= CODING_ID_ATTRS (coding
->id
);
4896 coding_type
= CODING_ATTR_TYPE (attrs
);
/* The two-byte scan below reads CODING->spec.utf_16, so it must be
   selected by the utf-16 type.  detect_coding_system calls this
   function for Qutf_16 coding systems, and detect_coding never calls
   it for CCL based ones.  */
4898 if (EQ (coding_type
, Qutf_16
))
4902 msb
= coding
->spec
.utf_16
.endian
== utf_16_little_endian
;
4905 while (src
+ 1 < src_end
)
4908 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4913 this_eol
= EOL_SEEN_LF
;
4914 else if (src
+ 3 >= src_end
4915 || src
[msb
+ 2] != 0
4916 || src
[lsb
+ 2] != '\n')
4917 this_eol
= EOL_SEEN_CR
;
4919 this_eol
= EOL_SEEN_CRLF
;
4921 if (eol_seen
== EOL_SEEN_NONE
)
4922 /* This is the first end-of-line. */
4923 eol_seen
= this_eol
;
4924 else if (eol_seen
!= this_eol
)
4926 /* The found type is different from what was found before. */
4927 eol_seen
= EOL_SEEN_LF
;
4930 if (++total
== MAX_EOL_CHECK_COUNT
)
4938 while (src
< src_end
)
4941 if (c
== '\n' || c
== '\r')
4946 this_eol
= EOL_SEEN_LF
;
4947 else if (src
>= src_end
|| *src
!= '\n')
4948 this_eol
= EOL_SEEN_CR
;
4950 this_eol
= EOL_SEEN_CRLF
, src
++;
4952 if (eol_seen
== EOL_SEEN_NONE
)
4953 /* This is the first end-of-line. */
4954 eol_seen
= this_eol
;
4955 else if (eol_seen
!= this_eol
)
4957 /* The found type is different from what was found before. */
4958 eol_seen
= EOL_SEEN_LF
;
4961 if (++total
== MAX_EOL_CHECK_COUNT
)
/* Replace CODING->id by the id of the coding system stored in
   CODING's EOL-type vector that corresponds to EOL_SEEN.  EOL_SEEN is
   tested as a bit mask: EOL_SEEN_LF is checked first (element 0),
   then EOL_SEEN_CRLF (element 1), then EOL_SEEN_CR (element 2), so
   when several bits are set LF takes precedence.  If no bit is set,
   CODING->id is left alone.  The callers in this file invoke this
   only after checking that the EOL type is a vector.  */
4971 adjust_coding_eol_type (coding
, eol_seen
)
4972 struct coding_system
*coding
;
4975 Lisp_Object eol_type
;
4977 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4978 if (eol_seen
& EOL_SEEN_LF
)
4979 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
4980 else if (eol_seen
& EOL_SEEN_CRLF
)
4981 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
4982 else if (eol_seen
& EOL_SEEN_CR
)
4983 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
4986 /* Detect how a text specified in CODING is encoded. If a coding
4987 system is detected, update fields of CODING by the detected coding
4991 detect_coding (coding
)
4992 struct coding_system
*coding
;
4994 unsigned char *src
, *src_end
;
4995 Lisp_Object attrs
, coding_type
;
4997 coding
->consumed
= coding
->consumed_char
= 0;
4998 coding
->produced
= coding
->produced_char
= 0;
4999 coding_set_source (coding
);
5001 src_end
= coding
->source
+ coding
->src_bytes
;
5003 /* If we have not yet decided the text encoding type, detect it
5005 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5007 int mask
= CATEGORY_MASK_ANY
;
5010 for (src
= coding
->source
; src
< src_end
; src
++)
5013 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5015 || c
== ISO_CODE_SO
)))
5018 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5020 if (coding
->head_ascii
< coding
->src_bytes
)
5024 for (i
= 0; i
< coding_category_raw_text
; i
++)
5026 enum coding_category category
= coding_priorities
[i
];
5027 struct coding_system
*this = coding_categories
+ category
;
5029 if (category
>= coding_category_raw_text
5030 || detected
& (1 << category
))
5035 /* No coding system of this category is defined. */
5036 mask
&= ~(1 << category
);
5040 detected
|= detected_mask
[category
];
5041 if ((*(this->detector
)) (coding
, &mask
))
5046 setup_coding_system (Qraw_text
, coding
);
5047 else if (mask
!= CATEGORY_MASK_ANY
)
5048 for (i
= 0; i
< coding_category_raw_text
; i
++)
5050 enum coding_category category
= coding_priorities
[i
];
5051 struct coding_system
*this = coding_categories
+ category
;
5053 if (mask
& (1 << category
))
5055 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5062 attrs
= CODING_ID_ATTRS (coding
->id
);
5063 coding_type
= CODING_ATTR_TYPE (attrs
);
5065 /* If we have not yet decided the EOL type, detect it now. But, the
5066 detection is impossible for a CCL based coding system, in which
5067 case, we detect the EOL type after decoding. */
5068 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5069 && ! EQ (coding_type
, Qccl
))
5071 int eol_seen
= detect_eol (coding
, coding
->source
, coding
->src_bytes
);
5073 if (eol_seen
!= EOL_SEEN_NONE
)
5074 adjust_coding_eol_type (coding
, eol_seen
);
5081 struct coding_system
*coding
;
5083 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5085 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5086 unsigned char *pend
= p
+ coding
->produced
;
5087 int eol_seen
= EOL_SEEN_NONE
;
5089 for (; p
< pend
; p
++)
5092 eol_seen
|= EOL_SEEN_LF
;
5093 else if (*p
== '\r')
5095 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5097 eol_seen
|= EOL_SEEN_CRLF
;
5101 eol_seen
|= EOL_SEEN_CR
;
5104 if (eol_seen
!= EOL_SEEN_NONE
)
5105 adjust_coding_eol_type (coding
, eol_seen
);
5108 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5110 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5111 unsigned char *pend
= p
+ coding
->produced
;
5113 for (; p
< pend
; p
++)
5117 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5119 unsigned char *p
, *pbeg
, *pend
;
5120 Lisp_Object undo_list
;
5122 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5123 coding
->dst_pos_byte
+ coding
->produced
);
5124 undo_list
= current_buffer
->undo_list
;
5125 current_buffer
->undo_list
= Qt
;
5126 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5127 current_buffer
->undo_list
= undo_list
;
5129 pend
= pbeg
+ coding
->produced
;
5131 for (p
= pend
- 1; p
>= pbeg
; p
--)
5134 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5137 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5138 coding
->produced
= pend
- pbeg
;
5139 insert_from_gap (coding
->produced_char
, coding
->produced
);
/* Translate characters held in CODING->charbuf through TABLE, storing
   each result of translate_char back into the buffer in place.  The
   range walked is the first CODING->charbuf_used elements.
   CODING->chars_at_source is checked first, before the loop.

   Note the argument order: CODING comes first, TABLE second.  This is
   an old-style definition, so the compiler does not check the
   argument types at call sites.  */
5144 translate_chars (coding
, table
)
5145 struct coding_system
*coding
;
5148 int *charbuf
= coding
->charbuf
;
5149 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5152 if (coding
->chars_at_source
)
5155 while (charbuf
< charbuf_end
)
5161 *charbuf
++ = translate_char (table
, c
);
5166 produce_chars (coding
)
5167 struct coding_system
*coding
;
5169 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5170 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5172 int produced_chars
= 0;
5174 if (! coding
->chars_at_source
)
5176 /* Characters are in coding->charbuf. */
5177 int *buf
= coding
->charbuf
;
5178 int *buf_end
= buf
+ coding
->charbuf_used
;
5179 unsigned char *adjusted_dst_end
;
5181 if (BUFFERP (coding
->src_object
)
5182 && EQ (coding
->src_object
, coding
->dst_object
))
5183 dst_end
= coding
->source
+ coding
->consumed
;
5184 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5186 while (buf
< buf_end
)
5190 if (dst
>= adjusted_dst_end
)
5192 dst
= alloc_destination (coding
,
5193 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5195 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5196 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5200 if (coding
->dst_multibyte
5201 || ! CHAR_BYTE8_P (c
))
5202 CHAR_STRING_ADVANCE (c
, dst
);
5204 *dst
++ = CHAR_TO_BYTE8 (c
);
5208 /* This is an annotation data. */
5214 unsigned char *src
= coding
->source
;
5215 unsigned char *src_end
= src
+ coding
->src_bytes
;
5216 Lisp_Object eol_type
;
5218 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5220 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5222 if (coding
->src_multibyte
)
5229 unsigned char *src_base
= src
;
5235 if (EQ (eol_type
, Qdos
))
5241 else if (EQ (eol_type
, Qmac
))
5246 coding
->consumed
= src
- coding
->source
;
5248 if (EQ (coding
->src_object
, coding
->dst_object
))
5252 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5254 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5255 coding_set_source (coding
);
5256 src
= coding
->source
+ coding
->consumed
;
5257 src_end
= coding
->source
+ coding
->src_bytes
;
5267 while (src
< src_end
)
5274 if (EQ (eol_type
, Qdos
))
5280 else if (EQ (eol_type
, Qmac
))
5283 if (dst
>= dst_end
- 1)
5285 coding
->consumed
= src
- coding
->source
;
5287 if (EQ (coding
->src_object
, coding
->dst_object
))
5289 if (dst
>= dst_end
- 1)
5291 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5293 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5294 coding_set_source (coding
);
5295 src
= coding
->source
+ coding
->consumed
;
5296 src_end
= coding
->source
+ coding
->src_bytes
;
5304 if (!EQ (coding
->src_object
, coding
->dst_object
))
5306 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5310 EMACS_INT offset
= src
- coding
->source
;
5312 dst
= alloc_destination (coding
, require
, dst
);
5313 coding_set_source (coding
);
5314 src
= coding
->source
+ offset
;
5315 src_end
= coding
->source
+ coding
->src_bytes
;
5318 produced_chars
= coding
->src_chars
;
5319 while (src
< src_end
)
5325 if (EQ (eol_type
, Qdos
))
5332 else if (EQ (eol_type
, Qmac
))
5338 coding
->consumed
= coding
->src_bytes
;
5339 coding
->consumed_char
= coding
->src_chars
;
5342 produced
= dst
- (coding
->destination
+ coding
->produced
);
5343 if (BUFFERP (coding
->dst_object
))
5344 insert_from_gap (produced_chars
, produced
);
5345 coding
->produced
+= produced
;
5346 coding
->produced_char
+= produced_chars
;
5347 return produced_chars
;
5350 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5352 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5356 produce_composition (coding
, charbuf
)
5357 struct coding_system
*coding
;
5363 enum composition_method method
;
5365 Lisp_Object components
;
5367 buffer
= coding
->dst_object
;
5369 pos
= coding
->dst_pos
+ charbuf
[1];
5370 method
= (enum composition_method
) (charbuf
[3]);
5371 cmp_len
= charbuf
[4];
5373 if (method
== COMPOSITION_RELATIVE
)
5377 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5382 for (i
= 0; i
< len
; i
++)
5383 args
[i
] = make_number (charbuf
[i
]);
5384 components
= (method
== COMPOSITION_WITH_ALTCHARS
5385 ? Fstring (len
, args
) : Fvector (len
, args
));
5387 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
5391 save_composition_data (buf
, buf_end
, prop
)
5395 enum composition_method method
= COMPOSITION_METHOD (prop
);
5396 int cmp_len
= COMPOSITION_LENGTH (prop
);
5398 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5401 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5405 if (method
== COMPOSITION_RELATIVE
)
5409 Lisp_Object components
;
5412 components
= COMPOSITION_COMPONENTS (prop
);
5413 if (VECTORP (components
))
5415 len
= XVECTOR (components
)->size
;
5416 for (i
= 0; i
< len
; i
++)
5417 buf
[4 + i
] = XINT (AREF (components
, i
));
5419 else if (STRINGP (components
))
5423 len
= XSTRING (components
)->size
;
5426 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5428 else if (INTEGERP (components
))
5431 buf
[4] = XINT (components
);
5433 else if (CONSP (components
))
5435 for (len
= 0; CONSP (components
);
5436 len
++, components
= XCDR (components
))
5437 buf
[4 + len
] = XINT (XCAR (components
));
5443 return (buf
+ buf
[0]);
5446 #define CHARBUF_SIZE 0x4000
5448 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5450 int size = CHARBUF_SIZE;; \
5452 coding->charbuf = NULL; \
5453 while (size > 1024) \
5455 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5456 if (coding->charbuf) \
5460 if (! coding->charbuf) \
5462 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5463 return coding->result; \
5465 coding->charbuf_size = size; \
5470 produce_annotation (coding
)
5471 struct coding_system
*coding
;
5473 int *charbuf
= coding
->charbuf
;
5474 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5476 while (charbuf
< charbuf_end
)
5482 int len
= -*charbuf
;
5485 case CODING_ANNOTATE_COMPOSITION_MASK
:
5486 produce_composition (coding
, charbuf
);
5496 /* Decode the data at CODING->src_object into CODING->dst_object.
5497 CODING->src_object is a buffer, a string, or nil.
5498 CODING->dst_object is a buffer.
5500 If CODING->src_object is a buffer, it must be the current buffer.
5501 In this case, if CODING->src_pos is positive, it is a position of
5502 the source text in the buffer, otherwise, the source text is in the
5503 gap area of the buffer, and CODING->src_pos specifies the offset of
5504 the text from GPT (which must be the same as PT). If this is the
5505 same buffer as CODING->dst_object, CODING->src_pos must be
5508 If CODING->src_object is a string, CODING->src_pos is an index to
5511 If CODING->src_object is nil, CODING->source must already point to
5512 the non-relocatable memory area. In this case, CODING->src_pos is
5513 an offset from CODING->source.
5515 The decoded data is inserted at the current point of the buffer
/* Decode the source text described by CODING and return
   CODING->result.  Each round of the main loop sets up the source,
   runs CODING->decoder, applies the decode translation table when one
   is set, sets up the destination, produces the characters and, if
   the decoder flagged any, the annotations.  The loop runs while
   source bytes remain and CODING->result is zero.  Bytes still
   unconsumed afterwards are either flushed as binary characters (when
   CODING_MODE_LAST_BLOCK is set) or recorded in CODING->carryover.  */
5520 decode_coding (coding
)
5521 struct coding_system
*coding
;
5525 if (BUFFERP (coding
->src_object
)
5526 && coding
->src_pos
> 0
5527 && coding
->src_pos
< GPT
5528 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5529 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5531 if (BUFFERP (coding
->dst_object
))
5533 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5534 set_buffer_internal (XBUFFER (coding
->dst_object
));
5536 move_gap_both (PT
, PT_BYTE
);
5539 coding
->consumed
= coding
->consumed_char
= 0;
5540 coding
->produced
= coding
->produced_char
= 0;
5541 coding
->chars_at_source
= 0;
5542 coding
->result
= CODING_RESULT_SUCCESS
;
5545 ALLOC_CONVERSION_WORK_AREA (coding
);
5547 attrs
= CODING_ID_ATTRS (coding
->id
);
5551 coding_set_source (coding
);
5552 coding
->annotated
= 0;
5553 (*(coding
->decoder
)) (coding
);
5554 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
/* translate_chars is defined as (CODING, TABLE); being an old-style
   definition, a swapped call is not diagnosed by the compiler.  */
5555 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5556 coding_set_destination (coding
);
5557 produce_chars (coding
);
5558 if (coding
->annotated
)
5559 produce_annotation (coding
);
5561 while (coding
->consumed
< coding
->src_bytes
5562 && ! coding
->result
);
5564 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5565 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5566 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5567 decode_eol (coding
);
5569 coding
->carryover_bytes
= 0;
5570 if (coding
->consumed
< coding
->src_bytes
)
5572 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5575 coding_set_source (coding
);
5576 coding_set_destination (coding
);
5577 src
= coding
->source
+ coding
->consumed
;
5579 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5581 /* Flush out unprocessed data as binary chars. We are sure
5582 that the number of data is less than the size of
5584 int *charbuf
= coding
->charbuf
;
5586 while (nbytes
-- > 0)
5589 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5591 produce_chars (coding
);
5595 /* Record unprocessed bytes in coding->carryover. We are
5596 sure that the number of data is less than the size of
5597 coding->carryover. */
5598 unsigned char *p
= coding
->carryover
;
5600 coding
->carryover_bytes
= nbytes
;
5601 while (nbytes
-- > 0)
5604 coding
->consumed
= coding
->src_bytes
;
5607 return coding
->result
;
5611 consume_chars (coding
)
5612 struct coding_system
*coding
;
5614 int *buf
= coding
->charbuf
;
5615 /* -1 is to compensate for CRLF. */
5616 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5617 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5618 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5619 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5620 int multibytep
= coding
->src_multibyte
;
5621 Lisp_Object eol_type
;
5623 int start
, end
, stop
;
5624 Lisp_Object object
, prop
;
5626 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5627 if (VECTORP (eol_type
))
5630 object
= coding
->src_object
;
5632 /* Note: composition handling is not yet implemented. */
5633 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5635 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5636 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5639 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5640 && end
<= end_pos
)))
5645 while (buf
< buf_end
)
5653 p
= save_composition_data (buf
, buf_end
, prop
);
5657 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5667 c
= STRING_CHAR_ADVANCE (src
);
5668 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5670 if (! EQ (eol_type
, Qunix
))
5674 if (EQ (eol_type
, Qdos
))
5684 coding
->consumed
= src
- coding
->source
;
5685 coding
->consumed_char
= pos
- coding
->src_pos
;
5686 coding
->charbuf_used
= buf
- coding
->charbuf
;
5687 coding
->chars_at_source
= 0;
5691 /* Encode the text at CODING->src_object into CODING->dst_object.
5692 CODING->src_object is a buffer or a string.
5693 CODING->dst_object is a buffer or nil.
5695 If CODING->src_object is a buffer, it must be the current buffer.
5696 In this case, if CODING->src_pos is positive, it is a position of
5697 the source text in the buffer, otherwise, the source text is in the
5698 gap area of the buffer, and coding->src_pos specifies the offset of
5699 the text from GPT (which must be the same as PT). If this is the
5700 same buffer as CODING->dst_object, CODING->src_pos must be
5701 negative and CODING should not have `pre-write-conversion'.
5703 If CODING->src_object is a string, CODING should not have
5704 `pre-write-conversion'.
5706 If CODING->dst_object is a buffer, the encoded data is inserted at
5707 the current point of that buffer.
5709 If CODING->dst_object is nil, the encoded data is placed at the
5710 memory area specified by CODING->destination. */
/* Encode the source text described by CODING and return
   CODING->result.  When the destination is a buffer, that buffer is
   made current and CODING->dst_multibyte is taken from it.  Each
   round of the loop sets up the source, loads characters with
   consume_chars, applies the encode translation table when one is
   set, sets up the destination and runs CODING->encoder.  The loop
   repeats while CODING->consumed_char is below CODING->src_chars.
   For a buffer destination the produced bytes are finally inserted
   from the gap.  */
5713 encode_coding (coding
)
5714 struct coding_system
*coding
;
5718 attrs
= CODING_ID_ATTRS (coding
->id
);
5720 if (BUFFERP (coding
->dst_object
))
5722 set_buffer_internal (XBUFFER (coding
->dst_object
));
5723 coding
->dst_multibyte
5724 = ! NILP (current_buffer
->enable_multibyte_characters
);
5727 coding
->consumed
= coding
->consumed_char
= 0;
5728 coding
->produced
= coding
->produced_char
= 0;
5729 coding
->result
= CODING_RESULT_SUCCESS
;
5732 ALLOC_CONVERSION_WORK_AREA (coding
);
5735 coding_set_source (coding
);
5736 consume_chars (coding
);
5738 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
/* translate_chars is defined as (CODING, TABLE); being an old-style
   definition, a swapped call is not diagnosed by the compiler.  */
5739 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
5741 coding_set_destination (coding
);
5742 (*(coding
->encoder
)) (coding
);
5743 } while (coding
->consumed_char
< coding
->src_chars
);
5745 if (BUFFERP (coding
->dst_object
))
5746 insert_from_gap (coding
->produced_char
, coding
->produced
);
5748 return (coding
->result
);
5753 /* List of currently used working buffers. */
5754 Lisp_Object Vcode_conversion_work_buf_list
;
5756 /* A working buffer used by the top level conversion. */
5757 Lisp_Object Vcode_conversion_reused_work_buf
;
5760 /* Return a working buffer that can be freely used by the following
5761 code conversion. MULTIBYTEP specifies the multibyteness of the
5765 make_conversion_work_buffer (multibytep
)
5768 struct buffer
*current
= current_buffer
;
5771 if (NILP (Vcode_conversion_work_buf_list
))
5773 if (NILP (Vcode_conversion_reused_work_buf
))
5774 Vcode_conversion_reused_work_buf
5775 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5776 Vcode_conversion_work_buf_list
5777 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
5781 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5784 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5785 Vcode_conversion_work_buf_list
5786 = Fcons (Fget_buffer_create (build_string (str
)),
5787 Vcode_conversion_work_buf_list
);
5790 buf
= XCAR (Vcode_conversion_work_buf_list
);
5791 set_buffer_internal (XBUFFER (buf
));
5792 current_buffer
->undo_list
= Qt
;
5794 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
, Qnil
);
5795 set_buffer_internal (current
);
5799 static struct coding_system
*saved_coding
;
/* Unwind handler that the conversion entry points in this file
   register with record_unwind_protect, passing the value of
   save_excursion_save as INFO.  It pops the first element of
   Vcode_conversion_work_buf_list, frees saved_coding->destination
   when saved_coding->dst_object is Qt and the pointer is non-null,
   and finally returns the result of save_excursion_restore (INFO).
   NOTE(review): the popped buffer gets extra handling when the list
   depth was greater than 1 and the buffer is still live; presumably
   it is disposed of -- confirm.  */
5802 code_conversion_restore (info
)
5805 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5810 buf
= XCAR (Vcode_conversion_work_buf_list
);
5811 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5812 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
5816 if (EQ (saved_coding
->dst_object
, Qt
)
5817 && saved_coding
->destination
)
5818 xfree (saved_coding
->destination
);
5820 return save_excursion_restore (info
);
5825 decode_coding_gap (coding
, chars
, bytes
)
5826 struct coding_system
*coding
;
5827 EMACS_INT chars
, bytes
;
5829 int count
= specpdl_ptr
- specpdl
;
5831 saved_coding
= coding
;
5832 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5834 coding
->src_object
= Fcurrent_buffer ();
5835 coding
->src_chars
= chars
;
5836 coding
->src_bytes
= bytes
;
5837 coding
->src_pos
= -chars
;
5838 coding
->src_pos_byte
= -bytes
;
5839 coding
->src_multibyte
= chars
< bytes
;
5840 coding
->dst_object
= coding
->src_object
;
5841 coding
->dst_pos
= PT
;
5842 coding
->dst_pos_byte
= PT_BYTE
;
5843 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
5845 if (CODING_REQUIRE_DETECTION (coding
))
5846 detect_coding (coding
);
5848 decode_coding (coding
);
5850 unbind_to (count
, Qnil
);
5851 return coding
->result
;
5855 encode_coding_gap (coding
, chars
, bytes
)
5856 struct coding_system
*coding
;
5857 EMACS_INT chars
, bytes
;
5859 int count
= specpdl_ptr
- specpdl
;
5862 saved_coding
= coding
;
5863 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5865 buffer
= Fcurrent_buffer ();
5866 coding
->src_object
= buffer
;
5867 coding
->src_chars
= chars
;
5868 coding
->src_bytes
= bytes
;
5869 coding
->src_pos
= -chars
;
5870 coding
->src_pos_byte
= -bytes
;
5871 coding
->src_multibyte
= chars
< bytes
;
5872 coding
->dst_object
= coding
->src_object
;
5873 coding
->dst_pos
= PT
;
5874 coding
->dst_pos_byte
= PT_BYTE
;
5876 encode_coding (coding
);
5878 unbind_to (count
, Qnil
);
5879 return coding
->result
;
5883 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5884 SRC_OBJECT into DST_OBJECT by coding context CODING.
5886 SRC_OBJECT is a buffer, a string, or Qnil.
5888 If it is a buffer, the text is at point of the buffer. FROM and TO
5889 are positions in the buffer.
5891 If it is a string, the text is at the beginning of the string.
5892 FROM and TO are indices to the string.
5894 If it is nil, the text is at coding->source. FROM and TO are
5895 indices to coding->source.
5897 DST_OBJECT is a buffer, Qt, or Qnil.
5899 If it is a buffer, the decoded text is inserted at point of the
5900 buffer. If the buffer is the same as SRC_OBJECT, the source text
5903 If it is Qt, a string is made from the decoded text, and
5904 set in CODING->dst_object.
5906 If it is Qnil, the decoded text is stored at CODING->destination.
5907 The caller must allocate CODING->dst_bytes bytes at
5908 CODING->destination by xmalloc. If the decoded text is longer than
5909 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5913 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5915 struct coding_system
*coding
;
5916 Lisp_Object src_object
;
5917 EMACS_INT from
, from_byte
, to
, to_byte
;
5918 Lisp_Object dst_object
;
5920 int count
= specpdl_ptr
- specpdl
;
5921 unsigned char *destination
;
5922 EMACS_INT dst_bytes
;
5923 EMACS_INT chars
= to
- from
;
5924 EMACS_INT bytes
= to_byte
- from_byte
;
5927 saved_coding
= coding
;
5928 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5930 if (NILP (dst_object
))
5932 destination
= coding
->destination
;
5933 dst_bytes
= coding
->dst_bytes
;
5936 coding
->src_object
= src_object
;
5937 coding
->src_chars
= chars
;
5938 coding
->src_bytes
= bytes
;
5939 coding
->src_multibyte
= chars
< bytes
;
5941 if (STRINGP (src_object
))
5943 coding
->src_pos
= from
;
5944 coding
->src_pos_byte
= from_byte
;
5946 else if (BUFFERP (src_object
))
5948 set_buffer_internal (XBUFFER (src_object
));
5950 move_gap_both (from
, from_byte
);
5951 if (EQ (src_object
, dst_object
))
5953 TEMP_SET_PT_BOTH (from
, from_byte
);
5954 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5955 coding
->src_pos
= -chars
;
5956 coding
->src_pos_byte
= -bytes
;
5960 coding
->src_pos
= from
;
5961 coding
->src_pos_byte
= from_byte
;
5965 if (CODING_REQUIRE_DETECTION (coding
))
5966 detect_coding (coding
);
5967 attrs
= CODING_ID_ATTRS (coding
->id
);
5969 if (! NILP (CODING_ATTR_POST_READ (attrs
))
5970 || EQ (dst_object
, Qt
))
5972 coding
->dst_object
= make_conversion_work_buffer (1);
5973 coding
->dst_pos
= BEG
;
5974 coding
->dst_pos_byte
= BEG_BYTE
;
5975 coding
->dst_multibyte
= 1;
5977 else if (BUFFERP (dst_object
))
5979 coding
->dst_object
= dst_object
;
5980 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
5981 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
5982 coding
->dst_multibyte
5983 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
5987 coding
->dst_object
= Qnil
;
5988 coding
->dst_multibyte
= 1;
5991 decode_coding (coding
);
5993 if (BUFFERP (coding
->dst_object
))
5994 set_buffer_internal (XBUFFER (coding
->dst_object
));
5996 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
5998 struct gcpro gcpro1
, gcpro2
;
5999 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6002 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6003 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6004 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6005 make_number (coding
->produced_char
));
6008 coding
->produced_char
+= Z
- prev_Z
;
6009 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6012 if (EQ (dst_object
, Qt
))
6014 coding
->dst_object
= Fbuffer_string ();
6016 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6018 set_buffer_internal (XBUFFER (coding
->dst_object
));
6019 if (dst_bytes
< coding
->produced
)
6022 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6025 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6026 unbind_to (count
, Qnil
);
6029 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6030 move_gap_both (BEGV
, BEGV_BYTE
);
6031 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6032 coding
->destination
= destination
;
6036 unbind_to (count
, Qnil
);
6041 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6043 struct coding_system
*coding
;
6044 Lisp_Object src_object
;
6045 EMACS_INT from
, from_byte
, to
, to_byte
;
6046 Lisp_Object dst_object
;
6048 int count
= specpdl_ptr
- specpdl
;
6049 EMACS_INT chars
= to
- from
;
6050 EMACS_INT bytes
= to_byte
- from_byte
;
6053 saved_coding
= coding
;
6054 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
6056 coding
->src_object
= src_object
;
6057 coding
->src_chars
= chars
;
6058 coding
->src_bytes
= bytes
;
6059 coding
->src_multibyte
= chars
< bytes
;
6061 attrs
= CODING_ID_ATTRS (coding
->id
);
6063 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6065 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
6066 set_buffer_internal (XBUFFER (coding
->src_object
));
6067 if (STRINGP (src_object
))
6068 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6069 else if (BUFFERP (src_object
))
6070 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6072 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6074 if (EQ (src_object
, dst_object
))
6076 set_buffer_internal (XBUFFER (src_object
));
6077 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6078 set_buffer_internal (XBUFFER (coding
->src_object
));
6081 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6082 make_number (BEG
), make_number (Z
));
6083 coding
->src_object
= Fcurrent_buffer ();
6085 move_gap_both (BEG
, BEG_BYTE
);
6086 coding
->src_chars
= Z
- BEG
;
6087 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6088 coding
->src_pos
= BEG
;
6089 coding
->src_pos_byte
= BEG_BYTE
;
6090 coding
->src_multibyte
= Z
< Z_BYTE
;
6092 else if (STRINGP (src_object
))
6094 coding
->src_pos
= from
;
6095 coding
->src_pos_byte
= from_byte
;
6097 else if (BUFFERP (src_object
))
6099 set_buffer_internal (XBUFFER (src_object
));
6101 move_gap_both (from
, from_byte
);
6102 if (EQ (src_object
, dst_object
))
6104 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6105 coding
->src_pos
= -chars
;
6106 coding
->src_pos_byte
= -bytes
;
6110 coding
->src_pos
= from
;
6111 coding
->src_pos_byte
= from_byte
;
6115 if (BUFFERP (dst_object
))
6117 coding
->dst_object
= dst_object
;
6118 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6119 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6120 coding
->dst_multibyte
6121 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6123 else if (EQ (dst_object
, Qt
))
6125 coding
->dst_object
= Qnil
;
6126 coding
->dst_bytes
= coding
->src_chars
;
6127 if (coding
->dst_bytes
== 0)
6128 coding
->dst_bytes
= 1;
6129 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6130 coding
->dst_multibyte
= 0;
6134 coding
->dst_object
= Qnil
;
6135 coding
->dst_multibyte
= 0;
6138 encode_coding (coding
);
6140 if (EQ (dst_object
, Qt
))
6142 if (BUFFERP (coding
->dst_object
))
6143 coding
->dst_object
= Fbuffer_string ();
6147 = make_unibyte_string ((char *) coding
->destination
,
6149 xfree (coding
->destination
);
6153 unbind_to (count
, Qnil
);
/* Return the name of the coding system currently assigned to the
   category listed first in coding_priorities.  */
6158 preferred_coding_system ()
6160 int id
= coding_categories
[coding_priorities
[0]].id
;
6162 return CODING_ID_NAME (id
);
6167 /*** 8. Emacs Lisp library functions ***/
6169 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6170 doc
: /* Return t if OBJECT is nil or a coding-system.
6171 See the documentation of `define-coding-system' for information
6172 about coding-system objects. */)
6176 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6179 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6180 Sread_non_nil_coding_system
, 1, 1, 0,
6181 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6188 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6189 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6191 while (XSTRING (val
)->size
== 0);
6192 return (Fintern (val
, Qnil
));
6195 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6196 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6197 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6198 (prompt
, default_coding_system
)
6199 Lisp_Object prompt
, default_coding_system
;
6202 if (SYMBOLP (default_coding_system
))
6203 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6204 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6205 Qt
, Qnil
, Qcoding_system_history
,
6206 default_coding_system
, Qnil
);
6207 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6210 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6212 doc
: /* Check validity of CODING-SYSTEM.
6213 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6214 It is valid if it is a symbol with a non-nil `coding-system' property.
6215 The value of property should be a vector of length 5. */)
6217 Lisp_Object coding_system
;
6219 CHECK_SYMBOL (coding_system
);
6220 if (!NILP (Fcoding_system_p (coding_system
)))
6221 return coding_system
;
6223 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6228 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6230 int src_bytes
, highest
;
6232 Lisp_Object coding_system
;
6234 unsigned char *src_end
= src
+ src_bytes
;
6235 int mask
= CATEGORY_MASK_ANY
;
6238 Lisp_Object attrs
, eol_type
;
6240 struct coding_system coding
;
6242 if (NILP (coding_system
))
6243 coding_system
= Qundecided
;
6244 setup_coding_system (coding_system
, &coding
);
6245 attrs
= CODING_ID_ATTRS (coding
.id
);
6246 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6248 coding
.source
= src
;
6249 coding
.src_bytes
= src_bytes
;
6250 coding
.src_multibyte
= multibytep
;
6251 coding
.consumed
= 0;
6253 if (XINT (CODING_ATTR_CATEGORY (attrs
)) != coding_category_undecided
)
6255 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6259 coding_system
= Qnil
;
6260 for (; src
< src_end
; src
++)
6263 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
6265 || c
== ISO_CODE_SO
)))
6268 coding
.head_ascii
= src
- coding
.source
;
6271 for (i
= 0; i
< coding_category_raw_text
; i
++)
6273 enum coding_category category
= coding_priorities
[i
];
6274 struct coding_system
*this = coding_categories
+ category
;
6276 if (category
>= coding_category_raw_text
6277 || detected
& (1 << category
))
6282 /* No coding system of this category is defined. */
6283 mask
&= ~(1 << category
);
6287 detected
|= detected_mask
[category
];
6288 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6291 mask
&= detected_mask
[category
];
6299 val
= Fcons (make_number (coding_category_raw_text
), Qnil
);
6300 else if (mask
== CATEGORY_MASK_ANY
)
6301 val
= Fcons (make_number (coding_category_undecided
), Qnil
);
6304 for (i
= 0; i
< coding_category_raw_text
; i
++)
6305 if (mask
& (1 << coding_priorities
[i
]))
6307 val
= Fcons (make_number (coding_priorities
[i
]), Qnil
);
6314 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6315 if (mask
& (1 << coding_priorities
[i
]))
6316 val
= Fcons (make_number (coding_priorities
[i
]), val
);
6320 int one_byte_eol
= -1, two_byte_eol
= -1;
6323 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6325 struct coding_system
*this
6326 = (NILP (coding_system
) ? coding_categories
+ XINT (XCAR (tail
))
6330 attrs
= CODING_ID_ATTRS (this->id
);
6331 eol_type
= CODING_ID_EOL_TYPE (this->id
);
6332 XSETCAR (tail
, CODING_ID_NAME (this->id
));
6333 if (VECTORP (eol_type
))
6335 if (EQ (CODING_ATTR_TYPE (attrs
), Qutf_16
))
6337 if (two_byte_eol
< 0)
6338 two_byte_eol
= detect_eol (this, coding
.source
, src_bytes
);
6339 this_eol
= two_byte_eol
;
6343 if (one_byte_eol
< 0)
6344 one_byte_eol
=detect_eol (this, coding
.source
, src_bytes
);
6345 this_eol
= one_byte_eol
;
6347 if (this_eol
== EOL_SEEN_LF
)
6348 XSETCAR (tail
, AREF (eol_type
, 0));
6349 else if (this_eol
== EOL_SEEN_CRLF
)
6350 XSETCAR (tail
, AREF (eol_type
, 1));
6351 else if (this_eol
== EOL_SEEN_CR
)
6352 XSETCAR (tail
, AREF (eol_type
, 2));
6357 return (highest
? XCAR (val
) : val
);
/* Lisp primitive `detect-coding-region': check START and END, convert
   them to byte positions, and pass the region's bytes to
   detect_coding_system.  */
6361 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6363 doc
: /* Detect coding system of the text in the region between START and END.
6364 Return a list of possible coding systems ordered by priority.
6366 If only ASCII characters are found, return a list with the single element
6367 `undecided' or its subsidiary coding system according to a detected
end-of-line format.
6370 If optional argument HIGHEST is non-nil, return the coding system of
6371 highest priority. */)
6372 (start
, end
, highest
)
6373 Lisp_Object start
, end
, highest
;
6376 int from_byte
, to_byte
;
6378 CHECK_NUMBER_COERCE_MARKER (start
);
6379 CHECK_NUMBER_COERCE_MARKER (end
);
6381 validate_region (&start
, &end
);
6382 from
= XINT (start
), to
= XINT (end
);
6383 from_byte
= CHAR_TO_BYTE (from
);
6384 to_byte
= CHAR_TO_BYTE (to
);
/* The region starts before the gap and ends at or after it: move the
   gap to TO before taking the address of the first byte (presumably so
   the bytes handed to the detector are contiguous).  */
6386 if (from
< GPT
&& to
>= GPT
)
6387 move_gap_both (to
, to_byte
);
6389 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6390 to_byte
- from_byte
,
6392 !NILP (current_buffer
6393 ->enable_multibyte_characters
),
/* Lisp primitive `detect-coding-string': check that STRING is a string
   and pass its data, byte length and multibyteness to
   detect_coding_system.  */
6397 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6399 doc
: /* Detect coding system of the text in STRING.
6400 Return a list of possible coding systems ordered by priority.
6402 If only ASCII characters are found, return a list with the single element
6403 `undecided' or its subsidiary coding system according to a detected
end-of-line format.
6406 If optional argument HIGHEST is non-nil, return the coding system of
6407 highest priority. */)
6409 Lisp_Object string
, highest
;
6411 CHECK_STRING (string
);
6413 return detect_coding_system (XSTRING (string
)->data
,
6414 STRING_BYTES (XSTRING (string
)),
6416 STRING_MULTIBYTE (string
),
/* Walk the charset list stored in the coding attributes ATTRS and test
   character C against each charset with CHAR_CHARSET_P.  The result is
   non-zero when the walk ended on a list cell rather than at the end of
   the list (presumably because a matching charset stopped the loop;
   the statement under the `if' is not visible here).  */
6422 char_encodable_p (c
, attrs
)
6427 struct charset
*charset
;
6429 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6430 CONSP (tail
); tail
= XCDR (tail
))
6432 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6433 if (CHAR_CHARSET_P (c
, charset
))
/* TAIL is nil only if the whole list was scanned.  */
6436 return (! NILP (tail
));
6440 /* Return a list of coding systems that safely encode the text between
6441 START and END. If EXCLUDE is non-nil, it is a list of coding
6442 systems not to check. The returned list doesn't contain any such
6443 coding systems. In any case, if the text contains only ASCII or is
6444 unibyte, return t.

START may also be a string, in which case the string's bytes are
scanned instead of buffer text.

The candidate list holds the attribute vectors of base coding systems;
a candidate that cannot encode some character is spliced out of the
list (or has its car set to nil when it is the last cell), and the
base names of the surviving candidates are returned. */
6446 DEFUN ("find-coding-systems-region-internal",
6447 Ffind_coding_systems_region_internal
,
6448 Sfind_coding_systems_region_internal
, 2, 3, 0,
6449 doc
: /* Internal use only. */)
6450 (start
, end
, exclude
)
6451 Lisp_Object start
, end
, exclude
;
6453 Lisp_Object coding_attrs_list
, safe_codings
;
6454 EMACS_INT start_byte
, end_byte
;
6455 const unsigned char *p
, *pbeg
, *pend
;
6457 Lisp_Object tail
, elt
;
6459 if (STRINGP (start
))
/* A unibyte string, or a multibyte one whose character count equals
   its byte count, needs no per-character check.  */
6461 if (!STRING_MULTIBYTE (start
)
6462 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
6465 end_byte
= STRING_BYTES (XSTRING (start
));
6469 CHECK_NUMBER_COERCE_MARKER (start
);
6470 CHECK_NUMBER_COERCE_MARKER (end
);
6471 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6472 args_out_of_range (start
, end
);
6473 if (NILP (current_buffer
->enable_multibyte_characters
))
6475 start_byte
= CHAR_TO_BYTE (XINT (start
));
6476 end_byte
= CHAR_TO_BYTE (XINT (end
));
6477 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* START and END are Lisp objects: extract their integer values before
   comparing them with GPT, doing arithmetic on them, or passing them
   to move_gap_both.  Using the raw objects only works by accident of
   the integer tag representation.  */
6480 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
6482 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
6483 move_gap_both (XINT (start
), start_byte
);
6485 move_gap_both (XINT (end
), end_byte
);
6489 coding_attrs_list
= Qnil
;
6490 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6492 || NILP (Fmemq (XCAR (tail
), exclude
)))
6496 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
/* Keep only base coding systems (not aliases or subsidiaries) whose
   type is not `undecided'.  */
6497 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6498 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6499 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6502 if (STRINGP (start
))
6503 p
= pbeg
= XSTRING (start
)->data
;
6505 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6506 pend
= p
+ (end_byte
- start_byte
);
/* Trim the leading and trailing runs of ASCII bytes.  */
6508 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6509 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6513 if (ASCII_BYTE_P (*p
))
6517 c
= STRING_CHAR_ADVANCE (p
);
6519 charset_map_loaded
= 0;
6520 for (tail
= coding_attrs_list
; CONSP (tail
);)
6525 else if (char_encodable_p (c
, elt
))
6527 else if (CONSP (XCDR (tail
)))
/* Drop this candidate by copying the next cell over it.  */
6529 XSETCAR (tail
, XCAR (XCDR (tail
)));
6530 XSETCDR (tail
, XCDR (XCDR (tail
)));
/* Last cell: it cannot be unlinked from here, so mark it dead.  */
6534 XSETCAR (tail
, Qnil
);
/* CHARSET_MAP_LOADED was cleared before the candidate loop; when it is
   set afterwards the text pointers are recomputed from their saved
   offsets (presumably because loading a charset map may relocate
   buffer or string data).  */
6538 if (charset_map_loaded
)
6540 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6542 if (STRINGP (start
))
6543 pbeg
= XSTRING (start
)->data
;
6545 pbeg
= BYTE_POS_ADDR (start_byte
);
6546 p
= pbeg
+ p_offset
;
6547 pend
= pbeg
+ pend_offset
;
6552 safe_codings
= Qnil
;
6553 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6554 if (! NILP (XCAR (tail
)))
6555 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6557 return safe_codings
;
/* Lisp primitive `check-coding-systems-region'.  For every coding
   system in CODING-SYSTEM-LIST an entry (NAME ATTRS . POSITIONS) is
   built; each non-ASCII character of the text is tested with
   char_encodable_p against every entry, and the position of a
   character an entry cannot encode is pushed onto that entry.  Only
   entries that collected at least one position are put in the
   result.  */
6561 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6562 Scheck_coding_systems_region
, 3, 3, 0,
6563 doc
: /* Check if the region is encodable by coding systems.
6565 START and END are buffer positions specifying the region.
6566 CODING-SYSTEM-LIST is a list of coding systems to check.
6568 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6569 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
6570 whole region, POS0, POS1, ... are buffer positions where non-encodable
6571 characters are found.
6573 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6576 START may be a string. In that case, check if the string is
6577 encodable, and the value contains indices to the string instead of
6578 buffer positions. END is ignored. */)
6579 (start
, end
, coding_system_list
)
6580 Lisp_Object start
, end
, coding_system_list
;
6583 EMACS_INT start_byte
, end_byte
;
6585 const unsigned char *p
, *pbeg
, *pend
;
6587 Lisp_Object tail
, elt
;
6589 if (STRINGP (start
))
/* Shortcut for text with no multibyte characters: a unibyte string, or
   a multibyte string whose character count equals its byte count.
   The previous test (`!multibyte && size != bytes') could never be
   true, so the shortcut was dead; this form matches
   find-coding-systems-region-internal and the buffer branch below.  */
6591 if (!STRING_MULTIBYTE (start
)
6592 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
6595 end_byte
= STRING_BYTES (XSTRING (start
));
6600 CHECK_NUMBER_COERCE_MARKER (start
);
6601 CHECK_NUMBER_COERCE_MARKER (end
);
6602 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6603 args_out_of_range (start
, end
);
6604 if (NILP (current_buffer
->enable_multibyte_characters
))
6606 start_byte
= CHAR_TO_BYTE (XINT (start
));
6607 end_byte
= CHAR_TO_BYTE (XINT (end
));
6608 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
/* START and END are Lisp objects: use their integer values when
   comparing with GPT and when calling move_gap_both.  */
6611 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
6613 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
6614 move_gap_both (XINT (start
), start_byte
);
6616 move_gap_both (XINT (end
), end_byte
);
6622 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6625 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6630 if (STRINGP (start
))
6631 p
= pbeg
= XSTRING (start
)->data
;
6633 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6634 pend
= p
+ (end_byte
- start_byte
);
/* Skip the leading ASCII run, keeping POS in step with P, and trim the
   trailing ASCII run.  */
6636 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6637 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6641 if (ASCII_BYTE_P (*p
))
6645 c
= STRING_CHAR_ADVANCE (p
);
6647 charset_map_loaded
= 0;
6648 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6650 elt
= XCDR (XCAR (tail
));
/* ELT is (ATTRS . POSITIONS); record POS when C is not encodable.  */
6651 if (! char_encodable_p (c
, XCAR (elt
)))
6652 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
/* Recompute the text pointers from their offsets when
   CHARSET_MAP_LOADED was set during the checks above.  */
6654 if (charset_map_loaded
)
6656 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6658 if (STRINGP (start
))
6659 pbeg
= XSTRING (start
)->data
;
6661 pbeg
= BYTE_POS_ADDR (start_byte
);
6662 p
= pbeg
+ p_offset
;
6663 pend
= pbeg
+ pend_offset
;
6671 for (; CONSP (tail
); tail
= XCDR (tail
))
/* Positions were pushed newest-first, hence the Fnreverse.  */
6674 if (CONSP (XCDR (XCDR (elt
))))
6675 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
/* Convert the text of the current buffer between START and END with
   CODING_SYSTEM (nil means `no-conversion').  The callers in this file
   pass ENCODEP 1 to encode and 0 to decode.  DST_OBJECT nil means the
   current buffer itself; otherwise it must be t or a buffer.  Signal an
   error when the conversion result is not CODING_RESULT_SUCCESS.
   Return the number of produced characters when DST_OBJECT is a
   buffer, otherwise the `dst_object' member of the coding structure.  */
6685 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6686 Lisp_Object start
, end
, coding_system
, dst_object
;
6687 int encodep
, norecord
;
6689 struct coding_system coding
;
6690 EMACS_INT from
, from_byte
, to
, to_byte
;
6691 Lisp_Object src_object
;
6693 CHECK_NUMBER_COERCE_MARKER (start
);
6694 CHECK_NUMBER_COERCE_MARKER (end
);
6695 if (NILP (coding_system
))
6696 coding_system
= Qno_conversion
;
6698 CHECK_CODING_SYSTEM (coding_system
);
6699 src_object
= Fcurrent_buffer ();
6700 if (NILP (dst_object
))
6701 dst_object
= src_object
;
6702 else if (! EQ (dst_object
, Qt
))
6703 CHECK_BUFFER (dst_object
);
6705 validate_region (&start
, &end
);
6706 from
= XFASTINT (start
);
6707 from_byte
= CHAR_TO_BYTE (from
);
6708 to
= XFASTINT (end
);
6709 to_byte
= CHAR_TO_BYTE (to
);
6711 setup_coding_system (coding_system
, &coding
);
/* The whole region is converted in one call, so it is the last block.  */
6712 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6715 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6718 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
/* NOTE(review): presumably guarded by NORECORD; the guarding line is
   not visible here.  */
6721 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6723 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6724 error ("Code conversion error: %d", coding
.result
);
6726 return (BUFFERP (dst_object
)
6727 ? make_number (coding
.produced_char
)
6728 : coding
.dst_object
);
/* Lisp primitive `decode-coding-region': calls code_convert_region
   with ENCODEP 0 and NORECORD 0.  */
6732 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6733 3, 4, "r\nzCoding system: ",
6734 doc
: /* Decode the current region from the specified coding system.
6735 When called from a program, takes four arguments:
6736 START, END, CODING-SYSTEM, and DESTINATION.
6737 START and END are buffer positions.
6739 Optional 4th argument DESTINATION specifies where the decoded text goes.
6740 If nil, the region between START and END is replaced by the decoded text.
6741 If buffer, the decoded text is inserted in the buffer.
6742 If t, the decoded text is returned.
6744 This function sets `last-coding-system-used' to the precise coding system
6745 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6746 not fully specified.)
6747 It returns the length of the decoded text. */)
6748 (start
, end
, coding_system
, destination
)
6749 Lisp_Object start
, end
, coding_system
, destination
;
6751 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
/* Lisp primitive `encode-coding-region': calls code_convert_region
   with ENCODEP 1 and NORECORD 0.  */
6754 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6755 3, 4, "r\nzCoding system: ",
6756 doc
: /* Encode the current region by specified coding system.
6757 When called from a program, takes four arguments:
6758 START, END, CODING-SYSTEM, and DESTINATION.
START and END are buffer positions.
6760 Optional 4th argument DESTINATION specifies where the encoded text goes.
6761 If nil, the region between START and END is replaced by the encoded text.
6762 If buffer, the encoded text is inserted in the buffer.
6763 If t, the encoded text is returned.
6765 This function sets `last-coding-system-used' to the precise coding system
6766 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6767 not fully specified.)
6768 It returns the length of the encoded text. */)
6769 (start
, end
, coding_system
, destination
)
6770 Lisp_Object start
, end
, coding_system
, destination
;
6772 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
/* Convert STRING with CODING_SYSTEM (nil means `no-conversion').  The
   callers in this file pass ENCODEP 1 to encode and 0 to decode.
   DST_OBJECT, when non-nil, must be t or a buffer.  NOCOPY non-zero
   means the caller accepts STRING itself as the result when no
   conversion is needed.  Signal an error when the conversion result is
   not CODING_RESULT_SUCCESS.  Return the number of produced characters
   when DST_OBJECT is a buffer, otherwise the `dst_object' member of
   the coding structure.  */
6776 code_convert_string (string
, coding_system
, dst_object
,
6777 encodep
, nocopy
, norecord
)
6778 Lisp_Object string
, coding_system
, dst_object
;
6779 int encodep
, nocopy
, norecord
;
6781 struct coding_system coding
;
6782 EMACS_INT chars
, bytes
;
6784 CHECK_STRING (string
);
6785 if (NILP (coding_system
))
6788 Vlast_coding_system_used
= Qno_conversion
;
6789 if (NILP (dst_object
))
/* Trivial conversion.  NOCOPY means "it is OK to return STRING
   itself" (see the doc strings of decode-coding-string and
   encode-coding-string), so copy only when NOCOPY is zero.  The two
   arms used to be swapped.  */
6790 return (nocopy
? string
: Fcopy_sequence (string
));
6793 if (NILP (coding_system
))
6794 coding_system
= Qno_conversion
;
6796 CHECK_CODING_SYSTEM (coding_system
);
6797 if (NILP (dst_object
))
6799 else if (! EQ (dst_object
, Qt
))
6800 CHECK_BUFFER (dst_object
);
6802 setup_coding_system (coding_system
, &coding
);
/* The whole string is converted in one call, so it is the last block.  */
6803 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6804 chars
= XSTRING (string
)->size
;
6805 bytes
= STRING_BYTES (XSTRING (string
));
6807 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6809 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6811 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6813 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6814 error ("Code conversion error: %d", coding
.result
);
6816 return (BUFFERP (dst_object
)
6817 ? make_number (coding
.produced_char
)
6818 : coding
.dst_object
);
6822 /* Encode or decode STRING according to CODING_SYSTEM.
6823 Do not set Vlast_coding_system_used.
6825 This function is called only from macros DECODE_FILE and
6826 ENCODE_FILE, thus we ignore character composition.

Implemented as a call to code_convert_string with DST_OBJECT t,
NOCOPY 0 and NORECORD 1. */
6829 code_convert_string_norecord (string
, coding_system
, encodep
)
6830 Lisp_Object string
, coding_system
;
6833 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
/* Lisp primitive `decode-coding-string': calls code_convert_string
   with ENCODEP 0, NOCOPY taken from the Lisp argument, and NORECORD 0.  */
6837 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6839 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6841 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6842 if the decoding operation is trivial.
6844 Optional fourth arg BUFFER non-nil means that the decoded text is
6845 inserted in BUFFER instead of returned as a string. In this case,
6846 the return value is BUFFER.
6848 This function sets `last-coding-system-used' to the precise coding system
6849 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6850 not fully specified.) */)
6851 (string
, coding_system
, nocopy
, buffer
)
6852 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6854 return code_convert_string (string
, coding_system
, buffer
,
6855 0, ! NILP (nocopy
), 0);
/* Lisp primitive `encode-coding-string': calls code_convert_string
   with ENCODEP 1, NOCOPY taken from the Lisp argument, and NORECORD 0.  */
6858 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6860 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6862 Optional third arg NOCOPY non-nil means it is OK to return STRING
6863 itself if the encoding operation is trivial.
6865 Optional fourth arg BUFFER non-nil means that the encoded text is
6866 inserted in BUFFER instead of returned as a string. In this case,
6867 the return value is BUFFER.
6869 This function sets `last-coding-system-used' to the precise coding system
6870 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6871 not fully specified.) */)
6872 (string
, coding_system
, nocopy
, buffer
)
6873 Lisp_Object string
, coding_system
, nocopy
, buffer
;
/* The last argument is NORECORD.  It must be 0 here: the doc string
   promises that `last-coding-system-used' is set, exactly as
   decode-coding-string does.  Callers that must not record use
   code_convert_string_norecord instead.  */
6875 return code_convert_string (string
, coding_system
, buffer
,
6876 1, ! NILP (nocopy
), 0);
6880 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6881 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6882 Return the corresponding character. */)
6886 Lisp_Object spec
, attrs
, val
;
6887 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6890 CHECK_NATNUM (code
);
6891 c
= XFASTINT (code
);
6892 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6893 attrs
= AREF (spec
, 0);
6895 if (ASCII_BYTE_P (c
)
6896 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6899 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6900 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6901 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6902 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6905 charset
= charset_roman
;
6906 else if (c
>= 0xA0 && c
< 0xDF)
6908 charset
= charset_kana
;
6913 int s1
= c
>> 8, s2
= c
& 0xFF;
6915 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
6916 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
6917 error ("Invalid code: %d", code
);
6919 charset
= charset_kanji
;
6921 c
= DECODE_CHAR (charset
, c
);
6923 error ("Invalid code: %d", code
);
6924 return make_number (c
);
/* Lisp primitive `encode-sjis-char'.  Looks the character up in the
   charset list of Vsjis_coding_system with char_charset and returns
   the resulting code as a Lisp integer; signals an error when the code
   equals the charset's invalid code.  */
6928 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
6929 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
6930 Return the corresponding code in SJIS. */)
6934 Lisp_Object spec
, attrs
, charset_list
;
6936 struct charset
*charset
;
6939 CHECK_CHARACTER (ch
);
6941 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6942 attrs
= AREF (spec
, 0);
6944 if (ASCII_CHAR_P (c
)
6945 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6948 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* char_charset stores the code point through its third argument.  */
6949 charset
= char_charset (c
, charset_list
, &code
);
6950 if (code
== CHARSET_INVALID_CODE (charset
))
6951 error ("Can't encode by shift_jis encoding: %d", c
);
6954 return make_number (code
);
6957 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
6958 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
6959 Return the corresponding character. */)
6963 Lisp_Object spec
, attrs
, val
;
6964 struct charset
*charset_roman
, *charset_big5
, *charset
;
6967 CHECK_NATNUM (code
);
6968 c
= XFASTINT (code
);
6969 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
6970 attrs
= AREF (spec
, 0);
6972 if (ASCII_BYTE_P (c
)
6973 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6976 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6977 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6978 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
6981 charset
= charset_roman
;
6984 int b1
= c
>> 8, b2
= c
& 0x7F;
6985 if (b1
< 0xA1 || b1
> 0xFE
6986 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
6987 error ("Invalid code: %d", code
);
6988 charset
= charset_big5
;
6990 c
= DECODE_CHAR (charset
, (unsigned )c
);
6992 error ("Invalid code: %d", code
);
6993 return make_number (c
);
/* Lisp primitive `encode-big5-char'.  Looks the character up in the
   charset list of Vbig5_coding_system with char_charset and returns
   the resulting code as a Lisp integer; signals an error when the code
   equals the charset's invalid code.  */
6996 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
6997 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
6998 Return the corresponding character code in Big5. */)
7002 Lisp_Object spec
, attrs
, charset_list
;
7003 struct charset
*charset
;
7007 CHECK_CHARACTER (ch
);
7009 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7010 attrs
= AREF (spec
, 0);
7011 if (ASCII_CHAR_P (c
)
7012 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7015 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
/* char_charset stores the code point through its third argument.  */
7016 charset
= char_charset (c
, charset_list
, &code
);
7017 if (code
== CHARSET_INVALID_CODE (charset
))
7018 error ("Can't encode by Big5 encoding: %d", c
);
7020 return make_number (code
);
/* Lisp primitive `set-terminal-coding-system-internal': set up
   terminal_coding from CODING-SYSTEM, then force safe encoding, no
   composition annotation, multibyte source and unibyte destination.  */
7024 DEFUN ("set-terminal-coding-system-internal",
7025 Fset_terminal_coding_system_internal
,
7026 Sset_terminal_coding_system_internal
, 1, 1, 0,
7027 doc
: /* Internal use only. */)
7029 Lisp_Object coding_system
;
7031 CHECK_SYMBOL (coding_system
);
7032 setup_coding_system (Fcheck_coding_system (coding_system
),
7035 /* We had better not send unsafe characters to terminal. */
7036 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7037 /* Character composition should be disabled. */
7038 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7039 terminal_coding
.src_multibyte
= 1;
7040 terminal_coding
.dst_multibyte
= 0;
/* Lisp primitive `set-safe-terminal-coding-system-internal': set up
   safe_terminal_coding from CODING-SYSTEM, with no composition
   annotation, multibyte source and unibyte destination.  */
7044 DEFUN ("set-safe-terminal-coding-system-internal",
7045 Fset_safe_terminal_coding_system_internal
,
7046 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7047 doc
: /* Internal use only. */)
7049 Lisp_Object coding_system
;
7051 CHECK_SYMBOL (coding_system
);
7052 setup_coding_system (Fcheck_coding_system (coding_system
),
7053 &safe_terminal_coding
);
7054 /* Character composition should be disabled. */
7055 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7056 safe_terminal_coding
.src_multibyte
= 1;
7057 safe_terminal_coding
.dst_multibyte
= 0;
/* Lisp primitive `terminal-coding-system': return the name recorded
   for the id of terminal_coding.  */
7061 DEFUN ("terminal-coding-system",
7062 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7063 doc
: /* Return coding system specified for terminal output. */)
7066 return CODING_ID_NAME (terminal_coding
.id
);
/* Lisp primitive `set-keyboard-coding-system-internal': set up the
   keyboard coding structure from CODING-SYSTEM and disable composition
   annotation on it.  */
7069 DEFUN ("set-keyboard-coding-system-internal",
7070 Fset_keyboard_coding_system_internal
,
7071 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7072 doc
: /* Internal use only. */)
7074 Lisp_Object coding_system
;
7076 CHECK_SYMBOL (coding_system
);
7077 setup_coding_system (Fcheck_coding_system (coding_system
),
7079 /* Character composition should be disabled. */
7080 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
/* Lisp primitive `keyboard-coding-system': return the name recorded
   for the id of keyboard_coding.  */
7084 DEFUN ("keyboard-coding-system",
7085 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7086 doc
: /* Return coding system specified for decoding keyboard input. */)
7089 return CODING_ID_NAME (keyboard_coding
.id
);
/* Lisp primitive `find-operation-coding-system'.  ARGS[0] is the
   operation symbol; its `target-idx' property selects which of the
   remaining arguments is the target.  The alist matching the operation
   (file, network or process) is then searched for an entry whose key
   matches the target: by regexp for a string target, by EQ for an
   integer target.  */
7093 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7094 Sfind_operation_coding_system
, 1, MANY
, 0,
7095 doc
: /* Choose a coding system for an operation based on the target name.
7096 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7097 DECODING-SYSTEM is the coding system to use for decoding
7098 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7099 for encoding (in case OPERATION does encoding).
7101 The first argument OPERATION specifies an I/O primitive:
7102 For file I/O, `insert-file-contents' or `write-region'.
7103 For process I/O, `call-process', `call-process-region', or `start-process'.
7104 For network I/O, `open-network-stream'.
7106 The remaining arguments should be the same arguments that were passed
7107 to the primitive. Depending on which primitive, one of those arguments
7108 is selected as the TARGET. For example, if OPERATION does file I/O,
7109 whichever argument specifies the file name is TARGET.
7111 TARGET has a meaning which depends on OPERATION:
7112 For file I/O, TARGET is a file name.
7113 For process I/O, TARGET is a process name.
7114 For network I/O, TARGET is a service name or a port number.
7116 This function looks up what is specified for TARGET in
7117 `file-coding-system-alist', `process-coding-system-alist',
7118 or `network-coding-system-alist' depending on OPERATION.
7119 They may specify a coding system, a cons of coding systems,
7120 or a function symbol to call.
7121 In the last case, we call the function with one argument,
7122 which is a list of all the arguments given to this function.
7124 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7129 Lisp_Object operation
, target_idx
, target
, val
;
7130 register Lisp_Object chain
;
7133 error ("Too few arguments");
7134 operation
= args
[0];
7135 if (!SYMBOLP (operation
)
7136 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7137 error ("Invalid first argument");
/* The target is read from ARGS[target_idx + 1], so NARGS must be at
   least target_idx + 2.  The previous `<' test let
   NARGS == target_idx + 1 through and read one element past the end
   of ARGS.  */
7138 if (nargs
<= 1 + XINT (target_idx
))
7139 error ("Too few arguments for operation: %s",
7140 XSYMBOL (operation
)->name
->data
);
7141 target
= args
[XINT (target_idx
) + 1];
7142 if (!(STRINGP (target
)
7143 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7144 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist for this kind of operation.  */
7146 chain
= ((EQ (operation
, Qinsert_file_contents
)
7147 || EQ (operation
, Qwrite_region
))
7148 ? Vfile_coding_system_alist
7149 : (EQ (operation
, Qopen_network_stream
)
7150 ? Vnetwork_coding_system_alist
7151 : Vprocess_coding_system_alist
));
7155 for (; CONSP (chain
); chain
= XCDR (chain
))
7161 && ((STRINGP (target
)
7162 && STRINGP (XCAR (elt
))
7163 && fast_string_match (XCAR (elt
), target
) >= 0)
7164 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7167 /* Here, if VAL is both a valid coding system and a valid
7168 function symbol, we return VAL as a coding system. */
7171 if (! SYMBOLP (val
))
7173 if (! NILP (Fcoding_system_p (val
)))
7174 return Fcons (val
, val
);
7175 if (! NILP (Ffboundp (val
)))
/* Call the function with the full argument list as its one argument.  */
7177 val
= call1 (val
, Flist (nargs
, args
));
7180 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7181 return Fcons (val
, val
);
/* Lisp primitive `set-coding-system-priority'.  The categories of the
   coding systems given as arguments become the first entries of a new
   priority array, in argument order; the remaining slots are filled
   from coding_priorities, skipping categories already placed.  The new
   array is then copied over coding_priorities.  */
7189 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7190 Sset_coding_system_priority
, 0, MANY
, 0,
7191 doc
: /* Assign higher priority to the coding systems given as arguments.
7192 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
/* CHANGED[cat] is non-zero once category CAT has been placed.  */
7198 int changed
[coding_category_max
];
7199 enum coding_category priorities
[coding_category_max
];
7201 bzero (changed
, sizeof changed
);
7203 for (i
= j
= 0; i
< nargs
; i
++)
7205 enum coding_category category
;
7206 Lisp_Object spec
, attrs
;
7208 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7209 attrs
= AREF (spec
, 0);
7210 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7211 if (changed
[category
])
7212 /* Ignore this coding system because a coding system of the
7213 same category already had a higher priority. */
7215 changed
[category
] = 1;
7216 priorities
[j
++] = category
;
/* Make this coding system the one registered for its category, unless
   the category has no coding system yet or already names this one.  */
7217 if (coding_categories
[category
].id
>= 0
7218 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7219 setup_coding_system (args
[i
], &coding_categories
[category
]);
7222 /* Now we have decided top J priorities. Reflect the order of the
7223 original priorities to the remaining priorities. */
7225 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7227 while (j
< coding_category_max
7228 && changed
[coding_priorities
[j
]])
7230 if (j
== coding_category_max
)
7232 priorities
[i
] = coding_priorities
[j
];
7235 bcopy (priorities
, coding_priorities
, sizeof priorities
);
/* Lisp primitive `coding-system-priority-list'.  Walks
   coding_priorities in order and collects the base name of the coding
   system registered for each category.  With HIGHESTP non-nil the
   first base name reached is returned on its own.  */
7239 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7240 Scoding_system_priority_list
, 0, 1, 0,
7241 doc
: /* Return a list of coding systems ordered by their priorities.
7242 HIGHESTP non-nil means just return the highest priority one. */)
7244 Lisp_Object highestp
;
7249 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7251 enum coding_category category
= coding_priorities
[i
];
7252 int id
= coding_categories
[category
].id
;
7257 attrs
= CODING_ID_ATTRS (id
);
7258 if (! NILP (highestp
))
7259 return CODING_ATTR_BASE_NAME (attrs
);
7260 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
/* Names were consed in reverse priority order.  */
7262 return Fnreverse (val
);
/* Suffixes appended to a base coding system name; stored below in
   slots 0, 1 and 2 of the subsidiaries vector.  */
7265 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
/* Return a vector of three symbols whose names are the name of the
   symbol BASE followed by each entry of `suffixes'.  */
7268 make_subsidiaries (base
)
7271 Lisp_Object subsidiaries
;
7272 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
/* 6 = the longest suffix ("-unix", 5 bytes) plus the terminating NUL.  */
7273 char *buf
= (char *) alloca (base_name_len
+ 6);
7276 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7277 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7278 for (i
= 0; i
< 3; i
++)
/* Copy the suffix together with its NUL right after the base name.  */
7280 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7281 ASET (subsidiaries
, i
, intern (buf
));
7283 return subsidiaries
;
7287 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7288 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7289 doc
: /* For internal use only.
7290 usage: (define-coding-system-internal ...) */)
7296 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7297 Lisp_Object attrs
; /* Vector of attributes. */
7298 Lisp_Object eol_type
;
7299 Lisp_Object aliases
;
7300 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7301 enum coding_category category
;
7302 Lisp_Object tail
, val
;
7303 int max_charset_id
= 0;
7306 if (nargs
< coding_arg_max
)
7309 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7311 name
= args
[coding_arg_name
];
7312 CHECK_SYMBOL (name
);
7313 CODING_ATTR_BASE_NAME (attrs
) = name
;
7315 val
= args
[coding_arg_mnemonic
];
7316 if (! STRINGP (val
))
7317 CHECK_CHARACTER (val
);
7318 CODING_ATTR_MNEMONIC (attrs
) = val
;
7320 coding_type
= args
[coding_arg_coding_type
];
7321 CHECK_SYMBOL (coding_type
);
7322 CODING_ATTR_TYPE (attrs
) = coding_type
;
7324 charset_list
= args
[coding_arg_charset_list
];
7325 if (SYMBOLP (charset_list
))
7327 if (EQ (charset_list
, Qiso_2022
))
7329 if (! EQ (coding_type
, Qiso_2022
))
7330 error ("Invalid charset-list");
7331 charset_list
= Viso_2022_charset_list
;
7333 else if (EQ (charset_list
, Qemacs_mule
))
7335 if (! EQ (coding_type
, Qemacs_mule
))
7336 error ("Invalid charset-list");
7337 charset_list
= Vemacs_mule_charset_list
;
7339 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7340 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7341 max_charset_id
= XFASTINT (XCAR (tail
));
7345 charset_list
= Fcopy_sequence (charset_list
);
7346 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7348 struct charset
*charset
;
7351 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7352 if (EQ (coding_type
, Qiso_2022
)
7353 ? CHARSET_ISO_FINAL (charset
) < 0
7354 : EQ (coding_type
, Qemacs_mule
)
7355 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7357 error ("Can't handle charset `%s'",
7358 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7360 XCAR (tail
) = make_number (charset
->id
);
7361 if (max_charset_id
< charset
->id
)
7362 max_charset_id
= charset
->id
;
7365 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7367 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7369 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7370 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7371 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7373 val
= args
[coding_arg_decode_translation_table
];
7375 CHECK_CHAR_TABLE (val
);
7376 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7378 val
= args
[coding_arg_encode_translation_table
];
7380 CHECK_CHAR_TABLE (val
);
7381 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7383 val
= args
[coding_arg_post_read_conversion
];
7385 CODING_ATTR_POST_READ (attrs
) = val
;
7387 val
= args
[coding_arg_pre_write_conversion
];
7389 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7391 val
= args
[coding_arg_default_char
];
7393 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7396 CHECK_CHARACTER (val
);
7397 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7400 val
= args
[coding_arg_plist
];
7402 CODING_ATTR_PLIST (attrs
) = val
;
7404 if (EQ (coding_type
, Qcharset
))
7406 /* Generate a lisp vector of 256 elements. Each element is nil,
7407 integer, or a list of charset IDs.
7409 If Nth element is nil, the byte code N is invalid in this
7412 If Nth element is a number NUM, N is the first byte of a
7413 charset whose ID is NUM.
7415 If Nth element is a list of charset IDs, N is the first byte
7416 of one of them. The list is sorted by dimensions of the
7417 charsets. A charset of smaller dimension comes firtst.
7419 val
= Fmake_vector (make_number (256), Qnil
);
7421 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7423 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7424 int dim
= CHARSET_DIMENSION (charset
);
7425 int idx
= (dim
- 1) * 4;
7427 for (i
= charset
->code_space
[idx
];
7428 i
<= charset
->code_space
[idx
+ 1]; i
++)
7430 Lisp_Object tmp
, tmp2
;
7433 tmp
= AREF (val
, i
);
7436 else if (NUMBERP (tmp
))
7438 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
7440 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
7442 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
7446 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
7448 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
7453 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
7456 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
7457 XSETCAR (tmp2
, XCAR (tail
));
7463 ASET (attrs
, coding_attr_charset_valids
, val
);
7464 category
= coding_category_charset
;
7466 else if (EQ (coding_type
, Qccl
))
7470 if (nargs
< coding_arg_ccl_max
)
7473 val
= args
[coding_arg_ccl_decoder
];
7474 CHECK_CCL_PROGRAM (val
);
7476 val
= Fcopy_sequence (val
);
7477 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7479 val
= args
[coding_arg_ccl_encoder
];
7480 CHECK_CCL_PROGRAM (val
);
7482 val
= Fcopy_sequence (val
);
7483 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7485 val
= args
[coding_arg_ccl_valids
];
7486 valids
= Fmake_string (make_number (256), make_number (0));
7487 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7491 ASET (valids
, XINT (val
), make_number (1));
7497 CHECK_NUMBER (XCAR (val
));
7498 CHECK_NUMBER (XCDR (val
));
7499 from
= XINT (XCAR (val
));
7500 to
= XINT (XCDR (val
));
7501 for (i
= from
; i
<= to
; i
++)
7502 ASET (valids
, i
, make_number (1));
7505 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7507 category
= coding_category_ccl
;
7509 else if (EQ (coding_type
, Qutf_16
))
7511 Lisp_Object bom
, endian
;
7513 if (nargs
< coding_arg_utf16_max
)
7516 bom
= args
[coding_arg_utf16_bom
];
7517 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7520 CHECK_CODING_SYSTEM (XCAR (bom
));
7521 CHECK_CODING_SYSTEM (XCDR (bom
));
7523 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7525 endian
= args
[coding_arg_utf16_endian
];
7526 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7528 category
= (CONSP (bom
)
7529 ? coding_category_utf_16_auto
7532 ? coding_category_utf_16_be_nosig
7533 : coding_category_utf_16_le_nosig
)
7535 ? coding_category_utf_16_be
7536 : coding_category_utf_16_le
));
7538 else if (EQ (coding_type
, Qiso_2022
))
7540 Lisp_Object initial
, reg_usage
, request
, flags
;
7543 if (nargs
< coding_arg_iso2022_max
)
7546 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7547 CHECK_VECTOR (initial
);
7548 for (i
= 0; i
< 4; i
++)
7550 val
= Faref (initial
, make_number (i
));
7553 CHECK_CHARSET_GET_ID (val
, id
);
7554 ASET (initial
, i
, make_number (id
));
7557 ASET (initial
, i
, make_number (-1));
7560 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7561 CHECK_CONS (reg_usage
);
7562 CHECK_NATNUM (XCAR (reg_usage
));
7563 CHECK_NATNUM (XCDR (reg_usage
));
7565 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7566 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7572 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7573 CHECK_NATNUM (XCDR (val
));
7574 if (XINT (XCDR (val
)) >= 4)
7575 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7576 XCAR (val
) = make_number (id
);
7579 flags
= args
[coding_arg_iso2022_flags
];
7580 CHECK_NATNUM (flags
);
7582 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7583 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7585 ASET (attrs
, coding_attr_iso_initial
, initial
);
7586 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7587 ASET (attrs
, coding_attr_iso_request
, request
);
7588 ASET (attrs
, coding_attr_iso_flags
, flags
);
7589 setup_iso_safe_charsets (attrs
);
7591 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7592 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7593 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7594 ? coding_category_iso_7_else
7595 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7596 ? coding_category_iso_7
7597 : coding_category_iso_7_tight
);
7600 int id
= XINT (AREF (initial
, 1));
7602 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
7603 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7605 ? coding_category_iso_8_else
7606 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7607 ? coding_category_iso_8_1
7608 : coding_category_iso_8_2
);
7611 else if (EQ (coding_type
, Qemacs_mule
))
7613 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7614 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7616 category
= coding_category_emacs_mule
;
7618 else if (EQ (coding_type
, Qshift_jis
))
7621 struct charset
*charset
;
7623 if (XINT (Flength (charset_list
)) != 3)
7624 error ("There should be just three charsets");
7626 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7627 if (CHARSET_DIMENSION (charset
) != 1)
7628 error ("Dimension of charset %s is not one",
7629 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7631 charset_list
= XCDR (charset_list
);
7632 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7633 if (CHARSET_DIMENSION (charset
) != 1)
7634 error ("Dimension of charset %s is not one",
7635 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7637 charset_list
= XCDR (charset_list
);
7638 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7639 if (CHARSET_DIMENSION (charset
) != 2)
7640 error ("Dimension of charset %s is not two",
7641 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7643 category
= coding_category_sjis
;
7644 Vsjis_coding_system
= name
;
7646 else if (EQ (coding_type
, Qbig5
))
7648 struct charset
*charset
;
7650 if (XINT (Flength (charset_list
)) != 2)
7651 error ("There should be just two charsets");
7653 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7654 if (CHARSET_DIMENSION (charset
) != 1)
7655 error ("Dimension of charset %s is not one",
7656 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7658 charset_list
= XCDR (charset_list
);
7659 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7660 if (CHARSET_DIMENSION (charset
) != 2)
7661 error ("Dimension of charset %s is not two",
7662 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7664 category
= coding_category_big5
;
7665 Vbig5_coding_system
= name
;
7667 else if (EQ (coding_type
, Qraw_text
))
7668 category
= coding_category_raw_text
;
7669 else if (EQ (coding_type
, Qutf_8
))
7670 category
= coding_category_utf_8
;
7671 else if (EQ (coding_type
, Qundecided
))
7672 category
= coding_category_undecided
;
7674 error ("Invalid coding system type: %s",
7675 XSYMBOL (coding_type
)->name
->data
);
7677 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7679 eol_type
= args
[coding_arg_eol_type
];
7680 if (! NILP (eol_type
)
7681 && ! EQ (eol_type
, Qunix
)
7682 && ! EQ (eol_type
, Qdos
)
7683 && ! EQ (eol_type
, Qmac
))
7684 error ("Invalid eol-type");
7686 aliases
= Fcons (name
, Qnil
);
7688 if (NILP (eol_type
))
7690 eol_type
= make_subsidiaries (name
);
7691 for (i
= 0; i
< 3; i
++)
7693 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7695 this_name
= AREF (eol_type
, i
);
7696 this_aliases
= Fcons (this_name
, Qnil
);
7697 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7698 this_spec
= Fmake_vector (make_number (3), attrs
);
7699 ASET (this_spec
, 1, this_aliases
);
7700 ASET (this_spec
, 2, this_eol_type
);
7701 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7702 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7703 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7704 Vcoding_system_alist
);
7708 spec_vec
= Fmake_vector (make_number (3), attrs
);
7709 ASET (spec_vec
, 1, aliases
);
7710 ASET (spec_vec
, 2, eol_type
);
7712 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7713 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7714 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7715 Vcoding_system_alist
);
7718 int id
= coding_categories
[category
].id
;
7720 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7721 setup_coding_system (name
, &coding_categories
[category
]);
7727 return Fsignal (Qwrong_number_of_arguments
,
7728 Fcons (intern ("define-coding-system-internal"),
7729 make_number (nargs
)));
7732 /* Fixme: should this record the alias relationships for
7734 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7735 Sdefine_coding_system_alias
, 2, 2, 0,
7736 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7737 (alias
, coding_system
)
7738 Lisp_Object alias
, coding_system
;
7740 Lisp_Object spec
, aliases
, eol_type
;
7742 CHECK_SYMBOL (alias
);
7743 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7744 aliases
= AREF (spec
, 1);
7745 while (!NILP (XCDR (aliases
)))
7746 aliases
= XCDR (aliases
);
7747 XCDR (aliases
) = Fcons (alias
, Qnil
);
7749 eol_type
= AREF (spec
, 2);
7750 if (VECTORP (eol_type
))
7752 Lisp_Object subsidiaries
;
7755 subsidiaries
= make_subsidiaries (alias
);
7756 for (i
= 0; i
< 3; i
++)
7757 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7758 AREF (eol_type
, i
));
7760 ASET (spec
, 2, subsidiaries
);
7763 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7764 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
7765 Vcoding_system_alist
);
7770 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7772 doc
: /* Return the base of CODING-SYSTEM.
7773 Any alias or subsidiary coding system is not a base coding system. */)
7775 Lisp_Object coding_system
;
7777 Lisp_Object spec
, attrs
;
/* A nil CODING-SYSTEM is answered with `no-conversion' directly,
   without any lookup.  */
7779 if (NILP (coding_system
))
7780 return (Qno_conversion
);
/* Otherwise fetch the spec for CODING-SYSTEM, take the attribute
   vector from slot 0 of the spec, and return the base name recorded
   in those attributes.  */
7781 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7782 attrs
= AREF (spec
, 0);
7783 return CODING_ATTR_BASE_NAME (attrs
);
7786 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7788 doc
: /* Return the property list of CODING-SYSTEM. */)
7790 Lisp_Object coding_system
;
7792 Lisp_Object spec
, attrs
;
/* The documentation above uses the same `doc: comment' form as the
   other primitives in this file.  */
/* A nil CODING-SYSTEM stands for `no-conversion'.  */
7794 if (NILP (coding_system
))
7795 coding_system
= Qno_conversion
;
/* Fetch the spec, take the attribute vector from slot 0 of the spec,
   and return the property list stored in those attributes.  */
7796 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7797 attrs
= AREF (spec
, 0);
7798 return CODING_ATTR_PLIST (attrs
);
7802 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7804 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
7806 Lisp_Object coding_system
;
/* A nil CODING-SYSTEM stands for `no-conversion'.  */
7810 if (NILP (coding_system
))
7811 coding_system
= Qno_conversion
;
/* Slot 1 of the spec holds the alias list.  The stored list itself
   is returned, not a copy.  */
7812 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7813 return AREF (spec
, 1);
7816 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7817 Scoding_system_eol_type
, 1, 1, 0,
7818 doc
: /* Return eol-type of CODING-SYSTEM.
7819 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7821 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7822 and CR respectively.
7824 A vector value indicates that a format of end-of-line should be
7825 detected automatically. Nth element of the vector is the subsidiary
7826 coding system whose eol-type is N. */)
7828 Lisp_Object coding_system
;
7830 Lisp_Object spec
, eol_type
;
/* A nil CODING-SYSTEM stands for `no-conversion'.  */
7833 if (NILP (coding_system
))
7834 coding_system
= Qno_conversion
;
/* This primitive tests CODING_SYSTEM_P itself instead of using
   CHECK_CODING_SYSTEM_GET_SPEC as the neighbouring primitives do.
   NOTE(review): the statement guarded by this test is not present in
   this listing.  */
7835 if (! CODING_SYSTEM_P (coding_system
))
/* Slot 2 of the spec is the eol-type.  */
7837 spec
= CODING_SYSTEM_SPEC (coding_system
);
7838 eol_type
= AREF (spec
, 2);
/* A vector of subsidiaries is handed back as a fresh copy made with
   Fcopy_sequence, not as the stored vector.  */
7839 if (VECTORP (eol_type
))
7840 return Fcopy_sequence (eol_type
);
/* Otherwise translate the symbol to the documented integer: `unix'
   gives 0, `dos' gives 1, and any other value gives 2.  */
7841 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7842 return make_number (n
);
7848 /*** 9. Post-amble ***/
/* Table initialisation.  The header and braces of the enclosing
   function are not part of this listing.  */
/* Give every coding category the id -1 and fill coding_priorities
   with 0, 1, 2, ... in index order.
   NOTE(review): both assignments presumably form the loop body; the
   braces are not shown here.  */
7855 for (i
= 0; i
< coding_category_max
; i
++)
7857 coding_categories
[i
].id
= -1;
7858 coding_priorities
[i
] = i
;
7861 /* ISO2022 specific initialize routine. */
/* Classify bytes by range: 0x00-0x1F as control_0, 0x21-0x7E as
   graphic plane 0, 0x80-0x9F as control_1 and 0xA1-0xFE as graphic
   plane 1.  The loops skip 0x20, 0x7F, 0xA0 and 0xFF, which get
   their own classes immediately afterwards.  */
7862 for (i
= 0; i
< 0x20; i
++)
7863 iso_code_class
[i
] = ISO_control_0
;
7864 for (i
= 0x21; i
< 0x7F; i
++)
7865 iso_code_class
[i
] = ISO_graphic_plane_0
;
7866 for (i
= 0x80; i
< 0xA0; i
++)
7867 iso_code_class
[i
] = ISO_control_1
;
7868 for (i
= 0xA1; i
< 0xFF; i
++)
7869 iso_code_class
[i
] = ISO_graphic_plane_1
;
7870 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7871 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
/* Individual codes are assigned after the range loops, so these
   classes take precedence wherever a code falls inside one of the
   ranges above.  */
7872 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7873 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7874 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7875 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7876 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7877 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7878 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7879 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
/* Clear the inhibit_pre_post_conversion flag.  */
7881 inhibit_pre_post_conversion
= 0;
/* emacs_mule_bytes: every entry defaults to 1; the private leading
   codes shown here are then set to 3 (PRIVATE_11, PRIVATE_12) and
   4 (PRIVATE_21, PRIVATE_22).  */
7883 for (i
= 0; i
< 256; i
++)
7885 emacs_mule_bytes
[i
] = 1;
7887 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
7888 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
7889 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
7890 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
7898 staticpro (&Vcoding_system_hash_table
);
7899 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7901 staticpro (&Vsjis_coding_system
);
7902 Vsjis_coding_system
= Qnil
;
7904 staticpro (&Vbig5_coding_system
);
7905 Vbig5_coding_system
= Qnil
;
7907 staticpro (&Vcode_conversion_work_buf_list
);
7908 Vcode_conversion_work_buf_list
= Qnil
;
7910 staticpro (&Vcode_conversion_reused_work_buf
);
7911 Vcode_conversion_reused_work_buf
= Qnil
;
7913 DEFSYM (Qcharset
, "charset");
7914 DEFSYM (Qtarget_idx
, "target-idx");
7915 DEFSYM (Qcoding_system_history
, "coding-system-history");
7916 Fset (Qcoding_system_history
, Qnil
);
7918 /* Target FILENAME is the first argument. */
7919 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
7920 /* Target FILENAME is the third argument. */
7921 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
7923 DEFSYM (Qcall_process
, "call-process");
7924 /* Target PROGRAM is the first argument. */
7925 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
7927 DEFSYM (Qcall_process_region
, "call-process-region");
7928 /* Target PROGRAM is the third argument. */
7929 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
7931 DEFSYM (Qstart_process
, "start-process");
7932 /* Target PROGRAM is the third argument. */
7933 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
7935 DEFSYM (Qopen_network_stream
, "open-network-stream");
7936 /* Target SERVICE is the fourth argument. */
7937 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
7939 DEFSYM (Qcoding_system
, "coding-system");
7940 DEFSYM (Qcoding_aliases
, "coding-aliases");
7942 DEFSYM (Qeol_type
, "eol-type");
7943 DEFSYM (Qunix
, "unix");
7944 DEFSYM (Qdos
, "dos");
7946 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
7947 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
7948 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
7949 DEFSYM (Qdefault_char
, "default-char");
7950 DEFSYM (Qundecided
, "undecided");
7951 DEFSYM (Qno_conversion
, "no-conversion");
7952 DEFSYM (Qraw_text
, "raw-text");
7954 DEFSYM (Qiso_2022
, "iso-2022");
7956 DEFSYM (Qutf_8
, "utf-8");
/* Symbols for the UTF-16 coding type and its parameters.  Qutf_16_le
   is named "utf-16-le", parallel to "utf-16-be" and
   "utf-16-le-nosig".  */
7958 DEFSYM (Qutf_16
, "utf-16");
7959 DEFSYM (Qutf_16_be
, "utf-16-be");
7960 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
7961 DEFSYM (Qutf_16_le
, "utf-16-le");
7962 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
7963 DEFSYM (Qsignature
, "signature");
7964 DEFSYM (Qendian
, "endian");
7965 DEFSYM (Qbig
, "big");
7966 DEFSYM (Qlittle
, "little");
7968 DEFSYM (Qshift_jis
, "shift-jis");
7969 DEFSYM (Qbig5
, "big5");
7971 DEFSYM (Qcoding_system_p
, "coding-system-p");
7973 DEFSYM (Qcoding_system_error
, "coding-system-error");
7974 Fput (Qcoding_system_error
, Qerror_conditions
,
7975 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
7976 Fput (Qcoding_system_error
, Qerror_message
,
7977 build_string ("Invalid coding system"));
7979 /* Intern this now in case it isn't already done.
7980 Setting this variable twice is harmless.
7981 But don't staticpro it here--that is done in alloc.c. */
7982 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
7984 DEFSYM (Qtranslation_table
, "translation-table");
7985 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
7986 DEFSYM (Qtranslation_table_id
, "translation-table-id");
7987 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
7988 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
7990 DEFSYM (Qvalid_codes
, "valid-codes");
7992 DEFSYM (Qemacs_mule
, "emacs-mule");
7994 Vcoding_category_table
7995 = Fmake_vector (make_number (coding_category_max
), Qnil
);
7996 staticpro (&Vcoding_category_table
);
7997 /* The following are targets of code detection. */
7998 ASET (Vcoding_category_table
, coding_category_iso_7
,
7999 intern ("coding-category-iso-7"));
8000 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8001 intern ("coding-category-iso-7-tight"));
8002 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8003 intern ("coding-category-iso-8-1"));
8004 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8005 intern ("coding-category-iso-8-2"));
8006 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8007 intern ("coding-category-iso-7-else"));
8008 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8009 intern ("coding-category-iso-8-else"));
8010 ASET (Vcoding_category_table
, coding_category_utf_8
,
8011 intern ("coding-category-utf-8"));
8012 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8013 intern ("coding-category-utf-16-be"));
8014 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8015 intern ("coding-category-utf-16-le"));
8016 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8017 intern ("coding-category-utf-16-be-nosig"));
8018 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8019 intern ("coding-category-utf-16-le-nosig"));
8020 ASET (Vcoding_category_table
, coding_category_charset
,
8021 intern ("coding-category-charset"));
8022 ASET (Vcoding_category_table
, coding_category_sjis
,
8023 intern ("coding-category-sjis"));
8024 ASET (Vcoding_category_table
, coding_category_big5
,
8025 intern ("coding-category-big5"));
8026 ASET (Vcoding_category_table
, coding_category_ccl
,
8027 intern ("coding-category-ccl"));
8028 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8029 intern ("coding-category-emacs-mule"));
8030 /* The following are NOT targets of code detection. */
8031 ASET (Vcoding_category_table
, coding_category_raw_text
,
8032 intern ("coding-category-raw-text"));
8033 ASET (Vcoding_category_table
, coding_category_undecided
,
8034 intern ("coding-category-undecided"));
8036 defsubr (&Scoding_system_p
);
8037 defsubr (&Sread_coding_system
);
8038 defsubr (&Sread_non_nil_coding_system
);
8039 defsubr (&Scheck_coding_system
);
8040 defsubr (&Sdetect_coding_region
);
8041 defsubr (&Sdetect_coding_string
);
8042 defsubr (&Sfind_coding_systems_region_internal
);
8043 defsubr (&Scheck_coding_systems_region
);
8044 defsubr (&Sdecode_coding_region
);
8045 defsubr (&Sencode_coding_region
);
8046 defsubr (&Sdecode_coding_string
);
8047 defsubr (&Sencode_coding_string
);
8048 defsubr (&Sdecode_sjis_char
);
8049 defsubr (&Sencode_sjis_char
);
8050 defsubr (&Sdecode_big5_char
);
8051 defsubr (&Sencode_big5_char
);
8052 defsubr (&Sset_terminal_coding_system_internal
);
8053 defsubr (&Sset_safe_terminal_coding_system_internal
);
8054 defsubr (&Sterminal_coding_system
);
8055 defsubr (&Sset_keyboard_coding_system_internal
);
8056 defsubr (&Skeyboard_coding_system
);
8057 defsubr (&Sfind_operation_coding_system
);
8058 defsubr (&Sset_coding_system_priority
);
8059 defsubr (&Sdefine_coding_system_internal
);
8060 defsubr (&Sdefine_coding_system_alias
);
8061 defsubr (&Scoding_system_base
);
8062 defsubr (&Scoding_system_plist
);
8063 defsubr (&Scoding_system_aliases
);
8064 defsubr (&Scoding_system_eol_type
);
8065 defsubr (&Scoding_system_priority_list
);
8067 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8068 doc
: /* List of coding systems.
8070 Do not alter the value of this variable manually. This variable should be
8071 updated by the functions `define-coding-system' and
8072 `define-coding-system-alias'. */);
8073 Vcoding_system_list
= Qnil
;
8075 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8076 doc
: /* Alist of coding system names.
8077 Each element is one element list of coding system name.
8078 This variable is given to `completing-read' as TABLE argument.
8080 Do not alter the value of this variable manually. This variable should be
8081 updated by the functions `make-coding-system' and
8082 `define-coding-system-alias'. */);
8083 Vcoding_system_alist
= Qnil
;
8085 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8086 doc
: /* List of coding-categories (symbols) ordered by priority.
8088 On detecting a coding system, Emacs tries code detection algorithms
8089 associated with each coding-category one by one in this order. When
8090 one algorithm agrees with a byte sequence of source text, the coding
8091 system bound to the corresponding coding-category is selected. */);
8095 Vcoding_category_list
= Qnil
;
8096 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8097 Vcoding_category_list
8098 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8099 Vcoding_category_list
);
8102 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8103 doc
: /* Specify the coding system for read operations.
8104 It is useful to bind this variable with `let', but do not set it globally.
8105 If the value is a coding system, it is used for decoding on read operation.
8106 If not, an appropriate element is used from one of the coding system alists:
8107 There are three such tables, `file-coding-system-alist',
8108 `process-coding-system-alist', and `network-coding-system-alist'. */);
8109 Vcoding_system_for_read
= Qnil
;
8111 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8112 doc
: /* Specify the coding system for write operations.
8113 Programs bind this variable with `let', but you should not set it globally.
8114 If the value is a coding system, it is used for encoding of output,
8115 when writing it to a file and when sending it to a file or subprocess.
8117 If this does not specify a coding system, an appropriate element
8118 is used from one of the coding system alists:
8119 There are three such tables, `file-coding-system-alist',
8120 `process-coding-system-alist', and `network-coding-system-alist'.
8121 For output to files, if the above procedure does not specify a coding system,
8122 the value of `buffer-file-coding-system' is used. */);
8123 Vcoding_system_for_write
= Qnil
;
8125 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8127 Coding system used in the latest file or process I/O. */);
8128 Vlast_coding_system_used
= Qnil
;
8130 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8132 *Non-nil means always inhibit code conversion of end-of-line format.
8133 See info node `Coding Systems' and info node `Text and Binary' concerning
8134 such conversion. */);
8135 inhibit_eol_conversion
= 0;
8137 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8139 Non-nil means process buffer inherits coding system of process output.
8140 Bind it to t if the process output is to be treated as if it were a file
8141 read from some filesystem. */);
8142 inherit_process_coding_system
= 0;
8144 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8146 Alist to decide a coding system to use for a file I/O operation.
8147 The format is ((PATTERN . VAL) ...),
8148 where PATTERN is a regular expression matching a file name,
8149 VAL is a coding system, a cons of coding systems, or a function symbol.
8150 If VAL is a coding system, it is used for both decoding and encoding
8152 If VAL is a cons of coding systems, the car part is used for decoding,
8153 and the cdr part is used for encoding.
8154 If VAL is a function symbol, the function must return a coding system
8155 or a cons of coding systems which are used as above. The function gets
8156 the arguments with which `find-operation-coding-system' was called.
8158 See also the function `find-operation-coding-system'
8159 and the variable `auto-coding-alist'. */);
8160 Vfile_coding_system_alist
= Qnil
;
8162 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8164 Alist to decide a coding system to use for a process I/O operation.
8165 The format is ((PATTERN . VAL) ...),
8166 where PATTERN is a regular expression matching a program name,
8167 VAL is a coding system, a cons of coding systems, or a function symbol.
8168 If VAL is a coding system, it is used for both decoding what received
8169 from the program and encoding what sent to the program.
8170 If VAL is a cons of coding systems, the car part is used for decoding,
8171 and the cdr part is used for encoding.
8172 If VAL is a function symbol, the function must return a coding system
8173 or a cons of coding systems which are used as above.
8175 See also the function `find-operation-coding-system'. */);
8176 Vprocess_coding_system_alist
= Qnil
;
8178 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8180 Alist to decide a coding system to use for a network I/O operation.
8181 The format is ((PATTERN . VAL) ...),
8182 where PATTERN is a regular expression matching a network service name
8183 or is a port number to connect to,
8184 VAL is a coding system, a cons of coding systems, or a function symbol.
8185 If VAL is a coding system, it is used for both decoding what received
8186 from the network stream and encoding what sent to the network stream.
8187 If VAL is a cons of coding systems, the car part is used for decoding,
8188 and the cdr part is used for encoding.
8189 If VAL is a function symbol, the function must return a coding system
8190 or a cons of coding systems which are used as above.
8192 See also the function `find-operation-coding-system'. */);
8193 Vnetwork_coding_system_alist
= Qnil
;
8195 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8196 doc
: /* Coding system to use with system messages.
8197 Also used for decoding keyboard input on X Window system. */);
8198 Vlocale_coding_system
= Qnil
;
8200 /* The eol mnemonics are reset in startup.el system-dependently. */
8201 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8203 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8204 eol_mnemonic_unix
= build_string (":");
8206 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8208 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8209 eol_mnemonic_dos
= build_string ("\\");
8211 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8213 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8214 eol_mnemonic_mac
= build_string ("/");
8216 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8218 *String displayed in mode line when end-of-line format is not yet determined. */);
8219 eol_mnemonic_undecided
= build_string (":");
8221 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8223 *Non-nil enables character translation while encoding and decoding. */);
8224 Venable_character_translation
= Qt
;
8226 DEFVAR_LISP ("standard-translation-table-for-decode",
8227 &Vstandard_translation_table_for_decode
,
8228 doc
: /* Table for translating characters while decoding. */);
8229 Vstandard_translation_table_for_decode
= Qnil
;
8231 DEFVAR_LISP ("standard-translation-table-for-encode",
8232 &Vstandard_translation_table_for_encode
,
8233 doc
: /* Table for translating characters while encoding. */);
8234 Vstandard_translation_table_for_encode
= Qnil
;
8236 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8237 doc
: /* Alist of charsets vs revision numbers.
8238 While encoding, if a charset (car part of an element) is found,
8239 designate it with the escape sequence identifying revision (cdr part
8240 of the element). */);
8241 Vcharset_revision_table
= Qnil
;
8243 DEFVAR_LISP ("default-process-coding-system",
8244 &Vdefault_process_coding_system
,
8245 doc
: /* Cons of coding systems used for process I/O by default.
8246 The car part is used for decoding a process output,
8247 the cdr part is used for encoding a text to be sent to a process. */);
8248 Vdefault_process_coding_system
= Qnil
;
8250 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8252 Table of extra Latin codes in the range 128..159 (inclusive).
8253 This is a vector of length 256.
8254 If Nth element is non-nil, the existence of code N in a file
8255 \(or output of subprocess) doesn't prevent it to be detected as
8256 a coding system of ISO 2022 variant which has a flag
8257 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8258 or reading output of a subprocess.
8259 Only the 128th through 159th elements have a meaning. */);
8260 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8262 DEFVAR_LISP ("select-safe-coding-system-function",
8263 &Vselect_safe_coding_system_function
,
8265 Function to call to select safe coding system for encoding a text.
8267 If set, this function is called to force a user to select a proper
8268 coding system which can encode the text in the case that a default
8269 coding system used in each operation can't encode the text.
8271 The default value is `select-safe-coding-system' (which see). */);
8272 Vselect_safe_coding_system_function
= Qnil
;
8274 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8275 &inhibit_iso_escape_detection
,
8277 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8279 By default, on reading a file, Emacs tries to detect how the text is
8280 encoded. This code detection is sensitive to escape sequences. If
8281 the sequence is valid as ISO2022, the code is determined as one of
8282 the ISO2022 encodings, and the file is decoded by the corresponding
8283 coding system (e.g. `iso-2022-7bit').
8285 However, there may be a case that you want to read escape sequences in
8286 a file as is. In such a case, you can set this variable to non-nil.
8287 Then, as the code detection ignores any escape sequences, no file is
8288 detected as encoded in some ISO2022 encoding. The result is that all
8289 escape sequences become visible in a buffer.
8291 The default value is nil, and it is strongly recommended not to change
8292 it. That is because many Emacs Lisp source files that contain
8293 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8294 in Emacs's distribution, and they won't be decoded correctly on
8295 reading if you suppress escape sequence detection.
8297 The other way to read escape sequences in a file without decoding is
8298 to explicitly specify some coding system that doesn't use ISO2022's
8299 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8300 inhibit_iso_escape_detection
= 0;
8303 Lisp_Object args
[coding_arg_max
];
8304 Lisp_Object plist
[14];
8307 for (i
= 0; i
< coding_arg_max
; i
++)
8310 plist
[0] = intern (":name");
8311 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8312 plist
[2] = intern (":mnemonic");
8313 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8314 plist
[4] = intern (":coding-type");
8315 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8316 plist
[6] = intern (":ascii-compatible-p");
8317 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8318 plist
[8] = intern (":default-char");
8319 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8320 plist
[10] = intern (":docstring");
8321 plist
[11] = build_string ("Do no conversion.\n\
8323 When you visit a file with this coding, the file is read into a\n\
8324 unibyte buffer as is, thus each byte of a file is treated as a\n\
8326 plist
[12] = intern (":eol-type");
8327 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8328 args
[coding_arg_plist
] = Flist (14, plist
);
8329 Fdefine_coding_system_internal (coding_arg_max
, args
);
8332 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8333 setup_coding_system (Qno_conversion
, &terminal_coding
);
8334 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8338 emacs_strerror (error_number
)
8343 synchronize_system_messages_locale ();
8344 str
= strerror (error_number
);
8346 if (! NILP (Vlocale_coding_system
))
8348 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8349 Vlocale_coding_system
,
8351 str
= (char *) XSTRING (dec
)->data
;