1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
155 Below is the template of these functions. */
159 detect_coding_XXX (coding
, mask
)
160 struct coding_system
*coding
;
163 unsigned char *src
= coding
->source
;
164 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
165 int multibytep
= coding
->src_multibyte
;
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
175 /* Check if it conforms to XXX. If not, break the loop. */
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask
&= ~CODING_CATEGORY_XXX
;
181 /* The source exausted. */
183 /* ASCII characters only. */
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask
&= CODING_CATEGORY_XXX
;
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
203 Below is the template of these functions. */
207 decode_coding_XXXX (coding
)
208 struct coding_system
*coding
;
210 unsigned char *src
= coding
->source
+ coding
->consumed
;
211 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base
;
216 /* A buffer to produce decoded characters. */
217 int *charbuf
= coding
->charbuf
;
218 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
219 int multibytep
= coding
->src_multibyte
;
224 if (charbuf
< charbuf_end
)
225 /* No more room to produce a decoded character. */
232 if (src_base
< src_end
233 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base
< src_end
&& charbuf
< charbuf_end
)
237 *charbuf
++ = *src_base
++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
241 /* Remember how many characters we produced. */
242 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
262 Below is a template of these functions. */
265 encode_coding_XXX (coding
)
266 struct coding_system
*coding
;
268 int multibytep
= coding
->dst_multibyte
;
269 int *charbuf
= coding
->charbuf
;
270 int *charbuf_end
= charbuf
->charbuf
+ coding
->charbuf_used
;
271 unsigned char *dst
= coding
->destination
+ coding
->produced
;
272 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
273 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
274 int produced_chars
= 0;
276 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
279 /* Encode C into DST, and increment DST. */
281 label_no_more_destination
:
282 /* How many chars and bytes we produced. */
283 coding
->produced_char
+= produced_chars
;
284 coding
->produced
= dst
- coding
->destination
;
289 /*** 1. Preamble ***/
296 #include "character.h"
299 #include "composite.h"
303 Lisp_Object Vcoding_system_hash_table
;
305 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
306 Lisp_Object Qunix
, Qdos
;
307 extern Lisp_Object Qmac
; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system
;
309 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
310 Lisp_Object Qdefault_char
;
311 Lisp_Object Qno_conversion
, Qundecided
;
312 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
313 Lisp_Object Qutf_16_be_nosig
, Qutf_16_be
, Qutf_16_le_nosig
, Qutf_16_le
;
314 Lisp_Object Qsignature
, Qendian
, Qbig
, Qlittle
;
315 Lisp_Object Qcoding_system_history
;
316 Lisp_Object Qvalid_codes
;
318 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
319 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
320 Lisp_Object Qstart_process
, Qopen_network_stream
;
321 Lisp_Object Qtarget_idx
;
323 Lisp_Object Vselect_safe_coding_system_function
;
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
327 /* Mnemonic string to indicate format of end-of-line is not yet
329 Lisp_Object eol_mnemonic_undecided
;
333 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
335 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule
, Qraw_text
;
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read
;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write
;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used
;
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table
;
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion
;
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection
;
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system
;
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding
;
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding
;
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding
;
373 Lisp_Object Vfile_coding_system_alist
;
374 Lisp_Object Vprocess_coding_system_alist
;
375 Lisp_Object Vnetwork_coding_system_alist
;
377 Lisp_Object Vlocale_coding_system
;
381 /* Flag to tell if we look up translation table on character code
383 Lisp_Object Venable_character_translation
;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode
;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode
;
389 Lisp_Object Qtranslation_table
;
390 Lisp_Object Qtranslation_table_id
;
391 Lisp_Object Qtranslation_table_for_decode
;
392 Lisp_Object Qtranslation_table_for_encode
;
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table
;
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system
;
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion
;
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system
;
408 Lisp_Object Vbig5_coding_system
;
411 static int detect_coding_utf_8
P_ ((struct coding_system
*, int *));
412 static void decode_coding_utf_8
P_ ((struct coding_system
*));
413 static int encode_coding_utf_8
P_ ((struct coding_system
*));
415 static int detect_coding_utf_16
P_ ((struct coding_system
*, int *));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*, int *));
420 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
421 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
423 static int detect_coding_emacs_mule
P_ ((struct coding_system
*, int *));
424 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
425 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int detect_coding_sjis
P_ ((struct coding_system
*, int *));
428 static void decode_coding_sjis
P_ ((struct coding_system
*));
429 static int encode_coding_sjis
P_ ((struct coding_system
*));
431 static int detect_coding_big5
P_ ((struct coding_system
*, int *));
432 static void decode_coding_big5
P_ ((struct coding_system
*));
433 static int encode_coding_big5
P_ ((struct coding_system
*));
435 static int detect_coding_ccl
P_ ((struct coding_system
*, int *));
436 static void decode_coding_ccl
P_ ((struct coding_system
*));
437 static int encode_coding_ccl
P_ ((struct coding_system
*));
439 static void decode_coding_raw_text
P_ ((struct coding_system
*));
440 static int encode_coding_raw_text
P_ ((struct coding_system
*));
443 /* ISO2022 section */
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
482 /* All code (1-byte) of ISO2022 is classified into one of the
484 enum iso_code_class_type
486 ISO_control_0
, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape
, /* ISO_CODE_SO (0x1B) */
494 ISO_control_1
, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
513 /* If set, reset graphic planes and registers at end-of-line to the
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
540 /* If set, assume designation states are reset at beginning of line on
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
544 /* If set, designation sequence should be placed at beginning of line
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
548 /* If set, do not encode unsafe charactes on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
551 /* If set, extra latin codes (128..159) are accepted as a valid code
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
590 /* Index for each coding category in `coding_categories' */
594 coding_category_iso_7
,
595 coding_category_iso_7_tight
,
596 coding_category_iso_8_1
,
597 coding_category_iso_8_2
,
598 coding_category_iso_7_else
,
599 coding_category_iso_8_else
,
600 coding_category_utf_8
,
601 coding_category_utf_16_auto
,
602 coding_category_utf_16_be
,
603 coding_category_utf_16_le
,
604 coding_category_utf_16_be_nosig
,
605 coding_category_utf_16_le_nosig
,
606 coding_category_charset
,
607 coding_category_sjis
,
608 coding_category_big5
,
610 coding_category_emacs_mule
,
611 /* All above are targets of code detection. */
612 coding_category_raw_text
,
613 coding_category_undecided
,
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
635 /* This value is returned if detect_coding_mask () find nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list
;
687 /* Table of coding categories (Lisp symbols). This variable is for
689 static Lisp_Object Vcoding_category_table
;
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities
[coding_category_max
];
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories
[coding_category_max
];
698 static int detected_mask
[coding_category_raw_text
] =
706 CATEGORY_MASK_UTF_16
,
707 CATEGORY_MASK_UTF_16
,
708 CATEGORY_MASK_UTF_16
,
709 CATEGORY_MASK_UTF_16
,
710 CATEGORY_MASK_UTF_16
,
711 CATEGORY_MASK_CHARSET
,
715 CATEGORY_MASK_EMACS_MULE
718 /*** Commonly used macros and functions ***/
721 #define min(a, b) ((a) < (b) ? (a) : (b))
724 #define max(a, b) ((a) > (b) ? (a) : (b))
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
744 #define ONE_MORE_BYTE(c) \
746 if (src == src_end) \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
766 if (multibytep && (c & 0x80)) \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
783 #define EMIT_ONE_ASCII_BYTE(c) \
790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte from. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
805 #define EMIT_ONE_BYTE(c) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822 #define EMIT_TWO_BYTES(c1, c2) \
824 produced_chars += 2; \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
866 unsigned char *orig = coding->source; \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
872 src_base += offset; \
878 #define ASSURE_DESTINATION(bytes) \
880 if (dst + (bytes) >= dst_end) \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
892 coding_set_source (coding
)
893 struct coding_system
*coding
;
895 if (BUFFERP (coding
->src_object
))
897 if (coding
->src_pos
< 0)
898 coding
->source
= GAP_END_ADDR
+ coding
->src_pos_byte
;
901 struct buffer
*buf
= XBUFFER (coding
->src_object
);
902 EMACS_INT gpt_byte
= BUF_GPT_BYTE (buf
);
903 unsigned char *beg_addr
= BUF_BEG_ADDR (buf
);
905 coding
->source
= beg_addr
+ coding
->src_pos_byte
- 1;
906 if (coding
->src_pos_byte
>= gpt_byte
)
907 coding
->source
+= BUF_GAP_SIZE (buf
);
910 else if (STRINGP (coding
->src_object
))
912 coding
->source
= (XSTRING (coding
->src_object
)->data
913 + coding
->src_pos_byte
);
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
922 coding_set_destination (coding
)
923 struct coding_system
*coding
;
925 if (BUFFERP (coding
->dst_object
))
927 if (coding
->src_pos
< 0)
929 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
930 coding
->dst_bytes
= (GAP_END_ADDR
931 - (coding
->src_bytes
- coding
->consumed
)
932 - coding
->destination
);
936 /* We are sure that coding->dst_pos_byte is before the gap
938 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
939 + coding
->dst_pos_byte
- 1);
940 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
941 - coding
->destination
);
945 /* Otherwise, the destination is C string and is never relocated
946 automatically. Thus we don't have to update anything. */
952 coding_alloc_by_realloc (coding
, bytes
)
953 struct coding_system
*coding
;
956 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
957 coding
->dst_bytes
+ bytes
);
958 coding
->dst_bytes
+= bytes
;
962 coding_alloc_by_making_gap (coding
, bytes
)
963 struct coding_system
*coding
;
966 if (BUFFERP (coding
->dst_object
)
967 && EQ (coding
->src_object
, coding
->dst_object
))
969 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
971 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
973 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
977 Lisp_Object this_buffer
;
979 this_buffer
= Fcurrent_buffer ();
980 set_buffer_internal (XBUFFER (coding
->dst_object
));
982 set_buffer_internal (XBUFFER (this_buffer
));
987 static unsigned char *
988 alloc_destination (coding
, nbytes
, dst
)
989 struct coding_system
*coding
;
993 EMACS_INT offset
= dst
- coding
->destination
;
995 if (BUFFERP (coding
->dst_object
))
996 coding_alloc_by_making_gap (coding
, nbytes
);
998 coding_alloc_by_realloc (coding
, nbytes
);
999 coding
->result
= CODING_RESULT_SUCCESS
;
1000 coding_set_destination (coding
);
1001 dst
= coding
->destination
+ offset
;
1006 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1013 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1014 Check if a text is encoded in UTF-8. If it is, return
1015 CATEGORY_MASK_UTF_8, else return 0. */
1017 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1018 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1019 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1020 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1021 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1022 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1025 detect_coding_utf_8 (coding
, mask
)
1026 struct coding_system
*coding
;
1029 unsigned char *src
= coding
->source
, *src_base
= src
;
1030 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1031 int multibytep
= coding
->src_multibyte
;
1032 int consumed_chars
= 0;
1036 /* A coding system of this category is always ASCII compatible. */
1037 src
+= coding
->head_ascii
;
1041 int c
, c1
, c2
, c3
, c4
;
1045 if (UTF_8_1_OCTET_P (c
))
1049 if (! UTF_8_EXTRA_OCTET_P (c1
))
1051 if (UTF_8_2_OCTET_LEADING_P (c
))
1057 if (! UTF_8_EXTRA_OCTET_P (c2
))
1059 if (UTF_8_3_OCTET_LEADING_P (c
))
1065 if (! UTF_8_EXTRA_OCTET_P (c3
))
1067 if (UTF_8_4_OCTET_LEADING_P (c
))
1073 if (! UTF_8_EXTRA_OCTET_P (c4
))
1075 if (UTF_8_5_OCTET_LEADING_P (c
))
1082 *mask
&= ~CATEGORY_MASK_UTF_8
;
1086 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1088 *mask
&= ~CATEGORY_MASK_UTF_8
;
1096 decode_coding_utf_8 (coding
)
1097 struct coding_system
*coding
;
1099 unsigned char *src
= coding
->source
+ coding
->consumed
;
1100 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1101 unsigned char *src_base
;
1102 int *charbuf
= coding
->charbuf
;
1103 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1104 int consumed_chars
= 0, consumed_chars_base
;
1105 int multibytep
= coding
->src_multibyte
;
1106 Lisp_Object attr
, eol_type
, charset_list
;
1108 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1112 int c
, c1
, c2
, c3
, c4
, c5
;
1115 consumed_chars_base
= consumed_chars
;
1117 if (charbuf
>= charbuf_end
)
1121 if (UTF_8_1_OCTET_P(c1
))
1126 if (EQ (eol_type
, Qdos
))
1129 goto no_more_source
;
1133 else if (EQ (eol_type
, Qmac
))
1140 if (! UTF_8_EXTRA_OCTET_P (c2
))
1142 if (UTF_8_2_OCTET_LEADING_P (c1
))
1144 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1145 /* Reject overlong sequences here and below. Encoders
1146 producing them are incorrect, they can be misleading,
1147 and they mess up read/write invariance. */
1154 if (! UTF_8_EXTRA_OCTET_P (c3
))
1156 if (UTF_8_3_OCTET_LEADING_P (c1
))
1158 c
= (((c1
& 0xF) << 12)
1159 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1161 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1167 if (! UTF_8_EXTRA_OCTET_P (c4
))
1169 if (UTF_8_4_OCTET_LEADING_P (c1
))
1171 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1172 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1179 if (! UTF_8_EXTRA_OCTET_P (c5
))
1181 if (UTF_8_5_OCTET_LEADING_P (c1
))
1183 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1184 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1186 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1201 consumed_chars
= consumed_chars_base
;
1203 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1208 coding
->consumed_char
+= consumed_chars_base
;
1209 coding
->consumed
= src_base
- coding
->source
;
1210 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1215 encode_coding_utf_8 (coding
)
1216 struct coding_system
*coding
;
1218 int multibytep
= coding
->dst_multibyte
;
1219 int *charbuf
= coding
->charbuf
;
1220 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1221 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1222 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1223 int produced_chars
= 0;
1228 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1230 while (charbuf
< charbuf_end
)
1232 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1234 ASSURE_DESTINATION (safe_room
);
1236 if (CHAR_BYTE8_P (c
))
1238 c
= CHAR_TO_BYTE8 (c
);
1243 CHAR_STRING_ADVANCE (c
, pend
);
1244 for (p
= str
; p
< pend
; p
++)
1251 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1253 while (charbuf
< charbuf_end
)
1255 ASSURE_DESTINATION (safe_room
);
1257 dst
+= CHAR_STRING (c
, dst
);
1261 coding
->result
= CODING_RESULT_SUCCESS
;
1262 coding
->produced_char
+= produced_chars
;
1263 coding
->produced
= dst
- coding
->destination
;
1268 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1269 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1270 Little Endian (otherwise). If it is, return
1271 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1274 #define UTF_16_HIGH_SURROGATE_P(val) \
1275 (((val) & 0xFC00) == 0xD800)
1277 #define UTF_16_LOW_SURROGATE_P(val) \
1278 (((val) & 0xFC00) == 0xDC00)
1280 #define UTF_16_INVALID_P(val) \
1281 (((val) == 0xFFFE) \
1282 || ((val) == 0xFFFF) \
1283 || UTF_16_LOW_SURROGATE_P (val))
1287 detect_coding_utf_16 (coding
, mask
)
1288 struct coding_system
*coding
;
1291 unsigned char *src
= coding
->source
, *src_base
= src
;
1292 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1293 int multibytep
= coding
->src_multibyte
;
1294 int consumed_chars
= 0;
1297 *mask
&= ~CATEGORY_MASK_UTF_16
;
1302 if ((c1
== 0xFF) && (c2
== 0xFE))
1303 *mask
|= CATEGORY_MASK_UTF_16_LE
;
1304 else if ((c1
== 0xFE) && (c2
== 0xFF))
1305 *mask
|= CATEGORY_MASK_UTF_16_BE
;
1307 *mask
|= CATEGORY_MASK_UTF_16_BE_NOSIG
| CATEGORY_MASK_UTF_16_LE_NOSIG
;
1315 decode_coding_utf_16 (coding
)
1316 struct coding_system
*coding
;
1318 unsigned char *src
= coding
->source
+ coding
->consumed
;
1319 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1320 unsigned char *src_base
;
1321 int *charbuf
= coding
->charbuf
;
1322 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1323 int consumed_chars
= 0, consumed_chars_base
;
1324 int multibytep
= coding
->src_multibyte
;
1325 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1326 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1327 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1328 Lisp_Object attr
, eol_type
, charset_list
;
1330 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1332 if (bom
!= utf_16_without_bom
)
1340 if (bom
== utf_16_with_bom
)
1342 if (endian
== utf_16_big_endian
1343 ? c
!= 0xFFFE : c
!= 0xFEFF)
1345 /* We are sure that there's enouph room at CHARBUF. */
1354 CODING_UTF_16_ENDIAN (coding
)
1355 = endian
= utf_16_big_endian
;
1356 else if (c
== 0xFEFF)
1357 CODING_UTF_16_ENDIAN (coding
)
1358 = endian
= utf_16_little_endian
;
1361 CODING_UTF_16_ENDIAN (coding
)
1362 = endian
= utf_16_big_endian
;
1366 CODING_UTF_16_BOM (coding
) = utf_16_with_bom
;
1374 consumed_chars_base
= consumed_chars
;
1376 if (charbuf
+ 2 >= charbuf_end
)
1381 c
= (endian
== utf_16_big_endian
1382 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1385 if (! UTF_16_LOW_SURROGATE_P (c
))
1387 if (endian
== utf_16_big_endian
)
1388 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1390 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1394 if (UTF_16_HIGH_SURROGATE_P (c
))
1395 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1401 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1402 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1408 if (UTF_16_HIGH_SURROGATE_P (c
))
1409 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1416 coding
->consumed_char
+= consumed_chars_base
;
1417 coding
->consumed
= src_base
- coding
->source
;
1418 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1422 encode_coding_utf_16 (coding
)
1423 struct coding_system
*coding
;
1425 int multibytep
= coding
->dst_multibyte
;
1426 int *charbuf
= coding
->charbuf
;
1427 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1428 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1429 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1431 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1432 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1433 int produced_chars
= 0;
1434 Lisp_Object attrs
, eol_type
, charset_list
;
1437 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1439 if (bom
== utf_16_with_bom
)
1441 ASSURE_DESTINATION (safe_room
);
1443 EMIT_TWO_BYTES (0xFF, 0xFE);
1445 EMIT_TWO_BYTES (0xFE, 0xFF);
1446 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1449 while (charbuf
< charbuf_end
)
1451 ASSURE_DESTINATION (safe_room
);
1453 if (c
>= MAX_UNICODE_CHAR
)
1454 c
= coding
->default_char
;
1459 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1461 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1468 c1
= (c
>> 10) + 0xD800;
1469 c2
= (c
& 0x3FF) + 0xDC00;
1471 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1473 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1476 coding
->result
= CODING_RESULT_SUCCESS
;
1477 coding
->produced
= dst
- coding
->destination
;
1478 coding
->produced_char
+= produced_chars
;
1483 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1485 /* Emacs' internal format for representation of multiple character
1486 sets is a kind of multi-byte encoding, i.e. characters are
1487 represented by variable-length sequences of one-byte codes.
1489 ASCII characters and control characters (e.g. `tab', `newline') are
1490 represented by one-byte sequences which are their ASCII codes, in
1491 the range 0x00 through 0x7F.
1493 8-bit characters of the range 0x80..0x9F are represented by
1494 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1497 8-bit characters of the range 0xA0..0xFF are represented by
1498 one-byte sequences which are their 8-bit code.
1500 The other characters are represented by a sequence of `base
1501 leading-code', optional `extended leading-code', and one or two
1502 `position-code's. The length of the sequence is determined by the
1503 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1504 whereas extended leading-code and position-code take the range 0xA0
1505 through 0xFF. See `charset.h' for more details about leading-code
1508 --- CODE RANGE of Emacs' internal format ---
1512 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1513 eight-bit-graphic 0xA0..0xBF
1514 ELSE 0x81..0x9D + [0xA0..0xFF]+
1515 ---------------------------------------------
1517 As this is the internal character representation, the format is
1518 usually not used externally (i.e. in a file or in a data sent to a
1519 process). But, it is possible to have a text externally in this
1520 format (i.e. by encoding by the coding system `emacs-mule').
1522 In that case, a sequence of one-byte codes has a slightly different
1525 At first, all characters in eight-bit-control are represented by
1526 one-byte sequences which are their 8-bit code.
1528 Next, character composition data are represented by the byte
1529 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1531 METHOD is 0xF0 plus one of composition method (enum
1532 composition_method),
1534 BYTES is 0xA0 plus a byte length of this composition data,
1536 CHARS is 0x20 plus a number of characters composed by this
1539 COMPONENTs are characters of multibye form or composition
1540 rules encoded by two-byte of ASCII codes.
1542 In addition, for backward compatibility, the following formats are
1543 also recognized as composition data on decoding.
1546 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1549 MSEQ is a multibyte form but in these special format:
1550 ASCII: 0xA0 ASCII_CODE+0x80,
1551 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1552 RULE is a one byte code of the range 0xA0..0xF0 that
1553 represents a composition rule.
1556 char emacs_mule_bytes
[256];
1559 emacs_mule_char (coding
, src
, nbytes
, nchars
)
1560 struct coding_system
*coding
;
1562 int *nbytes
, *nchars
;
1564 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1565 int multibytep
= coding
->src_multibyte
;
1566 unsigned char *src_base
= src
;
1567 struct charset
*charset
;
1570 int consumed_chars
= 0;
1573 switch (emacs_mule_bytes
[c
])
1576 if (! (charset
= emacs_mule_charset
[c
]))
1583 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1584 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1587 if (! (charset
= emacs_mule_charset
[c
]))
1594 if (! (charset
= emacs_mule_charset
[c
]))
1597 code
= (c
& 0x7F) << 8;
1605 if (! (charset
= emacs_mule_charset
[c
]))
1608 code
= (c
& 0x7F) << 8;
1615 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1616 ? charset_ascii
: charset_eight_bit
);
1622 c
= DECODE_CHAR (charset
, code
);
1625 *nbytes
= src
- src_base
;
1626 *nchars
= consumed_chars
;
1637 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1638 Check if a text is encoded in `emacs-mule'. */
1641 detect_coding_emacs_mule (coding
, mask
)
1642 struct coding_system
*coding
;
1645 unsigned char *src
= coding
->source
, *src_base
= src
;
1646 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1647 int multibytep
= coding
->src_multibyte
;
1648 int consumed_chars
= 0;
1653 /* A coding system of this category is always ASCII compatible. */
1654 src
+= coding
->head_ascii
;
1664 /* Perhaps the start of composite character. We simple skip
1665 it because analyzing it is too heavy for detecting. But,
1666 at least, we check that the composite character
1667 constitues of more than 4 bytes. */
1668 unsigned char *src_base
;
1678 if (src
- src_base
<= 4)
1688 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1693 unsigned char *src_base
= src
- 1;
1700 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1705 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1709 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1711 *mask
&= ~CATEGORY_MASK_EMACS_MULE
;
1718 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1720 /* Decode a character represented as a component of composition
1721 sequence of Emacs 20/21 style at SRC. Set C to that character and
1722 update SRC to the head of next character (or an encoded composition
1723 rule). If SRC doesn't points a composition component, set C to -1.
1724 If SRC points an invalid byte sequence, global exit by a return
1727 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1731 int nbytes, nchars; \
1733 if (src == src_end) \
1735 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1740 goto invalid_code; \
1744 consumed_chars += nchars; \
1749 /* Decode a composition rule represented as a component of composition
1750 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1751 and increment BUF. If SRC points an invalid byte sequence, set C
1754 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1756 int c, gref, nref; \
1758 if (src >= src_end) \
1759 goto invalid_code; \
1760 ONE_MORE_BYTE_NO_CHECK (c); \
1762 if (c < 0 || c >= 81) \
1763 goto invalid_code; \
1765 gref = c / 9, nref = c % 9; \
1766 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1770 /* Decode a composition rule represented as a component of composition
1771 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1772 and increment BUF. If SRC points an invalid byte sequence, set C
1775 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1779 if (src + 1>= src_end) \
1780 goto invalid_code; \
1781 ONE_MORE_BYTE_NO_CHECK (gref); \
1783 ONE_MORE_BYTE_NO_CHECK (nref); \
1785 if (gref < 0 || gref >= 81 \
1786 || nref < 0 || nref >= 81) \
1787 goto invalid_code; \
1788 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1792 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1795 *buf++ = coding->produced_char + char_offset; \
1796 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1802 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1804 /* Emacs 21 style format. The first three bytes at SRC are \
1805 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1806 the byte length of this composition information, CHARS is the \
1807 number of characters composed by this composition. */ \
1808 enum composition_method method = c - 0xF2; \
1809 int *charbuf_base = charbuf; \
1810 int consumed_chars_limit; \
1811 int nbytes, nchars; \
1813 ONE_MORE_BYTE (c); \
1814 nbytes = c - 0xA0; \
1816 goto invalid_code; \
1817 ONE_MORE_BYTE (c); \
1818 nchars = c - 0xA0; \
1819 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1820 consumed_chars_limit = consumed_chars_base + nbytes; \
1821 if (method != COMPOSITION_RELATIVE) \
1824 while (consumed_chars < consumed_chars_limit) \
1826 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1827 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1829 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1832 if (consumed_chars < consumed_chars_limit) \
1833 goto invalid_code; \
1834 charbuf_base[0] -= i; \
1839 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1841 /* Emacs 20 style format for relative composition. */ \
1842 /* Store multibyte form of characters to be composed. */ \
1843 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1844 int *buf = components; \
1848 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1849 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1850 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1852 goto invalid_code; \
1853 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1854 for (j = 0; j < i; j++) \
1855 *charbuf++ = components[j]; \
1859 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1861 /* Emacs 20 style format for rule-base composition. */ \
1862 /* Store multibyte form of characters to be composed. */ \
1863 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1864 int *buf = components; \
1867 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1868 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1870 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1871 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1873 if (i < 1 || (buf - components) % 2 == 0) \
1874 goto invalid_code; \
1875 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1876 goto no_more_source; \
1877 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1878 for (j = 0; j < i; j++) \
1879 *charbuf++ = components[j]; \
1880 for (j = 0; j < i; j += 2) \
1881 *charbuf++ = components[j]; \
1886 decode_coding_emacs_mule (coding
)
1887 struct coding_system
*coding
;
1889 unsigned char *src
= coding
->source
+ coding
->consumed
;
1890 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1891 unsigned char *src_base
;
1892 int *charbuf
= coding
->charbuf
;
1893 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1894 int consumed_chars
= 0, consumed_chars_base
;
1895 int char_offset
= 0;
1896 int multibytep
= coding
->src_multibyte
;
1897 Lisp_Object attrs
, eol_type
, charset_list
;
1899 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1906 consumed_chars_base
= consumed_chars
;
1908 if (charbuf
>= charbuf_end
)
1917 if (EQ (eol_type
, Qdos
))
1920 goto no_more_source
;
1924 else if (EQ (eol_type
, Qmac
))
1932 if (charbuf
+ 5 + (MAX_COMPOSITION_COMPONENTS
* 2) - 1 > charbuf_end
)
1935 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1936 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1937 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1939 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1941 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1944 coding
->annotated
= 1;
1946 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1950 consumed_chars
= consumed_chars_base
;
1951 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
);
1960 consumed_chars
+= nchars
;
1967 consumed_chars
= consumed_chars_base
;
1969 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1974 coding
->consumed_char
+= consumed_chars_base
;
1975 coding
->consumed
= src_base
- coding
->source
;
1976 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1980 #define EMACS_MULE_LEADING_CODES(id, codes) \
1983 codes[0] = id, codes[1] = 0; \
1984 else if (id < 0xE0) \
1985 codes[0] = 0x9A, codes[1] = id; \
1986 else if (id < 0xF0) \
1987 codes[0] = 0x9B, codes[1] = id; \
1988 else if (id < 0xF5) \
1989 codes[0] = 0x9C, codes[1] = id; \
1991 codes[0] = 0x9D, codes[1] = id; \
1996 encode_coding_emacs_mule (coding
)
1997 struct coding_system
*coding
;
1999 int multibytep
= coding
->dst_multibyte
;
2000 int *charbuf
= coding
->charbuf
;
2001 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2002 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2003 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2005 int produced_chars
= 0;
2006 Lisp_Object attrs
, eol_type
, charset_list
;
2009 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2011 while (charbuf
< charbuf_end
)
2013 ASSURE_DESTINATION (safe_room
);
2015 if (ASCII_CHAR_P (c
))
2016 EMIT_ONE_ASCII_BYTE (c
);
2017 else if (CHAR_BYTE8_P (c
))
2019 c
= CHAR_TO_BYTE8 (c
);
2024 struct charset
*charset
;
2028 unsigned char leading_codes
[2];
2030 charset
= char_charset (c
, charset_list
, &code
);
2033 c
= coding
->default_char
;
2034 if (ASCII_CHAR_P (c
))
2036 EMIT_ONE_ASCII_BYTE (c
);
2039 charset
= char_charset (c
, charset_list
, &code
);
2041 dimension
= CHARSET_DIMENSION (charset
);
2042 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2043 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2044 EMIT_ONE_BYTE (leading_codes
[0]);
2045 if (leading_codes
[1])
2046 EMIT_ONE_BYTE (leading_codes
[1]);
2048 EMIT_ONE_BYTE (code
);
2051 EMIT_ONE_BYTE (code
>> 8);
2052 EMIT_ONE_BYTE (code
& 0xFF);
2056 coding
->result
= CODING_RESULT_SUCCESS
;
2057 coding
->produced_char
+= produced_chars
;
2058 coding
->produced
= dst
- coding
->destination
;
2063 /*** 7. ISO2022 handlers ***/
2065 /* The following note describes the coding system ISO2022 briefly.
2066 Since the intention of this note is to help understand the
2067 functions in this file, some parts are NOT ACCURATE or are OVERLY
2068 SIMPLIFIED. For thorough understanding, please refer to the
2069 original document of ISO2022. This is equivalent to the standard
2070 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2072 ISO2022 provides many mechanisms to encode several character sets
2073 in 7-bit and 8-bit environments. For 7-bit environments, all text
2074 is encoded using bytes less than 128. This may make the encoded
2075 text a little bit longer, but the text passes more easily through
2076 several types of gateway, some of which strip off the MSB (Most
2079 There are two kinds of character sets: control character sets and
2080 graphic character sets. The former contain control characters such
2081 as `newline' and `escape' to provide control functions (control
2082 functions are also provided by escape sequences). The latter
2083 contain graphic characters such as 'A' and '-'. Emacs recognizes
2084 two control character sets and many graphic character sets.
2086 Graphic character sets are classified into one of the following
2087 four classes, according to the number of bytes (DIMENSION) and
2088 number of characters in one dimension (CHARS) of the set:
2089 - DIMENSION1_CHARS94
2090 - DIMENSION1_CHARS96
2091 - DIMENSION2_CHARS94
2092 - DIMENSION2_CHARS96
2094 In addition, each character set is assigned an identification tag,
2095 unique for each set, called the "final character" (denoted as <F>
2096 hereafter). The <F> of each character set is decided by ECMA(*)
2097 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2098 (0x30..0x3F are for private use only).
2100 Note (*): ECMA = European Computer Manufacturers Association
2102 Here are examples of graphic character sets [NAME(<F>)]:
2103 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2104 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2105 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2106 o DIMENSION2_CHARS96 -- none for the moment
2108 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2109 C0 [0x00..0x1F] -- control character plane 0
2110 GL [0x20..0x7F] -- graphic character plane 0
2111 C1 [0x80..0x9F] -- control character plane 1
2112 GR [0xA0..0xFF] -- graphic character plane 1
2114 A control character set is directly designated and invoked to C0 or
2115 C1 by an escape sequence. The most common case is that:
2116 - ISO646's control character set is designated/invoked to C0, and
2117 - ISO6429's control character set is designated/invoked to C1,
2118 and usually these designations/invocations are omitted in encoded
2119 text. In a 7-bit environment, only C0 can be used, and a control
2120 character for C1 is encoded by an appropriate escape sequence to
2121 fit into the environment. All control characters for C1 are
2122 defined to have corresponding escape sequences.
2124 A graphic character set is at first designated to one of four
2125 graphic registers (G0 through G3), then these graphic registers are
2126 invoked to GL or GR. These designations and invocations can be
2127 done independently. The most common case is that G0 is invoked to
2128 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2129 these invocations and designations are omitted in encoded text.
2130 In a 7-bit environment, only GL can be used.
2132 When a graphic character set of CHARS94 is invoked to GL, codes
2133 0x20 and 0x7F of the GL area work as control characters SPACE and
2134 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2137 There are two ways of invocation: locking-shift and single-shift.
2138 With locking-shift, the invocation lasts until the next different
2139 invocation, whereas with single-shift, the invocation affects the
2140 following character only and doesn't affect the locking-shift
2141 state. Invocations are done by the following control characters or
2144 ----------------------------------------------------------------------
2145 abbrev function cntrl escape seq description
2146 ----------------------------------------------------------------------
2147 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2148 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2149 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2150 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2151 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2152 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2153 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2154 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2155 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2156 ----------------------------------------------------------------------
2157 (*) These are not used by any known coding system.
2159 Control characters for these functions are defined by macros
2160 ISO_CODE_XXX in `coding.h'.
2162 Designations are done by the following escape sequences:
2163 ----------------------------------------------------------------------
2164 escape sequence description
2165 ----------------------------------------------------------------------
2166 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2167 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2168 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2169 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2170 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2171 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2172 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2173 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2174 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2175 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2176 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2177 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2178 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2179 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2180 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2181 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2182 ----------------------------------------------------------------------
2184 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2185 of dimension 1, chars 94, and final character <F>, etc...
2187 Note (*): Although these designations are not allowed in ISO2022,
2188 Emacs accepts them on decoding, and produces them on encoding
2189 CHARS96 character sets in a coding system which is characterized as
2190 7-bit environment, non-locking-shift, and non-single-shift.
2192 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2193 '(' must be omitted. We refer to this as "short-form" hereafter.
2195 Now you may notice that there are a lot of ways of encoding the
2196 same multilingual text in ISO2022. Actually, there exist many
2197 coding systems such as Compound Text (used in X11's inter client
2198 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2199 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2200 localized platforms), and all of these are variants of ISO2022.
2202 In addition to the above, Emacs handles two more kinds of escape
2203 sequences: ISO6429's direction specification and Emacs' private
2204 sequence for specifying character composition.
2206 ISO6429's direction specification takes the following form:
2207 o CSI ']' -- end of the current direction
2208 o CSI '0' ']' -- end of the current direction
2209 o CSI '1' ']' -- start of left-to-right text
2210 o CSI '2' ']' -- start of right-to-left text
2211 The control character CSI (0x9B: control sequence introducer) is
2212 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2214 Character composition specification takes the following form:
2215 o ESC '0' -- start relative composition
2216 o ESC '1' -- end composition
2217 o ESC '2' -- start rule-base composition (*)
2218 o ESC '3' -- start relative composition with alternate chars (**)
2219 o ESC '4' -- start rule-base composition with alternate chars (**)
2220 Since these are not standard escape sequences of any ISO standard,
2221 the use of them with these meanings is restricted to Emacs only.
2223 (*) This form is used only in Emacs 20.7 and older versions,
2224 but newer versions can safely decode it.
2225 (**) This form is used only in Emacs 21.1 and newer versions,
2226 and older versions can't decode it.
2228 Here's a list of example usages of these composition escape
2229 sequences (categorized by `enum composition_method').
2231 COMPOSITION_RELATIVE:
2232 ESC 0 CHAR [ CHAR ] ESC 1
2233 COMPOSITION_WITH_RULE:
2234 ESC 2 CHAR [ RULE CHAR ] ESC 1
2235 COMPOSITION_WITH_ALTCHARS:
2236 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2237 COMPOSITION_WITH_RULE_ALTCHARS:
2238 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2240 enum iso_code_class_type iso_code_class
[256];
2242 #define SAFE_CHARSET_P(coding, id) \
2243 ((id) <= (coding)->max_charset_id \
2244 && (coding)->safe_charsets[id] >= 0)
2247 #define SHIFT_OUT_OK(category) \
2248 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2251 setup_iso_safe_charsets (attrs
)
2254 Lisp_Object charset_list
, safe_charsets
;
2255 Lisp_Object request
;
2256 Lisp_Object reg_usage
;
2259 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2262 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2263 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2264 && ! EQ (charset_list
, Viso_2022_charset_list
))
2266 CODING_ATTR_CHARSET_LIST (attrs
)
2267 = charset_list
= Viso_2022_charset_list
;
2268 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2271 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2275 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2277 int id
= XINT (XCAR (tail
));
2278 if (max_charset_id
< id
)
2279 max_charset_id
= id
;
2282 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2284 request
= AREF (attrs
, coding_attr_iso_request
);
2285 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2286 reg94
= XINT (XCAR (reg_usage
));
2287 reg96
= XINT (XCDR (reg_usage
));
2289 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2293 struct charset
*charset
;
2296 charset
= CHARSET_FROM_ID (XINT (id
));
2297 reg
= Fcdr (Fassq (id
, request
));
2299 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2300 else if (charset
->iso_chars_96
)
2303 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2308 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2311 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2315 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2316 Check if a text is encoded in ISO2022. If it is, returns an
2317 integer in which appropriate flag bits any of:
2319 CATEGORY_MASK_ISO_7_TIGHT
2320 CATEGORY_MASK_ISO_8_1
2321 CATEGORY_MASK_ISO_8_2
2322 CATEGORY_MASK_ISO_7_ELSE
2323 CATEGORY_MASK_ISO_8_ELSE
2324 are set. If a code which should never appear in ISO2022 is found,
2328 detect_coding_iso_2022 (coding
, mask
)
2329 struct coding_system
*coding
;
2332 unsigned char *src
= coding
->source
, *src_base
= src
;
2333 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2334 int multibytep
= coding
->src_multibyte
;
2335 int mask_iso
= CATEGORY_MASK_ISO
;
2336 int mask_found
= 0, mask_8bit_found
= 0;
2337 int reg
[4], shift_out
= 0, single_shifting
= 0;
2340 int consumed_chars
= 0;
2343 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2345 struct coding_system
*this = &(coding_categories
[i
]);
2346 Lisp_Object attrs
, val
;
2348 attrs
= CODING_ID_ATTRS (this->id
);
2349 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2350 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2351 setup_iso_safe_charsets (attrs
);
2352 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2353 this->max_charset_id
= XSTRING (val
)->size
- 1;
2354 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2357 /* A coding system of this category is always ASCII compatible. */
2358 src
+= coding
->head_ascii
;
2360 reg
[0] = charset_ascii
, reg
[1] = reg
[2] = reg
[3] = -1;
2361 while (mask_iso
&& src
< src_end
)
2367 if (inhibit_iso_escape_detection
)
2369 single_shifting
= 0;
2371 if (c
>= '(' && c
<= '/')
2373 /* Designation sequence for a charset of dimension 1. */
2375 if (c1
< ' ' || c1
>= 0x80
2376 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2377 /* Invalid designation sequence. Just ignore. */
2379 reg
[(c
- '(') % 4] = id
;
2383 /* Designation sequence for a charset of dimension 2. */
2385 if (c
>= '@' && c
<= 'B')
2386 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2387 reg
[0] = id
= iso_charset_table
[1][0][c
];
2388 else if (c
>= '(' && c
<= '/')
2391 if (c1
< ' ' || c1
>= 0x80
2392 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2393 /* Invalid designation sequence. Just ignore. */
2395 reg
[(c
- '(') % 4] = id
;
2398 /* Invalid designation sequence. Just ignore. */
2401 else if (c
== 'N' || c
== 'O')
2403 /* ESC <Fe> for SS2 or SS3. */
2404 mask_iso
&= CATEGORY_MASK_ISO_7_ELSE
;
2407 else if (c
>= '0' && c
<= '4')
2409 /* ESC <Fp> for start/end composition. */
2410 mask_found
|= CATEGORY_MASK_ISO
;
2415 /* Invalid escape sequence. */
2416 mask_iso
&= ~CATEGORY_MASK_ISO_ESCAPE
;
2420 /* We found a valid designation sequence for CHARSET. */
2421 mask_iso
&= ~CATEGORY_MASK_ISO_8BIT
;
2422 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2424 mask_found
|= CATEGORY_MASK_ISO_7
;
2426 mask_iso
&= ~CATEGORY_MASK_ISO_7
;
2427 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2429 mask_found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2431 mask_iso
&= ~CATEGORY_MASK_ISO_7_TIGHT
;
2432 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2434 mask_found
|= CATEGORY_MASK_ISO_7_ELSE
;
2436 mask_iso
&= ~CATEGORY_MASK_ISO_7_ELSE
;
2437 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2439 mask_found
|= CATEGORY_MASK_ISO_8_ELSE
;
2441 mask_iso
&= ~CATEGORY_MASK_ISO_8_ELSE
;
2445 if (inhibit_iso_escape_detection
)
2447 single_shifting
= 0;
2450 || SHIFT_OUT_OK (coding_category_iso_7_else
)
2451 || SHIFT_OUT_OK (coding_category_iso_8_else
)))
2453 /* Locking shift out. */
2454 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2455 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2460 if (inhibit_iso_escape_detection
)
2462 single_shifting
= 0;
2465 /* Locking shift in. */
2466 mask_iso
&= ~CATEGORY_MASK_ISO_7BIT
;
2467 mask_found
|= CATEGORY_MASK_ISO_ELSE
;
2472 single_shifting
= 0;
2476 int newmask
= CATEGORY_MASK_ISO_8_ELSE
;
2478 mask_8bit_found
= 1;
2479 if (inhibit_iso_escape_detection
)
2481 if (c
!= ISO_CODE_CSI
)
2483 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2484 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2485 newmask
|= CATEGORY_MASK_ISO_8_1
;
2486 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2487 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2488 newmask
|= CATEGORY_MASK_ISO_8_2
;
2489 single_shifting
= 1;
2491 if (VECTORP (Vlatin_extra_code_table
)
2492 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2494 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2495 & CODING_ISO_FLAG_LATIN_EXTRA
)
2496 newmask
|= CATEGORY_MASK_ISO_8_1
;
2497 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2498 & CODING_ISO_FLAG_LATIN_EXTRA
)
2499 newmask
|= CATEGORY_MASK_ISO_8_2
;
2501 mask_iso
&= newmask
;
2502 mask_found
|= newmask
;
2509 single_shifting
= 0;
2514 single_shifting
= 0;
2515 mask_8bit_found
= 1;
2516 if (VECTORP (Vlatin_extra_code_table
)
2517 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2521 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2522 & CODING_ISO_FLAG_LATIN_EXTRA
)
2523 newmask
|= CATEGORY_MASK_ISO_8_1
;
2524 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2525 & CODING_ISO_FLAG_LATIN_EXTRA
)
2526 newmask
|= CATEGORY_MASK_ISO_8_2
;
2527 mask_iso
&= newmask
;
2528 mask_found
|= newmask
;
2535 mask_iso
&= ~(CATEGORY_MASK_ISO_7BIT
2536 | CATEGORY_MASK_ISO_7_ELSE
);
2537 mask_found
|= CATEGORY_MASK_ISO_8_1
;
2538 mask_8bit_found
= 1;
2539 /* Check the length of succeeding codes of the range
2540 0xA0..0FF. If the byte length is odd, we exclude
2541 CATEGORY_MASK_ISO_8_2. We can check this only
2542 when we are not single shifting. */
2543 if (!single_shifting
2544 && mask_iso
& CATEGORY_MASK_ISO_8_2
)
2547 while (src
< src_end
)
2555 if (i
& 1 && src
< src_end
)
2556 mask_iso
&= ~CATEGORY_MASK_ISO_8_2
;
2558 mask_found
|= CATEGORY_MASK_ISO_8_2
;
2567 *mask
&= ~CATEGORY_MASK_ISO
;
2572 *mask
&= ~CATEGORY_MASK_ISO
;
2573 *mask
|= mask_iso
& mask_found
;
2574 if (! mask_8bit_found
)
2575 *mask
&= ~(CATEGORY_MASK_ISO_8BIT
| CATEGORY_MASK_ISO_8_ELSE
);
2580 /* Set designation state into CODING. */
2581 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2585 if (final < '0' || final >= 128 \
2586 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2587 || !SAFE_CHARSET_P (coding, id)) \
2589 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2590 goto invalid_code; \
2592 prev = CODING_ISO_DESIGNATION (coding, reg); \
2593 if (id == charset_jisx0201_roman) \
2595 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2596 id = charset_ascii; \
2598 else if (id == charset_jisx0208_1978) \
2600 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2601 id = charset_jisx0208; \
2603 CODING_ISO_DESIGNATION (coding, reg) = id; \
2604 /* If there was an invalid designation to REG previously, and this \
2605 designation is ASCII to REG, we should keep this designation \
2607 if (prev == -2 && id == charset_ascii) \
2608 goto invalid_code; \
2612 #define MAYBE_FINISH_COMPOSITION() \
2615 if (composition_state == COMPOSING_NO) \
2617 /* It is assured that we have enough room for producing \
2618 characters stored in the table `components'. */ \
2619 if (charbuf + component_idx > charbuf_end) \
2620 goto no_more_source; \
2621 composition_state = COMPOSING_NO; \
2622 if (method == COMPOSITION_RELATIVE \
2623 || method == COMPOSITION_WITH_ALTCHARS) \
2625 for (i = 0; i < component_idx; i++) \
2626 *charbuf++ = components[i]; \
2627 char_offset += component_idx; \
2631 for (i = 0; i < component_idx; i += 2) \
2632 *charbuf++ = components[i]; \
2633 char_offset += (component_idx / 2) + 1; \
2638 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2639 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2640 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2641 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2642 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2645 #define DECODE_COMPOSITION_START(c1) \
2648 && composition_state == COMPOSING_COMPONENT_RULE) \
2650 component_len = component_idx; \
2651 composition_state = COMPOSING_CHAR; \
2657 MAYBE_FINISH_COMPOSITION (); \
2658 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2659 goto no_more_source; \
2660 for (p = src; p < src_end - 1; p++) \
2661 if (*p == ISO_CODE_ESC && p[1] == '1') \
2663 if (p == src_end - 1) \
2665 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2666 goto invalid_code; \
2667 goto no_more_source; \
2670 /* This is surely the start of a composition. */ \
2671 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2672 : c1 == '2' ? COMPOSITION_WITH_RULE \
2673 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2674 : COMPOSITION_WITH_RULE_ALTCHARS); \
2675 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2676 : COMPOSING_COMPONENT_CHAR); \
2677 component_idx = component_len = 0; \
2682 /* Handle compositoin end sequence ESC 1. */
2684 #define DECODE_COMPOSITION_END() \
2686 int nchars = (component_len > 0 ? component_idx - component_len \
2687 : method == COMPOSITION_RELATIVE ? component_idx \
2688 : (component_idx + 1) / 2); \
2690 int *saved_charbuf = charbuf; \
2692 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2693 if (method != COMPOSITION_RELATIVE) \
2695 if (component_len == 0) \
2696 for (i = 0; i < component_idx; i++) \
2697 *charbuf++ = components[i]; \
2699 for (i = 0; i < component_len; i++) \
2700 *charbuf++ = components[i]; \
2701 *saved_charbuf = saved_charbuf - charbuf; \
2703 if (method == COMPOSITION_WITH_RULE) \
2704 for (i = 0; i < component_idx; i += 2, char_offset++) \
2705 *charbuf++ = components[i]; \
2707 for (i = component_len; i < component_idx; i++, char_offset++) \
2708 *charbuf++ = components[i]; \
2709 coding->annotated = 1; \
2710 composition_state = COMPOSING_NO; \
2714 /* Decode a composition rule from the byte C1 (and maybe one more byte
2715 from SRC) and store one encoded composition rule in
2716 coding->cmp_data. */
2718 #define DECODE_COMPOSITION_RULE(c1) \
2721 if (c1 < 81) /* old format (before ver.21) */ \
2723 int gref = (c1) / 9; \
2724 int nref = (c1) % 9; \
2725 if (gref == 4) gref = 10; \
2726 if (nref == 4) nref = 10; \
2727 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2729 else if (c1 < 93) /* new format (after ver.21) */ \
2731 ONE_MORE_BYTE (c2); \
2732 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2739 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2742 decode_coding_iso_2022 (coding
)
2743 struct coding_system
*coding
;
2745 unsigned char *src
= coding
->source
+ coding
->consumed
;
2746 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2747 unsigned char *src_base
;
2748 int *charbuf
= coding
->charbuf
;
2749 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- 4;
2750 int consumed_chars
= 0, consumed_chars_base
;
2751 int char_offset
= 0;
2752 int multibytep
= coding
->src_multibyte
;
2753 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2754 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2755 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2756 struct charset
*charset
;
2758 /* For handling composition sequence. */
2759 #define COMPOSING_NO 0
2760 #define COMPOSING_CHAR 1
2761 #define COMPOSING_RULE 2
2762 #define COMPOSING_COMPONENT_CHAR 3
2763 #define COMPOSING_COMPONENT_RULE 4
2765 int composition_state
= COMPOSING_NO
;
2766 enum composition_method method
;
2767 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2770 Lisp_Object attrs
, eol_type
, charset_list
;
2772 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2773 setup_iso_safe_charsets (attrs
);
2780 consumed_chars_base
= consumed_chars
;
2782 if (charbuf
>= charbuf_end
)
2787 /* We produce no character or one character. */
2788 switch (iso_code_class
[c1
])
2790 case ISO_0x20_or_0x7F
:
2791 if (composition_state
!= COMPOSING_NO
)
2793 if (composition_state
== COMPOSING_RULE
2794 || composition_state
== COMPOSING_COMPONENT_RULE
)
2796 DECODE_COMPOSITION_RULE (c1
);
2797 components
[component_idx
++] = c1
;
2798 composition_state
--;
2802 if (charset_id_0
< 0
2803 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2804 /* This is SPACE or DEL. */
2805 charset
= CHARSET_FROM_ID (charset_ascii
);
2807 charset
= CHARSET_FROM_ID (charset_id_0
);
2810 case ISO_graphic_plane_0
:
2811 if (composition_state
!= COMPOSING_NO
)
2813 if (composition_state
== COMPOSING_RULE
2814 || composition_state
== COMPOSING_COMPONENT_RULE
)
2816 DECODE_COMPOSITION_RULE (c1
);
2817 components
[component_idx
++] = c1
;
2818 composition_state
--;
2822 charset
= CHARSET_FROM_ID (charset_id_0
);
2825 case ISO_0xA0_or_0xFF
:
2826 if (charset_id_1
< 0
2827 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2828 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2830 /* This is a graphic character, we fall down ... */
2832 case ISO_graphic_plane_1
:
2833 if (charset_id_1
< 0)
2835 charset
= CHARSET_FROM_ID (charset_id_1
);
2838 case ISO_carriage_return
:
2841 if (EQ (eol_type
, Qdos
))
2844 goto no_more_source
;
2848 else if (EQ (eol_type
, Qmac
))
2854 MAYBE_FINISH_COMPOSITION ();
2855 charset
= CHARSET_FROM_ID (charset_ascii
);
2859 MAYBE_FINISH_COMPOSITION ();
2863 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2864 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2866 CODING_ISO_INVOCATION (coding
, 0) = 1;
2867 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2871 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2873 CODING_ISO_INVOCATION (coding
, 0) = 0;
2874 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2877 case ISO_single_shift_2_7
:
2878 case ISO_single_shift_2
:
2879 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2881 /* SS2 is handled as an escape sequence of ESC 'N' */
2883 goto label_escape_sequence
;
2885 case ISO_single_shift_3
:
2886 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2888 /* SS2 is handled as an escape sequence of ESC 'O' */
2890 goto label_escape_sequence
;
2892 case ISO_control_sequence_introducer
:
2893 /* CSI is handled as an escape sequence of ESC '[' ... */
2895 goto label_escape_sequence
;
2899 label_escape_sequence
:
2900 /* Escape sequences handled here are invocation,
2901 designation, direction specification, and character
2902 composition specification. */
2905 case '&': /* revision of following character set */
2907 if (!(c1
>= '@' && c1
<= '~'))
2910 if (c1
!= ISO_CODE_ESC
)
2913 goto label_escape_sequence
;
2915 case '$': /* designation of 2-byte character set */
2916 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2919 if (c1
>= '@' && c1
<= 'B')
2920 { /* designation of JISX0208.1978, GB2312.1980,
2922 DECODE_DESIGNATION (0, 2, 0, c1
);
2924 else if (c1
>= 0x28 && c1
<= 0x2B)
2925 { /* designation of DIMENSION2_CHARS94 character set */
2927 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2929 else if (c1
>= 0x2C && c1
<= 0x2F)
2930 { /* designation of DIMENSION2_CHARS96 character set */
2932 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2936 /* We must update these variables now. */
2937 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2938 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2941 case 'n': /* invocation of locking-shift-2 */
2942 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2943 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2945 CODING_ISO_INVOCATION (coding
, 0) = 2;
2946 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2949 case 'o': /* invocation of locking-shift-3 */
2950 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2951 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2953 CODING_ISO_INVOCATION (coding
, 0) = 3;
2954 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2957 case 'N': /* invocation of single-shift-2 */
2958 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2959 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2961 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
2963 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2967 case 'O': /* invocation of single-shift-3 */
2968 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
2969 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2971 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
2973 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
2977 case '0': case '2': case '3': case '4': /* start composition */
2978 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
2980 DECODE_COMPOSITION_START (c1
);
2983 case '1': /* end composition */
2984 if (composition_state
== COMPOSING_NO
)
2986 DECODE_COMPOSITION_END ();
2989 case '[': /* specification of direction */
2990 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
2992 /* For the moment, nested direction is not supported.
2993 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2994 left-to-right, and nozero means right-to-left. */
2998 case ']': /* end of the current direction */
2999 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3001 case '0': /* end of the current direction */
3002 case '1': /* start of left-to-right direction */
3005 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3010 case '2': /* start of right-to-left direction */
3013 coding
->mode
|= CODING_MODE_DIRECTION
;
3024 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3026 if (c1
>= 0x28 && c1
<= 0x2B)
3027 { /* designation of DIMENSION1_CHARS94 character set */
3029 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3031 else if (c1
>= 0x2C && c1
<= 0x2F)
3032 { /* designation of DIMENSION1_CHARS96 character set */
3034 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3038 /* We must update these variables now. */
3039 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3040 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3045 /* Now we know CHARSET and 1st position code C1 of a character.
3046 Produce a decoded character while getting 2nd position code
3049 if (CHARSET_DIMENSION (charset
) > 1)
3052 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3053 /* C2 is not in a valid range. */
3055 c1
= (c1
<< 8) | (c2
& 0x7F);
3056 if (CHARSET_DIMENSION (charset
) > 2)
3059 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3060 /* C2 is not in a valid range. */
3062 c1
= (c1
<< 8) | (c2
& 0x7F);
3066 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3069 MAYBE_FINISH_COMPOSITION ();
3070 for (; src_base
< src
; src_base
++, char_offset
++)
3072 if (ASCII_BYTE_P (*src_base
))
3073 *charbuf
++ = *src_base
;
3075 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3078 else if (composition_state
== COMPOSING_NO
)
3085 components
[component_idx
++] = c
;
3086 if (method
== COMPOSITION_WITH_RULE
3087 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3088 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3089 composition_state
++;
3094 MAYBE_FINISH_COMPOSITION ();
3096 consumed_chars
= consumed_chars_base
;
3098 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3103 coding
->consumed_char
+= consumed_chars_base
;
3104 coding
->consumed
= src_base
- coding
->source
;
3105 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3109 /* ISO2022 encoding stuff. */
3112 It is not enough to say just "ISO2022" on encoding, we have to
3113 specify more details. In Emacs, each coding system of ISO2022
3114 variant has the following specifications:
3115 1. Initial designation to G0 thru G3.
3116 2. Allows short-form designation?
3117 3. ASCII should be designated to G0 before control characters?
3118 4. ASCII should be designated to G0 at end of line?
3119 5. 7-bit environment or 8-bit environment?
3120 6. Use locking-shift?
3121 7. Use Single-shift?
3122 And the following two are only for Japanese:
3123 8. Use ASCII in place of JIS0201-1976-Roman?
3124 9. Use JISX0208-1983 in place of JISX0208-1978?
3125 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3126 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3130 /* Produce codes (escape sequence) for designating CHARSET to graphic
3131 register REG at DST, and increment DST. If <final-char> of CHARSET is
3132 '@', 'A', or 'B' and the coding system CODING allows, produce
3133 designation sequence of short-form. */
3135 #define ENCODE_DESIGNATION(charset, reg, coding) \
3137 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3138 char *intermediate_char_94 = "()*+"; \
3139 char *intermediate_char_96 = ",-./"; \
3140 int revision = -1; \
3143 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3144 revision = CHARSET_ISO_REVISION (charset); \
3146 if (revision >= 0) \
3148 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3149 EMIT_ONE_BYTE ('@' + revision); \
3151 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3152 if (CHARSET_DIMENSION (charset) == 1) \
3154 if (! CHARSET_ISO_CHARS_96 (charset)) \
3155 c = intermediate_char_94[reg]; \
3157 c = intermediate_char_96[reg]; \
3158 EMIT_ONE_ASCII_BYTE (c); \
3162 EMIT_ONE_ASCII_BYTE ('$'); \
3163 if (! CHARSET_ISO_CHARS_96 (charset)) \
3165 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3167 || final_char < '@' || final_char > 'B') \
3168 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3171 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3173 EMIT_ONE_ASCII_BYTE (final_char); \
3175 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3179 /* The following two macros produce codes (control character or escape
3180 sequence) for ISO2022 single-shift functions (single-shift-2 and
3183 #define ENCODE_SINGLE_SHIFT_2 \
3185 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3186 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3188 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3189 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3193 #define ENCODE_SINGLE_SHIFT_3 \
3195 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3196 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3198 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3199 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3203 /* The following four macros produce codes (control character or
3204 escape sequence) for ISO2022 locking-shift functions (shift-in,
3205 shift-out, locking-shift-2, and locking-shift-3). */
3207 #define ENCODE_SHIFT_IN \
3209 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3210 CODING_ISO_INVOCATION (coding, 0) = 0; \
3214 #define ENCODE_SHIFT_OUT \
3216 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3217 CODING_ISO_INVOCATION (coding, 0) = 1; \
3221 #define ENCODE_LOCKING_SHIFT_2 \
3223 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3224 CODING_ISO_INVOCATION (coding, 0) = 2; \
3228 #define ENCODE_LOCKING_SHIFT_3 \
3230 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3231 CODING_ISO_INVOCATION (coding, 0) = 3; \
3235 /* Produce codes for a DIMENSION1 character whose character set is
3236 CHARSET and whose position-code is C1. Designation and invocation
3237 sequences are also produced in advance if necessary. */
3239 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3241 int id = CHARSET_ID (charset); \
3243 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3244 && id == charset_ascii) \
3246 id = charset_jisx0201_roman; \
3247 charset = CHARSET_FROM_ID (id); \
3250 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3252 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3253 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3255 EMIT_ONE_BYTE (c1 | 0x80); \
3256 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3259 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3261 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3264 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3266 EMIT_ONE_BYTE (c1 | 0x80); \
3270 /* Since CHARSET is not yet invoked to any graphic planes, we \
3271 must invoke it, or, at first, designate it to some graphic \
3272 register. Then repeat the loop to actually produce the \
3274 dst = encode_invocation_designation (charset, coding, dst, \
3279 /* Produce codes for a DIMENSION2 character whose character set is
3280 CHARSET and whose position-codes are C1 and C2. Designation and
3281 invocation codes are also produced in advance if necessary. */
3283 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3285 int id = CHARSET_ID (charset); \
3287 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3288 && id == charset_jisx0208) \
3290 id = charset_jisx0208_1978; \
3291 charset = CHARSET_FROM_ID (id); \
3294 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3296 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3297 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3299 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3300 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3303 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3305 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3308 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3310 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3314 /* Since CHARSET is not yet invoked to any graphic planes, we \
3315 must invoke it, or, at first, designate it to some graphic \
3316 register. Then repeat the loop to actually produce the \
3318 dst = encode_invocation_designation (charset, coding, dst, \
3323 #define ENCODE_ISO_CHARACTER(charset, c) \
3325 int code = ENCODE_CHAR ((charset),(c)); \
3327 if (CHARSET_DIMENSION (charset) == 1) \
3328 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3330 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3334 /* Produce designation and invocation codes at a place pointed by DST
3335 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3339 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3340 struct charset
*charset
;
3341 struct coding_system
*coding
;
3345 int multibytep
= coding
->dst_multibyte
;
3346 int produced_chars
= *p_nchars
;
3347 int reg
; /* graphic register number */
3348 int id
= CHARSET_ID (charset
);
3350 /* At first, check designations. */
3351 for (reg
= 0; reg
< 4; reg
++)
3352 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3357 /* CHARSET is not yet designated to any graphic registers. */
3358 /* At first check the requested designation. */
3359 reg
= CODING_ISO_REQUEST (coding
, id
);
3361 /* Since CHARSET requests no special designation, designate it
3362 to graphic register 0. */
3365 ENCODE_DESIGNATION (charset
, reg
, coding
);
3368 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3369 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3371 /* Since the graphic register REG is not invoked to any graphic
3372 planes, invoke it to graphic plane 0. */
3375 case 0: /* graphic register 0 */
3379 case 1: /* graphic register 1 */
3383 case 2: /* graphic register 2 */
3384 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3385 ENCODE_SINGLE_SHIFT_2
;
3387 ENCODE_LOCKING_SHIFT_2
;
3390 case 3: /* graphic register 3 */
3391 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3392 ENCODE_SINGLE_SHIFT_3
;
3394 ENCODE_LOCKING_SHIFT_3
;
3399 *p_nchars
= produced_chars
;
3403 /* The following three macros produce codes for indicating direction
3405 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3407 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3408 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3410 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3414 #define ENCODE_DIRECTION_R2L() \
3416 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3417 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3421 #define ENCODE_DIRECTION_L2R() \
3423 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3424 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3428 /* Produce codes for designation and invocation to reset the graphic
3429 planes and registers to initial state. */
3430 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3433 struct charset *charset; \
3435 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3437 for (reg = 0; reg < 4; reg++) \
3438 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3439 && (CODING_ISO_DESIGNATION (coding, reg) \
3440 != CODING_ISO_INITIAL (coding, reg))) \
3442 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3443 ENCODE_DESIGNATION (charset, reg, coding); \
3448 /* Produce designation sequences of charsets in the line started from
3449 SRC to a place pointed by DST, and return updated DST.
3451 If the current block ends before any end-of-line, we may fail to
3452 find all the necessary designations. */
3454 static unsigned char *
3455 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3456 struct coding_system
*coding
;
3457 int *charbuf
, *charbuf_end
;
3460 struct charset
*charset
;
3461 /* Table of charsets to be designated to each graphic register. */
3463 int c
, found
= 0, reg
;
3464 int produced_chars
= 0;
3465 int multibytep
= coding
->dst_multibyte
;
3467 Lisp_Object charset_list
;
3469 attrs
= CODING_ID_ATTRS (coding
->id
);
3470 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3471 if (EQ (charset_list
, Qiso_2022
))
3472 charset_list
= Viso_2022_charset_list
;
3474 for (reg
= 0; reg
< 4; reg
++)
3484 charset
= char_charset (c
, charset_list
, NULL
);
3485 id
= CHARSET_ID (charset
);
3486 reg
= CODING_ISO_REQUEST (coding
, id
);
3487 if (reg
>= 0 && r
[reg
] < 0)
3496 for (reg
= 0; reg
< 4; reg
++)
3498 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3499 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3505 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3508 encode_coding_iso_2022 (coding
)
3509 struct coding_system
*coding
;
3511 int multibytep
= coding
->dst_multibyte
;
3512 int *charbuf
= coding
->charbuf
;
3513 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3514 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3515 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3518 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3519 && CODING_ISO_BOL (coding
));
3520 int produced_chars
= 0;
3521 Lisp_Object attrs
, eol_type
, charset_list
;
3522 int ascii_compatible
;
3525 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3526 setup_iso_safe_charsets (attrs
);
3527 coding
->safe_charsets
3528 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3530 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3532 while (charbuf
< charbuf_end
)
3534 ASSURE_DESTINATION (safe_room
);
3536 if (bol_designation
)
3538 unsigned char *dst_prev
= dst
;
3540 /* We have to produce designation sequences if any now. */
3541 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3542 bol_designation
= 0;
3543 /* We are sure that designation sequences are all ASCII bytes. */
3544 produced_chars
+= dst
- dst_prev
;
3549 /* Now encode the character C. */
3550 if (c
< 0x20 || c
== 0x7F)
3553 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3555 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3556 ENCODE_RESET_PLANE_AND_REGISTER ();
3557 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3561 for (i
= 0; i
< 4; i
++)
3562 CODING_ISO_DESIGNATION (coding
, i
)
3563 = CODING_ISO_INITIAL (coding
, i
);
3566 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3568 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3569 ENCODE_RESET_PLANE_AND_REGISTER ();
3570 EMIT_ONE_ASCII_BYTE (c
);
3572 else if (ASCII_CHAR_P (c
))
3574 if (ascii_compatible
)
3575 EMIT_ONE_ASCII_BYTE (c
);
3578 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3579 ENCODE_ISO_CHARACTER (charset
, c
);
3582 else if (CHAR_BYTE8_P (c
))
3584 c
= CHAR_TO_BYTE8 (c
);
3589 struct charset
*charset
= char_charset (c
, charset_list
, NULL
);
3593 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3595 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3596 charset
= CHARSET_FROM_ID (charset_ascii
);
3600 c
= coding
->default_char
;
3601 charset
= char_charset (c
, charset_list
, NULL
);
3604 ENCODE_ISO_CHARACTER (charset
, c
);
3608 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3609 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3611 ASSURE_DESTINATION (safe_room
);
3612 ENCODE_RESET_PLANE_AND_REGISTER ();
3614 coding
->result
= CODING_RESULT_SUCCESS
;
3615 CODING_ISO_BOL (coding
) = bol_designation
;
3616 coding
->produced_char
+= produced_chars
;
3617 coding
->produced
= dst
- coding
->destination
;
3622 /*** 8,9. SJIS and BIG5 handlers ***/
3624 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3625 quite widely. So, for the moment, Emacs supports them in the bare
3626 C code. But, in the future, they may be supported only by CCL. */
3628 /* SJIS is a coding system encoding three character sets: ASCII, right
3629 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3630 as is. A character of charset katakana-jisx0201 is encoded by
3631 "position-code + 0x80". A character of charset japanese-jisx0208
3632 is encoded in 2-byte but two position-codes are divided and shifted
3633 so that it fit in the range below.
3635 --- CODE RANGE of SJIS ---
3636 (character set) (range)
3638 KATAKANA-JISX0201 0xA0 .. 0xDF
3639 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3640 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3641 -------------------------------
3645 /* BIG5 is a coding system encoding two character sets: ASCII and
3646 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3647 character set and is encoded in two-byte.
3649 --- CODE RANGE of BIG5 ---
3650 (character set) (range)
3652 Big5 (1st byte) 0xA1 .. 0xFE
3653 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3654 --------------------------
3658 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3659 Check if a text is encoded in SJIS. If it is, return
3660 CATEGORY_MASK_SJIS, else return 0. */
3663 detect_coding_sjis (coding
, mask
)
3664 struct coding_system
*coding
;
3667 unsigned char *src
= coding
->source
, *src_base
= src
;
3668 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3669 int multibytep
= coding
->src_multibyte
;
3670 int consumed_chars
= 0;
3675 /* A coding system of this category is always ASCII compatible. */
3676 src
+= coding
->head_ascii
;
3685 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3688 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3692 else if (c
>= 0xA0 && c
< 0xE0)
3697 *mask
&= ~CATEGORY_MASK_SJIS
;
3701 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3703 *mask
&= ~CATEGORY_MASK_SJIS
;
3709 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3710 Check if a text is encoded in BIG5. If it is, return
3711 CATEGORY_MASK_BIG5, else return 0. */
3714 detect_coding_big5 (coding
, mask
)
3715 struct coding_system
*coding
;
3718 unsigned char *src
= coding
->source
, *src_base
= src
;
3719 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3720 int multibytep
= coding
->src_multibyte
;
3721 int consumed_chars
= 0;
3726 /* A coding system of this category is always ASCII compatible. */
3727 src
+= coding
->head_ascii
;
3739 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3746 *mask
&= ~CATEGORY_MASK_BIG5
;
3750 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3752 *mask
&= ~CATEGORY_MASK_BIG5
;
3758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3759 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3762 decode_coding_sjis (coding
)
3763 struct coding_system
*coding
;
3765 unsigned char *src
= coding
->source
+ coding
->consumed
;
3766 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3767 unsigned char *src_base
;
3768 int *charbuf
= coding
->charbuf
;
3769 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3770 int consumed_chars
= 0, consumed_chars_base
;
3771 int multibytep
= coding
->src_multibyte
;
3772 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3773 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3775 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3778 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3779 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3780 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3787 consumed_chars_base
= consumed_chars
;
3789 if (charbuf
>= charbuf_end
)
3796 if (EQ (eol_type
, Qdos
))
3799 goto no_more_source
;
3803 else if (EQ (eol_type
, Qmac
))
3808 struct charset
*charset
;
3811 charset
= charset_roman
;
3816 if (c
< 0xA0 || c
>= 0xE0)
3818 /* SJIS -> JISX0208 */
3820 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3824 charset
= charset_kanji
;
3828 /* SJIS -> JISX0201-Kana */
3830 charset
= charset_kana
;
3833 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3840 consumed_chars
= consumed_chars_base
;
3842 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3847 coding
->consumed_char
+= consumed_chars_base
;
3848 coding
->consumed
= src_base
- coding
->source
;
3849 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3853 decode_coding_big5 (coding
)
3854 struct coding_system
*coding
;
3856 unsigned char *src
= coding
->source
+ coding
->consumed
;
3857 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3858 unsigned char *src_base
;
3859 int *charbuf
= coding
->charbuf
;
3860 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
3861 int consumed_chars
= 0, consumed_chars_base
;
3862 int multibytep
= coding
->src_multibyte
;
3863 struct charset
*charset_roman
, *charset_big5
;
3864 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3866 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3868 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3869 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3876 consumed_chars_base
= consumed_chars
;
3878 if (charbuf
>= charbuf_end
)
3885 if (EQ (eol_type
, Qdos
))
3888 goto no_more_source
;
3892 else if (EQ (eol_type
, Qmac
))
3897 struct charset
*charset
;
3899 charset
= charset_roman
;
3903 if (c
< 0xA1 || c
> 0xFE)
3906 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
3909 charset
= charset_big5
;
3911 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
3919 consumed_chars
= consumed_chars_base
;
3921 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3926 coding
->consumed_char
+= consumed_chars_base
;
3927 coding
->consumed
= src_base
- coding
->source
;
3928 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3931 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3932 This function can encode charsets `ascii', `katakana-jisx0201',
3933 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3934 are sure that all these charsets are registered as official charset
3935 (i.e. do not have extended leading-codes). Characters of other
3936 charsets are produced without any encoding. If SJIS_P is 1, encode
3937 SJIS text, else encode BIG5 text. */
3940 encode_coding_sjis (coding
)
3941 struct coding_system
*coding
;
3943 int multibytep
= coding
->dst_multibyte
;
3944 int *charbuf
= coding
->charbuf
;
3945 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3946 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3947 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3949 int produced_chars
= 0;
3950 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3951 int ascii_compatible
;
3952 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3955 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3957 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3958 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3959 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3961 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3963 while (charbuf
< charbuf_end
)
3965 ASSURE_DESTINATION (safe_room
);
3967 /* Now encode the character C. */
3968 if (ASCII_CHAR_P (c
) && ascii_compatible
)
3969 EMIT_ONE_ASCII_BYTE (c
);
3970 else if (CHAR_BYTE8_P (c
))
3972 c
= CHAR_TO_BYTE8 (c
);
3978 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
3982 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3984 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3985 charset
= CHARSET_FROM_ID (charset_ascii
);
3989 c
= coding
->default_char
;
3990 charset
= char_charset (c
, charset_list
, &code
);
3993 if (code
== CHARSET_INVALID_CODE (charset
))
3995 if (charset
== charset_kanji
)
3999 c1
= code
>> 8, c2
= code
& 0xFF;
4000 EMIT_TWO_BYTES (c1
, c2
);
4002 else if (charset
== charset_kana
)
4003 EMIT_ONE_BYTE (code
| 0x80);
4005 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4008 coding
->result
= CODING_RESULT_SUCCESS
;
4009 coding
->produced_char
+= produced_chars
;
4010 coding
->produced
= dst
- coding
->destination
;
4015 encode_coding_big5 (coding
)
4016 struct coding_system
*coding
;
4018 int multibytep
= coding
->dst_multibyte
;
4019 int *charbuf
= coding
->charbuf
;
4020 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4021 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4022 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4024 int produced_chars
= 0;
4025 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4026 int ascii_compatible
;
4027 struct charset
*charset_roman
, *charset_big5
;
4030 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4032 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4033 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4034 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4036 while (charbuf
< charbuf_end
)
4038 ASSURE_DESTINATION (safe_room
);
4040 /* Now encode the character C. */
4041 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4042 EMIT_ONE_ASCII_BYTE (c
);
4043 else if (CHAR_BYTE8_P (c
))
4045 c
= CHAR_TO_BYTE8 (c
);
4051 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4055 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4057 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4058 charset
= CHARSET_FROM_ID (charset_ascii
);
4062 c
= coding
->default_char
;
4063 charset
= char_charset (c
, charset_list
, &code
);
4066 if (code
== CHARSET_INVALID_CODE (charset
))
4068 if (charset
== charset_big5
)
4072 c1
= code
>> 8, c2
= code
& 0xFF;
4073 EMIT_TWO_BYTES (c1
, c2
);
4076 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4079 coding
->result
= CODING_RESULT_SUCCESS
;
4080 coding
->produced_char
+= produced_chars
;
4081 coding
->produced
= dst
- coding
->destination
;
4086 /*** 10. CCL handlers ***/
4088 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4089 Check if a text is encoded in a coding system of which
4090 encoder/decoder are written in CCL program. If it is, return
4091 CATEGORY_MASK_CCL, else return 0. */
4094 detect_coding_ccl (coding
, mask
)
4095 struct coding_system
*coding
;
4098 unsigned char *src
= coding
->source
, *src_base
= src
;
4099 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4100 int multibytep
= coding
->src_multibyte
;
4101 int consumed_chars
= 0;
4103 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4104 int head_ascii
= coding
->head_ascii
;
4107 coding
= &coding_categories
[coding_category_ccl
];
4108 attrs
= CODING_ID_ATTRS (coding
->id
);
4109 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4118 if (!found
&& valids
[c
] > 1)
4121 *mask
&= ~CATEGORY_MASK_CCL
;
4129 decode_coding_ccl (coding
)
4130 struct coding_system
*coding
;
4132 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4133 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4134 int *charbuf
= coding
->charbuf
;
4135 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4136 int consumed_chars
= 0;
4137 int multibytep
= coding
->src_multibyte
;
4138 struct ccl_program ccl
;
4139 int source_charbuf
[1024];
4140 int source_byteidx
[1024];
4142 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4144 while (src
< src_end
)
4146 const unsigned char *p
= src
;
4147 int *source
, *source_end
;
4151 while (i
< 1024 && p
< src_end
)
4153 source_byteidx
[i
] = p
- src
;
4154 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4157 while (i
< 1024 && p
< src_end
)
4158 source_charbuf
[i
++] = *p
++;
4160 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4163 source
= source_charbuf
;
4164 source_end
= source
+ i
;
4165 while (source
< source_end
)
4167 ccl_driver (&ccl
, source
, charbuf
,
4168 source_end
- source
, charbuf_end
- charbuf
);
4169 source
+= ccl
.consumed
;
4170 charbuf
+= ccl
.produced
;
4171 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4174 if (source
< source_end
)
4175 src
+= source_byteidx
[source
- source_charbuf
];
4178 consumed_chars
+= source
- source_charbuf
;
4180 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4181 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4187 case CCL_STAT_SUSPEND_BY_SRC
:
4188 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4190 case CCL_STAT_SUSPEND_BY_DST
:
4193 case CCL_STAT_INVALID_CMD
:
4194 coding
->result
= CODING_RESULT_INTERRUPT
;
4197 coding
->result
= CODING_RESULT_SUCCESS
;
4200 coding
->consumed_char
+= consumed_chars
;
4201 coding
->consumed
= src
- coding
->source
;
4202 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4206 encode_coding_ccl (coding
)
4207 struct coding_system
*coding
;
4209 struct ccl_program ccl
;
4210 int multibytep
= coding
->dst_multibyte
;
4211 int *charbuf
= coding
->charbuf
;
4212 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4213 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4214 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4215 unsigned char *adjusted_dst_end
= dst_end
- 1;
4216 int destination_charbuf
[1024];
4217 int i
, produced_chars
= 0;
4219 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4221 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4222 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4224 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4226 int dst_bytes
= dst_end
- dst
;
4227 if (dst_bytes
> 1024)
4230 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4231 charbuf_end
- charbuf
, dst_bytes
);
4232 charbuf
+= ccl
.consumed
;
4234 for (i
= 0; i
< ccl
.produced
; i
++)
4235 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4238 for (i
= 0; i
< ccl
.produced
; i
++)
4239 *dst
++ = destination_charbuf
[i
] & 0xFF;
4240 produced_chars
+= ccl
.produced
;
4246 case CCL_STAT_SUSPEND_BY_SRC
:
4247 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4249 case CCL_STAT_SUSPEND_BY_DST
:
4250 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4253 case CCL_STAT_INVALID_CMD
:
4254 coding
->result
= CODING_RESULT_INTERRUPT
;
4257 coding
->result
= CODING_RESULT_SUCCESS
;
4261 coding
->produced_char
+= produced_chars
;
4262 coding
->produced
= dst
- coding
->destination
;
4268 /*** 10, 11. no-conversion handlers ***/
4270 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4273 decode_coding_raw_text (coding
)
4274 struct coding_system
*coding
;
4276 coding
->chars_at_source
= 1;
4277 coding
->consumed_char
= 0;
4278 coding
->consumed
= 0;
4279 coding
->result
= CODING_RESULT_SUCCESS
;
4283 encode_coding_raw_text (coding
)
4284 struct coding_system
*coding
;
4286 int multibytep
= coding
->dst_multibyte
;
4287 int *charbuf
= coding
->charbuf
;
4288 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
4289 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4290 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4291 int produced_chars
= 0;
4296 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4298 if (coding
->src_multibyte
)
4299 while (charbuf
< charbuf_end
)
4301 ASSURE_DESTINATION (safe_room
);
4303 if (ASCII_CHAR_P (c
))
4304 EMIT_ONE_ASCII_BYTE (c
);
4305 else if (CHAR_BYTE8_P (c
))
4307 c
= CHAR_TO_BYTE8 (c
);
4312 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4314 CHAR_STRING_ADVANCE (c
, p1
);
4317 EMIT_ONE_BYTE (*p0
);
4323 while (charbuf
< charbuf_end
)
4325 ASSURE_DESTINATION (safe_room
);
4332 if (coding
->src_multibyte
)
4334 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4336 while (charbuf
< charbuf_end
)
4338 ASSURE_DESTINATION (safe_room
);
4340 if (ASCII_CHAR_P (c
))
4342 else if (CHAR_BYTE8_P (c
))
4343 *dst
++ = CHAR_TO_BYTE8 (c
);
4345 CHAR_STRING_ADVANCE (c
, dst
);
4351 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4352 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4353 *dst
++ = *charbuf
++;
4354 produced_chars
= dst
- (coding
->destination
+ coding
->dst_bytes
);
4357 coding
->result
= CODING_RESULT_SUCCESS
;
4358 coding
->produced_char
+= produced_chars
;
4359 coding
->produced
= dst
- coding
->destination
;
4364 detect_coding_charset (coding
, mask
)
4365 struct coding_system
*coding
;
4368 unsigned char *src
= coding
->source
, *src_base
= src
;
4369 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4370 int multibytep
= coding
->src_multibyte
;
4371 int consumed_chars
= 0;
4372 Lisp_Object attrs
, valids
;
4374 coding
= &coding_categories
[coding_category_charset
];
4375 attrs
= CODING_ID_ATTRS (coding
->id
);
4376 valids
= AREF (attrs
, coding_attr_charset_valids
);
4378 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4379 src
+= coding
->head_ascii
;
4386 if (NILP (AREF (valids
, c
)))
4389 *mask
&= ~CATEGORY_MASK_CHARSET
;
4397 decode_coding_charset (coding
)
4398 struct coding_system
*coding
;
4400 unsigned char *src
= coding
->source
+ coding
->consumed
;
4401 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4402 unsigned char *src_base
;
4403 int *charbuf
= coding
->charbuf
;
4404 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4405 int consumed_chars
= 0, consumed_chars_base
;
4406 int multibytep
= coding
->src_multibyte
;
4407 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4409 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4410 valids
= AREF (attrs
, coding_attr_charset_valids
);
4417 consumed_chars_base
= consumed_chars
;
4419 if (charbuf
>= charbuf_end
)
4425 /* Here we assume that no charset maps '\r' to something
4427 if (EQ (eol_type
, Qdos
))
4433 else if (EQ (eol_type
, Qmac
))
4439 struct charset
*charset
;
4444 val
= AREF (valids
, c
);
4449 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4450 dim
= CHARSET_DIMENSION (charset
);
4454 code
= (code
<< 8) | c
;
4457 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4462 /* VAL is a list of charset IDs. It is assured that the
4463 list is sorted by charset dimensions (smaller one
4467 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4468 dim
= CHARSET_DIMENSION (charset
);
4472 code
= (code
<< 8) | c
;
4475 CODING_DECODE_CHAR (coding
, src
, src_base
,
4476 src_end
, charset
, code
, c
);
4490 consumed_chars
= consumed_chars_base
;
4492 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4497 coding
->consumed_char
+= consumed_chars_base
;
4498 coding
->consumed
= src_base
- coding
->source
;
4499 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4503 encode_coding_charset (coding
)
4504 struct coding_system
*coding
;
4506 int multibytep
= coding
->dst_multibyte
;
4507 int *charbuf
= coding
->charbuf
;
4508 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4509 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4510 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4511 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4512 int produced_chars
= 0;
4513 Lisp_Object attrs
, eol_type
, charset_list
;
4514 int ascii_compatible
;
4517 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4518 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4520 while (charbuf
< charbuf_end
)
4522 struct charset
*charset
;
4525 ASSURE_DESTINATION (safe_room
);
4527 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4528 EMIT_ONE_ASCII_BYTE (c
);
4529 else if (CHAR_BYTE8_P (c
))
4531 c
= CHAR_TO_BYTE8 (c
);
4536 charset
= char_charset (c
, charset_list
, &code
);
4539 if (CHARSET_DIMENSION (charset
) == 1)
4540 EMIT_ONE_BYTE (code
);
4541 else if (CHARSET_DIMENSION (charset
) == 2)
4542 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4543 else if (CHARSET_DIMENSION (charset
) == 3)
4544 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4546 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4547 (code
>> 8) & 0xFF, code
& 0xFF);
4551 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4552 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4554 c
= coding
->default_char
;
4560 coding
->result
= CODING_RESULT_SUCCESS
;
4561 coding
->produced_char
+= produced_chars
;
4562 coding
->produced
= dst
- coding
->destination
;
4567 /*** 7. C library functions ***/
4569 /* Setup coding context CODING from information about CODING_SYSTEM.
4570 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4571 CODING_SYSTEM is invalid, signal an error. */
4574 setup_coding_system (coding_system
, coding
)
4575 Lisp_Object coding_system
;
4576 struct coding_system
*coding
;
4579 Lisp_Object eol_type
;
4580 Lisp_Object coding_type
;
4583 if (NILP (coding_system
))
4584 coding_system
= Qno_conversion
;
4586 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4588 attrs
= CODING_ID_ATTRS (coding
->id
);
4589 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4592 coding
->head_ascii
= -1;
4593 coding
->common_flags
4594 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4596 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4597 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4598 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4599 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4601 coding_type
= CODING_ATTR_TYPE (attrs
);
4602 if (EQ (coding_type
, Qundecided
))
4604 coding
->detector
= NULL
;
4605 coding
->decoder
= decode_coding_raw_text
;
4606 coding
->encoder
= encode_coding_raw_text
;
4607 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4609 else if (EQ (coding_type
, Qiso_2022
))
4612 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4614 /* Invoke graphic register 0 to plane 0. */
4615 CODING_ISO_INVOCATION (coding
, 0) = 0;
4616 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4617 CODING_ISO_INVOCATION (coding
, 1)
4618 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4619 /* Setup the initial status of designation. */
4620 for (i
= 0; i
< 4; i
++)
4621 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4622 /* Not single shifting initially. */
4623 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4624 /* Beginning of buffer should also be regarded as bol. */
4625 CODING_ISO_BOL (coding
) = 1;
4626 coding
->detector
= detect_coding_iso_2022
;
4627 coding
->decoder
= decode_coding_iso_2022
;
4628 coding
->encoder
= encode_coding_iso_2022
;
4629 if (flags
& CODING_ISO_FLAG_SAFE
)
4630 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4631 coding
->common_flags
4632 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4633 | CODING_REQUIRE_FLUSHING_MASK
);
4634 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4635 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4636 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4638 setup_iso_safe_charsets (attrs
);
4639 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4640 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4641 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4643 CODING_ISO_FLAGS (coding
) = flags
;
4645 else if (EQ (coding_type
, Qcharset
))
4647 coding
->detector
= detect_coding_charset
;
4648 coding
->decoder
= decode_coding_charset
;
4649 coding
->encoder
= encode_coding_charset
;
4650 coding
->common_flags
4651 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4653 else if (EQ (coding_type
, Qutf_8
))
4655 coding
->detector
= detect_coding_utf_8
;
4656 coding
->decoder
= decode_coding_utf_8
;
4657 coding
->encoder
= encode_coding_utf_8
;
4658 coding
->common_flags
4659 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4661 else if (EQ (coding_type
, Qutf_16
))
4663 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4664 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4665 : EQ (val
, Qt
) ? utf_16_with_bom
4666 : utf_16_without_bom
);
4667 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4668 CODING_UTF_16_ENDIAN (coding
) = (NILP (val
) ? utf_16_big_endian
4669 : utf_16_little_endian
);
4670 CODING_UTF_16_SURROGATE (coding
) = 0;
4671 coding
->detector
= detect_coding_utf_16
;
4672 coding
->decoder
= decode_coding_utf_16
;
4673 coding
->encoder
= encode_coding_utf_16
;
4674 coding
->common_flags
4675 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4677 else if (EQ (coding_type
, Qccl
))
4679 coding
->detector
= detect_coding_ccl
;
4680 coding
->decoder
= decode_coding_ccl
;
4681 coding
->encoder
= encode_coding_ccl
;
4682 coding
->common_flags
4683 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4684 | CODING_REQUIRE_FLUSHING_MASK
);
4686 else if (EQ (coding_type
, Qemacs_mule
))
4688 coding
->detector
= detect_coding_emacs_mule
;
4689 coding
->decoder
= decode_coding_emacs_mule
;
4690 coding
->encoder
= encode_coding_emacs_mule
;
4691 coding
->common_flags
4692 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4693 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4694 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4696 Lisp_Object tail
, safe_charsets
;
4697 int max_charset_id
= 0;
4699 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4701 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4702 max_charset_id
= XFASTINT (XCAR (tail
));
4703 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4705 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4707 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4708 coding
->max_charset_id
= max_charset_id
;
4709 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4712 else if (EQ (coding_type
, Qshift_jis
))
4714 coding
->detector
= detect_coding_sjis
;
4715 coding
->decoder
= decode_coding_sjis
;
4716 coding
->encoder
= encode_coding_sjis
;
4717 coding
->common_flags
4718 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4720 else if (EQ (coding_type
, Qbig5
))
4722 coding
->detector
= detect_coding_big5
;
4723 coding
->decoder
= decode_coding_big5
;
4724 coding
->encoder
= encode_coding_big5
;
4725 coding
->common_flags
4726 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4728 else /* EQ (coding_type, Qraw_text) */
4730 coding
->detector
= NULL
;
4731 coding
->decoder
= decode_coding_raw_text
;
4732 coding
->encoder
= encode_coding_raw_text
;
4733 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4739 /* Return raw-text or one of its subsidiaries that has the same
4740 eol_type as CODING-SYSTEM. */
4743 raw_text_coding_system (coding_system
)
4744 Lisp_Object coding_system
;
4746 Lisp_Object spec
, attrs
;
4747 Lisp_Object eol_type
, raw_text_eol_type
;
4749 spec
= CODING_SYSTEM_SPEC (coding_system
);
4750 attrs
= AREF (spec
, 0);
4752 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4753 return coding_system
;
4755 eol_type
= AREF (spec
, 2);
4756 if (VECTORP (eol_type
))
4758 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
4759 raw_text_eol_type
= AREF (spec
, 2);
4760 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
4761 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
4762 : AREF (raw_text_eol_type
, 2));
4766 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4767 does, return one of the subsidiary that has the same eol-spec as
4768 PARENT. Otherwise, return CODING_SYSTEM. */
4771 coding_inherit_eol_type (coding_system
, parent
)
4772 Lisp_Object coding_system
, parent
;
4774 Lisp_Object spec
, attrs
, eol_type
;
4776 spec
= CODING_SYSTEM_SPEC (coding_system
);
4777 attrs
= AREF (spec
, 0);
4778 eol_type
= AREF (spec
, 2);
4779 if (VECTORP (eol_type
))
4781 Lisp_Object parent_spec
;
4782 Lisp_Object parent_eol_type
;
4785 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
4786 parent_eol_type
= AREF (parent_spec
, 2);
4787 if (EQ (parent_eol_type
, Qunix
))
4788 coding_system
= AREF (eol_type
, 0);
4789 else if (EQ (parent_eol_type
, Qdos
))
4790 coding_system
= AREF (eol_type
, 1);
4791 else if (EQ (parent_eol_type
, Qmac
))
4792 coding_system
= AREF (eol_type
, 2);
4794 return coding_system
;
4797 /* Emacs has a mechanism to automatically detect a coding system if it
4798 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4799 it's impossible to distinguish some coding systems accurately
4800 because they use the same range of codes. So, at first, coding
4801 systems are categorized into 7, those are:
4803 o coding-category-emacs-mule
4805 The category for a coding system which has the same code range
4806 as Emacs' internal format. Assigned the coding-system (Lisp
4807 symbol) `emacs-mule' by default.
4809 o coding-category-sjis
4811 The category for a coding system which has the same code range
4812 as SJIS. Assigned the coding-system (Lisp
4813 symbol) `japanese-shift-jis' by default.
4815 o coding-category-iso-7
4817 The category for a coding system which has the same code range
4818 as ISO2022 of 7-bit environment. This doesn't use any locking
4819 shift and single shift functions. This can encode/decode all
4820 charsets. Assigned the coding-system (Lisp symbol)
4821 `iso-2022-7bit' by default.
4823 o coding-category-iso-7-tight
4825 Same as coding-category-iso-7 except that this can
4826 encode/decode only the specified charsets.
4828 o coding-category-iso-8-1
4830 The category for a coding system which has the same code range
4831 as ISO2022 of 8-bit environment and graphic plane 1 used only
4832 for DIMENSION1 charset. This doesn't use any locking shift
4833 and single shift functions. Assigned the coding-system (Lisp
4834 symbol) `iso-latin-1' by default.
4836 o coding-category-iso-8-2
4838 The category for a coding system which has the same code range
4839 as ISO2022 of 8-bit environment and graphic plane 1 used only
4840 for DIMENSION2 charset. This doesn't use any locking shift
4841 and single shift functions. Assigned the coding-system (Lisp
4842 symbol) `japanese-iso-8bit' by default.
4844 o coding-category-iso-7-else
4846 The category for a coding system which has the same code range
4847 as ISO2022 of 7-bit environemnt but uses locking shift or
4848 single shift functions. Assigned the coding-system (Lisp
4849 symbol) `iso-2022-7bit-lock' by default.
4851 o coding-category-iso-8-else
4853 The category for a coding system which has the same code range
4854 as ISO2022 of 8-bit environemnt but uses locking shift or
4855 single shift functions. Assigned the coding-system (Lisp
4856 symbol) `iso-2022-8bit-ss2' by default.
4858 o coding-category-big5
4860 The category for a coding system which has the same code range
4861 as BIG5. Assigned the coding-system (Lisp symbol)
4862 `cn-big5' by default.
4864 o coding-category-utf-8
4866 The category for a coding system which has the same code range
4867 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4868 symbol) `utf-8' by default.
4870 o coding-category-utf-16-be
4872 The category for a coding system in which a text has an
4873 Unicode signature (cf. Unicode Standard) in the order of BIG
4874 endian at the head. Assigned the coding-system (Lisp symbol)
4875 `utf-16-be' by default.
4877 o coding-category-utf-16-le
4879 The category for a coding system in which a text has an
4880 Unicode signature (cf. Unicode Standard) in the order of
4881 LITTLE endian at the head. Assigned the coding-system (Lisp
4882 symbol) `utf-16-le' by default.
4884 o coding-category-ccl
4886 The category for a coding system of which encoder/decoder is
4887 written in CCL programs. The default value is nil, i.e., no
4888 coding system is assigned.
4890 o coding-category-binary
4892 The category for a coding system not categorized in any of the
4893 above. Assigned the coding-system (Lisp symbol)
4894 `no-conversion' by default.
4896 Each of them is a Lisp symbol and the value is an actual
4897 `coding-system's (this is also a Lisp symbol) assigned by a user.
4898 What Emacs does actually is to detect a category of coding system.
4899 Then, it uses a `coding-system' assigned to it. If Emacs can't
4900 decide only one possible category, it selects a category of the
4901 highest priority. Priorities of categories are also specified by a
4902 user in a Lisp variable `coding-category-list'.
4906 #define EOL_SEEN_NONE 0
4907 #define EOL_SEEN_LF 1
4908 #define EOL_SEEN_CR 2
4909 #define EOL_SEEN_CRLF 4
4911 /* Detect how end-of-line of a text of length CODING->src_bytes
4912 pointed by CODING->source is encoded. Return one of
4915 #define MAX_EOL_CHECK_COUNT 3
4918 detect_eol (source
, src_bytes
, category
)
4919 unsigned char *source
;
4920 EMACS_INT src_bytes
;
4921 enum coding_category category
;
4923 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
4926 int eol_seen
= EOL_SEEN_NONE
;
4928 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
4932 msb
= category
== (coding_category_utf_16_le
4933 | coding_category_utf_16_le_nosig
);
4936 while (src
+ 1 < src_end
)
4939 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
4944 this_eol
= EOL_SEEN_LF
;
4945 else if (src
+ 3 >= src_end
4946 || src
[msb
+ 2] != 0
4947 || src
[lsb
+ 2] != '\n')
4948 this_eol
= EOL_SEEN_CR
;
4950 this_eol
= EOL_SEEN_CRLF
;
4952 if (eol_seen
== EOL_SEEN_NONE
)
4953 /* This is the first end-of-line. */
4954 eol_seen
= this_eol
;
4955 else if (eol_seen
!= this_eol
)
4957 /* The found type is different from what found before. */
4958 eol_seen
= EOL_SEEN_LF
;
4961 if (++total
== MAX_EOL_CHECK_COUNT
)
4969 while (src
< src_end
)
4972 if (c
== '\n' || c
== '\r')
4977 this_eol
= EOL_SEEN_LF
;
4978 else if (src
>= src_end
|| *src
!= '\n')
4979 this_eol
= EOL_SEEN_CR
;
4981 this_eol
= EOL_SEEN_CRLF
, src
++;
4983 if (eol_seen
== EOL_SEEN_NONE
)
4984 /* This is the first end-of-line. */
4985 eol_seen
= this_eol
;
4986 else if (eol_seen
!= this_eol
)
4988 /* The found type is different from what found before. */
4989 eol_seen
= EOL_SEEN_LF
;
4992 if (++total
== MAX_EOL_CHECK_COUNT
)
5002 adjust_coding_eol_type (coding
, eol_seen
)
5003 struct coding_system
*coding
;
5006 Lisp_Object eol_type
;
5008 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5009 if (eol_seen
& EOL_SEEN_LF
)
5010 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5011 else if (eol_seen
& EOL_SEEN_CRLF
)
5012 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5013 else if (eol_seen
& EOL_SEEN_CR
)
5014 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5017 /* Detect how a text specified in CODING is encoded. If a coding
5018 system is detected, update fields of CODING by the detected coding
5022 detect_coding (coding
)
5023 struct coding_system
*coding
;
5025 unsigned char *src
, *src_end
;
5026 Lisp_Object attrs
, coding_type
;
5028 coding
->consumed
= coding
->consumed_char
= 0;
5029 coding
->produced
= coding
->produced_char
= 0;
5030 coding_set_source (coding
);
5032 src_end
= coding
->source
+ coding
->src_bytes
;
5034 /* If we have not yet decided the text encoding type, detect it
5036 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5038 int mask
= CATEGORY_MASK_ANY
;
5041 for (src
= coding
->source
; src
< src_end
; src
++)
5044 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5046 || c
== ISO_CODE_SO
)))
5049 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5051 if (coding
->head_ascii
< coding
->src_bytes
)
5055 for (i
= 0; i
< coding_category_raw_text
; i
++)
5057 enum coding_category category
= coding_priorities
[i
];
5058 struct coding_system
*this = coding_categories
+ category
;
5062 /* No coding system of this category is defined. */
5063 mask
&= ~(1 << category
);
5065 else if (category
>= coding_category_raw_text
5066 || detected
& (1 << category
))
5070 detected
|= detected_mask
[category
];
5071 if ((*(this->detector
)) (coding
, &mask
)
5072 && (mask
& (1 << category
)))
5077 setup_coding_system (Qraw_text
, coding
);
5078 else if (mask
!= CATEGORY_MASK_ANY
)
5079 for (i
= 0; i
< coding_category_raw_text
; i
++)
5081 enum coding_category category
= coding_priorities
[i
];
5082 struct coding_system
*this = coding_categories
+ category
;
5084 if (mask
& (1 << category
))
5086 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5093 attrs
= CODING_ID_ATTRS (coding
->id
);
5094 coding_type
= CODING_ATTR_TYPE (attrs
);
5096 /* If we have not yet decided the EOL type, detect it now. But, the
5097 detection is impossible for a CCL based coding system, in which
5098 case, we detct the EOL type after decoding. */
5099 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5100 && ! EQ (coding_type
, Qccl
))
5102 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5103 XINT (CODING_ATTR_CATEGORY (attrs
)));
5105 if (eol_seen
!= EOL_SEEN_NONE
)
5106 adjust_coding_eol_type (coding
, eol_seen
);
5113 struct coding_system
*coding
;
5115 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5117 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5118 unsigned char *pend
= p
+ coding
->produced
;
5119 int eol_seen
= EOL_SEEN_NONE
;
5121 for (; p
< pend
; p
++)
5124 eol_seen
|= EOL_SEEN_LF
;
5125 else if (*p
== '\r')
5127 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5129 eol_seen
|= EOL_SEEN_CRLF
;
5133 eol_seen
|= EOL_SEEN_CR
;
5136 if (eol_seen
!= EOL_SEEN_NONE
)
5137 adjust_coding_eol_type (coding
, eol_seen
);
5140 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5142 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5143 unsigned char *pend
= p
+ coding
->produced
;
5145 for (; p
< pend
; p
++)
5149 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5151 unsigned char *p
, *pbeg
, *pend
;
5152 Lisp_Object undo_list
;
5154 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5155 coding
->dst_pos_byte
+ coding
->produced
);
5156 undo_list
= current_buffer
->undo_list
;
5157 current_buffer
->undo_list
= Qt
;
5158 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5159 current_buffer
->undo_list
= undo_list
;
5161 pend
= pbeg
+ coding
->produced
;
5163 for (p
= pend
- 1; p
>= pbeg
; p
--)
5166 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5169 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5170 coding
->produced
= pend
- pbeg
;
5171 insert_from_gap (coding
->produced_char
, coding
->produced
);
5176 translate_chars (coding
, table
)
5177 struct coding_system
*coding
;
5180 int *charbuf
= coding
->charbuf
;
5181 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5184 if (coding
->chars_at_source
)
5187 while (charbuf
< charbuf_end
)
5193 *charbuf
++ = translate_char (table
, c
);
5198 produce_chars (coding
)
5199 struct coding_system
*coding
;
5201 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5202 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5204 int produced_chars
= 0;
5206 if (! coding
->chars_at_source
)
5208 /* Characters are in coding->charbuf. */
5209 int *buf
= coding
->charbuf
;
5210 int *buf_end
= buf
+ coding
->charbuf_used
;
5211 unsigned char *adjusted_dst_end
;
5213 if (BUFFERP (coding
->src_object
)
5214 && EQ (coding
->src_object
, coding
->dst_object
))
5215 dst_end
= coding
->source
+ coding
->consumed
;
5216 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5218 while (buf
< buf_end
)
5222 if (dst
>= adjusted_dst_end
)
5224 dst
= alloc_destination (coding
,
5225 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5227 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5228 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5232 if (coding
->dst_multibyte
5233 || ! CHAR_BYTE8_P (c
))
5234 CHAR_STRING_ADVANCE (c
, dst
);
5236 *dst
++ = CHAR_TO_BYTE8 (c
);
5240 /* This is an annotation data. */
5246 unsigned char *src
= coding
->source
;
5247 unsigned char *src_end
= src
+ coding
->src_bytes
;
5248 Lisp_Object eol_type
;
5250 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5252 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5254 if (coding
->src_multibyte
)
5261 unsigned char *src_base
= src
;
5267 if (EQ (eol_type
, Qdos
))
5273 else if (EQ (eol_type
, Qmac
))
5278 coding
->consumed
= src
- coding
->source
;
5280 if (EQ (coding
->src_object
, coding
->dst_object
))
5284 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5286 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5287 coding_set_source (coding
);
5288 src
= coding
->source
+ coding
->consumed
;
5289 src_end
= coding
->source
+ coding
->src_bytes
;
5299 while (src
< src_end
)
5306 if (EQ (eol_type
, Qdos
))
5312 else if (EQ (eol_type
, Qmac
))
5315 if (dst
>= dst_end
- 1)
5317 coding
->consumed
= src
- coding
->source
;
5319 if (EQ (coding
->src_object
, coding
->dst_object
))
5321 if (dst
>= dst_end
- 1)
5323 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5325 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5326 coding_set_source (coding
);
5327 src
= coding
->source
+ coding
->consumed
;
5328 src_end
= coding
->source
+ coding
->src_bytes
;
5336 if (!EQ (coding
->src_object
, coding
->dst_object
))
5338 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5342 EMACS_INT offset
= src
- coding
->source
;
5344 dst
= alloc_destination (coding
, require
, dst
);
5345 coding_set_source (coding
);
5346 src
= coding
->source
+ offset
;
5347 src_end
= coding
->source
+ coding
->src_bytes
;
5350 produced_chars
= coding
->src_chars
;
5351 while (src
< src_end
)
5357 if (EQ (eol_type
, Qdos
))
5364 else if (EQ (eol_type
, Qmac
))
5370 coding
->consumed
= coding
->src_bytes
;
5371 coding
->consumed_char
= coding
->src_chars
;
5374 produced
= dst
- (coding
->destination
+ coding
->produced
);
5375 if (BUFFERP (coding
->dst_object
))
5376 insert_from_gap (produced_chars
, produced
);
5377 coding
->produced
+= produced
;
5378 coding
->produced_char
+= produced_chars
;
5379 return produced_chars
;
5382 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5384 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5388 produce_composition (coding
, charbuf
)
5389 struct coding_system
*coding
;
5395 enum composition_method method
;
5397 Lisp_Object components
;
5399 buffer
= coding
->dst_object
;
5401 pos
= coding
->dst_pos
+ charbuf
[1];
5402 method
= (enum composition_method
) (charbuf
[3]);
5403 cmp_len
= charbuf
[4];
5405 if (method
== COMPOSITION_RELATIVE
)
5409 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5414 for (i
= 0; i
< len
; i
++)
5415 args
[i
] = make_number (charbuf
[i
]);
5416 components
= (method
== COMPOSITION_WITH_ALTCHARS
5417 ? Fstring (len
, args
) : Fvector (len
, args
));
5419 compose_text (pos
, pos
+ cmp_len
, components
, Qnil
, Qnil
);
5423 save_composition_data (buf
, buf_end
, prop
)
5427 enum composition_method method
= COMPOSITION_METHOD (prop
);
5428 int cmp_len
= COMPOSITION_LENGTH (prop
);
5430 if (buf
+ 4 + (MAX_COMPOSITION_COMPONENTS
* 2 - 1) > buf_end
)
5433 buf
[1] = CODING_ANNOTATE_COMPOSITION_MASK
;
5437 if (method
== COMPOSITION_RELATIVE
)
5441 Lisp_Object components
;
5444 components
= COMPOSITION_COMPONENTS (prop
);
5445 if (VECTORP (components
))
5447 len
= XVECTOR (components
)->size
;
5448 for (i
= 0; i
< len
; i
++)
5449 buf
[4 + i
] = XINT (AREF (components
, i
));
5451 else if (STRINGP (components
))
5455 len
= XSTRING (components
)->size
;
5458 FETCH_STRING_CHAR_ADVANCE (buf
[4 + i
], components
, i
, i_byte
);
5460 else if (INTEGERP (components
))
5463 buf
[4] = XINT (components
);
5465 else if (CONSP (components
))
5467 for (len
= 0; CONSP (components
);
5468 len
++, components
= XCDR (components
))
5469 buf
[4 + len
] = XINT (XCAR (components
));
5475 return (buf
+ buf
[0]);
5478 #define CHARBUF_SIZE 0x4000
5480 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5482 int size = CHARBUF_SIZE;; \
5484 coding->charbuf = NULL; \
5485 while (size > 1024) \
5487 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5488 if (coding->charbuf) \
5492 if (! coding->charbuf) \
5494 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5495 return coding->result; \
5497 coding->charbuf_size = size; \
5502 produce_annotation (coding
)
5503 struct coding_system
*coding
;
5505 int *charbuf
= coding
->charbuf
;
5506 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5508 while (charbuf
< charbuf_end
)
5514 int len
= -*charbuf
;
5517 case CODING_ANNOTATE_COMPOSITION_MASK
:
5518 produce_composition (coding
, charbuf
);
5528 /* Decode the data at CODING->src_object into CODING->dst_object.
5529 CODING->src_object is a buffer, a string, or nil.
5530 CODING->dst_object is a buffer.
5532 If CODING->src_object is a buffer, it must be the current buffer.
5533 In this case, if CODING->src_pos is positive, it is a position of
5534 the source text in the buffer, otherwise, the source text is in the
5535 gap area of the buffer, and CODING->src_pos specifies the offset of
5536 the text from GPT (which must be the same as PT). If this is the
5537 same buffer as CODING->dst_object, CODING->src_pos must be
5540 If CODING->src_object is a string, CODING->src_pos in an index to
5543 If CODING->src_object is nil, CODING->source must already point to
5544 the non-relocatable memory area. In this case, CODING->src_pos is
5545 an offset from CODING->source.
5547 The decoded data is inserted at the current point of the buffer
5552 decode_coding (coding
)
5553 struct coding_system
*coding
;
5557 if (BUFFERP (coding
->src_object
)
5558 && coding
->src_pos
> 0
5559 && coding
->src_pos
< GPT
5560 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5561 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5563 if (BUFFERP (coding
->dst_object
))
5565 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5566 set_buffer_internal (XBUFFER (coding
->dst_object
));
5568 move_gap_both (PT
, PT_BYTE
);
5571 coding
->consumed
= coding
->consumed_char
= 0;
5572 coding
->produced
= coding
->produced_char
= 0;
5573 coding
->chars_at_source
= 0;
5574 coding
->result
= CODING_RESULT_SUCCESS
;
5577 ALLOC_CONVERSION_WORK_AREA (coding
);
5579 attrs
= CODING_ID_ATTRS (coding
->id
);
5583 coding_set_source (coding
);
5584 coding
->annotated
= 0;
5585 (*(coding
->decoder
)) (coding
);
5586 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5587 translate_chars (CODING_ATTR_DECODE_TBL (attrs
), coding
);
5588 coding_set_destination (coding
);
5589 produce_chars (coding
);
5590 if (coding
->annotated
)
5591 produce_annotation (coding
);
5593 while (coding
->consumed
< coding
->src_bytes
5594 && ! coding
->result
);
5596 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5597 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5598 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5599 decode_eol (coding
);
5601 coding
->carryover_bytes
= 0;
5602 if (coding
->consumed
< coding
->src_bytes
)
5604 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5607 coding_set_source (coding
);
5608 coding_set_destination (coding
);
5609 src
= coding
->source
+ coding
->consumed
;
5611 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5613 /* Flush out unprocessed data as binary chars. We are sure
5614 that the number of data is less than the size of
5616 int *charbuf
= coding
->charbuf
;
5618 while (nbytes
-- > 0)
5621 *charbuf
++ = (c
& 0x80 ? - c
: c
);
5623 produce_chars (coding
);
5627 /* Record unprocessed bytes in coding->carryover. We are
5628 sure that the number of data is less than the size of
5629 coding->carryover. */
5630 unsigned char *p
= coding
->carryover
;
5632 coding
->carryover_bytes
= nbytes
;
5633 while (nbytes
-- > 0)
5636 coding
->consumed
= coding
->src_bytes
;
5639 return coding
->result
;
5643 consume_chars (coding
)
5644 struct coding_system
*coding
;
5646 int *buf
= coding
->charbuf
;
5647 /* -1 is to compensate for CRLF. */
5648 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
- 1;
5649 const unsigned char *src
= coding
->source
+ coding
->consumed
;
5650 int pos
= coding
->src_pos
+ coding
->consumed_char
;
5651 int end_pos
= coding
->src_pos
+ coding
->src_chars
;
5652 int multibytep
= coding
->src_multibyte
;
5653 Lisp_Object eol_type
;
5655 int start
, end
, stop
;
5656 Lisp_Object object
, prop
;
5658 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5659 if (VECTORP (eol_type
))
5662 object
= coding
->src_object
;
5664 /* Note: composition handling is not yet implemented. */
5665 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
5667 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
5668 && find_composition (pos
, end_pos
, &start
, &end
, &prop
, object
)
5671 || (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5672 && end
<= end_pos
)))
5677 while (buf
< buf_end
)
5685 p
= save_composition_data (buf
, buf_end
, prop
);
5689 if (find_composition (end
, end_pos
, &start
, &end
, &prop
, object
)
5699 c
= STRING_CHAR_ADVANCE (src
);
5700 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
5702 if (! EQ (eol_type
, Qunix
))
5706 if (EQ (eol_type
, Qdos
))
5716 coding
->consumed
= src
- coding
->source
;
5717 coding
->consumed_char
= pos
- coding
->src_pos
;
5718 coding
->charbuf_used
= buf
- coding
->charbuf
;
5719 coding
->chars_at_source
= 0;
5723 /* Encode the text at CODING->src_object into CODING->dst_object.
5724 CODING->src_object is a buffer or a string.
5725 CODING->dst_object is a buffer or nil.
5727 If CODING->src_object is a buffer, it must be the current buffer.
5728 In this case, if CODING->src_pos is positive, it is a position of
5729 the source text in the buffer, otherwise. the source text is in the
5730 gap area of the buffer, and coding->src_pos specifies the offset of
5731 the text from GPT (which must be the same as PT). If this is the
5732 same buffer as CODING->dst_object, CODING->src_pos must be
5733 negative and CODING should not have `pre-write-conversion'.
5735 If CODING->src_object is a string, CODING should not have
5736 `pre-write-conversion'.
5738 If CODING->dst_object is a buffer, the encoded data is inserted at
5739 the current point of that buffer.
5741 If CODING->dst_object is nil, the encoded data is placed at the
5742 memory area specified by CODING->destination. */
5745 encode_coding (coding
)
5746 struct coding_system
*coding
;
5750 attrs
= CODING_ID_ATTRS (coding
->id
);
5752 if (BUFFERP (coding
->dst_object
))
5754 set_buffer_internal (XBUFFER (coding
->dst_object
));
5755 coding
->dst_multibyte
5756 = ! NILP (current_buffer
->enable_multibyte_characters
);
5759 coding
->consumed
= coding
->consumed_char
= 0;
5760 coding
->produced
= coding
->produced_char
= 0;
5761 coding
->result
= CODING_RESULT_SUCCESS
;
5764 ALLOC_CONVERSION_WORK_AREA (coding
);
5767 coding_set_source (coding
);
5768 consume_chars (coding
);
5770 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
5771 translate_chars (CODING_ATTR_ENCODE_TBL (attrs
), coding
);
5773 coding_set_destination (coding
);
5774 (*(coding
->encoder
)) (coding
);
5775 } while (coding
->consumed_char
< coding
->src_chars
);
5777 if (BUFFERP (coding
->dst_object
))
5778 insert_from_gap (coding
->produced_char
, coding
->produced
);
5780 return (coding
->result
);
5785 /* List of currently used working buffer. */
5786 Lisp_Object Vcode_conversion_work_buf_list
;
5788 /* A working buffer used by the top level conversion. */
5789 Lisp_Object Vcode_conversion_reused_work_buf
;
5792 /* Return a working buffer that can be freely used by the following
5793 code conversion. MULTIBYTEP specifies the multibyteness of the
5797 make_conversion_work_buffer (multibytep
)
5800 struct buffer
*current
= current_buffer
;
5803 if (NILP (Vcode_conversion_work_buf_list
))
5805 if (NILP (Vcode_conversion_reused_work_buf
))
5806 Vcode_conversion_reused_work_buf
5807 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5808 Vcode_conversion_work_buf_list
5809 = Fcons (Vcode_conversion_reused_work_buf
, Qnil
);
5813 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5816 sprintf (str
, " *code-conversion-work*<%d>", depth
);
5817 Vcode_conversion_work_buf_list
5818 = Fcons (Fget_buffer_create (build_string (str
)),
5819 Vcode_conversion_work_buf_list
);
5822 buf
= XCAR (Vcode_conversion_work_buf_list
);
5823 set_buffer_internal (XBUFFER (buf
));
5824 current_buffer
->undo_list
= Qt
;
5826 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
, Qnil
);
5827 set_buffer_internal (current
);
5831 static struct coding_system
*saved_coding
;
5834 code_conversion_restore (info
)
5837 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
5842 buf
= XCAR (Vcode_conversion_work_buf_list
);
5843 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
5844 if (depth
> 1 && !NILP (Fbuffer_live_p (buf
)))
5848 if (EQ (saved_coding
->dst_object
, Qt
)
5849 && saved_coding
->destination
)
5850 xfree (saved_coding
->destination
);
5852 return save_excursion_restore (info
);
5857 decode_coding_gap (coding
, chars
, bytes
)
5858 struct coding_system
*coding
;
5859 EMACS_INT chars
, bytes
;
5861 int count
= specpdl_ptr
- specpdl
;
5863 saved_coding
= coding
;
5864 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5866 coding
->src_object
= Fcurrent_buffer ();
5867 coding
->src_chars
= chars
;
5868 coding
->src_bytes
= bytes
;
5869 coding
->src_pos
= -chars
;
5870 coding
->src_pos_byte
= -bytes
;
5871 coding
->src_multibyte
= chars
< bytes
;
5872 coding
->dst_object
= coding
->src_object
;
5873 coding
->dst_pos
= PT
;
5874 coding
->dst_pos_byte
= PT_BYTE
;
5875 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
5877 if (CODING_REQUIRE_DETECTION (coding
))
5878 detect_coding (coding
);
5880 decode_coding (coding
);
5882 unbind_to (count
, Qnil
);
5883 return coding
->result
;
5887 encode_coding_gap (coding
, chars
, bytes
)
5888 struct coding_system
*coding
;
5889 EMACS_INT chars
, bytes
;
5891 int count
= specpdl_ptr
- specpdl
;
5894 saved_coding
= coding
;
5895 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5897 buffer
= Fcurrent_buffer ();
5898 coding
->src_object
= buffer
;
5899 coding
->src_chars
= chars
;
5900 coding
->src_bytes
= bytes
;
5901 coding
->src_pos
= -chars
;
5902 coding
->src_pos_byte
= -bytes
;
5903 coding
->src_multibyte
= chars
< bytes
;
5904 coding
->dst_object
= coding
->src_object
;
5905 coding
->dst_pos
= PT
;
5906 coding
->dst_pos_byte
= PT_BYTE
;
5908 encode_coding (coding
);
5910 unbind_to (count
, Qnil
);
5911 return coding
->result
;
5915 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5916 SRC_OBJECT into DST_OBJECT by coding context CODING.
5918 SRC_OBJECT is a buffer, a string, or Qnil.
5920 If it is a buffer, the text is at point of the buffer. FROM and TO
5921 are positions in the buffer.
5923 If it is a string, the text is at the beginning of the string.
5924 FROM and TO are indices to the string.
5926 If it is nil, the text is at coding->source. FROM and TO are
5927 indices to coding->source.
5929 DST_OBJECT is a buffer, Qt, or Qnil.
5931 If it is a buffer, the decoded text is inserted at point of the
5932 buffer. If the buffer is the same as SRC_OBJECT, the source text
5935 If it is Qt, a string is made from the decoded text, and
5936 set in CODING->dst_object.
5938 If it is Qnil, the decoded text is stored at CODING->destination.
5939 The called must allocate CODING->dst_bytes bytes at
5940 CODING->destination by xmalloc. If the decoded text is longer than
5941 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5945 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
5947 struct coding_system
*coding
;
5948 Lisp_Object src_object
;
5949 EMACS_INT from
, from_byte
, to
, to_byte
;
5950 Lisp_Object dst_object
;
5952 int count
= specpdl_ptr
- specpdl
;
5953 unsigned char *destination
;
5954 EMACS_INT dst_bytes
;
5955 EMACS_INT chars
= to
- from
;
5956 EMACS_INT bytes
= to_byte
- from_byte
;
5959 saved_coding
= coding
;
5960 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
5962 if (NILP (dst_object
))
5964 destination
= coding
->destination
;
5965 dst_bytes
= coding
->dst_bytes
;
5968 coding
->src_object
= src_object
;
5969 coding
->src_chars
= chars
;
5970 coding
->src_bytes
= bytes
;
5971 coding
->src_multibyte
= chars
< bytes
;
5973 if (STRINGP (src_object
))
5975 coding
->src_pos
= from
;
5976 coding
->src_pos_byte
= from_byte
;
5978 else if (BUFFERP (src_object
))
5980 set_buffer_internal (XBUFFER (src_object
));
5982 move_gap_both (from
, from_byte
);
5983 if (EQ (src_object
, dst_object
))
5985 TEMP_SET_PT_BOTH (from
, from_byte
);
5986 del_range_both (from
, from_byte
, to
, to_byte
, 1);
5987 coding
->src_pos
= -chars
;
5988 coding
->src_pos_byte
= -bytes
;
5992 coding
->src_pos
= from
;
5993 coding
->src_pos_byte
= from_byte
;
5997 if (CODING_REQUIRE_DETECTION (coding
))
5998 detect_coding (coding
);
5999 attrs
= CODING_ID_ATTRS (coding
->id
);
6001 if (! NILP (CODING_ATTR_POST_READ (attrs
))
6002 || EQ (dst_object
, Qt
))
6004 coding
->dst_object
= make_conversion_work_buffer (1);
6005 coding
->dst_pos
= BEG
;
6006 coding
->dst_pos_byte
= BEG_BYTE
;
6007 coding
->dst_multibyte
= 1;
6009 else if (BUFFERP (dst_object
))
6011 coding
->dst_object
= dst_object
;
6012 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6013 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6014 coding
->dst_multibyte
6015 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6019 coding
->dst_object
= Qnil
;
6020 coding
->dst_multibyte
= 1;
6023 decode_coding (coding
);
6025 if (BUFFERP (coding
->dst_object
))
6026 set_buffer_internal (XBUFFER (coding
->dst_object
));
6028 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6030 struct gcpro gcpro1
, gcpro2
;
6031 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6034 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6035 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6036 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6037 make_number (coding
->produced_char
));
6040 coding
->produced_char
+= Z
- prev_Z
;
6041 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6044 if (EQ (dst_object
, Qt
))
6046 coding
->dst_object
= Fbuffer_string ();
6048 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6050 set_buffer_internal (XBUFFER (coding
->dst_object
));
6051 if (dst_bytes
< coding
->produced
)
6054 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6057 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6058 unbind_to (count
, Qnil
);
6061 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6062 move_gap_both (BEGV
, BEGV_BYTE
);
6063 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6064 coding
->destination
= destination
;
6068 unbind_to (count
, Qnil
);
6073 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6075 struct coding_system
*coding
;
6076 Lisp_Object src_object
;
6077 EMACS_INT from
, from_byte
, to
, to_byte
;
6078 Lisp_Object dst_object
;
6080 int count
= specpdl_ptr
- specpdl
;
6081 EMACS_INT chars
= to
- from
;
6082 EMACS_INT bytes
= to_byte
- from_byte
;
6085 saved_coding
= coding
;
6086 record_unwind_protect (code_conversion_restore
, save_excursion_save ());
6088 coding
->src_object
= src_object
;
6089 coding
->src_chars
= chars
;
6090 coding
->src_bytes
= bytes
;
6091 coding
->src_multibyte
= chars
< bytes
;
6093 attrs
= CODING_ID_ATTRS (coding
->id
);
6095 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6097 coding
->src_object
= make_conversion_work_buffer (coding
->src_multibyte
);
6098 set_buffer_internal (XBUFFER (coding
->src_object
));
6099 if (STRINGP (src_object
))
6100 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6101 else if (BUFFERP (src_object
))
6102 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6104 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
6106 if (EQ (src_object
, dst_object
))
6108 set_buffer_internal (XBUFFER (src_object
));
6109 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6110 set_buffer_internal (XBUFFER (coding
->src_object
));
6113 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6114 make_number (BEG
), make_number (Z
));
6115 coding
->src_object
= Fcurrent_buffer ();
6117 move_gap_both (BEG
, BEG_BYTE
);
6118 coding
->src_chars
= Z
- BEG
;
6119 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6120 coding
->src_pos
= BEG
;
6121 coding
->src_pos_byte
= BEG_BYTE
;
6122 coding
->src_multibyte
= Z
< Z_BYTE
;
6124 else if (STRINGP (src_object
))
6126 coding
->src_pos
= from
;
6127 coding
->src_pos_byte
= from_byte
;
6129 else if (BUFFERP (src_object
))
6131 set_buffer_internal (XBUFFER (src_object
));
6133 move_gap_both (from
, from_byte
);
6134 if (EQ (src_object
, dst_object
))
6136 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6137 coding
->src_pos
= -chars
;
6138 coding
->src_pos_byte
= -bytes
;
6142 coding
->src_pos
= from
;
6143 coding
->src_pos_byte
= from_byte
;
6147 if (BUFFERP (dst_object
))
6149 coding
->dst_object
= dst_object
;
6150 if (EQ (src_object
, dst_object
))
6152 coding
->dst_pos
= from
;
6153 coding
->dst_pos_byte
= from_byte
;
6157 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6158 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6160 coding
->dst_multibyte
6161 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6163 else if (EQ (dst_object
, Qt
))
6165 coding
->dst_object
= Qnil
;
6166 coding
->dst_bytes
= coding
->src_chars
;
6167 if (coding
->dst_bytes
== 0)
6168 coding
->dst_bytes
= 1;
6169 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6170 coding
->dst_multibyte
= 0;
6174 coding
->dst_object
= Qnil
;
6175 coding
->dst_multibyte
= 0;
6178 encode_coding (coding
);
6180 if (EQ (dst_object
, Qt
))
6182 if (BUFFERP (coding
->dst_object
))
6183 coding
->dst_object
= Fbuffer_string ();
6187 = make_unibyte_string ((char *) coding
->destination
,
6189 xfree (coding
->destination
);
6193 unbind_to (count
, Qnil
);
6198 preferred_coding_system ()
6200 int id
= coding_categories
[coding_priorities
[0]].id
;
6202 return CODING_ID_NAME (id
);
6207 /*** 8. Emacs Lisp library functions ***/
6209 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6210 doc
: /* Return t if OBJECT is nil or a coding-system.
6211 See the documentation of `define-coding-system' for information
6212 about coding-system objects. */)
6216 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6219 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6220 Sread_non_nil_coding_system
, 1, 1, 0,
6221 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6228 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6229 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6231 while (XSTRING (val
)->size
== 0);
6232 return (Fintern (val
, Qnil
));
6235 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6236 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6237 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6238 (prompt
, default_coding_system
)
6239 Lisp_Object prompt
, default_coding_system
;
6242 if (SYMBOLP (default_coding_system
))
6243 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6244 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6245 Qt
, Qnil
, Qcoding_system_history
,
6246 default_coding_system
, Qnil
);
6247 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6250 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6252 doc
: /* Check validity of CODING-SYSTEM.
6253 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6254 It is valid if it is a symbol with a non-nil `coding-system' property.
6255 The value of property should be a vector of length 5. */)
6257 Lisp_Object coding_system
;
6259 CHECK_SYMBOL (coding_system
);
6260 if (!NILP (Fcoding_system_p (coding_system
)))
6261 return coding_system
;
6263 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6267 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6268 HIGHEST is nonzero, return the coding system of the highest
6269 priority among the detected coding systems. Otherwize return a
6270 list of detected coding systems sorted by their priorities. If
6271 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6272 multibyte form but contains only ASCII and eight-bit chars.
6273 Otherwise, the bytes are raw bytes.
6275 CODING-SYSTEM controls the detection as below:
6277 If it is nil, detect both text-format and eol-format. If the
6278 text-format part of CODING-SYSTEM is already specified
6279 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6280 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6281 detect only text-format. */
6284 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6286 int src_bytes
, highest
;
6288 Lisp_Object coding_system
;
6290 unsigned char *src_end
= src
+ src_bytes
;
6291 int mask
= CATEGORY_MASK_ANY
;
6294 Lisp_Object attrs
, eol_type
;
6296 struct coding_system coding
;
6299 if (NILP (coding_system
))
6300 coding_system
= Qundecided
;
6301 setup_coding_system (coding_system
, &coding
);
6302 attrs
= CODING_ID_ATTRS (coding
.id
);
6303 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6304 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
6306 coding
.source
= src
;
6307 coding
.src_bytes
= src_bytes
;
6308 coding
.src_multibyte
= multibytep
;
6309 coding
.consumed
= 0;
6310 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6312 /* At first, detect text-format if necessary. */
6313 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6315 for (; src
< src_end
; src
++)
6319 || (c
< 0x20 && (c
== ISO_CODE_ESC
6322 /* Most UTF-16 text contains '\0'. */
6326 coding
.head_ascii
= src
- coding
.source
;
6329 for (i
= 0; i
< coding_category_raw_text
; i
++)
6331 enum coding_category category
= coding_priorities
[i
];
6332 struct coding_system
*this = coding_categories
+ category
;
6336 /* No coding system of this category is defined. */
6337 mask
&= ~(1 << category
);
6339 else if (category
>= coding_category_raw_text
6340 || detected
& (1 << category
))
6344 detected
|= detected_mask
[category
];
6345 if ((*(coding_categories
[category
].detector
)) (&coding
, &mask
)
6347 && (mask
& (1 << category
)))
6349 mask
= 1 << category
;
6357 id
= coding_categories
[coding_category_raw_text
].id
;
6358 val
= Fcons (make_number (id
), Qnil
);
6360 else if (mask
== CATEGORY_MASK_ANY
)
6362 id
= coding_categories
[coding_category_undecided
].id
;
6363 val
= Fcons (make_number (id
), Qnil
);
6367 for (i
= 0; i
< coding_category_raw_text
; i
++)
6368 if (mask
& (1 << coding_priorities
[i
]))
6370 id
= coding_categories
[coding_priorities
[i
]].id
;
6371 val
= Fcons (make_number (id
), Qnil
);
6378 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6379 if (mask
& (1 << coding_priorities
[i
]))
6381 id
= coding_categories
[coding_priorities
[i
]].id
;
6382 val
= Fcons (make_number (id
), val
);
6388 mask
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6389 val
= Fcons (make_number (coding
.id
), Qnil
);
6392 /* Then, detect eol-format if necessary. */
6394 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
6397 if (VECTORP (eol_type
))
6399 if (mask
& ~CATEGORY_MASK_UTF_16
)
6400 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6401 coding_category_raw_text
);
6402 if (mask
& (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_BE_NOSIG
))
6403 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6404 coding_category_utf_16_be
);
6405 if (mask
& (CATEGORY_MASK_UTF_16_LE
| CATEGORY_MASK_UTF_16_LE_NOSIG
))
6406 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6407 coding_category_utf_16_le
);
6411 if (EQ (eol_type
, Qunix
))
6412 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6413 else if (EQ (eol_type
, Qdos
))
6414 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6416 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
6419 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6421 enum coding_category category
;
6424 id
= XINT (XCAR (tail
));
6425 attrs
= CODING_ID_ATTRS (id
);
6426 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6427 eol_type
= CODING_ID_EOL_TYPE (id
);
6428 if (VECTORP (eol_type
))
6430 if (category
== coding_category_utf_16_be
6431 || category
== coding_category_utf_16_be_nosig
)
6432 this_eol
= utf_16_be_eol
;
6433 else if (category
== coding_category_utf_16_le
6434 || category
== coding_category_utf_16_le_nosig
)
6435 this_eol
= utf_16_le_eol
;
6437 this_eol
= normal_eol
;
6439 if (this_eol
== EOL_SEEN_LF
)
6440 XSETCAR (tail
, AREF (eol_type
, 0));
6441 else if (this_eol
== EOL_SEEN_CRLF
)
6442 XSETCAR (tail
, AREF (eol_type
, 1));
6443 else if (this_eol
== EOL_SEEN_CR
)
6444 XSETCAR (tail
, AREF (eol_type
, 2));
6446 XSETCAR (tail
, CODING_ID_NAME (id
));
6449 XSETCAR (tail
, CODING_ID_NAME (id
));
6453 return (highest
? XCAR (val
) : val
);
6457 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6459 doc
: /* Detect coding system of the text in the region between START and END.
6460 Return a list of possible coding systems ordered by priority.
6462 If only ASCII characters are found, it returns a list of single element
6463 `undecided' or its subsidiary coding system according to a detected
6466 If optional argument HIGHEST is non-nil, return the coding system of
6467 highest priority. */)
6468 (start
, end
, highest
)
6469 Lisp_Object start
, end
, highest
;
6472 int from_byte
, to_byte
;
6474 CHECK_NUMBER_COERCE_MARKER (start
);
6475 CHECK_NUMBER_COERCE_MARKER (end
);
6477 validate_region (&start
, &end
);
6478 from
= XINT (start
), to
= XINT (end
);
6479 from_byte
= CHAR_TO_BYTE (from
);
6480 to_byte
= CHAR_TO_BYTE (to
);
6482 if (from
< GPT
&& to
>= GPT
)
6483 move_gap_both (to
, to_byte
);
6485 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6486 to_byte
- from_byte
,
6488 !NILP (current_buffer
6489 ->enable_multibyte_characters
),
6493 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6495 doc
: /* Detect coding system of the text in STRING.
6496 Return a list of possible coding systems ordered by priority.
6498 If only ASCII characters are found, it returns a list of single element
6499 `undecided' or its subsidiary coding system according to a detected
6502 If optional argument HIGHEST is non-nil, return the coding system of
6503 highest priority. */)
6505 Lisp_Object string
, highest
;
6507 CHECK_STRING (string
);
6509 return detect_coding_system (XSTRING (string
)->data
,
6510 STRING_BYTES (XSTRING (string
)),
6512 STRING_MULTIBYTE (string
),
6518 char_encodable_p (c
, attrs
)
6523 struct charset
*charset
;
6525 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
6526 CONSP (tail
); tail
= XCDR (tail
))
6528 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
6529 if (CHAR_CHARSET_P (c
, charset
))
6532 return (! NILP (tail
));
6536 /* Return a list of coding systems that safely encode the text between
6537 START and END. If EXCLUDE is non-nil, it is a list of coding
6538 systems not to check. The returned list doesn't contain any such
6539 coding systems. In any case, if the text contains only ASCII or is
6540 unibyte, return t. */
6542 DEFUN ("find-coding-systems-region-internal",
6543 Ffind_coding_systems_region_internal
,
6544 Sfind_coding_systems_region_internal
, 2, 3, 0,
6545 doc
: /* Internal use only. */)
6546 (start
, end
, exclude
)
6547 Lisp_Object start
, end
, exclude
;
6549 Lisp_Object coding_attrs_list
, safe_codings
;
6550 EMACS_INT start_byte
, end_byte
;
6551 const unsigned char *p
, *pbeg
, *pend
;
6553 Lisp_Object tail
, elt
;
6555 if (STRINGP (start
))
6557 if (!STRING_MULTIBYTE (start
)
6558 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
6561 end_byte
= STRING_BYTES (XSTRING (start
));
6565 CHECK_NUMBER_COERCE_MARKER (start
);
6566 CHECK_NUMBER_COERCE_MARKER (end
);
6567 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6568 args_out_of_range (start
, end
);
6569 if (NILP (current_buffer
->enable_multibyte_characters
))
6571 start_byte
= CHAR_TO_BYTE (XINT (start
));
6572 end_byte
= CHAR_TO_BYTE (XINT (end
));
6573 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6576 if (start
< GPT
&& end
> GPT
)
6578 if ((GPT
- start
) < (end
- GPT
))
6579 move_gap_both (start
, start_byte
);
6581 move_gap_both (end
, end_byte
);
6585 coding_attrs_list
= Qnil
;
6586 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6588 || NILP (Fmemq (XCAR (tail
), exclude
)))
6592 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
6593 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
6594 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
6595 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
6598 if (STRINGP (start
))
6599 p
= pbeg
= XSTRING (start
)->data
;
6601 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6602 pend
= p
+ (end_byte
- start_byte
);
6604 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
6605 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6609 if (ASCII_BYTE_P (*p
))
6613 c
= STRING_CHAR_ADVANCE (p
);
6615 charset_map_loaded
= 0;
6616 for (tail
= coding_attrs_list
; CONSP (tail
);)
6621 else if (char_encodable_p (c
, elt
))
6623 else if (CONSP (XCDR (tail
)))
6625 XSETCAR (tail
, XCAR (XCDR (tail
)));
6626 XSETCDR (tail
, XCDR (XCDR (tail
)));
6630 XSETCAR (tail
, Qnil
);
6634 if (charset_map_loaded
)
6636 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6638 if (STRINGP (start
))
6639 pbeg
= XSTRING (start
)->data
;
6641 pbeg
= BYTE_POS_ADDR (start_byte
);
6642 p
= pbeg
+ p_offset
;
6643 pend
= pbeg
+ pend_offset
;
6648 safe_codings
= Qnil
;
6649 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
6650 if (! NILP (XCAR (tail
)))
6651 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
6653 return safe_codings
;
6657 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
6658 Scheck_coding_systems_region
, 3, 3, 0,
6659 doc
: /* Check if the region is encodable by coding systems.
6661 START and END are buffer positions specifying the region.
6662 CODING-SYSTEM-LIST is a list of coding systems to check.
6664 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6665 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6666 whole region, POS0, POS1, ... are buffer positions where non-encodable
6667 characters are found.
6669 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6672 START may be a string. In that case, check if the string is
6673 encodable, and the value contains indices to the string instead of
6674 buffer positions. END is ignored. */)
6675 (start
, end
, coding_system_list
)
6676 Lisp_Object start
, end
, coding_system_list
;
6679 EMACS_INT start_byte
, end_byte
;
6681 const unsigned char *p
, *pbeg
, *pend
;
6683 Lisp_Object tail
, elt
;
6685 if (STRINGP (start
))
6687 if (!STRING_MULTIBYTE (start
)
6688 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
6691 end_byte
= STRING_BYTES (XSTRING (start
));
6696 CHECK_NUMBER_COERCE_MARKER (start
);
6697 CHECK_NUMBER_COERCE_MARKER (end
);
6698 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
6699 args_out_of_range (start
, end
);
6700 if (NILP (current_buffer
->enable_multibyte_characters
))
6702 start_byte
= CHAR_TO_BYTE (XINT (start
));
6703 end_byte
= CHAR_TO_BYTE (XINT (end
));
6704 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
6707 if (start
< GPT
&& end
> GPT
)
6709 if ((GPT
- start
) < (end
- GPT
))
6710 move_gap_both (start
, start_byte
);
6712 move_gap_both (end
, end_byte
);
6718 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
6721 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
6726 if (STRINGP (start
))
6727 p
= pbeg
= XSTRING (start
)->data
;
6729 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
6730 pend
= p
+ (end_byte
- start_byte
);
6732 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
6733 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
6737 if (ASCII_BYTE_P (*p
))
6741 c
= STRING_CHAR_ADVANCE (p
);
6743 charset_map_loaded
= 0;
6744 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
6746 elt
= XCDR (XCAR (tail
));
6747 if (! char_encodable_p (c
, XCAR (elt
)))
6748 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
6750 if (charset_map_loaded
)
6752 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
6754 if (STRINGP (start
))
6755 pbeg
= XSTRING (start
)->data
;
6757 pbeg
= BYTE_POS_ADDR (start_byte
);
6758 p
= pbeg
+ p_offset
;
6759 pend
= pbeg
+ pend_offset
;
6767 for (; CONSP (tail
); tail
= XCDR (tail
))
6770 if (CONSP (XCDR (XCDR (elt
))))
6771 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
6781 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
6782 Lisp_Object start
, end
, coding_system
, dst_object
;
6783 int encodep
, norecord
;
6785 struct coding_system coding
;
6786 EMACS_INT from
, from_byte
, to
, to_byte
;
6787 Lisp_Object src_object
;
6789 CHECK_NUMBER_COERCE_MARKER (start
);
6790 CHECK_NUMBER_COERCE_MARKER (end
);
6791 if (NILP (coding_system
))
6792 coding_system
= Qno_conversion
;
6794 CHECK_CODING_SYSTEM (coding_system
);
6795 src_object
= Fcurrent_buffer ();
6796 if (NILP (dst_object
))
6797 dst_object
= src_object
;
6798 else if (! EQ (dst_object
, Qt
))
6799 CHECK_BUFFER (dst_object
);
6801 validate_region (&start
, &end
);
6802 from
= XFASTINT (start
);
6803 from_byte
= CHAR_TO_BYTE (from
);
6804 to
= XFASTINT (end
);
6805 to_byte
= CHAR_TO_BYTE (to
);
6807 setup_coding_system (coding_system
, &coding
);
6808 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6811 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6814 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
6817 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6819 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6820 error ("Code conversion error: %d", coding
.result
);
6822 return (BUFFERP (dst_object
)
6823 ? make_number (coding
.produced_char
)
6824 : coding
.dst_object
);
6828 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
6829 3, 4, "r\nzCoding system: ",
6830 doc
: /* Decode the current region from the specified coding system.
6831 When called from a program, takes four arguments:
6832 START, END, CODING-SYSTEM, and DESTINATION.
6833 START and END are buffer positions.
6835 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6836 If nil, the region between START and END is replace by the decoded text.
6837 If buffer, the decoded text is inserted in the buffer.
6838 If t, the decoded text is returned.
6840 This function sets `last-coding-system-used' to the precise coding system
6841 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6842 not fully specified.)
6843 It returns the length of the decoded text. */)
6844 (start
, end
, coding_system
, destination
)
6845 Lisp_Object start
, end
, coding_system
, destination
;
6847 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
6850 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
6851 3, 4, "r\nzCoding system: ",
6852 doc
: /* Encode the current region by specified coding system.
6853 When called from a program, takes three arguments:
6854 START, END, and CODING-SYSTEM. START and END are buffer positions.
6856 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6857 If nil, the region between START and END is replace by the encoded text.
6858 If buffer, the encoded text is inserted in the buffer.
6859 If t, the encoded text is returned.
6861 This function sets `last-coding-system-used' to the precise coding system
6862 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6863 not fully specified.)
6864 It returns the length of the encoded text. */)
6865 (start
, end
, coding_system
, destination
)
6866 Lisp_Object start
, end
, coding_system
, destination
;
6868 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
6872 code_convert_string (string
, coding_system
, dst_object
,
6873 encodep
, nocopy
, norecord
)
6874 Lisp_Object string
, coding_system
, dst_object
;
6875 int encodep
, nocopy
, norecord
;
6877 struct coding_system coding
;
6878 EMACS_INT chars
, bytes
;
6880 CHECK_STRING (string
);
6881 if (NILP (coding_system
))
6884 Vlast_coding_system_used
= Qno_conversion
;
6885 if (NILP (dst_object
))
6886 return (nocopy
? Fcopy_sequence (string
) : string
);
6889 if (NILP (coding_system
))
6890 coding_system
= Qno_conversion
;
6892 CHECK_CODING_SYSTEM (coding_system
);
6893 if (NILP (dst_object
))
6895 else if (! EQ (dst_object
, Qt
))
6896 CHECK_BUFFER (dst_object
);
6898 setup_coding_system (coding_system
, &coding
);
6899 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
6900 chars
= XSTRING (string
)->size
;
6901 bytes
= STRING_BYTES (XSTRING (string
));
6903 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6905 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
6907 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
6909 if (coding
.result
!= CODING_RESULT_SUCCESS
)
6910 error ("Code conversion error: %d", coding
.result
);
6912 return (BUFFERP (dst_object
)
6913 ? make_number (coding
.produced_char
)
6914 : coding
.dst_object
);
6918 /* Encode or decode STRING according to CODING_SYSTEM.
6919 Do not set Vlast_coding_system_used.
6921 This function is called only from macros DECODE_FILE and
6922 ENCODE_FILE, thus we ignore character composition. */
6925 code_convert_string_norecord (string
, coding_system
, encodep
)
6926 Lisp_Object string
, coding_system
;
6929 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
6933 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
6935 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6937 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6938 if the decoding operation is trivial.
6940 Optional fourth arg BUFFER non-nil meant that the decoded text is
6941 inserted in BUFFER instead of returned as a string. In this case,
6942 the return value is BUFFER.
6944 This function sets `last-coding-system-used' to the precise coding system
6945 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6946 not fully specified. */)
6947 (string
, coding_system
, nocopy
, buffer
)
6948 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6950 return code_convert_string (string
, coding_system
, buffer
,
6951 0, ! NILP (nocopy
), 0);
6954 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
6956 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
6958 Optional third arg NOCOPY non-nil means it is OK to return STRING
6959 itself if the encoding operation is trivial.
6961 Optional fourth arg BUFFER non-nil meant that the encoded text is
6962 inserted in BUFFER instead of returned as a string. In this case,
6963 the return value is BUFFER.
6965 This function sets `last-coding-system-used' to the precise coding system
6966 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6967 not fully specified.) */)
6968 (string
, coding_system
, nocopy
, buffer
)
6969 Lisp_Object string
, coding_system
, nocopy
, buffer
;
6971 return code_convert_string (string
, coding_system
, buffer
,
6972 1, ! NILP (nocopy
), 1);
6976 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
6977 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
6978 Return the corresponding character. */)
6982 Lisp_Object spec
, attrs
, val
;
6983 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
6986 CHECK_NATNUM (code
);
6987 c
= XFASTINT (code
);
6988 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
6989 attrs
= AREF (spec
, 0);
6991 if (ASCII_BYTE_P (c
)
6992 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
6995 val
= CODING_ATTR_CHARSET_LIST (attrs
);
6996 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6997 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
6998 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7001 charset
= charset_roman
;
7002 else if (c
>= 0xA0 && c
< 0xDF)
7004 charset
= charset_kana
;
7009 int s1
= c
>> 8, s2
= c
& 0xFF;
7011 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7012 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7013 error ("Invalid code: %d", code
);
7015 charset
= charset_kanji
;
7017 c
= DECODE_CHAR (charset
, c
);
7019 error ("Invalid code: %d", code
);
7020 return make_number (c
);
7024 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7025 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7026 Return the corresponding code in SJIS. */)
7030 Lisp_Object spec
, attrs
, charset_list
;
7032 struct charset
*charset
;
7035 CHECK_CHARACTER (ch
);
7037 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7038 attrs
= AREF (spec
, 0);
7040 if (ASCII_CHAR_P (c
)
7041 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7044 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7045 charset
= char_charset (c
, charset_list
, &code
);
7046 if (code
== CHARSET_INVALID_CODE (charset
))
7047 error ("Can't encode by shift_jis encoding: %d", c
);
7050 return make_number (code
);
7053 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7054 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7055 Return the corresponding character. */)
7059 Lisp_Object spec
, attrs
, val
;
7060 struct charset
*charset_roman
, *charset_big5
, *charset
;
7063 CHECK_NATNUM (code
);
7064 c
= XFASTINT (code
);
7065 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7066 attrs
= AREF (spec
, 0);
7068 if (ASCII_BYTE_P (c
)
7069 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7072 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7073 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7074 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7077 charset
= charset_roman
;
7080 int b1
= c
>> 8, b2
= c
& 0x7F;
7081 if (b1
< 0xA1 || b1
> 0xFE
7082 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7083 error ("Invalid code: %d", code
);
7084 charset
= charset_big5
;
7086 c
= DECODE_CHAR (charset
, (unsigned )c
);
7088 error ("Invalid code: %d", code
);
7089 return make_number (c
);
7092 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7093 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7094 Return the corresponding character code in Big5. */)
7098 Lisp_Object spec
, attrs
, charset_list
;
7099 struct charset
*charset
;
7103 CHECK_CHARACTER (ch
);
7105 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7106 attrs
= AREF (spec
, 0);
7107 if (ASCII_CHAR_P (c
)
7108 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7111 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7112 charset
= char_charset (c
, charset_list
, &code
);
7113 if (code
== CHARSET_INVALID_CODE (charset
))
7114 error ("Can't encode by Big5 encoding: %d", c
);
7116 return make_number (code
);
7120 DEFUN ("set-terminal-coding-system-internal",
7121 Fset_terminal_coding_system_internal
,
7122 Sset_terminal_coding_system_internal
, 1, 1, 0,
7123 doc
: /* Internal use only. */)
7125 Lisp_Object coding_system
;
7127 CHECK_SYMBOL (coding_system
);
7128 setup_coding_system (Fcheck_coding_system (coding_system
),
7131 /* We had better not send unsafe characters to terminal. */
7132 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7133 /* Characer composition should be disabled. */
7134 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7135 terminal_coding
.src_multibyte
= 1;
7136 terminal_coding
.dst_multibyte
= 0;
7140 DEFUN ("set-safe-terminal-coding-system-internal",
7141 Fset_safe_terminal_coding_system_internal
,
7142 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7143 doc
: /* Internal use only. */)
7145 Lisp_Object coding_system
;
7147 CHECK_SYMBOL (coding_system
);
7148 setup_coding_system (Fcheck_coding_system (coding_system
),
7149 &safe_terminal_coding
);
7150 /* Characer composition should be disabled. */
7151 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7152 safe_terminal_coding
.src_multibyte
= 1;
7153 safe_terminal_coding
.dst_multibyte
= 0;
7157 DEFUN ("terminal-coding-system",
7158 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7159 doc
: /* Return coding system specified for terminal output. */)
7162 return CODING_ID_NAME (terminal_coding
.id
);
7165 DEFUN ("set-keyboard-coding-system-internal",
7166 Fset_keyboard_coding_system_internal
,
7167 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7168 doc
: /* Internal use only. */)
7170 Lisp_Object coding_system
;
7172 CHECK_SYMBOL (coding_system
);
7173 setup_coding_system (Fcheck_coding_system (coding_system
),
7175 /* Characer composition should be disabled. */
7176 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7180 DEFUN ("keyboard-coding-system",
7181 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7182 doc
: /* Return coding system specified for decoding keyboard input. */)
7185 return CODING_ID_NAME (keyboard_coding
.id
);
7189 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7190 Sfind_operation_coding_system
, 1, MANY
, 0,
7191 doc
: /* Choose a coding system for an operation based on the target name.
7192 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7193 DECODING-SYSTEM is the coding system to use for decoding
7194 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7195 for encoding (in case OPERATION does encoding).
7197 The first argument OPERATION specifies an I/O primitive:
7198 For file I/O, `insert-file-contents' or `write-region'.
7199 For process I/O, `call-process', `call-process-region', or `start-process'.
7200 For network I/O, `open-network-stream'.
7202 The remaining arguments should be the same arguments that were passed
7203 to the primitive. Depending on which primitive, one of those arguments
7204 is selected as the TARGET. For example, if OPERATION does file I/O,
7205 whichever argument specifies the file name is TARGET.
7207 TARGET has a meaning which depends on OPERATION:
7208 For file I/O, TARGET is a file name.
7209 For process I/O, TARGET is a process name.
7210 For network I/O, TARGET is a service name or a port number
7212 This function looks up what specified for TARGET in,
7213 `file-coding-system-alist', `process-coding-system-alist',
7214 or `network-coding-system-alist' depending on OPERATION.
7215 They may specify a coding system, a cons of coding systems,
7216 or a function symbol to call.
7217 In the last case, we call the function with one argument,
7218 which is a list of all the arguments given to this function.
7220 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7225 Lisp_Object operation
, target_idx
, target
, val
;
7226 register Lisp_Object chain
;
7229 error ("Too few arguments");
7230 operation
= args
[0];
7231 if (!SYMBOLP (operation
)
7232 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7233 error ("Invalid first arguement");
7234 if (nargs
< 1 + XINT (target_idx
))
7235 error ("Too few arguments for operation: %s",
7236 XSYMBOL (operation
)->name
->data
);
7237 target
= args
[XINT (target_idx
) + 1];
7238 if (!(STRINGP (target
)
7239 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7240 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7242 chain
= ((EQ (operation
, Qinsert_file_contents
)
7243 || EQ (operation
, Qwrite_region
))
7244 ? Vfile_coding_system_alist
7245 : (EQ (operation
, Qopen_network_stream
)
7246 ? Vnetwork_coding_system_alist
7247 : Vprocess_coding_system_alist
));
7251 for (; CONSP (chain
); chain
= XCDR (chain
))
7257 && ((STRINGP (target
)
7258 && STRINGP (XCAR (elt
))
7259 && fast_string_match (XCAR (elt
), target
) >= 0)
7260 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7263 /* Here, if VAL is both a valid coding system and a valid
7264 function symbol, we return VAL as a coding system. */
7267 if (! SYMBOLP (val
))
7269 if (! NILP (Fcoding_system_p (val
)))
7270 return Fcons (val
, val
);
7271 if (! NILP (Ffboundp (val
)))
7273 val
= call1 (val
, Flist (nargs
, args
));
7276 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7277 return Fcons (val
, val
);
7285 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7286 Sset_coding_system_priority
, 0, MANY
, 0,
7287 doc
: /* Assign higher priority to the coding systems given as arguments.
7288 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7294 int changed
[coding_category_max
];
7295 enum coding_category priorities
[coding_category_max
];
7297 bzero (changed
, sizeof changed
);
7299 for (i
= j
= 0; i
< nargs
; i
++)
7301 enum coding_category category
;
7302 Lisp_Object spec
, attrs
;
7304 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7305 attrs
= AREF (spec
, 0);
7306 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7307 if (changed
[category
])
7308 /* Ignore this coding system because a coding system of the
7309 same category already had a higher priority. */
7311 changed
[category
] = 1;
7312 priorities
[j
++] = category
;
7313 if (coding_categories
[category
].id
>= 0
7314 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7315 setup_coding_system (args
[i
], &coding_categories
[category
]);
7318 /* Now we have decided top J priorities. Reflect the order of the
7319 original priorities to the remaining priorities. */
7321 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7323 while (j
< coding_category_max
7324 && changed
[coding_priorities
[j
]])
7326 if (j
== coding_category_max
)
7328 priorities
[i
] = coding_priorities
[j
];
7331 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7335 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7336 Scoding_system_priority_list
, 0, 1, 0,
7337 doc
: /* Return a list of coding systems ordered by their priorities.
7338 HIGHESTP non-nil means just return the highest priority one. */)
7340 Lisp_Object highestp
;
7345 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7347 enum coding_category category
= coding_priorities
[i
];
7348 int id
= coding_categories
[category
].id
;
7353 attrs
= CODING_ID_ATTRS (id
);
7354 if (! NILP (highestp
))
7355 return CODING_ATTR_BASE_NAME (attrs
);
7356 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7358 return Fnreverse (val
);
7361 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7364 make_subsidiaries (base
)
7367 Lisp_Object subsidiaries
;
7368 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
7369 char *buf
= (char *) alloca (base_name_len
+ 6);
7372 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7373 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7374 for (i
= 0; i
< 3; i
++)
7376 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7377 ASET (subsidiaries
, i
, intern (buf
));
7379 return subsidiaries
;
7383 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7384 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7385 doc
: /* For internal use only.
7386 usage: (define-coding-system-internal ...) */)
7392 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7393 Lisp_Object attrs
; /* Vector of attributes. */
7394 Lisp_Object eol_type
;
7395 Lisp_Object aliases
;
7396 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7397 enum coding_category category
;
7398 Lisp_Object tail
, val
;
7399 int max_charset_id
= 0;
7402 if (nargs
< coding_arg_max
)
7405 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7407 name
= args
[coding_arg_name
];
7408 CHECK_SYMBOL (name
);
7409 CODING_ATTR_BASE_NAME (attrs
) = name
;
7411 val
= args
[coding_arg_mnemonic
];
7412 if (! STRINGP (val
))
7413 CHECK_CHARACTER (val
);
7414 CODING_ATTR_MNEMONIC (attrs
) = val
;
7416 coding_type
= args
[coding_arg_coding_type
];
7417 CHECK_SYMBOL (coding_type
);
7418 CODING_ATTR_TYPE (attrs
) = coding_type
;
7420 charset_list
= args
[coding_arg_charset_list
];
7421 if (SYMBOLP (charset_list
))
7423 if (EQ (charset_list
, Qiso_2022
))
7425 if (! EQ (coding_type
, Qiso_2022
))
7426 error ("Invalid charset-list");
7427 charset_list
= Viso_2022_charset_list
;
7429 else if (EQ (charset_list
, Qemacs_mule
))
7431 if (! EQ (coding_type
, Qemacs_mule
))
7432 error ("Invalid charset-list");
7433 charset_list
= Vemacs_mule_charset_list
;
7435 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7436 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7437 max_charset_id
= XFASTINT (XCAR (tail
));
7441 charset_list
= Fcopy_sequence (charset_list
);
7442 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7444 struct charset
*charset
;
7447 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7448 if (EQ (coding_type
, Qiso_2022
)
7449 ? CHARSET_ISO_FINAL (charset
) < 0
7450 : EQ (coding_type
, Qemacs_mule
)
7451 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7453 error ("Can't handle charset `%s'",
7454 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7456 XCAR (tail
) = make_number (charset
->id
);
7457 if (max_charset_id
< charset
->id
)
7458 max_charset_id
= charset
->id
;
7461 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7463 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7465 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7466 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7467 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7469 val
= args
[coding_arg_decode_translation_table
];
7471 CHECK_CHAR_TABLE (val
);
7472 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7474 val
= args
[coding_arg_encode_translation_table
];
7476 CHECK_CHAR_TABLE (val
);
7477 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7479 val
= args
[coding_arg_post_read_conversion
];
7481 CODING_ATTR_POST_READ (attrs
) = val
;
7483 val
= args
[coding_arg_pre_write_conversion
];
7485 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7487 val
= args
[coding_arg_default_char
];
7489 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7492 CHECK_CHARACTER (val
);
7493 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7496 val
= args
[coding_arg_plist
];
7498 CODING_ATTR_PLIST (attrs
) = val
;
7500 if (EQ (coding_type
, Qcharset
))
7502 /* Generate a lisp vector of 256 elements. Each element is nil,
7503 integer, or a list of charset IDs.
7505 If Nth element is nil, the byte code N is invalid in this
7508 If Nth element is a number NUM, N is the first byte of a
7509 charset whose ID is NUM.
7511 If Nth element is a list of charset IDs, N is the first byte
7512 of one of them. The list is sorted by dimensions of the
7513 charsets. A charset of smaller dimension comes firtst.
7515 val
= Fmake_vector (make_number (256), Qnil
);
7517 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7519 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
7520 int dim
= CHARSET_DIMENSION (charset
);
7521 int idx
= (dim
- 1) * 4;
7523 for (i
= charset
->code_space
[idx
];
7524 i
<= charset
->code_space
[idx
+ 1]; i
++)
7526 Lisp_Object tmp
, tmp2
;
7529 tmp
= AREF (val
, i
);
7532 else if (NUMBERP (tmp
))
7534 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
7536 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
7538 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
7542 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
7544 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
7549 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
7552 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
7553 XSETCAR (tmp2
, XCAR (tail
));
7559 ASET (attrs
, coding_attr_charset_valids
, val
);
7560 category
= coding_category_charset
;
7562 else if (EQ (coding_type
, Qccl
))
7566 if (nargs
< coding_arg_ccl_max
)
7569 val
= args
[coding_arg_ccl_decoder
];
7570 CHECK_CCL_PROGRAM (val
);
7572 val
= Fcopy_sequence (val
);
7573 ASET (attrs
, coding_attr_ccl_decoder
, val
);
7575 val
= args
[coding_arg_ccl_encoder
];
7576 CHECK_CCL_PROGRAM (val
);
7578 val
= Fcopy_sequence (val
);
7579 ASET (attrs
, coding_attr_ccl_encoder
, val
);
7581 val
= args
[coding_arg_ccl_valids
];
7582 valids
= Fmake_string (make_number (256), make_number (0));
7583 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
7587 ASET (valids
, XINT (val
), make_number (1));
7593 CHECK_NUMBER (XCAR (val
));
7594 CHECK_NUMBER (XCDR (val
));
7595 from
= XINT (XCAR (val
));
7596 to
= XINT (XCDR (val
));
7597 for (i
= from
; i
<= to
; i
++)
7598 ASET (valids
, i
, make_number (1));
7601 ASET (attrs
, coding_attr_ccl_valids
, valids
);
7603 category
= coding_category_ccl
;
7605 else if (EQ (coding_type
, Qutf_16
))
7607 Lisp_Object bom
, endian
;
7609 if (nargs
< coding_arg_utf16_max
)
7612 bom
= args
[coding_arg_utf16_bom
];
7613 if (! NILP (bom
) && ! EQ (bom
, Qt
))
7616 CHECK_CODING_SYSTEM (XCAR (bom
));
7617 CHECK_CODING_SYSTEM (XCDR (bom
));
7619 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
7621 endian
= args
[coding_arg_utf16_endian
];
7622 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
7624 category
= (CONSP (bom
)
7625 ? coding_category_utf_16_auto
7628 ? coding_category_utf_16_be_nosig
7629 : coding_category_utf_16_le_nosig
)
7631 ? coding_category_utf_16_be
7632 : coding_category_utf_16_le
));
7634 else if (EQ (coding_type
, Qiso_2022
))
7636 Lisp_Object initial
, reg_usage
, request
, flags
;
7639 if (nargs
< coding_arg_iso2022_max
)
7642 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
7643 CHECK_VECTOR (initial
);
7644 for (i
= 0; i
< 4; i
++)
7646 val
= Faref (initial
, make_number (i
));
7649 CHECK_CHARSET_GET_ID (val
, id
);
7650 ASET (initial
, i
, make_number (id
));
7653 ASET (initial
, i
, make_number (-1));
7656 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
7657 CHECK_CONS (reg_usage
);
7658 CHECK_NATNUM (XCAR (reg_usage
));
7659 CHECK_NATNUM (XCDR (reg_usage
));
7661 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
7662 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
7668 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
7669 CHECK_NATNUM (XCDR (val
));
7670 if (XINT (XCDR (val
)) >= 4)
7671 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
7672 XCAR (val
) = make_number (id
);
7675 flags
= args
[coding_arg_iso2022_flags
];
7676 CHECK_NATNUM (flags
);
7678 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
7679 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
7681 ASET (attrs
, coding_attr_iso_initial
, initial
);
7682 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
7683 ASET (attrs
, coding_attr_iso_request
, request
);
7684 ASET (attrs
, coding_attr_iso_flags
, flags
);
7685 setup_iso_safe_charsets (attrs
);
7687 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
7688 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
7689 | CODING_ISO_FLAG_SINGLE_SHIFT
))
7690 ? coding_category_iso_7_else
7691 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7692 ? coding_category_iso_7
7693 : coding_category_iso_7_tight
);
7696 int id
= XINT (AREF (initial
, 1));
7698 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
7699 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
7701 ? coding_category_iso_8_else
7702 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
7703 ? coding_category_iso_8_1
7704 : coding_category_iso_8_2
);
7707 else if (EQ (coding_type
, Qemacs_mule
))
7709 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
7710 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
7712 category
= coding_category_emacs_mule
;
7714 else if (EQ (coding_type
, Qshift_jis
))
7717 struct charset
*charset
;
7719 if (XINT (Flength (charset_list
)) != 3)
7720 error ("There should be just three charsets");
7722 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7723 if (CHARSET_DIMENSION (charset
) != 1)
7724 error ("Dimension of charset %s is not one",
7725 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7727 charset_list
= XCDR (charset_list
);
7728 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7729 if (CHARSET_DIMENSION (charset
) != 1)
7730 error ("Dimension of charset %s is not one",
7731 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7733 charset_list
= XCDR (charset_list
);
7734 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7735 if (CHARSET_DIMENSION (charset
) != 2)
7736 error ("Dimension of charset %s is not two",
7737 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7739 category
= coding_category_sjis
;
7740 Vsjis_coding_system
= name
;
7742 else if (EQ (coding_type
, Qbig5
))
7744 struct charset
*charset
;
7746 if (XINT (Flength (charset_list
)) != 2)
7747 error ("There should be just two charsets");
7749 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7750 if (CHARSET_DIMENSION (charset
) != 1)
7751 error ("Dimension of charset %s is not one",
7752 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7754 charset_list
= XCDR (charset_list
);
7755 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
7756 if (CHARSET_DIMENSION (charset
) != 2)
7757 error ("Dimension of charset %s is not two",
7758 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7760 category
= coding_category_big5
;
7761 Vbig5_coding_system
= name
;
7763 else if (EQ (coding_type
, Qraw_text
))
7764 category
= coding_category_raw_text
;
7765 else if (EQ (coding_type
, Qutf_8
))
7766 category
= coding_category_utf_8
;
7767 else if (EQ (coding_type
, Qundecided
))
7768 category
= coding_category_undecided
;
7770 error ("Invalid coding system type: %s",
7771 XSYMBOL (coding_type
)->name
->data
);
7773 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
7775 eol_type
= args
[coding_arg_eol_type
];
7776 if (! NILP (eol_type
)
7777 && ! EQ (eol_type
, Qunix
)
7778 && ! EQ (eol_type
, Qdos
)
7779 && ! EQ (eol_type
, Qmac
))
7780 error ("Invalid eol-type");
7782 aliases
= Fcons (name
, Qnil
);
7784 if (NILP (eol_type
))
7786 eol_type
= make_subsidiaries (name
);
7787 for (i
= 0; i
< 3; i
++)
7789 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
7791 this_name
= AREF (eol_type
, i
);
7792 this_aliases
= Fcons (this_name
, Qnil
);
7793 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
7794 this_spec
= Fmake_vector (make_number (3), attrs
);
7795 ASET (this_spec
, 1, this_aliases
);
7796 ASET (this_spec
, 2, this_eol_type
);
7797 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
7798 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
7799 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
7800 Vcoding_system_alist
);
7804 spec_vec
= Fmake_vector (make_number (3), attrs
);
7805 ASET (spec_vec
, 1, aliases
);
7806 ASET (spec_vec
, 2, eol_type
);
7808 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
7809 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
7810 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
7811 Vcoding_system_alist
);
7814 int id
= coding_categories
[category
].id
;
7816 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
7817 setup_coding_system (name
, &coding_categories
[category
]);
7823 return Fsignal (Qwrong_number_of_arguments
,
7824 Fcons (intern ("define-coding-system-internal"),
7825 make_number (nargs
)));
7828 /* Fixme: should this record the alias relationships for
7830 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
7831 Sdefine_coding_system_alias
, 2, 2, 0,
7832 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7833 (alias
, coding_system
)
7834 Lisp_Object alias
, coding_system
;
7836 Lisp_Object spec
, aliases
, eol_type
;
7838 CHECK_SYMBOL (alias
);
7839 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7840 aliases
= AREF (spec
, 1);
7841 while (!NILP (XCDR (aliases
)))
7842 aliases
= XCDR (aliases
);
7843 XCDR (aliases
) = Fcons (alias
, Qnil
);
7845 eol_type
= AREF (spec
, 2);
7846 if (VECTORP (eol_type
))
7848 Lisp_Object subsidiaries
;
7851 subsidiaries
= make_subsidiaries (alias
);
7852 for (i
= 0; i
< 3; i
++)
7853 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
7854 AREF (eol_type
, i
));
7856 ASET (spec
, 2, subsidiaries
);
7859 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
7860 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
7861 Vcoding_system_alist
);
7866 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
7868 doc
: /* Return the base of CODING-SYSTEM.
7869 Any alias or subsidiary coding system is not a base coding system. */)
7871 Lisp_Object coding_system
;
7873 Lisp_Object spec
, attrs
;
7875 if (NILP (coding_system
))
7876 return (Qno_conversion
);
7877 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7878 attrs
= AREF (spec
, 0);
7879 return CODING_ATTR_BASE_NAME (attrs
);
7882 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
7884 doc
: "Return the property list of CODING-SYSTEM.")
7886 Lisp_Object coding_system
;
7888 Lisp_Object spec
, attrs
;
7890 if (NILP (coding_system
))
7891 coding_system
= Qno_conversion
;
7892 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7893 attrs
= AREF (spec
, 0);
7894 return CODING_ATTR_PLIST (attrs
);
7898 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
7900 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
7902 Lisp_Object coding_system
;
7906 if (NILP (coding_system
))
7907 coding_system
= Qno_conversion
;
7908 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
7909 return AREF (spec
, 1);
7912 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
7913 Scoding_system_eol_type
, 1, 1, 0,
7914 doc
: /* Return eol-type of CODING-SYSTEM.
7915 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7917 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7918 and CR respectively.
7920 A vector value indicates that a format of end-of-line should be
7921 detected automatically. Nth element of the vector is the subsidiary
7922 coding system whose eol-type is N. */)
7924 Lisp_Object coding_system
;
7926 Lisp_Object spec
, eol_type
;
7929 if (NILP (coding_system
))
7930 coding_system
= Qno_conversion
;
7931 if (! CODING_SYSTEM_P (coding_system
))
7933 spec
= CODING_SYSTEM_SPEC (coding_system
);
7934 eol_type
= AREF (spec
, 2);
7935 if (VECTORP (eol_type
))
7936 return Fcopy_sequence (eol_type
);
7937 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
7938 return make_number (n
);
7944 /*** 9. Post-amble ***/
7951 for (i
= 0; i
< coding_category_max
; i
++)
7953 coding_categories
[i
].id
= -1;
7954 coding_priorities
[i
] = i
;
7957 /* ISO2022 specific initialize routine. */
7958 for (i
= 0; i
< 0x20; i
++)
7959 iso_code_class
[i
] = ISO_control_0
;
7960 for (i
= 0x21; i
< 0x7F; i
++)
7961 iso_code_class
[i
] = ISO_graphic_plane_0
;
7962 for (i
= 0x80; i
< 0xA0; i
++)
7963 iso_code_class
[i
] = ISO_control_1
;
7964 for (i
= 0xA1; i
< 0xFF; i
++)
7965 iso_code_class
[i
] = ISO_graphic_plane_1
;
7966 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
7967 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
7968 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
7969 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
7970 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
7971 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
7972 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
7973 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
7974 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
7975 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
7977 inhibit_pre_post_conversion
= 0;
7979 for (i
= 0; i
< 256; i
++)
7981 emacs_mule_bytes
[i
] = 1;
7983 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
7984 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
7985 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
7986 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
7994 staticpro (&Vcoding_system_hash_table
);
7995 Vcoding_system_hash_table
= Fmakehash (Qeq
);
7997 staticpro (&Vsjis_coding_system
);
7998 Vsjis_coding_system
= Qnil
;
8000 staticpro (&Vbig5_coding_system
);
8001 Vbig5_coding_system
= Qnil
;
8003 staticpro (&Vcode_conversion_work_buf_list
);
8004 Vcode_conversion_work_buf_list
= Qnil
;
8006 staticpro (&Vcode_conversion_reused_work_buf
);
8007 Vcode_conversion_reused_work_buf
= Qnil
;
8009 DEFSYM (Qcharset
, "charset");
8010 DEFSYM (Qtarget_idx
, "target-idx");
8011 DEFSYM (Qcoding_system_history
, "coding-system-history");
8012 Fset (Qcoding_system_history
, Qnil
);
8014 /* Target FILENAME is the first argument. */
8015 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8016 /* Target FILENAME is the third argument. */
8017 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8019 DEFSYM (Qcall_process
, "call-process");
8020 /* Target PROGRAM is the first argument. */
8021 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8023 DEFSYM (Qcall_process_region
, "call-process-region");
8024 /* Target PROGRAM is the third argument. */
8025 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8027 DEFSYM (Qstart_process
, "start-process");
8028 /* Target PROGRAM is the third argument. */
8029 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8031 DEFSYM (Qopen_network_stream
, "open-network-stream");
8032 /* Target SERVICE is the fourth argument. */
8033 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8035 DEFSYM (Qcoding_system
, "coding-system");
8036 DEFSYM (Qcoding_aliases
, "coding-aliases");
8038 DEFSYM (Qeol_type
, "eol-type");
8039 DEFSYM (Qunix
, "unix");
8040 DEFSYM (Qdos
, "dos");
8042 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8043 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8044 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8045 DEFSYM (Qdefault_char
, "default-char");
8046 DEFSYM (Qundecided
, "undecided");
8047 DEFSYM (Qno_conversion
, "no-conversion");
8048 DEFSYM (Qraw_text
, "raw-text");
8050 DEFSYM (Qiso_2022
, "iso-2022");
8052 DEFSYM (Qutf_8
, "utf-8");
8054 DEFSYM (Qutf_16
, "utf-16");
8055 DEFSYM (Qutf_16_be
, "utf-16-be");
8056 DEFSYM (Qutf_16_be_nosig
, "utf-16-be-nosig");
8057 DEFSYM (Qutf_16_le
, "utf-16-l3");
8058 DEFSYM (Qutf_16_le_nosig
, "utf-16-le-nosig");
8059 DEFSYM (Qsignature
, "signature");
8060 DEFSYM (Qendian
, "endian");
8061 DEFSYM (Qbig
, "big");
8062 DEFSYM (Qlittle
, "little");
8064 DEFSYM (Qshift_jis
, "shift-jis");
8065 DEFSYM (Qbig5
, "big5");
8067 DEFSYM (Qcoding_system_p
, "coding-system-p");
8069 DEFSYM (Qcoding_system_error
, "coding-system-error");
8070 Fput (Qcoding_system_error
, Qerror_conditions
,
8071 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8072 Fput (Qcoding_system_error
, Qerror_message
,
8073 build_string ("Invalid coding system"));
8075 /* Intern this now in case it isn't already done.
8076 Setting this variable twice is harmless.
8077 But don't staticpro it here--that is done in alloc.c. */
8078 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8080 DEFSYM (Qtranslation_table
, "translation-table");
8081 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8082 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8083 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8084 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8086 DEFSYM (Qvalid_codes
, "valid-codes");
8088 DEFSYM (Qemacs_mule
, "emacs-mule");
8090 Vcoding_category_table
8091 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8092 staticpro (&Vcoding_category_table
);
8093 /* Followings are target of code detection. */
8094 ASET (Vcoding_category_table
, coding_category_iso_7
,
8095 intern ("coding-category-iso-7"));
8096 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8097 intern ("coding-category-iso-7-tight"));
8098 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8099 intern ("coding-category-iso-8-1"));
8100 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8101 intern ("coding-category-iso-8-2"));
8102 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8103 intern ("coding-category-iso-7-else"));
8104 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8105 intern ("coding-category-iso-8-else"));
8106 ASET (Vcoding_category_table
, coding_category_utf_8
,
8107 intern ("coding-category-utf-8"));
8108 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8109 intern ("coding-category-utf-16-be"));
8110 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8111 intern ("coding-category-utf-16-le"));
8112 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8113 intern ("coding-category-utf-16-be-nosig"));
8114 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8115 intern ("coding-category-utf-16-le-nosig"));
8116 ASET (Vcoding_category_table
, coding_category_charset
,
8117 intern ("coding-category-charset"));
8118 ASET (Vcoding_category_table
, coding_category_sjis
,
8119 intern ("coding-category-sjis"));
8120 ASET (Vcoding_category_table
, coding_category_big5
,
8121 intern ("coding-category-big5"));
8122 ASET (Vcoding_category_table
, coding_category_ccl
,
8123 intern ("coding-category-ccl"));
8124 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8125 intern ("coding-category-emacs-mule"));
8126 /* Followings are NOT target of code detection. */
8127 ASET (Vcoding_category_table
, coding_category_raw_text
,
8128 intern ("coding-category-raw-text"));
8129 ASET (Vcoding_category_table
, coding_category_undecided
,
8130 intern ("coding-category-undecided"));
8132 defsubr (&Scoding_system_p
);
8133 defsubr (&Sread_coding_system
);
8134 defsubr (&Sread_non_nil_coding_system
);
8135 defsubr (&Scheck_coding_system
);
8136 defsubr (&Sdetect_coding_region
);
8137 defsubr (&Sdetect_coding_string
);
8138 defsubr (&Sfind_coding_systems_region_internal
);
8139 defsubr (&Scheck_coding_systems_region
);
8140 defsubr (&Sdecode_coding_region
);
8141 defsubr (&Sencode_coding_region
);
8142 defsubr (&Sdecode_coding_string
);
8143 defsubr (&Sencode_coding_string
);
8144 defsubr (&Sdecode_sjis_char
);
8145 defsubr (&Sencode_sjis_char
);
8146 defsubr (&Sdecode_big5_char
);
8147 defsubr (&Sencode_big5_char
);
8148 defsubr (&Sset_terminal_coding_system_internal
);
8149 defsubr (&Sset_safe_terminal_coding_system_internal
);
8150 defsubr (&Sterminal_coding_system
);
8151 defsubr (&Sset_keyboard_coding_system_internal
);
8152 defsubr (&Skeyboard_coding_system
);
8153 defsubr (&Sfind_operation_coding_system
);
8154 defsubr (&Sset_coding_system_priority
);
8155 defsubr (&Sdefine_coding_system_internal
);
8156 defsubr (&Sdefine_coding_system_alias
);
8157 defsubr (&Scoding_system_base
);
8158 defsubr (&Scoding_system_plist
);
8159 defsubr (&Scoding_system_aliases
);
8160 defsubr (&Scoding_system_eol_type
);
8161 defsubr (&Scoding_system_priority_list
);
8163 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8164 doc
: /* List of coding systems.
8166 Do not alter the value of this variable manually. This variable should be
8167 updated by the functions `define-coding-system' and
8168 `define-coding-system-alias'. */);
8169 Vcoding_system_list
= Qnil
;
8171 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8172 doc
: /* Alist of coding system names.
8173 Each element is one element list of coding system name.
8174 This variable is given to `completing-read' as TABLE argument.
8176 Do not alter the value of this variable manually. This variable should be
8177 updated by the functions `make-coding-system' and
8178 `define-coding-system-alias'. */);
8179 Vcoding_system_alist
= Qnil
;
8181 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8182 doc
: /* List of coding-categories (symbols) ordered by priority.
8184 On detecting a coding system, Emacs tries code detection algorithms
8185 associated with each coding-category one by one in this order. When
8186 one algorithm agrees with a byte sequence of source text, the coding
8187 system bound to the corresponding coding-category is selected. */);
8191 Vcoding_category_list
= Qnil
;
8192 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8193 Vcoding_category_list
8194 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8195 Vcoding_category_list
);
8198 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8199 doc
: /* Specify the coding system for read operations.
8200 It is useful to bind this variable with `let', but do not set it globally.
8201 If the value is a coding system, it is used for decoding on read operation.
8202 If not, an appropriate element is used from one of the coding system alists:
8203 There are three such tables, `file-coding-system-alist',
8204 `process-coding-system-alist', and `network-coding-system-alist'. */);
8205 Vcoding_system_for_read
= Qnil
;
8207 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8208 doc
: /* Specify the coding system for write operations.
8209 Programs bind this variable with `let', but you should not set it globally.
8210 If the value is a coding system, it is used for encoding of output,
8211 when writing it to a file and when sending it to a file or subprocess.
8213 If this does not specify a coding system, an appropriate element
8214 is used from one of the coding system alists:
8215 There are three such tables, `file-coding-system-alist',
8216 `process-coding-system-alist', and `network-coding-system-alist'.
8217 For output to files, if the above procedure does not specify a coding system,
8218 the value of `buffer-file-coding-system' is used. */);
8219 Vcoding_system_for_write
= Qnil
;
8221 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8223 Coding system used in the latest file or process I/O. */);
8224 Vlast_coding_system_used
= Qnil
;
8226 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8228 *Non-nil means always inhibit code conversion of end-of-line format.
8229 See info node `Coding Systems' and info node `Text and Binary' concerning
8230 such conversion. */);
8231 inhibit_eol_conversion
= 0;
8233 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8235 Non-nil means process buffer inherits coding system of process output.
8236 Bind it to t if the process output is to be treated as if it were a file
8237 read from some filesystem. */);
8238 inherit_process_coding_system
= 0;
8240 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8242 Alist to decide a coding system to use for a file I/O operation.
8243 The format is ((PATTERN . VAL) ...),
8244 where PATTERN is a regular expression matching a file name,
8245 VAL is a coding system, a cons of coding systems, or a function symbol.
8246 If VAL is a coding system, it is used for both decoding and encoding
8248 If VAL is a cons of coding systems, the car part is used for decoding,
8249 and the cdr part is used for encoding.
8250 If VAL is a function symbol, the function must return a coding system
8251 or a cons of coding systems which are used as above. The function gets
8252 the arguments with which `find-operation-coding-systems' was called.
8254 See also the function `find-operation-coding-system'
8255 and the variable `auto-coding-alist'. */);
8256 Vfile_coding_system_alist
= Qnil
;
8258 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8260 Alist to decide a coding system to use for a process I/O operation.
8261 The format is ((PATTERN . VAL) ...),
8262 where PATTERN is a regular expression matching a program name,
8263 VAL is a coding system, a cons of coding systems, or a function symbol.
8264 If VAL is a coding system, it is used for both decoding what received
8265 from the program and encoding what sent to the program.
8266 If VAL is a cons of coding systems, the car part is used for decoding,
8267 and the cdr part is used for encoding.
8268 If VAL is a function symbol, the function must return a coding system
8269 or a cons of coding systems which are used as above.
8271 See also the function `find-operation-coding-system'. */);
8272 Vprocess_coding_system_alist
= Qnil
;
8274 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8276 Alist to decide a coding system to use for a network I/O operation.
8277 The format is ((PATTERN . VAL) ...),
8278 where PATTERN is a regular expression matching a network service name
8279 or is a port number to connect to,
8280 VAL is a coding system, a cons of coding systems, or a function symbol.
8281 If VAL is a coding system, it is used for both decoding what received
8282 from the network stream and encoding what sent to the network stream.
8283 If VAL is a cons of coding systems, the car part is used for decoding,
8284 and the cdr part is used for encoding.
8285 If VAL is a function symbol, the function must return a coding system
8286 or a cons of coding systems which are used as above.
8288 See also the function `find-operation-coding-system'. */);
8289 Vnetwork_coding_system_alist
= Qnil
;
8291 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8292 doc
: /* Coding system to use with system messages.
8293 Also used for decoding keyboard input on X Window system. */);
8294 Vlocale_coding_system
= Qnil
;
8296 /* The eol mnemonics are reset in startup.el system-dependently. */
8297 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8299 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8300 eol_mnemonic_unix
= build_string (":");
8302 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8304 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8305 eol_mnemonic_dos
= build_string ("\\");
8307 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8309 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8310 eol_mnemonic_mac
= build_string ("/");
8312 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8314 *String displayed in mode line when end-of-line format is not yet determined. */);
8315 eol_mnemonic_undecided
= build_string (":");
8317 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8319 *Non-nil enables character translation while encoding and decoding. */);
8320 Venable_character_translation
= Qt
;
8322 DEFVAR_LISP ("standard-translation-table-for-decode",
8323 &Vstandard_translation_table_for_decode
,
8324 doc
: /* Table for translating characters while decoding. */);
8325 Vstandard_translation_table_for_decode
= Qnil
;
8327 DEFVAR_LISP ("standard-translation-table-for-encode",
8328 &Vstandard_translation_table_for_encode
,
8329 doc
: /* Table for translating characters while encoding. */);
8330 Vstandard_translation_table_for_encode
= Qnil
;
8332 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8333 doc
: /* Alist of charsets vs revision numbers.
8334 While encoding, if a charset (car part of an element) is found,
8335 designate it with the escape sequence identifying revision (cdr part
8336 of the element). */);
8337 Vcharset_revision_table
= Qnil
;
8339 DEFVAR_LISP ("default-process-coding-system",
8340 &Vdefault_process_coding_system
,
8341 doc
: /* Cons of coding systems used for process I/O by default.
8342 The car part is used for decoding a process output,
8343 the cdr part is used for encoding a text to be sent to a process. */);
8344 Vdefault_process_coding_system
= Qnil
;
8346 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8348 Table of extra Latin codes in the range 128..159 (inclusive).
8349 This is a vector of length 256.
8350 If Nth element is non-nil, the existence of code N in a file
8351 \(or output of subprocess) doesn't prevent it to be detected as
8352 a coding system of ISO 2022 variant which has a flag
8353 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8354 or reading output of a subprocess.
8355 Only 128th through 159th elements has a meaning. */);
8356 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8358 DEFVAR_LISP ("select-safe-coding-system-function",
8359 &Vselect_safe_coding_system_function
,
8361 Function to call to select safe coding system for encoding a text.
8363 If set, this function is called to force a user to select a proper
8364 coding system which can encode the text in the case that a default
8365 coding system used in each operation can't encode the text.
8367 The default value is `select-safe-coding-system' (which see). */);
8368 Vselect_safe_coding_system_function
= Qnil
;
8370 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8371 &inhibit_iso_escape_detection
,
8373 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8375 By default, on reading a file, Emacs tries to detect how the text is
8376 encoded. This code detection is sensitive to escape sequences. If
8377 the sequence is valid as ISO2022, the code is determined as one of
8378 the ISO2022 encodings, and the file is decoded by the corresponding
8379 coding system (e.g. `iso-2022-7bit').
8381 However, there may be a case that you want to read escape sequences in
8382 a file as is. In such a case, you can set this variable to non-nil.
8383 Then, as the code detection ignores any escape sequences, no file is
8384 detected as encoded in some ISO2022 encoding. The result is that all
8385 escape sequences become visible in a buffer.
8387 The default value is nil, and it is strongly recommended not to change
8388 it. That is because many Emacs Lisp source files that contain
8389 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8390 in Emacs's distribution, and they won't be decoded correctly on
8391 reading if you suppress escape sequence detection.
8393 The other way to read escape sequences in a file without decoding is
8394 to explicitly specify some coding system that doesn't use ISO2022's
8395 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8396 inhibit_iso_escape_detection
= 0;
8399 Lisp_Object args
[coding_arg_max
];
8400 Lisp_Object plist
[14];
8403 for (i
= 0; i
< coding_arg_max
; i
++)
8406 plist
[0] = intern (":name");
8407 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8408 plist
[2] = intern (":mnemonic");
8409 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8410 plist
[4] = intern (":coding-type");
8411 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8412 plist
[6] = intern (":ascii-compatible-p");
8413 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8414 plist
[8] = intern (":default-char");
8415 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8416 plist
[10] = intern (":docstring");
8417 plist
[11] = build_string ("Do no conversion.\n\
8419 When you visit a file with this coding, the file is read into a\n\
8420 unibyte buffer as is, thus each byte of a file is treated as a\n\
8422 plist
[12] = intern (":eol-type");
8423 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8424 args
[coding_arg_plist
] = Flist (14, plist
);
8425 Fdefine_coding_system_internal (coding_arg_max
, args
);
8428 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8429 setup_coding_system (Qno_conversion
, &terminal_coding
);
8430 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8434 emacs_strerror (error_number
)
8439 synchronize_system_messages_locale ();
8440 str
= strerror (error_number
);
8442 if (! NILP (Vlocale_coding_system
))
8444 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8445 Vlocale_coding_system
,
8447 str
= (char *) XSTRING (dec
)->data
;