src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up the symbol in Vcoding_system_hash_table.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on the operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we set up a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted,
 171          jump to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
           /* Record the category only when C is positive evidence for
              XXX; a byte that merely conforms leaves FOUND untouched.  */
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted without meeting a non-conforming byte.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size;
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source left, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
           /* Stop only once CHARBUF is full; while CHARBUF is below
              CHARBUF_END there is still room and decoding continues.  */
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.
        NOTE(review): this template stores the same value in both
        members; a decoder for multibyte source presumably has to use
        the CONSUMED_CHARS count that ONE_MORE_BYTE maintains.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that the source area and destination area
 258   overlap, which means that we can produce encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
       /* The characters to encode are CODING->charbuf[0] up to (not
          including) index CODING->charbuf_used.  CHARBUF is a plain
          `int *', so the end is computed from it by pointer arithmetic.  */
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Stop early enough that one more loop iteration cannot write
          past DST_END.  */
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292
 293 #include "lisp.h"
 294 #include "buffer.h"
 295 #include "character.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
     /* NOTE(review): the Q... and QC... objects below are presumably the
        Lisp symbols and keywords they are named after; their
        initialization is not visible in this part of the file.  */
 306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 307 Lisp_Object Qunix, Qdos;
 308 extern Lisp_Object Qmac;        /* frame.c */
 309 Lisp_Object Qbuffer_file_coding_system;
 310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 311 Lisp_Object Qdefault_char;
 312 Lisp_Object Qno_conversion, Qundecided;
 313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 314 Lisp_Object Qbig, Qlittle;
 315 Lisp_Object Qcoding_system_history;
 316 Lisp_Object Qvalid_codes;
     /* NOTE(review): `QCdefalut_char' looks like a misspelling of
        `QCdefault_char'.  It is a non-static name, so renaming it means
        updating every user at the same time.  */
 317 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 320 Lisp_Object QCascii_compatible_p;
 321
 322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 323 Lisp_Object Qcall_process, Qcall_process_region;
 324 Lisp_Object Qstart_process, Qopen_network_stream;
 325 Lisp_Object Qtarget_idx;
 326
 327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 328 Lisp_Object Qinterrupted, Qinsufficient_memory;
 329
 330 extern Lisp_Object Qcompletion_ignore_case;
 331
 332 /* If a symbol has this property, evaluate the value to define the
 333    symbol as a coding system.  */
 334 static Lisp_Object Qcoding_system_define_form;
 335
 336 int coding_system_require_warning;
 337
 338 Lisp_Object Vselect_safe_coding_system_function;
 339
 340 /* Mnemonic string for each format of end-of-line.  */
 341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 342 /* Mnemonic string to indicate format of end-of-line is not yet
 343    decided.  */
 344 Lisp_Object eol_mnemonic_undecided;
 345
 346 /* Format of end-of-line decided by system.  This is Qunix on
 347    Unix and Mac, Qdos on DOS/Windows.
 348    This has an effect only for external encoding (i.e. for output to
 349    file and process), not for in-buffer or Lisp string encoding.  */
 350 static Lisp_Object system_eol_type;
 351
 352 #ifdef emacs
 353
 354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 355
 356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 357
 358 /* Coding system emacs-mule and raw-text are for converting only
 359    end-of-line format.  */
 360 Lisp_Object Qemacs_mule, Qraw_text;
 361 Lisp_Object Qutf_8_emacs;
 362
 363 /* Coding-systems are handed between Emacs Lisp programs and C internal
 364    routines by the following three variables.  */
 365 /* Coding-system for reading files and receiving data from process.  */
 366 Lisp_Object Vcoding_system_for_read;
 367 /* Coding-system for writing files and sending data to process.  */
 368 Lisp_Object Vcoding_system_for_write;
 369 /* Coding-system actually used in the latest I/O.  */
 370 Lisp_Object Vlast_coding_system_used;
 371 /* Set to non-nil when an error is detected during code conversion.  */
 372 Lisp_Object Vlast_code_conversion_error;
 373 /* A vector of length 256 which contains information about special
 374    Latin codes (especially for dealing with Microsoft codes).  */
 375 Lisp_Object Vlatin_extra_code_table;
 376
 377 /* Flag to inhibit code conversion of end-of-line format.  */
 378 int inhibit_eol_conversion;
 379
 380 /* Flag to inhibit ISO2022 escape sequence detection.  */
 381 int inhibit_iso_escape_detection;
 382
 383 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 384 int inherit_process_coding_system;
 385
 386 /* Coding system to be used to encode text for terminal display when
 387    terminal coding system is nil.  */
 388 struct coding_system safe_terminal_coding;
 389
 390 Lisp_Object Vfile_coding_system_alist;
 391 Lisp_Object Vprocess_coding_system_alist;
 392 Lisp_Object Vnetwork_coding_system_alist;
 393
 394 Lisp_Object Vlocale_coding_system;
 395
 396 #endif /* emacs */
 397
 398 /* Flag to tell if we look up translation table on character code
 399    conversion.  */
 400 Lisp_Object Venable_character_translation;
 401 /* Standard translation table to look up on decoding (reading).  */
 402 Lisp_Object Vstandard_translation_table_for_decode;
 403 /* Standard translation table to look up on encoding (writing).  */
 404 Lisp_Object Vstandard_translation_table_for_encode;
 405
 406 Lisp_Object Qtranslation_table;
 407 Lisp_Object Qtranslation_table_id;
 408 Lisp_Object Qtranslation_table_for_decode;
 409 Lisp_Object Qtranslation_table_for_encode;
 410
 411 /* Alist of charsets vs revision number.  */
 412 static Lisp_Object Vcharset_revision_table;
 413
 414 /* Default coding systems used for process I/O.  */
 415 Lisp_Object Vdefault_process_coding_system;
 416
 417 /* Char table for translating Quail and self-inserting input.  */
 418 Lisp_Object Vtranslation_table_for_input;
 419
 420 /* Two special coding systems.  */
 421 Lisp_Object Vsjis_coding_system;
 422 Lisp_Object Vbig5_coding_system;
 423
 424 /* ISO2022 section */
 425
     /* Fetch element REG of the array stored in the
        `coding_attr_iso_initial' slot of CODING's attributes
        (CODING_ID_ATTRS of CODING->id), and convert it with XINT.  */
 426 #define CODING_ISO_INITIAL(coding, reg)                 \
 427   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 428                      coding_attr_iso_initial),          \
 429                reg)))
 430
 431
     /* Return the entry of CODING->safe_charsets for CHARSET_ID, or -1
        when CHARSET_ID is greater than CODING->max_charset_id.
        CHARSET_ID is parenthesized in the comparison so that an
        argument containing a lower-precedence operator still compares
        as a whole.  It is evaluated twice, so it must not have side
        effects.  */
 432 #define CODING_ISO_REQUEST(coding, charset_id)  \
 433   (((charset_id) <= (coding)->max_charset_id    \
 434     ? (coding)->safe_charsets[charset_id]       \
 435     : -1))
 436
 437
     /* Accessors for the members of CODING->spec.iso_2022.  REG indexes
        `current_designation' and PLANE indexes `current_invocation'.  */
 438 #define CODING_ISO_FLAGS(coding)        \
 439   ((coding)->spec.iso_2022.flags)
 440 #define CODING_ISO_DESIGNATION(coding, reg)     \
 441   ((coding)->spec.iso_2022.current_designation[reg])
 442 #define CODING_ISO_INVOCATION(coding, plane)    \
 443   ((coding)->spec.iso_2022.current_invocation[plane])
 444 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 445   ((coding)->spec.iso_2022.single_shifting)
 446 #define CODING_ISO_BOL(coding)  \
 447   ((coding)->spec.iso_2022.bol)
     /* Look up CODING_ISO_INVOCATION for PLANE and use the result as the
        register index for CODING_ISO_DESIGNATION.  */
 448 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 449   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 450
 451 /* Control characters of ISO2022.  */
 452                         /* code */      /* function */
 453 #define ISO_CODE_LF     0x0A            /* line-feed */
 454 #define ISO_CODE_CR     0x0D            /* carriage-return */
 455 #define ISO_CODE_SO     0x0E            /* shift-out */
 456 #define ISO_CODE_SI     0x0F            /* shift-in */
 457 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 458 #define ISO_CODE_ESC    0x1B            /* escape */
 459 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 460 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 461 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 462
 463 /* Every (1-byte) code of ISO2022 is classified into one of the
 464    following classes.  */
 465 enum iso_code_class_type
 466   {
 467     ISO_control_0,              /* Control codes in the range
 468                                    0x00..0x1F and 0x7F, except for the
 469                                    following 4 codes.
                                       NOTE(review): 0x7F is also named
                                       by ISO_0x20_or_0x7F below --
                                       confirm which class it gets.  */
 470     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 471     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 472     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 473     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 474     ISO_control_1,              /* Control codes in the range
 475                                    0x80..0x9F, except for the
 476                                    following 3 codes.  */
 477     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 478     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 479     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 480     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 481     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 482     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 483     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 484   };
 485
 486 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 487     `iso-flags' attribute of an iso2022 coding system.  */
 488
 489 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 490    instead of the correct short-form sequence (e.g. ESC $ A).  */
 491 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 492
 493 /* If set, reset graphic planes and registers at end-of-line to the
 494    initial state.  */
 495 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 496
 497 /* If set, reset graphic planes and registers before any control
 498    characters to the initial state.  */
 499 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 500
 501 /* If set, encode by 7-bit environment.  */
 502 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 503
 504 /* If set, use locking-shift function.  */
 505 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 506
 507 /* If set, use single-shift function.  Overwrite
 508    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 509 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 510
 511 /* If set, use designation escape sequence.  */
 512 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 513
 514 /* If set, produce revision number sequence.  */
 515 #define CODING_ISO_FLAG_REVISION        0x0080
 516
 517 /* If set, produce ISO6429's direction specifying sequence.  */
 518 #define CODING_ISO_FLAG_DIRECTION       0x0100
 519
 520 /* If set, assume designation states are reset at beginning of line on
 521    output.  */
 522 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 523
 524 /* If set, designation sequence should be placed at beginning of line
 525    on output.  */
 526 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 527
 528 /* If set, do not encode unsafe characters on output.  */
 529 #define CODING_ISO_FLAG_SAFE            0x0800
 530
 531 /* If set, extra latin codes (128..159) are accepted as a valid code
 532    on input.  */
 533 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 534
     /* NOTE(review): the five flags below have no description here;
        document each one from its uses.  Their values jump from 0x10000
        to 0x100000 -- confirm that the gap is intentional.  */
 535 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 536
 537 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 538
 539 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 540
 541 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 542
 543 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 544
 545 /* A character to be produced on output if encoding of the original
 546    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 547 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 548
 549 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 550 #define CODING_UTF_8_BOM(coding)        \
 551   ((coding)->spec.utf_8_bom)
 552
 553 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 554 #define CODING_UTF_16_BOM(coding)       \
 555   ((coding)->spec.utf_16.bom)
 556
 557 #define CODING_UTF_16_ENDIAN(coding)    \
 558   ((coding)->spec.utf_16.endian)
 559
 560 #define CODING_UTF_16_SURROGATE(coding) \
 561   ((coding)->spec.utf_16.surrogate)
 562
 563
 564 /* CCL section: each macro reads one slot of CODING's attributes
        (CODING_ID_ATTRS of CODING->id).  CODING_CCL_VALIDS additionally
        applies SDATA to the slot's value.  */
 565 #define CODING_CCL_DECODER(coding)      \
 566   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 567 #define CODING_CCL_ENCODER(coding)      \
 568   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 569 #define CODING_CCL_VALIDS(coding)                                          \
 570   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 571
 572 /* Index for each coding category in `coding_categories'.  The
        CATEGORY_MASK_XXX flags are defined as 1 << one of these values,
        so each value is also a bit position.  `coding_category_max' is
        the number of categories and sizes the per-category arrays.  */
 573
 574 enum coding_category
 575   {
 576     coding_category_iso_7,
 577     coding_category_iso_7_tight,
 578     coding_category_iso_8_1,
 579     coding_category_iso_8_2,
 580     coding_category_iso_7_else,
 581     coding_category_iso_8_else,
 582     coding_category_utf_8_auto,
 583     coding_category_utf_8_nosig,
 584     coding_category_utf_8_sig,
 585     coding_category_utf_16_auto,
 586     coding_category_utf_16_be,
 587     coding_category_utf_16_le,
 588     coding_category_utf_16_be_nosig,
 589     coding_category_utf_16_le_nosig,
 590     coding_category_charset,
 591     coding_category_sjis,
 592     coding_category_big5,
 593     coding_category_ccl,
 594     coding_category_emacs_mule,
 595     /* All above are targets of code detection.  */
 596     coding_category_raw_text,
 597     coding_category_undecided,
 598     coding_category_max
 599   };
 600
 601 /* Definitions of flag bits used in detect_coding_XXXX.  Each flag is
        1 shifted left by the matching `enum coding_category' value.  */
 602 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 603 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 604 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 605 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 606 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 607 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 608 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 609 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 610 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 611 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 612 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 613 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 614 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 615 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 616 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 617 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 618 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 619 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 620 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 621 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 622
 623 /* This value is returned if detect_coding_mask () finds nothing other
 624    than ASCII characters.  It is the union of every mask defined above
        except CATEGORY_MASK_RAW_TEXT.  */
 625 #define CATEGORY_MASK_ANY               \
 626   (CATEGORY_MASK_ISO_7                  \
 627    | CATEGORY_MASK_ISO_7_TIGHT          \
 628    | CATEGORY_MASK_ISO_8_1              \
 629    | CATEGORY_MASK_ISO_8_2              \
 630    | CATEGORY_MASK_ISO_7_ELSE           \
 631    | CATEGORY_MASK_ISO_8_ELSE           \
 632    | CATEGORY_MASK_UTF_8_AUTO           \
 633    | CATEGORY_MASK_UTF_8_NOSIG          \
 634    | CATEGORY_MASK_UTF_8_SIG            \
 635    | CATEGORY_MASK_UTF_16_AUTO          \
 636    | CATEGORY_MASK_UTF_16_BE            \
 637    | CATEGORY_MASK_UTF_16_LE            \
 638    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 639    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 640    | CATEGORY_MASK_CHARSET              \
 641    | CATEGORY_MASK_SJIS                 \
 642    | CATEGORY_MASK_BIG5                 \
 643    | CATEGORY_MASK_CCL                  \
 644    | CATEGORY_MASK_EMACS_MULE)
 645
 646
     /* Both 7-bit ISO categories (iso-7 and iso-7-tight).  */
 647 #define CATEGORY_MASK_ISO_7BIT \
 648   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 649
     /* Both 8-bit ISO categories (iso-8-1 and iso-8-2).  */
 650 #define CATEGORY_MASK_ISO_8BIT \
 651   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 652
     /* Both `else' ISO categories (iso-7-else and iso-8-else).  */
 653 #define CATEGORY_MASK_ISO_ELSE \
 654   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 655
     /* Every ISO category except iso-8-1 and iso-8-2.
        NOTE(review): presumably the categories that can use escape
        sequences -- confirm against the users of this mask.  */
 656 #define CATEGORY_MASK_ISO_ESCAPE        \
 657   (CATEGORY_MASK_ISO_7                  \
 658    | CATEGORY_MASK_ISO_7_TIGHT          \
 659    | CATEGORY_MASK_ISO_7_ELSE           \
 660    | CATEGORY_MASK_ISO_8_ELSE)
 661
     /* All six ISO categories.  */
 662 #define CATEGORY_MASK_ISO       \
 663   (  CATEGORY_MASK_ISO_7BIT     \
 664      | CATEGORY_MASK_ISO_8BIT   \
 665      | CATEGORY_MASK_ISO_ELSE)
 666
     /* All five UTF-16 categories.  */
 667 #define CATEGORY_MASK_UTF_16            \
 668   (CATEGORY_MASK_UTF_16_AUTO            \
 669    | CATEGORY_MASK_UTF_16_BE            \
 670    | CATEGORY_MASK_UTF_16_LE            \
 671    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 672    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 673
     /* All three UTF-8 categories.  */
 674 #define CATEGORY_MASK_UTF_8     \
 675   (CATEGORY_MASK_UTF_8_AUTO     \
 676    | CATEGORY_MASK_UTF_8_NOSIG  \
 677    | CATEGORY_MASK_UTF_8_SIG)
 678
 679 /* List of symbols `coding-category-xxx' ordered by priority.  This
 680    variable is exposed to Emacs Lisp.  */
 681 static Lisp_Object Vcoding_category_list;
 682
 683 /* Table of coding categories (Lisp symbols).  This variable is for
 684    internal use only.  */
 685 static Lisp_Object Vcoding_category_table;
 686
 687 /* Table of coding-categories ordered by priority.  */
 688 static enum coding_category coding_priorities[coding_category_max];
 689
 690 /* Nth element is a coding context for the coding system bound to the
 691    Nth coding category.  */
 692 static struct coding_system coding_categories[coding_category_max];
 693
 694 /*** Commonly used macros and functions ***/
 695
     /* Fallback definitions of `min' and `max', provided only when no
        macro of that name is already defined.  Each argument appears
        twice in the expansion, so do not pass expressions with side
        effects.  */
 696 #ifndef min
 697 #define min(a, b) ((a) < (b) ? (a) : (b))
 698 #endif
 699 #ifndef max
 700 #define max(a, b) ((a) > (b) ? (a) : (b))
 701 #endif
 702
     /* Set ATTRS to CODING_ID_ATTRS of CODING->id, then set CHARSET_LIST
        to CODING_ATTR_CHARSET_LIST of ATTRS.  Both ATTRS and
        CHARSET_LIST are assigned to, so they must be lvalues; ATTRS is
        evaluated twice.  */
 703 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 704   do {                                                  \
 705     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 706     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 707   } while (0)
 708
 709
 710 /* Safely get one byte from the source text pointed to by SRC, which
 711    ends at SRC_END, set C to that byte, and increment CONSUMED_CHARS.
 712    If SRC has already reached SRC_END, jump to `no_more_source'; when
 713    SRC_BASE < SRC at that point, CODING_RESULT_INSUFFICIENT_SRC is
 714    recorded first.  If multibytep is nonzero and the byte has its high
 715    bit set, either combine a 0xC0/0xC1 lead byte with the byte after
 716    it into one value, or (for any other lead byte) read the whole
        multibyte character with string_char, set C to the negative value
        of the character code, and record CODING_RESULT_INVALID_SRC.

        NOTE(review): the second byte of the 0xC0/0xC1 form is fetched
        without comparing SRC against SRC_END -- presumably the source is
        known to be well-formed multibyte text there; confirm.

        The caller should declare and set these variables appropriately
        in advance:
             src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label `no_more_source'.  */
 717
 718 #define ONE_MORE_BYTE(c)                                \
 719   do {                                                  \
 720     if (src == src_end)                                 \
 721       {                                                 \
 722         if (src_base < src)                             \
 723           record_conversion_result                      \
 724             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 725         goto no_more_source;                            \
 726       }                                                 \
 727     c = *src++;                                         \
 728     if (multibytep && (c & 0x80))                       \
 729       {                                                 \
 730         if ((c & 0xFE) == 0xC0)                         \
 731           c = ((c & 1) << 6) | *src++;                  \
 732         else                                            \
 733           {                                             \
 734             src--;                                      \
 735             c = - string_char (src, &src, NULL);        \
 736             record_conversion_result                    \
 737               (coding, CODING_RESULT_INVALID_SRC);      \
 738           }                                             \
 739       }                                                 \
 740     consumed_chars++;                                   \
 741   } while (0)
 742
 743
     /* Like ONE_MORE_BYTE, but do not compare SRC against SRC_END first,
        so it never jumps to `no_more_source'; the caller is responsible
        for making sure a byte is available.  Uses the caller's
        variables src, multibytep, consumed_chars and coding.  */
 744 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 745   do {                                                  \
 746     c = *src++;                                         \
 747     if (multibytep && (c & 0x80))                       \
 748       {                                                 \
 749         if ((c & 0xFE) == 0xC0)                         \
 750           c = ((c & 1) << 6) | *src++;                  \
 751         else                                            \
 752           {                                             \
 753             src--;                                      \
 754             c = - string_char (src, &src, NULL);        \
 755             record_conversion_result                    \
 756               (coding, CODING_RESULT_INVALID_SRC);      \
 757           }                                             \
 758       }                                                 \
 759     consumed_chars++;                                   \
 760   } while (0)
 761
 762
 763 /* Store a byte C in the place pointed to by DST, increment DST to
 764    the next free point, and increment PRODUCED_CHARS.  The caller
 765    should ensure that C is 0..127, and declare and set the variables
 766    `dst' and `produced_chars' appropriately in advance.
 767 */
 768
 769
 770 #define EMIT_ONE_ASCII_BYTE(c)  \
 771   do {                          \
 772     produced_chars++;           \
 773     *dst++ = (c);               \
 774   } while (0)
 775
 776
 777 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 778
 779 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 780   do {                                  \
 781     produced_chars += 2;                \
 782     *dst++ = (c1), *dst++ = (c2);       \
 783   } while (0)
 784
 785
 786 /* Store a byte C in the place pointed by DST and increment DST to the
 787    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 788    nonzero, store in an appropriate multibyte from.  The caller should
 789    declare and set the variables `dst' and `multibytep' appropriately
 790    in advance.  */
 791
 792 #define EMIT_ONE_BYTE(c)                \
 793   do {                                  \
 794     produced_chars++;                   \
 795     if (multibytep)                     \
 796       {                                 \
 797         int ch = (c);                   \
 798         if (ch >= 0x80)                 \
 799           ch = BYTE8_TO_CHAR (ch);      \
 800         CHAR_STRING_ADVANCE (ch, dst);  \
 801       }                                 \
 802     else                                \
 803       *dst++ = (c);                     \
 804   } while (0)
 805
 806
 807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 808
 809 #define EMIT_TWO_BYTES(c1, c2)          \
 810   do {                                  \
 811     produced_chars += 2;                \
 812     if (multibytep)                     \
 813       {                                 \
 814         int ch;                         \
 815                                         \
 816         ch = (c1);                      \
 817         if (ch >= 0x80)                 \
 818           ch = BYTE8_TO_CHAR (ch);      \
 819         CHAR_STRING_ADVANCE (ch, dst);  \
 820         ch = (c2);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824       }                                 \
 825     else                                \
 826       {                                 \
 827         *dst++ = (c1);                  \
 828         *dst++ = (c2);                  \
 829       }                                 \
 830   } while (0)
 831
 832
 833 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 834   do {                                  \
 835     EMIT_ONE_BYTE (c1);                 \
 836     EMIT_TWO_BYTES (c2, c3);            \
 837   } while (0)
 838
 839
/* Like EMIT_TWO_BYTES, but emit four bytes, C1, C2, C3 and C4, as two
   consecutive pairs.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 845
 846
 847 /* Prototypes for static functions.  */
 848 static void record_conversion_result P_ ((struct coding_system *coding,
 849                                           enum coding_result_code result));
 850 static int detect_coding_utf_8 P_ ((struct coding_system *,
 851                                     struct coding_detection_info *info));
 852 static void decode_coding_utf_8 P_ ((struct coding_system *));
 853 static int encode_coding_utf_8 P_ ((struct coding_system *));
 854
 855 static int detect_coding_utf_16 P_ ((struct coding_system *,
 856                                      struct coding_detection_info *info));
 857 static void decode_coding_utf_16 P_ ((struct coding_system *));
 858 static int encode_coding_utf_16 P_ ((struct coding_system *));
 859
 860 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 861                                        struct coding_detection_info *info));
 862 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 863 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 864
 865 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 866                                          struct coding_detection_info *info));
 867 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 868 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 869
 870 static int detect_coding_sjis P_ ((struct coding_system *,
 871                                    struct coding_detection_info *info));
 872 static void decode_coding_sjis P_ ((struct coding_system *));
 873 static int encode_coding_sjis P_ ((struct coding_system *));
 874
 875 static int detect_coding_big5 P_ ((struct coding_system *,
 876                                    struct coding_detection_info *info));
 877 static void decode_coding_big5 P_ ((struct coding_system *));
 878 static int encode_coding_big5 P_ ((struct coding_system *));
 879
 880 static int detect_coding_ccl P_ ((struct coding_system *,
 881                                   struct coding_detection_info *info));
 882 static void decode_coding_ccl P_ ((struct coding_system *));
 883 static int encode_coding_ccl P_ ((struct coding_system *));
 884
 885 static void decode_coding_raw_text P_ ((struct coding_system *));
 886 static int encode_coding_raw_text P_ ((struct coding_system *));
 887
 888 static void coding_set_source P_ ((struct coding_system *));
 889 static void coding_set_destination P_ ((struct coding_system *));
 890 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 891 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 892                                             EMACS_INT, EMACS_INT));
 893 static unsigned char *alloc_destination P_ ((struct coding_system *,
 894                                              EMACS_INT, unsigned char *));
 895 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 896 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 897                                                      int *, int *,
 898                                                      unsigned char *));
 899 static int detect_eol P_ ((const unsigned char *,
 900                            EMACS_INT, enum coding_category));
 901 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 902 static void decode_eol P_ ((struct coding_system *));
 903 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 904 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 905                                         int, int *, int *));
 906 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 907 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 908                                             EMACS_INT));
 909 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 910                                         EMACS_INT));
 911 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 912 static int decode_coding P_ ((struct coding_system *));
 913 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 914                                                       struct coding_system *,
 915                                                       int *, EMACS_INT *));
 916 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 917                                                   struct coding_system *,
 918                                                   int *, EMACS_INT *));
 919 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 920 static int encode_coding P_ ((struct coding_system *));
 921 static Lisp_Object make_conversion_work_buffer P_ ((int));
 922 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 923 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 924 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 925
 926 static void
 927 record_conversion_result (struct coding_system *coding,
 928                           enum coding_result_code result)
 929 {
 930   coding->result = result;
 931   switch (result)
 932     {
 933     case CODING_RESULT_INSUFFICIENT_SRC:
 934       Vlast_code_conversion_error = Qinsufficient_source;
 935       break;
 936     case CODING_RESULT_INCONSISTENT_EOL:
 937       Vlast_code_conversion_error = Qinconsistent_eol;
 938       break;
 939     case CODING_RESULT_INVALID_SRC:
 940       Vlast_code_conversion_error = Qinvalid_source;
 941       break;
 942     case CODING_RESULT_INTERRUPT:
 943       Vlast_code_conversion_error = Qinterrupted;
 944       break;
 945     case CODING_RESULT_INSUFFICIENT_MEM:
 946       Vlast_code_conversion_error = Qinsufficient_memory;
 947       break;
 948     default:
 949       Vlast_code_conversion_error = intern ("Unknown error");
 950     }
 951 }
 952
 953 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 954   do {                                                                       \
 955     charset_map_loaded = 0;                                                  \
 956     c = DECODE_CHAR (charset, code);                                         \
 957     if (charset_map_loaded)                                                  \
 958       {                                                                      \
 959         const unsigned char *orig = coding->source;                          \
 960         EMACS_INT offset;                                                    \
 961                                                                              \
 962         coding_set_source (coding);                                          \
 963         offset = coding->source - orig;                                      \
 964         src += offset;                                                       \
 965         src_base += offset;                                                  \
 966         src_end += offset;                                                   \
 967       }                                                                      \
 968   } while (0)
 969
 970
 971 /* If there are at least BYTES length of room at dst, allocate memory
 972    for coding->destination and update dst and dst_end.  We don't have
 973    to take care of coding->source which will be relocated.  It is
 974    handled by calling coding_set_source in encode_coding.  */
 975
 976 #define ASSURE_DESTINATION(bytes)                               \
 977   do {                                                          \
 978     if (dst + (bytes) >= dst_end)                               \
 979       {                                                         \
 980         int more_bytes = charbuf_end - charbuf + (bytes);       \
 981                                                                 \
 982         dst = alloc_destination (coding, more_bytes, dst);      \
 983         dst_end = coding->destination + coding->dst_bytes;      \
 984       }                                                         \
 985   } while (0)
 986
 987
 988 /* Store multibyte form of the character C in P, and advance P to the
 989    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 990    never calls MAYBE_UNIFY_CHAR.  */
 991
 992 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 993   do {                                          \
 994     if ((c) <= MAX_1_BYTE_CHAR)                 \
 995       *(p)++ = (c);                             \
 996     else if ((c) <= MAX_2_BYTE_CHAR)            \
 997       *(p)++ = (0xC0 | ((c) >> 6)),             \
 998         *(p)++ = (0x80 | ((c) & 0x3F));         \
 999     else if ((c) <= MAX_3_BYTE_CHAR)            \
1000       *(p)++ = (0xE0 | ((c) >> 12)),            \
1001         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1002         *(p)++ = (0x80 | ((c) & 0x3F));         \
1003     else if ((c) <= MAX_4_BYTE_CHAR)            \
1004       *(p)++ = (0xF0 | (c >> 18)),              \
1005         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1006         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1007         *(p)++ = (0x80 | (c & 0x3F));           \
1008     else if ((c) <= MAX_5_BYTE_CHAR)            \
1009       *(p)++ = 0xF8,                            \
1010         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
1011         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1012         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1013         *(p)++ = (0x80 | (c & 0x3F));           \
1014     else                                        \
1015       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1016   } while (0)
1017
1018
/* Yield the character code whose multibyte form starts at P, and
   advance P past that form.  This is like STRING_CHAR_ADVANCE, but it
   never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is chosen from the leading byte alone;
   no trailing byte is validated.  A 2-byte form whose leading byte is
   below 0xC2 gets 0x3FFF80 ORed into the result.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  ((((p)[0] & 0x80) == 0)                                       \
   ? *(p)++                                                     \
   : (((p)[0] & 0x20) == 0)                                     \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : (((p)[0] & 0x10) == 0)                                     \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : (((p)[0] & 0x08) == 0)                                     \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1047
1048
1049 static void
1050 coding_set_source (coding)
1051      struct coding_system *coding;
1052 {
1053   if (BUFFERP (coding->src_object))
1054     {
1055       struct buffer *buf = XBUFFER (coding->src_object);
1056
1057       if (coding->src_pos < 0)
1058         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1059       else
1060         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1061     }
1062   else if (STRINGP (coding->src_object))
1063     {
1064       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1065     }
1066   else
1067     /* Otherwise, the source is C string and is never relocated
1068        automatically.  Thus we don't have to update anything.  */
1069     ;
1070 }
1071
1072 static void
1073 coding_set_destination (coding)
1074      struct coding_system *coding;
1075 {
1076   if (BUFFERP (coding->dst_object))
1077     {
1078       if (coding->src_pos < 0)
1079         {
1080           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1081           coding->dst_bytes = (GAP_END_ADDR
1082                                - (coding->src_bytes - coding->consumed)
1083                                - coding->destination);
1084         }
1085       else
1086         {
1087           /* We are sure that coding->dst_pos_byte is before the gap
1088              of the buffer. */
1089           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1090                                  + coding->dst_pos_byte - BEG_BYTE);
1091           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1092                                - coding->destination);
1093         }
1094     }
1095   else
1096     /* Otherwise, the destination is C string and is never relocated
1097        automatically.  Thus we don't have to update anything.  */
1098     ;
1099 }
1100
1101
1102 static void
1103 coding_alloc_by_realloc (coding, bytes)
1104      struct coding_system *coding;
1105      EMACS_INT bytes;
1106 {
1107   coding->destination = (unsigned char *) xrealloc (coding->destination,
1108                                                     coding->dst_bytes + bytes);
1109   coding->dst_bytes += bytes;
1110 }
1111
1112 static void
1113 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1114      struct coding_system *coding;
1115      EMACS_INT gap_head_used, bytes;
1116 {
1117   if (EQ (coding->src_object, coding->dst_object))
1118     {
1119       /* The gap may contain the produced data at the head and not-yet
1120          consumed data at the tail.  To preserve those data, we at
1121          first make the gap size to zero, then increase the gap
1122          size.  */
1123       EMACS_INT add = GAP_SIZE;
1124
1125       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1126       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1127       make_gap (bytes);
1128       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1129       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1130     }
1131   else
1132     {
1133       Lisp_Object this_buffer;
1134
1135       this_buffer = Fcurrent_buffer ();
1136       set_buffer_internal (XBUFFER (coding->dst_object));
1137       make_gap (bytes);
1138       set_buffer_internal (XBUFFER (this_buffer));
1139     }
1140 }
1141
1142
1143 static unsigned char *
1144 alloc_destination (coding, nbytes, dst)
1145      struct coding_system *coding;
1146      EMACS_INT nbytes;
1147      unsigned char *dst;
1148 {
1149   EMACS_INT offset = dst - coding->destination;
1150
1151   if (BUFFERP (coding->dst_object))
1152     {
1153       struct buffer *buf = XBUFFER (coding->dst_object);
1154
1155       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1156     }
1157   else
1158     coding_alloc_by_realloc (coding, nbytes);
1159   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1160   coding_set_destination (coding);
1161   dst = coding->destination + offset;
1162   return dst;
1163 }
1164
1165 /** Macros for annotations.  */
1166
/* Upper bound on the number of charbuf elements taken by annotation
   data: the annotation for a composition (first group) plus the
   annotation for a charset (trailing 4).  */
#define MAX_ANNOTATION_LENGTH \
  ((4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1) + 4)
1170
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the three leading annotation elements (-LEN, MASK, NCHARS)
   at BUF, advance BUF past them, and set coding->annotated.  The
   caller must have a variable `coding' in scope.  The expansion has
   no trailing semicolon, so the macro behaves as a single statement
   and can be used in an unbraced `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1197
/* Store a composition annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA (length 4, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by METHOD.
   BUF and METHOD are parenthesized so that any lvalue expression can
   be passed as BUF, matching ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1203
1204
/* Store a charset annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF and ID are parenthesized so that any lvalue expression can be
   passed as BUF, matching ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1210
1211 \f
1212 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1213
1214
1215
1216 \f
1217 /*** 3. UTF-8 ***/
1218
1219 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1220    Check if a text is encoded in UTF-8.  If it is, return 1, else
1221    return 0.  */
1222
/* Classify one octet C of a UTF-8 sequence by its high bits:
   0xxxxxxx is a complete one-octet character, 10xxxxxx is a trailing
   octet, and 110xxxxx, 1110xxxx, 11110xxx, 111110xx lead sequences
   of 2, 3, 4 and 5 octets respectively.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets that encode it
   in UTF-8.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1234
/* Check whether the source of CODING looks like UTF-8, recording the
   outcome in DETECT_INFO.  Return 0 (and reject CATEGORY_MASK_UTF_8)
   if a malformed sequence is found, or if the last block ends in the
   middle of a sequence.  Otherwise return 1, after marking the
   with-signature and/or without-signature categories as found or
   rejected depending on whether the text starts with a BOM.

   The label no_more_source has no explicit `goto' in this function;
   it is expected to be reached from inside ONE_MORE_BYTE (defined
   outside this section) when the source is exhausted.  */

static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;
  /* Nonzero once at least one well-formed multi-octet sequence has
     been seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a sequence cut
         off by the end of the source can be told apart from a clean
         end.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very beginning of the source
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop is only left by `break', i.e. on a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* A sequence that is still incomplete at the end of the last block
     can never be completed.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1318
1319
1320 static void
1321 decode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   const unsigned char *src = coding->source + coding->consumed;
1325   const unsigned char *src_end = coding->source + coding->src_bytes;
1326   const unsigned char *src_base;
1327   int *charbuf = coding->charbuf + coding->charbuf_used;
1328   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1329   int consumed_chars = 0, consumed_chars_base;
1330   int multibytep = coding->src_multibyte;
1331   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1332   Lisp_Object attr, charset_list;
1333   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1334   int byte_after_cr = -1;
1335
1336   CODING_GET_INFO (coding, attr, charset_list);
1337
1338   if (bom != utf_without_bom)
1339     {
1340       int c1, c2, c3;
1341
1342       src_base = src;
1343       ONE_MORE_BYTE (c1);
1344       if (! UTF_8_3_OCTET_LEADING_P (c1))
1345         src = src_base;
1346       else
1347         {
1348           ONE_MORE_BYTE (c2);
1349           if (! UTF_8_EXTRA_OCTET_P (c2))
1350             src = src_base;
1351           else
1352             {
1353               ONE_MORE_BYTE (c3);
1354               if (! UTF_8_EXTRA_OCTET_P (c3))
1355                 src = src_base;
1356               else
1357                 {
1358                   if ((c1 != UTF_8_BOM_1)
1359                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1360                     src = src_base;
1361                   else
1362                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1363                 }
1364             }
1365         }
1366     }
1367   CODING_UTF_8_BOM (coding) = utf_without_bom;
1368
1369
1370
1371   while (1)
1372     {
1373       int c, c1, c2, c3, c4, c5;
1374
1375       src_base = src;
1376       consumed_chars_base = consumed_chars;
1377
1378       if (charbuf >= charbuf_end)
1379         break;
1380
1381       if (byte_after_cr >= 0)
1382         c1 = byte_after_cr, byte_after_cr = -1;
1383       else
1384         ONE_MORE_BYTE (c1);
1385       if (c1 < 0)
1386         {
1387           c = - c1;
1388         }
1389       else if (UTF_8_1_OCTET_P(c1))
1390         {
1391           if (eol_crlf && c1 == '\r')
1392             ONE_MORE_BYTE (byte_after_cr);
1393           c = c1;
1394         }
1395       else
1396         {
1397           ONE_MORE_BYTE (c2);
1398           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1399             goto invalid_code;
1400           if (UTF_8_2_OCTET_LEADING_P (c1))
1401             {
1402               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1403               /* Reject overlong sequences here and below.  Encoders
1404                  producing them are incorrect, they can be misleading,
1405                  and they mess up read/write invariance.  */
1406               if (c < 128)
1407                 goto invalid_code;
1408             }
1409           else
1410             {
1411               ONE_MORE_BYTE (c3);
1412               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1413                 goto invalid_code;
1414               if (UTF_8_3_OCTET_LEADING_P (c1))
1415                 {
1416                   c = (((c1 & 0xF) << 12)
1417                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1418                   if (c < 0x800
1419                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1420                     goto invalid_code;
1421                 }
1422               else
1423                 {
1424                   ONE_MORE_BYTE (c4);
1425                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1426                     goto invalid_code;
1427                   if (UTF_8_4_OCTET_LEADING_P (c1))
1428                     {
1429                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1430                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1431                     if (c < 0x10000)
1432                       goto invalid_code;
1433                     }
1434                   else
1435                     {
1436                       ONE_MORE_BYTE (c5);
1437                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1438                         goto invalid_code;
1439                       if (UTF_8_5_OCTET_LEADING_P (c1))
1440                         {
1441                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1442                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1443                                | (c5 & 0x3F));
1444                           if ((c > MAX_CHAR) || (c < 0x200000))
1445                             goto invalid_code;
1446                         }
1447                       else
1448                         goto invalid_code;
1449                     }
1450                 }
1451             }
1452         }
1453
1454       *charbuf++ = c;
1455       continue;
1456
1457     invalid_code:
1458       src = src_base;
1459       consumed_chars = consumed_chars_base;
1460       ONE_MORE_BYTE (c);
1461       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1462       coding->errors++;
1463     }
1464
1465  no_more_source:
1466   coding->consumed_char += consumed_chars_base;
1467   coding->consumed = src_base - coding->source;
1468   coding->charbuf_used = charbuf - coding->charbuf;
1469 }
1470
1471
1472 static int
1473 encode_coding_utf_8 (coding)
1474      struct coding_system *coding;
1475 {
1476   int multibytep = coding->dst_multibyte;
1477   int *charbuf = coding->charbuf;
1478   int *charbuf_end = charbuf + coding->charbuf_used;
1479   unsigned char *dst = coding->destination + coding->produced;
1480   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1481   int produced_chars = 0;
1482   int c;
1483
1484   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1485     {
1486       ASSURE_DESTINATION (3);
1487       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1488       CODING_UTF_8_BOM (coding) = utf_without_bom;
1489     }
1490
1491   if (multibytep)
1492     {
1493       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1494
1495       while (charbuf < charbuf_end)
1496         {
1497           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1498
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             {
1503               c = CHAR_TO_BYTE8 (c);
1504               EMIT_ONE_BYTE (c);
1505             }
1506           else
1507             {
1508               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1509               for (p = str; p < pend; p++)
1510                 EMIT_ONE_BYTE (*p);
1511             }
1512         }
1513     }
1514   else
1515     {
1516       int safe_room = MAX_MULTIBYTE_LENGTH;
1517
1518       while (charbuf < charbuf_end)
1519         {
1520           ASSURE_DESTINATION (safe_room);
1521           c = *charbuf++;
1522           if (CHAR_BYTE8_P (c))
1523             *dst++ = CHAR_TO_BYTE8 (c);
1524           else
1525             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1526           produced_chars++;
1527         }
1528     }
1529   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1530   coding->produced_char += produced_chars;
1531   coding->produced = dst - coding->destination;
1532   return 0;
1533 }
1534
1535
1536 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1537    Check if a text is encoded in one of UTF-16 based coding systems.
1538    If it is, return 1, else return 0.  */
1539
/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is
   evaluated more than once.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1550
1551
1552 static int
1553 detect_coding_utf_16 (coding, detect_info)
1554      struct coding_system *coding;
1555      struct coding_detection_info *detect_info;
1556 {
1557   const unsigned char *src = coding->source, *src_base = src;
1558   const unsigned char *src_end = coding->source + coding->src_bytes;
1559   int multibytep = coding->src_multibyte;
1560   int consumed_chars = 0;
1561   int c1, c2;
1562
1563   detect_info->checked |= CATEGORY_MASK_UTF_16;
1564   if (coding->mode & CODING_MODE_LAST_BLOCK
1565       && (coding->src_chars & 1))
1566     {
1567       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1568       return 0;
1569     }
1570
1571   ONE_MORE_BYTE (c1);
1572   ONE_MORE_BYTE (c2);
1573   if ((c1 == 0xFF) && (c2 == 0xFE))
1574     {
1575       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1576                              | CATEGORY_MASK_UTF_16_AUTO);
1577       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1579                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1580     }
1581   else if ((c1 == 0xFE) && (c2 == 0xFF))
1582     {
1583       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1584                              | CATEGORY_MASK_UTF_16_AUTO);
1585       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1586                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1587                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1588     }
1589   else
1590     {
1591       /* We check the dispersion of Eth and Oth bytes where E is even and
1592          O is odd.  If both are high, we assume binary data.*/
1593       unsigned char e[256], o[256];
1594       unsigned e_num = 1, o_num = 1;
1595
1596       memset (e, 0, 256);
1597       memset (o, 0, 256);
1598       e[c1] = 1;
1599       o[c2] = 1;
1600
1601       detect_info->rejected
1602         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1603
1604       while (1)
1605         {
1606           ONE_MORE_BYTE (c1);
1607           ONE_MORE_BYTE (c2);
1608           if (! e[c1])
1609             {
1610               e[c1] = 1;
1611               e_num++;
1612               if (e_num >= 128)
1613                 break;
1614             }
1615           if (! o[c2])
1616             {
1617               o[c1] = 1;
1618               o_num++;
1619               if (o_num >= 128)
1620                 break;
1621             }
1622         }
1623       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1624       return 0;
1625     }
1626
1627  no_more_source:
1628   return 1;
1629 }
1630
/* Decode the UTF-16 source bytes of CODING into characters.

   Reading starts at CODING->source + CODING->consumed, and decoded
   characters are appended to CODING->charbuf after its first
   CODING->charbuf_used elements.  The byte order, the byte order mark
   policy and a pending high surrogate are taken from CODING (via
   CODING_UTF_16_ENDIAN, CODING_UTF_16_BOM and
   CODING_UTF_16_SURROGATE) and written back there so that they
   survive between calls.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  The consumed counts are taken
   from the start of the last loop iteration that was entered
   (SRC_BASE / CONSUMED_CHARS_BASE).  */
static void
decode_coding_utf_16 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
  /* Nonzero while a high surrogate is waiting for its low half.  */
  int surrogate = CODING_UTF_16_SURROGATE (coding);
  Lisp_Object attr, charset_list;
  int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The two bytes read ahead after a CR when EOL_CRLF is set, or -1
     when nothing has been read ahead.  */
  int byte_after_cr1 = -1, byte_after_cr2 = -1;

  CODING_GET_INFO (coding, attr, charset_list);

  if (bom == utf_with_bom)
    {
      int c, c1, c2;

      src_base = src;
      ONE_MORE_BYTE (c1);
      ONE_MORE_BYTE (c2);
      /* Combine in source order, so a little endian BOM (0xFF 0xFE)
         reads as 0xFFFE here.  */
      c = (c1 << 8) | c2;

      if (endian == utf_16_big_endian
          ? c != 0xFEFF : c != 0xFFFE)
        {
          /* The first two bytes are not BOM.  Treat them as bytes
             for a normal character.  */
          src = src_base;
          coding->errors++;
        }
      /* Look for the BOM only once per text.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }
  else if (bom == utf_detect_bom)
    {
      /* We have already tried to detect BOM and failed in
         detect_coding.  */
      CODING_UTF_16_BOM (coding) = utf_without_bom;
    }

  while (1)
    {
      int c, c1, c2;

      /* Remember where this unit starts; these are the values that
         get committed at `no_more_source'.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      /* One iteration stores at most three elements (the unpaired
         surrogate branch below), so stop unless three still fit.  */
      if (charbuf + 2 >= charbuf_end)
        break;

      /* Take the first byte of the unit, preferring a byte that was
         read ahead after a CR.  */
      if (byte_after_cr1 >= 0)
        c1 = byte_after_cr1, byte_after_cr1 = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* ONE_MORE_BYTE produced a negative value; store its
             negation as a character of its own.  */
          *charbuf++ = -c1;
          continue;
        }
      if (byte_after_cr2 >= 0)
        c2 = byte_after_cr2, byte_after_cr2 = -1;
      else
        ONE_MORE_BYTE (c2);
      if (c2 < 0)
        {
          /* The second half of the unit is not a raw byte: emit the
             first byte by itself, then the negated value.  */
          *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
          *charbuf++ = -c2;
          continue;
        }
      c = (endian == utf_16_big_endian
           ? ((c1 << 8) | c2) : ((c2 << 8) | c1));

      if (surrogate)
        {
          if (! UTF_16_LOW_SURROGATE_P (c))
            {
              /* A high surrogate is pending but C does not complete
                 it.  Emit the two bytes of the pending surrogate (in
                 source byte order) as separate elements and count an
                 error.  */
              if (endian == utf_16_big_endian)
                c1 = surrogate >> 8, c2 = surrogate & 0xFF;
              else
                c1 = surrogate & 0xFF, c2 = surrogate >> 8;
              *charbuf++ = c1;
              *charbuf++ = c2;
              coding->errors++;
              /* C either becomes the new pending high surrogate or
                 is emitted as it is.  */
              if (UTF_16_HIGH_SURROGATE_P (c))
                CODING_UTF_16_SURROGATE (coding) = surrogate = c;
              else
                *charbuf++ = c;
            }
          else
            {
              /* Combine the pending high surrogate with this low
                 surrogate into one character at or above 0x10000.  */
              c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
              CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
              *charbuf++ = 0x10000 + c;
            }
        }
      else
        {
          if (UTF_16_HIGH_SURROGATE_P (c))
            /* Hold the high surrogate until the next unit arrives.  */
            CODING_UTF_16_SURROGATE (coding) = surrogate = c;
          else
            {
              if (eol_crlf && c == '\r')
                {
                  /* Read the following unit ahead before storing the
                     CR.  Presumably this is so that, if the source
                     ends here, the CR stays unconsumed (SRC_BASE
                     still points at it) and is seen again together
                     with what follows -- confirm against
                     ONE_MORE_BYTE.  */
                  ONE_MORE_BYTE (byte_after_cr1);
                  ONE_MORE_BYTE (byte_after_cr2);
                }
              *charbuf++ = c;
            }
        }
    }

 no_more_source:
  /* Commit only what had been consumed when the current iteration
     started.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1753
1754 static int
1755 encode_coding_utf_16 (coding)
1756      struct coding_system *coding;
1757 {
1758   int multibytep = coding->dst_multibyte;
1759   int *charbuf = coding->charbuf;
1760   int *charbuf_end = charbuf + coding->charbuf_used;
1761   unsigned char *dst = coding->destination + coding->produced;
1762   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1763   int safe_room = 8;
1764   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1765   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766   int produced_chars = 0;
1767   Lisp_Object attrs, charset_list;
1768   int c;
1769
1770   CODING_GET_INFO (coding, attrs, charset_list);
1771
1772   if (bom != utf_without_bom)
1773     {
1774       ASSURE_DESTINATION (safe_room);
1775       if (big_endian)
1776         EMIT_TWO_BYTES (0xFE, 0xFF);
1777       else
1778         EMIT_TWO_BYTES (0xFF, 0xFE);
1779       CODING_UTF_16_BOM (coding) = utf_without_bom;
1780     }
1781
1782   while (charbuf < charbuf_end)
1783     {
1784       ASSURE_DESTINATION (safe_room);
1785       c = *charbuf++;
1786       if (c >= MAX_UNICODE_CHAR)
1787         c = coding->default_char;
1788
1789       if (c < 0x10000)
1790         {
1791           if (big_endian)
1792             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793           else
1794             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795         }
1796       else
1797         {
1798           int c1, c2;
1799
1800           c -= 0x10000;
1801           c1 = (c >> 10) + 0xD800;
1802           c2 = (c & 0x3FF) + 0xDC00;
1803           if (big_endian)
1804             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805           else
1806             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807         }
1808     }
1809   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1810   coding->produced = dst - coding->destination;
1811   coding->produced_char += produced_chars;
1812   return 0;
1813 }
1814
1815 \f
1816 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818 /* Emacs' internal format for representation of multiple character
1819    sets is a kind of multi-byte encoding, i.e. characters are
1820    represented by variable-length sequences of one-byte codes.
1821
1822    ASCII characters and control characters (e.g. `tab', `newline') are
1823    represented by one-byte sequences which are their ASCII codes, in
1824    the range 0x00 through 0x7F.
1825
1826    8-bit characters of the range 0x80..0x9F are represented by
1827    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828    code + 0x20).
1829
1830    8-bit characters of the range 0xA0..0xFF are represented by
1831    one-byte sequences which are their 8-bit code.
1832
1833    The other characters are represented by a sequence of `base
1834    leading-code', optional `extended leading-code', and one or two
1835    `position-code's.  The length of the sequence is determined by the
1836    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1837    whereas extended leading-code and position-code take the range 0xA0
1838    through 0xFF.  See `charset.h' for more details about leading-code
1839    and position-code.
1840
1841    --- CODE RANGE of Emacs' internal format ---
1842    character set        range
1843    -------------        -----
1844    ascii                0x00..0x7F
1845    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1847    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1848    ---------------------------------------------
1849
1850    As this is the internal character representation, the format is
1851    usually not used externally (i.e. in a file or in a data sent to a
1852    process).  But, it is possible to have a text externally in this
1853    format (i.e. by encoding by the coding system `emacs-mule').
1854
1855    In that case, a sequence of one-byte codes has a slightly different
1856    form.
1857
1858    At first, all characters in eight-bit-control are represented by
1859    one-byte sequences which are their 8-bit code.
1860
1861    Next, character composition data are represented by the byte
1862    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863    where,
        METHOD is 0xF2 plus one of composition method (enum
        composition_method),

        BYTES is 0xA0 plus a byte length of this composition data,

        CHARS is 0xA0 plus a number of characters composed by this
        data,

        COMPONENTs are characters of multibyte form or composition
        rules encoded by two bytes of ASCII codes.
1874
1875    In addition, for backward compatibility, the following formats are
1876    also recognized as composition data on decoding.
1877
1878    0x80 MSEQ ...
1879    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881    Here,
1882         MSEQ is a multibyte form but in these special format:
1883           ASCII: 0xA0 ASCII_CODE+0x80,
1884           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885         RULE is a one byte code of the range 0xA0..0xF0 that
1886         represents a composition rule.
1887   */
1888
/* Table indexed by a byte value.  emacs_mule_char and
   detect_coding_emacs_mule read the entry for a leading byte as the
   total number of bytes (1 through 4) of the emacs-mule sequence
   that the byte starts.  The table has no initializer here; it is
   filled in elsewhere.  */
char emacs_mule_bytes[256];
1890
1891 int
1892 emacs_mule_char (coding, src, nbytes, nchars, id)
1893      struct coding_system *coding;
1894      const unsigned char *src;
1895      int *nbytes, *nchars, *id;
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   int multibytep = coding->src_multibyte;
1900   struct charset *charset;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
1904
1905   ONE_MORE_BYTE (c);
1906   if (c < 0)
1907     {
1908       c = -c;
1909       charset = emacs_mule_charset[0];
1910     }
1911   else
1912     {
1913       if (c >= 0xA0)
1914         {
1915           /* Old style component character of a composition.  */
1916           if (c == 0xA0)
1917             {
1918               ONE_MORE_BYTE (c);
1919               c -= 0x80;
1920             }
1921           else
1922             c -= 0x20;
1923         }
1924
1925       switch (emacs_mule_bytes[c])
1926         {
1927         case 2:
1928           if (! (charset = emacs_mule_charset[c]))
1929             goto invalid_code;
1930           ONE_MORE_BYTE (c);
1931           if (c < 0xA0)
1932             goto invalid_code;
1933           code = c & 0x7F;
1934           break;
1935
1936         case 3:
1937           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1938               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1939             {
1940               ONE_MORE_BYTE (c);
1941               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1942                 goto invalid_code;
1943               ONE_MORE_BYTE (c);
1944               if (c < 0xA0)
1945                 goto invalid_code;
1946               code = c & 0x7F;
1947             }
1948           else
1949             {
1950               if (! (charset = emacs_mule_charset[c]))
1951                 goto invalid_code;
1952               ONE_MORE_BYTE (c);
1953               if (c < 0xA0)
1954                 goto invalid_code;
1955               code = (c & 0x7F) << 8;
1956               ONE_MORE_BYTE (c);
1957               if (c < 0xA0)
1958                 goto invalid_code;
1959               code |= c & 0x7F;
1960             }
1961           break;
1962
1963         case 4:
1964           ONE_MORE_BYTE (c);
1965           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1966             goto invalid_code;
1967           ONE_MORE_BYTE (c);
1968           if (c < 0xA0)
1969             goto invalid_code;
1970           code = (c & 0x7F) << 8;
1971           ONE_MORE_BYTE (c);
1972           if (c < 0xA0)
1973             goto invalid_code;
1974           code |= c & 0x7F;
1975           break;
1976
1977         case 1:
1978           code = c;
1979           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1980                                      ? charset_ascii : charset_eight_bit);
1981           break;
1982
1983         default:
1984           abort ();
1985         }
1986       c = DECODE_CHAR (charset, code);
1987       if (c < 0)
1988         goto invalid_code;
1989     }
1990   *nbytes = src - src_base;
1991   *nchars = consumed_chars;
1992   if (id)
1993     *id = charset->id;
1994   return c;
1995
1996  no_more_source:
1997   return -2;
1998
1999  invalid_code:
2000   return -1;
2001 }
2002
2003
2004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2005    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
2006    else return 0.  */
2007
2008 static int
2009 detect_coding_emacs_mule (coding, detect_info)
2010      struct coding_system *coding;
2011      struct coding_detection_info *detect_info;
2012 {
2013   const unsigned char *src = coding->source, *src_base;
2014   const unsigned char *src_end = coding->source + coding->src_bytes;
2015   int multibytep = coding->src_multibyte;
2016   int consumed_chars = 0;
2017   int c;
2018   int found = 0;
2019
2020   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
2021   /* A coding system of this category is always ASCII compatible.  */
2022   src += coding->head_ascii;
2023
2024   while (1)
2025     {
2026       src_base = src;
2027       ONE_MORE_BYTE (c);
2028       if (c < 0)
2029         continue;
2030       if (c == 0x80)
2031         {
2032           /* Perhaps the start of composite character.  We simple skip
2033              it because analyzing it is too heavy for detecting.  But,
2034              at least, we check that the composite character
2035              constitutes of more than 4 bytes.  */
2036           const unsigned char *src_base;
2037
2038         repeat:
2039           src_base = src;
2040           do
2041             {
2042               ONE_MORE_BYTE (c);
2043             }
2044           while (c >= 0xA0);
2045
2046           if (src - src_base <= 4)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049           if (c == 0x80)
2050             goto repeat;
2051         }
2052
2053       if (c < 0x80)
2054         {
2055           if (c < 0x20
2056               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2057             break;
2058         }
2059       else
2060         {
2061           int more_bytes = emacs_mule_bytes[*src_base] - 1;
2062
2063           while (more_bytes > 0)
2064             {
2065               ONE_MORE_BYTE (c);
2066               if (c < 0xA0)
2067                 {
2068                   src--;        /* Unread the last byte.  */
2069                   break;
2070                 }
2071               more_bytes--;
2072             }
2073           if (more_bytes != 0)
2074             break;
2075           found = CATEGORY_MASK_EMACS_MULE;
2076         }
2077     }
2078   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2079   return 0;
2080
2081  no_more_source:
2082   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2083     {
2084       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2085       return 0;
2086     }
2087   detect_info->found |= found;
2088   return 1;
2089 }
2090
2091
2092 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2093
/* Decode a character represented as a component of a composition
   sequence of Emacs 20/21 style at SRC, using emacs_mule_char.

   On success, store the character in *BUF, increment BUF, advance
   SRC past the character and add the number of characters consumed
   to CONSUMED_CHARS.  If SRC is already at SRC_END, or
   emacs_mule_char returns -2 (source exhausted), do nothing.  If
   emacs_mule_char returns any other negative value, jump to the
   label `invalid_code' of the enclosing function.

   The expansion uses these names from the enclosing scope: coding,
   src, src_end, consumed_chars and the label invalid_code.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
  do                                                            \
    {                                                           \
      int c;                                                    \
      int nbytes, nchars;                                       \
                                                                \
      if (src == src_end)                                       \
        break;                                                  \
      c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
      if (c < 0)                                                \
        {                                                       \
          if (c == -2)                                          \
            break;                                              \
          goto invalid_code;                                    \
        }                                                       \
      *buf++ = c;                                               \
      src += nbytes;                                            \
      consumed_chars += nchars;                                 \
    }                                                           \
  while (0)
2121
2122
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  The rule is one byte of value
   0xA0 + N with 0 <= N < 81; N / 9 and N % 9 are passed (as GREF and
   NREF) to COMPOSITION_ENCODE_RULE.  Store the decoded rule in *BUF,
   and increment BUF.  If SRC is at or beyond SRC_END, or the byte is
   out of that range, jump to the label `invalid_code' of the
   enclosing function.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
  do {                                                  \
    int c, gref, nref;                                  \
                                                        \
    if (src >= src_end)                                 \
      goto invalid_code;                                \
    ONE_MORE_BYTE_NO_CHECK (c);                         \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
                                                        \
    gref = c / 9, nref = c % 9;                         \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2142
2143
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 21 style at SRC.  The rule takes two bytes, each
   of value 0x20 + N with 0 <= N < 81; the two N values are passed (as
   GREF and NREF) to COMPOSITION_ENCODE_RULE.  Store the decoded rule
   in *BUF, and increment BUF.  If fewer than two bytes remain before
   SRC_END, or either value is out of range, jump to the label
   `invalid_code' of the enclosing function.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    if (src + 1>= src_end)                              \
      goto invalid_code;                                \
    ONE_MORE_BYTE_NO_CHECK (gref);                      \
    gref -= 0x20;                                       \
    ONE_MORE_BYTE_NO_CHECK (nref);                      \
    nref -= 0x20;                                       \
    if (gref < 0 || gref >= 81                          \
        || nref < 0 || nref >= 81)                      \
      goto invalid_code;                                \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2164
2165
/* Decode the header, and for some methods the components, of an
   Emacs 21 style composition.  On entry C holds the METHOD byte that
   follows 0x80; the composition method is C - 0xF2.

   The BYTES and CHARS bytes are read with ONE_MORE_BYTE.  A negative
   value for either, or a BYTES value below 0xA0 + 3, jumps to the
   label `invalid_code'.  ADD_COMPOSITION_DATA is then called with
   CHARBUF, the character count and the method.  For every method
   other than COMPOSITION_RELATIVE the components are decoded into
   CHARBUF until CONSUMED_CHARS reaches CONSUMED_CHARS_BASE + BYTES;
   components at odd positions are rules unless the method is
   COMPOSITION_WITH_ALTCHARS.  Finally the number of decoded
   components is subtracted from the element that was at CHARBUF on
   entry.

   The expansion uses charbuf, consumed_chars, consumed_chars_base
   and everything the macros it invokes use.

   NOTE(review): DECODE_EMACS_MULE_COMPOSITION_CHAR does nothing when
   the source is exhausted, so the `while' loop below does not seem
   to make progress in that case when only characters are being
   decoded (COMPOSITION_WITH_ALTCHARS) -- confirm.  The test that
   follows the loop repeats the loop condition and so looks
   unreachable as written.  */
#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    /* Emacs 21 style format.  The first three bytes at SRC are         \
       (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
       the byte length of this composition information, CHARS is the    \
       number of characters composed by this composition.  */           \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int consumed_chars_limit;                                           \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nchars = c - 0xA0;                                                  \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    consumed_chars_limit = consumed_chars_base + nbytes;                \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int i = 0;                                                      \
        while (consumed_chars < consumed_chars_limit)                   \
          {                                                             \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
            i++;                                                        \
          }                                                             \
        if (consumed_chars < consumed_chars_limit)                      \
          goto invalid_code;                                            \
        charbuf_base[0] -= i;                                           \
      }                                                                 \
  } while (0)
2205
2206
/* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).

   SRC is rewound to SRC_BASE and the 0x80 byte is read again and
   skipped.  Then up to MAX_COMPOSITION_COMPONENTS characters are
   decoded into a local array for as long as the byte at SRC is at
   least 0xA0.  If fewer than two characters were counted, jump to
   the label `invalid_code'.  Otherwise ADD_COMPOSITION_DATA is
   called with CHARBUF, the character count and COMPOSITION_RELATIVE,
   and the characters are appended to CHARBUF.  The argument C is
   only used as scratch for the skipped byte.

   NOTE(review): `*src' is read in the loop condition without
   comparing SRC against SRC_END first -- confirm that the caller
   guarantees a readable byte there.  */
#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
  do {                                                                  \
    /* Emacs 20 style format for relative composition.  */              \
    /* Store multibyte form of characters to be composed.  */           \
    enum composition_method method = COMPOSITION_RELATIVE;              \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
                                                                        \
    src = src_base;                                                     \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
    for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++)    \
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
    if (i < 2)                                                          \
      goto invalid_code;                                                \
    ADD_COMPOSITION_DATA (charbuf, i, method);                          \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
  } while (0)
2226
2227
/* Decode an Emacs 20 style rule-base composition
   (0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ); SRC must point just
   after the 0xFF byte.

   One character is decoded first, then (rule, character) pairs for
   as long as the byte at SRC is at least 0xA0, up to
   MAX_COMPOSITION_COMPONENTS characters in total; everything goes
   into a local array in source order.  If only one character was
   counted, or the array ends up with an even number of elements,
   jump to the label `invalid_code'.  If CHARBUF lacks room, jump to
   the label `no_more_source'.  Otherwise ADD_COMPOSITION_DATA is
   called with COMPOSITION_WITH_RULE, the interleaved characters and
   rules are appended to CHARBUF, their number is subtracted from the
   element that was at CHARBUF on entry, and the characters alone
   (the even-indexed elements) are appended once more.  The argument
   C is not used by the expansion.

   NOTE(review): `*src' is read without comparing SRC against
   SRC_END first -- confirm that the caller guarantees a readable
   byte there.  */
#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for rule-base composition.  */     \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_WITH_RULE;     \
    int *charbuf_base = charbuf;                                \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
                                                                \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
    for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
      {                                                         \
        if (*src < 0xA0)                                        \
          break;                                                \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
      }                                                         \
    if (i <= 1 || (buf - components) % 2 == 0)                  \
      goto invalid_code;                                        \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
      goto no_more_source;                                      \
    ADD_COMPOSITION_DATA (charbuf, i, method);                  \
    i = i * 2 - 1;                                              \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
    charbuf_base[0] -= i;                                       \
    for (j = 0; j < i; j += 2)                                  \
      *charbuf++ = components[j];                               \
  } while (0)
2258
2259
/* Decode the emacs-mule source bytes of CODING into characters.

   Reading starts at CODING->source + CODING->consumed, and decoded
   characters (plus composition and charset annotations) are appended
   to CODING->charbuf after its first CODING->charbuf_used elements.
   A byte sequence that cannot be decoded is passed through one byte
   at a time and counted in CODING->errors.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  The consumed counts are taken
   from the start of the last loop iteration that was entered
   (SRC_BASE / CONSUMED_CHARS_BASE).  */
static void
decode_coding_emacs_mule (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* Keep MAX_ANNOTATION_LENGTH elements in reserve at the end of the
     buffer.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list;
  /* CHAR_OFFSET counts decoded characters, starting from
     CODING->produced_char.  LAST_ID is the charset id of the current
     run of characters and LAST_OFFSET the value of CHAR_OFFSET at
     which that run started.  */
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_CRLF is set, or -1.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      int c;

      /* Remember where this sequence starts; these are the values
         that get committed at `no_more_source' and restored at
         `invalid_code'.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        break;

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        {
          /* ONE_MORE_BYTE produced a negative value; store its
             negation as a character.  */
          *charbuf++ = -c;
          char_offset++;
        }
      else if (c < 0x80)
        {
          /* For DOS style end-of-line, read the byte after a CR
             ahead before storing the CR.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          *charbuf++ = c;
          char_offset++;
        }
      else if (c == 0x80)
        {
          /* Composition data.  The byte after 0x80 selects the
             format: METHOD of Emacs 21 style, 0xFF for Emacs 20
             rule-base composition, or a value below 0xC0 for Emacs 20
             relative composition.  */
          ONE_MORE_BYTE (c);
          if (c < 0)
            goto invalid_code;
          if (c - 0xF2 >= COMPOSITION_RELATIVE
              && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
            DECODE_EMACS_MULE_21_COMPOSITION (c);
          else if (c < 0xC0)
            DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
          else if (c == 0xFF)
            DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
          else
            goto invalid_code;
        }
      else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
        {
          /* A leading code of a multibyte sequence.  Rewind and let
             emacs_mule_char decode the whole sequence.  */
          int nbytes, nchars;
          int id;

          src = src_base;
          consumed_chars = consumed_chars_base;
          c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
          if (c < 0)
            {
              /* -2 means the source ended within the sequence.  */
              if (c == -2)
                break;
              goto invalid_code;
            }
          if (last_id != id)
            {
              /* The charset changed.  Annotate the run that just
                 ended unless it was ASCII, and start a new run.  */
              if (last_id != charset_ascii)
                ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
              last_id = id;
              last_offset = char_offset;
            }
          *charbuf++ = c;
          src += nbytes;
          consumed_chars += nchars;
          char_offset++;
        }
      else
        goto invalid_code;
      continue;

    invalid_code:
      /* Go back to the start of the sequence and pass its first byte
         through, as itself if ASCII, else as an eight-bit
         character.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Annotate the run that is still open, then commit only what had
     been consumed when the current iteration started.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
2368
2369
/* Store in CODES (an array of at least two bytes) the leading code(s)
   used for a charset whose emacs-mule id is ID:

     ID < 0xA0          -> CODES[0] = ID,   CODES[1] = 0
     0xA0 <= ID < 0xE0  -> CODES[0] = 0x9A, CODES[1] = ID
     0xE0 <= ID < 0xF0  -> CODES[0] = 0x9B, CODES[1] = ID
     0xF0 <= ID < 0xF5  -> CODES[0] = 0x9C, CODES[1] = ID
     0xF5 <= ID         -> CODES[0] = 0x9D, CODES[1] = ID

   CODES[1] == 0 means that there is no second leading code.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which lets the macro be used as
   a single statement (e.g. as the body of an `if' that has an
   `else').  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2383
2384
2385 static int
2386 encode_coding_emacs_mule (coding)
2387      struct coding_system *coding;
2388 {
2389   int multibytep = coding->dst_multibyte;
2390   int *charbuf = coding->charbuf;
2391   int *charbuf_end = charbuf + coding->charbuf_used;
2392   unsigned char *dst = coding->destination + coding->produced;
2393   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2394   int safe_room = 8;
2395   int produced_chars = 0;
2396   Lisp_Object attrs, charset_list;
2397   int c;
2398   int preferred_charset_id = -1;
2399
2400   CODING_GET_INFO (coding, attrs, charset_list);
2401   if (! EQ (charset_list, Vemacs_mule_charset_list))
2402     {
2403       CODING_ATTR_CHARSET_LIST (attrs)
2404         = charset_list = Vemacs_mule_charset_list;
2405     }
2406
2407   while (charbuf < charbuf_end)
2408     {
2409       ASSURE_DESTINATION (safe_room);
2410       c = *charbuf++;
2411
2412       if (c < 0)
2413         {
2414           /* Handle an annotation.  */
2415           switch (*charbuf)
2416             {
2417             case CODING_ANNOTATE_COMPOSITION_MASK:
2418               /* Not yet implemented.  */
2419               break;
2420             case CODING_ANNOTATE_CHARSET_MASK:
2421               preferred_charset_id = charbuf[3];
2422               if (preferred_charset_id >= 0
2423                   && NILP (Fmemq (make_number (preferred_charset_id),
2424                                   charset_list)))
2425                 preferred_charset_id = -1;
2426               break;
2427             default:
2428               abort ();
2429             }
2430           charbuf += -c - 1;
2431           continue;
2432         }
2433
2434       if (ASCII_CHAR_P (c))
2435         EMIT_ONE_ASCII_BYTE (c);
2436       else if (CHAR_BYTE8_P (c))
2437         {
2438           c = CHAR_TO_BYTE8 (c);
2439           EMIT_ONE_BYTE (c);
2440         }
2441       else
2442         {
2443           struct charset *charset;
2444           unsigned code;
2445           int dimension;
2446           int emacs_mule_id;
2447           unsigned char leading_codes[2];
2448
2449           if (preferred_charset_id >= 0)
2450             {
2451               charset = CHARSET_FROM_ID (preferred_charset_id);
2452               if (! CHAR_CHARSET_P (c, charset))
2453                 charset = char_charset (c, charset_list, NULL);
2454             }
2455           else
2456             charset = char_charset (c, charset_list, &code);
2457           if (! charset)
2458             {
2459               c = coding->default_char;
2460               if (ASCII_CHAR_P (c))
2461                 {
2462                   EMIT_ONE_ASCII_BYTE (c);
2463                   continue;
2464                 }
2465               charset = char_charset (c, charset_list, &code);
2466             }
2467           dimension = CHARSET_DIMENSION (charset);
2468           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2469           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2470           EMIT_ONE_BYTE (leading_codes[0]);
2471           if (leading_codes[1])
2472             EMIT_ONE_BYTE (leading_codes[1]);
2473           if (dimension == 1)
2474             EMIT_ONE_BYTE (code | 0x80);
2475           else
2476             {
2477               code |= 0x8080;
2478               EMIT_ONE_BYTE (code >> 8);
2479               EMIT_ONE_BYTE (code & 0xFF);
2480             }
2481         }
2482     }
2483   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2484   coding->produced_char += produced_chars;
2485   coding->produced = dst - coding->destination;
2486   return 0;
2487 }
2488
2489 \f
2490 /*** 7. ISO2022 handlers ***/
2491
2492 /* The following note describes the coding system ISO2022 briefly.
2493    Since the intention of this note is to help understand the
2494    functions in this file, some parts are NOT ACCURATE or are OVERLY
2495    SIMPLIFIED.  For thorough understanding, please refer to the
2496    original document of ISO2022.  This is equivalent to the standard
2497    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2498
2499    ISO2022 provides many mechanisms to encode several character sets
2500    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2501    is encoded using bytes less than 128.  This may make the encoded
2502    text a little bit longer, but the text passes more easily through
2503    several types of gateway, some of which strip off the MSB (Most
2504    Significant Bit).
2505
2506    There are two kinds of character sets: control character sets and
2507    graphic character sets.  The former contain control characters such
2508    as `newline' and `escape' to provide control functions (control
2509    functions are also provided by escape sequences).  The latter
2510    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2511    two control character sets and many graphic character sets.
2512
2513    Graphic character sets are classified into one of the following
2514    four classes, according to the number of bytes (DIMENSION) and
2515    number of characters in one dimension (CHARS) of the set:
2516    - DIMENSION1_CHARS94
2517    - DIMENSION1_CHARS96
2518    - DIMENSION2_CHARS94
2519    - DIMENSION2_CHARS96
2520
2521    In addition, each character set is assigned an identification tag,
2522    unique for each set, called the "final character" (denoted as <F>
2523    hereafter).  The <F> of each character set is decided by ECMA(*)
2524    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2525    (0x30..0x3F are for private use only).
2526
2527    Note (*): ECMA = European Computer Manufacturers Association
2528
2529    Here are examples of graphic character sets [NAME(<F>)]:
2530         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2531         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2532         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2533         o DIMENSION2_CHARS96 -- none for the moment
2534
2535    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2536         C0 [0x00..0x1F] -- control character plane 0
2537         GL [0x20..0x7F] -- graphic character plane 0
2538         C1 [0x80..0x9F] -- control character plane 1
2539         GR [0xA0..0xFF] -- graphic character plane 1
2540
2541    A control character set is directly designated and invoked to C0 or
2542    C1 by an escape sequence.  The most common case is that:
2543    - ISO646's  control character set is designated/invoked to C0, and
2544    - ISO6429's control character set is designated/invoked to C1,
2545    and usually these designations/invocations are omitted in encoded
2546    text.  In a 7-bit environment, only C0 can be used, and a control
2547    character for C1 is encoded by an appropriate escape sequence to
2548    fit into the environment.  All control characters for C1 are
2549    defined to have corresponding escape sequences.
2550
2551    A graphic character set is at first designated to one of four
2552    graphic registers (G0 through G3), then these graphic registers are
2553    invoked to GL or GR.  These designations and invocations can be
2554    done independently.  The most common case is that G0 is invoked to
2555    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2556    these invocations and designations are omitted in encoded text.
2557    In a 7-bit environment, only GL can be used.
2558
2559    When a graphic character set of CHARS94 is invoked to GL, codes
2560    0x20 and 0x7F of the GL area work as control characters SPACE and
2561    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2562    be used.
2563
2564    There are two ways of invocation: locking-shift and single-shift.
2565    With locking-shift, the invocation lasts until the next different
2566    invocation, whereas with single-shift, the invocation affects the
2567    following character only and doesn't affect the locking-shift
2568    state.  Invocations are done by the following control characters or
2569    escape sequences:
2570
2571    ----------------------------------------------------------------------
2572    abbrev  function                  cntrl escape seq   description
2573    ----------------------------------------------------------------------
2574    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2575    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2576    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2577    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2578    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2579    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
   LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2581    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2582    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2583    ----------------------------------------------------------------------
2584    (*) These are not used by any known coding system.
2585
2586    Control characters for these functions are defined by macros
2587    ISO_CODE_XXX in `coding.h'.
2588
2589    Designations are done by the following escape sequences:
2590    ----------------------------------------------------------------------
2591    escape sequence      description
2592    ----------------------------------------------------------------------
2593    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2594    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2595    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2596    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2597    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2598    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2599    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2600    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2601    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2602    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2603    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2604    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2605    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2606    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2607    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2608    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2609    ----------------------------------------------------------------------
2610
2611    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2612    of dimension 1, chars 94, and final character <F>, etc...
2613
2614    Note (*): Although these designations are not allowed in ISO2022,
2615    Emacs accepts them on decoding, and produces them on encoding
2616    CHARS96 character sets in a coding system which is characterized as
2617    7-bit environment, non-locking-shift, and non-single-shift.
2618
2619    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2620    '(' must be omitted.  We refer to this as "short-form" hereafter.
2621
2622    Now you may notice that there are a lot of ways of encoding the
2623    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2626    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2627    localized platforms), and all of these are variants of ISO2022.
2628
2629    In addition to the above, Emacs handles two more kinds of escape
2630    sequences: ISO6429's direction specification and Emacs' private
2631    sequence for specifying character composition.
2632
2633    ISO6429's direction specification takes the following form:
2634         o CSI ']'      -- end of the current direction
2635         o CSI '0' ']'  -- end of the current direction
2636         o CSI '1' ']'  -- start of left-to-right text
2637         o CSI '2' ']'  -- start of right-to-left text
2638    The control character CSI (0x9B: control sequence introducer) is
2639    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2640
2641    Character composition specification takes the following form:
2642         o ESC '0' -- start relative composition
2643         o ESC '1' -- end composition
2644         o ESC '2' -- start rule-base composition (*)
2645         o ESC '3' -- start relative composition with alternate chars  (**)
2646         o ESC '4' -- start rule-base composition with alternate chars  (**)
2647   Since these are not standard escape sequences of any ISO standard,
2648   the use of them with these meanings is restricted to Emacs only.
2649
2650   (*) This form is used only in Emacs 20.7 and older versions,
2651   but newer versions can safely decode it.
2652   (**) This form is used only in Emacs 21.1 and newer versions,
2653   and older versions can't decode it.
2654
2655   Here's a list of example usages of these composition escape
2656   sequences (categorized by `enum composition_method').
2657
2658   COMPOSITION_RELATIVE:
2659         ESC 0 CHAR [ CHAR ] ESC 1
2660   COMPOSITION_WITH_RULE:
2661         ESC 2 CHAR [ RULE CHAR ] ESC 1
2662   COMPOSITION_WITH_ALTCHARS:
2663         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2664   COMPOSITION_WITH_RULE_ALTCHARS:
2665         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2666
/* Table with one `enum iso_code_class_type' entry per possible byte
   value (256 entries).  NOTE(review): presumably it classifies each
   byte for the ISO 2022 handlers and is filled in elsewhere; it is
   not referenced in the nearby code -- confirm.  */
enum iso_code_class_type iso_code_class[256];
2668
/* Non-zero if the charset whose id is ID is safe for CODING, i.e. ID
   is within CODING's `safe_charsets' table (whose last valid index is
   CODING->max_charset_id) and the entry for ID does not have its high
   bit set.  The table is filled with 255 for charsets that have no
   usable graphic register, and with a small register number
   otherwise.

   The entry is read as `signed char' on purpose: whether plain `char'
   is signed is implementation-defined, and with an unsigned `char' a
   bare `>= 0' test would be true for every entry, including the 255
   filler.

   ID is evaluated more than once and must not be negative.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) <= (coding)->max_charset_id                             \
   && (signed char) (coding)->safe_charsets[id] >= 0)
2672
2673
/* Non-zero if CODING_ISO_INITIAL for register 1 of the coding system
   registered for coding category CATEGORY is not negative.
   NOTE(review): presumably this means some charset is initially
   designated to G1, so that a shift-out makes sense -- confirm
   against the definition of CODING_ISO_INITIAL.  */
#define SHIFT_OUT_OK(category)                                          \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2676
2677 static void
2678 setup_iso_safe_charsets (attrs)
2679      Lisp_Object attrs;
2680 {
2681   Lisp_Object charset_list, safe_charsets;
2682   Lisp_Object request;
2683   Lisp_Object reg_usage;
2684   Lisp_Object tail;
2685   int reg94, reg96;
2686   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2687   int max_charset_id;
2688
2689   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2690   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2691       && ! EQ (charset_list, Viso_2022_charset_list))
2692     {
2693       CODING_ATTR_CHARSET_LIST (attrs)
2694         = charset_list = Viso_2022_charset_list;
2695       ASET (attrs, coding_attr_safe_charsets, Qnil);
2696     }
2697
2698   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2699     return;
2700
2701   max_charset_id = 0;
2702   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2703     {
2704       int id = XINT (XCAR (tail));
2705       if (max_charset_id < id)
2706         max_charset_id = id;
2707     }
2708
2709   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2710                                 make_number (255));
2711   request = AREF (attrs, coding_attr_iso_request);
2712   reg_usage = AREF (attrs, coding_attr_iso_usage);
2713   reg94 = XINT (XCAR (reg_usage));
2714   reg96 = XINT (XCDR (reg_usage));
2715
2716   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2717     {
2718       Lisp_Object id;
2719       Lisp_Object reg;
2720       struct charset *charset;
2721
2722       id = XCAR (tail);
2723       charset = CHARSET_FROM_ID (XINT (id));
2724       reg = Fcdr (Fassq (id, request));
2725       if (! NILP (reg))
2726         SSET (safe_charsets, XINT (id), XINT (reg));
2727       else if (charset->iso_chars_96)
2728         {
2729           if (reg96 < 4)
2730             SSET (safe_charsets, XINT (id), reg96);
2731         }
2732       else
2733         {
2734           if (reg94 < 4)
2735             SSET (safe_charsets, XINT (id), reg94);
2736         }
2737     }
2738   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2739 }
2740
2741
2742 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2743    Check if a text is encoded in one of ISO-2022 based codig systems.
2744    If it is, return 1, else return 0.  */
2745
2746 static int
2747 detect_coding_iso_2022 (coding, detect_info)
2748      struct coding_system *coding;
2749      struct coding_detection_info *detect_info;
2750 {
2751   const unsigned char *src = coding->source, *src_base = src;
2752   const unsigned char *src_end = coding->source + coding->src_bytes;
2753   int multibytep = coding->src_multibyte;
2754   int single_shifting = 0;
2755   int id;
2756   int c, c1;
2757   int consumed_chars = 0;
2758   int i;
2759   int rejected = 0;
2760   int found = 0;
2761
2762   detect_info->checked |= CATEGORY_MASK_ISO;
2763
2764   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2765     {
2766       struct coding_system *this = &(coding_categories[i]);
2767       Lisp_Object attrs, val;
2768
2769       if (this->id < 0)
2770         continue;
2771       attrs = CODING_ID_ATTRS (this->id);
2772       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2773           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2774         setup_iso_safe_charsets (attrs);
2775       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2776       this->max_charset_id = SCHARS (val) - 1;
2777       this->safe_charsets = (char *) SDATA (val);
2778     }
2779
2780   /* A coding system of this category is always ASCII compatible.  */
2781   src += coding->head_ascii;
2782
2783   while (rejected != CATEGORY_MASK_ISO)
2784     {
2785       src_base = src;
2786       ONE_MORE_BYTE (c);
2787       switch (c)
2788         {
2789         case ISO_CODE_ESC:
2790           if (inhibit_iso_escape_detection)
2791             break;
2792           single_shifting = 0;
2793           ONE_MORE_BYTE (c);
2794           if (c >= '(' && c <= '/')
2795             {
2796               /* Designation sequence for a charset of dimension 1.  */
2797               ONE_MORE_BYTE (c1);
2798               if (c1 < ' ' || c1 >= 0x80
2799                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2800                 /* Invalid designation sequence.  Just ignore.  */
2801                 break;
2802             }
2803           else if (c == '$')
2804             {
2805               /* Designation sequence for a charset of dimension 2.  */
2806               ONE_MORE_BYTE (c);
2807               if (c >= '@' && c <= 'B')
2808                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2809                 id = iso_charset_table[1][0][c];
2810               else if (c >= '(' && c <= '/')
2811                 {
2812                   ONE_MORE_BYTE (c1);
2813                   if (c1 < ' ' || c1 >= 0x80
2814                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2815                     /* Invalid designation sequence.  Just ignore.  */
2816                     break;
2817                 }
2818               else
2819                 /* Invalid designation sequence.  Just ignore it.  */
2820                 break;
2821             }
2822           else if (c == 'N' || c == 'O')
2823             {
2824               /* ESC <Fe> for SS2 or SS3.  */
2825               single_shifting = 1;
2826               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2827               break;
2828             }
2829           else if (c >= '0' && c <= '4')
2830             {
2831               /* ESC <Fp> for start/end composition.  */
2832               found |= CATEGORY_MASK_ISO;
2833               break;
2834             }
2835           else
2836             {
2837               /* Invalid escape sequence.  Just ignore it.  */
2838               break;
2839             }
2840
2841           /* We found a valid designation sequence for CHARSET.  */
2842           rejected |= CATEGORY_MASK_ISO_8BIT;
2843           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2844                               id))
2845             found |= CATEGORY_MASK_ISO_7;
2846           else
2847             rejected |= CATEGORY_MASK_ISO_7;
2848           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2849                               id))
2850             found |= CATEGORY_MASK_ISO_7_TIGHT;
2851           else
2852             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2853           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2854                               id))
2855             found |= CATEGORY_MASK_ISO_7_ELSE;
2856           else
2857             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2858           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2859                               id))
2860             found |= CATEGORY_MASK_ISO_8_ELSE;
2861           else
2862             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2863           break;
2864
2865         case ISO_CODE_SO:
2866         case ISO_CODE_SI:
2867           /* Locking shift out/in.  */
2868           if (inhibit_iso_escape_detection)
2869             break;
2870           single_shifting = 0;
2871           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2872           break;
2873
2874         case ISO_CODE_CSI:
2875           /* Control sequence introducer.  */
2876           single_shifting = 0;
2877           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2878           found |= CATEGORY_MASK_ISO_8_ELSE;
2879           goto check_extra_latin;
2880
2881         case ISO_CODE_SS2:
2882         case ISO_CODE_SS3:
2883           /* Single shift.   */
2884           if (inhibit_iso_escape_detection)
2885             break;
2886           single_shifting = 0;
2887           rejected |= CATEGORY_MASK_ISO_7BIT;
2888           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2889               & CODING_ISO_FLAG_SINGLE_SHIFT)
2890             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2891           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2892               & CODING_ISO_FLAG_SINGLE_SHIFT)
2893             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2894           if (single_shifting)
2895             break;
2896           goto check_extra_latin;
2897
2898         default:
2899           if (c < 0)
2900             continue;
2901           if (c < 0x80)
2902             {
2903               single_shifting = 0;
2904               break;
2905             }
2906           if (c >= 0xA0)
2907             {
2908               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2909               found |= CATEGORY_MASK_ISO_8_1;
2910               /* Check the length of succeeding codes of the range
2911                  0xA0..0FF.  If the byte length is even, we include
2912                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2913                  only when we are not single shifting.  */
2914               if (! single_shifting
2915                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2916                 {
2917                   int i = 1;
2918                   while (src < src_end)
2919                     {
2920                       ONE_MORE_BYTE (c);
2921                       if (c < 0xA0)
2922                         break;
2923                       i++;
2924                     }
2925
2926                   if (i & 1 && src < src_end)
2927                     rejected |= CATEGORY_MASK_ISO_8_2;
2928                   else
2929                     found |= CATEGORY_MASK_ISO_8_2;
2930                 }
2931               break;
2932             }
2933         check_extra_latin:
2934           single_shifting = 0;
2935           if (! VECTORP (Vlatin_extra_code_table)
2936               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2937             {
2938               rejected = CATEGORY_MASK_ISO;
2939               break;
2940             }
2941           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2942               & CODING_ISO_FLAG_LATIN_EXTRA)
2943             found |= CATEGORY_MASK_ISO_8_1;
2944           else
2945             rejected |= CATEGORY_MASK_ISO_8_1;
2946           rejected |= CATEGORY_MASK_ISO_8_2;
2947         }
2948     }
2949   detect_info->rejected |= CATEGORY_MASK_ISO;
2950   return 0;
2951
2952  no_more_source:
2953   detect_info->rejected |= rejected;
2954   detect_info->found |= (found & ~rejected);
2955   return 1;
2956 }
2957
2958
/* Record in CODING that the charset identified by dimension DIM,
   CHARS_96 and final byte FINAL is designated to graphic register
   REG.

   If FINAL is out of range, names no known charset, or names a
   charset that is not safe for CODING, mark REG as invalidly
   designated (-2) instead.  JISX0201-Roman is replaced by ASCII when
   CODING has CODING_ISO_FLAG_USE_ROMAN, and JISX0208.1978 by JISX0208
   when it has CODING_ISO_FLAG_USE_OLDJIS.

   CHARS_96 must be an lvalue: it is set to -1 when the escape
   sequence should be kept, i.e. when the designation is invalid, or
   when it designates ASCII to a register that held an invalid
   designation.  The arguments are evaluated more than once.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_id = -1;                                                  \
    int desig_valid = 0;                                                \
                                                                        \
    if (final >= '0' && final < 128)                                    \
      {                                                                 \
        desig_id = ISO_CHARSET_TABLE (dim, chars_96, final);            \
        desig_valid = (desig_id >= 0                                    \
                       && SAFE_CHARSET_P (coding, desig_id));           \
      }                                                                 \
    if (! desig_valid)                                                  \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_prev = CODING_ISO_DESIGNATION (coding, reg);          \
                                                                        \
        if (desig_id == charset_jisx0201_roman)                         \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              desig_id = charset_ascii;                                 \
          }                                                             \
        else if (desig_id == charset_jisx0208_1978)                     \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208;                              \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id;                \
        /* An ASCII designation that replaces an invalid one must be   \
           kept, too.  */                                               \
        if (desig_prev == -2 && desig_id == charset_ascii)              \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
2991
2992
/* If a composition is in progress (composition_state is not
   COMPOSING_NO), abandon it: reset the state and flush the characters
   collected so far in `components' to the charbuf as ordinary
   characters.  For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS
   every collected element is flushed; for the other methods only
   every second element is (the elements in between are not
   characters).  `char_offset' is advanced accordingly.

   Jump to `no_more_source' without changing anything if the charbuf
   can't hold `component_idx' more elements.  All the variables used
   here are those of the enclosing function.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (composition_state != COMPOSING_NO)                              \
      {                                                                 \
        int k, step;                                                    \
                                                                        \
        /* It is assured that we have enough room for producing        \
           characters stored in the table `components'.  */             \
        if (charbuf + component_idx > charbuf_end)                      \
          goto no_more_source;                                          \
        composition_state = COMPOSING_NO;                               \
        step = ((method == COMPOSITION_RELATIVE                         \
                 || method == COMPOSITION_WITH_ALTCHARS)                \
                ? 1 : 2);                                               \
        for (k = 0; k < component_idx; k += step)                       \
          *charbuf++ = components[k];                                   \
        char_offset += (step == 1                                       \
                        ? component_idx                                 \
                        : (component_idx / 2) + 1);                     \
      }                                                                 \
  } while (0)
3017
3018
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  If C1 is '0' while the state is
   COMPOSING_COMPONENT_RULE, the ESC 0 only ends the list of
   components: remember how many there are and go on with the
   composed characters.  Otherwise finish any pending composition and
   start a new one, but only if the charbuf has room for
   MAX_COMPOSITION_COMPONENTS elements and the terminating ESC 1 is
   found in the rest of the source; if not, jump to `no_more_source'
   (recording CODING_RESULT_INSUFFICIENT_SRC in the latter case).
   All the variables used here are those of the enclosing function.
  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* P stays below src_end - 1 only if ESC 1 was found.  Use >=  \
           rather than ==: when the start sequence is the very last    \
           thing in the source, SRC (and hence P) is already at        \
           src_end, which is also a case of "no ESC 1".  */             \
        if (p >= src_end - 1)                                           \
          {                                                             \
            /* The current composition doesn't end in the current       \
               source.  */                                              \
            record_conversion_result                                    \
              (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
3063
3064
/* Handle composition end sequence ESC 1.  Append the composition
   annotation to CHARBUF, followed (unless the method is
   COMPOSITION_RELATIVE) by the collected components and then by the
   composed characters themselves, and leave the composing state.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *annotation_head = charbuf;                                     \
    int stride = (method == COMPOSITION_WITH_RULE ? 2 : 1);             \
    int n_chars, k;                                                     \
                                                                        \
    if (component_len > 0)                                              \
      n_chars = component_idx - component_len;                          \
    else if (method == COMPOSITION_RELATIVE)                            \
      n_chars = component_idx;                                          \
    else                                                                \
      n_chars = (component_idx + 1) / 2;                                \
    ADD_COMPOSITION_DATA (charbuf, n_chars, method);                    \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int n_components                                                \
          = (component_len == 0 ? component_idx : component_len);       \
                                                                        \
        for (k = 0; k < n_components; k++)                              \
          *charbuf++ = components[k];                                   \
        /* Overwrite the first annotation element with the (negative)   \
           distance from the end of the annotation back to its          \
           head.  */                                                    \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    /* With COMPOSITION_WITH_RULE every other entry of `components'     \
       is a character, starting at index 0; otherwise the characters    \
       are the entries from COMPONENT_LEN on.  */                       \
    k = (stride == 2 ? 0 : component_len);                              \
    while (k < component_idx)                                           \
      {                                                                 \
        *charbuf++ = components[k];                                     \
        k += stride;                                                    \
        char_offset++;                                                  \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
3095
3096
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into C2), and replace C1 with
   the encoded composition rule.  C1 is set to 0 when it does not
   denote a rule in either format.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int nref = (c1) % 9;                                            \
        int gref = (c1) / 9;                                            \
                                                                        \
        /* In the old format a reference value of 4 is mapped to        \
           10.  */                                                      \
        nref = (nref == 4 ? 10 : nref);                                 \
        gref = (gref == 4 ? 10 : gref);                                 \
        c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
      }                                                                 \
  } while (0)
3120
3121
3122 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3123
3124 static void
3125 decode_coding_iso_2022 (coding)
3126      struct coding_system *coding;
3127 {
3128   const unsigned char *src = coding->source + coding->consumed;
3129   const unsigned char *src_end = coding->source + coding->src_bytes;
3130   const unsigned char *src_base;
3131   int *charbuf = coding->charbuf + coding->charbuf_used;
3132   int *charbuf_end
3133     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3134   int consumed_chars = 0, consumed_chars_base;
3135   int multibytep = coding->src_multibyte;
3136   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3137   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3138   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3139   int charset_id_2, charset_id_3;
3140   struct charset *charset;
3141   int c;
3142   /* For handling composition sequence.  */
3143 #define COMPOSING_NO                    0
3144 #define COMPOSING_CHAR                  1
3145 #define COMPOSING_RULE                  2
3146 #define COMPOSING_COMPONENT_CHAR        3
3147 #define COMPOSING_COMPONENT_RULE        4
3148
3149   int composition_state = COMPOSING_NO;
3150   enum composition_method method;
3151   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3152   int component_idx;
3153   int component_len;
3154   Lisp_Object attrs, charset_list;
3155   int char_offset = coding->produced_char;
3156   int last_offset = char_offset;
3157   int last_id = charset_ascii;
3158   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3159   int byte_after_cr = -1;
3160
3161   CODING_GET_INFO (coding, attrs, charset_list);
3162   setup_iso_safe_charsets (attrs);
3163   /* Charset list may have been changed.  */
3164   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3165   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3166
3167   while (1)
3168     {
3169       int c1, c2;
3170
3171       src_base = src;
3172       consumed_chars_base = consumed_chars;
3173
3174       if (charbuf >= charbuf_end)
3175         break;
3176
3177       if (byte_after_cr >= 0)
3178         c1 = byte_after_cr, byte_after_cr = -1;
3179       else
3180         ONE_MORE_BYTE (c1);
3181       if (c1 < 0)
3182         goto invalid_code;
3183
3184       /* We produce at most one character.  */
3185       switch (iso_code_class [c1])
3186         {
3187         case ISO_0x20_or_0x7F:
3188           if (composition_state != COMPOSING_NO)
3189             {
3190               if (composition_state == COMPOSING_RULE
3191                   || composition_state == COMPOSING_COMPONENT_RULE)
3192                 {
3193                   DECODE_COMPOSITION_RULE (c1);
3194                   components[component_idx++] = c1;
3195                   composition_state--;
3196                   continue;
3197                 }
3198             }
3199           if (charset_id_0 < 0
3200               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3201             /* This is SPACE or DEL.  */
3202             charset = CHARSET_FROM_ID (charset_ascii);
3203           else
3204             charset = CHARSET_FROM_ID (charset_id_0);
3205           break;
3206
3207         case ISO_graphic_plane_0:
3208           if (composition_state != COMPOSING_NO)
3209             {
3210               if (composition_state == COMPOSING_RULE
3211                   || composition_state == COMPOSING_COMPONENT_RULE)
3212                 {
3213                   DECODE_COMPOSITION_RULE (c1);
3214                   components[component_idx++] = c1;
3215                   composition_state--;
3216                   continue;
3217                 }
3218             }
3219           if (charset_id_0 < 0)
3220             charset = CHARSET_FROM_ID (charset_ascii);
3221           else
3222             charset = CHARSET_FROM_ID (charset_id_0);
3223           break;
3224
3225         case ISO_0xA0_or_0xFF:
3226           if (charset_id_1 < 0
3227               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3228               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3229             goto invalid_code;
3230           /* This is a graphic character, we fall down ... */
3231
3232         case ISO_graphic_plane_1:
3233           if (charset_id_1 < 0)
3234             goto invalid_code;
3235           charset = CHARSET_FROM_ID (charset_id_1);
3236           break;
3237
3238         case ISO_control_0:
3239           if (eol_crlf && c1 == '\r')
3240             ONE_MORE_BYTE (byte_after_cr);
3241           MAYBE_FINISH_COMPOSITION ();
3242           charset = CHARSET_FROM_ID (charset_ascii);
3243           break;
3244
3245         case ISO_control_1:
3246           MAYBE_FINISH_COMPOSITION ();
3247           goto invalid_code;
3248
3249         case ISO_shift_out:
3250           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3251               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3252             goto invalid_code;
3253           CODING_ISO_INVOCATION (coding, 0) = 1;
3254           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3255           continue;
3256
3257         case ISO_shift_in:
3258           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3259             goto invalid_code;
3260           CODING_ISO_INVOCATION (coding, 0) = 0;
3261           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3262           continue;
3263
3264         case ISO_single_shift_2_7:
3265         case ISO_single_shift_2:
3266           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3267             goto invalid_code;
3268           /* SS2 is handled as an escape sequence of ESC 'N' */
3269           c1 = 'N';
3270           goto label_escape_sequence;
3271
3272         case ISO_single_shift_3:
3273           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3274             goto invalid_code;
3275           /* SS2 is handled as an escape sequence of ESC 'O' */
3276           c1 = 'O';
3277           goto label_escape_sequence;
3278
3279         case ISO_control_sequence_introducer:
3280           /* CSI is handled as an escape sequence of ESC '[' ...  */
3281           c1 = '[';
3282           goto label_escape_sequence;
3283
3284         case ISO_escape:
3285           ONE_MORE_BYTE (c1);
3286         label_escape_sequence:
3287           /* Escape sequences handled here are invocation,
3288              designation, direction specification, and character
3289              composition specification.  */
3290           switch (c1)
3291             {
3292             case '&':           /* revision of following character set */
3293               ONE_MORE_BYTE (c1);
3294               if (!(c1 >= '@' && c1 <= '~'))
3295                 goto invalid_code;
3296               ONE_MORE_BYTE (c1);
3297               if (c1 != ISO_CODE_ESC)
3298                 goto invalid_code;
3299               ONE_MORE_BYTE (c1);
3300               goto label_escape_sequence;
3301
3302             case '$':           /* designation of 2-byte character set */
3303               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3304                 goto invalid_code;
3305               {
3306                 int reg, chars96;
3307
3308                 ONE_MORE_BYTE (c1);
3309                 if (c1 >= '@' && c1 <= 'B')
3310                   {     /* designation of JISX0208.1978, GB2312.1980,
3311                            or JISX0208.1980 */
3312                     reg = 0, chars96 = 0;
3313                   }
3314                 else if (c1 >= 0x28 && c1 <= 0x2B)
3315                   { /* designation of DIMENSION2_CHARS94 character set */
3316                     reg = c1 - 0x28, chars96 = 0;
3317                     ONE_MORE_BYTE (c1);
3318                   }
3319                 else if (c1 >= 0x2C && c1 <= 0x2F)
3320                   { /* designation of DIMENSION2_CHARS96 character set */
3321                     reg = c1 - 0x2C, chars96 = 1;
3322                     ONE_MORE_BYTE (c1);
3323                   }
3324                 else
3325                   goto invalid_code;
3326                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3327                 /* We must update these variables now.  */
3328                 if (reg == 0)
3329                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3330                 else if (reg == 1)
3331                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3332                 if (chars96 < 0)
3333                   goto invalid_code;
3334               }
3335               continue;
3336
3337             case 'n':           /* invocation of locking-shift-2 */
3338               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3339                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3340                 goto invalid_code;
3341               CODING_ISO_INVOCATION (coding, 0) = 2;
3342               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3343               continue;
3344
3345             case 'o':           /* invocation of locking-shift-3 */
3346               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3347                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3348                 goto invalid_code;
3349               CODING_ISO_INVOCATION (coding, 0) = 3;
3350               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3351               continue;
3352
3353             case 'N':           /* invocation of single-shift-2 */
3354               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3355                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3356                 goto invalid_code;
3357               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3358               if (charset_id_2 < 0)
3359                 charset = CHARSET_FROM_ID (charset_ascii);
3360               else
3361                 charset = CHARSET_FROM_ID (charset_id_2);
3362               ONE_MORE_BYTE (c1);
3363               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3364                 goto invalid_code;
3365               break;
3366
3367             case 'O':           /* invocation of single-shift-3 */
3368               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3369                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3370                 goto invalid_code;
3371               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3372               if (charset_id_3 < 0)
3373                 charset = CHARSET_FROM_ID (charset_ascii);
3374               else
3375                 charset = CHARSET_FROM_ID (charset_id_3);
3376               ONE_MORE_BYTE (c1);
3377               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3378                 goto invalid_code;
3379               break;
3380
3381             case '0': case '2': case '3': case '4': /* start composition */
3382               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3383                 goto invalid_code;
3384               DECODE_COMPOSITION_START (c1);
3385               continue;
3386
3387             case '1':           /* end composition */
3388               if (composition_state == COMPOSING_NO)
3389                 goto invalid_code;
3390               DECODE_COMPOSITION_END ();
3391               continue;
3392
3393             case '[':           /* specification of direction */
3394               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3395                 goto invalid_code;
3396               /* For the moment, nested direction is not supported.
3397                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3398                  left-to-right, and nozero means right-to-left.  */
3399               ONE_MORE_BYTE (c1);
3400               switch (c1)
3401                 {
3402                 case ']':       /* end of the current direction */
3403                   coding->mode &= ~CODING_MODE_DIRECTION;
3404
3405                 case '0':       /* end of the current direction */
3406                 case '1':       /* start of left-to-right direction */
3407                   ONE_MORE_BYTE (c1);
3408                   if (c1 == ']')
3409                     coding->mode &= ~CODING_MODE_DIRECTION;
3410                   else
3411                     goto invalid_code;
3412                   break;
3413
3414                 case '2':       /* start of right-to-left direction */
3415                   ONE_MORE_BYTE (c1);
3416                   if (c1 == ']')
3417                     coding->mode |= CODING_MODE_DIRECTION;
3418                   else
3419                     goto invalid_code;
3420                   break;
3421
3422                 default:
3423                   goto invalid_code;
3424                 }
3425               continue;
3426
3427             case '%':
3428               ONE_MORE_BYTE (c1);
3429               if (c1 == '/')
3430                 {
3431                   /* CTEXT extended segment:
3432                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3433                      We keep these bytes as is for the moment.
3434                      They may be decoded by post-read-conversion.  */
3435                   int dim, M, L;
3436                   int size;
3437
3438                   ONE_MORE_BYTE (dim);
3439                   ONE_MORE_BYTE (M);
3440                   ONE_MORE_BYTE (L);
3441                   size = ((M - 128) * 128) + (L - 128);
3442                   if (charbuf + 8 + size > charbuf_end)
3443                     goto break_loop;
3444                   *charbuf++ = ISO_CODE_ESC;
3445                   *charbuf++ = '%';
3446                   *charbuf++ = '/';
3447                   *charbuf++ = dim;
3448                   *charbuf++ = BYTE8_TO_CHAR (M);
3449                   *charbuf++ = BYTE8_TO_CHAR (L);
3450                   while (size-- > 0)
3451                     {
3452                       ONE_MORE_BYTE (c1);
3453                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3454                     }
3455                 }
3456               else if (c1 == 'G')
3457                 {
3458                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3459                      ESC % G --UTF-8-BYTES-- ESC % @
3460                      We keep these bytes as is for the moment.
3461                      They may be decoded by post-read-conversion.  */
3462                   int *p = charbuf;
3463
3464                   if (p + 6 > charbuf_end)
3465                     goto break_loop;
3466                   *p++ = ISO_CODE_ESC;
3467                   *p++ = '%';
3468                   *p++ = 'G';
3469                   while (p < charbuf_end)
3470                     {
3471                       ONE_MORE_BYTE (c1);
3472                       if (c1 == ISO_CODE_ESC
3473                           && src + 1 < src_end
3474                           && src[0] == '%'
3475                           && src[1] == '@')
3476                         {
3477                           src += 2;
3478                           break;
3479                         }
3480                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3481                     }
3482                   if (p + 3 > charbuf_end)
3483                     goto break_loop;
3484                   *p++ = ISO_CODE_ESC;
3485                   *p++ = '%';
3486                   *p++ = '@';
3487                   charbuf = p;
3488                 }
3489               else
3490                 goto invalid_code;
3491               continue;
3492               break;
3493
3494             default:
3495               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3496                 goto invalid_code;
3497               {
3498                 int reg, chars96;
3499
3500                 if (c1 >= 0x28 && c1 <= 0x2B)
3501                   { /* designation of DIMENSION1_CHARS94 character set */
3502                     reg = c1 - 0x28, chars96 = 0;
3503                     ONE_MORE_BYTE (c1);
3504                   }
3505                 else if (c1 >= 0x2C && c1 <= 0x2F)
3506                   { /* designation of DIMENSION1_CHARS96 character set */
3507                     reg = c1 - 0x2C, chars96 = 1;
3508                     ONE_MORE_BYTE (c1);
3509                   }
3510                 else
3511                   goto invalid_code;
3512                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3513                 /* We must update these variables now.  */
3514                 if (reg == 0)
3515                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3516                 else if (reg == 1)
3517                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3518                 if (chars96 < 0)
3519                   goto invalid_code;
3520               }
3521               continue;
3522             }
3523         }
3524
3525       if (charset->id != charset_ascii
3526           && last_id != charset->id)
3527         {
3528           if (last_id != charset_ascii)
3529             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3530           last_id = charset->id;
3531           last_offset = char_offset;
3532         }
3533
3534       /* Now we know CHARSET and 1st position code C1 of a character.
3535          Produce a decoded character while getting 2nd position code
3536          C2 if necessary.  */
3537       c1 &= 0x7F;
3538       if (CHARSET_DIMENSION (charset) > 1)
3539         {
3540           ONE_MORE_BYTE (c2);
3541           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3542             /* C2 is not in a valid range.  */
3543             goto invalid_code;
3544           c1 = (c1 << 8) | (c2 & 0x7F);
3545           if (CHARSET_DIMENSION (charset) > 2)
3546             {
3547               ONE_MORE_BYTE (c2);
3548               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3549                 /* C2 is not in a valid range.  */
3550                 goto invalid_code;
3551               c1 = (c1 << 8) | (c2 & 0x7F);
3552             }
3553         }
3554
3555       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3556       if (c < 0)
3557         {
3558           MAYBE_FINISH_COMPOSITION ();
3559           for (; src_base < src; src_base++, char_offset++)
3560             {
3561               if (ASCII_BYTE_P (*src_base))
3562                 *charbuf++ = *src_base;
3563               else
3564                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3565             }
3566         }
3567       else if (composition_state == COMPOSING_NO)
3568         {
3569           *charbuf++ = c;
3570           char_offset++;
3571         }
3572       else
3573         {
3574           components[component_idx++] = c;
3575           if (method == COMPOSITION_WITH_RULE
3576               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3577                   && composition_state == COMPOSING_COMPONENT_CHAR))
3578             composition_state++;
3579         }
3580       continue;
3581
3582     invalid_code:
3583       MAYBE_FINISH_COMPOSITION ();
3584       src = src_base;
3585       consumed_chars = consumed_chars_base;
3586       ONE_MORE_BYTE (c);
3587       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3588       char_offset++;
3589       coding->errors++;
3590       continue;
3591
3592     break_loop:
3593       break;
3594     }
3595
3596  no_more_source:
3597   if (last_id != charset_ascii)
3598     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3599   coding->consumed_char += consumed_chars_base;
3600   coding->consumed = src_base - coding->source;
3601   coding->charbuf_used = charbuf - coding->charbuf;
3602 }
3603
3604
3605 /* ISO2022 encoding stuff.  */
3606
3607 /*
3608    It is not enough to say just "ISO2022" on encoding, we have to
3609    specify more details.  In Emacs, each coding system of ISO2022
3610    variant has the following specifications:
3611         1. Initial designation to G0 thru G3.
3612         2. Allows short-form designation?
3613         3. ASCII should be designated to G0 before control characters?
3614         4. ASCII should be designated to G0 at end of line?
3615         5. 7-bit environment or 8-bit environment?
3616         6. Use locking-shift?
3617         7. Use Single-shift?
3618    And the following two are only for Japanese:
3619         8. Use ASCII in place of JIS0201-1976-Roman?
3620         9. Use JISX0208-1983 in place of JISX0208-1978?
3621    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3622    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3623    details.
3624 */
3625
/* Emit the escape sequence that designates CHARSET to graphic
   register REG of CODING, and record the designation in CODING.

   The sequence is: an optional revision prefix ESC & <'@' + revision>
   (only when CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative revision), then ESC, then `$' for a charset whose
   dimension is not 1, then an intermediate byte selected by REG, then
   the final byte of CHARSET.  For a 94-char multibyte charset
   designated to register 0 with final byte '@', 'A', or 'B', the
   intermediate byte is omitted (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    /* Intermediate bytes for the four registers, indexed by REG.  */   \
    const char *iso_inter                                               \
      = (CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+");             \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Skip the intermediate byte only in the short-form case.  */  \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM    \
            || reg != 0                                                 \
            || iso_final < '@' || iso_final > 'B')                      \
          EMIT_ONE_ASCII_BYTE (iso_inter[reg]);                         \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (iso_inter[reg]);                             \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3673
3674
3675 /* The following two macros produce codes (control character or escape
3676    sequence) for ISO2022 single-shift functions (single-shift-2 and
3677    single-shift-3).  */
3678
/* Emit single-shift-2: the single byte ISO_CODE_SS2, or ESC N when
   CODING is restricted to seven bits.  Then mark CODING as being in
   a single shift.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3687
3688
/* Emit single-shift-3: the single byte ISO_CODE_SS3, or ESC O when
   CODING is restricted to seven bits.  Then mark CODING as being in
   a single shift.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3697
3698
3699 /* The following four macros produce codes (control character or
3700    escape sequence) for ISO2022 locking-shift functions (shift-in,
3701    shift-out, locking-shift-2, and locking-shift-3).  */
3702
/* Emit shift-in (ISO_CODE_SI) and record that graphic register 0 is
   now the one invoked to graphic plane 0 of CODING.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
3708
3709
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now the one invoked to graphic plane 0 of CODING.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
3715
3716
/* Emit locking-shift-2 (ESC n) and record that graphic register 2 is
   now the one invoked to graphic plane 0 of CODING.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
3722
3723
/* Emit locking-shift-3 and record that graphic register 3 is now the
   one invoked to graphic plane 0 of CODING.  The sequence is ESC o;
   ESC n is locking-shift-2, and the decoder in this file maps `o' to
   invocation 3 accordingly.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3729
3730
3731 /* Produce codes for a DIMENSION1 character whose character set is
3732    CHARSET and whose position-code is C1.  Designation and invocation
3733    sequences are also produced in advance if necessary.

        CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set
        and CHARSET is ASCII, it is reassigned to the jisx0201-roman
        charset.  C1 may be any integer expression; it is parenthesized at
        every use.  The expanding scope must provide `coding', `dst' and
        `produced_chars'.

        The body loops until one branch emits the byte and breaks out:
        a pending single shift, CHARSET invoked to plane 0 (7-bit form),
        or CHARSET invoked to plane 1 (8th bit set).  Otherwise
        encode_invocation_designation is called and the tests repeat.  */
3734
3735 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3736   do {                                                                  \
3737     int id = CHARSET_ID (charset);                                      \
3738                                                                         \
3739     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3740         && id == charset_ascii)                                         \
3741       {                                                                 \
3742         id = charset_jisx0201_roman;                                    \
3743         charset = CHARSET_FROM_ID (id);                                 \
3744       }                                                                 \
3745                                                                         \
3746     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3747       {                                                                 \
3748         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3749           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
3750         else                                                            \
3751           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
3752         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3753         break;                                                          \
3754       }                                                                 \
3755     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3756       {                                                                 \
3757         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
3758         break;                                                          \
3759       }                                                                 \
3760     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3761       {                                                                 \
3762         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
3763         break;                                                          \
3764       }                                                                 \
3765     else                                                                \
3766       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3767          must invoke it, or, at first, designate it to some graphic     \
3768          register.  Then repeat the loop to actually produce the        \
3769          character.  */                                                 \
3770       dst = encode_invocation_designation (charset, coding, dst,        \
3771                                            &produced_chars);            \
3772   } while (1)
3773
3774
3775 /* Produce codes for a DIMENSION2 character whose character set is
3776    CHARSET and whose position-codes are C1 and C2.  Designation and
3777    invocation codes are also produced in advance if necessary.

        CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set
        and CHARSET is jisx0208, it is reassigned to the jisx0208-1978
        charset.  The expanding scope must provide `coding', `dst' and
        `produced_chars'.

        The body loops until one branch emits the two bytes and breaks
        out: a pending single shift, CHARSET invoked to plane 0 (7-bit
        form), or CHARSET invoked to plane 1 (8th bit set on both bytes).
        Otherwise encode_invocation_designation is called and the tests
        repeat.  */
3778
3779 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3780   do {                                                                  \
3781     int id = CHARSET_ID (charset);                                      \
3782                                                                         \
3783     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3784         && id == charset_jisx0208)                                      \
3785       {                                                                 \
3786         id = charset_jisx0208_1978;                                     \
3787         charset = CHARSET_FROM_ID (id);                                 \
3788       }                                                                 \
3789                                                                         \
3790     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3791       {                                                                 \
3792         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3793           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3794         else                                                            \
3795           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3796         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3797         break;                                                          \
3798       }                                                                 \
3799     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3800       {                                                                 \
3801         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3802         break;                                                          \
3803       }                                                                 \
3804     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3805       {                                                                 \
3806         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3807         break;                                                          \
3808       }                                                                 \
3809     else                                                                \
3810       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3811          must invoke it, or, at first, designate it to some graphic     \
3812          register.  Then repeat the loop to actually produce the        \
3813          character.  */                                                 \
3814       dst = encode_invocation_designation (charset, coding, dst,        \
3815                                            &produced_chars);            \
3816   } while (1)
3817
3818
     /* Encode character C of charset CHARSET.  ENCODE_CHAR yields the code
        point of C in CHARSET.  A dimension-1 charset passes that code on
        as the single position-code; any other dimension is handled as
        dimension 2, with the code split into its high part (code >> 8)
        and its low byte (code & 0xFF).  Declares a local named `code', so
        the arguments should not refer to a variable of that name.  */
3819 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3820   do {                                                                     \
3821     int code = ENCODE_CHAR ((charset),(c));                                \
3822                                                                            \
3823     if (CHARSET_DIMENSION (charset) == 1)                                  \
3824       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3825     else                                                                   \
3826       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3827   } while (0)
3828
3829
3830 /* Produce designation and invocation codes at a place pointed by DST
3831    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3832    Return new DST.

        *P_NCHARS is copied into the local `produced_chars' on entry and
        written back on exit.  The locals `multibytep', `produced_chars'
        and `dst' are not referenced directly by most of the body; they
        are presumably consumed by the EMIT_* and ENCODE_* macros expanded
        here (confirm against those macro definitions).

        Note that when the register is 2 or 3 and
        CODING_ISO_FLAG_SINGLE_SHIFT is set, a single shift is emitted
        instead of a locking shift, so the invocation state is left
        unchanged and only the single-shifting flag is set.  */
3833
3834 unsigned char *
3835 encode_invocation_designation (charset, coding, dst, p_nchars)
3836      struct charset *charset;
3837      struct coding_system *coding;
3838      unsigned char *dst;
3839      int *p_nchars;
3840 {
3841   int multibytep = coding->dst_multibyte;
3842   int produced_chars = *p_nchars;
3843   int reg;                      /* graphic register number */
3844   int id = CHARSET_ID (charset);
3845
3846   /* At first, check designations.  */
3847   for (reg = 0; reg < 4; reg++)
3848     if (id == CODING_ISO_DESIGNATION (coding, reg))
3849       break;
3850
3851   if (reg >= 4)
3852     {
3853       /* CHARSET is not yet designated to any graphic registers.  */
3854       /* At first check the requested designation.  */
3855       reg = CODING_ISO_REQUEST (coding, id);
3856       if (reg < 0)
3857         /* Since CHARSET requests no special designation, designate it
3858            to graphic register 0.  */
3859         reg = 0;
3860
3861       ENCODE_DESIGNATION (charset, reg, coding);
3862     }
3863
       /* REG now names the register holding CHARSET.  Nothing more is
          needed if either graphic plane already invokes it.  */
3864   if (CODING_ISO_INVOCATION (coding, 0) != reg
3865       && CODING_ISO_INVOCATION (coding, 1) != reg)
3866     {
3867       /* Since the graphic register REG is not invoked to any graphic
3868          planes, invoke it to graphic plane 0.  */
3869       switch (reg)
3870         {
3871         case 0:                 /* graphic register 0 */
3872           ENCODE_SHIFT_IN;
3873           break;
3874
3875         case 1:                 /* graphic register 1 */
3876           ENCODE_SHIFT_OUT;
3877           break;
3878
3879         case 2:                 /* graphic register 2 */
3880           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3881             ENCODE_SINGLE_SHIFT_2;
3882           else
3883             ENCODE_LOCKING_SHIFT_2;
3884           break;
3885
3886         case 3:                 /* graphic register 3 */
3887           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3888             ENCODE_SINGLE_SHIFT_3;
3889           else
3890             ENCODE_LOCKING_SHIFT_3;
3891           break;
3892         }
3893     }
3894
       /* Hand the (possibly advanced) character count back to the caller.  */
3895   *p_nchars = produced_chars;
3896   return dst;
3897 }
3898
3899 /* The following three macros produce codes for indicating direction
3900    of text.  */
3901 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3902   do {                                                                  \
3903     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3904       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3905     else                                                                \
3906       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3907   } while (0)
3908
3909
3910 #define ENCODE_DIRECTION_R2L()                  \
3911   do {                                          \
3912     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3913     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3914   } while (0)
3915
3916
3917 #define ENCODE_DIRECTION_L2R()                  \
3918   do {                                          \
3919     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3920     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3921   } while (0)
3922
3923
3924 /* Produce codes for designation and invocation to reset the graphic
3925    planes and registers to initial state.

        First, if graphic plane 0 does not invoke register 0, a shift-in
        is emitted.  Then, for each of the four registers that has an
        initial charset (CODING_ISO_INITIAL >= 0) different from its
        current designation, that initial charset is designated again.
        Plane 1 is not touched here.

        Declares locals named `reg' and `charset' inside its own block;
        the expanding scope must provide `coding' and whatever
        ENCODE_SHIFT_IN and ENCODE_DESIGNATION reference.  */
3926 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3927   do {                                                                  \
3928     int reg;                                                            \
3929     struct charset *charset;                                            \
3930                                                                         \
3931     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3932       ENCODE_SHIFT_IN;                                                  \
3933     for (reg = 0; reg < 4; reg++)                                       \
3934       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3935           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3936               != CODING_ISO_INITIAL (coding, reg)))                     \
3937         {                                                               \
3938           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3939           ENCODE_DESIGNATION (charset, reg, coding);                    \
3940         }                                                               \
3941   } while (0)
3942
3943
3944 /* Produce designation sequences of charsets in the line starting at
3945    CHARBUF to a place pointed by DST, and return updated DST.
3946
3947    The scan stops at the first newline, at CHARBUF_END, or as soon as
3948    all four graphic registers have a requested charset.  If the current
        block ends before any end-of-line, we may fail to find all the
        necessary designations.

        Only charsets that request a specific register (CODING_ISO_REQUEST
        >= 0) are considered, and the first such charset seen for a
        register wins.  A designation is emitted only when it differs from
        the register's current designation.  */
3949
3950 static unsigned char *
3951 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3952      struct coding_system *coding;
3953      int *charbuf, *charbuf_end;
3954      unsigned char *dst;
3955 {
3956   struct charset *charset;
3957   /* Table of charsets to be designated to each graphic register.  */
3958   int r[4];
3959   int c, found = 0, reg;
       /* `produced_chars' and `multibytep' are presumably used by the
          macros that ENCODE_DESIGNATION expands to (confirm there); the
          caller derives its own count from the change in DST.  */
3960   int produced_chars = 0;
3961   int multibytep = coding->dst_multibyte;
3962   Lisp_Object attrs;
3963   Lisp_Object charset_list;
3964
3965   attrs = CODING_ID_ATTRS (coding->id);
3966   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3967   if (EQ (charset_list, Qiso_2022))
3968     charset_list = Viso_2022_charset_list;
3969
3970   for (reg = 0; reg < 4; reg++)
3971     r[reg] = -1;
3972
       /* Never read at or beyond CHARBUF_END: the last line of a block
          need not be terminated by a newline.  */
3973   while (found < 4 && charbuf < charbuf_end)
3974     {
3975       int id;
3976
3977       c = *charbuf++;
3978       if (c == '\n')
3979         break;
           if (c < 0)
             {
               /* An annotation, laid out as in encode_coding_iso_2022:
                  -C elements in total, one of which was just read.
                  Skip the rest instead of treating them as characters.  */
               if (-c - 1 > charbuf_end - charbuf)
                 break;
               charbuf += -c - 1;
               continue;
             }
3980       charset = char_charset (c, charset_list, NULL);
           if (! charset)
             /* C belongs to no charset of this coding system, so it cannot
                request a designation.  */
             continue;
3981       id = CHARSET_ID (charset);
3982       reg = CODING_ISO_REQUEST (coding, id);
3983       if (reg >= 0 && r[reg] < 0)
3984         {
3985           found++;
3986           r[reg] = id;
3987         }
3988     }
3989
3990   if (found)
3991     {
3992       for (reg = 0; reg < 4; reg++)
3993         if (r[reg] >= 0
3994             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3995           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3996     }
3997
3998   return dst;
3999 }
4000
4001 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO 2022 bytes, starting at CODING->destination + CODING->produced.
        On the normal exit path this records CODING_RESULT_SUCCESS, saves
        the beginning-of-line designation state with CODING_ISO_BOL, adds
        the local character count to CODING->produced_char, stores the new
        byte count in CODING->produced, and returns 0.

        ASSURE_DESTINATION and the EMIT_* macros are defined elsewhere;
        presumably they use the locals `dst', `dst_end', `produced_chars'
        and `multibytep' declared here (confirm against their
        definitions).  */
4002
4003 static int
4004 encode_coding_iso_2022 (coding)
4005      struct coding_system *coding;
4006 {
4007   int multibytep = coding->dst_multibyte;
4008   int *charbuf = coding->charbuf;
4009   int *charbuf_end = charbuf + coding->charbuf_used;
4010   unsigned char *dst = coding->destination + coding->produced;
4011   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4012   int safe_room = 16;
       /* Nonzero when designations must be produced before the next
          character: the coding system designates at beginning of line and
          the saved state says we are at one.  */
4013   int bol_designation
4014     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4015        && CODING_ISO_BOL (coding));
4016   int produced_chars = 0;
4017   Lisp_Object attrs, eol_type, charset_list;
4018   int ascii_compatible;
4019   int c;
       /* Charset id requested by the most recent charset annotation, or
          -1 when there is none or it is not in CHARSET_LIST.  */
4020   int preferred_charset_id = -1;
4021
4022   CODING_GET_INFO (coding, attrs, charset_list);
4023   eol_type = CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued EOL type is treated as Qunix here.  */
4024   if (VECTORP (eol_type))
4025     eol_type = Qunix;
4026
4027   setup_iso_safe_charsets (attrs);
4028   /* Charset list may have been changed.  */
4029   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4030   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
4031
4032   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4033
4034   while (charbuf < charbuf_end)
4035     {
4036       ASSURE_DESTINATION (safe_room);
4037
4038       if (bol_designation)
4039         {
4040           unsigned char *dst_prev = dst;
4041
4042           /* We have to produce designation sequences if any now.  */
4043           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4044           bol_designation = 0;
4045           /* We are sure that designation sequences are all ASCII bytes.  */
4046           produced_chars += dst - dst_prev;
4047         }
4048
4049       c = *charbuf++;
4050
4051       if (c < 0)
4052         {
4053           /* Handle an annotation.  */
               /* -C is the annotation's total length in charbuf elements,
                  counting the element just read; the next element holds
                  the annotation kind.  */
4054           switch (*charbuf)
4055             {
4056             case CODING_ANNOTATE_COMPOSITION_MASK:
4057               /* Not yet implemented.  */
4058               break;
4059             case CODING_ANNOTATE_CHARSET_MASK:
4060               preferred_charset_id = charbuf[2];
                   /* Ignore a preference for a charset this coding system
                      does not list.  */
4061               if (preferred_charset_id >= 0
4062                   && NILP (Fmemq (make_number (preferred_charset_id),
4063                                   charset_list)))
4064                 preferred_charset_id = -1;
4065               break;
4066             default:
4067               abort ();
4068             }
               /* Skip the remaining elements of the annotation.  */
4069           charbuf += -c - 1;
4070           continue;
4071         }
4072
4073       /* Now encode the character C.  */
4074       if (c < 0x20 || c == 0x7F)
4075         {
               /* A control character.  A newline (or a carriage return
                  when the EOL type is Qmac) ends the line.  */
4076           if (c == '\n'
4077               || (c == '\r' && EQ (eol_type, Qmac)))
4078             {
4079               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4080                 ENCODE_RESET_PLANE_AND_REGISTER ();
4081               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4082                 {
4083                   int i;
4084
                       /* Restore the recorded designations to their
                          initial values; no bytes are emitted for this.  */
4085                   for (i = 0; i < 4; i++)
4086                     CODING_ISO_DESIGNATION (coding, i)
4087                       = CODING_ISO_INITIAL (coding, i);
4088                 }
4089               bol_designation
4090                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4091             }
4092           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4093             ENCODE_RESET_PLANE_AND_REGISTER ();
4094           EMIT_ONE_ASCII_BYTE (c);
4095         }
4096       else if (ASCII_CHAR_P (c))
4097         {
4098           if (ascii_compatible)
4099             EMIT_ONE_ASCII_BYTE (c);
4100           else
4101             {
4102               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4103               ENCODE_ISO_CHARACTER (charset, c);
4104             }
4105         }
4106       else if (CHAR_BYTE8_P (c))
4107         {
               /* A raw 8-bit byte: emit it unchanged.  */
4108           c = CHAR_TO_BYTE8 (c);
4109           EMIT_ONE_BYTE (c);
4110         }
4111       else
4112         {
4113           struct charset *charset;
4114
               /* Use the preferred charset when it contains C; otherwise
                  look C up in CHARSET_LIST.  */
4115           if (preferred_charset_id >= 0)
4116             {
4117               charset = CHARSET_FROM_ID (preferred_charset_id);
4118               if (! CHAR_CHARSET_P (c, charset))
4119                 charset = char_charset (c, charset_list, NULL);
4120             }
4121           else
4122             charset = char_charset (c, charset_list, NULL);
4123           if (!charset)
4124             {
                   /* C cannot be encoded.  In safe-encoding mode it is
                      replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION
                      encoded as ASCII; otherwise by the coding system's
                      default character.  */
4125               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4126                 {
4127                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4128                   charset = CHARSET_FROM_ID (charset_ascii);
4129                 }
4130               else
4131                 {
4132                   c = coding->default_char;
4133                   charset = char_charset (c, charset_list, NULL);
4134                 }
4135             }
4136           ENCODE_ISO_CHARACTER (charset, c);
4137         }
4138     }
4139
       /* At the end of the last block, return the planes and registers to
          their initial state if the coding system resets at end of line.  */
4140   if (coding->mode & CODING_MODE_LAST_BLOCK
4141       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4142     {
4143       ASSURE_DESTINATION (safe_room);
4144       ENCODE_RESET_PLANE_AND_REGISTER ();
4145     }
4146   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4147   CODING_ISO_BOL (coding) = bol_designation;
4148   coding->produced_char += produced_chars;
4149   coding->produced = dst - coding->destination;
4150   return 0;
4151 }
4152
4153 \f
4154 /*** 8. SJIS and BIG5 handlers ***/
4155
4156 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4157    quite widely.  So, for the moment, Emacs supports them in the bare
4158    C code.  But, in the future, they may be supported only by CCL.  */
4159
4160 /* SJIS is a coding system encoding three character sets: ASCII, the
4161    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4162    encoded as is.  A character of charset katakana-jisx0201 is encoded
4163    as "position-code + 0x80".  A character of charset japanese-jisx0208
4164    is encoded in two bytes, but its two position-codes are divided and
4165    shifted so that they fit in the ranges below.
4166
4167    --- CODE RANGE of SJIS ---
4168    (character set)      (range)
4169    ASCII                0x00 .. 0x7F
4170    KATAKANA-JISX0201    0xA0 .. 0xDF
4171    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4172             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4173    -------------------------------
4174
4175 */
4176
4177 /* BIG5 is a coding system encoding two character sets: ASCII and
4178    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4179    character set, and each of its characters is encoded in two bytes.
4180
4181    --- CODE RANGE of BIG5 ---
4182    (character set)      (range)
4183    ASCII                0x00 .. 0x7F
4184    Big5 (1st byte)      0xA1 .. 0xFE
4185         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4186    --------------------------
4187
4188   */
4189
4190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4191    Check if a text is encoded in SJIS.  CATEGORY_MASK_SJIS is always
4192    added to DETECT_INFO->checked.

        Return 0, with CATEGORY_MASK_SJIS added to DETECT_INFO->rejected,
        when a byte that is invalid in SJIS is seen, or when the source
        ends in the middle of a two-byte sequence and this is the last
        block.  Otherwise return 1 once the source is exhausted;
        CATEGORY_MASK_SJIS is then added to DETECT_INFO->found only if at
        least one kana byte or complete two-byte sequence was seen.

        ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
        `no_more_source' when the source is exhausted (confirm against its
        definition).  */
4193
4194 static int
4195 detect_coding_sjis (coding, detect_info)
4196      struct coding_system *coding;
4197      struct coding_detection_info *detect_info;
4198 {
4199   const unsigned char *src = coding->source, *src_base;
4200   const unsigned char *src_end = coding->source + coding->src_bytes;
4201   int multibytep = coding->src_multibyte;
4202   int consumed_chars = 0;
4203   int found = 0;
4204   int c;
4205
4206   detect_info->checked |= CATEGORY_MASK_SJIS;
4207   /* A coding system of this category is always ASCII compatible.  */
4208   src += coding->head_ascii;
4209
4210   while (1)
4211     {
           /* Remember where this (possibly two-byte) unit starts.  */
4212       src_base = src;
4213       ONE_MORE_BYTE (c);
4214       if (c < 0x80)
4215         continue;
4216       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4217         {
               /* Lead byte of a two-byte sequence; validate the trail byte.  */
4218           ONE_MORE_BYTE (c);
4219           if (c < 0x40 || c == 0x7F || c > 0xFC)
4220             break;
4221           found = CATEGORY_MASK_SJIS;
4222         }
4223       else if (c >= 0xA0 && c < 0xE0)
             /* A single byte in the kana range.  */
4224         found = CATEGORY_MASK_SJIS;
4225       else
4226         break;
4227     }
       /* Reached only through `break': an invalid byte was seen.  */
4228   detect_info->rejected |= CATEGORY_MASK_SJIS;
4229   return 0;
4230
4231  no_more_source:
       /* SRC_BASE < SRC means a lead byte was read but its trail byte is
          missing; in the last block that is an error.  */
4232   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4233     {
4234       detect_info->rejected |= CATEGORY_MASK_SJIS;
4235       return 0;
4236     }
4237   detect_info->found |= found;
4238   return 1;
4239 }
4240
4241 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4242    Check if a text is encoded in BIG5.  CATEGORY_MASK_BIG5 is always
4243    added to DETECT_INFO->checked.

        Return 0, with CATEGORY_MASK_BIG5 added to DETECT_INFO->rejected,
        when a byte outside the BIG5 code ranges is seen (lead byte
        0xA1..0xFE, trail byte 0x40..0x7E or 0xA1..0xFE), or when the
        source ends in the middle of a two-byte sequence and this is the
        last block.  Otherwise return 1 once the source is exhausted;
        CATEGORY_MASK_BIG5 is then added to DETECT_INFO->found only if at
        least one complete two-byte sequence was seen.

        ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
        `no_more_source' when the source is exhausted (confirm against its
        definition).  */
4244
4245 static int
4246 detect_coding_big5 (coding, detect_info)
4247      struct coding_system *coding;
4248      struct coding_detection_info *detect_info;
4249 {
4250   const unsigned char *src = coding->source, *src_base;
4251   const unsigned char *src_end = coding->source + coding->src_bytes;
4252   int multibytep = coding->src_multibyte;
4253   int consumed_chars = 0;
4254   int found = 0;
4255   int c;
4256
4257   detect_info->checked |= CATEGORY_MASK_BIG5;
4258   /* A coding system of this category is always ASCII compatible.  */
4259   src += coding->head_ascii;
4260
4261   while (1)
4262     {
           /* Remember where this (possibly two-byte) unit starts.  */
4263       src_base = src;
4264       ONE_MORE_BYTE (c);
4265       if (c < 0x80)
4266         continue;
4267       if (c >= 0xA1 && c <= 0xFE)
4268         {
4269           ONE_MORE_BYTE (c);
               /* An invalid trail byte must mark the category as rejected,
                  exactly like an invalid lead byte does, so leave the loop
                  instead of returning directly.  */
4270           if (c < 0x40 || (c >= 0x7F && c <= 0xA0) || c > 0xFE)
4271             break;
4272           found = CATEGORY_MASK_BIG5;
4273         }
4274       else
4275         break;
4276     }
       /* Reached only through `break': an invalid byte was seen.  */
4277   detect_info->rejected |= CATEGORY_MASK_BIG5;
4278   return 0;
4279
4280  no_more_source:
       /* SRC_BASE < SRC means a lead byte was read but its trail byte is
          missing; in the last block that is an error.  */
4281   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4282     {
4283       detect_info->rejected |= CATEGORY_MASK_BIG5;
4284       return 0;
4285     }
4286   detect_info->found |= found;
4287   return 1;
4288 }
4289
4290 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4291    Decode SJIS text from CODING->source, starting at CODING->consumed,
        into CODING->charbuf, starting at CODING->charbuf_used.

        The charset list of CODING supplies, in order, the roman, kana and
        kanji charsets, optionally followed by a second kanji charset used
        for lead bytes 0xF0..0xFC.  A byte that does not start a valid
        sequence is stored as a raw 8-bit character and counted in
        CODING->errors.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe what was decoded.  A byte that was
        only pre-read after a carriage return is never counted as consumed
        until it has been decoded itself.  */
4292
4293 static void
4294 decode_coding_sjis (coding)
4295      struct coding_system *coding;
4296 {
4297   const unsigned char *src = coding->source + coding->consumed;
4298   const unsigned char *src_end = coding->source + coding->src_bytes;
4299   const unsigned char *src_base;
4300   int *charbuf = coding->charbuf + coding->charbuf_used;
4301   int *charbuf_end
4302     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4303   int consumed_chars = 0, consumed_chars_base;
4304   int multibytep = coding->src_multibyte;
4305   struct charset *charset_roman, *charset_kanji, *charset_kana;
4306   struct charset *charset_kanji2;
4307   Lisp_Object attrs, charset_list, val;
4308   int char_offset = coding->produced_char;
4309   int last_offset = char_offset;
4310   int last_id = charset_ascii;
4311   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte pre-read after a carriage return, or -1 if none.  */
4312   int byte_after_cr = -1;
       /* How many source bytes and characters that pre-read consumed.
          Kept as counts rather than as a saved pointer because
          CODING_DECODE_CHAR is handed SRC and SRC_BASE and may adjust
          them.  */
       int cr_lookahead_bytes = 0, cr_lookahead_chars = 0;
4313
4314   CODING_GET_INFO (coding, attrs, charset_list);
4315
4316   val = charset_list;
4317   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4318   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4319   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4321
4322   while (1)
4323     {
4324       int c, c1;
4325       struct charset *charset;
4326
4327       src_base = src;
4328       consumed_chars_base = consumed_chars;
           if (byte_after_cr >= 0)
             {
               /* SRC is already past the pre-read byte, but that byte has
                  not been decoded yet.  Make the bases point at it so that
                  stopping here (full charbuf or end of source) does not
                  report it as consumed, and so that `invalid_code'
                  re-reads it rather than the byte that follows.  */
               src_base -= cr_lookahead_bytes;
               consumed_chars_base -= cr_lookahead_chars;
             }
4329
4330       if (charbuf >= charbuf_end)
4331         break;
4332
4333       if (byte_after_cr >= 0)
4334         c = byte_after_cr, byte_after_cr = -1;
4335       else
4336         ONE_MORE_BYTE (c);
4337       if (c < 0)
4338         goto invalid_code;
4339       if (c < 0x80)
4340         {
4341           if (eol_crlf && c == '\r')
                 {
                   const unsigned char *src_before = src;
                   int chars_before = consumed_chars;

                   /* Pre-read the byte after the carriage return and
                      remember how much input that took.  */
4342               ONE_MORE_BYTE (byte_after_cr);
                   cr_lookahead_bytes = src - src_before;
                   cr_lookahead_chars = consumed_chars - chars_before;
                 }
4343           charset = charset_roman;
4344         }
4345       else if (c == 0x80 || c == 0xA0)
4346         goto invalid_code;
4347       else if (c >= 0xA1 && c <= 0xDF)
4348         {
4349           /* SJIS -> JISX0201-Kana */
4350           c &= 0x7F;
4351           charset = charset_kana;
4352         }
4353       else if (c <= 0xEF)
4354         {
4355           /* SJIS -> JISX0208 */
4356           ONE_MORE_BYTE (c1);
4357           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4358             goto invalid_code;
4359           c = (c << 8) | c1;
4360           SJIS_TO_JIS (c);
4361           charset = charset_kanji;
4362         }
4363       else if (c <= 0xFC && charset_kanji2)
4364         {
4365           /* SJIS -> JISX0213-2 */
4366           ONE_MORE_BYTE (c1);
4367           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4368             goto invalid_code;
4369           c = (c << 8) | c1;
4370           SJIS_TO_JIS2 (c);
4371           charset = charset_kanji2;
4372         }
4373       else
4374         goto invalid_code;
           /* Record a charset annotation whenever a run of characters from
              one non-ASCII charset ends.  */
4375       if (charset->id != charset_ascii
4376           && last_id != charset->id)
4377         {
4378           if (last_id != charset_ascii)
4379             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4380           last_id = charset->id;
4381           last_offset = char_offset;
4382         }
4383       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4384       *charbuf++ = c;
4385       char_offset++;
4386       continue;
4387
4388     invalid_code:
           /* Go back to the start of the offending unit and store its
              first byte as a raw 8-bit character.  */
4389       src = src_base;
4390       consumed_chars = consumed_chars_base;
4391       ONE_MORE_BYTE (c);
4392       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4393       char_offset++;
4394       coding->errors++;
4395     }
4396
4397  no_more_source:
4398   if (last_id != charset_ascii)
4399     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4400   coding->consumed_char += consumed_chars_base;
4401   coding->consumed = src_base - coding->source;
4402   coding->charbuf_used = charbuf - coding->charbuf;
4403 }
4404
4405 static void
4406 decode_coding_big5 (coding)
4407      struct coding_system *coding;
4408 {
4409   const unsigned char *src = coding->source + coding->consumed;
4410   const unsigned char *src_end = coding->source + coding->src_bytes;
4411   const unsigned char *src_base;
4412   int *charbuf = coding->charbuf + coding->charbuf_used;
4413   int *charbuf_end
4414     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4415   int consumed_chars = 0, consumed_chars_base;
4416   int multibytep = coding->src_multibyte;
4417   struct charset *charset_roman, *charset_big5;
4418   Lisp_Object attrs, charset_list, val;
4419   int char_offset = coding->produced_char;
4420   int last_offset = char_offset;
4421   int last_id = charset_ascii;
4422   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4423   int byte_after_cr = -1;
4424
4425   CODING_GET_INFO (coding, attrs, charset_list);
4426   val = charset_list;
4427   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4428   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4429
4430   while (1)
4431     {
4432       int c, c1;
4433       struct charset *charset;
4434
4435       src_base = src;
4436       consumed_chars_base = consumed_chars;
4437
4438       if (charbuf >= charbuf_end)
4439         break;
4440
4441       if (byte_after_cr >= 0)
4442         c = byte_after_cr, byte_after_cr = -1;
4443       else
4444         ONE_MORE_BYTE (c);
4445
4446       if (c < 0)
4447         goto invalid_code;
4448       if (c < 0x80)
4449         {
4450           if (eol_crlf && c == '\r')
4451             ONE_MORE_BYTE (byte_after_cr);
4452           charset = charset_roman;
4453         }
4454       else
4455         {
4456           /* BIG5 -> Big5 */
4457           if (c < 0xA1 || c > 0xFE)
4458             goto invalid_code;
4459           ONE_MORE_BYTE (c1);
4460           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4461             goto invalid_code;
4462           c = c << 8 | c1;
4463           charset = charset_big5;
4464         }
4465       if (charset->id != charset_ascii
4466           && last_id != charset->id)
4467         {
4468           if (last_id != charset_ascii)
4469             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4470           last_id = charset->id;
4471           last_offset = char_offset;
4472         }
4473       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4474       *charbuf++ = c;
4475       char_offset++;
4476       continue;
4477
4478     invalid_code:
4479       src = src_base;
4480       consumed_chars = consumed_chars_base;
4481       ONE_MORE_BYTE (c);
4482       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4483       char_offset++;
4484       coding->errors++;
4485     }
4486
4487  no_more_source:
4488   if (last_id != charset_ascii)
4489     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4490   coding->consumed_char += consumed_chars_base;
4491   coding->consumed = src_base - coding->source;
4492   coding->charbuf_used = charbuf - coding->charbuf;
4493 }
4494
4495 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4496    This function can encode charsets `ascii', `katakana-jisx0201',
4497    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4498    are sure that all these charsets are registered as official charset
4499    (i.e. do not have extended leading-codes).  Characters of other
4500    charsets are produced without any encoding.  If SJIS_P is 1, encode
4501    SJIS text, else encode BIG5 text.  */
4502
4503 static int
4504 encode_coding_sjis (coding)
4505      struct coding_system *coding;
4506 {
4507   int multibytep = coding->dst_multibyte;
4508   int *charbuf = coding->charbuf;
4509   int *charbuf_end = charbuf + coding->charbuf_used;
4510   unsigned char *dst = coding->destination + coding->produced;
4511   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4512   int safe_room = 4;
4513   int produced_chars = 0;
4514   Lisp_Object attrs, charset_list, val;
4515   int ascii_compatible;
4516   struct charset *charset_roman, *charset_kanji, *charset_kana;
4517   struct charset *charset_kanji2;
4518   int c;
4519
4520   CODING_GET_INFO (coding, attrs, charset_list);
4521   val = charset_list;
4522   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4523   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4524   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4525   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4526
4527   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4528
4529   while (charbuf < charbuf_end)
4530     {
4531       ASSURE_DESTINATION (safe_room);
4532       c = *charbuf++;
4533       /* Now encode the character C.  */
4534       if (ASCII_CHAR_P (c) && ascii_compatible)
4535         EMIT_ONE_ASCII_BYTE (c);
4536       else if (CHAR_BYTE8_P (c))
4537         {
4538           c = CHAR_TO_BYTE8 (c);
4539           EMIT_ONE_BYTE (c);
4540         }
4541       else
4542         {
4543           unsigned code;
4544           struct charset *charset = char_charset (c, charset_list, &code);
4545
4546           if (!charset)
4547             {
4548               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4549                 {
4550                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4551                   charset = CHARSET_FROM_ID (charset_ascii);
4552                 }
4553               else
4554                 {
4555                   c = coding->default_char;
4556                   charset = char_charset (c, charset_list, &code);
4557                 }
4558             }
4559           if (code == CHARSET_INVALID_CODE (charset))
4560             abort ();
4561           if (charset == charset_kanji)
4562             {
4563               int c1, c2;
4564               JIS_TO_SJIS (code);
4565               c1 = code >> 8, c2 = code & 0xFF;
4566               EMIT_TWO_BYTES (c1, c2);
4567             }
4568           else if (charset == charset_kana)
4569             EMIT_ONE_BYTE (code | 0x80);
4570           else if (charset_kanji2 && charset == charset_kanji2)
4571             {
4572               int c1, c2;
4573
4574               c1 = code >> 8;
4575               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4576                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4577                 {
4578                   JIS_TO_SJIS2 (code);
4579                   c1 = code >> 8, c2 = code & 0xFF;
4580                   EMIT_TWO_BYTES (c1, c2);
4581                 }
4582               else
4583                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4584             }
4585           else
4586             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4587         }
4588     }
4589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4590   coding->produced_char += produced_chars;
4591   coding->produced = dst - coding->destination;
4592   return 0;
4593 }
4594
4595 static int
4596 encode_coding_big5 (coding)
4597      struct coding_system *coding;
4598 {
4599   int multibytep = coding->dst_multibyte;
4600   int *charbuf = coding->charbuf;
4601   int *charbuf_end = charbuf + coding->charbuf_used;
4602   unsigned char *dst = coding->destination + coding->produced;
4603   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4604   int safe_room = 4;
4605   int produced_chars = 0;
4606   Lisp_Object attrs, charset_list, val;
4607   int ascii_compatible;
4608   struct charset *charset_roman, *charset_big5;
4609   int c;
4610
4611   CODING_GET_INFO (coding, attrs, charset_list);
4612   val = charset_list;
4613   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4614   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4615   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4616
4617   while (charbuf < charbuf_end)
4618     {
4619       ASSURE_DESTINATION (safe_room);
4620       c = *charbuf++;
4621       /* Now encode the character C.  */
4622       if (ASCII_CHAR_P (c) && ascii_compatible)
4623         EMIT_ONE_ASCII_BYTE (c);
4624       else if (CHAR_BYTE8_P (c))
4625         {
4626           c = CHAR_TO_BYTE8 (c);
4627           EMIT_ONE_BYTE (c);
4628         }
4629       else
4630         {
4631           unsigned code;
4632           struct charset *charset = char_charset (c, charset_list, &code);
4633
4634           if (! charset)
4635             {
4636               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4637                 {
4638                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4639                   charset = CHARSET_FROM_ID (charset_ascii);
4640                 }
4641               else
4642                 {
4643                   c = coding->default_char;
4644                   charset = char_charset (c, charset_list, &code);
4645                 }
4646             }
4647           if (code == CHARSET_INVALID_CODE (charset))
4648             abort ();
4649           if (charset == charset_big5)
4650             {
4651               int c1, c2;
4652
4653               c1 = code >> 8, c2 = code & 0xFF;
4654               EMIT_TWO_BYTES (c1, c2);
4655             }
4656           else
4657             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4658         }
4659     }
4660   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4661   coding->produced_char += produced_chars;
4662   coding->produced = dst - coding->destination;
4663   return 0;
4664 }
4665
4666 \f
/*** 9. CCL handlers ***/
4668
4669 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4670    Check if a text is encoded in a coding system of which
4671    encoder/decoder are written in CCL program.  If it is, return
4672    CATEGORY_MASK_CCL, else return 0.  */
4673
4674 static int
4675 detect_coding_ccl (coding, detect_info)
4676      struct coding_system *coding;
4677      struct coding_detection_info *detect_info;
4678 {
4679   const unsigned char *src = coding->source, *src_base;
4680   const unsigned char *src_end = coding->source + coding->src_bytes;
4681   int multibytep = coding->src_multibyte;
4682   int consumed_chars = 0;
4683   int found = 0;
4684   unsigned char *valids;
4685   int head_ascii = coding->head_ascii;
4686   Lisp_Object attrs;
4687
4688   detect_info->checked |= CATEGORY_MASK_CCL;
4689
4690   coding = &coding_categories[coding_category_ccl];
4691   valids = CODING_CCL_VALIDS (coding);
4692   attrs = CODING_ID_ATTRS (coding->id);
4693   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4694     src += head_ascii;
4695
4696   while (1)
4697     {
4698       int c;
4699
4700       src_base = src;
4701       ONE_MORE_BYTE (c);
4702       if (c < 0 || ! valids[c])
4703         break;
4704       if ((valids[c] > 1))
4705         found = CATEGORY_MASK_CCL;
4706     }
4707   detect_info->rejected |= CATEGORY_MASK_CCL;
4708   return 0;
4709
4710  no_more_source:
4711   detect_info->found |= found;
4712   return 1;
4713 }
4714
4715 static void
4716 decode_coding_ccl (coding)
4717      struct coding_system *coding;
4718 {
4719   const unsigned char *src = coding->source + coding->consumed;
4720   const unsigned char *src_end = coding->source + coding->src_bytes;
4721   int *charbuf = coding->charbuf + coding->charbuf_used;
4722   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4723   int consumed_chars = 0;
4724   int multibytep = coding->src_multibyte;
4725   struct ccl_program ccl;
4726   int source_charbuf[1024];
4727   int source_byteidx[1024];
4728   Lisp_Object attrs, charset_list;
4729
4730   CODING_GET_INFO (coding, attrs, charset_list);
4731   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4732
4733   while (src < src_end)
4734     {
4735       const unsigned char *p = src;
4736       int *source, *source_end;
4737       int i = 0;
4738
4739       if (multibytep)
4740         while (i < 1024 && p < src_end)
4741           {
4742             source_byteidx[i] = p - src;
4743             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4744           }
4745       else
4746         while (i < 1024 && p < src_end)
4747           source_charbuf[i++] = *p++;
4748
4749       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4750         ccl.last_block = 1;
4751
4752       source = source_charbuf;
4753       source_end = source + i;
4754       while (source < source_end)
4755         {
4756           ccl_driver (&ccl, source, charbuf,
4757                       source_end - source, charbuf_end - charbuf,
4758                       charset_list);
4759           source += ccl.consumed;
4760           charbuf += ccl.produced;
4761           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4762             break;
4763         }
4764       if (source < source_end)
4765         src += source_byteidx[source - source_charbuf];
4766       else
4767         src = p;
4768       consumed_chars += source - source_charbuf;
4769
4770       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4771           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4772         break;
4773     }
4774
4775   switch (ccl.status)
4776     {
4777     case CCL_STAT_SUSPEND_BY_SRC:
4778       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4779       break;
4780     case CCL_STAT_SUSPEND_BY_DST:
4781       break;
4782     case CCL_STAT_QUIT:
4783     case CCL_STAT_INVALID_CMD:
4784       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4785       break;
4786     default:
4787       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4788       break;
4789     }
4790   coding->consumed_char += consumed_chars;
4791   coding->consumed = src - coding->source;
4792   coding->charbuf_used = charbuf - coding->charbuf;
4793 }
4794
4795 static int
4796 encode_coding_ccl (coding)
4797      struct coding_system *coding;
4798 {
4799   struct ccl_program ccl;
4800   int multibytep = coding->dst_multibyte;
4801   int *charbuf = coding->charbuf;
4802   int *charbuf_end = charbuf + coding->charbuf_used;
4803   unsigned char *dst = coding->destination + coding->produced;
4804   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4805   int destination_charbuf[1024];
4806   int i, produced_chars = 0;
4807   Lisp_Object attrs, charset_list;
4808
4809   CODING_GET_INFO (coding, attrs, charset_list);
4810   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4811
4812   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4813   ccl.dst_multibyte = coding->dst_multibyte;
4814
4815   while (charbuf < charbuf_end)
4816     {
4817       ccl_driver (&ccl, charbuf, destination_charbuf,
4818                   charbuf_end - charbuf, 1024, charset_list);
4819       if (multibytep)
4820         {
4821           ASSURE_DESTINATION (ccl.produced * 2);
4822           for (i = 0; i < ccl.produced; i++)
4823             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4824         }
4825       else
4826         {
4827           ASSURE_DESTINATION (ccl.produced);
4828           for (i = 0; i < ccl.produced; i++)
4829             *dst++ = destination_charbuf[i] & 0xFF;
4830           produced_chars += ccl.produced;
4831         }
4832       charbuf += ccl.consumed;
4833       if (ccl.status == CCL_STAT_QUIT
4834           || ccl.status == CCL_STAT_INVALID_CMD)
4835         break;
4836     }
4837
4838   switch (ccl.status)
4839     {
4840     case CCL_STAT_SUSPEND_BY_SRC:
4841       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4842       break;
4843     case CCL_STAT_SUSPEND_BY_DST:
4844       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4845       break;
4846     case CCL_STAT_QUIT:
4847     case CCL_STAT_INVALID_CMD:
4848       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4849       break;
4850     default:
4851       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4852       break;
4853     }
4854
4855   coding->produced_char += produced_chars;
4856   coding->produced = dst - coding->destination;
4857   return 0;
4858 }
4859
4860
4861 \f
4862 /*** 10, 11. no-conversion handlers ***/
4863
4864 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4865
4866 static void
4867 decode_coding_raw_text (coding)
4868      struct coding_system *coding;
4869 {
4870   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4871
4872   coding->chars_at_source = 1;
4873   coding->consumed_char = coding->src_chars;
4874   coding->consumed = coding->src_bytes;
4875   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4876     {
4877       coding->consumed_char--;
4878       coding->consumed--;
4879       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4880     }
4881   else
4882     record_conversion_result (coding, CODING_RESULT_SUCCESS);
4883 }
4884
4885 static int
4886 encode_coding_raw_text (coding)
4887      struct coding_system *coding;
4888 {
4889   int multibytep = coding->dst_multibyte;
4890   int *charbuf = coding->charbuf;
4891   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4892   unsigned char *dst = coding->destination + coding->produced;
4893   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4894   int produced_chars = 0;
4895   int c;
4896
4897   if (multibytep)
4898     {
4899       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4900
4901       if (coding->src_multibyte)
4902         while (charbuf < charbuf_end)
4903           {
4904             ASSURE_DESTINATION (safe_room);
4905             c = *charbuf++;
4906             if (ASCII_CHAR_P (c))
4907               EMIT_ONE_ASCII_BYTE (c);
4908             else if (CHAR_BYTE8_P (c))
4909               {
4910                 c = CHAR_TO_BYTE8 (c);
4911                 EMIT_ONE_BYTE (c);
4912               }
4913             else
4914               {
4915                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4916
4917                 CHAR_STRING_ADVANCE (c, p1);
4918                 while (p0 < p1)
4919                   {
4920                     EMIT_ONE_BYTE (*p0);
4921                     p0++;
4922                   }
4923               }
4924           }
4925       else
4926         while (charbuf < charbuf_end)
4927           {
4928             ASSURE_DESTINATION (safe_room);
4929             c = *charbuf++;
4930             EMIT_ONE_BYTE (c);
4931           }
4932     }
4933   else
4934     {
4935       if (coding->src_multibyte)
4936         {
4937           int safe_room = MAX_MULTIBYTE_LENGTH;
4938
4939           while (charbuf < charbuf_end)
4940             {
4941               ASSURE_DESTINATION (safe_room);
4942               c = *charbuf++;
4943               if (ASCII_CHAR_P (c))
4944                 *dst++ = c;
4945               else if (CHAR_BYTE8_P (c))
4946                 *dst++ = CHAR_TO_BYTE8 (c);
4947               else
4948                 CHAR_STRING_ADVANCE (c, dst);
4949             }
4950         }
4951       else
4952         {
4953           ASSURE_DESTINATION (charbuf_end - charbuf);
4954           while (charbuf < charbuf_end && dst < dst_end)
4955             *dst++ = *charbuf++;
4956         }
4957       produced_chars = dst - (coding->destination + coding->produced);
4958     }
4959   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4960   coding->produced_char += produced_chars;
4961   coding->produced = dst - coding->destination;
4962   return 0;
4963 }
4964
4965 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4966    Check if a text is encoded in a charset-based coding system.  If it
4967    is, return 1, else return 0.  */
4968
4969 static int
4970 detect_coding_charset (coding, detect_info)
4971      struct coding_system *coding;
4972      struct coding_detection_info *detect_info;
4973 {
4974   const unsigned char *src = coding->source, *src_base;
4975   const unsigned char *src_end = coding->source + coding->src_bytes;
4976   int multibytep = coding->src_multibyte;
4977   int consumed_chars = 0;
4978   Lisp_Object attrs, valids;
4979   int found = 0;
4980   int head_ascii = coding->head_ascii;
4981
4982   detect_info->checked |= CATEGORY_MASK_CHARSET;
4983
4984   coding = &coding_categories[coding_category_charset];
4985   attrs = CODING_ID_ATTRS (coding->id);
4986   valids = AREF (attrs, coding_attr_charset_valids);
4987
4988   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4989     src += head_ascii;
4990
4991   while (1)
4992     {
4993       int c;
4994       Lisp_Object val;
4995       struct charset *charset;
4996       int dim, idx;
4997
4998       src_base = src;
4999       ONE_MORE_BYTE (c);
5000       if (c < 0)
5001         continue;
5002       val = AREF (valids, c);
5003       if (NILP (val))
5004         break;
5005       if (c >= 0x80)
5006         found = CATEGORY_MASK_CHARSET;
5007       if (INTEGERP (val))
5008         {
5009           charset = CHARSET_FROM_ID (XFASTINT (val));
5010           dim = CHARSET_DIMENSION (charset);
5011           for (idx = 1; idx < dim; idx++)
5012             {
5013               if (src == src_end)
5014                 goto too_short;
5015               ONE_MORE_BYTE (c);
5016               if (c < charset->code_space[(dim - 1 - idx) * 2]
5017                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5018                 break;
5019             }
5020           if (idx < dim)
5021             break;
5022         }
5023       else
5024         {
5025           idx = 1;
5026           for (; CONSP (val); val = XCDR (val))
5027             {
5028               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5029               dim = CHARSET_DIMENSION (charset);
5030               while (idx < dim)
5031                 {
5032                   if (src == src_end)
5033                     goto too_short;
5034                   ONE_MORE_BYTE (c);
5035                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5036                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5037                     break;
5038                   idx++;
5039                 }
5040               if (idx == dim)
5041                 {
5042                   val = Qnil;
5043                   break;
5044                 }
5045             }
5046           if (CONSP (val))
5047             break;
5048         }
5049     }
5050  too_short:
5051   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5052   return 0;
5053
5054  no_more_source:
5055   detect_info->found |= found;
5056   return 1;
5057 }
5058
5059 static void
5060 decode_coding_charset (coding)
5061      struct coding_system *coding;
5062 {
5063   const unsigned char *src = coding->source + coding->consumed;
5064   const unsigned char *src_end = coding->source + coding->src_bytes;
5065   const unsigned char *src_base;
5066   int *charbuf = coding->charbuf + coding->charbuf_used;
5067   int *charbuf_end
5068     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
5069   int consumed_chars = 0, consumed_chars_base;
5070   int multibytep = coding->src_multibyte;
5071   Lisp_Object attrs, charset_list, valids;
5072   int char_offset = coding->produced_char;
5073   int last_offset = char_offset;
5074   int last_id = charset_ascii;
5075   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5076   int byte_after_cr = -1;
5077
5078   CODING_GET_INFO (coding, attrs, charset_list);
5079   valids = AREF (attrs, coding_attr_charset_valids);
5080
5081   while (1)
5082     {
5083       int c;
5084       Lisp_Object val;
5085       struct charset *charset;
5086       int dim;
5087       int len = 1;
5088       unsigned code;
5089
5090       src_base = src;
5091       consumed_chars_base = consumed_chars;
5092
5093       if (charbuf >= charbuf_end)
5094         break;
5095
5096       if (byte_after_cr >= 0)
5097         {
5098           c = byte_after_cr;
5099           byte_after_cr = -1;
5100         }
5101       else
5102         {
5103           ONE_MORE_BYTE (c);
5104           if (eol_crlf && c == '\r')
5105             ONE_MORE_BYTE (byte_after_cr);
5106         }
5107       if (c < 0)
5108         goto invalid_code;
5109       code = c;
5110
5111       val = AREF (valids, c);
5112       if (NILP (val))
5113         goto invalid_code;
5114       if (INTEGERP (val))
5115         {
5116           charset = CHARSET_FROM_ID (XFASTINT (val));
5117           dim = CHARSET_DIMENSION (charset);
5118           while (len < dim)
5119             {
5120               ONE_MORE_BYTE (c);
5121               code = (code << 8) | c;
5122               len++;
5123             }
5124           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5125                               charset, code, c);
5126         }
5127       else
5128         {
5129           /* VAL is a list of charset IDs.  It is assured that the
5130              list is sorted by charset dimensions (smaller one
5131              comes first).  */
5132           while (CONSP (val))
5133             {
5134               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5135               dim = CHARSET_DIMENSION (charset);
5136               while (len < dim)
5137                 {
5138                   ONE_MORE_BYTE (c);
5139                   code = (code << 8) | c;
5140                   len++;
5141                 }
5142               CODING_DECODE_CHAR (coding, src, src_base,
5143                                   src_end, charset, code, c);
5144               if (c >= 0)
5145                 break;
5146               val = XCDR (val);
5147             }
5148         }
5149       if (c < 0)
5150         goto invalid_code;
5151       if (charset->id != charset_ascii
5152           && last_id != charset->id)
5153         {
5154           if (last_id != charset_ascii)
5155             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5156           last_id = charset->id;
5157           last_offset = char_offset;
5158         }
5159
5160       *charbuf++ = c;
5161       char_offset++;
5162       continue;
5163
5164     invalid_code:
5165       src = src_base;
5166       consumed_chars = consumed_chars_base;
5167       ONE_MORE_BYTE (c);
5168       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5169       char_offset++;
5170       coding->errors++;
5171     }
5172
5173  no_more_source:
5174   if (last_id != charset_ascii)
5175     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5176   coding->consumed_char += consumed_chars_base;
5177   coding->consumed = src_base - coding->source;
5178   coding->charbuf_used = charbuf - coding->charbuf;
5179 }
5180
5181 static int
5182 encode_coding_charset (coding)
5183      struct coding_system *coding;
5184 {
5185   int multibytep = coding->dst_multibyte;
5186   int *charbuf = coding->charbuf;
5187   int *charbuf_end = charbuf + coding->charbuf_used;
5188   unsigned char *dst = coding->destination + coding->produced;
5189   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5190   int safe_room = MAX_MULTIBYTE_LENGTH;
5191   int produced_chars = 0;
5192   Lisp_Object attrs, charset_list;
5193   int ascii_compatible;
5194   int c;
5195
5196   CODING_GET_INFO (coding, attrs, charset_list);
5197   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5198
5199   while (charbuf < charbuf_end)
5200     {
5201       struct charset *charset;
5202       unsigned code;
5203
5204       ASSURE_DESTINATION (safe_room);
5205       c = *charbuf++;
5206       if (ascii_compatible && ASCII_CHAR_P (c))
5207         EMIT_ONE_ASCII_BYTE (c);
5208       else if (CHAR_BYTE8_P (c))
5209         {
5210           c = CHAR_TO_BYTE8 (c);
5211           EMIT_ONE_BYTE (c);
5212         }
5213       else
5214         {
5215           charset = char_charset (c, charset_list, &code);
5216           if (charset)
5217             {
5218               if (CHARSET_DIMENSION (charset) == 1)
5219                 EMIT_ONE_BYTE (code);
5220               else if (CHARSET_DIMENSION (charset) == 2)
5221                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5222               else if (CHARSET_DIMENSION (charset) == 3)
5223                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5224               else
5225                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5226                                  (code >> 8) & 0xFF, code & 0xFF);
5227             }
5228           else
5229             {
5230               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5231                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5232               else
5233                 c = coding->default_char;
5234               EMIT_ONE_BYTE (c);
5235             }
5236         }
5237     }
5238
5239   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5240   coding->produced_char += produced_chars;
5241   coding->produced = dst - coding->destination;
5242   return 0;
5243 }
5244
5245 \f
/*** 10. C library functions ***/
5247
5248 /* Setup coding context CODING from information about CODING_SYSTEM.
5249    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5250    CODING_SYSTEM is invalid, signal an error.  */
5251
5252 void
5253 setup_coding_system (coding_system, coding)
5254      Lisp_Object coding_system;
5255      struct coding_system *coding;
5256 {
5257   Lisp_Object attrs;
5258   Lisp_Object eol_type;
5259   Lisp_Object coding_type;
5260   Lisp_Object val;
5261
5262   if (NILP (coding_system))
5263     coding_system = Qundecided;
5264
5265   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5266
5267   attrs = CODING_ID_ATTRS (coding->id);
5268   eol_type = CODING_ID_EOL_TYPE (coding->id);
5269
5270   coding->mode = 0;
5271   coding->head_ascii = -1;
5272   if (VECTORP (eol_type))
5273     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5274                             | CODING_REQUIRE_DETECTION_MASK);
5275   else if (! EQ (eol_type, Qunix))
5276     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5277                             | CODING_REQUIRE_ENCODING_MASK);
5278   else
5279     coding->common_flags = 0;
5280   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5281     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5282   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5283     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5284   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5285     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5286
5287   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5288   coding->max_charset_id = SCHARS (val) - 1;
5289   coding->safe_charsets = (char *) SDATA (val);
5290   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5291
5292   coding_type = CODING_ATTR_TYPE (attrs);
5293   if (EQ (coding_type, Qundecided))
5294     {
5295       coding->detector = NULL;
5296       coding->decoder = decode_coding_raw_text;
5297       coding->encoder = encode_coding_raw_text;
5298       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5299     }
5300   else if (EQ (coding_type, Qiso_2022))
5301     {
5302       int i;
5303       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5304
5305       /* Invoke graphic register 0 to plane 0.  */
5306       CODING_ISO_INVOCATION (coding, 0) = 0;
5307       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5308       CODING_ISO_INVOCATION (coding, 1)
5309         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5310       /* Setup the initial status of designation.  */
5311       for (i = 0; i < 4; i++)
5312         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5313       /* Not single shifting initially.  */
5314       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5315       /* Beginning of buffer should also be regarded as bol. */
5316       CODING_ISO_BOL (coding) = 1;
5317       coding->detector = detect_coding_iso_2022;
5318       coding->decoder = decode_coding_iso_2022;
5319       coding->encoder = encode_coding_iso_2022;
5320       if (flags & CODING_ISO_FLAG_SAFE)
5321         coding->mode |= CODING_MODE_SAFE_ENCODING;
5322       coding->common_flags
5323         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5324             | CODING_REQUIRE_FLUSHING_MASK);
5325       if (flags & CODING_ISO_FLAG_COMPOSITION)
5326         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5327       if (flags & CODING_ISO_FLAG_DESIGNATION)
5328         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5329       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5330         {
5331           setup_iso_safe_charsets (attrs);
5332           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5333           coding->max_charset_id = SCHARS (val) - 1;
5334           coding->safe_charsets = (char *) SDATA (val);
5335         }
5336       CODING_ISO_FLAGS (coding) = flags;
5337     }
5338   else if (EQ (coding_type, Qcharset))
5339     {
5340       coding->detector = detect_coding_charset;
5341       coding->decoder = decode_coding_charset;
5342       coding->encoder = encode_coding_charset;
5343       coding->common_flags
5344         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5345     }
5346   else if (EQ (coding_type, Qutf_8))
5347     {
5348       val = AREF (attrs, coding_attr_utf_bom);
5349       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5350                                    : EQ (val, Qt) ? utf_with_bom
5351                                    : utf_without_bom);
5352       coding->detector = detect_coding_utf_8;
5353       coding->decoder = decode_coding_utf_8;
5354       coding->encoder = encode_coding_utf_8;
5355       coding->common_flags
5356         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5357       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5358         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5359     }
5360   else if (EQ (coding_type, Qutf_16))
5361     {
5362       val = AREF (attrs, coding_attr_utf_bom);
5363       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5364                                     : EQ (val, Qt) ? utf_with_bom
5365                                     : utf_without_bom);
5366       val = AREF (attrs, coding_attr_utf_16_endian);
5367       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5368                                        : utf_16_little_endian);
5369       CODING_UTF_16_SURROGATE (coding) = 0;
5370       coding->detector = detect_coding_utf_16;
5371       coding->decoder = decode_coding_utf_16;
5372       coding->encoder = encode_coding_utf_16;
5373       coding->common_flags
5374         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5375       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5376         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5377     }
5378   else if (EQ (coding_type, Qccl))
5379     {
5380       coding->detector = detect_coding_ccl;
5381       coding->decoder = decode_coding_ccl;
5382       coding->encoder = encode_coding_ccl;
5383       coding->common_flags
5384         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5385             | CODING_REQUIRE_FLUSHING_MASK);
5386     }
5387   else if (EQ (coding_type, Qemacs_mule))
5388     {
5389       coding->detector = detect_coding_emacs_mule;
5390       coding->decoder = decode_coding_emacs_mule;
5391       coding->encoder = encode_coding_emacs_mule;
5392       coding->common_flags
5393         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5394       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5395           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5396         {
5397           Lisp_Object tail, safe_charsets;
5398           int max_charset_id = 0;
5399
5400           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5401                tail = XCDR (tail))
5402             if (max_charset_id < XFASTINT (XCAR (tail)))
5403               max_charset_id = XFASTINT (XCAR (tail));
5404           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5405                                         make_number (255));
5406           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5407                tail = XCDR (tail))
5408             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5409           coding->max_charset_id = max_charset_id;
5410           coding->safe_charsets = (char *) SDATA (safe_charsets);
5411         }
5412     }
5413   else if (EQ (coding_type, Qshift_jis))
5414     {
5415       coding->detector = detect_coding_sjis;
5416       coding->decoder = decode_coding_sjis;
5417       coding->encoder = encode_coding_sjis;
5418       coding->common_flags
5419         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5420     }
5421   else if (EQ (coding_type, Qbig5))
5422     {
5423       coding->detector = detect_coding_big5;
5424       coding->decoder = decode_coding_big5;
5425       coding->encoder = encode_coding_big5;
5426       coding->common_flags
5427         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5428     }
5429   else                          /* EQ (coding_type, Qraw_text) */
5430     {
5431       coding->detector = NULL;
5432       coding->decoder = decode_coding_raw_text;
5433       coding->encoder = encode_coding_raw_text;
5434       if (! EQ (eol_type, Qunix))
5435         {
5436           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5437           if (! VECTORP (eol_type))
5438             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5439         }
5440
5441     }
5442
5443   return;
5444 }
5445
5446 /* Return the list of charsets CODING supports: the global list of its
5447    type for emacs-mule and full-support ISO-2022, else CODING's own.  */
5448 Lisp_Object
5449 coding_charset_list (coding)
5450      struct coding_system *coding;
5451 {
5452   Lisp_Object attrs, charset_list, type;
5453
5454   CODING_GET_INFO (coding, attrs, charset_list);
5455   type = CODING_ATTR_TYPE (attrs);
5456   if (EQ (type, Qemacs_mule))
5457     /* emacs-mule always reports the global emacs-mule list.  */
5458     return Vemacs_mule_charset_list;
5459   if (EQ (type, Qiso_2022)
5460       && (XINT (AREF (attrs, coding_attr_iso_flags))
5461           & CODING_ISO_FLAG_FULL_SUPPORT))
5462     /* ISO-2022 with the full-support flag: use the global list.  */
5463     return Viso_2022_charset_list;
5464
5465   /* Otherwise the list obtained through CODING_GET_INFO stands.  */
5466   return charset_list;
5467 }
5468
5469
5470 /* Map CODING-SYSTEM to raw-text with the matching end-of-line
5471    handling: nil or an undecided eol yields raw-text itself, a raw-text
5472    variant is returned unchanged, else a raw-text subsidiary.  */
5473 Lisp_Object
5474 raw_text_coding_system (coding_system)
5475      Lisp_Object coding_system;
5476 {
5477   Lisp_Object cs_spec, eol, subsidiaries;
5478   int idx;
5479
5480   if (NILP (coding_system))
5481     return Qraw_text;
5482
5483   cs_spec = CODING_SYSTEM_SPEC (coding_system);
5484   /* Slot 0 of the spec is the attribute vector.  */
5485   if (EQ (CODING_ATTR_TYPE (AREF (cs_spec, 0)), Qraw_text))
5486     return coding_system;
5487
5488   /* Slot 2 is the eol type; a vector there means it is not fixed.  */
5489   eol = AREF (cs_spec, 2);
5490   if (VECTORP (eol))
5491     return Qraw_text;
5492
5493   subsidiaries = AREF (CODING_SYSTEM_SPEC (Qraw_text), 2);
5494   idx = EQ (eol, Qunix) ? 0 : EQ (eol, Qdos) ? 1 : 2;
5495   return AREF (subsidiaries, idx);
5496 }
5497
5498
5499 /* Resolve the end-of-line format of CODING_SYSTEM (nil is treated as
5500    raw-text).  When CODING_SYSTEM leaves its eol format open, pick the
5501    subsidiary whose format matches PARENT's, or system_eol_type when
5502    PARENT is nil.  If CODING_SYSTEM already fixes its format, or the
5503    inherited one is not unix, dos or mac, return it unchanged.  */
5504
5505 Lisp_Object
5506 coding_inherit_eol_type (coding_system, parent)
5507      Lisp_Object coding_system, parent;
5508 {
5509   Lisp_Object subsidiaries, inherited;
5510
5511   if (NILP (coding_system))
5512     coding_system = Qraw_text;
5513
5514   /* Slot 2 of the spec is the eol type; it is a vector of subsidiary
5515      coding systems when no single format has been chosen.  */
5516   subsidiaries = AREF (CODING_SYSTEM_SPEC (coding_system), 2);
5517   if (! VECTORP (subsidiaries))
5518     return coding_system;
5519
5520   /* Decide which eol format to inherit.  */
5521   if (NILP (parent))
5522     inherited = system_eol_type;
5523   else
5524     inherited = AREF (CODING_SYSTEM_SPEC (parent), 2);
5525
5526   /* The subsidiaries are ordered unix, dos, mac.  */
5527   if (EQ (inherited, Qunix))
5528     return AREF (subsidiaries, 0);
5529   if (EQ (inherited, Qdos))
5530     return AREF (subsidiaries, 1);
5531   if (EQ (inherited, Qmac))
5532     return AREF (subsidiaries, 2);
5533
5534   /* Nothing recognizable to inherit: keep CODING_SYSTEM as is.  */
5535   return coding_system;
5536 }
5537
5538 /* Emacs has a mechanism to automatically detect a coding system if it
5539    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5540    it's impossible to distinguish some coding systems accurately
5541    because they use the same range of codes.  So, at first, coding
5542    systems are categorized into the following categories:
5543
5544    o coding-category-emacs-mule
5545
5546         The category for a coding system which has the same code range
5547         as Emacs' internal format.  Assigned the coding-system (Lisp
5548         symbol) `emacs-mule' by default.
5549
5550    o coding-category-sjis
5551
5552         The category for a coding system which has the same code range
5553         as SJIS.  Assigned the coding-system (Lisp
5554         symbol) `japanese-shift-jis' by default.
5555
5556    o coding-category-iso-7
5557
5558         The category for a coding system which has the same code range
5559         as ISO2022 of 7-bit environment.  This doesn't use any locking
5560         shift and single shift functions.  This can encode/decode all
5561         charsets.  Assigned the coding-system (Lisp symbol)
5562         `iso-2022-7bit' by default.
5563
5564    o coding-category-iso-7-tight
5565
5566         Same as coding-category-iso-7 except that this can
5567         encode/decode only the specified charsets.
5568
5569    o coding-category-iso-8-1
5570
5571         The category for a coding system which has the same code range
5572         as ISO2022 of 8-bit environment and graphic plane 1 used only
5573         for DIMENSION1 charset.  This doesn't use any locking shift
5574         and single shift functions.  Assigned the coding-system (Lisp
5575         symbol) `iso-latin-1' by default.
5576
5577    o coding-category-iso-8-2
5578
5579         The category for a coding system which has the same code range
5580         as ISO2022 of 8-bit environment and graphic plane 1 used only
5581         for DIMENSION2 charset.  This doesn't use any locking shift
5582         and single shift functions.  Assigned the coding-system (Lisp
5583         symbol) `japanese-iso-8bit' by default.
5584
5585    o coding-category-iso-7-else
5586
5587         The category for a coding system which has the same code range
5588         as ISO2022 of 7-bit environment but uses locking shift or
5589         single shift functions.  Assigned the coding-system (Lisp
5590         symbol) `iso-2022-7bit-lock' by default.
5591
5592    o coding-category-iso-8-else
5593
5594         The category for a coding system which has the same code range
5595         as ISO2022 of 8-bit environment but uses locking shift or
5596         single shift functions.  Assigned the coding-system (Lisp
5597         symbol) `iso-2022-8bit-ss2' by default.
5598
5599    o coding-category-big5
5600
5601         The category for a coding system which has the same code range
5602         as BIG5.  Assigned the coding-system (Lisp symbol)
5603         `cn-big5' by default.
5604
5605    o coding-category-utf-8
5606
5607         The category for a coding system which has the same code range
5608         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5609         symbol) `utf-8' by default.
5610
5611    o coding-category-utf-16-be
5612
5613         The category for a coding system in which a text has an
5614         Unicode signature (cf. Unicode Standard) in the order of BIG
5615         endian at the head.  Assigned the coding-system (Lisp symbol)
5616         `utf-16-be' by default.
5617
5618    o coding-category-utf-16-le
5619
5620         The category for a coding system in which a text has an
5621         Unicode signature (cf. Unicode Standard) in the order of
5622         LITTLE endian at the head.  Assigned the coding-system (Lisp
5623         symbol) `utf-16-le' by default.
5624
5625    o coding-category-ccl
5626
5627         The category for a coding system of which encoder/decoder is
5628         written in CCL programs.  The default value is nil, i.e., no
5629         coding system is assigned.
5630
5631    o coding-category-binary
5632
5633         The category for a coding system not categorized in any of the
5634         above.  Assigned the coding-system (Lisp symbol)
5635         `no-conversion' by default.
5636
5637    Each of them is a Lisp symbol and the value is an actual
5638    `coding-system's (this is also a Lisp symbol) assigned by a user.
5639    What Emacs does actually is to detect a category of coding system.
5640    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5641    decide only one possible category, it selects a category of the
5642    highest priority.  Priorities of categories are also specified by a
5643    user in a Lisp variable `coding-category-list'.
5644
5645 */
5646 /* Flags for the end-of-line forms seen in a text; they may be or'ed.  */
5647 #define EOL_SEEN_NONE   0
5648 #define EOL_SEEN_LF     1
5649 #define EOL_SEEN_CR     2
5650 #define EOL_SEEN_CRLF   4
5651
5652 /* Detect the end-of-line format of the SRC_BYTES bytes at SOURCE.
5653    For a UTF-16 CATEGORY, CR and LF are looked for as two-byte code
5654    units; otherwise as single bytes.  At most MAX_EOL_CHECK_COUNT
5655    end-of-lines are examined.  Return EOL_SEEN_NONE if none is found,
5656    the single EOL_SEEN_XXX they all share, or EOL_SEEN_LF as soon as
5657    two of them disagree.  */
5658
5659 #define MAX_EOL_CHECK_COUNT 3
5660
5661 static int
5662 detect_eol (source, src_bytes, category)
5663      const unsigned char *source;
5664      EMACS_INT src_bytes;
5665      enum coding_category category;
5666 {
5667   const unsigned char *src = source, *src_end = src + src_bytes;
5668   unsigned char c;
5669   int total = 0;
5670   int eol_seen = EOL_SEEN_NONE;
5671
5672   if ((1 << category) & CATEGORY_MASK_UTF_16)
5673     {
5674       /* Offsets of the high and low byte inside a two-byte unit.
5675          CATEGORY is an ordinal, so test each little-endian value.  */
5676       int msb = (category == coding_category_utf_16_le
5677                  || category == coding_category_utf_16_le_nosig);
5678       int lsb = 1 - msb;
5679
5680       while (src + 1 < src_end)
5681         {
5682           c = src[lsb];
5683           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5684             {
5685               int this_eol;
5686
5687               if (c == '\n')
5688                 this_eol = EOL_SEEN_LF;
5689               else if (src + 3 >= src_end
5690                        || src[msb + 2] != 0
5691                        || src[lsb + 2] != '\n')
5692                 this_eol = EOL_SEEN_CR;
5693               else
5694                 /* Step over the LF unit so it is not counted again.  */
5695                 this_eol = EOL_SEEN_CRLF, src += 2;
5696               if (eol_seen == EOL_SEEN_NONE)
5697                 /* This is the first end-of-line.  */
5698                 eol_seen = this_eol;
5699               else if (eol_seen != this_eol)
5700                 {
5701                   /* Formats are mixed: report the text as LF.  */
5702                   eol_seen = EOL_SEEN_LF;
5703                   break;
5704                 }
5705               if (++total == MAX_EOL_CHECK_COUNT)
5706                 break;
5707             }
5708           src += 2;
5709         }
5710     }
5711   else
5712     {
5713       while (src < src_end)
5714         {
5715           c = *src++;
5716           if (c == '\n' || c == '\r')
5717             {
5718               int this_eol;
5719
5720               if (c == '\n')
5721                 this_eol = EOL_SEEN_LF;
5722               else if (src >= src_end || *src != '\n')
5723                 this_eol = EOL_SEEN_CR;
5724               else
5725                 this_eol = EOL_SEEN_CRLF, src++;
5726
5727               if (eol_seen == EOL_SEEN_NONE)
5728                 /* This is the first end-of-line.  */
5729                 eol_seen = this_eol;
5730               else if (eol_seen != this_eol)
5731                 {
5732                   /* Formats are mixed: report the text as LF.  */
5733                   eol_seen = EOL_SEEN_LF;
5734                   break;
5735                 }
5736               if (++total == MAX_EOL_CHECK_COUNT)
5737                 break;
5738             }
5739         }
5740     }
5741   return eol_seen;
5742 }
5743
5744
5745 static Lisp_Object
5746 adjust_coding_eol_type (coding, eol_seen)
5747      struct coding_system *coding;
5748      int eol_seen;
5749 {
5750   /* Switch CODING to the subsidiary matching the flags in EOL_SEEN
5751      (LF wins over CRLF, CRLF over CR) and return the new eol type.
5752      With no flag set, CODING is untouched and its current eol type
5753      is returned.  */
5754   Lisp_Object subsidiaries = CODING_ID_EOL_TYPE (coding->id);
5755   Lisp_Object chosen;
5756   int slot;
5757
5758   if (eol_seen & EOL_SEEN_LF)
5759     slot = 0, chosen = Qunix;
5760   else if (eol_seen & EOL_SEEN_CRLF)
5761     slot = 1, chosen = Qdos;
5762   else if (eol_seen & EOL_SEEN_CR)
5763     slot = 2, chosen = Qmac;
5764   else
5765     return subsidiaries;
5766
5767   coding->id = CODING_SYSTEM_ID (AREF (subsidiaries, slot));
5768   return chosen;
5769 }
5770
5771 /* Detect how the text specified in CODING is encoded and, if a
5772    coding system is detected, set CODING up for it.  Handles the
5773    undecided type and the utf-8-auto and utf-16-auto categories.  */
5774
5775 void
5776 detect_coding (coding)
5777      struct coding_system *coding;
5778 {
5779   const unsigned char *src, *src_end;
5780
5781   coding->consumed = coding->consumed_char = 0;
5782   coding->produced = coding->produced_char = 0;
5783   coding_set_source (coding);
5784
5785   src_end = coding->source + coding->src_bytes;
5786   coding->head_ascii = 0;
5787
5788   /* If we have not yet decided the text encoding type, detect it
5789      now.  */
5790   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5791     {
5792       int c, i;
5793       struct coding_detection_info detect_info;
5794       int null_byte_found = 0, eight_bit_found = 0;
5795       /* Count leading 7-bit bytes; note null and 8-bit bytes.  */
5796       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5797       for (src = coding->source; src < src_end; src++)
5798         {
5799           c = *src;
5800           if (c & 0x80)
5801             {
5802               eight_bit_found = 1;
5803               if (null_byte_found)
5804                 break;
5805             }
5806           else if (c < 0x20)
5807             {
5808               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5809                   && ! inhibit_iso_escape_detection
5810                   && ! detect_info.checked)
5811                 {
5812                   if (detect_coding_iso_2022 (coding, &detect_info))
5813                     {
5814                       /* We have scanned the whole data.  */
5815                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5816                         {
5817                           /* We didn't find an 8-bit code.  We may
5818                              have found a null-byte, but it's very
5819                              rare that a binary file conforms to
5820                              ISO-2022.  */
5821                           src = src_end;
5822                           coding->head_ascii = src - coding->source;
5823                         }
5824                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
5825                       break;
5826                     }
5827                 }
5828               else if (! c)
5829                 {
5830                   null_byte_found = 1;
5831                   if (eight_bit_found)
5832                     break;
5833                 }
5834               if (! eight_bit_found)
5835                 coding->head_ascii++;
5836             }
5837           else if (! eight_bit_found)
5838             coding->head_ascii++;
5839         }
5840       /* Nothing to do for null-free 7-bit text where nothing was found.  */
5841       if (null_byte_found || eight_bit_found
5842           || coding->head_ascii < coding->src_bytes
5843           || detect_info.found)
5844         {
5845           enum coding_category category;
5846           struct coding_system *this;
5847
5848           if (coding->head_ascii == coding->src_bytes)
5849             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5850             for (i = 0; i < coding_category_raw_text; i++)
5851               {
5852                 category = coding_priorities[i];
5853                 this = coding_categories + category;
5854                 if (detect_info.found & (1 << category))
5855                   break;
5856               }
5857           else
5858             {
5859               if (null_byte_found) /* Only UTF-16 stays a candidate.  */
5860                 {
5861                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5862                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
5863                 }
5864               for (i = 0; i < coding_category_raw_text; i++)
5865                 {
5866                   category = coding_priorities[i];
5867                   this = coding_categories + category;
5868                   if (this->id < 0)
5869                     {
5870                       /* No coding system of this category is defined.  */
5871                       detect_info.rejected |= (1 << category);
5872                     }
5873                   else if (category >= coding_category_raw_text)
5874                     continue;
5875                   else if (detect_info.checked & (1 << category))
5876                     {
5877                       if (detect_info.found & (1 << category))
5878                         break;
5879                     }
5880                   else if ((*(this->detector)) (coding, &detect_info)
5881                            && detect_info.found & (1 << category))
5882                     {
5883                       if (category == coding_category_utf_16_auto)
5884                         {
5885                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5886                             category = coding_category_utf_16_le;
5887                           else
5888                             category = coding_category_utf_16_be;
5889                         }
5890                       break;
5891                     }
5892                 }
5893             }
5894           /* I is below the limit only if a loop above found a match.  */
5895           if (i < coding_category_raw_text)
5896             setup_coding_system (CODING_ID_NAME (this->id), coding);
5897           else if (null_byte_found)
5898             setup_coding_system (Qno_conversion, coding);
5899           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5900                    == CATEGORY_MASK_ANY)
5901             setup_coding_system (Qraw_text, coding);
5902           else if (detect_info.rejected)
5903             for (i = 0; i < coding_category_raw_text; i++)
5904               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5905                 {
5906                   this = coding_categories + coding_priorities[i];
5907                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5908                   break;
5909                 }
5910         }
5911     }
5912   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5913            == coding_category_utf_8_auto)
5914     {
5915       Lisp_Object coding_systems;
5916       struct coding_detection_info detect_info;
5917
5918       coding_systems
5919         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5920       detect_info.found = detect_info.rejected = 0;
5921       coding->head_ascii = 0;
5922       if (CONSP (coding_systems)
5923           && detect_coding_utf_8 (coding, &detect_info))
5924         {
5925           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
5926             setup_coding_system (XCAR (coding_systems), coding);
5927           else
5928             setup_coding_system (XCDR (coding_systems), coding);
5929         }
5930     }
5931   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5932            == coding_category_utf_16_auto)
5933     {
5934       Lisp_Object coding_systems;
5935       struct coding_detection_info detect_info;
5936
5937       coding_systems
5938         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5939       detect_info.found = detect_info.rejected = 0;
5940       coding->head_ascii = 0;
5941       if (CONSP (coding_systems)
5942           && detect_coding_utf_16 (coding, &detect_info))
5943         {
5944           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5945             setup_coding_system (XCAR (coding_systems), coding);
5946           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5947             setup_coding_system (XCDR (coding_systems), coding);
5948         }
5949     }
5950 }
5951
5952 /* Convert the end-of-lines of the text CODING produced to LF, in place.  */
5953 static void
5954 decode_eol (coding)
5955      struct coding_system *coding;
5956 {
5957   Lisp_Object eol_type;
5958   unsigned char *p, *pbeg, *pend;
5959
5960   eol_type = CODING_ID_EOL_TYPE (coding->id);
5961   if (EQ (eol_type, Qunix))
5962     return;
5963   /* Find the produced text (buffer text when dst_object is non-nil).  */
5964   if (NILP (coding->dst_object))
5965     pbeg = coding->destination;
5966   else
5967     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5968   pend = pbeg + coding->produced;
5969   /* Eol type still undecided: classify what the text contains.  */
5970   if (VECTORP (eol_type))
5971     {
5972       int eol_seen = EOL_SEEN_NONE;
5973
5974       for (p = pbeg; p < pend; p++)
5975         {
5976           if (*p == '\n')
5977             eol_seen |= EOL_SEEN_LF;
5978           else if (*p == '\r')
5979             {
5980               if (p + 1 < pend && *(p + 1) == '\n')
5981                 {
5982                   eol_seen |= EOL_SEEN_CRLF;
5983                   p++;
5984                 }
5985               else
5986                 eol_seen |= EOL_SEEN_CR;
5987             }
5988         }
5989       if (eol_seen != EOL_SEEN_NONE
5990           && eol_seen != EOL_SEEN_LF
5991           && eol_seen != EOL_SEEN_CRLF
5992           && eol_seen != EOL_SEEN_CR)
5993         eol_seen = EOL_SEEN_LF; /* Mixed formats are treated as LF.  */
5994       if (eol_seen != EOL_SEEN_NONE)
5995         eol_type = adjust_coding_eol_type (coding, eol_seen);
5996     }
5997
5998   if (EQ (eol_type, Qmac))
5999     {
6000       for (p = pbeg; p < pend; p++)
6001         if (*p == '\r')
6002           *p = '\n';
6003     }
6004   else if (EQ (eol_type, Qdos))
6005     {
6006       int n = 0;
6007       /* N counts the deleted CRs.  */
6008       if (NILP (coding->dst_object))
6009         {
6010           /* Delete '\r' from the tail to minimize memory movement.
6011              NOTE(review): lone CRs go too, unlike below -- confirm.  */
6012           for (p = pend - 2; p >= pbeg; p--)
6013             if (*p == '\r')
6014               {
6015                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6016                 n++;
6017               }
6018         }
6019       else
6020         {
6021           int pos_byte = coding->dst_pos_byte;
6022           int pos = coding->dst_pos;
6023           int pos_end = pos + coding->produced_char - 1;
6024           /* Delete each CR that is directly followed by LF.  */
6025           while (pos < pos_end)
6026             {
6027               p = BYTE_POS_ADDR (pos_byte);
6028               if (*p == '\r' && p[1] == '\n')
6029                 {
6030                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6031                   n++;
6032                   pos_end--;
6033                 }
6034               pos++;
6035               if (coding->dst_multibyte)
6036                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6037               else
6038                 pos_byte++;
6039             }
6040         }
6041       coding->produced -= n;
6042       coding->produced_char -= n;
6043     }
6044 }
6045
6046
6047 /* Return the translation table (or list of them) that coding system
6048    attribute vector ATTRS gives for encoding (ENCODEP nonzero) or
6049    decoding (ENCODEP zero).  If MAX_LOOKUP is non-null, set *MAX_LOOKUP
6050    to the largest natural number in extra slot 1 of them, at least 1.  */
6051 static Lisp_Object
6052 get_translation_table (attrs, encodep, max_lookup)
6053      Lisp_Object attrs;
6054      int encodep, *max_lookup;
6055 {
6056   Lisp_Object table, fallback, tail, slot;
6057
6058   /* Pick the coding system's own table and the matching standard
6059      translation table.  */
6060   table = (encodep ? CODING_ATTR_ENCODE_TBL (attrs)
6061            : CODING_ATTR_DECODE_TBL (attrs));
6062   fallback = (encodep ? Vstandard_translation_table_for_encode
6063               : Vstandard_translation_table_for_decode);
6064
6065   if (NILP (table))
6066     table = fallback;
6067   else
6068     {
6069       /* Replace symbols by their Qtranslation_table property,
6070          working on a copy when TABLE is a list.  */
6071       if (SYMBOLP (table))
6072         table = Fget (table, Qtranslation_table);
6073       else if (CONSP (table))
6074         {
6075           table = Fcopy_sequence (table);
6076           for (tail = table; CONSP (tail); tail = XCDR (tail))
6077             if (SYMBOLP (XCAR (tail)))
6078               XSETCAR (tail, Fget (XCAR (tail), Qtranslation_table));
6079         }
6080
6081       /* Put the standard table, when it is a char-table, last.  */
6082       if (CHAR_TABLE_P (fallback))
6083         table = (CONSP (table)
6084                  ? nconc2 (table, Fcons (fallback, Qnil))
6085                  : Fcons (table, Fcons (fallback, Qnil)));
6086     }
6087
6088   if (! max_lookup)
6089     return table;
6090
6091   /* Raise *MAX_LOOKUP to the largest natural number kept in extra
6092      slot 1 of the table(s).  */
6093   *max_lookup = 1;
6094   if (CHAR_TABLE_P (table))
6095     {
6096       if (CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (table)) > 1)
6097         {
6098           slot = XCHAR_TABLE (table)->extras[1];
6099           if (NATNUMP (slot) && XFASTINT (slot) > *max_lookup)
6100             *max_lookup = XFASTINT (slot);
6101         }
6102       return table;
6103     }
6104
6105   for (tail = table; CONSP (tail); tail = XCDR (tail))
6106     if (CHAR_TABLE_P (XCAR (tail))
6107         && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6108       {
6109         slot = XCHAR_TABLE (XCAR (tail))->extras[1];
6110         if (NATNUMP (slot) && XFASTINT (slot) > *max_lookup)
6111           *max_lookup = XFASTINT (slot);
6112       }
6113   return table;
6114 }
6115 /* Map C in place through TABLE; leave any non-character entry in TRANS.  */
6116 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6117   do {                                                          \
6118     trans = Qnil;                                               \
6119     if (CHAR_TABLE_P (table))                                   \
6120       {                                                         \
6121         trans = CHAR_TABLE_REF (table, c);                      \
6122         if (CHARACTERP (trans))                                 \
6123           c = XFASTINT (trans), trans = Qnil;                   \
6124       }                                                         \
6125     else if (CONSP (table))                                     \
6126       {                                                         \
6127         Lisp_Object tail;                                       \
6128                                                                 \
6129         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6130           if (CHAR_TABLE_P (XCAR (tail)))                       \
6131             {                                                   \
6132               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6133               if (CHARACTERP (trans))                           \
6134                 c = XFASTINT (trans), trans = Qnil;             \
6135               else if (! NILP (trans))                          \
6136                 break;                                          \
6137             }                                                   \
6138       }                                                         \
6139   } while (0)
6140
6141
6142 static Lisp_Object
6143 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6144      Lisp_Object val;
6145      int *buf, *buf_end;
6146      int last_block;
6147      int *from_nchars, *to_nchars;
6148 {
6149   /* VAL is TO or (([FROM-CHAR ...] . TO) ...), TO being TO-CHAR or
6150      [TO-CHAR ...].  Find the TO that applies to the characters at BUF
6151      and store its first character in *BUF.  Return that TO, Qnil if
6152      no rule matches, or Qt if a rule needs characters beyond BUF_END
6153      and this is not the last block.  *FROM_NCHARS and *TO_NCHARS are
6154      set only for a matched rule and a vector TO, respectively.  */
6155   Lisp_Object rules, rule, from;
6156   int matched, len;
6157
6158   if (CONSP (val))
6159     {
6160       for (rules = val; CONSP (rules); rules = XCDR (rules))
6161         {
6162           rule = XCAR (rules);
6163           from = XCAR (rule);
6164           len = ASIZE (from);
6165           /* Count how many leading characters agree with FROM.  */
6166           for (matched = 0; matched < len; matched++)
6167             {
6168               if (buf + matched == buf_end && ! last_block)
6169                 return Qt;
6170               if (buf + matched == buf_end
6171                   || XINT (AREF (from, matched)) != buf[matched])
6172                 break;
6173             }
6174           if (matched == len)
6175             break;
6176         }
6177       if (! CONSP (rules))
6178         return Qnil;
6179       val = XCDR (rule);
6180       *from_nchars = len;
6181     }
6182
6183   *buf = VECTORP (val) ? XINT (AREF (val, 0)) : XINT (val);
6184   if (VECTORP (val))
6185     *to_nchars = ASIZE (val);
6186   return val;
6187 }
6188
6189
6190 static int
6191 produce_chars (coding, translation_table, last_block)
6192      struct coding_system *coding;
6193      Lisp_Object translation_table;
6194      int last_block;
6195 {
6196   unsigned char *dst = coding->destination + coding->produced;
6197   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6198   EMACS_INT produced;
6199   EMACS_INT produced_chars = 0;
6200   int carryover = 0;
6201
6202   if (! coding->chars_at_source)
6203     {
6204       /* Source characters are in coding->charbuf.  */
6205       int *buf = coding->charbuf;
6206       int *buf_end = buf + coding->charbuf_used;
6207
6208       if (EQ (coding->src_object, coding->dst_object))
6209         {
6210           coding_set_source (coding);
6211           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6212         }
6213
6214       while (buf < buf_end)
6215         {
6216           int c = *buf, i;
6217
6218           if (c >= 0)
6219             {
6220               int from_nchars = 1, to_nchars = 1;
6221               Lisp_Object trans = Qnil;
6222
6223               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6224               if (! NILP (trans))
6225                 {
6226                   trans = get_translation (trans, buf, buf_end, last_block,
6227                                            &from_nchars, &to_nchars);
6228                   if (EQ (trans, Qt))
6229                     break;
6230                   c = *buf;
6231                 }
6232
6233               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6234                 {
6235                   dst = alloc_destination (coding,
6236                                            buf_end - buf
6237                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6238                                            dst);
6239                   if (EQ (coding->src_object, coding->dst_object))
6240                     {
6241                       coding_set_source (coding);
6242                       dst_end = ((unsigned char *) coding->source) + coding->consumed;
6243                     }
6244                   else
6245                     dst_end = coding->destination + coding->dst_bytes;
6246                 }
6247
6248               for (i = 0; i < to_nchars; i++)
6249                 {
6250                   if (i > 0)
6251                     c = XINT (AREF (trans, i));
6252                   if (coding->dst_multibyte
6253                       || ! CHAR_BYTE8_P (c))
6254                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6255                   else
6256                     *dst++ = CHAR_TO_BYTE8 (c);
6257                 }
6258               produced_chars += to_nchars;
6259               *buf++ = to_nchars;
6260               while (--from_nchars > 0)
6261                 *buf++ = 0;
6262             }
6263           else
6264             /* This is an annotation datum.  (-C) is the length.  */
6265             buf += -c;
6266         }
6267       carryover = buf_end - buf;
6268     }
6269   else
6270     {
6271       /* Source characters are at coding->source.  */
6272       const unsigned char *src = coding->source;
6273       const unsigned char *src_end = src + coding->consumed;
6274
6275       if (EQ (coding->dst_object, coding->src_object))
6276         dst_end = (unsigned char *) src;
6277       if (coding->src_multibyte != coding->dst_multibyte)
6278         {
6279           if (coding->src_multibyte)
6280             {
6281               int multibytep = 1;
6282               EMACS_INT consumed_chars;
6283
6284               while (1)
6285                 {
6286                   const unsigned char *src_base = src;
6287                   int c;
6288
6289                   ONE_MORE_BYTE (c);
6290                   if (dst == dst_end)
6291                     {
6292                       if (EQ (coding->src_object, coding->dst_object))
6293                         dst_end = (unsigned char *) src;
6294                       if (dst == dst_end)
6295                         {
6296                           EMACS_INT offset = src - coding->source;
6297
6298                           dst = alloc_destination (coding, src_end - src + 1,
6299                                                    dst);
6300                           dst_end = coding->destination + coding->dst_bytes;
6301                           coding_set_source (coding);
6302                           src = coding->source + offset;
6303                           src_end = coding->source + coding->src_bytes;
6304                           if (EQ (coding->src_object, coding->dst_object))
6305                             dst_end = (unsigned char *) src;
6306                         }
6307                     }
6308                   *dst++ = c;
6309                   produced_chars++;
6310                 }
6311             no_more_source:
6312               ;
6313             }
6314           else
6315             while (src < src_end)
6316               {
6317                 int multibytep = 1;
6318                 int c = *src++;
6319
6320                 if (dst >= dst_end - 1)
6321                   {
6322                     if (EQ (coding->src_object, coding->dst_object))
6323                       dst_end = (unsigned char *) src;
6324                     if (dst >= dst_end - 1)
6325                       {
6326                         EMACS_INT offset = src - coding->source;
6327                         EMACS_INT more_bytes;
6328
6329                         if (EQ (coding->src_object, coding->dst_object))
6330                           more_bytes = ((src_end - src) / 2) + 2;
6331                         else
6332                           more_bytes = src_end - src + 2;
6333                         dst = alloc_destination (coding, more_bytes, dst);
6334                         dst_end = coding->destination + coding->dst_bytes;
6335                         coding_set_source (coding);
6336                         src = coding->source + offset;
6337                         src_end = coding->source + coding->src_bytes;
6338                         if (EQ (coding->src_object, coding->dst_object))
6339                           dst_end = (unsigned char *) src;
6340                       }
6341                   }
6342                 EMIT_ONE_BYTE (c);
6343               }
6344         }
6345       else
6346         {
6347           if (!EQ (coding->src_object, coding->dst_object))
6348             {
6349               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6350
6351               if (require > 0)
6352                 {
6353                   EMACS_INT offset = src - coding->source;
6354
6355                   dst = alloc_destination (coding, require, dst);
6356                   coding_set_source (coding);
6357                   src = coding->source + offset;
6358                   src_end = coding->source + coding->src_bytes;
6359                 }
6360             }
6361           produced_chars = coding->consumed_char;
6362           while (src < src_end)
6363             *dst++ = *src++;
6364         }
6365     }
6366
6367   produced = dst - (coding->destination + coding->produced);
6368   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6369     insert_from_gap (produced_chars, produced);
6370   coding->produced += produced;
6371   coding->produced_char += produced_chars;
6372   return carryover;
6373 }
6374
6375 /* Compose text in CODING->object according to the annotation data at
6376    CHARBUF.  CHARBUF is an array:
6377      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6378  */
6379
6380 static INLINE void
6381 produce_composition (coding, charbuf, pos)
6382      struct coding_system *coding;
6383      int *charbuf;
6384      EMACS_INT pos;
6385 {
6386   int len;
6387   EMACS_INT to;
6388   enum composition_method method;
6389   Lisp_Object components;
6390
6391   len = -charbuf[0];
6392   to = pos + charbuf[2];
6393   if (to <= pos)
6394     return;
6395   method = (enum composition_method) (charbuf[3]);
6396
6397   if (method == COMPOSITION_RELATIVE)
6398     components = Qnil;
6399   else if (method >= COMPOSITION_WITH_RULE
6400            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6401     {
6402       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6403       int i;
6404
6405       len -= 4;
6406       charbuf += 4;
6407       for (i = 0; i < len; i++)
6408         {
6409           args[i] = make_number (charbuf[i]);
6410           if (charbuf[i] < 0)
6411             return;
6412         }
6413       components = (method == COMPOSITION_WITH_ALTCHARS
6414                     ? Fstring (len, args) : Fvector (len, args));
6415     }
6416   else
6417     return;
6418   compose_text (pos, to, components, Qnil, coding->dst_object);
6419 }
6420
6421
6422 /* Put `charset' property on text in CODING->object according to
6423    the annotation data at CHARBUF.  CHARBUF is an array:
6424      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6425  */
6426
6427 static INLINE void
6428 produce_charset (coding, charbuf, pos)
6429      struct coding_system *coding;
6430      int *charbuf;
6431      EMACS_INT pos;
6432 {
6433   EMACS_INT from = pos - charbuf[2];
6434   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6435
6436   Fput_text_property (make_number (from), make_number (pos),
6437                       Qcharset, CHARSET_NAME (charset),
6438                       coding->dst_object);
6439 }
6440
6441
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area CODING->charbuf with alloca and
   store its size (in `int' elements) in CODING->charbuf_size.  The
   request starts at CHARBUF_SIZE elements and is halved, while it is
   still larger than 1024, for as long as alloca yields a null
   pointer.

   CAUTION: this macro contains a `return'.  If no memory could be
   obtained it records CODING_RESULT_INSUFFICIENT_MEM and makes the
   *calling function* return CODING->result, so it can only be used
   in a function whose return type accepts that value.  Because the
   memory comes from alloca, the macro must be expanded directly in
   the function that uses the work area.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6463
6464
6465 static void
6466 produce_annotation (coding, pos)
6467      struct coding_system *coding;
6468      EMACS_INT pos;
6469 {
6470   int *charbuf = coding->charbuf;
6471   int *charbuf_end = charbuf + coding->charbuf_used;
6472
6473   if (NILP (coding->dst_object))
6474     return;
6475
6476   while (charbuf < charbuf_end)
6477     {
6478       if (*charbuf >= 0)
6479         pos += *charbuf++;
6480       else
6481         {
6482           int len = -*charbuf;
6483           switch (charbuf[1])
6484             {
6485             case CODING_ANNOTATE_COMPOSITION_MASK:
6486               produce_composition (coding, charbuf, pos);
6487               break;
6488             case CODING_ANNOTATE_CHARSET_MASK:
6489               produce_charset (coding, charbuf, pos);
6490               break;
6491             default:
6492               abort ();
6493             }
6494           charbuf += len;
6495         }
6496     }
6497 }
6498
6499 /* Decode the data at CODING->src_object into CODING->dst_object.
6500    CODING->src_object is a buffer, a string, or nil.
6501    CODING->dst_object is a buffer.
        NOTE(review): the body guards every buffer operation with
        BUFFERP (CODING->dst_object), so a non-buffer destination looks
        tolerated -- confirm against the callers.
6502
6503    If CODING->src_object is a buffer, it must be the current buffer.
6504    In this case, if CODING->src_pos is positive, it is a position of
6505    the source text in the buffer, otherwise, the source text is in the
6506    gap area of the buffer, and CODING->src_pos specifies the offset of
6507    the text from GPT (which must be the same as PT).  If this is the
6508    same buffer as CODING->dst_object, CODING->src_pos must be
6509    negative.
6510
6511    If CODING->src_object is a string, CODING->src_pos is an index to
6512    that string.
6513
6514    If CODING->src_object is nil, CODING->source must already point to
6515    the non-relocatable memory area.  In this case, CODING->src_pos is
6516    an offset from CODING->source.
6517
6518    The decoded data is inserted at the current point of the buffer
6519    CODING->dst_object.

        The value is CODING->result.  On return CODING->consumed,
        CODING->consumed_char, CODING->produced and CODING->produced_char
        describe what was processed, and CODING->carryover_bytes tells how
        many undecoded source bytes were saved in CODING->carryover.
6520 */
6521
6522 static int
6523 decode_coding (coding)
6524      struct coding_system *coding;
6525 {
6526   Lisp_Object attrs;
6527   Lisp_Object undo_list;
6528   Lisp_Object translation_table;
6529   int carryover;
6530   int i;
6531
       /* If the source text lies in a buffer and straddles the gap, move
          the gap to the start of the source text (presumably so that the
          source bytes are contiguous in memory).  */
6532   if (BUFFERP (coding->src_object)
6533       && coding->src_pos > 0
6534       && coding->src_pos < GPT
6535       && coding->src_pos + coding->src_chars > GPT)
6536     move_gap_both (coding->src_pos, coding->src_pos_byte);
6537
       /* When decoding into a buffer: make it current, bring the gap to
          point, and switch undo recording off (undo_list = Qt) for the
          duration.  The saved list is put back at the end of this
          function, where the whole insertion is recorded at once.  */
6538   undo_list = Qt;
6539   if (BUFFERP (coding->dst_object))
6540     {
6541       if (current_buffer != XBUFFER (coding->dst_object))
6542         set_buffer_internal (XBUFFER (coding->dst_object));
6543       if (GPT != PT)
6544         move_gap_both (PT, PT_BYTE);
6545       undo_list = current_buffer->undo_list;
6546       current_buffer->undo_list = Qt;
6547     }
6548
       /* Start from a clean conversion state.  */
6549   coding->consumed = coding->consumed_char = 0;
6550   coding->produced = coding->produced_char = 0;
6551   coding->chars_at_source = 0;
6552   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6553   coding->errors = 0;
6554
       /* This macro may return CODING->result from this function when
          no work area could be allocated.
          NOTE(review): on that path the destination buffer's undo_list
          set to Qt above is not restored -- confirm whether that path
          is reachable in practice.  */
6555   ALLOC_CONVERSION_WORK_AREA (coding);
6556
6557   attrs = CODING_ID_ATTRS (coding->id);
6558   translation_table = get_translation_table (attrs, 0, NULL);
6559
       /* Decode block by block.  In each round the decoder runs with
          charbuf_used preset to CARRYOVER, i.e. the elements kept at the
          front of the work area from the previous round.  produce_chars
          then stores the decoded characters in the destination and
          returns the number of trailing work-area elements it left
          unprocessed; those are moved to the front for the next round.
          POS is the destination character position before this round
          and serves as the base position for annotations.  */
6560   carryover = 0;
6561   do
6562     {
6563       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6564
6565       coding_set_source (coding);
6566       coding->annotated = 0;
6567       coding->charbuf_used = carryover;
6568       (*(coding->decoder)) (coding);
6569       coding_set_destination (coding);
6570       carryover = produce_chars (coding, translation_table, 0);
6571       if (coding->annotated)
6572         produce_annotation (coding, pos);
6573       for (i = 0; i < carryover; i++)
6574         coding->charbuf[i]
6575           = coding->charbuf[coding->charbuf_used - carryover + i];
6576     }
       /* Go on while source bytes remain and the last round ended with
          success or merely met invalid source data.  */
6577   while (coding->consumed < coding->src_bytes
6578          && (coding->result == CODING_RESULT_SUCCESS
6579              || coding->result == CODING_RESULT_INVALID_SRC));
6580
       /* Work-area elements are still pending: produce them now, this
          time passing 1 as the third argument of produce_chars
          (apparently its LAST_BLOCK flag).  */
6581   if (carryover > 0)
6582     {
6583       coding_set_destination (coding);
6584       coding->charbuf_used = carryover;
6585       produce_chars (coding, translation_table, 1);
6586     }
6587
       /* Deal with source bytes the decoder did not consume.  */
6588   coding->carryover_bytes = 0;
6589   if (coding->consumed < coding->src_bytes)
6590     {
6591       int nbytes = coding->src_bytes - coding->consumed;
6592       const unsigned char *src;
6593
6594       coding_set_source (coding);
6595       coding_set_destination (coding);
6596       src = coding->source + coding->consumed;
6597
6598       if (coding->mode & CODING_MODE_LAST_BLOCK)
6599         {
6600           /* Flush out unprocessed data as binary chars.  We are sure
6601              that the number of data is less than the size of
6602              coding->charbuf.  */
6603           coding->charbuf_used = 0;
6604           while (nbytes-- > 0)
6605             {
6606               int c = *src++;
6607
                   /* Bytes with the high bit set go through
                      BYTE8_TO_CHAR; other bytes are stored as they
                      are.  */
6608               if (c & 0x80)
6609                 c = BYTE8_TO_CHAR (c);
6610               coding->charbuf[coding->charbuf_used++] = c;
6611             }
               /* No translation table is applied to the flushed bytes.  */
6612           produce_chars (coding, Qnil, 1);
6613         }
6614       else
6615         {
6616           /* Record unprocessed bytes in coding->carryover.  We are
6617              sure that the number of data is less than the size of
6618              coding->carryover.  */
6619           unsigned char *p = coding->carryover;
6620
6621           coding->carryover_bytes = nbytes;
6622           while (nbytes-- > 0)
6623             *p++ = *src++;
6624         }
           /* Either way every source byte now counts as consumed.  */
6625       coding->consumed = coding->src_bytes;
6626     }
6627
       /* End-of-line decoding is a separate pass, skipped only when the
          EOL type of the coding system is `unix'.  */
6628   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6629     decode_eol (coding);
       /* Restore undo recording and record the whole insertion once.  */
6630   if (BUFFERP (coding->dst_object))
6631     {
6632       current_buffer->undo_list = undo_list;
6633       record_insert (coding->dst_pos, coding->produced_char);
6634     }
6635   return coding->result;
6636 }
6637
6638
6639 /* Extract an annotation datum from a composition starting at POS and
6640    ending before LIMIT of CODING->src_object (buffer or string), store
6641    the data in BUF, set *STOP to a starting position of the next
6642    composition (if any) or to LIMIT, and return the address of the
6643    next element of BUF.
6644
6645    If such an annotation is not found, set *STOP to a starting
6646    position of a composition after POS (if any) or to LIMIT, and
6647    return BUF.  */
6648
6649 static INLINE int *
6650 handle_composition_annotation (pos, limit, coding, buf, stop)
6651      EMACS_INT pos, limit;
6652      struct coding_system *coding;
6653      int *buf;
6654      EMACS_INT *stop;
6655 {
6656   EMACS_INT start, end;
6657   Lisp_Object prop;
6658
6659   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6660       || end > limit)
6661     *stop = limit;
6662   else if (start > pos)
6663     *stop = start;
6664   else
6665     {
6666       if (start == pos)
6667         {
6668           /* We found a composition.  Store the corresponding
6669              annotation data in BUF.  */
6670           int *head = buf;
6671           enum composition_method method = COMPOSITION_METHOD (prop);
6672           int nchars = COMPOSITION_LENGTH (prop);
6673
6674           ADD_COMPOSITION_DATA (buf, nchars, method);
6675           if (method != COMPOSITION_RELATIVE)
6676             {
6677               Lisp_Object components;
6678               int len, i, i_byte;
6679
6680               components = COMPOSITION_COMPONENTS (prop);
6681               if (VECTORP (components))
6682                 {
6683                   len = XVECTOR (components)->size;
6684                   for (i = 0; i < len; i++)
6685                     *buf++ = XINT (AREF (components, i));
6686                 }
6687               else if (STRINGP (components))
6688                 {
6689                   len = SCHARS (components);
6690                   i = i_byte = 0;
6691                   while (i < len)
6692                     {
6693                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6694                       buf++;
6695                     }
6696                 }
6697               else if (INTEGERP (components))
6698                 {
6699                   len = 1;
6700                   *buf++ = XINT (components);
6701                 }
6702               else if (CONSP (components))
6703                 {
6704                   for (len = 0; CONSP (components);
6705                        len++, components = XCDR (components))
6706                     *buf++ = XINT (XCAR (components));
6707                 }
6708               else
6709                 abort ();
6710               *head -= len;
6711             }
6712         }
6713
6714       if (find_composition (end, limit, &start, &end, &prop,
6715                             coding->src_object)
6716           && end <= limit)
6717         *stop = start;
6718       else
6719         *stop = limit;
6720     }
6721   return buf;
6722 }
6723
6724
6725 /* Extract an annotation datum from a text property `charset' at POS of
6726    CODING->src_object (buffer of string), store the data in BUF, set
6727    *STOP to the position where the value of `charset' property changes
6728    (limiting by LIMIT), and return the address of the next element of
6729    BUF.
6730
6731    If the property value is nil, set *STOP to the position where the
6732    property value is non-nil (limiting by LIMIT), and return BUF.  */
6733
6734 static INLINE int *
6735 handle_charset_annotation (pos, limit, coding, buf, stop)
6736      EMACS_INT pos, limit;
6737      struct coding_system *coding;
6738      int *buf;
6739      EMACS_INT *stop;
6740 {
6741   Lisp_Object val, next;
6742   int id;
6743
6744   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6745   if (! NILP (val) && CHARSETP (val))
6746     id = XINT (CHARSET_SYMBOL_ID (val));
6747   else
6748     id = -1;
6749   ADD_CHARSET_DATA (buf, 0, id);
6750   next = Fnext_single_property_change (make_number (pos), Qcharset,
6751                                        coding->src_object,
6752                                        make_number (limit));
6753   *stop = XINT (next);
6754   return buf;
6755 }
6756
6757
     /* Read characters from the source of CODING into the work area
        CODING->charbuf, for use by the encoder.

        Reading starts CODING->consumed bytes into CODING->source and
        stops at the end of the source text or when the work area is
        (nearly) full.  On the way, end-of-line conversion is applied,
        characters are mapped through TRANSLATION_TABLE (unless it is
        nil), and `charset' annotation data are interleaved with the
        characters.  MAX_LOOKUP is the number of characters gathered
        for one get_translation lookup.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated, and CODING->chars_at_source
        is cleared.  */
6758 static void
6759 consume_chars (coding, translation_table, max_lookup)
6760      struct coding_system *coding;
6761      Lisp_Object translation_table;
6762      int max_lookup;
6763 {
6764   int *buf = coding->charbuf;
6765   int *buf_end = coding->charbuf + coding->charbuf_size;
6766   const unsigned char *src = coding->source + coding->consumed;
6767   const unsigned char *src_end = coding->source + coding->src_bytes;
6768   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6769   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6770   int multibytep = coding->src_multibyte;
6771   Lisp_Object eol_type;
6772   int c;
6773   EMACS_INT stop, stop_composition, stop_charset;
6774   int *lookup_buf = NULL;
6775
       /* The lookup buffer is needed only when there is a translation
          table.  */
6776   if (! NILP (translation_table))
6777     lookup_buf = alloca (sizeof (int) * max_lookup);
6778
       /* An EOL type that is a vector is handled like `unix', i.e. no
          end-of-line conversion is done.  */
6779   eol_type = CODING_ID_EOL_TYPE (coding->id);
6780   if (VECTORP (eol_type))
6781     eol_type = Qunix;
6782
6783   /* Note: composition handling is not yet implemented.  */
6784   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6785
       /* STOP is the next position at which annotations must be looked
          at.  Because the composition flag was just cleared,
          stop_composition always starts out as END_POS here.  */
6786   if (NILP (coding->src_object))
6787     stop = stop_composition = stop_charset = end_pos;
6788   else
6789     {
6790       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6791         stop = stop_composition = pos;
6792       else
6793         stop = stop_composition = end_pos;
6794       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6795         stop = stop_charset = pos;
6796       else
6797         stop_charset = end_pos;
6798     }
6799
6800   /* Compensate for CRLF and conversion.  */
       /* That is: keep room for the extra '\r' of a DOS end-of-line and
          for one annotation datum of MAX_ANNOTATION_LENGTH elements.  */
6801   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6802   while (buf < buf_end)
6803     {
6804       Lisp_Object trans;
6805
6806       if (pos == stop)
6807         {
6808           if (pos == end_pos)
6809             break;
6810           if (pos == stop_composition)
6811             buf = handle_composition_annotation (pos, end_pos, coding,
6812                                                  buf, &stop_composition);
6813           if (pos == stop_charset)
6814             buf = handle_charset_annotation (pos, end_pos, coding,
6815                                              buf, &stop_charset);
               /* Next stop: whichever annotation boundary comes first.  */
6816           stop = (stop_composition < stop_charset
6817                   ? stop_composition : stop_charset);
6818         }
6819
           /* Fetch the next character C, advancing SRC and POS.  */
6820       if (! multibytep)
6821         {
6822           EMACS_INT bytes;
6823
               /* Unibyte source.  For raw-text every byte is taken as
                  it is.  Otherwise a valid multibyte sequence
                  (MULTIBYTE_LENGTH > 0) is read as one character and
                  POS advances by its byte length, and any other byte
                  is converted with BYTE8_TO_CHAR.  */
6824           if (coding->encoder == encode_coding_raw_text)
6825             c = *src++, pos++;
6826           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6827             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6828           else
6829             c = BYTE8_TO_CHAR (*src), src++, pos++;
6830         }
6831       else
6832         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* With selective display, '\r' is treated as a newline.  */
6833       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6834         c = '\n';
           /* End-of-line conversion: for `dos' a '\r' is stored in front
              of the '\n'; for any other non-unix type (presumably `mac')
              the '\n' itself becomes '\r'.  */
6835       if (! EQ (eol_type, Qunix))
6836         {
6837           if (c == '\n')
6838             {
6839               if (EQ (eol_type, Qdos))
6840                 *buf++ = '\r';
6841               else
6842                 c = '\r';
6843             }
6844         }
6845
6846       trans = Qnil;
6847       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6848       if (NILP (trans))
6849         *buf++ = c;
6850       else
6851         {
6852           int from_nchars = 1, to_nchars = 1;
6853           int *lookup_buf_end;
6854           const unsigned char *p = src;
6855           int i;
6856
               /* Gather C and up to MAX_LOOKUP - 1 following source
                  characters (read through P, so SRC is not advanced)
                  for get_translation.  */
6857           lookup_buf[0] = c;
6858           for (i = 1; i < max_lookup && p < src_end; i++)
6859             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6860           lookup_buf_end = lookup_buf + i;
6861           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6862                                    &from_nchars, &to_nchars);
               /* Stop when get_translation returns Qt or when the
                  translated characters do not fit in the work area.
                  NOTE(review): SRC and POS have already moved past C
                  here (and a '\r' may already have been stored for a
                  DOS end-of-line), yet C itself is not stored --
                  confirm that this cannot drop a character.  */
6863           if (EQ (trans, Qt)
6864               || buf + to_nchars > buf_end)
6865             break;
               /* The first output character is taken from lookup_buf[0]
                  (presumably updated by get_translation); the rest come
                  from the vector TRANS.  */
6866           *buf++ = *lookup_buf;
6867           for (i = 1; i < to_nchars; i++)
6868             *buf++ = XINT (AREF (trans, i));
               /* Skip the additional source characters covered by this
                  translation.  */
6869           for (i = 1; i < from_nchars; i++, pos++)
6870             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6871         }
6872     }
6873
       /* Publish how far we got.  */
6874   coding->consumed = src - coding->source;
6875   coding->consumed_char = pos - coding->src_pos;
6876   coding->charbuf_used = buf - coding->charbuf;
6877   coding->chars_at_source = 0;
6878 }
6879
6880
6881 /* Encode the text at CODING->src_object into CODING->dst_object.
6882    CODING->src_object is a buffer or a string.
6883    CODING->dst_object is a buffer or nil.
6884
6885    If CODING->src_object is a buffer, it must be the current buffer.
6886    In this case, if CODING->src_pos is positive, it is a position of
6887    the source text in the buffer, otherwise. the source text is in the
6888    gap area of the buffer, and coding->src_pos specifies the offset of
6889    the text from GPT (which must be the same as PT).  If this is the
6890    same buffer as CODING->dst_object, CODING->src_pos must be
6891    negative and CODING should not have `pre-write-conversion'.
6892
6893    If CODING->src_object is a string, CODING should not have
6894    `pre-write-conversion'.
6895
6896    If CODING->dst_object is a buffer, the encoded data is inserted at
6897    the current point of that buffer.
6898
6899    If CODING->dst_object is nil, the encoded data is placed at the
6900    memory area specified by CODING->destination.  */
6901
6902 static int
6903 encode_coding (coding)
6904      struct coding_system *coding;
6905 {
6906   Lisp_Object attrs;
6907   Lisp_Object translation_table;
6908   int max_lookup;
6909
6910   attrs = CODING_ID_ATTRS (coding->id);
6911   if (coding->encoder == encode_coding_raw_text)
6912     translation_table = Qnil, max_lookup = 0;
6913   else
6914     translation_table = get_translation_table (attrs, 1, &max_lookup);
6915
6916   if (BUFFERP (coding->dst_object))
6917     {
6918       set_buffer_internal (XBUFFER (coding->dst_object));
6919       coding->dst_multibyte
6920         = ! NILP (current_buffer->enable_multibyte_characters);
6921     }
6922
6923   coding->consumed = coding->consumed_char = 0;
6924   coding->produced = coding->produced_char = 0;
6925   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6926   coding->errors = 0;
6927
6928   ALLOC_CONVERSION_WORK_AREA (coding);
6929
6930   do {
6931     coding_set_source (coding);
6932     consume_chars (coding, translation_table, max_lookup);
6933     coding_set_destination (coding);
6934     (*(coding->encoder)) (coding);
6935   } while (coding->consumed_char < coding->src_chars);
6936
6937   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6938     insert_from_gap (coding->produced_char, coding->produced);
6939
6940   return (coding->result);
6941 }
6942
6943
6944 /* Name (or base name) of work buffer for code conversion.  */
6945 static Lisp_Object Vcode_conversion_workbuf_name;
6946
6947 /* A working buffer used by the top level conversion.  Once it is
6948    created, it is never destroyed.  It has the name
6949    Vcode_conversion_workbuf_name.  The other working buffers are
6950    destroyed after the use is finished, and their names are modified
6951    versions of Vcode_conversion_workbuf_name.  */
6952 static Lisp_Object Vcode_conversion_reused_workbuf;
6953
6954 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        It is incremented by make_conversion_work_buffer on every
        request and reset to 0 by code_conversion_restore when the
        reused buffer is released.  */
6955 static int reused_workbuf_in_use;
6956
6957
6958 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6959    multibyteness of returning buffer.  */
6960
6961 static Lisp_Object
6962 make_conversion_work_buffer (multibyte)
6963      int multibyte;
6964 {
6965   Lisp_Object name, workbuf;
6966   struct buffer *current;
6967
6968   if (reused_workbuf_in_use++)
6969     {
6970       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6971       workbuf = Fget_buffer_create (name);
6972     }
6973   else
6974     {
6975       if (NILP (Vcode_conversion_reused_workbuf))
6976         Vcode_conversion_reused_workbuf
6977           = Fget_buffer_create (Vcode_conversion_workbuf_name);
6978       workbuf = Vcode_conversion_reused_workbuf;
6979     }
6980   current = current_buffer;
6981   set_buffer_internal (XBUFFER (workbuf));
6982   Ferase_buffer ();
6983   current_buffer->undo_list = Qt;
6984   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6985   set_buffer_internal (current);
6986   return workbuf;
6987 }
6988
6989
6990 static Lisp_Object
6991 code_conversion_restore (arg)
6992      Lisp_Object arg;
6993 {
6994   Lisp_Object current, workbuf;
6995   struct gcpro gcpro1;
6996
6997   GCPRO1 (arg);
6998   current = XCAR (arg);
6999   workbuf = XCDR (arg);
7000   if (! NILP (workbuf))
7001     {
7002       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7003         reused_workbuf_in_use = 0;
7004       else if (! NILP (Fbuffer_live_p (workbuf)))
7005         Fkill_buffer (workbuf);
7006     }
7007   set_buffer_internal (XBUFFER (current));
7008   UNGCPRO;
7009   return Qnil;
7010 }
7011
7012 Lisp_Object
7013 code_conversion_save (with_work_buf, multibyte)
7014      int with_work_buf, multibyte;
7015 {
7016   Lisp_Object workbuf = Qnil;
7017
7018   if (with_work_buf)
7019     workbuf = make_conversion_work_buffer (multibyte);
7020   record_unwind_protect (code_conversion_restore,
7021                          Fcons (Fcurrent_buffer (), workbuf));
7022   return workbuf;
7023 }
7024
7025 int
7026 decode_coding_gap (coding, chars, bytes)
7027      struct coding_system *coding;
7028      EMACS_INT chars, bytes;
7029 {
7030   int count = specpdl_ptr - specpdl;
7031   Lisp_Object attrs;
7032
7033   code_conversion_save (0, 0);
7034
7035   coding->src_object = Fcurrent_buffer ();
7036   coding->src_chars = chars;
7037   coding->src_bytes = bytes;
7038   coding->src_pos = -chars;
7039   coding->src_pos_byte = -bytes;
7040   coding->src_multibyte = chars < bytes;
7041   coding->dst_object = coding->src_object;
7042   coding->dst_pos = PT;
7043   coding->dst_pos_byte = PT_BYTE;
7044   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7045
7046   if (CODING_REQUIRE_DETECTION (coding))
7047     detect_coding (coding);
7048
7049   coding->mode |= CODING_MODE_LAST_BLOCK;
7050   current_buffer->text->inhibit_shrinking = 1;
7051   decode_coding (coding);
7052   current_buffer->text->inhibit_shrinking = 0;
7053
7054   attrs = CODING_ID_ATTRS (coding->id);
7055   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7056     {
7057       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7058       Lisp_Object val;
7059
7060       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7061       val = call1 (CODING_ATTR_POST_READ (attrs),
7062                    make_number (coding->produced_char));
7063       CHECK_NATNUM (val);
7064       coding->produced_char += Z - prev_Z;
7065       coding->produced += Z_BYTE - prev_Z_BYTE;
7066     }
7067
7068   unbind_to (count, Qnil);
7069   return coding->result;
7070 }
7071
7072 int
7073 encode_coding_gap (coding, chars, bytes)
7074      struct coding_system *coding;
7075      EMACS_INT chars, bytes;
7076 {
7077   int count = specpdl_ptr - specpdl;
7078
7079   code_conversion_save (0, 0);
7080
7081   coding->src_object = Fcurrent_buffer ();
7082   coding->src_chars = chars;
7083   coding->src_bytes = bytes;
7084   coding->src_pos = -chars;
7085   coding->src_pos_byte = -bytes;
7086   coding->src_multibyte = chars < bytes;
7087   coding->dst_object = coding->src_object;
7088   coding->dst_pos = PT;
7089   coding->dst_pos_byte = PT_BYTE;
7090
7091   encode_coding (coding);
7092
7093   unbind_to (count, Qnil);
7094   return coding->result;
7095 }
7096
7097
7098 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7099    SRC_OBJECT into DST_OBJECT by coding context CODING.
7100
7101    SRC_OBJECT is a buffer, a string, or Qnil.
7102
7103    If it is a buffer, the text is at point of the buffer.  FROM and TO
7104    are positions in the buffer.
7105
7106    If it is a string, the text is at the beginning of the string.
7107    FROM and TO are indices to the string.
7108
7109    If it is nil, the text is at coding->source.  FROM and TO are
7110    indices to coding->source.
7111
7112    DST_OBJECT is a buffer, Qt, or Qnil.
7113
7114    If it is a buffer, the decoded text is inserted at point of the
7115    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7116    is deleted.
7117
7118    If it is Qt, a string is made from the decoded text, and
7119    set in CODING->dst_object.
7120
7121    If it is Qnil, the decoded text is stored at CODING->destination.
7122    The caller must allocate CODING->dst_bytes bytes at
7123    CODING->destination by xmalloc.  If the decoded text is longer than
7124    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7125  */
7126
7127 void
7128 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7129                       dst_object)
7130      struct coding_system *coding;
7131      Lisp_Object src_object;
7132      EMACS_INT from, from_byte, to, to_byte;
7133      Lisp_Object dst_object;
7134 {
7135   int count = specpdl_ptr - specpdl;
7136   unsigned char *destination;
7137   EMACS_INT dst_bytes;
7138   EMACS_INT chars = to - from;
7139   EMACS_INT bytes = to_byte - from_byte;
7140   Lisp_Object attrs;
7141   int saved_pt = -1, saved_pt_byte;
7142   int need_marker_adjustment = 0;
7143   Lisp_Object old_deactivate_mark;
7144
7145   old_deactivate_mark = Vdeactivate_mark;
7146
7147   if (NILP (dst_object))
7148     {
7149       destination = coding->destination;
7150       dst_bytes = coding->dst_bytes;
7151     }
7152
7153   coding->src_object = src_object;
7154   coding->src_chars = chars;
7155   coding->src_bytes = bytes;
7156   coding->src_multibyte = chars < bytes;
7157
7158   if (STRINGP (src_object))
7159     {
7160       coding->src_pos = from;
7161       coding->src_pos_byte = from_byte;
7162     }
7163   else if (BUFFERP (src_object))
7164     {
7165       set_buffer_internal (XBUFFER (src_object));
7166       if (from != GPT)
7167         move_gap_both (from, from_byte);
7168       if (EQ (src_object, dst_object))
7169         {
7170           struct Lisp_Marker *tail;
7171
7172           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7173             {
7174               tail->need_adjustment
7175                 = tail->charpos == (tail->insertion_type ? from : to);
7176               need_marker_adjustment |= tail->need_adjustment;
7177             }
7178           saved_pt = PT, saved_pt_byte = PT_BYTE;
7179           TEMP_SET_PT_BOTH (from, from_byte);
7180           current_buffer->text->inhibit_shrinking = 1;
7181           del_range_both (from, from_byte, to, to_byte, 1);
7182           coding->src_pos = -chars;
7183           coding->src_pos_byte = -bytes;
7184         }
7185       else
7186         {
7187           coding->src_pos = from;
7188           coding->src_pos_byte = from_byte;
7189         }
7190     }
7191
7192   if (CODING_REQUIRE_DETECTION (coding))
7193     detect_coding (coding);
7194   attrs = CODING_ID_ATTRS (coding->id);
7195
7196   if (EQ (dst_object, Qt)
7197       || (! NILP (CODING_ATTR_POST_READ (attrs))
7198           && NILP (dst_object)))
7199     {
7200       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7201       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7202       coding->dst_pos = BEG;
7203       coding->dst_pos_byte = BEG_BYTE;
7204     }
7205   else if (BUFFERP (dst_object))
7206     {
7207       code_conversion_save (0, 0);
7208       coding->dst_object = dst_object;
7209       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7210       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7211       coding->dst_multibyte
7212         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7213     }
7214   else
7215     {
7216       code_conversion_save (0, 0);
7217       coding->dst_object = Qnil;
7218       /* Most callers presume this will return a multibyte result, and they
7219          won't use `binary' or `raw-text' anyway, so let's not worry about
7220          CODING_FOR_UNIBYTE.  */
7221       coding->dst_multibyte = 1;
7222     }
7223
7224   decode_coding (coding);
7225
7226   if (BUFFERP (coding->dst_object))
7227     set_buffer_internal (XBUFFER (coding->dst_object));
7228
7229   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7230     {
7231       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7232       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7233       Lisp_Object val;
7234
7235       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7236       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7237               old_deactivate_mark);
7238       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7239                         make_number (coding->produced_char));
7240       UNGCPRO;
7241       CHECK_NATNUM (val);
7242       coding->produced_char += Z - prev_Z;
7243       coding->produced += Z_BYTE - prev_Z_BYTE;
7244     }
7245
7246   if (EQ (dst_object, Qt))
7247     {
7248       coding->dst_object = Fbuffer_string ();
7249     }
7250   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7251     {
7252       set_buffer_internal (XBUFFER (coding->dst_object));
7253       if (dst_bytes < coding->produced)
7254         {
7255           destination = xrealloc (destination, coding->produced);
7256           if (! destination)
7257             {
7258               record_conversion_result (coding,
7259                                         CODING_RESULT_INSUFFICIENT_DST);
7260               unbind_to (count, Qnil);
7261               return;
7262             }
7263           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7264             move_gap_both (BEGV, BEGV_BYTE);
7265           bcopy (BEGV_ADDR, destination, coding->produced);
7266           coding->destination = destination;
7267         }
7268     }
7269
7270   if (saved_pt >= 0)
7271     {
7272       /* This is the case of:
7273          (BUFFERP (src_object) && EQ (src_object, dst_object))
7274          As we have moved PT while replacing the original buffer
7275          contents, we must recover it now.  */
7276       set_buffer_internal (XBUFFER (src_object));
7277       current_buffer->text->inhibit_shrinking = 0;
7278       if (saved_pt < from)
7279         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7280       else if (saved_pt < from + chars)
7281         TEMP_SET_PT_BOTH (from, from_byte);
7282       else if (! NILP (current_buffer->enable_multibyte_characters))
7283         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7284                           saved_pt_byte + (coding->produced - bytes));
7285       else
7286         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7287                           saved_pt_byte + (coding->produced - bytes));
7288
7289       if (need_marker_adjustment)
7290         {
7291           struct Lisp_Marker *tail;
7292
7293           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7294             if (tail->need_adjustment)
7295               {
7296                 tail->need_adjustment = 0;
7297                 if (tail->insertion_type)
7298                   {
7299                     tail->bytepos = from_byte;
7300                     tail->charpos = from;
7301                   }
7302                 else
7303                   {
7304                     tail->bytepos = from_byte + coding->produced;
7305                     tail->charpos
7306                       = (NILP (current_buffer->enable_multibyte_characters)
7307                          ? tail->bytepos : from + coding->produced_char);
7308                   }
7309               }
7310         }
7311     }
7312
7313   Vdeactivate_mark = old_deactivate_mark;
7314   unbind_to (count, coding->dst_object);
7315 }
7316
7317
7318 void
7319 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7320                       dst_object)
7321      struct coding_system *coding;
7322      Lisp_Object src_object;
7323      EMACS_INT from, from_byte, to, to_byte;
7324      Lisp_Object dst_object;
7325 {
7326   int count = specpdl_ptr - specpdl;
7327   EMACS_INT chars = to - from;
7328   EMACS_INT bytes = to_byte - from_byte;
7329   Lisp_Object attrs;
7330   int saved_pt = -1, saved_pt_byte;
7331   int need_marker_adjustment = 0;
7332   int kill_src_buffer = 0;
7333   Lisp_Object old_deactivate_mark;
7334
7335   old_deactivate_mark = Vdeactivate_mark;
7336
7337   coding->src_object = src_object;
7338   coding->src_chars = chars;
7339   coding->src_bytes = bytes;
7340   coding->src_multibyte = chars < bytes;
7341
7342   attrs = CODING_ID_ATTRS (coding->id);
7343
7344   if (EQ (src_object, dst_object))
7345     {
7346       struct Lisp_Marker *tail;
7347
7348       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7349         {
7350           tail->need_adjustment
7351             = tail->charpos == (tail->insertion_type ? from : to);
7352           need_marker_adjustment |= tail->need_adjustment;
7353         }
7354     }
7355
7356   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7357     {
7358       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7359       set_buffer_internal (XBUFFER (coding->src_object));
7360       if (STRINGP (src_object))
7361         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7362       else if (BUFFERP (src_object))
7363         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7364       else
7365         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7366
7367       if (EQ (src_object, dst_object))
7368         {
7369           set_buffer_internal (XBUFFER (src_object));
7370           saved_pt = PT, saved_pt_byte = PT_BYTE;
7371           del_range_both (from, from_byte, to, to_byte, 1);
7372           set_buffer_internal (XBUFFER (coding->src_object));
7373         }
7374
7375       {
7376         Lisp_Object args[3];
7377         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7378
7379         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7380                 old_deactivate_mark);
7381         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7382         args[1] = make_number (BEG);
7383         args[2] = make_number (Z);
7384         safe_call (3, args);
7385         UNGCPRO;
7386       }
7387       if (XBUFFER (coding->src_object) != current_buffer)
7388         kill_src_buffer = 1;
7389       coding->src_object = Fcurrent_buffer ();
7390       if (BEG != GPT)
7391         move_gap_both (BEG, BEG_BYTE);
7392       coding->src_chars = Z - BEG;
7393       coding->src_bytes = Z_BYTE - BEG_BYTE;
7394       coding->src_pos = BEG;
7395       coding->src_pos_byte = BEG_BYTE;
7396       coding->src_multibyte = Z < Z_BYTE;
7397     }
7398   else if (STRINGP (src_object))
7399     {
7400       code_conversion_save (0, 0);
7401       coding->src_pos = from;
7402       coding->src_pos_byte = from_byte;
7403     }
7404   else if (BUFFERP (src_object))
7405     {
7406       code_conversion_save (0, 0);
7407       set_buffer_internal (XBUFFER (src_object));
7408       if (EQ (src_object, dst_object))
7409         {
7410           saved_pt = PT, saved_pt_byte = PT_BYTE;
7411           coding->src_object = del_range_1 (from, to, 1, 1);
7412           coding->src_pos = 0;
7413           coding->src_pos_byte = 0;
7414         }
7415       else
7416         {
7417           if (from < GPT && to >= GPT)
7418             move_gap_both (from, from_byte);
7419           coding->src_pos = from;
7420           coding->src_pos_byte = from_byte;
7421         }
7422     }
7423   else
7424     code_conversion_save (0, 0);
7425
7426   if (BUFFERP (dst_object))
7427     {
7428       coding->dst_object = dst_object;
7429       if (EQ (src_object, dst_object))
7430         {
7431           coding->dst_pos = from;
7432           coding->dst_pos_byte = from_byte;
7433         }
7434       else
7435         {
7436           struct buffer *current = current_buffer;
7437
7438           set_buffer_temp (XBUFFER (dst_object));
7439           coding->dst_pos = PT;
7440           coding->dst_pos_byte = PT_BYTE;
7441           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7442           set_buffer_temp (current);
7443         }
7444       coding->dst_multibyte
7445         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7446     }
7447   else if (EQ (dst_object, Qt))
7448     {
7449       coding->dst_object = Qnil;
7450       coding->dst_bytes = coding->src_chars;
7451       if (coding->dst_bytes == 0)
7452         coding->dst_bytes = 1;
7453       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7454       coding->dst_multibyte = 0;
7455     }
7456   else
7457     {
7458       coding->dst_object = Qnil;
7459       coding->dst_multibyte = 0;
7460     }
7461
7462   encode_coding (coding);
7463
7464   if (EQ (dst_object, Qt))
7465     {
7466       if (BUFFERP (coding->dst_object))
7467         coding->dst_object = Fbuffer_string ();
7468       else
7469         {
7470           coding->dst_object
7471             = make_unibyte_string ((char *) coding->destination,
7472                                    coding->produced);
7473           xfree (coding->destination);
7474         }
7475     }
7476
7477   if (saved_pt >= 0)
7478     {
7479       /* This is the case of:
7480          (BUFFERP (src_object) && EQ (src_object, dst_object))
7481          As we have moved PT while replacing the original buffer
7482          contents, we must recover it now.  */
7483       set_buffer_internal (XBUFFER (src_object));
7484       if (saved_pt < from)
7485         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7486       else if (saved_pt < from + chars)
7487         TEMP_SET_PT_BOTH (from, from_byte);
7488       else if (! NILP (current_buffer->enable_multibyte_characters))
7489         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7490                           saved_pt_byte + (coding->produced - bytes));
7491       else
7492         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7493                           saved_pt_byte + (coding->produced - bytes));
7494
7495       if (need_marker_adjustment)
7496         {
7497           struct Lisp_Marker *tail;
7498
7499           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7500             if (tail->need_adjustment)
7501               {
7502                 tail->need_adjustment = 0;
7503                 if (tail->insertion_type)
7504                   {
7505                     tail->bytepos = from_byte;
7506                     tail->charpos = from;
7507                   }
7508                 else
7509                   {
7510                     tail->bytepos = from_byte + coding->produced;
7511                     tail->charpos
7512                       = (NILP (current_buffer->enable_multibyte_characters)
7513                          ? tail->bytepos : from + coding->produced_char);
7514                   }
7515               }
7516         }
7517     }
7518
7519   if (kill_src_buffer)
7520     Fkill_buffer (coding->src_object);
7521
7522   Vdeactivate_mark = old_deactivate_mark;
7523   unbind_to (count, Qnil);
7524 }
7525
7526
7527 Lisp_Object
7528 preferred_coding_system ()
7529 {
7530   int id = coding_categories[coding_priorities[0]].id;
7531
7532   return CODING_ID_NAME (id);
7533 }
7534
7535 \f
7536 #ifdef emacs
7537 /*** 11. Emacs Lisp library functions ***/
7538
7539 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7540        doc: /* Return t if OBJECT is nil or a coding-system.
7541 See the documentation of `define-coding-system' for information
7542 about coding-system objects.  */)
7543      (object)
7544      Lisp_Object object;
7545 {
7546   if (NILP (object)
7547       || CODING_SYSTEM_ID (object) >= 0)
7548     return Qt;
7549   if (! SYMBOLP (object)
7550       || NILP (Fget (object, Qcoding_system_define_form)))
7551     return Qnil;
7552   return Qt;
7553 }
7554
7555 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7556        Sread_non_nil_coding_system, 1, 1, 0,
7557        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7558      (prompt)
7559      Lisp_Object prompt;
7560 {
7561   Lisp_Object val;
7562   do
7563     {
7564       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7565                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7566     }
7567   while (SCHARS (val) == 0);
7568   return (Fintern (val, Qnil));
7569 }
7570
7571 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7572        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7573 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7574 Ignores case when completing coding systems (all Emacs coding systems
7575 are lower-case).  */)
7576      (prompt, default_coding_system)
7577      Lisp_Object prompt, default_coding_system;
7578 {
7579   Lisp_Object val;
7580   int count = SPECPDL_INDEX ();
7581
7582   if (SYMBOLP (default_coding_system))
7583     default_coding_system = SYMBOL_NAME (default_coding_system);
7584   specbind (Qcompletion_ignore_case, Qt);
7585   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7586                           Qt, Qnil, Qcoding_system_history,
7587                           default_coding_system, Qnil);
7588   unbind_to (count, Qnil);
7589   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7590 }
7591
7592 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7593        1, 1, 0,
7594        doc: /* Check validity of CODING-SYSTEM.
7595 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7596 It is valid if it is nil or a symbol defined as a coding system by the
7597 function `define-coding-system'.  */)
7598   (coding_system)
7599      Lisp_Object coding_system;
7600 {
7601   Lisp_Object define_form;
7602
7603   define_form = Fget (coding_system, Qcoding_system_define_form);
7604   if (! NILP (define_form))
7605     {
7606       Fput (coding_system, Qcoding_system_define_form, Qnil);
7607       safe_eval (define_form);
7608     }
7609   if (!NILP (Fcoding_system_p (coding_system)))
7610     return coding_system;
7611   xsignal1 (Qcoding_system_error, coding_system);
7612 }
7613
7614 \f
7615 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7616    HIGHEST is nonzero, return the coding system of the highest
7617    priority among the detected coding systems.  Otherwize return a
7618    list of detected coding systems sorted by their priorities.  If
7619    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7620    multibyte form but contains only ASCII and eight-bit chars.
7621    Otherwise, the bytes are raw bytes.
7622
7623    CODING-SYSTEM controls the detection as below:
7624
7625    If it is nil, detect both text-format and eol-format.  If the
7626    text-format part of CODING-SYSTEM is already specified
7627    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7628    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7629    detect only text-format.  */
7630
7631 Lisp_Object
7632 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7633                       coding_system)
7634      const unsigned char *src;
7635      EMACS_INT src_chars, src_bytes;
7636      int highest;
7637      int multibytep;
7638      Lisp_Object coding_system;
7639 {
7640   const unsigned char *src_end = src + src_bytes;
7641   Lisp_Object attrs, eol_type;
7642   Lisp_Object val;
7643   struct coding_system coding;
7644   int id;
7645   struct coding_detection_info detect_info;
7646   enum coding_category base_category;
7647   int null_byte_found = 0, eight_bit_found = 0;
7648
7649   if (NILP (coding_system))
7650     coding_system = Qundecided;
7651   setup_coding_system (coding_system, &coding);
7652   attrs = CODING_ID_ATTRS (coding.id);
7653   eol_type = CODING_ID_EOL_TYPE (coding.id);
7654   coding_system = CODING_ATTR_BASE_NAME (attrs);
7655
7656   coding.source = src;
7657   coding.src_chars = src_chars;
7658   coding.src_bytes = src_bytes;
7659   coding.src_multibyte = multibytep;
7660   coding.consumed = 0;
7661   coding.mode |= CODING_MODE_LAST_BLOCK;
7662   coding.head_ascii = 0;
7663
7664   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7665
7666   /* At first, detect text-format if necessary.  */
7667   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7668   if (base_category == coding_category_undecided)
7669     {
7670       enum coding_category category;
7671       struct coding_system *this;
7672       int c, i;
7673
7674       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7675       for (; src < src_end; src++)
7676         {
7677           c = *src;
7678           if (c & 0x80)
7679             {
7680               eight_bit_found = 1;
7681               if (null_byte_found)
7682                 break;
7683             }
7684           else if (c < 0x20)
7685             {
7686               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7687                   && ! inhibit_iso_escape_detection
7688                   && ! detect_info.checked)
7689                 {
7690                   if (detect_coding_iso_2022 (&coding, &detect_info))
7691                     {
7692                       /* We have scanned the whole data.  */
7693                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7694                         {
7695                           /* We didn't find an 8-bit code.  We may
7696                              have found a null-byte, but it's very
7697                              rare that a binary file confirm to
7698                              ISO-2022.  */
7699                           src = src_end;
7700                           coding.head_ascii = src - coding.source;
7701                         }
7702                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
7703                       break;
7704                     }
7705                 }
7706               else if (! c)
7707                 {
7708                   null_byte_found = 1;
7709                   if (eight_bit_found)
7710                     break;
7711                 }
7712               if (! eight_bit_found)
7713                 coding.head_ascii++;
7714             }
7715           else if (! eight_bit_found)
7716             coding.head_ascii++;
7717         }
7718
7719       if (null_byte_found || eight_bit_found
7720           || coding.head_ascii < coding.src_bytes
7721           || detect_info.found)
7722         {
7723           if (coding.head_ascii == coding.src_bytes)
7724             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7725             for (i = 0; i < coding_category_raw_text; i++)
7726               {
7727                 category = coding_priorities[i];
7728                 this = coding_categories + category;
7729                 if (detect_info.found & (1 << category))
7730                   break;
7731               }
7732           else
7733             {
7734               if (null_byte_found)
7735                 {
7736                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7737                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7738                 }
7739               for (i = 0; i < coding_category_raw_text; i++)
7740                 {
7741                   category = coding_priorities[i];
7742                   this = coding_categories + category;
7743
7744                   if (this->id < 0)
7745                     {
7746                       /* No coding system of this category is defined.  */
7747                       detect_info.rejected |= (1 << category);
7748                     }
7749                   else if (category >= coding_category_raw_text)
7750                     continue;
7751                   else if (detect_info.checked & (1 << category))
7752                     {
7753                       if (highest
7754                           && (detect_info.found & (1 << category)))
7755                         break;
7756                     }
7757                   else if ((*(this->detector)) (&coding, &detect_info)
7758                            && highest
7759                            && (detect_info.found & (1 << category)))
7760                     {
7761                       if (category == coding_category_utf_16_auto)
7762                         {
7763                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7764                             category = coding_category_utf_16_le;
7765                           else
7766                             category = coding_category_utf_16_be;
7767                         }
7768                       break;
7769                     }
7770                 }
7771             }
7772         }
7773
7774       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7775         {
7776           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7777           id = coding_categories[coding_category_raw_text].id;
7778           val = Fcons (make_number (id), Qnil);
7779         }
7780       else if (! detect_info.rejected && ! detect_info.found)
7781         {
7782           detect_info.found = CATEGORY_MASK_ANY;
7783           id = coding_categories[coding_category_undecided].id;
7784           val = Fcons (make_number (id), Qnil);
7785         }
7786       else if (highest)
7787         {
7788           if (detect_info.found)
7789             {
7790               detect_info.found = 1 << category;
7791               val = Fcons (make_number (this->id), Qnil);
7792             }
7793           else
7794             for (i = 0; i < coding_category_raw_text; i++)
7795               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7796                 {
7797                   detect_info.found = 1 << coding_priorities[i];
7798                   id = coding_categories[coding_priorities[i]].id;
7799                   val = Fcons (make_number (id), Qnil);
7800                   break;
7801                 }
7802         }
7803       else
7804         {
7805           int mask = detect_info.rejected | detect_info.found;
7806           int found = 0;
7807           val = Qnil;
7808
7809           for (i = coding_category_raw_text - 1; i >= 0; i--)
7810             {
7811               category = coding_priorities[i];
7812               if (! (mask & (1 << category)))
7813                 {
7814                   found |= 1 << category;
7815                   id = coding_categories[category].id;
7816                   if (id >= 0)
7817                     val = Fcons (make_number (id), val);
7818                 }
7819             }
7820           for (i = coding_category_raw_text - 1; i >= 0; i--)
7821             {
7822               category = coding_priorities[i];
7823               if (detect_info.found & (1 << category))
7824                 {
7825                   id = coding_categories[category].id;
7826                   val = Fcons (make_number (id), val);
7827                 }
7828             }
7829           detect_info.found |= found;
7830         }
7831     }
7832   else if (base_category == coding_category_utf_8_auto)
7833     {
7834       if (detect_coding_utf_8 (&coding, &detect_info))
7835         {
7836           struct coding_system *this;
7837
7838           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7839             this = coding_categories + coding_category_utf_8_sig;
7840           else
7841             this = coding_categories + coding_category_utf_8_nosig;
7842           val = Fcons (make_number (this->id), Qnil);
7843         }
7844     }
7845   else if (base_category == coding_category_utf_16_auto)
7846     {
7847       if (detect_coding_utf_16 (&coding, &detect_info))
7848         {
7849           struct coding_system *this;
7850
7851           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7852             this = coding_categories + coding_category_utf_16_le;
7853           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7854             this = coding_categories + coding_category_utf_16_be;
7855           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7856             this = coding_categories + coding_category_utf_16_be_nosig;
7857           else
7858             this = coding_categories + coding_category_utf_16_le_nosig;
7859           val = Fcons (make_number (this->id), Qnil);
7860         }
7861     }
7862   else
7863     {
7864       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7865       val = Fcons (make_number (coding.id), Qnil);
7866     }
7867
7868   /* Then, detect eol-format if necessary.  */
7869   {
7870     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7871     Lisp_Object tail;
7872
7873     if (VECTORP (eol_type))
7874       {
7875         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7876           {
7877             if (null_byte_found)
7878               normal_eol = EOL_SEEN_LF;
7879             else
7880               normal_eol = detect_eol (coding.source, src_bytes,
7881                                        coding_category_raw_text);
7882           }
7883         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7884                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7885           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7886                                       coding_category_utf_16_be);
7887         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7888                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7889           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7890                                       coding_category_utf_16_le);
7891       }
7892     else
7893       {
7894         if (EQ (eol_type, Qunix))
7895           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7896         else if (EQ (eol_type, Qdos))
7897           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7898         else
7899           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7900       }
7901
7902     for (tail = val; CONSP (tail); tail = XCDR (tail))
7903       {
7904         enum coding_category category;
7905         int this_eol;
7906
7907         id = XINT (XCAR (tail));
7908         attrs = CODING_ID_ATTRS (id);
7909         category = XINT (CODING_ATTR_CATEGORY (attrs));
7910         eol_type = CODING_ID_EOL_TYPE (id);
7911         if (VECTORP (eol_type))
7912           {
7913             if (category == coding_category_utf_16_be
7914                 || category == coding_category_utf_16_be_nosig)
7915               this_eol = utf_16_be_eol;
7916             else if (category == coding_category_utf_16_le
7917                      || category == coding_category_utf_16_le_nosig)
7918               this_eol = utf_16_le_eol;
7919             else
7920               this_eol = normal_eol;
7921
7922             if (this_eol == EOL_SEEN_LF)
7923               XSETCAR (tail, AREF (eol_type, 0));
7924             else if (this_eol == EOL_SEEN_CRLF)
7925               XSETCAR (tail, AREF (eol_type, 1));
7926             else if (this_eol == EOL_SEEN_CR)
7927               XSETCAR (tail, AREF (eol_type, 2));
7928             else
7929               XSETCAR (tail, CODING_ID_NAME (id));
7930           }
7931         else
7932           XSETCAR (tail, CODING_ID_NAME (id));
7933       }
7934   }
7935
7936   return (highest ? XCAR (val) : val);
7937 }
7938
7939
7940 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7941        2, 3, 0,
7942        doc: /* Detect coding system of the text in the region between START and END.
7943 Return a list of possible coding systems ordered by priority.
7944
7945 If only ASCII characters are found (except for such ISO-2022 control
7946 characters as ESC), it returns a list of single element `undecided'
7947 or its subsidiary coding system according to a detected end-of-line
7948 format.
7949
7950 If optional argument HIGHEST is non-nil, return the coding system of
7951 highest priority.  */)
7952      (start, end, highest)
7953      Lisp_Object start, end, highest;
7954 {
7955   int from, to;
7956   int from_byte, to_byte;
7957
7958   CHECK_NUMBER_COERCE_MARKER (start);
7959   CHECK_NUMBER_COERCE_MARKER (end);
7960
7961   validate_region (&start, &end);
7962   from = XINT (start), to = XINT (end);
7963   from_byte = CHAR_TO_BYTE (from);
7964   to_byte = CHAR_TO_BYTE (to);
7965
7966   if (from < GPT && to >= GPT)
7967     move_gap_both (to, to_byte);
7968
7969   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7970                                to - from, to_byte - from_byte,
7971                                !NILP (highest),
7972                                !NILP (current_buffer
7973                                       ->enable_multibyte_characters),
7974                                Qnil);
7975 }
7976
7977 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7978        1, 2, 0,
7979        doc: /* Detect coding system of the text in STRING.
7980 Return a list of possible coding systems ordered by priority.
7981
7982 If only ASCII characters are found (except for such ISO-2022 control
7983 characters as ESC), it returns a list of single element `undecided'
7984 or its subsidiary coding system according to a detected end-of-line
7985 format.
7986
7987 If optional argument HIGHEST is non-nil, return the coding system of
7988 highest priority.  */)
7989      (string, highest)
7990      Lisp_Object string, highest;
7991 {
7992   CHECK_STRING (string);
7993
7994   return detect_coding_system (SDATA (string),
7995                                SCHARS (string), SBYTES (string),
7996                                !NILP (highest), STRING_MULTIBYTE (string),
7997                                Qnil);
7998 }
7999
8000
8001 static INLINE int
8002 char_encodable_p (c, attrs)
8003      int c;
8004      Lisp_Object attrs;
8005 {
8006   Lisp_Object tail;
8007   struct charset *charset;
8008   Lisp_Object translation_table;
8009
8010   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8011   if (! NILP (translation_table))
8012     c = translate_char (translation_table, c);
8013   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8014        CONSP (tail); tail = XCDR (tail))
8015     {
8016       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8017       if (CHAR_CHARSET_P (c, charset))
8018         break;
8019     }
8020   return (! NILP (tail));
8021 }
8022
8023
8024 /* Return a list of coding systems that safely encode the text between
8025    START and END.  If EXCLUDE is non-nil, it is a list of coding
8026    systems not to check.  The returned list doesn't contain any such
8027    coding systems.  In any case, if the text contains only ASCII or is
8028    unibyte, return t.  */
8029
8030 DEFUN ("find-coding-systems-region-internal",
8031        Ffind_coding_systems_region_internal,
8032        Sfind_coding_systems_region_internal, 2, 3, 0,
8033        doc: /* Internal use only.  */)
8034      (start, end, exclude)
8035      Lisp_Object start, end, exclude;
8036 {
8037   Lisp_Object coding_attrs_list, safe_codings;
8038   EMACS_INT start_byte, end_byte;
8039   const unsigned char *p, *pbeg, *pend;
8040   int c;
8041   Lisp_Object tail, elt;
8042
8043   if (STRINGP (start))
8044     {
8045       if (!STRING_MULTIBYTE (start)
8046           || SCHARS (start) == SBYTES (start))
8047         return Qt;
8048       start_byte = 0;
8049       end_byte = SBYTES (start);
8050     }
8051   else
8052     {
8053       CHECK_NUMBER_COERCE_MARKER (start);
8054       CHECK_NUMBER_COERCE_MARKER (end);
8055       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8056         args_out_of_range (start, end);
8057       if (NILP (current_buffer->enable_multibyte_characters))
8058         return Qt;
8059       start_byte = CHAR_TO_BYTE (XINT (start));
8060       end_byte = CHAR_TO_BYTE (XINT (end));
8061       if (XINT (end) - XINT (start) == end_byte - start_byte)
8062         return Qt;
8063
8064       if (XINT (start) < GPT && XINT (end) > GPT)
8065         {
8066           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8067             move_gap_both (XINT (start), start_byte);
8068           else
8069             move_gap_both (XINT (end), end_byte);
8070         }
8071     }
8072
8073   coding_attrs_list = Qnil;
8074   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8075     if (NILP (exclude)
8076         || NILP (Fmemq (XCAR (tail), exclude)))
8077       {
8078         Lisp_Object attrs;
8079
8080         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8081         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8082             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8083           {
8084             ASET (attrs, coding_attr_trans_tbl,
8085                   get_translation_table (attrs, 1, NULL));
8086             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8087           }
8088       }
8089
8090   if (STRINGP (start))
8091     p = pbeg = SDATA (start);
8092   else
8093     p = pbeg = BYTE_POS_ADDR (start_byte);
8094   pend = p + (end_byte - start_byte);
8095
8096   while (p < pend && ASCII_BYTE_P (*p)) p++;
8097   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8098
8099   while (p < pend)
8100     {
8101       if (ASCII_BYTE_P (*p))
8102         p++;
8103       else
8104         {
8105           c = STRING_CHAR_ADVANCE (p);
8106
8107           charset_map_loaded = 0;
8108           for (tail = coding_attrs_list; CONSP (tail);)
8109             {
8110               elt = XCAR (tail);
8111               if (NILP (elt))
8112                 tail = XCDR (tail);
8113               else if (char_encodable_p (c, elt))
8114                 tail = XCDR (tail);
8115               else if (CONSP (XCDR (tail)))
8116                 {
8117                   XSETCAR (tail, XCAR (XCDR (tail)));
8118                   XSETCDR (tail, XCDR (XCDR (tail)));
8119                 }
8120               else
8121                 {
8122                   XSETCAR (tail, Qnil);
8123                   tail = XCDR (tail);
8124                 }
8125             }
8126           if (charset_map_loaded)
8127             {
8128               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8129
8130               if (STRINGP (start))
8131                 pbeg = SDATA (start);
8132               else
8133                 pbeg = BYTE_POS_ADDR (start_byte);
8134               p = pbeg + p_offset;
8135               pend = pbeg + pend_offset;
8136             }
8137         }
8138     }
8139
8140   safe_codings = list2 (Qraw_text, Qno_conversion);
8141   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8142     if (! NILP (XCAR (tail)))
8143       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8144
8145   return safe_codings;
8146 }
8147
8148
8149 DEFUN ("unencodable-char-position", Funencodable_char_position,
8150        Sunencodable_char_position, 3, 5, 0,
8151        doc: /*
8152 Return position of first un-encodable character in a region.
8153 START and END specify the region and CODING-SYSTEM specifies the
8154 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8155
8156 If optional 4th argument COUNT is non-nil, it specifies at most how
8157 many un-encodable characters to search.  In this case, the value is a
8158 list of positions.
8159
8160 If optional 5th argument STRING is non-nil, it is a string to search
8161 for un-encodable characters.  In that case, START and END are indexes
8162 to the string.  */)
8163      (start, end, coding_system, count, string)
8164      Lisp_Object start, end, coding_system, count, string;
8165 {
8166   int n;
8167   struct coding_system coding;
8168   Lisp_Object attrs, charset_list, translation_table;
8169   Lisp_Object positions;
8170   int from, to;
8171   const unsigned char *p, *stop, *pend;
8172   int ascii_compatible;
8173
8174   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8175   attrs = CODING_ID_ATTRS (coding.id);
8176   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8177     return Qnil;
8178   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8179   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8180   translation_table = get_translation_table (attrs, 1, NULL);
8181
8182   if (NILP (string))
8183     {
8184       validate_region (&start, &end);
8185       from = XINT (start);
8186       to = XINT (end);
8187       if (NILP (current_buffer->enable_multibyte_characters)
8188           || (ascii_compatible
8189               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8190         return Qnil;
8191       p = CHAR_POS_ADDR (from);
8192       pend = CHAR_POS_ADDR (to);
8193       if (from < GPT && to >= GPT)
8194         stop = GPT_ADDR;
8195       else
8196         stop = pend;
8197     }
8198   else
8199     {
8200       CHECK_STRING (string);
8201       CHECK_NATNUM (start);
8202       CHECK_NATNUM (end);
8203       from = XINT (start);
8204       to = XINT (end);
8205       if (from > to
8206           || to > SCHARS (string))
8207         args_out_of_range_3 (string, start, end);
8208       if (! STRING_MULTIBYTE (string))
8209         return Qnil;
8210       p = SDATA (string) + string_char_to_byte (string, from);
8211       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8212       if (ascii_compatible && (to - from) == (pend - p))
8213         return Qnil;
8214     }
8215
8216   if (NILP (count))
8217     n = 1;
8218   else
8219     {
8220       CHECK_NATNUM (count);
8221       n = XINT (count);
8222     }
8223
8224   positions = Qnil;
8225   while (1)
8226     {
8227       int c;
8228
8229       if (ascii_compatible)
8230         while (p < stop && ASCII_BYTE_P (*p))
8231           p++, from++;
8232       if (p >= stop)
8233         {
8234           if (p >= pend)
8235             break;
8236           stop = pend;
8237           p = GAP_END_ADDR;
8238         }
8239
8240       c = STRING_CHAR_ADVANCE (p);
8241       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8242           && ! char_charset (translate_char (translation_table, c),
8243                              charset_list, NULL))
8244         {
8245           positions = Fcons (make_number (from), positions);
8246           n--;
8247           if (n == 0)
8248             break;
8249         }
8250
8251       from++;
8252     }
8253
8254   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8255 }
8256
8257
8258 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8259        Scheck_coding_systems_region, 3, 3, 0,
8260        doc: /* Check if the region is encodable by coding systems.
8261
8262 START and END are buffer positions specifying the region.
8263 CODING-SYSTEM-LIST is a list of coding systems to check.
8264
8265 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8266 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8267 whole region, POS0, POS1, ... are buffer positions where non-encodable
8268 characters are found.
8269
8270 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8271 value is nil.
8272
8273 START may be a string.  In that case, check if the string is
8274 encodable, and the value contains indices to the string instead of
8275 buffer positions.  END is ignored.  */)
8276      (start, end, coding_system_list)
8277      Lisp_Object start, end, coding_system_list;
8278 {
8279   Lisp_Object list;
8280   EMACS_INT start_byte, end_byte;
8281   int pos;
8282   const unsigned char *p, *pbeg, *pend;
8283   int c;
8284   Lisp_Object tail, elt, attrs;
8285
8286   if (STRINGP (start))
8287     {
8288       if (!STRING_MULTIBYTE (start)
8289           && SCHARS (start) != SBYTES (start))
8290         return Qnil;
8291       start_byte = 0;
8292       end_byte = SBYTES (start);
8293       pos = 0;
8294     }
8295   else
8296     {
8297       CHECK_NUMBER_COERCE_MARKER (start);
8298       CHECK_NUMBER_COERCE_MARKER (end);
8299       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8300         args_out_of_range (start, end);
8301       if (NILP (current_buffer->enable_multibyte_characters))
8302         return Qnil;
8303       start_byte = CHAR_TO_BYTE (XINT (start));
8304       end_byte = CHAR_TO_BYTE (XINT (end));
8305       if (XINT (end) - XINT (start) == end_byte - start_byte)
8306         return Qt;
8307
8308       if (XINT (start) < GPT && XINT (end) > GPT)
8309         {
8310           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8311             move_gap_both (XINT (start), start_byte);
8312           else
8313             move_gap_both (XINT (end), end_byte);
8314         }
8315       pos = XINT (start);
8316     }
8317
8318   list = Qnil;
8319   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8320     {
8321       elt = XCAR (tail);
8322       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8323       ASET (attrs, coding_attr_trans_tbl,
8324             get_translation_table (attrs, 1, NULL));
8325       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8326     }
8327
8328   if (STRINGP (start))
8329     p = pbeg = SDATA (start);
8330   else
8331     p = pbeg = BYTE_POS_ADDR (start_byte);
8332   pend = p + (end_byte - start_byte);
8333
8334   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8335   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8336
8337   while (p < pend)
8338     {
8339       if (ASCII_BYTE_P (*p))
8340         p++;
8341       else
8342         {
8343           c = STRING_CHAR_ADVANCE (p);
8344
8345           charset_map_loaded = 0;
8346           for (tail = list; CONSP (tail); tail = XCDR (tail))
8347             {
8348               elt = XCDR (XCAR (tail));
8349               if (! char_encodable_p (c, XCAR (elt)))
8350                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8351             }
8352           if (charset_map_loaded)
8353             {
8354               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8355
8356               if (STRINGP (start))
8357                 pbeg = SDATA (start);
8358               else
8359                 pbeg = BYTE_POS_ADDR (start_byte);
8360               p = pbeg + p_offset;
8361               pend = pbeg + pend_offset;
8362             }
8363         }
8364       pos++;
8365     }
8366
8367   tail = list;
8368   list = Qnil;
8369   for (; CONSP (tail); tail = XCDR (tail))
8370     {
8371       elt = XCAR (tail);
8372       if (CONSP (XCDR (XCDR (elt))))
8373         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8374                       list);
8375     }
8376
8377   return list;
8378 }
8379
8380
8381 Lisp_Object
8382 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8383      Lisp_Object start, end, coding_system, dst_object;
8384      int encodep, norecord;
8385 {
8386   struct coding_system coding;
8387   EMACS_INT from, from_byte, to, to_byte;
8388   Lisp_Object src_object;
8389
8390   CHECK_NUMBER_COERCE_MARKER (start);
8391   CHECK_NUMBER_COERCE_MARKER (end);
8392   if (NILP (coding_system))
8393     coding_system = Qno_conversion;
8394   else
8395     CHECK_CODING_SYSTEM (coding_system);
8396   src_object = Fcurrent_buffer ();
8397   if (NILP (dst_object))
8398     dst_object = src_object;
8399   else if (! EQ (dst_object, Qt))
8400     CHECK_BUFFER (dst_object);
8401
8402   validate_region (&start, &end);
8403   from = XFASTINT (start);
8404   from_byte = CHAR_TO_BYTE (from);
8405   to = XFASTINT (end);
8406   to_byte = CHAR_TO_BYTE (to);
8407
8408   setup_coding_system (coding_system, &coding);
8409   coding.mode |= CODING_MODE_LAST_BLOCK;
8410
8411   if (encodep)
8412     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8413                           dst_object);
8414   else
8415     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8416                           dst_object);
8417   if (! norecord)
8418     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8419
8420   return (BUFFERP (dst_object)
8421           ? make_number (coding.produced_char)
8422           : coding.dst_object);
8423 }
8424
8425
8426 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8427        3, 4, "r\nzCoding system: ",
8428        doc: /* Decode the current region from the specified coding system.
8429 When called from a program, takes four arguments:
8430         START, END, CODING-SYSTEM, and DESTINATION.
8431 START and END are buffer positions.
8432
8433 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8434 If nil, the region between START and END is replaced by the decoded text.
8435 If buffer, the decoded text is inserted in the buffer.
8436 In those cases, the length of the decoded text is returned.
8437 If DESTINATION is t, the decoded text is returned.
8438
8439 This function sets `last-coding-system-used' to the precise coding system
8440 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8441 not fully specified.)  */)
8442      (start, end, coding_system, destination)
8443      Lisp_Object start, end, coding_system, destination;
8444 {
8445   return code_convert_region (start, end, coding_system, destination, 0, 0);
8446 }
8447
8448 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8449        3, 4, "r\nzCoding system: ",
8450        doc: /* Encode the current region by specified coding system.
8451 When called from a program, takes four arguments:
8452         START, END, CODING-SYSTEM and DESTINATION.
8453 START and END are buffer positions.
8454
8455 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8456 If nil, the region between START and END is replace by the encoded text.
8457 If buffer, the encoded text is inserted in the buffer.
8458 In those cases, the length of the encoded text is returned.
8459 If DESTINATION is t, the encoded text is returned.
8460
8461 This function sets `last-coding-system-used' to the precise coding system
8462 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8463 not fully specified.)  */)
8464   (start, end, coding_system, destination)
8465      Lisp_Object start, end, coding_system, destination;
8466 {
8467   return code_convert_region (start, end, coding_system, destination, 1, 0);
8468 }
8469
8470 Lisp_Object
8471 code_convert_string (string, coding_system, dst_object,
8472                      encodep, nocopy, norecord)
8473      Lisp_Object string, coding_system, dst_object;
8474      int encodep, nocopy, norecord;
8475 {
8476   struct coding_system coding;
8477   EMACS_INT chars, bytes;
8478
8479   CHECK_STRING (string);
8480   if (NILP (coding_system))
8481     {
8482       if (! norecord)
8483         Vlast_coding_system_used = Qno_conversion;
8484       if (NILP (dst_object))
8485         return (nocopy ? Fcopy_sequence (string) : string);
8486     }
8487
8488   if (NILP (coding_system))
8489     coding_system = Qno_conversion;
8490   else
8491     CHECK_CODING_SYSTEM (coding_system);
8492   if (NILP (dst_object))
8493     dst_object = Qt;
8494   else if (! EQ (dst_object, Qt))
8495     CHECK_BUFFER (dst_object);
8496
8497   setup_coding_system (coding_system, &coding);
8498   coding.mode |= CODING_MODE_LAST_BLOCK;
8499   chars = SCHARS (string);
8500   bytes = SBYTES (string);
8501   if (encodep)
8502     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8503   else
8504     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8505   if (! norecord)
8506     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8507
8508   return (BUFFERP (dst_object)
8509           ? make_number (coding.produced_char)
8510           : coding.dst_object);
8511 }
8512
8513
8514 /* Encode or decode STRING according to CODING_SYSTEM.
8515    Do not set Vlast_coding_system_used.
8516
8517    This function is called only from macros DECODE_FILE and
8518    ENCODE_FILE, thus we ignore character composition.  */
8519
8520 Lisp_Object
8521 code_convert_string_norecord (string, coding_system, encodep)
8522      Lisp_Object string, coding_system;
8523      int encodep;
8524 {
8525   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8526 }
8527
8528
8529 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8530        2, 4, 0,
8531        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8532
8533 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8534 if the decoding operation is trivial.
8535
8536 Optional fourth arg BUFFER non-nil means that the decoded text is
8537 inserted in BUFFER instead of returned as a string.  In this case,
8538 the return value is the length of the decoded text.
8539
8540 This function sets `last-coding-system-used' to the precise coding system
8541 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8542 not fully specified.)  */)
8543   (string, coding_system, nocopy, buffer)
8544      Lisp_Object string, coding_system, nocopy, buffer;
8545 {
8546   return code_convert_string (string, coding_system, buffer,
8547                               0, ! NILP (nocopy), 0);
8548 }
8549
8550 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8551        2, 4, 0,
8552        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8553
8554 Optional third arg NOCOPY non-nil means it is OK to return STRING
8555 itself if the encoding operation is trivial.
8556
8557 Optional fourth arg BUFFER non-nil means that the encoded text is
8558 inserted in BUFFER instead of returned as a string.  In this case,
8559 the return value is the length of the encoded text.
8560
8561 This function sets `last-coding-system-used' to the precise coding system
8562 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8563 not fully specified.)  */)
8564      (string, coding_system, nocopy, buffer)
8565      Lisp_Object string, coding_system, nocopy, buffer;
8566 {
8567   return code_convert_string (string, coding_system, buffer,
8568                               1, ! NILP (nocopy), 1);
8569 }
8570
8571 \f
8572 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8573        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8574 Return the corresponding character.  */)
8575      (code)
8576      Lisp_Object code;
8577 {
8578   Lisp_Object spec, attrs, val;
8579   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8580   int c;
8581
8582   CHECK_NATNUM (code);
8583   c = XFASTINT (code);
8584   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8585   attrs = AREF (spec, 0);
8586
8587   if (ASCII_BYTE_P (c)
8588       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8589     return code;
8590
8591   val = CODING_ATTR_CHARSET_LIST (attrs);
8592   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8593   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8594   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8595
8596   if (c <= 0x7F)
8597     charset = charset_roman;
8598   else if (c >= 0xA0 && c < 0xDF)
8599     {
8600       charset = charset_kana;
8601       c -= 0x80;
8602     }
8603   else
8604     {
8605       int s1 = c >> 8, s2 = c & 0xFF;
8606
8607       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8608           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8609         error ("Invalid code: %d", code);
8610       SJIS_TO_JIS (c);
8611       charset = charset_kanji;
8612     }
8613   c = DECODE_CHAR (charset, c);
8614   if (c < 0)
8615     error ("Invalid code: %d", code);
8616   return make_number (c);
8617 }
8618
8619
8620 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8621        doc: /* Encode a Japanese character CH to shift_jis encoding.
8622 Return the corresponding code in SJIS.  */)
8623      (ch)
8624     Lisp_Object ch;
8625 {
8626   Lisp_Object spec, attrs, charset_list;
8627   int c;
8628   struct charset *charset;
8629   unsigned code;
8630
8631   CHECK_CHARACTER (ch);
8632   c = XFASTINT (ch);
8633   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8634   attrs = AREF (spec, 0);
8635
8636   if (ASCII_CHAR_P (c)
8637       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8638     return ch;
8639
8640   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8641   charset = char_charset (c, charset_list, &code);
8642   if (code == CHARSET_INVALID_CODE (charset))
8643     error ("Can't encode by shift_jis encoding: %d", c);
8644   JIS_TO_SJIS (code);
8645
8646   return make_number (code);
8647 }
8648
8649 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8650        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8651 Return the corresponding character.  */)
8652      (code)
8653      Lisp_Object code;
8654 {
8655   Lisp_Object spec, attrs, val;
8656   struct charset *charset_roman, *charset_big5, *charset;
8657   int c;
8658
8659   CHECK_NATNUM (code);
8660   c = XFASTINT (code);
8661   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8662   attrs = AREF (spec, 0);
8663
8664   if (ASCII_BYTE_P (c)
8665       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8666     return code;
8667
8668   val = CODING_ATTR_CHARSET_LIST (attrs);
8669   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8670   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8671
8672   if (c <= 0x7F)
8673     charset = charset_roman;
8674   else
8675     {
8676       int b1 = c >> 8, b2 = c & 0x7F;
8677       if (b1 < 0xA1 || b1 > 0xFE
8678           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8679         error ("Invalid code: %d", code);
8680       charset = charset_big5;
8681     }
8682   c = DECODE_CHAR (charset, (unsigned )c);
8683   if (c < 0)
8684     error ("Invalid code: %d", code);
8685   return make_number (c);
8686 }
8687
8688 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8689        doc: /* Encode the Big5 character CH to BIG5 coding system.
8690 Return the corresponding character code in Big5.  */)
8691      (ch)
8692      Lisp_Object ch;
8693 {
8694   Lisp_Object spec, attrs, charset_list;
8695   struct charset *charset;
8696   int c;
8697   unsigned code;
8698
8699   CHECK_CHARACTER (ch);
8700   c = XFASTINT (ch);
8701   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8702   attrs = AREF (spec, 0);
8703   if (ASCII_CHAR_P (c)
8704       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8705     return ch;
8706
8707   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8708   charset = char_charset (c, charset_list, &code);
8709   if (code == CHARSET_INVALID_CODE (charset))
8710     error ("Can't encode by Big5 encoding: %d", c);
8711
8712   return make_number (code);
8713 }
8714
8715 \f
8716 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8717        Sset_terminal_coding_system_internal, 1, 2, 0,
8718        doc: /* Internal use only.  */)
8719      (coding_system, terminal)
8720      Lisp_Object coding_system;
8721      Lisp_Object terminal;
8722 {
8723   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8724   CHECK_SYMBOL (coding_system);
8725   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8726   /* We had better not send unsafe characters to terminal.  */
8727   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8728   /* Characer composition should be disabled.  */
8729   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8730   terminal_coding->src_multibyte = 1;
8731   terminal_coding->dst_multibyte = 0;
8732   return Qnil;
8733 }
8734
8735 DEFUN ("set-safe-terminal-coding-system-internal",
8736        Fset_safe_terminal_coding_system_internal,
8737        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8738        doc: /* Internal use only.  */)
8739      (coding_system)
8740      Lisp_Object coding_system;
8741 {
8742   CHECK_SYMBOL (coding_system);
8743   setup_coding_system (Fcheck_coding_system (coding_system),
8744                        &safe_terminal_coding);
8745   /* Characer composition should be disabled.  */
8746   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8747   safe_terminal_coding.src_multibyte = 1;
8748   safe_terminal_coding.dst_multibyte = 0;
8749   return Qnil;
8750 }
8751
8752 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8753        Sterminal_coding_system, 0, 1, 0,
8754        doc: /* Return coding system specified for terminal output on the given terminal.
8755 TERMINAL may be a terminal id, a frame, or nil for the selected
8756 frame's terminal device.  */)
8757      (terminal)
8758      Lisp_Object terminal;
8759 {
8760   struct coding_system *terminal_coding
8761     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8762   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8763
8764   /* For backward compatibility, return nil if it is `undecided'. */
8765   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8766 }
8767
8768 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8769        Sset_keyboard_coding_system_internal, 1, 2, 0,
8770        doc: /* Internal use only.  */)
8771      (coding_system, terminal)
8772      Lisp_Object coding_system;
8773      Lisp_Object terminal;
8774 {
8775   struct terminal *t = get_terminal (terminal, 1);
8776   CHECK_SYMBOL (coding_system);
8777   setup_coding_system (Fcheck_coding_system (coding_system),
8778                        TERMINAL_KEYBOARD_CODING (t));
8779   /* Characer composition should be disabled.  */
8780   TERMINAL_KEYBOARD_CODING (t)->common_flags
8781     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8782   return Qnil;
8783 }
8784
8785 DEFUN ("keyboard-coding-system",
8786        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8787        doc: /* Return coding system specified for decoding keyboard input.  */)
8788      (terminal)
8789      Lisp_Object terminal;
8790 {
8791   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8792                          (get_terminal (terminal, 1))->id);
8793 }
8794
8795 \f
8796 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8797        Sfind_operation_coding_system,  1, MANY, 0,
8798        doc: /* Choose a coding system for an operation based on the target name.
8799 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8800 DECODING-SYSTEM is the coding system to use for decoding
8801 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8802 for encoding (in case OPERATION does encoding).
8803
8804 The first argument OPERATION specifies an I/O primitive:
8805   For file I/O, `insert-file-contents' or `write-region'.
8806   For process I/O, `call-process', `call-process-region', or `start-process'.
8807   For network I/O, `open-network-stream'.
8808
8809 The remaining arguments should be the same arguments that were passed
8810 to the primitive.  Depending on which primitive, one of those arguments
8811 is selected as the TARGET.  For example, if OPERATION does file I/O,
8812 whichever argument specifies the file name is TARGET.
8813
8814 TARGET has a meaning which depends on OPERATION:
8815   For file I/O, TARGET is a file name (except for the special case below).
8816   For process I/O, TARGET is a process name.
8817   For network I/O, TARGET is a service name or a port number.
8818
8819 This function looks up what is specified for TARGET in
8820 `file-coding-system-alist', `process-coding-system-alist',
8821 or `network-coding-system-alist' depending on OPERATION.
8822 They may specify a coding system, a cons of coding systems,
8823 or a function symbol to call.
8824 In the last case, we call the function with one argument,
8825 which is a list of all the arguments given to this function.
8826 If the function can't decide a coding system, it can return
8827 `undecided' so that the normal code-detection is performed.
8828
8829 If OPERATION is `insert-file-contents', the argument corresponding to
8830 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
8831 file name to look up, and BUFFER is a buffer that contains the file's
8832 contents (not yet decoded).  If `file-coding-system-alist' specifies a
8833 function to call for FILENAME, that function should examine the
8834 contents of BUFFER instead of reading the file.
8835
8836 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
8837      (nargs, args)
8838      int nargs;
8839      Lisp_Object *args;
8840 {
8841   Lisp_Object operation, target_idx, target, val;
8842   register Lisp_Object chain;
8843
8844   if (nargs < 2)
8845     error ("Too few arguments");
8846   operation = args[0];
8847   if (!SYMBOLP (operation)
8848       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8849     error ("Invalid first argument");
8850   if (nargs < 1 + XINT (target_idx))
8851     error ("Too few arguments for operation: %s",
8852            SDATA (SYMBOL_NAME (operation)));
8853   target = args[XINT (target_idx) + 1];
8854   if (!(STRINGP (target)
8855         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8856             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8857         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8858     error ("Invalid %dth argument", XINT (target_idx) + 1);
8859   if (CONSP (target))
8860     target = XCAR (target);
8861
8862   chain = ((EQ (operation, Qinsert_file_contents)
8863             || EQ (operation, Qwrite_region))
8864            ? Vfile_coding_system_alist
8865            : (EQ (operation, Qopen_network_stream)
8866               ? Vnetwork_coding_system_alist
8867               : Vprocess_coding_system_alist));
8868   if (NILP (chain))
8869     return Qnil;
8870
8871   for (; CONSP (chain); chain = XCDR (chain))
8872     {
8873       Lisp_Object elt;
8874
8875       elt = XCAR (chain);
8876       if (CONSP (elt)
8877           && ((STRINGP (target)
8878                && STRINGP (XCAR (elt))
8879                && fast_string_match (XCAR (elt), target) >= 0)
8880               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8881         {
8882           val = XCDR (elt);
8883           /* Here, if VAL is both a valid coding system and a valid
8884              function symbol, we return VAL as a coding system.  */
8885           if (CONSP (val))
8886             return val;
8887           if (! SYMBOLP (val))
8888             return Qnil;
8889           if (! NILP (Fcoding_system_p (val)))
8890             return Fcons (val, val);
8891           if (! NILP (Ffboundp (val)))
8892             {
8893               /* We use call1 rather than safe_call1
8894                  so as to get bug reports about functions called here
8895                  which don't handle the current interface.  */
8896               val = call1 (val, Flist (nargs, args));
8897               if (CONSP (val))
8898                 return val;
8899               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8900                 return Fcons (val, val);
8901             }
8902           return Qnil;
8903         }
8904     }
8905   return Qnil;
8906 }
8907
8908 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8909        Sset_coding_system_priority, 0, MANY, 0,
8910        doc: /* Assign higher priority to the coding systems given as arguments.
8911 If multiple coding systems belong to the same category,
8912 all but the first one are ignored.
8913
8914 usage: (set-coding-system-priority &rest coding-systems)  */)
8915      (nargs, args)
8916      int nargs;
8917      Lisp_Object *args;
8918 {
8919   int i, j;
8920   int changed[coding_category_max];
8921   enum coding_category priorities[coding_category_max];
8922
8923   bzero (changed, sizeof changed);
8924
8925   for (i = j = 0; i < nargs; i++)
8926     {
8927       enum coding_category category;
8928       Lisp_Object spec, attrs;
8929
8930       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8931       attrs = AREF (spec, 0);
8932       category = XINT (CODING_ATTR_CATEGORY (attrs));
8933       if (changed[category])
8934         /* Ignore this coding system because a coding system of the
8935            same category already had a higher priority.  */
8936         continue;
8937       changed[category] = 1;
8938       priorities[j++] = category;
8939       if (coding_categories[category].id >= 0
8940           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8941         setup_coding_system (args[i], &coding_categories[category]);
8942       Fset (AREF (Vcoding_category_table, category), args[i]);
8943     }
8944
8945   /* Now we have decided top J priorities.  Reflect the order of the
8946      original priorities to the remaining priorities.  */
8947
8948   for (i = j, j = 0; i < coding_category_max; i++, j++)
8949     {
8950       while (j < coding_category_max
8951              && changed[coding_priorities[j]])
8952         j++;
8953       if (j == coding_category_max)
8954         abort ();
8955       priorities[i] = coding_priorities[j];
8956     }
8957
8958   bcopy (priorities, coding_priorities, sizeof priorities);
8959
8960   /* Update `coding-category-list'.  */
8961   Vcoding_category_list = Qnil;
8962   for (i = coding_category_max - 1; i >= 0; i--)
8963     Vcoding_category_list
8964       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8965                Vcoding_category_list);
8966
8967   return Qnil;
8968 }
8969
8970 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8971        Scoding_system_priority_list, 0, 1, 0,
8972        doc: /* Return a list of coding systems ordered by their priorities.
8973 HIGHESTP non-nil means just return the highest priority one.  */)
8974      (highestp)
8975      Lisp_Object highestp;
8976 {
8977   int i;
8978   Lisp_Object val;
8979
8980   for (i = 0, val = Qnil; i < coding_category_max; i++)
8981     {
8982       enum coding_category category = coding_priorities[i];
8983       int id = coding_categories[category].id;
8984       Lisp_Object attrs;
8985
8986       if (id < 0)
8987         continue;
8988       attrs = CODING_ID_ATTRS (id);
8989       if (! NILP (highestp))
8990         return CODING_ATTR_BASE_NAME (attrs);
8991       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8992     }
8993   return Fnreverse (val);
8994 }
8995
8996 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8997
8998 static Lisp_Object
8999 make_subsidiaries (base)
9000      Lisp_Object base;
9001 {
9002   Lisp_Object subsidiaries;
9003   int base_name_len = SBYTES (SYMBOL_NAME (base));
9004   char *buf = (char *) alloca (base_name_len + 6);
9005   int i;
9006
9007   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9008   subsidiaries = Fmake_vector (make_number (3), Qnil);
9009   for (i = 0; i < 3; i++)
9010     {
9011       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9012       ASET (subsidiaries, i, intern (buf));
9013     }
9014   return subsidiaries;
9015 }
9016
9017
9018 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9019        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9020        doc: /* For internal use only.
9021 usage: (define-coding-system-internal ...)  */)
9022      (nargs, args)
9023      int nargs;
9024      Lisp_Object *args;
9025 {
9026   Lisp_Object name;
9027   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9028   Lisp_Object attrs;            /* Vector of attributes.  */
9029   Lisp_Object eol_type;
9030   Lisp_Object aliases;
9031   Lisp_Object coding_type, charset_list, safe_charsets;
9032   enum coding_category category;
9033   Lisp_Object tail, val;
9034   int max_charset_id = 0;
9035   int i;
9036
9037   if (nargs < coding_arg_max)
9038     goto short_args;
9039
9040   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9041
9042   name = args[coding_arg_name];
9043   CHECK_SYMBOL (name);
9044   CODING_ATTR_BASE_NAME (attrs) = name;
9045
9046   val = args[coding_arg_mnemonic];
9047   if (! STRINGP (val))
9048     CHECK_CHARACTER (val);
9049   CODING_ATTR_MNEMONIC (attrs) = val;
9050
9051   coding_type = args[coding_arg_coding_type];
9052   CHECK_SYMBOL (coding_type);
9053   CODING_ATTR_TYPE (attrs) = coding_type;
9054
9055   charset_list = args[coding_arg_charset_list];
9056   if (SYMBOLP (charset_list))
9057     {
9058       if (EQ (charset_list, Qiso_2022))
9059         {
9060           if (! EQ (coding_type, Qiso_2022))
9061             error ("Invalid charset-list");
9062           charset_list = Viso_2022_charset_list;
9063         }
9064       else if (EQ (charset_list, Qemacs_mule))
9065         {
9066           if (! EQ (coding_type, Qemacs_mule))
9067             error ("Invalid charset-list");
9068           charset_list = Vemacs_mule_charset_list;
9069         }
9070       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9071         if (max_charset_id < XFASTINT (XCAR (tail)))
9072           max_charset_id = XFASTINT (XCAR (tail));
9073     }
9074   else
9075     {
9076       charset_list = Fcopy_sequence (charset_list);
9077       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9078         {
9079           struct charset *charset;
9080
9081           val = XCAR (tail);
9082           CHECK_CHARSET_GET_CHARSET (val, charset);
9083           if (EQ (coding_type, Qiso_2022)
9084               ? CHARSET_ISO_FINAL (charset) < 0
9085               : EQ (coding_type, Qemacs_mule)
9086               ? CHARSET_EMACS_MULE_ID (charset) < 0
9087               : 0)
9088             error ("Can't handle charset `%s'",
9089                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9090
9091           XSETCAR (tail, make_number (charset->id));
9092           if (max_charset_id < charset->id)
9093             max_charset_id = charset->id;
9094         }
9095     }
9096   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9097
9098   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9099                                 make_number (255));
9100   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9101     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9102   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9103
9104   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9105
9106   val = args[coding_arg_decode_translation_table];
9107   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9108     CHECK_SYMBOL (val);
9109   CODING_ATTR_DECODE_TBL (attrs) = val;
9110
9111   val = args[coding_arg_encode_translation_table];
9112   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9113     CHECK_SYMBOL (val);
9114   CODING_ATTR_ENCODE_TBL (attrs) = val;
9115
9116   val = args[coding_arg_post_read_conversion];
9117   CHECK_SYMBOL (val);
9118   CODING_ATTR_POST_READ (attrs) = val;
9119
9120   val = args[coding_arg_pre_write_conversion];
9121   CHECK_SYMBOL (val);
9122   CODING_ATTR_PRE_WRITE (attrs) = val;
9123
9124   val = args[coding_arg_default_char];
9125   if (NILP (val))
9126     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9127   else
9128     {
9129       CHECK_CHARACTER (val);
9130       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9131     }
9132
9133   val = args[coding_arg_for_unibyte];
9134   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9135
9136   val = args[coding_arg_plist];
9137   CHECK_LIST (val);
9138   CODING_ATTR_PLIST (attrs) = val;
9139
9140   if (EQ (coding_type, Qcharset))
9141     {
9142       /* Generate a lisp vector of 256 elements.  Each element is nil,
9143          integer, or a list of charset IDs.
9144
9145          If Nth element is nil, the byte code N is invalid in this
9146          coding system.
9147
9148          If Nth element is a number NUM, N is the first byte of a
9149          charset whose ID is NUM.
9150
9151          If Nth element is a list of charset IDs, N is the first byte
9152          of one of them.  The list is sorted by dimensions of the
9153          charsets.  A charset of smaller dimension comes firtst. */
9154       val = Fmake_vector (make_number (256), Qnil);
9155
9156       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9157         {
9158           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9159           int dim = CHARSET_DIMENSION (charset);
9160           int idx = (dim - 1) * 4;
9161
9162           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9163             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9164
9165           for (i = charset->code_space[idx];
9166                i <= charset->code_space[idx + 1]; i++)
9167             {
9168               Lisp_Object tmp, tmp2;
9169               int dim2;
9170
9171               tmp = AREF (val, i);
9172               if (NILP (tmp))
9173                 tmp = XCAR (tail);
9174               else if (NUMBERP (tmp))
9175                 {
9176                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9177                   if (dim < dim2)
9178                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9179                   else
9180                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9181                 }
9182               else
9183                 {
9184                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9185                     {
9186                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9187                       if (dim < dim2)
9188                         break;
9189                     }
9190                   if (NILP (tmp2))
9191                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9192                   else
9193                     {
9194                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9195                       XSETCAR (tmp2, XCAR (tail));
9196                     }
9197                 }
9198               ASET (val, i, tmp);
9199             }
9200         }
9201       ASET (attrs, coding_attr_charset_valids, val);
9202       category = coding_category_charset;
9203     }
9204   else if (EQ (coding_type, Qccl))
9205     {
9206       Lisp_Object valids;
9207
9208       if (nargs < coding_arg_ccl_max)
9209         goto short_args;
9210
9211       val = args[coding_arg_ccl_decoder];
9212       CHECK_CCL_PROGRAM (val);
9213       if (VECTORP (val))
9214         val = Fcopy_sequence (val);
9215       ASET (attrs, coding_attr_ccl_decoder, val);
9216
9217       val = args[coding_arg_ccl_encoder];
9218       CHECK_CCL_PROGRAM (val);
9219       if (VECTORP (val))
9220         val = Fcopy_sequence (val);
9221       ASET (attrs, coding_attr_ccl_encoder, val);
9222
9223       val = args[coding_arg_ccl_valids];
9224       valids = Fmake_string (make_number (256), make_number (0));
9225       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9226         {
9227           int from, to;
9228
9229           val = Fcar (tail);
9230           if (INTEGERP (val))
9231             {
9232               from = to = XINT (val);
9233               if (from < 0 || from > 255)
9234                 args_out_of_range_3 (val, make_number (0), make_number (255));
9235             }
9236           else
9237             {
9238               CHECK_CONS (val);
9239               CHECK_NATNUM_CAR (val);
9240               CHECK_NATNUM_CDR (val);
9241               from = XINT (XCAR (val));
9242               if (from > 255)
9243                 args_out_of_range_3 (XCAR (val),
9244                                      make_number (0), make_number (255));
9245               to = XINT (XCDR (val));
9246               if (to < from || to > 255)
9247                 args_out_of_range_3 (XCDR (val),
9248                                      XCAR (val), make_number (255));
9249             }
9250           for (i = from; i <= to; i++)
9251             SSET (valids, i, 1);
9252         }
9253       ASET (attrs, coding_attr_ccl_valids, valids);
9254
9255       category = coding_category_ccl;
9256     }
9257   else if (EQ (coding_type, Qutf_16))
9258     {
9259       Lisp_Object bom, endian;
9260
9261       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9262
9263       if (nargs < coding_arg_utf16_max)
9264         goto short_args;
9265
9266       bom = args[coding_arg_utf16_bom];
9267       if (! NILP (bom) && ! EQ (bom, Qt))
9268         {
9269           CHECK_CONS (bom);
9270           val = XCAR (bom);
9271           CHECK_CODING_SYSTEM (val);
9272           val = XCDR (bom);
9273           CHECK_CODING_SYSTEM (val);
9274         }
9275       ASET (attrs, coding_attr_utf_bom, bom);
9276
9277       endian = args[coding_arg_utf16_endian];
9278       CHECK_SYMBOL (endian);
9279       if (NILP (endian))
9280         endian = Qbig;
9281       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9282         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9283       ASET (attrs, coding_attr_utf_16_endian, endian);
9284
9285       category = (CONSP (bom)
9286                   ? coding_category_utf_16_auto
9287                   : NILP (bom)
9288                   ? (EQ (endian, Qbig)
9289                      ? coding_category_utf_16_be_nosig
9290                      : coding_category_utf_16_le_nosig)
9291                   : (EQ (endian, Qbig)
9292                      ? coding_category_utf_16_be
9293                      : coding_category_utf_16_le));
9294     }
9295   else if (EQ (coding_type, Qiso_2022))
9296     {
9297       Lisp_Object initial, reg_usage, request, flags;
9298       int i;
9299
9300       if (nargs < coding_arg_iso2022_max)
9301         goto short_args;
9302
9303       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9304       CHECK_VECTOR (initial);
9305       for (i = 0; i < 4; i++)
9306         {
9307           val = Faref (initial, make_number (i));
9308           if (! NILP (val))
9309             {
9310               struct charset *charset;
9311
9312               CHECK_CHARSET_GET_CHARSET (val, charset);
9313               ASET (initial, i, make_number (CHARSET_ID (charset)));
9314               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9315                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9316             }
9317           else
9318             ASET (initial, i, make_number (-1));
9319         }
9320
9321       reg_usage = args[coding_arg_iso2022_reg_usage];
9322       CHECK_CONS (reg_usage);
9323       CHECK_NUMBER_CAR (reg_usage);
9324       CHECK_NUMBER_CDR (reg_usage);
9325
9326       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9327       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9328         {
9329           int id;
9330           Lisp_Object tmp;
9331
9332           val = Fcar (tail);
9333           CHECK_CONS (val);
9334           tmp = XCAR (val);
9335           CHECK_CHARSET_GET_ID (tmp, id);
9336           CHECK_NATNUM_CDR (val);
9337           if (XINT (XCDR (val)) >= 4)
9338             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9339           XSETCAR (val, make_number (id));
9340         }
9341
9342       flags = args[coding_arg_iso2022_flags];
9343       CHECK_NATNUM (flags);
9344       i = XINT (flags);
9345       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9346         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9347
9348       ASET (attrs, coding_attr_iso_initial, initial);
9349       ASET (attrs, coding_attr_iso_usage, reg_usage);
9350       ASET (attrs, coding_attr_iso_request, request);
9351       ASET (attrs, coding_attr_iso_flags, flags);
9352       setup_iso_safe_charsets (attrs);
9353
9354       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9355         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9356                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9357                     ? coding_category_iso_7_else
9358                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9359                     ? coding_category_iso_7
9360                     : coding_category_iso_7_tight);
9361       else
9362         {
9363           int id = XINT (AREF (initial, 1));
9364
9365           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9366                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9367                        || id < 0)
9368                       ? coding_category_iso_8_else
9369                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9370                       ? coding_category_iso_8_1
9371                       : coding_category_iso_8_2);
9372         }
9373       if (category != coding_category_iso_8_1
9374           && category != coding_category_iso_8_2)
9375         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9376     }
9377   else if (EQ (coding_type, Qemacs_mule))
9378     {
9379       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9380         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9381       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9382       category = coding_category_emacs_mule;
9383     }
9384   else if (EQ (coding_type, Qshift_jis))
9385     {
9386
9387       struct charset *charset;
9388
9389       if (XINT (Flength (charset_list)) != 3
9390           && XINT (Flength (charset_list)) != 4)
9391         error ("There should be three or four charsets");
9392
9393       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9394       if (CHARSET_DIMENSION (charset) != 1)
9395         error ("Dimension of charset %s is not one",
9396                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9397       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9398         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9399
9400       charset_list = XCDR (charset_list);
9401       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9402       if (CHARSET_DIMENSION (charset) != 1)
9403         error ("Dimension of charset %s is not one",
9404                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9405
9406       charset_list = XCDR (charset_list);
9407       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9408       if (CHARSET_DIMENSION (charset) != 2)
9409         error ("Dimension of charset %s is not two",
9410                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9411
9412       charset_list = XCDR (charset_list);
9413       if (! NILP (charset_list))
9414         {
9415           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9416           if (CHARSET_DIMENSION (charset) != 2)
9417             error ("Dimension of charset %s is not two",
9418                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9419         }
9420
9421       category = coding_category_sjis;
9422       Vsjis_coding_system = name;
9423     }
9424   else if (EQ (coding_type, Qbig5))
9425     {
9426       struct charset *charset;
9427
9428       if (XINT (Flength (charset_list)) != 2)
9429         error ("There should be just two charsets");
9430
9431       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9432       if (CHARSET_DIMENSION (charset) != 1)
9433         error ("Dimension of charset %s is not one",
9434                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9435       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9436         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9437
9438       charset_list = XCDR (charset_list);
9439       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9440       if (CHARSET_DIMENSION (charset) != 2)
9441         error ("Dimension of charset %s is not two",
9442                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9443
9444       category = coding_category_big5;
9445       Vbig5_coding_system = name;
9446     }
9447   else if (EQ (coding_type, Qraw_text))
9448     {
9449       category = coding_category_raw_text;
9450       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9451     }
9452   else if (EQ (coding_type, Qutf_8))
9453     {
9454       Lisp_Object bom;
9455
9456       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9457
9458       if (nargs < coding_arg_utf8_max)
9459         goto short_args;
9460
9461       bom = args[coding_arg_utf8_bom];
9462       if (! NILP (bom) && ! EQ (bom, Qt))
9463         {
9464           CHECK_CONS (bom);
9465           val = XCAR (bom);
9466           CHECK_CODING_SYSTEM (val);
9467           val = XCDR (bom);
9468           CHECK_CODING_SYSTEM (val);
9469         }
9470       ASET (attrs, coding_attr_utf_bom, bom);
9471
9472       category = (CONSP (bom) ? coding_category_utf_8_auto
9473                   : NILP (bom) ? coding_category_utf_8_nosig
9474                   : coding_category_utf_8_sig);
9475     }
9476   else if (EQ (coding_type, Qundecided))
9477     category = coding_category_undecided;
9478   else
9479     error ("Invalid coding system type: %s",
9480            SDATA (SYMBOL_NAME (coding_type)));
9481
9482   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9483   CODING_ATTR_PLIST (attrs)
9484     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9485                                 CODING_ATTR_PLIST (attrs)));
9486   CODING_ATTR_PLIST (attrs)
9487     = Fcons (QCascii_compatible_p,
9488              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9489                     CODING_ATTR_PLIST (attrs)));
9490
9491   eol_type = args[coding_arg_eol_type];
9492   if (! NILP (eol_type)
9493       && ! EQ (eol_type, Qunix)
9494       && ! EQ (eol_type, Qdos)
9495       && ! EQ (eol_type, Qmac))
9496     error ("Invalid eol-type");
9497
9498   aliases = Fcons (name, Qnil);
9499
9500   if (NILP (eol_type))
9501     {
9502       eol_type = make_subsidiaries (name);
9503       for (i = 0; i < 3; i++)
9504         {
9505           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9506
9507           this_name = AREF (eol_type, i);
9508           this_aliases = Fcons (this_name, Qnil);
9509           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9510           this_spec = Fmake_vector (make_number (3), attrs);
9511           ASET (this_spec, 1, this_aliases);
9512           ASET (this_spec, 2, this_eol_type);
9513           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9514           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9515           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9516           if (NILP (val))
9517             Vcoding_system_alist
9518               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9519                        Vcoding_system_alist);
9520         }
9521     }
9522
9523   spec_vec = Fmake_vector (make_number (3), attrs);
9524   ASET (spec_vec, 1, aliases);
9525   ASET (spec_vec, 2, eol_type);
9526
9527   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9528   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9529   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9530   if (NILP (val))
9531     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9532                                   Vcoding_system_alist);
9533
9534   {
9535     int id = coding_categories[category].id;
9536
9537     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9538       setup_coding_system (name, &coding_categories[category]);
9539   }
9540
9541   return Qnil;
9542
9543  short_args:
9544   return Fsignal (Qwrong_number_of_arguments,
9545                   Fcons (intern ("define-coding-system-internal"),
9546                          make_number (nargs)));
9547 }
9548
9549
9550 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9551        3, 3, 0,
9552        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9553   (coding_system, prop, val)
9554      Lisp_Object coding_system, prop, val;
9555 {
9556   Lisp_Object spec, attrs;
9557
9558   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9559   attrs = AREF (spec, 0);
9560   if (EQ (prop, QCmnemonic))
9561     {
9562       if (! STRINGP (val))
9563         CHECK_CHARACTER (val);
9564       CODING_ATTR_MNEMONIC (attrs) = val;
9565     }
9566   else if (EQ (prop, QCdefalut_char))
9567     {
9568       if (NILP (val))
9569         val = make_number (' ');
9570       else
9571         CHECK_CHARACTER (val);
9572       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9573     }
9574   else if (EQ (prop, QCdecode_translation_table))
9575     {
9576       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9577         CHECK_SYMBOL (val);
9578       CODING_ATTR_DECODE_TBL (attrs) = val;
9579     }
9580   else if (EQ (prop, QCencode_translation_table))
9581     {
9582       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9583         CHECK_SYMBOL (val);
9584       CODING_ATTR_ENCODE_TBL (attrs) = val;
9585     }
9586   else if (EQ (prop, QCpost_read_conversion))
9587     {
9588       CHECK_SYMBOL (val);
9589       CODING_ATTR_POST_READ (attrs) = val;
9590     }
9591   else if (EQ (prop, QCpre_write_conversion))
9592     {
9593       CHECK_SYMBOL (val);
9594       CODING_ATTR_PRE_WRITE (attrs) = val;
9595     }
9596   else if (EQ (prop, QCascii_compatible_p))
9597     {
9598       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9599     }
9600
9601   CODING_ATTR_PLIST (attrs)
9602     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9603   return val;
9604 }
9605
9606
9607 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9608        Sdefine_coding_system_alias, 2, 2, 0,
9609        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9610      (alias, coding_system)
9611      Lisp_Object alias, coding_system;
9612 {
9613   Lisp_Object spec, aliases, eol_type, val;
9614
9615   CHECK_SYMBOL (alias);
9616   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9617   aliases = AREF (spec, 1);
9618   /* ALIASES should be a list of length more than zero, and the first
9619      element is a base coding system.  Append ALIAS at the tail of the
9620      list.  */
9621   while (!NILP (XCDR (aliases)))
9622     aliases = XCDR (aliases);
9623   XSETCDR (aliases, Fcons (alias, Qnil));
9624
9625   eol_type = AREF (spec, 2);
9626   if (VECTORP (eol_type))
9627     {
9628       Lisp_Object subsidiaries;
9629       int i;
9630
9631       subsidiaries = make_subsidiaries (alias);
9632       for (i = 0; i < 3; i++)
9633         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9634                                      AREF (eol_type, i));
9635     }
9636
9637   Fputhash (alias, spec, Vcoding_system_hash_table);
9638   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9639   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9640   if (NILP (val))
9641     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9642                                   Vcoding_system_alist);
9643
9644   return Qnil;
9645 }
9646
9647 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9648        1, 1, 0,
9649        doc: /* Return the base of CODING-SYSTEM.
9650 Any alias or subsidiary coding system is not a base coding system.  */)
9651   (coding_system)
9652      Lisp_Object coding_system;
9653 {
9654   Lisp_Object spec, attrs;
9655
9656   if (NILP (coding_system))
9657     return (Qno_conversion);
9658   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9659   attrs = AREF (spec, 0);
9660   return CODING_ATTR_BASE_NAME (attrs);
9661 }
9662
9663 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9664        1, 1, 0,
9665        doc: "Return the property list of CODING-SYSTEM.")
9666      (coding_system)
9667      Lisp_Object coding_system;
9668 {
9669   Lisp_Object spec, attrs;
9670
9671   if (NILP (coding_system))
9672     coding_system = Qno_conversion;
9673   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9674   attrs = AREF (spec, 0);
9675   return CODING_ATTR_PLIST (attrs);
9676 }
9677
9678
9679 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9680        1, 1, 0,
9681        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9682      (coding_system)
9683      Lisp_Object coding_system;
9684 {
9685   Lisp_Object spec;
9686
9687   if (NILP (coding_system))
9688     coding_system = Qno_conversion;
9689   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9690   return AREF (spec, 1);
9691 }
9692
9693 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9694        Scoding_system_eol_type, 1, 1, 0,
9695        doc: /* Return eol-type of CODING-SYSTEM.
9696 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
9697
9698 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9699 and CR respectively.
9700
9701 A vector value indicates that a format of end-of-line should be
9702 detected automatically.  Nth element of the vector is the subsidiary
9703 coding system whose eol-type is N.  */)
9704      (coding_system)
9705      Lisp_Object coding_system;
9706 {
9707   Lisp_Object spec, eol_type;
9708   int n;
9709
9710   if (NILP (coding_system))
9711     coding_system = Qno_conversion;
9712   if (! CODING_SYSTEM_P (coding_system))
9713     return Qnil;
9714   spec = CODING_SYSTEM_SPEC (coding_system);
9715   eol_type = AREF (spec, 2);
9716   if (VECTORP (eol_type))
9717     return Fcopy_sequence (eol_type);
9718   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9719   return make_number (n);
9720 }
9721
9722 #endif /* emacs */
9723
9724 \f
/*** 12. Postamble ***/
9726
9727 void
9728 init_coding_once ()
9729 {
9730   int i;
9731
9732   for (i = 0; i < coding_category_max; i++)
9733     {
9734       coding_categories[i].id = -1;
9735       coding_priorities[i] = i;
9736     }
9737
9738   /* ISO2022 specific initialize routine.  */
9739   for (i = 0; i < 0x20; i++)
9740     iso_code_class[i] = ISO_control_0;
9741   for (i = 0x21; i < 0x7F; i++)
9742     iso_code_class[i] = ISO_graphic_plane_0;
9743   for (i = 0x80; i < 0xA0; i++)
9744     iso_code_class[i] = ISO_control_1;
9745   for (i = 0xA1; i < 0xFF; i++)
9746     iso_code_class[i] = ISO_graphic_plane_1;
9747   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9748   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9749   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9750   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9751   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9752   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9753   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9754   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9755   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9756
9757   for (i = 0; i < 256; i++)
9758     {
9759       emacs_mule_bytes[i] = 1;
9760     }
9761   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9762   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9763   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9764   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9765 }
9766
9767 #ifdef emacs
9768
/* Set up the Lisp-visible state of the coding system module:
   staticpro the internal objects, define the symbols used in this
   file, register the primitives with defsubr, define the Lisp
   variables together with their initial values, and finally define
   the two built-in coding systems `no-conversion' and `undecided'.  */
void
syms_of_coding ()
{
  /* The table that maps a coding system name to its spec; keys are
     compared with `eq'.  */
  staticpro (&Vcoding_system_hash_table);
  {
    Lisp_Object args[2];
    args[0] = QCtest;
    args[1] = Qeq;
    Vcoding_system_hash_table = Fmake_hash_table (2, args);
  }

  staticpro (&Vsjis_coding_system);
  Vsjis_coding_system = Qnil;

  staticpro (&Vbig5_coding_system);
  Vbig5_coding_system = Qnil;

  staticpro (&Vcode_conversion_reused_workbuf);
  Vcode_conversion_reused_workbuf = Qnil;

  staticpro (&Vcode_conversion_workbuf_name);
  Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");

  reused_workbuf_in_use = 0;

  DEFSYM (Qcharset, "charset");
  DEFSYM (Qtarget_idx, "target-idx");
  DEFSYM (Qcoding_system_history, "coding-system-history");
  Fset (Qcoding_system_history, Qnil);

  /* The `target-idx' property of each operation symbol below is the
     zero-based index of the argument named in the comment.  */

  /* Target FILENAME is the first argument.  */
  Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
  /* Target FILENAME is the third argument.  */
  Fput (Qwrite_region, Qtarget_idx, make_number (2));

  DEFSYM (Qcall_process, "call-process");
  /* Target PROGRAM is the first argument.  */
  Fput (Qcall_process, Qtarget_idx, make_number (0));

  DEFSYM (Qcall_process_region, "call-process-region");
  /* Target PROGRAM is the third argument.  */
  Fput (Qcall_process_region, Qtarget_idx, make_number (2));

  DEFSYM (Qstart_process, "start-process");
  /* Target PROGRAM is the third argument.  */
  Fput (Qstart_process, Qtarget_idx, make_number (2));

  DEFSYM (Qopen_network_stream, "open-network-stream");
  /* Target SERVICE is the fourth argument.  */
  Fput (Qopen_network_stream, Qtarget_idx, make_number (3));

  DEFSYM (Qcoding_system, "coding-system");
  DEFSYM (Qcoding_aliases, "coding-aliases");

  DEFSYM (Qeol_type, "eol-type");
  DEFSYM (Qunix, "unix");
  DEFSYM (Qdos, "dos");

  DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
  DEFSYM (Qpost_read_conversion, "post-read-conversion");
  DEFSYM (Qpre_write_conversion, "pre-write-conversion");
  DEFSYM (Qdefault_char, "default-char");
  DEFSYM (Qundecided, "undecided");
  DEFSYM (Qno_conversion, "no-conversion");
  DEFSYM (Qraw_text, "raw-text");

  DEFSYM (Qiso_2022, "iso-2022");

  DEFSYM (Qutf_8, "utf-8");
  DEFSYM (Qutf_8_emacs, "utf-8-emacs");

  DEFSYM (Qutf_16, "utf-16");
  DEFSYM (Qbig, "big");
  DEFSYM (Qlittle, "little");

  DEFSYM (Qshift_jis, "shift-jis");
  DEFSYM (Qbig5, "big5");

  DEFSYM (Qcoding_system_p, "coding-system-p");

  /* Set up `coding-system-error' as an error condition with its
     message.  */
  DEFSYM (Qcoding_system_error, "coding-system-error");
  Fput (Qcoding_system_error, Qerror_conditions,
        Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
  Fput (Qcoding_system_error, Qerror_message,
        build_string ("Invalid coding system"));

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");

  DEFSYM (Qtranslation_table, "translation-table");
  Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
  DEFSYM (Qtranslation_table_id, "translation-table-id");
  DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
  DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");

  DEFSYM (Qvalid_codes, "valid-codes");

  DEFSYM (Qemacs_mule, "emacs-mule");

  DEFSYM (QCcategory, ":category");
  DEFSYM (QCmnemonic, ":mnemonic");
  DEFSYM (QCdefalut_char, ":default-char");
  DEFSYM (QCdecode_translation_table, ":decode-translation-table");
  DEFSYM (QCencode_translation_table, ":encode-translation-table");
  DEFSYM (QCpost_read_conversion, ":post-read-conversion");
  DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
  DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");

  /* Vcoding_category_table holds, at each category index, the symbol
     naming that coding category.  */
  Vcoding_category_table
    = Fmake_vector (make_number (coding_category_max), Qnil);
  staticpro (&Vcoding_category_table);
  /* The following categories are targets of code detection.  */
  ASET (Vcoding_category_table, coding_category_iso_7,
        intern ("coding-category-iso-7"));
  ASET (Vcoding_category_table, coding_category_iso_7_tight,
        intern ("coding-category-iso-7-tight"));
  ASET (Vcoding_category_table, coding_category_iso_8_1,
        intern ("coding-category-iso-8-1"));
  ASET (Vcoding_category_table, coding_category_iso_8_2,
        intern ("coding-category-iso-8-2"));
  ASET (Vcoding_category_table, coding_category_iso_7_else,
        intern ("coding-category-iso-7-else"));
  ASET (Vcoding_category_table, coding_category_iso_8_else,
        intern ("coding-category-iso-8-else"));
  ASET (Vcoding_category_table, coding_category_utf_8_auto,
        intern ("coding-category-utf-8-auto"));
  ASET (Vcoding_category_table, coding_category_utf_8_nosig,
        intern ("coding-category-utf-8"));
  ASET (Vcoding_category_table, coding_category_utf_8_sig,
        intern ("coding-category-utf-8-sig"));
  ASET (Vcoding_category_table, coding_category_utf_16_be,
        intern ("coding-category-utf-16-be"));
  ASET (Vcoding_category_table, coding_category_utf_16_auto,
        intern ("coding-category-utf-16-auto"));
  ASET (Vcoding_category_table, coding_category_utf_16_le,
        intern ("coding-category-utf-16-le"));
  ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
        intern ("coding-category-utf-16-be-nosig"));
  ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
        intern ("coding-category-utf-16-le-nosig"));
  ASET (Vcoding_category_table, coding_category_charset,
        intern ("coding-category-charset"));
  ASET (Vcoding_category_table, coding_category_sjis,
        intern ("coding-category-sjis"));
  ASET (Vcoding_category_table, coding_category_big5,
        intern ("coding-category-big5"));
  ASET (Vcoding_category_table, coding_category_ccl,
        intern ("coding-category-ccl"));
  ASET (Vcoding_category_table, coding_category_emacs_mule,
        intern ("coding-category-emacs-mule"));
  /* The following categories are NOT targets of code detection.  */
  ASET (Vcoding_category_table, coding_category_raw_text,
        intern ("coding-category-raw-text"));
  ASET (Vcoding_category_table, coding_category_undecided,
        intern ("coding-category-undecided"));

  /* Symbols listed in the docstring of `last-code-conversion-error'
     below as its possible values.  */
  DEFSYM (Qinsufficient_source, "insufficient-source");
  DEFSYM (Qinconsistent_eol, "inconsistent-eol");
  DEFSYM (Qinvalid_source, "invalid-source");
  DEFSYM (Qinterrupted, "interrupted");
  DEFSYM (Qinsufficient_memory, "insufficient-memory");
  DEFSYM (Qcoding_system_define_form, "coding-system-define-form");

  /* Register the primitives defined in this file.  */
  defsubr (&Scoding_system_p);
  defsubr (&Sread_coding_system);
  defsubr (&Sread_non_nil_coding_system);
  defsubr (&Scheck_coding_system);
  defsubr (&Sdetect_coding_region);
  defsubr (&Sdetect_coding_string);
  defsubr (&Sfind_coding_systems_region_internal);
  defsubr (&Sunencodable_char_position);
  defsubr (&Scheck_coding_systems_region);
  defsubr (&Sdecode_coding_region);
  defsubr (&Sencode_coding_region);
  defsubr (&Sdecode_coding_string);
  defsubr (&Sencode_coding_string);
  defsubr (&Sdecode_sjis_char);
  defsubr (&Sencode_sjis_char);
  defsubr (&Sdecode_big5_char);
  defsubr (&Sencode_big5_char);
  defsubr (&Sset_terminal_coding_system_internal);
  defsubr (&Sset_safe_terminal_coding_system_internal);
  defsubr (&Sterminal_coding_system);
  defsubr (&Sset_keyboard_coding_system_internal);
  defsubr (&Skeyboard_coding_system);
  defsubr (&Sfind_operation_coding_system);
  defsubr (&Sset_coding_system_priority);
  defsubr (&Sdefine_coding_system_internal);
  defsubr (&Sdefine_coding_system_alias);
  defsubr (&Scoding_system_put);
  defsubr (&Scoding_system_base);
  defsubr (&Scoding_system_plist);
  defsubr (&Scoding_system_aliases);
  defsubr (&Scoding_system_eol_type);
  defsubr (&Scoding_system_priority_list);

  DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
               doc: /* List of coding systems.

Do not alter the value of this variable manually.  This variable should be
updated by the functions `define-coding-system' and
`define-coding-system-alias'.  */);
  Vcoding_system_list = Qnil;

  DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
               doc: /* Alist of coding system names.
Each element is one element list of coding system name.
This variable is given to `completing-read' as COLLECTION argument.

Do not alter the value of this variable manually.  This variable should be
updated by the functions `define-coding-system' and
`define-coding-system-alias'.  */);
  Vcoding_system_alist = Qnil;

  DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
               doc: /* List of coding-categories (symbols) ordered by priority.

On detecting a coding system, Emacs tries code detection algorithms
associated with each coding-category one by one in this order.  When
one algorithm agrees with a byte sequence of source text, the coding
system bound to the corresponding coding-category is selected.

Don't modify this variable directly, but use `set-coding-system-priority'.  */);
  {
    int i;

    /* Cons from the last category backwards so that the resulting
       list is in category-index order.  */
    Vcoding_category_list = Qnil;
    for (i = coding_category_max - 1; i >= 0; i--)
      Vcoding_category_list
        = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
                 Vcoding_category_list);
  }

  DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
               doc: /* Specify the coding system for read operations.
It is useful to bind this variable with `let', but do not set it globally.
If the value is a coding system, it is used for decoding on read operation.
If not, an appropriate element is used from one of the coding system alists.
There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'.  */);
  Vcoding_system_for_read = Qnil;

  DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
               doc: /* Specify the coding system for write operations.
Programs bind this variable with `let', but you should not set it globally.
If the value is a coding system, it is used for encoding of output,
when writing it to a file and when sending it to a file or subprocess.

If this does not specify a coding system, an appropriate element
is used from one of the coding system alists.
There are three such tables: `file-coding-system-alist',
`process-coding-system-alist', and `network-coding-system-alist'.
For output to files, if the above procedure does not specify a coding system,
the value of `buffer-file-coding-system' is used.  */);
  Vcoding_system_for_write = Qnil;

  DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
               doc: /*
Coding system used in the latest file or process I/O.  */);
  Vlast_coding_system_used = Qnil;

  DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
               doc: /*
Error status of the last code conversion.

When an error was detected in the last code conversion, this variable
is set to one of the following symbols.
  `insufficient-source'
  `inconsistent-eol'
  `invalid-source'
  `interrupted'
  `insufficient-memory'
When no error was detected, the value doesn't change.  So, to check
the error status of a code conversion by this variable, you must
explicitly set this variable to nil before performing code
conversion.  */);
  Vlast_code_conversion_error = Qnil;

  DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
               doc: /*
*Non-nil means always inhibit code conversion of end-of-line format.
See info node `Coding Systems' and info node `Text and Binary' concerning
such conversion.  */);
  inhibit_eol_conversion = 0;

  DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
               doc: /*
Non-nil means process buffer inherits coding system of process output.
Bind it to t if the process output is to be treated as if it were a file
read from some filesystem.  */);
  inherit_process_coding_system = 0;

  DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
               doc: /*
Alist to decide a coding system to use for a file I/O operation.
The format is ((PATTERN . VAL) ...),
where PATTERN is a regular expression matching a file name,
VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding and encoding
the file contents.
If VAL is a cons of coding systems, the car part is used for decoding,
and the cdr part is used for encoding.
If VAL is a function symbol, the function must return a coding system
or a cons of coding systems which are used as above.  The function is
called with an argument that is a list of the arguments with which
`find-operation-coding-system' was called.  If the function can't decide
a coding system, it can return `undecided' so that the normal
code-detection is performed.

See also the function `find-operation-coding-system'
and the variable `auto-coding-alist'.  */);
  Vfile_coding_system_alist = Qnil;

  DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
               doc: /*
Alist to decide a coding system to use for a process I/O operation.
The format is ((PATTERN . VAL) ...),
where PATTERN is a regular expression matching a program name,
VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what received
from the program and encoding what sent to the program.
If VAL is a cons of coding systems, the car part is used for decoding,
and the cdr part is used for encoding.
If VAL is a function symbol, the function must return a coding system
or a cons of coding systems which are used as above.

See also the function `find-operation-coding-system'.  */);
  Vprocess_coding_system_alist = Qnil;

  DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
               doc: /*
Alist to decide a coding system to use for a network I/O operation.
The format is ((PATTERN . VAL) ...),
where PATTERN is a regular expression matching a network service name
or is a port number to connect to,
VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what received
from the network stream and encoding what sent to the network stream.
If VAL is a cons of coding systems, the car part is used for decoding,
and the cdr part is used for encoding.
If VAL is a function symbol, the function must return a coding system
or a cons of coding systems which are used as above.

See also the function `find-operation-coding-system'.  */);
  Vnetwork_coding_system_alist = Qnil;

  DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
               doc: /* Coding system to use with system messages.
Also used for decoding keyboard input on X Window system.  */);
  Vlocale_coding_system = Qnil;

  /* The eol mnemonics are reset in startup.el system-dependently.  */
  DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
               doc: /*
*String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
  eol_mnemonic_unix = build_string (":");

  DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
               doc: /*
*String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
  eol_mnemonic_dos = build_string ("\\");

  DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
               doc: /*
*String displayed in mode line for MAC-like (CR) end-of-line format.  */);
  eol_mnemonic_mac = build_string ("/");

  DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
               doc: /*
*String displayed in mode line when end-of-line format is not yet determined.  */);
  eol_mnemonic_undecided = build_string (":");

  DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
               doc: /*
*Non-nil enables character translation while encoding and decoding.  */);
  Venable_character_translation = Qt;

  DEFVAR_LISP ("standard-translation-table-for-decode",
               &Vstandard_translation_table_for_decode,
               doc: /* Table for translating characters while decoding.  */);
  Vstandard_translation_table_for_decode = Qnil;

  DEFVAR_LISP ("standard-translation-table-for-encode",
               &Vstandard_translation_table_for_encode,
               doc: /* Table for translating characters while encoding.  */);
  Vstandard_translation_table_for_encode = Qnil;

  DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
               doc: /* Alist of charsets vs revision numbers.
While encoding, if a charset (car part of an element) is found,
designate it with the escape sequence identifying revision (cdr part
of the element).  */);
  Vcharset_revision_table = Qnil;

  DEFVAR_LISP ("default-process-coding-system",
               &Vdefault_process_coding_system,
               doc: /* Cons of coding systems used for process I/O by default.
The car part is used for decoding a process output,
the cdr part is used for encoding a text to be sent to a process.  */);
  Vdefault_process_coding_system = Qnil;

  DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
               doc: /*
Table of extra Latin codes in the range 128..159 (inclusive).
This is a vector of length 256.
If Nth element is non-nil, the existence of code N in a file
\(or output of subprocess) doesn't prevent it to be detected as
a coding system of ISO 2022 variant which has a flag
`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
or reading output of a subprocess.
Only 128th through 159th elements have a meaning.  */);
  Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);

  DEFVAR_LISP ("select-safe-coding-system-function",
               &Vselect_safe_coding_system_function,
               doc: /*
Function to call to select safe coding system for encoding a text.

If set, this function is called to force a user to select a proper
coding system which can encode the text in the case that a default
coding system used in each operation can't encode the text.  The
function should take care that the buffer is not modified while
the coding system is being selected.

The default value is `select-safe-coding-system' (which see).  */);
  Vselect_safe_coding_system_function = Qnil;

  DEFVAR_BOOL ("coding-system-require-warning",
               &coding_system_require_warning,
               doc: /* Internal use only.
If non-nil, on writing a file, `select-safe-coding-system-function' is
called even if `coding-system-for-write' is non-nil.  The command
`universal-coding-system-argument' binds this variable to t temporarily.  */);
  coding_system_require_warning = 0;


  DEFVAR_BOOL ("inhibit-iso-escape-detection",
               &inhibit_iso_escape_detection,
               doc: /*
If non-nil, Emacs ignores ISO2022's escape sequence on code detection.

By default, on reading a file, Emacs tries to detect how the text is
encoded.  This code detection is sensitive to escape sequences.  If
the sequence is valid as ISO2022, the code is determined as one of
the ISO2022 encodings, and the file is decoded by the corresponding
coding system (e.g. `iso-2022-7bit').

However, there may be a case that you want to read escape sequences in
a file as is.  In such a case, you can set this variable to non-nil.
Then, as the code detection ignores any escape sequences, no file is
detected as encoded in some ISO2022 encoding.  The result is that all
escape sequences become visible in a buffer.

The default value is nil, and it is strongly recommended not to change
it.  That is because many Emacs Lisp source files that contain
non-ASCII characters are encoded by the coding system `iso-2022-7bit'
in Emacs's distribution, and they won't be decoded correctly on
reading if you suppress escape sequence detection.

The other way to read escape sequences in a file without decoding is
to explicitly specify some coding system that doesn't use ISO2022's
escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
  inhibit_iso_escape_detection = 0;

  DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
               doc: /* Char table for translating self-inserting characters.
This is applied to the result of input methods, not their input.
See also `keyboard-translate-table'.  */);
    Vtranslation_table_for_input = Qnil;

  /* Define the two built-in coding systems `no-conversion' and
     `undecided'.  ARGS is the argument vector handed to
     Fdefine_coding_system_internal and PLIST is the property list
     stored in its coding_arg_plist slot; both arrays are filled for
     `no-conversion' first and then reused for `undecided' with only
     the differing entries overwritten.  */
  {
    Lisp_Object args[coding_arg_max];
    Lisp_Object plist[16];
    int i;

    for (i = 0; i < coding_arg_max; i++)
      args[i] = Qnil;

    plist[0] = intern (":name");
    plist[1] = args[coding_arg_name] = Qno_conversion;
    plist[2] = intern (":mnemonic");
    plist[3] = args[coding_arg_mnemonic] = make_number ('=');
    plist[4] = intern (":coding-type");
    plist[5] = args[coding_arg_coding_type] = Qraw_text;
    plist[6] = intern (":ascii-compatible-p");
    plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
    plist[8] = intern (":default-char");
    plist[9] = args[coding_arg_default_char] = make_number (0);
    plist[10] = intern (":for-unibyte");
    plist[11] = args[coding_arg_for_unibyte] = Qt;
    plist[12] = intern (":docstring");
    plist[13] = build_string ("Do no conversion.\n\
\n\
When you visit a file with this coding, the file is read into a\n\
unibyte buffer as is, thus each byte of a file is treated as a\n\
character.");
    plist[14] = intern (":eol-type");
    plist[15] = args[coding_arg_eol_type] = Qunix;
    args[coding_arg_plist] = Flist (16, plist);
    Fdefine_coding_system_internal (coding_arg_max, args);

    plist[1] = args[coding_arg_name] = Qundecided;
    plist[3] = args[coding_arg_mnemonic] = make_number ('-');
    plist[5] = args[coding_arg_coding_type] = Qundecided;
    /* This is already set.
       plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
    /* The :default-char pair in PLIST is replaced by :charset-list;
       args[coding_arg_default_char] keeps the value set above.  */
    plist[8] = intern (":charset-list");
    plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
    plist[11] = args[coding_arg_for_unibyte] = Qnil;
    plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
    plist[15] = args[coding_arg_eol_type] = Qnil;
    args[coding_arg_plist] = Flist (16, plist);
    Fdefine_coding_system_internal (coding_arg_max, args);
  }

  /* This must come after `no-conversion' has been defined above.  */
  setup_coding_system (Qno_conversion, &safe_terminal_coding);

  {
    int i;

    /* Initially bind every coding-category symbol to `no-conversion'.  */
    for (i = 0; i < coding_category_max; i++)
      Fset (AREF (Vcoding_category_table, i), Qno_conversion);
  }
  /* The native end-of-line convention: `dos' on MS-DOS and Windows,
     `unix' everywhere else.  */
#if defined (MSDOS) || defined (WINDOWSNT)
  system_eol_type = Qdos;
#else
  system_eol_type = Qunix;
#endif
  staticpro (&system_eol_type);
}
10301
10302 char *
10303 emacs_strerror (error_number)
10304      int error_number;
10305 {
10306   char *str;
10307
10308   synchronize_system_messages_locale ();
10309   str = strerror (error_number);
10310
10311   if (! NILP (Vlocale_coding_system))
10312     {
10313       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10314                                                       Vlocale_coding_system,
10315                                                       0);
10316       str = (char *) SDATA (dec);
10317     }
10318
10319   return str;
10320 }
10321
10322 #endif /* emacs */
10323
10324 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10325    (do not change this comment) */