src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The
  62   conversion from coding system symbol to attributes vector is done
  63   by looking up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we set up a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (struct coding_system *coding,
 158                    struct coding_detection_info *detect_info)
 159 {
 160   const unsigned char *src = coding->source;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166   /* FOUND becomes CATEGORY_MASK_XXX once a byte strongly suggests XXX.  */
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172       /* A non-conforming byte rejects XXX; a suggestive one marks it.  */
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source was exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (struct coding_system *coding)
 206 {
 207   const unsigned char *src = coding->source + coding->consumed;
 208   const unsigned char *src_end = coding->source + coding->src_bytes;
 209   /* SRC_BASE remembers the start position in source in each loop.
 210      The loop will be exited when there's not enough source code, or
 211      when there's no room in CHARBUF for a decoded character.  */
 212   const unsigned char *src_base;
 213   /* A buffer to produce decoded characters.  */
 214   int *charbuf = coding->charbuf + coding->charbuf_used;
 215   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 216   int multibytep = coding->src_multibyte;
 217
 218   while (1)
 219     {
 220       src_base = src;
 221       if (charbuf >= charbuf_end)
 222         /* No more room to produce a decoded character.  */
 223         break;
 224       ONE_MORE_BYTE (c);
 225       /* Decode it. */
 226     }
 227
 228  no_more_source:
 229   if (src_base < src_end
 230       && coding->mode & CODING_MODE_LAST_BLOCK)
 231     /* If the source ends by partial bytes to construct a character,
 232        treat them as eight-bit raw data.  */
 233     while (src_base < src_end && charbuf < charbuf_end)
 234       *charbuf++ = *src_base++;
 235   /* Remember how many bytes and characters we consumed.  If the
 236      source is multibyte, the bytes and chars are not identical.  */
 237   coding->consumed = coding->consumed_char = src_base - coding->source;
 238   /* Remember how many characters we produced.  */
 239   coding->charbuf_used = charbuf - coding->charbuf;
 240 }
 241 #endif
 242
 243 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 244
 245   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 246   internal multibyte format by CODING.  The resulting byte sequence
 247   goes to a place pointed to by DESTINATION, the length of which
 248   should not exceed DST_BYTES.
 249
 250   These functions set the information of original and encoded texts in
 251   the members produced, produced_char, consumed, and consumed_char of
 252   the structure *CODING.  They also set the member result to one of
 253   CODING_RESULT_XXX indicating how the encoding finished.
 254
 255   DST_BYTES zero means that source area and destination area are
 256   overlapped, which means that we can produce an encoded text until it
 257   reaches the head of not-yet-encoded source text.
 258
 259   Below is a template of these functions.  */
 260 #if 0
 261 static void
 262 encode_coding_XXX (struct coding_system *coding)
 263 {
 264   int multibytep = coding->dst_multibyte;
 265   int *charbuf = coding->charbuf;
 266   int *charbuf_end = charbuf + coding->charbuf_used;
 267   unsigned char *dst = coding->destination + coding->produced;
 268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 269   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 270   int produced_chars = 0;
 271   /* Stop early enough that one iteration cannot write past DST_END.  */
 272   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 273     {
 274       int c = *charbuf;
 275       /* Encode C into DST, and increment DST.  */
 276     }
 277  label_no_more_destination:
 278   /* How many chars and bytes we produced.  */
 279   coding->produced_char += produced_chars;
 280   coding->produced = dst - coding->destination;
 281 }
 282 #endif
 283
 284 \f
 285 /*** 1. Preamble ***/
 286
 287 #include <config.h>
 288 #include <stdio.h>
 289 #include <setjmp.h>
 290
 291 #include "lisp.h"
 292 #include "buffer.h"
 293 #include "character.h"
 294 #include "charset.h"
 295 #include "ccl.h"
 296 #include "composite.h"
 297 #include "coding.h"
 298 #include "window.h"
 299 #include "frame.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317 Lisp_Object QCascii_compatible_p;
 318
 319 Lisp_Object Qcall_process, Qcall_process_region;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 /* If a symbol has this property, evaluate the value to define the
 327    symbol as a coding system.  */
 328 static Lisp_Object Qcoding_system_define_form;
 329
 330 int coding_system_require_warning;
 331
 332 Lisp_Object Vselect_safe_coding_system_function;
 333
 334 /* Mnemonic string for each format of end-of-line.  */
 335 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 336 /* Mnemonic string to indicate format of end-of-line is not yet
 337    decided.  */
 338 Lisp_Object eol_mnemonic_undecided;
 339
 340 /* Format of end-of-line decided by system.  This is Qunix on
 341    Unix and Mac, Qdos on DOS/Windows.
 342    This has an effect only for external encoding (i.e. for output to
 343    file and process), not for in-buffer or Lisp string encoding.  */
 344 static Lisp_Object system_eol_type;
 345
 346 #ifdef emacs
 347
 348 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 349
 350 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 351
 352 /* Coding system emacs-mule and raw-text are for converting only
 353    end-of-line format.  */
 354 Lisp_Object Qemacs_mule, Qraw_text;
 355 Lisp_Object Qutf_8_emacs;
 356
 357 /* Coding-systems are handed between Emacs Lisp programs and C internal
 358    routines by the following three variables.  */
 359 /* Coding-system for reading files and receiving data from process.  */
 360 Lisp_Object Vcoding_system_for_read;
 361 /* Coding-system for writing files and sending data to process.  */
 362 Lisp_Object Vcoding_system_for_write;
 363 /* Coding-system actually used in the latest I/O.  */
 364 Lisp_Object Vlast_coding_system_used;
 365 /* Set to non-nil when an error is detected while code conversion.  */
 366 Lisp_Object Vlast_code_conversion_error;
 367 /* A vector of length 256 which contains information about special
 368    Latin codes (especially for dealing with Microsoft codes).  */
 369 Lisp_Object Vlatin_extra_code_table;
 370
 371 /* Flag to inhibit code conversion of end-of-line format.  */
 372 int inhibit_eol_conversion;
 373
 374 /* Flag to inhibit ISO2022 escape sequence detection.  */
 375 int inhibit_iso_escape_detection;
 376
 377 /* Flag to inhibit detection of binary files through null bytes.  */
 378 int inhibit_null_byte_detection;
 379
 380 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 381 int inherit_process_coding_system;
 382
 383 /* Coding system to be used to encode text for terminal display when
 384    terminal coding system is nil.  */
 385 struct coding_system safe_terminal_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems.  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427
 428 /* safe_charsets entry for CHARSET_ID, or -1 if it is 255 or out of range.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)          \
 430   (((charset_id) <= (coding)->max_charset_id            \
 431     ? ((coding)->safe_charsets[charset_id] != 255       \
 432        ? (coding)->safe_charsets[charset_id]            \
 433        : -1)                                            \
 434     : -1))
 435
 436
 437 #define CODING_ISO_FLAGS(coding)        \
 438   ((coding)->spec.iso_2022.flags)
 439 #define CODING_ISO_DESIGNATION(coding, reg)     \
 440   ((coding)->spec.iso_2022.current_designation[reg])
 441 #define CODING_ISO_INVOCATION(coding, plane)    \
 442   ((coding)->spec.iso_2022.current_invocation[plane])
 443 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 444   ((coding)->spec.iso_2022.single_shifting)
 445 #define CODING_ISO_BOL(coding)  \
 446   ((coding)->spec.iso_2022.bol)
 447 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 448   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 449 #define CODING_ISO_CMP_STATUS(coding)   \
 450   (&(coding)->spec.iso_2022.cmp_status)
 451 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 452   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 453 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 454   ((coding)->spec.iso_2022.embedded_utf_8)
 455
 456 /* Control characters of ISO2022.  */
 457                         /* code */      /* function */
 458 #define ISO_CODE_LF     0x0A            /* line-feed */
 459 #define ISO_CODE_CR     0x0D            /* carriage-return */
 460 #define ISO_CODE_SO     0x0E            /* shift-out */
 461 #define ISO_CODE_SI     0x0F            /* shift-in */
 462 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 463 #define ISO_CODE_ESC    0x1B            /* escape */
 464 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 465 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 466 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 467
 468 /* Each 1-byte code of ISO2022 is classified into one of the
 469    following classes.  */
 470 enum iso_code_class_type
 471   {
 472     ISO_control_0,              /* Control codes in the range
 473                                    0x00..0x1F and 0x7F, except for the
 474                                    following 4 codes.  */
 475     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 476     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 477     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 478     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 479     ISO_control_1,              /* Control codes in the range
 480                                    0x80..0x9F, except for the
 481                                    following 3 codes.  */
 482     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 483     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 484     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 485     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 486     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 487     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 488     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 489   };
 490
 491 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 492     `iso-flags' attribute of an iso2022 coding system.  */
 493
 494 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 495    instead of the correct short-form sequence (e.g. ESC $ A).  */
 496 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 497
 498 /* If set, reset graphic planes and registers at end-of-line to the
 499    initial state.  */
 500 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 501
 502 /* If set, reset graphic planes and registers before any control
 503    characters to the initial state.  */
 504 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 505
 506 /* If set, encode by 7-bit environment.  */
 507 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 508
 509 /* If set, use locking-shift function.  */
 510 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 511
 512 /* If set, use single-shift function.  Overwrite
 513    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 514 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 515
 516 /* If set, use designation escape sequence.  */
 517 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 518
 519 /* If set, produce revision number sequence.  */
 520 #define CODING_ISO_FLAG_REVISION        0x0080
 521
 522 /* If set, produce ISO6429's direction specifying sequence.  */
 523 #define CODING_ISO_FLAG_DIRECTION       0x0100
 524
 525 /* If set, assume designation states are reset at beginning of line on
 526    output.  */
 527 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 528
 529 /* If set, designation sequence should be placed at beginning of line
 530    on output.  */
 531 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 532
 533 /* If set, do not encode unsafe characters on output.  */
 534 #define CODING_ISO_FLAG_SAFE            0x0800
 535
 536 /* If set, extra latin codes (128..159) are accepted as a valid code
 537    on input.  */
 538 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 539
 540 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 541
 542 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 543
 544 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 545
 546 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 547
 548 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 549
 550 /* A character to be produced on output if encoding of the original
 551    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 552 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 553
 554 /* UTF-8 section */
 555 #define CODING_UTF_8_BOM(coding)        \
 556   ((coding)->spec.utf_8_bom)
 557
 558 /* UTF-16 section */
 559 #define CODING_UTF_16_BOM(coding)       \
 560   ((coding)->spec.utf_16.bom)
 561
 562 #define CODING_UTF_16_ENDIAN(coding)    \
 563   ((coding)->spec.utf_16.endian)
 564
 565 #define CODING_UTF_16_SURROGATE(coding) \
 566   ((coding)->spec.utf_16.surrogate)
 567
 568
 569 /* CCL section */
 570 #define CODING_CCL_DECODER(coding)      \
 571   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 572 #define CODING_CCL_ENCODER(coding)      \
 573   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 574 #define CODING_CCL_VALIDS(coding)                                          \
 575   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 576
 577 /* Index for each coding category in `coding_categories'.  The
 578    CATEGORY_MASK_XXX flag bits are defined as 1 << these values.  */
 579 enum coding_category
 580   {
 581     coding_category_iso_7,
 582     coding_category_iso_7_tight,
 583     coding_category_iso_8_1,
 584     coding_category_iso_8_2,
 585     coding_category_iso_7_else,
 586     coding_category_iso_8_else,
 587     coding_category_utf_8_auto,
 588     coding_category_utf_8_nosig,
 589     coding_category_utf_8_sig,
 590     coding_category_utf_16_auto,
 591     coding_category_utf_16_be,
 592     coding_category_utf_16_le,
 593     coding_category_utf_16_be_nosig,
 594     coding_category_utf_16_le_nosig,
 595     coding_category_charset,
 596     coding_category_sjis,
 597     coding_category_big5,
 598     coding_category_ccl,
 599     coding_category_emacs_mule,
 600     /* All above are targets of code detection.  */
 601     coding_category_raw_text,
 602     coding_category_undecided,
 603     coding_category_max         /* Number of categories; used as a table size.  */
 604   };
 605
 606 /* Definitions of flag bits used in detect_coding_XXXX.  */
 607 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 608 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 609 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 610 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 611 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 612 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 613 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 614 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 615 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 616 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 617 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 618 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 619 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 620 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 621 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 622 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 623 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 624 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 625 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 626 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 627
 628 /* This value is returned if detect_coding_mask () finds nothing other
 629    than ASCII characters.  */
 630 #define CATEGORY_MASK_ANY               \
 631   (CATEGORY_MASK_ISO_7                  \
 632    | CATEGORY_MASK_ISO_7_TIGHT          \
 633    | CATEGORY_MASK_ISO_8_1              \
 634    | CATEGORY_MASK_ISO_8_2              \
 635    | CATEGORY_MASK_ISO_7_ELSE           \
 636    | CATEGORY_MASK_ISO_8_ELSE           \
 637    | CATEGORY_MASK_UTF_8_AUTO           \
 638    | CATEGORY_MASK_UTF_8_NOSIG          \
 639    | CATEGORY_MASK_UTF_8_SIG            \
 640    | CATEGORY_MASK_UTF_16_AUTO          \
 641    | CATEGORY_MASK_UTF_16_BE            \
 642    | CATEGORY_MASK_UTF_16_LE            \
 643    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 644    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 645    | CATEGORY_MASK_CHARSET              \
 646    | CATEGORY_MASK_SJIS                 \
 647    | CATEGORY_MASK_BIG5                 \
 648    | CATEGORY_MASK_CCL                  \
 649    | CATEGORY_MASK_EMACS_MULE)
 650
 651
 652 #define CATEGORY_MASK_ISO_7BIT \
 653   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 654
 655 #define CATEGORY_MASK_ISO_8BIT \
 656   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 657
 658 #define CATEGORY_MASK_ISO_ELSE \
 659   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 660
 661 #define CATEGORY_MASK_ISO_ESCAPE        \
 662   (CATEGORY_MASK_ISO_7                  \
 663    | CATEGORY_MASK_ISO_7_TIGHT          \
 664    | CATEGORY_MASK_ISO_7_ELSE           \
 665    | CATEGORY_MASK_ISO_8_ELSE)
 666
 667 #define CATEGORY_MASK_ISO       \
 668   (  CATEGORY_MASK_ISO_7BIT     \
 669      | CATEGORY_MASK_ISO_8BIT   \
 670      | CATEGORY_MASK_ISO_ELSE)
 671
 672 #define CATEGORY_MASK_UTF_16            \
 673   (CATEGORY_MASK_UTF_16_AUTO            \
 674    | CATEGORY_MASK_UTF_16_BE            \
 675    | CATEGORY_MASK_UTF_16_LE            \
 676    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 677    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 678
 679 #define CATEGORY_MASK_UTF_8     \
 680   (CATEGORY_MASK_UTF_8_AUTO     \
 681    | CATEGORY_MASK_UTF_8_NOSIG  \
 682    | CATEGORY_MASK_UTF_8_SIG)
 683
 684 /* List of symbols `coding-category-xxx' ordered by priority.  This
 685    variable is exposed to Emacs Lisp.  */
 686 static Lisp_Object Vcoding_category_list;
 687
 688 /* Table of coding categories (Lisp symbols).  This variable is for
 689    internal use only.  */
 690 static Lisp_Object Vcoding_category_table;
 691
 692 /* Table of coding-categories ordered by priority.  */
 693 static enum coding_category coding_priorities[coding_category_max];
 694
 695 /* Nth element is a coding context for the coding system bound to the
 696    Nth coding category.  */
 697 static struct coding_system coding_categories[coding_category_max];
 698
 699 /*** Commonly used macros and functions ***/
 700 /* Fallback min/max, used only if not already defined; args may be evaluated twice.  */
 701 #ifndef min
 702 #define min(a, b) ((a) < (b) ? (a) : (b))
 703 #endif
 704 #ifndef max
 705 #define max(a, b) ((a) > (b) ? (a) : (b))
 706 #endif
 707 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
 708 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 709   do {                                                  \
 710     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 711     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 712   } while (0)
 713
 714
 715 /* Safely get one byte from the source text pointed by SRC which ends
 716    at SRC_END, set C to that byte, and increment consumed_chars.  If
 717    the source is exhausted, jump to `no_more_source', first recording
 718    CODING_RESULT_INSUFFICIENT_SRC when src_base < src.  If multibytep
 719    is nonzero and C has the high bit set: a 0xC0/0xC1 lead byte is
 720    merged with the next byte into C; otherwise C is set to the negative
 721    character code and CODING_RESULT_INVALID_SRC is recorded.  Callers
 722    must provide: src, src_end, src_base, multibytep, consumed_chars, coding.  */
 723 #define ONE_MORE_BYTE(c)                                \
 724   do {                                                  \
 725     if (src == src_end)                                 \
 726       {                                                 \
 727         if (src_base < src)                             \
 728           record_conversion_result                      \
 729             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 730         goto no_more_source;                            \
 731       }                                                 \
 732     c = *src++;                                         \
 733     if (multibytep && (c & 0x80))                       \
 734       {                                                 \
 735         if ((c & 0xFE) == 0xC0)                         \
 736           c = ((c & 1) << 6) | *src++;                  \
 737         else                                            \
 738           {                                             \
 739             src--;                                      \
 740             c = - string_char (src, &src, NULL);        \
 741             record_conversion_result                    \
 742               (coding, CODING_RESULT_INVALID_SRC);      \
 743           }                                             \
 744       }                                                 \
 745     consumed_chars++;                                   \
 746   } while (0)
 747
 748 /* Safely get two bytes from the source text pointed by SRC which ends
 749    at SRC_END, and set C1 and C2 to those bytes while skipping the
 750    heading multibyte characters.  If there are not enough bytes in the
 751    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 752    a multibyte character (other than the 0xC0/0xC1 two-byte form) is
 753    found for C2, set C2 to -1; only its first byte is consumed.  The
 754    caller should declare and set these variables in advance:
 755         src, src_end, multibytep
 756    It is intended that this macro is used in detect_coding_utf_16.  */
 757
 758 #define TWO_MORE_BYTES(c1, c2)                          \
 759   do {                                                  \
 760     do {                                                \
 761       if (src == src_end)                               \
 762         goto no_more_source;                            \
 763       c1 = *src++;                                      \
 764       if (multibytep && (c1 & 0x80))                    \
 765         {                                               \
 766           if ((c1 & 0xFE) == 0xC0)                      \
 767             c1 = ((c1 & 1) << 6) | *src++;              \
 768           else                                          \
 769             {                                           \
 770               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 771               c1 = -1;                                  \
 772             }                                           \
 773         }                                               \
 774     } while (c1 < 0);                                   \
 775     if (src == src_end)                                 \
 776       goto no_more_source;                              \
 777     c2 = *src++;                                        \
 778     if (multibytep && (c2 & 0x80))                      \
 779       {                                                 \
 780         if ((c2 & 0xFE) == 0xC0)                        \
 781           c2 = ((c2 & 1) << 6) | *src++;                \
 782         else                                            \
 783           c2 = -1;                                      \
 784       }                                                 \
 785   } while (0)
 786
 787
     /* Get one byte from the source text pointed to by SRC into C and
        advance SRC, without first comparing SRC against SRC_END.  If
        multibytep is nonzero and the byte has its high bit set, then
        either fold a 0xC0/0xC1 lead byte together with the following
        byte, or read the whole character with string_char, set C to its
        negated value, and record CODING_RESULT_INVALID_SRC.  In every
        case increment consumed_chars by one.  The caller should declare
        and set these variables appropriately in advance:
             src, multibytep, coding, consumed_chars  */
 788 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 789   do {                                                  \
 790     c = *src++;                                         \
 791     if (multibytep && (c & 0x80))                       \
 792       {                                                 \
 793         if ((c & 0xFE) == 0xC0)                         \
 794           c = ((c & 1) << 6) | *src++;                  \
 795         else                                            \
 796           {                                             \
 797             src--;                                      \
 798             c = - string_char (src, &src, NULL);        \
 799             record_conversion_result                    \
 800               (coding, CODING_RESULT_INVALID_SRC);      \
 801           }                                             \
 802       }                                                 \
 803     consumed_chars++;                                   \
 804   } while (0)
 805
 806
 807 /* Store a byte C in the place pointed to by DST, advance DST to the
 808    next free position, and increment PRODUCED_CHARS.  The caller
 809    should ensure that C is 0..127, and declare and set the variables
 810    `dst' and `produced_chars' appropriately in advance.  No check is
 811    made here that DST has room for the byte.  */
 812
 813
 814 #define EMIT_ONE_ASCII_BYTE(c)  \
 815   do {                          \
 816     produced_chars++;           \
 817     *dst++ = (c);               \
 818   } while (0)
 819
 820
 821 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 and then C2.  */
 822
 823 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 824   do {                                  \
 825     produced_chars += 2;                \
 826     *dst++ = (c1), *dst++ = (c2);       \
 827   } while (0)
 828
 829
 830 /* Store a byte C in the place pointed to by DST, advance DST past it,
 831    and increment PRODUCED_CHARS.  If MULTIBYTEP is nonzero, store C in
 832    multibyte form (values of 0x80 and above go through BYTE8_TO_CHAR
 833    first).  The caller should declare and set the variables `dst',
 834    `multibytep' and `produced_chars' appropriately in advance.  */
 835
 836 #define EMIT_ONE_BYTE(c)                \
 837   do {                                  \
 838     produced_chars++;                   \
 839     if (multibytep)                     \
 840       {                                 \
 841         int ch = (c);                   \
 842         if (ch >= 0x80)                 \
 843           ch = BYTE8_TO_CHAR (ch);      \
 844         CHAR_STRING_ADVANCE (ch, dst);  \
 845       }                                 \
 846     else                                \
 847       *dst++ = (c);                     \
 848   } while (0)
 849
 850
 851 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 and then C2.  */
 852
 853 #define EMIT_TWO_BYTES(c1, c2)          \
 854   do {                                  \
 855     produced_chars += 2;                \
 856     if (multibytep)                     \
 857       {                                 \
 858         int ch;                         \
 859                                         \
 860         ch = (c1);                      \
 861         if (ch >= 0x80)                 \
 862           ch = BYTE8_TO_CHAR (ch);      \
 863         CHAR_STRING_ADVANCE (ch, dst);  \
 864         ch = (c2);                      \
 865         if (ch >= 0x80)                 \
 866           ch = BYTE8_TO_CHAR (ch);      \
 867         CHAR_STRING_ADVANCE (ch, dst);  \
 868       }                                 \
 869     else                                \
 870       {                                 \
 871         *dst++ = (c1);                  \
 872         *dst++ = (c2);                  \
 873       }                                 \
 874   } while (0)
 875
 876
     /* Emit three bytes C1, C2 and C3 in that order, as one
        EMIT_ONE_BYTE followed by one EMIT_TWO_BYTES.  */
 877 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 878   do {                                  \
 879     EMIT_ONE_BYTE (c1);                 \
 880     EMIT_TWO_BYTES (c2, c3);            \
 881   } while (0)
 882
 883
     /* Emit four bytes C1, C2, C3 and C4 in that order, as two
        consecutive EMIT_TWO_BYTES.  */
 884 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 885   do {                                          \
 886     EMIT_TWO_BYTES (c1, c2);                    \
 887     EMIT_TWO_BYTES (c3, c4);                    \
 888   } while (0)
 889
 890
 891 /* Prototypes for static functions.  */
 892 static void record_conversion_result (struct coding_system *coding,
 893                                       enum coding_result_code result);
 894 static int detect_coding_utf_8 (struct coding_system *,
 895                                 struct coding_detection_info *info);
 896 static void decode_coding_utf_8 (struct coding_system *);
 897 static int encode_coding_utf_8 (struct coding_system *);
 898
 899 static int detect_coding_utf_16 (struct coding_system *,
 900                                  struct coding_detection_info *info);
 901 static void decode_coding_utf_16 (struct coding_system *);
 902 static int encode_coding_utf_16 (struct coding_system *);
 903
 904 static int detect_coding_iso_2022 (struct coding_system *,
 905                                    struct coding_detection_info *info);
 906 static void decode_coding_iso_2022 (struct coding_system *);
 907 static int encode_coding_iso_2022 (struct coding_system *);
 908
 909 static int detect_coding_emacs_mule (struct coding_system *,
 910                                      struct coding_detection_info *info);
 911 static void decode_coding_emacs_mule (struct coding_system *);
 912 static int encode_coding_emacs_mule (struct coding_system *);
 913
 914 static int detect_coding_sjis (struct coding_system *,
 915                                struct coding_detection_info *info);
 916 static void decode_coding_sjis (struct coding_system *);
 917 static int encode_coding_sjis (struct coding_system *);
 918
 919 static int detect_coding_big5 (struct coding_system *,
 920                                struct coding_detection_info *info);
 921 static void decode_coding_big5 (struct coding_system *);
 922 static int encode_coding_big5 (struct coding_system *);
 923
 924 static int detect_coding_ccl (struct coding_system *,
 925                               struct coding_detection_info *info);
 926 static void decode_coding_ccl (struct coding_system *);
 927 static int encode_coding_ccl (struct coding_system *);
 928
 929 static void decode_coding_raw_text (struct coding_system *);
 930 static int encode_coding_raw_text (struct coding_system *);
 931
 932 static void coding_set_source (struct coding_system *);
 933 static void coding_set_destination (struct coding_system *);
 934 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
 935 static void coding_alloc_by_making_gap (struct coding_system *,
 936                                         EMACS_INT, EMACS_INT);
 937 static unsigned char *alloc_destination (struct coding_system *,
 938                                          EMACS_INT, unsigned char *);
 939 static void setup_iso_safe_charsets (Lisp_Object);
 940 static unsigned char *encode_designation_at_bol (struct coding_system *,
 941                                                  int *, int *,
 942                                                  unsigned char *);
 943 static int detect_eol (const unsigned char *,
 944                        EMACS_INT, enum coding_category);
 945 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 946 static void decode_eol (struct coding_system *);
 947 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 948 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 949 static int produce_chars (struct coding_system *, Lisp_Object, int);
 950 static INLINE void produce_charset (struct coding_system *, int *,
 951                                     EMACS_INT);
 952 static void produce_annotation (struct coding_system *, EMACS_INT);
 953 static int decode_coding (struct coding_system *);
 954 static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
 955                                                   struct coding_system *,
 956                                                   int *, EMACS_INT *);
 957 static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
 958                                               struct coding_system *,
 959                                               int *, EMACS_INT *);
 960 static void consume_chars (struct coding_system *, Lisp_Object, int);
 961 static int encode_coding (struct coding_system *);
 962 static Lisp_Object make_conversion_work_buffer (int);
 963 static Lisp_Object code_conversion_restore (Lisp_Object);
 964 static INLINE int char_encodable_p (int, Lisp_Object);
 965 static Lisp_Object make_subsidiaries (Lisp_Object);
 966
 967 /* Store RESULT in CODING->result.  Unless RESULT is the success
 968    code or the insufficient-destination code, also set
 969    Vlast_code_conversion_error to the matching symbol.  */
 970 static void
 971 record_conversion_result (struct coding_system *coding,
 972                           enum coding_result_code result)
 973 {
 974   coding->result = result;
 975
 976   /* Success has no error to report.  Running out of destination
 977      space is not recorded either, because it happens just
 978      temporarily and is resolved when the whole conversion is
 979      finished.  */
 980   if (result == CODING_RESULT_SUCCESS
 981       || result == CODING_RESULT_INSUFFICIENT_DST)
 982     return;
 983
 984   if (result == CODING_RESULT_INSUFFICIENT_SRC)
 985     Vlast_code_conversion_error = Qinsufficient_source;
 986   else if (result == CODING_RESULT_INCONSISTENT_EOL)
 987     Vlast_code_conversion_error = Qinconsistent_eol;
 988   else if (result == CODING_RESULT_INVALID_SRC)
 989     Vlast_code_conversion_error = Qinvalid_source;
 990   else if (result == CODING_RESULT_INTERRUPT)
 991     Vlast_code_conversion_error = Qinterrupted;
 992   else if (result == CODING_RESULT_INSUFFICIENT_MEM)
 993     Vlast_code_conversion_error = Qinsufficient_memory;
 994   else
 995     {
 996       /* A result code this function has no symbol for.  */
 997       Vlast_code_conversion_error = intern ("Unknown error");
 998     }
 999 }
1000
1001 /* This wrapper macro is used to preserve validity of pointers into
1002    buffer text across calls to decode_char, which could cause
1003    relocation of buffers if it loads a charset map, because loading a
1004    charset map allocates large structures.

        It sets C to DECODE_CHAR (CHARSET, CODE).  If charset_map_loaded
        is nonzero afterwards, it refreshes CODING->source with
        coding_set_source and shifts SRC, SRC_BASE and SRC_END by the
        distance CODING->source moved.  */
1005 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1006   do {                                                                       \
1007     charset_map_loaded = 0;                                                  \
1008     c = DECODE_CHAR (charset, code);                                         \
1009     if (charset_map_loaded)                                                  \
1010       {                                                                      \
1011         const unsigned char *orig = coding->source;                          \
1012         EMACS_INT offset;                                                    \
1013                                                                              \
1014         coding_set_source (coding);                                          \
1015         offset = coding->source - orig;                                      \
1016         src += offset;                                                       \
1017         src_base += offset;                                                  \
1018         src_end += offset;                                                   \
1019       }                                                                      \
1020   } while (0)
1021
1022
1023 /* Unless more than BYTES bytes of room remain at dst, allocate more
1024    memory for coding->destination and update dst and dst_end.  We
1025    don't have to take care of coding->source which will be relocated.
1026    It is handled by calling coding_set_source in encode_coding.

        The amount requested from alloc_destination is BYTES plus the
        number of charbuf elements not yet processed (charbuf_end -
        charbuf).  The caller should declare and set these variables
        appropriately in advance:
             dst, dst_end, charbuf, charbuf_end, coding  */
1027
1028 #define ASSURE_DESTINATION(bytes)                               \
1029   do {                                                          \
1030     if (dst + (bytes) >= dst_end)                               \
1031       {                                                         \
1032         int more_bytes = charbuf_end - charbuf + (bytes);       \
1033                                                                 \
1034         dst = alloc_destination (coding, more_bytes, dst);      \
1035         dst_end = coding->destination + coding->dst_bytes;      \
1036       }                                                         \
1037   } while (0)
1038
1039
1040 /* Store multibyte form of the character C in P, and advance P to the
1041    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
1042    never calls MAYBE_UNIFY_CHAR.

        The form is 1 to 5 bytes long depending on which MAX_n_BYTE_CHAR
        bound C falls under; anything above MAX_5_BYTE_CHAR is stored
        through BYTE8_STRING after subtracting 0x3FFF80.  P must be a
        modifiable pointer lvalue.  C is evaluated several times, so it
        must have no side effects; every use is parenthesized so that an
        expression argument is handled correctly.  */
1043
1044 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
1045   do {                                          \
1046     if ((c) <= MAX_1_BYTE_CHAR)                 \
1047       *(p)++ = (c);                             \
1048     else if ((c) <= MAX_2_BYTE_CHAR)            \
1049       *(p)++ = (0xC0 | ((c) >> 6)),             \
1050         *(p)++ = (0x80 | ((c) & 0x3F));         \
1051     else if ((c) <= MAX_3_BYTE_CHAR)            \
1052       *(p)++ = (0xE0 | ((c) >> 12)),            \
1053         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1054         *(p)++ = (0x80 | ((c) & 0x3F));         \
1055     else if ((c) <= MAX_4_BYTE_CHAR)            \
1056       *(p)++ = (0xF0 | ((c) >> 18)),            \
1057         *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
1058         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1059         *(p)++ = (0x80 | ((c) & 0x3F));         \
1060     else if ((c) <= MAX_5_BYTE_CHAR)            \
1061       *(p)++ = 0xF8,                            \
1062         *(p)++ = (0x80 | (((c) >> 18) & 0x0F)), \
1063         *(p)++ = (0x80 | (((c) >> 12) & 0x3F)), \
1064         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1065         *(p)++ = (0x80 | ((c) & 0x3F));         \
1066     else                                        \
1067       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1068   } while (0)
1069
1070
1071 /* Return the character code of character whose multibyte form is at
1072    P, and advance P to the end of the multibyte form.  This is like
1073    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

        The length (1 to 5 bytes) is chosen from the high bits of the
        first byte alone; the following bytes are not validated.  In the
        2-byte case a lead byte below 0xC2 additionally ORs 0x3FFF80
        into the result.  In the 5-byte case only the last four bytes
        contribute to the result.  P is evaluated several times and must
        be a modifiable pointer lvalue.  */
1074
1075 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1076   (!((p)[0] & 0x80)                                             \
1077    ? *(p)++                                                     \
1078    : ! ((p)[0] & 0x20)                                          \
1079    ? ((p) += 2,                                                 \
1080       ((((p)[-2] & 0x1F) << 6)                                  \
1081        | ((p)[-1] & 0x3F)                                       \
1082        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1083    : ! ((p)[0] & 0x10)                                          \
1084    ? ((p) += 3,                                                 \
1085       ((((p)[-3] & 0x0F) << 12)                                 \
1086        | (((p)[-2] & 0x3F) << 6)                                \
1087        | ((p)[-1] & 0x3F)))                                     \
1088    : ! ((p)[0] & 0x08)                                          \
1089    ? ((p) += 4,                                                 \
1090       ((((p)[-4] & 0xF) << 18)                                  \
1091        | (((p)[-3] & 0x3F) << 12)                               \
1092        | (((p)[-2] & 0x3F) << 6)                                \
1093        | ((p)[-1] & 0x3F)))                                     \
1094    : ((p) += 5,                                                 \
1095       ((((p)[-4] & 0x3F) << 18)                                 \
1096        | (((p)[-3] & 0x3F) << 12)                               \
1097        | (((p)[-2] & 0x3F) << 6)                                \
1098        | ((p)[-1] & 0x3F))))
1099
1100
1101 /* Recompute CODING->source from CODING->src_object.  */
1102 static void
1103 coding_set_source (struct coding_system *coding)
1104 {
1105   struct buffer *buf;
1106
1107   if (STRINGP (coding->src_object))
1108     {
1109       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1110       return;
1111     }
1112   /* Any other non-buffer source is a C string, which is never
1113      relocated automatically, so there is nothing to update.  */
1114   if (! BUFFERP (coding->src_object))
1115     return;
1116   buf = XBUFFER (coding->src_object);
1117   if (coding->src_pos >= 0)
1118     coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1119   else
1120     coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1121 }
1122
1123 /* Refresh CODING->destination and CODING->dst_bytes for a buffer.  */
1124 static void
1125 coding_set_destination (struct coding_system *coding)
1126 {
1127   struct buffer *buf;
1128
1129   /* Otherwise, the destination is C string and is never relocated
1130      automatically.  Thus we don't have to update anything.  */
1131   if (! BUFFERP (coding->dst_object))
1132     return;
1133
1134   if (coding->src_pos >= 0)
1135     {
1136       /* We are sure that coding->dst_pos_byte is before the gap
1137          of the buffer.  */
1138       buf = XBUFFER (coding->dst_object);
1139       coding->destination
1140         = BUF_BEG_ADDR (buf) + coding->dst_pos_byte - BEG_BYTE;
1141       coding->dst_bytes = BUF_GAP_END_ADDR (buf) - coding->destination;
1142       return;
1143     }
1144
1145   coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1146   coding->dst_bytes = (GAP_END_ADDR
1147                        - (coding->src_bytes - coding->consumed)
1148                        - coding->destination);
1149 }
1150
1151 /* Enlarge CODING->destination by BYTES bytes with xrealloc.  */
1152 static void
1153 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1154 {
1155   EMACS_INT size = coding->dst_bytes + bytes;
1156   coding->destination = (unsigned char *) xrealloc (coding->destination, size);
1157   coding->dst_bytes = size;
1158 }
1159
     /* Enlarge the gap of CODING's destination buffer by BYTES bytes
        using make_gap.

        If the source and destination objects are the same, the gap is
        first hidden from make_gap: GPT and GPT_BYTE are moved forward
        by GAP_HEAD_USED, GAP_SIZE is set to zero, and Z, ZV, Z_BYTE and
        ZV_BYTE are extended by the old gap size.  All of these
        adjustments are undone after make_gap returns.

        Otherwise the destination buffer is made current around the
        call to make_gap and the previous buffer is restored after.  */
1160 static void
1161 coding_alloc_by_making_gap (struct coding_system *coding,
1162                             EMACS_INT gap_head_used, EMACS_INT bytes)
1163 {
1164   if (EQ (coding->src_object, coding->dst_object))
1165     {
1166       /* The gap may contain the produced data at the head and not-yet
1167          consumed data at the tail.  To preserve those data, we at
1168          first make the gap size to zero, then increase the gap
1169          size.  */
1170       EMACS_INT add = GAP_SIZE;
1171
1172       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1173       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1174       make_gap (bytes);
1175       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1176       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1177     }
1178   else
1179     {
1180       Lisp_Object this_buffer;
1181
1182       this_buffer = Fcurrent_buffer ();
1183       set_buffer_internal (XBUFFER (coding->dst_object));
1184       make_gap (bytes);
1185       set_buffer_internal (XBUFFER (this_buffer));
1186     }
1187 }
1188
1189 /* Make room for NBYTES more bytes in CODING's destination and return
1190    DST adjusted to the possibly moved CODING->destination.  */
1191 static unsigned char *
1192 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1193                    unsigned char *dst)
1194 {
1195   EMACS_INT dst_offset = dst - coding->destination;
1196
1197   if (! BUFFERP (coding->dst_object))
1198     coding_alloc_by_realloc (coding, nbytes);
1199   else
1200     {
1201       struct buffer *dbuf = XBUFFER (coding->dst_object);
1202
1203       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (dbuf), nbytes);
1204     }
1205   coding_set_destination (coding);
1206   return coding->destination + dst_offset;
1207 }
1208
1209 /** Macros for annotations.  */
1210
1211 /* An annotation data is stored in the array coding->charbuf in this
1212    format:
1213      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1214    LENGTH is the number of elements in the annotation.
1215    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1216    NCHARS is the number of characters in the text annotated.
1217
1218    The format of the following elements depend on ANNOTATION_MASK.
1219
1220    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1221    follow:
1222      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1223
1224    NBYTES is the number of bytes specified in the header part of
1225    old-style emacs-mule encoding, or 0 for the other kind of
1226    composition.
1227
1228    METHOD is one of enum composition_method.
1229
1230    Optional COMPOSITION-COMPONENTS are characters and composition
1231    rules.
1232
1233    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1234    follows.
1235
1236    If ANNOTATION_MASK is 0, this annotation is just a placeholder to
1237    recover from an invalid annotation, and should be skipped by
1238    produce_annotation.  */
1239
1240 /* Maximum length of the header of annotation data.  */
1241 #define MAX_ANNOTATION_LENGTH 5
1242
     /* Store the three-element annotation header [ -LEN MASK NCHARS ] at
        BUF, advancing BUF past it, and set coding->annotated to 1.  The
        caller must have `coding' in scope.  The expansion deliberately
        has no trailing semicolon, so that the macro behaves as a single
        statement (for example in an unbraced if/else).  */
1243 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1244   do {                                                  \
1245     *(buf)++ = -(len);                                  \
1246     *(buf)++ = (mask);                                  \
1247     *(buf)++ = (nchars);                                \
1248     coding->annotated = 1;                              \
1249   } while (0)
1250
     /* Store a composition annotation at BUF: the annotation header with
        length 5 and CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES
        and METHOD.  BUF is advanced past everything stored.  Arguments
        are parenthesized so that expression arguments are safe.  */
1251 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1252   do {                                                                      \
1253     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1254     *(buf)++ = (nbytes);                                                    \
1255     *(buf)++ = (method);                                                    \
1256   } while (0)
1257
1258
     /* Store a charset annotation at BUF: the annotation header with
        length 4 and CODING_ANNOTATE_CHARSET_MASK, followed by ID.  BUF
        is advanced past everything stored.  Arguments are parenthesized
        so that expression arguments are safe.  */
1259 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1260   do {                                                                  \
1261     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1262     *(buf)++ = (id);                                                    \
1263   } while (0)
1264
1265 \f
1266 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1267
1268
1269
1270 \f
1271 /*** 3. UTF-8 ***/
1272
1273 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1274    Check if a text is encoded in UTF-8.  If it is, return 1, else
1275    return 0.  */
1276
     /* Classify a byte C by its high bits: a one-octet character
        (below 0x80), an extra (trailing) octet (10xxxxxx), or the
        leading octet of a 2-, 3-, 4- or 5-octet sequence.  */
1277 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1278 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1279 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1280 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1281 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1282 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1283
     /* The byte order mark character, and the three octets that the
        UTF-8 detector and decoder compare against to recognize it.  */
1284 #define UTF_BOM 0xFEFF
1285 #define UTF_8_BOM_1 0xEF
1286 #define UTF_8_BOM_2 0xBB
1287 #define UTF_8_BOM_3 0xBF
1288
     /* Scan the source of CODING, starting after its head_ascii bytes,
        and update DETECT_INFO for the UTF-8 categories.

        Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
        if a sequence has a missing or malformed extra octet, if a
        leading octet matches none of the 2- to 5-octet forms, or if the
        last block ends in the middle of a sequence.  Otherwise return 1
        once the source is exhausted; in that case the SIG and NOSIG
        categories are marked found or rejected depending on whether the
        three BOM octets were seen at the very start of the source and
        whether any multi-octet sequence was seen.  */
1289 static int
1290 detect_coding_utf_8 (struct coding_system *coding,
1291                      struct coding_detection_info *detect_info)
1292 {
1293   const unsigned char *src = coding->source, *src_base;
1294   const unsigned char *src_end = coding->source + coding->src_bytes;
1295   int multibytep = coding->src_multibyte;
1296   int consumed_chars = 0;
1297   int bom_found = 0;
1298   int found = 0;
1299
1300   detect_info->checked |= CATEGORY_MASK_UTF_8;
1301   /* A coding system of this category is always ASCII compatible.  */
1302   src += coding->head_ascii;
1303
1304   while (1)
1305     {
1306       int c, c1, c2, c3, c4;
1307
           /* SRC_BASE marks the start of the sequence being examined;
              ONE_MORE_BYTE jumps to no_more_source at end of input.  */
1308       src_base = src;
1309       ONE_MORE_BYTE (c);
1310       if (c < 0 || UTF_8_1_OCTET_P (c))
1311         continue;
1312       ONE_MORE_BYTE (c1);
1313       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1314         break;
1315       if (UTF_8_2_OCTET_LEADING_P (c))
1316         {
1317           found = 1;
1318           continue;
1319         }
1320       ONE_MORE_BYTE (c2);
1321       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1322         break;
1323       if (UTF_8_3_OCTET_LEADING_P (c))
1324         {
1325           found = 1;
               /* Only a BOM at the very beginning of the source counts.  */
1326           if (src_base == coding->source
1327               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1328             bom_found = 1;
1329           continue;
1330         }
1331       ONE_MORE_BYTE (c3);
1332       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1333         break;
1334       if (UTF_8_4_OCTET_LEADING_P (c))
1335         {
1336           found = 1;
1337           continue;
1338         }
1339       ONE_MORE_BYTE (c4);
1340       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1341         break;
1342       if (UTF_8_5_OCTET_LEADING_P (c))
1343         {
1344           found = 1;
1345           continue;
1346         }
1347       break;
1348     }
       /* Reached only through `break': the text is not valid UTF-8.  */
1349   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1350   return 0;
1351
1352  no_more_source:
       /* The input ended inside a sequence and no more input follows.  */
1353   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1354     {
1355       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1356       return 0;
1357     }
1358   if (bom_found)
1359     {
1360       /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
1361       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1362     }
1363   else
1364     {
1365       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1366       if (found)
1367         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1368     }
1369   return 1;
1370 }
1371
1372
     /* Decode UTF-8 text from CODING->source, starting CODING->consumed
        bytes in, into character codes appended to CODING->charbuf.

        If the coding system's BOM setting is not utf_without_bom, the
        first three octets are probed for the BOM.  A BOM is skipped;
        anything else is left in place by rewinding both SRC and
        CONSUMED_CHARS, so that the character count stays in step with
        the byte position.  After the probe the BOM setting is switched
        to utf_without_bom.

        Each sequence that is malformed, overlong, a surrogate, or above
        MAX_CHAR has its first byte stored on its own (as a BYTE8
        character unless it is ASCII) and CODING->errors is incremented;
        decoding then resumes at the following byte.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the state up to the last
        completely processed sequence.  */
1373 static void
1374 decode_coding_utf_8 (struct coding_system *coding)
1375 {
1376   const unsigned char *src = coding->source + coding->consumed;
1377   const unsigned char *src_end = coding->source + coding->src_bytes;
1378   const unsigned char *src_base;
1379   int *charbuf = coding->charbuf + coding->charbuf_used;
1380   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1381   int consumed_chars = 0, consumed_chars_base = 0;
1382   int multibytep = coding->src_multibyte;
1383   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1384   Lisp_Object attr, charset_list;
1385   int eol_crlf =
1386     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1387   int byte_after_cr = -1;
1388
1389   CODING_GET_INFO (coding, attr, charset_list);
1390
1391   if (bom != utf_without_bom)
1392     {
1393       int c1, c2, c3;
1394
           /* Every rewind below restores CONSUMED_CHARS together with
              SRC; ONE_MORE_BYTE has advanced both.  */
1395       src_base = src;
1396       ONE_MORE_BYTE (c1);
1397       if (! UTF_8_3_OCTET_LEADING_P (c1))
1398         src = src_base, consumed_chars = consumed_chars_base;
1399       else
1400         {
1401           ONE_MORE_BYTE (c2);
1402           if (! UTF_8_EXTRA_OCTET_P (c2))
1403             src = src_base, consumed_chars = consumed_chars_base;
1404           else
1405             {
1406               ONE_MORE_BYTE (c3);
1407               if (! UTF_8_EXTRA_OCTET_P (c3))
1408                 src = src_base, consumed_chars = consumed_chars_base;
1409               else
1410                 {
1411                   if ((c1 != UTF_8_BOM_1)
1412                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1413                     src = src_base, consumed_chars = consumed_chars_base;
1414                   else
1415                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1416                 }
1417             }
1418         }
1419     }
1420   CODING_UTF_8_BOM (coding) = utf_without_bom;
1421
1422   while (1)
1423     {
1424       int c, c1, c2, c3, c4, c5;
1425
1426       src_base = src;
1427       consumed_chars_base = consumed_chars;
1428
1429       if (charbuf >= charbuf_end)
1430         {
               /* Give back the byte that was read ahead after a CR.  */
1431           if (byte_after_cr >= 0)
1432             src_base--;
1433           break;
1434         }
1435
1436       if (byte_after_cr >= 0)
1437         c1 = byte_after_cr, byte_after_cr = -1;
1438       else
1439         ONE_MORE_BYTE (c1);
1440       if (c1 < 0)
1441         {
1442           c = - c1;
1443         }
1444       else if (UTF_8_1_OCTET_P (c1))
1445         {
               /* With CRLF end-of-line, read the byte after a CR ahead
                  of time; it is processed on the next iteration.  */
1446           if (eol_crlf && c1 == '\r')
1447             ONE_MORE_BYTE (byte_after_cr);
1448           c = c1;
1449         }
1450       else
1451         {
1452           ONE_MORE_BYTE (c2);
1453           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1454             goto invalid_code;
1455           if (UTF_8_2_OCTET_LEADING_P (c1))
1456             {
1457               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1458               /* Reject overlong sequences here and below.  Encoders
1459                  producing them are incorrect, they can be misleading,
1460                  and they mess up read/write invariance.  */
1461               if (c < 128)
1462                 goto invalid_code;
1463             }
1464           else
1465             {
1466               ONE_MORE_BYTE (c3);
1467               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1468                 goto invalid_code;
1469               if (UTF_8_3_OCTET_LEADING_P (c1))
1470                 {
1471                   c = (((c1 & 0xF) << 12)
1472                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1473                   if (c < 0x800
1474                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1475                     goto invalid_code;
1476                 }
1477               else
1478                 {
1479                   ONE_MORE_BYTE (c4);
1480                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1481                     goto invalid_code;
1482                   if (UTF_8_4_OCTET_LEADING_P (c1))
1483                     {
1484                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1485                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1486                     if (c < 0x10000)
1487                       goto invalid_code;
1488                     }
1489                   else
1490                     {
1491                       ONE_MORE_BYTE (c5);
1492                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1493                         goto invalid_code;
1494                       if (UTF_8_5_OCTET_LEADING_P (c1))
1495                         {
1496                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1497                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1498                                | (c5 & 0x3F));
1499                           if ((c > MAX_CHAR) || (c < 0x200000))
1500                             goto invalid_code;
1501                         }
1502                       else
1503                         goto invalid_code;
1504                     }
1505                 }
1506             }
1507         }
1508
1509       *charbuf++ = c;
1510       continue;
1511
1512     invalid_code:
           /* Back up to the start of the bad sequence and emit just its
              first byte.  */
1513       src = src_base;
1514       consumed_chars = consumed_chars_base;
1515       ONE_MORE_BYTE (c);
1516       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1517       coding->errors++;
1518     }
1519
1520  no_more_source:
1521   coding->consumed_char += consumed_chars_base;
1522   coding->consumed = src_base - coding->source;
1523   coding->charbuf_used = charbuf - coding->charbuf;
1524 }
1525
1526
1527 static int
1528 encode_coding_utf_8 (struct coding_system *coding)
1529 {
1530   int multibytep = coding->dst_multibyte;
1531   int *charbuf = coding->charbuf;
1532   int *charbuf_end = charbuf + coding->charbuf_used;
1533   unsigned char *dst = coding->destination + coding->produced;
1534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1535   int produced_chars = 0;
1536   int c;
1537
1538   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1539     {
1540       ASSURE_DESTINATION (3);
1541       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1542       CODING_UTF_8_BOM (coding) = utf_without_bom;
1543     }
1544
1545   if (multibytep)
1546     {
1547       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1548
1549       while (charbuf < charbuf_end)
1550         {
1551           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1552
1553           ASSURE_DESTINATION (safe_room);
1554           c = *charbuf++;
1555           if (CHAR_BYTE8_P (c))
1556             {
1557               c = CHAR_TO_BYTE8 (c);
1558               EMIT_ONE_BYTE (c);
1559             }
1560           else
1561             {
1562               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1563               for (p = str; p < pend; p++)
1564                 EMIT_ONE_BYTE (*p);
1565             }
1566         }
1567     }
1568   else
1569     {
1570       int safe_room = MAX_MULTIBYTE_LENGTH;
1571
1572       while (charbuf < charbuf_end)
1573         {
1574           ASSURE_DESTINATION (safe_room);
1575           c = *charbuf++;
1576           if (CHAR_BYTE8_P (c))
1577             *dst++ = CHAR_TO_BYTE8 (c);
1578           else
1579             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1580           produced_chars++;
1581         }
1582     }
1583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1584   coding->produced_char += produced_chars;
1585   coding->produced = dst - coding->destination;
1586   return 0;
1587 }
1588
1589
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591    Check if a text is encoded in one of UTF-16 based coding systems.
1592    If it is, return 1, else return 0.  */
1593
/* Nonzero if the low 16 bits of CODE form a high (leading) surrogate,
   i.e. a value in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(code) \
  (0xD800 == ((code) & 0xFC00))

/* Nonzero if the low 16 bits of CODE form a low (trailing) surrogate,
   i.e. a value in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(code) \
  (0xDC00 == ((code) & 0xFC00))

/* Nonzero if CODE is 0xFFFE, 0xFFFF, or a low surrogate.  CODE is
   evaluated more than once.  */
#define UTF_16_INVALID_P(code)          \
  ((code) == 0xFFFE                     \
   || (code) == 0xFFFF                  \
   || UTF_16_LOW_SURROGATE_P (code))
1604
1605
1606 static int
1607 detect_coding_utf_16 (struct coding_system *coding,
1608                       struct coding_detection_info *detect_info)
1609 {
1610   const unsigned char *src = coding->source, *src_base = src;
1611   const unsigned char *src_end = coding->source + coding->src_bytes;
1612   int multibytep = coding->src_multibyte;
1613   int consumed_chars = 0;
1614   int c1, c2;
1615
1616   detect_info->checked |= CATEGORY_MASK_UTF_16;
1617   if (coding->mode & CODING_MODE_LAST_BLOCK
1618       && (coding->src_chars & 1))
1619     {
1620       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1621       return 0;
1622     }
1623
1624   TWO_MORE_BYTES (c1, c2);
1625   if ((c1 == 0xFF) && (c2 == 0xFE))
1626     {
1627       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1628                              | CATEGORY_MASK_UTF_16_AUTO);
1629       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1630                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1631                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1632     }
1633   else if ((c1 == 0xFE) && (c2 == 0xFF))
1634     {
1635       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1636                              | CATEGORY_MASK_UTF_16_AUTO);
1637       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1638                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1639                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1640     }
1641   else if (c2 < 0)
1642     {
1643       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1644       return 0;
1645     }
1646   else
1647     {
1648       /* We check the dispersion of Eth and Oth bytes where E is even and
1649          O is odd.  If both are high, we assume binary data.*/
1650       unsigned char e[256], o[256];
1651       unsigned e_num = 1, o_num = 1;
1652
1653       memset (e, 0, 256);
1654       memset (o, 0, 256);
1655       e[c1] = 1;
1656       o[c2] = 1;
1657
1658       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1659                                 |CATEGORY_MASK_UTF_16_BE
1660                                 | CATEGORY_MASK_UTF_16_LE);
1661
1662       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1663              != CATEGORY_MASK_UTF_16)
1664         {
1665           TWO_MORE_BYTES (c1, c2);
1666           if (c2 < 0)
1667             break;
1668           if (! e[c1])
1669             {
1670               e[c1] = 1;
1671               e_num++;
1672               if (e_num >= 128)
1673                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1674             }
1675           if (! o[c2])
1676             {
1677               o[c2] = 1;
1678               o_num++;
1679               if (o_num >= 128)
1680                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1681             }
1682         }
1683       return 0;
1684     }
1685
1686  no_more_source:
1687   return 1;
1688 }
1689
1690 static void
1691 decode_coding_utf_16 (struct coding_system *coding)
1692 {
1693   const unsigned char *src = coding->source + coding->consumed;
1694   const unsigned char *src_end = coding->source + coding->src_bytes;
1695   const unsigned char *src_base;
1696   int *charbuf = coding->charbuf + coding->charbuf_used;
1697   /* We may produces at most 3 chars in one loop.  */
1698   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1699   int consumed_chars = 0, consumed_chars_base = 0;
1700   int multibytep = coding->src_multibyte;
1701   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1702   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1703   int surrogate = CODING_UTF_16_SURROGATE (coding);
1704   Lisp_Object attr, charset_list;
1705   int eol_crlf =
1706     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1707   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1708
1709   CODING_GET_INFO (coding, attr, charset_list);
1710
1711   if (bom == utf_with_bom)
1712     {
1713       int c, c1, c2;
1714
1715       src_base = src;
1716       ONE_MORE_BYTE (c1);
1717       ONE_MORE_BYTE (c2);
1718       c = (c1 << 8) | c2;
1719
1720       if (endian == utf_16_big_endian
1721           ? c != 0xFEFF : c != 0xFFFE)
1722         {
1723           /* The first two bytes are not BOM.  Treat them as bytes
1724              for a normal character.  */
1725           src = src_base;
1726           coding->errors++;
1727         }
1728       CODING_UTF_16_BOM (coding) = utf_without_bom;
1729     }
1730   else if (bom == utf_detect_bom)
1731     {
1732       /* We have already tried to detect BOM and failed in
1733          detect_coding.  */
1734       CODING_UTF_16_BOM (coding) = utf_without_bom;
1735     }
1736
1737   while (1)
1738     {
1739       int c, c1, c2;
1740
1741       src_base = src;
1742       consumed_chars_base = consumed_chars;
1743
1744       if (charbuf >= charbuf_end)
1745         {
1746           if (byte_after_cr1 >= 0)
1747             src_base -= 2;
1748           break;
1749         }
1750
1751       if (byte_after_cr1 >= 0)
1752         c1 = byte_after_cr1, byte_after_cr1 = -1;
1753       else
1754         ONE_MORE_BYTE (c1);
1755       if (c1 < 0)
1756         {
1757           *charbuf++ = -c1;
1758           continue;
1759         }
1760       if (byte_after_cr2 >= 0)
1761         c2 = byte_after_cr2, byte_after_cr2 = -1;
1762       else
1763         ONE_MORE_BYTE (c2);
1764       if (c2 < 0)
1765         {
1766           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1767           *charbuf++ = -c2;
1768           continue;
1769         }
1770       c = (endian == utf_16_big_endian
1771            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1772
1773       if (surrogate)
1774         {
1775           if (! UTF_16_LOW_SURROGATE_P (c))
1776             {
1777               if (endian == utf_16_big_endian)
1778                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1779               else
1780                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1781               *charbuf++ = c1;
1782               *charbuf++ = c2;
1783               coding->errors++;
1784               if (UTF_16_HIGH_SURROGATE_P (c))
1785                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1786               else
1787                 *charbuf++ = c;
1788             }
1789           else
1790             {
1791               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1792               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1793               *charbuf++ = 0x10000 + c;
1794             }
1795         }
1796       else
1797         {
1798           if (UTF_16_HIGH_SURROGATE_P (c))
1799             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1800           else
1801             {
1802               if (eol_crlf && c == '\r')
1803                 {
1804                   ONE_MORE_BYTE (byte_after_cr1);
1805                   ONE_MORE_BYTE (byte_after_cr2);
1806                 }
1807               *charbuf++ = c;
1808             }
1809         }
1810     }
1811
1812  no_more_source:
1813   coding->consumed_char += consumed_chars_base;
1814   coding->consumed = src_base - coding->source;
1815   coding->charbuf_used = charbuf - coding->charbuf;
1816 }
1817
1818 static int
1819 encode_coding_utf_16 (struct coding_system *coding)
1820 {
1821   int multibytep = coding->dst_multibyte;
1822   int *charbuf = coding->charbuf;
1823   int *charbuf_end = charbuf + coding->charbuf_used;
1824   unsigned char *dst = coding->destination + coding->produced;
1825   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1826   int safe_room = 8;
1827   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1828   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1829   int produced_chars = 0;
1830   Lisp_Object attrs, charset_list;
1831   int c;
1832
1833   CODING_GET_INFO (coding, attrs, charset_list);
1834
1835   if (bom != utf_without_bom)
1836     {
1837       ASSURE_DESTINATION (safe_room);
1838       if (big_endian)
1839         EMIT_TWO_BYTES (0xFE, 0xFF);
1840       else
1841         EMIT_TWO_BYTES (0xFF, 0xFE);
1842       CODING_UTF_16_BOM (coding) = utf_without_bom;
1843     }
1844
1845   while (charbuf < charbuf_end)
1846     {
1847       ASSURE_DESTINATION (safe_room);
1848       c = *charbuf++;
1849       if (c > MAX_UNICODE_CHAR)
1850         c = coding->default_char;
1851
1852       if (c < 0x10000)
1853         {
1854           if (big_endian)
1855             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1856           else
1857             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1858         }
1859       else
1860         {
1861           int c1, c2;
1862
1863           c -= 0x10000;
1864           c1 = (c >> 10) + 0xD800;
1865           c2 = (c & 0x3FF) + 0xDC00;
1866           if (big_endian)
1867             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1868           else
1869             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1870         }
1871     }
1872   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1873   coding->produced = dst - coding->destination;
1874   coding->produced_char += produced_chars;
1875   return 0;
1876 }
1877
1878 \f
1879 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1880
1881 /* Emacs' internal format for representation of multiple character
1882    sets is a kind of multi-byte encoding, i.e. characters are
1883    represented by variable-length sequences of one-byte codes.
1884
1885    ASCII characters and control characters (e.g. `tab', `newline') are
1886    represented by one-byte sequences which are their ASCII codes, in
1887    the range 0x00 through 0x7F.
1888
1889    8-bit characters of the range 0x80..0x9F are represented by
1890    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1891    code + 0x20).
1892
1893    8-bit characters of the range 0xA0..0xFF are represented by
1894    one-byte sequences which are their 8-bit code.
1895
1896    The other characters are represented by a sequence of `base
1897    leading-code', optional `extended leading-code', and one or two
1898    `position-code's.  The length of the sequence is determined by the
1899    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1900    whereas extended leading-code and position-code take the range 0xA0
1901    through 0xFF.  See `charset.h' for more details about leading-code
1902    and position-code.
1903
1904    --- CODE RANGE of Emacs' internal format ---
1905    character set        range
1906    -------------        -----
1907    ascii                0x00..0x7F
1908    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1910    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1911    ---------------------------------------------
1912
1913    As this is the internal character representation, the format is
1914    usually not used externally (i.e. in a file or in a data sent to a
1915    process).  But, it is possible to have a text externally in this
1916    format (i.e. by encoding by the coding system `emacs-mule').
1917
1918    In that case, a sequence of one-byte codes has a slightly different
1919    form.
1920
1921    At first, all characters in eight-bit-control are represented by
1922    one-byte sequences which are their 8-bit code.
1923
1924    Next, character composition data are represented by the byte
1925    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1926    where,
1927         METHOD is 0xF2 plus one of composition method (enum
1928         composition_method),
1929
1930         BYTES is 0xA0 plus a byte length of this composition data,
1931
1932         CHARS is 0xA0 plus a number of characters composed by this
1933         data,
1934
        COMPONENTs are characters in multibyte form or composition
        rules encoded as two bytes of ASCII codes.
1937
1938    In addition, for backward compatibility, the following formats are
1939    also recognized as composition data on decoding.
1940
1941    0x80 MSEQ ...
1942    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1943
1944    Here,
        MSEQ is a multibyte form, but in one of these special formats:
1946           ASCII: 0xA0 ASCII_CODE+0x80,
1947           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1948         RULE is a one byte code of the range 0xA0..0xF0 that
1949         represents a composition rule.
1950   */
1951
1952 char emacs_mule_bytes[256];
1953
1954
1955 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1956    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1957    else return 0.  */
1958
1959 static int
1960 detect_coding_emacs_mule (struct coding_system *coding,
1961                           struct coding_detection_info *detect_info)
1962 {
1963   const unsigned char *src = coding->source, *src_base;
1964   const unsigned char *src_end = coding->source + coding->src_bytes;
1965   int multibytep = coding->src_multibyte;
1966   int consumed_chars = 0;
1967   int c;
1968   int found = 0;
1969
1970   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1971   /* A coding system of this category is always ASCII compatible.  */
1972   src += coding->head_ascii;
1973
1974   while (1)
1975     {
1976       src_base = src;
1977       ONE_MORE_BYTE (c);
1978       if (c < 0)
1979         continue;
1980       if (c == 0x80)
1981         {
1982           /* Perhaps the start of composite character.  We simply skip
1983              it because analyzing it is too heavy for detecting.  But,
1984              at least, we check that the composite character
1985              constitutes of more than 4 bytes.  */
1986           const unsigned char *src_base;
1987
1988         repeat:
1989           src_base = src;
1990           do
1991             {
1992               ONE_MORE_BYTE (c);
1993             }
1994           while (c >= 0xA0);
1995
1996           if (src - src_base <= 4)
1997             break;
1998           found = CATEGORY_MASK_EMACS_MULE;
1999           if (c == 0x80)
2000             goto repeat;
2001         }
2002
2003       if (c < 0x80)
2004         {
2005           if (c < 0x20
2006               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2007             break;
2008         }
2009       else
2010         {
2011           int more_bytes = emacs_mule_bytes[c] - 1;
2012
2013           while (more_bytes > 0)
2014             {
2015               ONE_MORE_BYTE (c);
2016               if (c < 0xA0)
2017                 {
2018                   src--;        /* Unread the last byte.  */
2019                   break;
2020                 }
2021               more_bytes--;
2022             }
2023           if (more_bytes != 0)
2024             break;
2025           found = CATEGORY_MASK_EMACS_MULE;
2026         }
2027     }
2028   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2029   return 0;
2030
2031  no_more_source:
2032   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2033     {
2034       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2035       return 0;
2036     }
2037   detect_info->found |= found;
2038   return 1;
2039 }
2040
2041
2042 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2043    character.  If CMP_STATUS indicates that we must expect MSEQ or
2044    RULE described above, decode it and return the negative value of
2045    the decoded character or rule.  If an invalid byte is found, return
2046    -1.  If SRC is too short, return -2.  */
2047
2048 int
2049 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2050                  int *nbytes, int *nchars, int *id,
2051                  struct composition_status *cmp_status)
2052 {
2053   const unsigned char *src_end = coding->source + coding->src_bytes;
2054   const unsigned char *src_base = src;
2055   int multibytep = coding->src_multibyte;
2056   struct charset *charset;
2057   unsigned code;
2058   int c;
2059   int consumed_chars = 0;
2060   int mseq_found = 0;
2061
2062   ONE_MORE_BYTE (c);
2063   if (c < 0)
2064     {
2065       c = -c;
2066       charset = emacs_mule_charset[0];
2067     }
2068   else
2069     {
2070       if (c >= 0xA0)
2071         {
2072           if (cmp_status->state != COMPOSING_NO
2073               && cmp_status->old_form)
2074             {
2075               if (cmp_status->state == COMPOSING_CHAR)
2076                 {
2077                   if (c == 0xA0)
2078                     {
2079                       ONE_MORE_BYTE (c);
2080                       c -= 0x80;
2081                       if (c < 0)
2082                         goto invalid_code;
2083                     }
2084                   else
2085                     c -= 0x20;
2086                   mseq_found = 1;
2087                 }
2088               else
2089                 {
2090                   *nbytes = src - src_base;
2091                   *nchars = consumed_chars;
2092                   return -c;
2093                 }
2094             }
2095           else
2096             goto invalid_code;
2097         }
2098
2099       switch (emacs_mule_bytes[c])
2100         {
2101         case 2:
2102           if (! (charset = emacs_mule_charset[c]))
2103             goto invalid_code;
2104           ONE_MORE_BYTE (c);
2105           if (c < 0xA0)
2106             goto invalid_code;
2107           code = c & 0x7F;
2108           break;
2109
2110         case 3:
2111           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2112               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2113             {
2114               ONE_MORE_BYTE (c);
2115               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2116                 goto invalid_code;
2117               ONE_MORE_BYTE (c);
2118               if (c < 0xA0)
2119                 goto invalid_code;
2120               code = c & 0x7F;
2121             }
2122           else
2123             {
2124               if (! (charset = emacs_mule_charset[c]))
2125                 goto invalid_code;
2126               ONE_MORE_BYTE (c);
2127               if (c < 0xA0)
2128                 goto invalid_code;
2129               code = (c & 0x7F) << 8;
2130               ONE_MORE_BYTE (c);
2131               if (c < 0xA0)
2132                 goto invalid_code;
2133               code |= c & 0x7F;
2134             }
2135           break;
2136
2137         case 4:
2138           ONE_MORE_BYTE (c);
2139           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2140             goto invalid_code;
2141           ONE_MORE_BYTE (c);
2142           if (c < 0xA0)
2143             goto invalid_code;
2144           code = (c & 0x7F) << 8;
2145           ONE_MORE_BYTE (c);
2146           if (c < 0xA0)
2147             goto invalid_code;
2148           code |= c & 0x7F;
2149           break;
2150
2151         case 1:
2152           code = c;
2153           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2154                                      ? charset_ascii : charset_eight_bit);
2155           break;
2156
2157         default:
2158           abort ();
2159         }
2160       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2161       if (c < 0)
2162         goto invalid_code;
2163     }
2164   *nbytes = src - src_base;
2165   *nchars = consumed_chars;
2166   if (id)
2167     *id = charset->id;
2168   return (mseq_found ? -c : c);
2169
2170  no_more_source:
2171   return -2;
2172
2173  invalid_code:
2174   return -1;
2175 }
2176
2177
2178 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2179
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2182
2183    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2184    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2185    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2186
   and these old forms:
2188
2189    (4) relative composition: 0x80 | MSEQ ... MSEQ
2190    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2191
2192    When the starter 0x80 and the following header elements are found,
2193    this annotation header is produced.
2194
2195         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2196
2197    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2198    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2199
2200    Then, upon reading the following elements, these codes are produced
2201    until the composition end is found:
2202
2203    (1) CHAR ... CHAR
2204    (2) ALT ... ALT CHAR ... CHAR
2205    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2206    (4) CHAR ... CHAR
2207    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2208
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2211
2212    (1) LENGTH: unchanged, NCHARS: unchanged
2213    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2214    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2215    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2216    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2217
2218    If an error is found while composing, the annotation header is
2219    changed to the original composition header (plus filler -1s) as
2220    below:
2221
2222    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2224
2225    and the sequence [ -2 DECODED-RULE ] is changed to the original
2226    byte sequence as below:
2227         o the original byte sequence is B: [ B -1 ]
2228         o the original byte sequence is B1 B2: [ B1 B2 ]
2229
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2234
/* Decode the Emacs 20 style composition rule byte held in C and
   store the result in RULE.  C is reduced by 0xA0 in place and must
   then lie in 0..80, else control jumps to `invalid_code'.  The
   quotient and remainder of that value by 9 give the two reference
   points, with the value 4 replaced by 10 in either one.  C and RULE
   must be lvalues.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    if (gref == 4)                                      \
      gref = 10;                                        \
    if (nref == 4)                                      \
      nref = 10;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2251
2252
/* Decode an Emacs 21 style composition rule and store the result in
   RULE.  The rule is made of two bytes, each offset by 0x20: the
   first is already in C, the second is read from SRC into C by
   ONE_MORE_BYTE.  Either value outside 0..80 makes control jump to
   `invalid_code'.  C and RULE must be lvalues.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (! (gref >= 0 && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (nref >= 0 && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2270
2271
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   that was just read (0xF2 plus the composition method).  The next
   two bytes at SRC are BYTES and CHARS, each offset by 0xA0, where
   BYTES is the byte length of this composition information and CHARS
   is the number of characters composed by this composition.

   Jump to `invalid_code' when BYTES - 0xA0 is less than 3 (or is not
   exactly 4 for a relative composition), or when CHARS - 0xA0 is not
   in 1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise initialize
   CMP_STATUS and pass the values to ADD_COMPOSITION_DATA.

   NOTE(review): `charbuf_base' has no visible use in this macro.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->method = method;                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2304
2305
/* Start of Emacs 20 style (old form) relative composition.  Reset
   CMP_STATUS for an old-form COMPOSITION_RELATIVE composition with no
   characters or components counted yet, and pass zero NCHARS and
   NBYTES to ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2317
2318
/* Start of Emacs 20 style (old form) rule-base composition.  Reset
   CMP_STATUS for an old-form COMPOSITION_WITH_RULE composition with
   no characters or components counted yet, and pass zero NCHARS and
   NBYTES to ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2330
2331
/* Read the byte that follows the composition starter into C and
   begin the matching kind of composition:

     a METHOD byte (0xF2 plus a composition method)
                       -> Emacs 21 style composition
     0xA0..0xBF        -> Emacs 20 style relative composition; SRC is
                          moved back so that the byte is read again as
                          the first component
     0xFF              -> Emacs 20 style rule-base composition

   Any other byte, or a negative value from ONE_MORE_BYTE, makes
   control jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *src_before_c = src;            \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = src_before_c;                             \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2355 /* Close the composition being decoded.  IDX is the negative offset
        that steps back from CHARBUF over the CMP_STATUS->length elements
        stored so far.  For an old-form composition, store the number of
        characters in CHARBUF[IDX + 2].  Otherwise, if the method is
        beyond COMPOSITION_RELATIVE, set CHARBUF[IDX] to CHARBUF[IDX + 2]
        minus the length.  In both cases reset the state to COMPOSING_NO.
        The expansion refers to `cmp_status' and `charbuf'.  */
2356 #define EMACS_MULE_COMPOSITION_END()                            \
2357   do {                                                          \
2358     int idx = - cmp_status->length;                             \
2359                                                                 \
2360     if (cmp_status->old_form)                                   \
2361       charbuf[idx + 2] = cmp_status->nchars;                    \
2362     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2363       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2364     cmp_status->state = COMPOSING_NO;                           \
2365   } while (0)
2366
2367 /* Terminate a composition that could not be completed normally.
        CHARBUF points just past the CMP_STATUS->length elements that
        have been stored for the composition.

        If the composition is old-form and already holds at least one
        character, record the character count in the element at
        CHARBUF[IDX + 2]; in addition, when a rule-base composition is
        waiting for a character, turn the trailing rule back into a raw
        byte character.  Otherwise overwrite the elements at the start
        of the composition with raw byte characters, beginning with
        0x80.

        Reset CMP_STATUS->state to COMPOSING_NO and return a character
        count; the caller (EMACS_MULE_MAYBE_FINISH_COMPOSITION) adds it
        to its character offset.  */
2368 static int
2369 emacs_mule_finish_composition (int *charbuf,
2370                                struct composition_status *cmp_status)
2371 {
2372   int idx = - cmp_status->length;
2373   int new_chars;
2374
2375   if (cmp_status->old_form && cmp_status->nchars > 0)
2376     {
2377       charbuf[idx + 2] = cmp_status->nchars;
2378       new_chars = 0;
2379       if (cmp_status->method == COMPOSITION_WITH_RULE
2380           && cmp_status->state == COMPOSING_CHAR)
2381         {
2382           /* The last rule was invalid.  */
           /* The decoder stores a rule as the pair (-2, RULE), so the
              two elements just before CHARBUF are rewritten here as a
              raw byte character followed by -1.  */
2383           int rule = charbuf[-1] + 0xA0;
2384
2385           charbuf[-2] = BYTE8_TO_CHAR (rule);
2386           charbuf[-1] = -1;
2387           new_chars = 1;
2388         }
2389     }
2390   else
2391     {
       /* 0x80 is the byte on which the decoder starts a composition.  */
2392       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2393
2394       if (cmp_status->method == COMPOSITION_WITH_RULE)
2395         {
           /* 0xFF is the byte that selects Emacs 20 rule-base
              composition.  */
2396           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2397           charbuf[idx++] = -3;
2398           charbuf[idx++] = 0;
2399           new_chars = 1;
2400         }
2401       else
2402         {
           /* Read both values before the stores below overwrite the
              elements they come from.  Adding 0xA0 presumably restores
              the original byte values -- TODO confirm against the
              Emacs 21 composition decoder.  */
2403           int nchars = charbuf[idx + 1] + 0xA0;
2404           int nbytes = charbuf[idx + 2] + 0xA0;
2405
2406           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2407           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2408           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2409           charbuf[idx++] = -1;
2410           new_chars = 4;
2411         }
2412     }
2413   cmp_status->state = COMPOSING_NO;
2414   return new_chars;
2415 }
2416 /* If a composition is in progress (state is not COMPOSING_NO), end
        it with emacs_mule_finish_composition and add the returned count
        to `char_offset'.  The expansion refers to `cmp_status',
        `charbuf' and `char_offset'.  */
2417 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2418   do {                                                                    \
2419     if (cmp_status->state != COMPOSING_NO)                                \
2420       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2421   } while (0)
2422
2423 /* Decode the emacs-mule encoded bytes of CODING->source, starting at
        offset CODING->consumed, and append the result to
        CODING->charbuf.  Decoded characters are stored as integers;
        charset changes and compositions are recorded in the same buffer
        as annotations.  A composition still open when the loop ends is
        terminated if CODING_MODE_LAST_BLOCK is set; otherwise its
        elements are saved in CMP_STATUS->carryover and restored at the
        start of the next call.  A byte sequence that cannot be decoded
        is emitted one byte at a time and counted in CODING->errors.
        On return CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated.  */
2424 static void
2425 decode_coding_emacs_mule (struct coding_system *coding)
2426 {
2427   const unsigned char *src = coding->source + coding->consumed;
2428   const unsigned char *src_end = coding->source + coding->src_bytes;
2429   const unsigned char *src_base;
2430   int *charbuf = coding->charbuf + coding->charbuf_used;
2431   /* We may produce two annotations (charset and composition) in one
2432      loop and one more charset annotation at the end.  */
2433   int *charbuf_end
2434     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2435   int consumed_chars = 0, consumed_chars_base;
2436   int multibytep = coding->src_multibyte;
2437   Lisp_Object attrs, charset_list;
2438   int char_offset = coding->produced_char;
2439   int last_offset = char_offset;
2440   int last_id = charset_ascii;
2441   int eol_crlf =
2442     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
     /* Byte read ahead after a CR when EOL_CRLF is set, or -1 if there
        is none pending.  */
2443   int byte_after_cr = -1;
2444   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2445
2446   CODING_GET_INFO (coding, attrs, charset_list);
2447
     /* A composition was left open by the previous call: put its saved
        elements back at the head of the output.  */
2448   if (cmp_status->state != COMPOSING_NO)
2449     {
2450       int i;
2451
2452       for (i = 0; i < cmp_status->length; i++)
2453         *charbuf++ = cmp_status->carryover[i];
2454       coding->annotated = 1;
2455     }
2456
2457   while (1)
2458     {
2459       int c, id;
2460
       /* Remember where this iteration starts so that the position can
          be restored on error and reported as consumed at the end.  */
2461       src_base = src;
2462       consumed_chars_base = consumed_chars;
2463
2464       if (charbuf >= charbuf_end)
2465         {
           /* The byte read ahead after CR has not been processed yet,
              so do not count it as consumed.  */
2466           if (byte_after_cr >= 0)
2467             src_base--;
2468           break;
2469         }
2470
2471       if (byte_after_cr >= 0)
2472         c = byte_after_cr, byte_after_cr = -1;
2473       else
2474         ONE_MORE_BYTE (c);
2475
       /* Either a negative value from ONE_MORE_BYTE (stored negated as
          a character) or the 0x80 byte that starts a composition.  In
          both cases any composition in progress is finished first.  */
2476       if (c < 0 || c == 0x80)
2477         {
2478           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2479           if (c < 0)
2480             {
2481               *charbuf++ = -c;
2482               char_offset++;
2483             }
2484           else
2485             DECODE_EMACS_MULE_COMPOSITION_START ();
2486           continue;
2487         }
2488
2489       if (c < 0x80)
2490         {
2491           if (eol_crlf && c == '\r')
2492             ONE_MORE_BYTE (byte_after_cr);
2493           id = charset_ascii;
2494           if (cmp_status->state != COMPOSING_NO)
2495             {
2496               if (cmp_status->old_form)
2497                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2498               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2499                 cmp_status->ncomps--;
2500             }
2501         }
2502       else
2503         {
2504           int nchars, nbytes;
2505           /* emacs_mule_char can load a charset map from a file, which
2506              allocates a large structure and might cause buffer text
2507              to be relocated as result.  Thus, we need to remember the
2508              original pointer to buffer text, and fixup all related
2509              pointers after the call.  */
2510           const unsigned char *orig = coding->source;
2511           EMACS_INT offset;
2512
2513           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2514                                cmp_status);
2515           offset = coding->source - orig;
2516           if (offset)
2517             {
2518               src += offset;
2519               src_base += offset;
2520               src_end += offset;
2521             }
           /* Special negative results: -1 is handled as an invalid
              code and -2 ends the loop.  Other negative values fall
              through and are handled as old-style composition data
              below.  */
2522           if (c < 0)
2523             {
2524               if (c == -1)
2525                 goto invalid_code;
2526               if (c == -2)
2527                 break;
2528             }
2529           src = src_base + nbytes;
2530           consumed_chars = consumed_chars_base + nchars;
2531           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2532             cmp_status->ncomps -= nchars;
2533         }
2534
2535       /* Now if C >= 0, we found a normally encoded character, if C <
2536          0, we found an old-style composition component character or
2537          rule.  */
2538
2539       if (cmp_status->state == COMPOSING_NO)
2540         {
           /* When the charset changes, close the run of the previous
              non-ASCII charset with a charset annotation.  */
2541           if (last_id != id)
2542             {
2543               if (last_id != charset_ascii)
2544                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2545                                   last_id);
2546               last_id = id;
2547               last_offset = char_offset;
2548             }
2549           *charbuf++ = c;
2550           char_offset++;
2551         }
2552       else if (cmp_status->state == COMPOSING_CHAR)
2553         {
2554           if (cmp_status->old_form)
2555             {
               /* In the old form a normally encoded character ends the
                  composition; a negative C is a component.  */
2556               if (c >= 0)
2557                 {
2558                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2559                   *charbuf++ = c;
2560                   char_offset++;
2561                 }
2562               else
2563                 {
2564                   *charbuf++ = -c;
2565                   cmp_status->nchars++;
2566                   cmp_status->length++;
2567                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2568                     EMACS_MULE_COMPOSITION_END ();
2569                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2570                     cmp_status->state = COMPOSING_RULE;
2571                 }
2572             }
2573           else
2574             {
               /* In the new form NCHARS counts down the characters
                  still expected.  */
2575               *charbuf++ = c;
2576               cmp_status->length++;
2577               cmp_status->nchars--;
2578               if (cmp_status->nchars == 0)
2579                 EMACS_MULE_COMPOSITION_END ();
2580             }
2581         }
2582       else if (cmp_status->state == COMPOSING_RULE)
2583         {
2584           int rule;
2585
2586           if (c >= 0)
2587             {
2588               EMACS_MULE_COMPOSITION_END ();
2589               *charbuf++ = c;
2590               char_offset++;
2591             }
2592           else
2593             {
               /* A rule is stored as the pair (-2, RULE).  */
2594               c = -c;
2595               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2596               if (rule < 0)
2597                 goto invalid_code;
2598               *charbuf++ = -2;
2599               *charbuf++ = rule;
2600               cmp_status->length += 2;
2601               cmp_status->state = COMPOSING_CHAR;
2602             }
2603         }
2604       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2605         {
2606           *charbuf++ = c;
2607           cmp_status->length++;
2608           if (cmp_status->ncomps == 0)
2609             cmp_status->state = COMPOSING_CHAR;
2610           else if (cmp_status->ncomps > 0)
2611             {
2612               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2613                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2614             }
2615           else
2616             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2617         }
2618       else                      /* COMPOSING_COMPONENT_RULE */
2619         {
2620           int rule;
2621
2622           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2623           if (rule < 0)
2624             goto invalid_code;
2625           *charbuf++ = -2;
2626           *charbuf++ = rule;
2627           cmp_status->length += 2;
2628           cmp_status->ncomps--;
2629           if (cmp_status->ncomps > 0)
2630             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2631           else
2632             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2633         }
2634       continue;
2635
       /* NOTE(review): no `goto retry' is visible in this function;
          the label is presumably reached from a macro defined
          elsewhere, or is unused -- confirm.  */
2636     retry:
2637       src = src_base;
2638       consumed_chars = consumed_chars_base;
2639       continue;
2640
       /* Give up on the current sequence: finish any composition,
          rewind to the start of this iteration and emit a single byte
          as is, or as a raw byte character if it is not ASCII.  */
2641     invalid_code:
2642       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2643       src = src_base;
2644       consumed_chars = consumed_chars_base;
2645       ONE_MORE_BYTE (c);
2646       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2647       char_offset++;
2648       coding->errors++;
2649     }
2650
     /* NOTE(review): this label is presumably the target of
        ONE_MORE_BYTE when the source is exhausted; it is also reached
        by the `break' statements above -- confirm against the macro.  */
2651  no_more_source:
2652   if (cmp_status->state != COMPOSING_NO)
2653     {
2654       if (coding->mode & CODING_MODE_LAST_BLOCK)
2655         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2656       else
2657         {
2658           int i;
2659
           /* Take the unfinished composition out of the output and
              save it for the next call.  */
2660           charbuf -= cmp_status->length;
2661           for (i = 0; i < cmp_status->length; i++)
2662             cmp_status->carryover[i] = charbuf[i];
2663         }
2664     }
2665   if (last_id != charset_ascii)
2666     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
     /* Report only what was consumed up to the start of the last
        iteration.  */
2667   coding->consumed_char += consumed_chars_base;
2668   coding->consumed = src_base - coding->source;
2669   coding->charbuf_used = charbuf - coding->charbuf;
2670 }
2671
2672 /* Store in the two-element array CODES the leading code(s) for the
        emacs-mule charset ID.  If ID is below 0xA0, CODES[0] is ID
        itself and CODES[1] is 0, which the encoder takes to mean that
        there is no second leading code.  Otherwise CODES[0] is 0x9A,
        0x9B, 0x9C or 0x9D, chosen by the range ID falls in, and
        CODES[1] is ID.
        NOTE(review): the definition ends in `while (0);', so an
        invocation written with its own `;' expands to an extra empty
        statement.  That is harmless as a plain statement but would
        break use as the body of an `if' that has an `else'.  ID and
        CODES are also not parenthesized in the expansion.  */
2673 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2674   do {                                          \
2675     if (id < 0xA0)                              \
2676       codes[0] = id, codes[1] = 0;              \
2677     else if (id < 0xE0)                         \
2678       codes[0] = 0x9A, codes[1] = id;           \
2679     else if (id < 0xF0)                         \
2680       codes[0] = 0x9B, codes[1] = id;           \
2681     else if (id < 0xF5)                         \
2682       codes[0] = 0x9C, codes[1] = id;           \
2683     else                                        \
2684       codes[0] = 0x9D, codes[1] = id;           \
2685   } while (0);
2686
2687 /* Encode the elements of CODING->charbuf into emacs-mule format,
        writing at CODING->destination + CODING->produced.  An ASCII
        character or a raw byte character is emitted as a single byte.
        Any other character is emitted as one or two leading codes
        (see EMACS_MULE_LEADING_CODES) followed by one byte for a
        charset of dimension 1, or two bytes otherwise, with the high
        bit set in each.  A negative element starts an annotation; a
        charset annotation selects the charset tried first for the
        characters that follow, and a composition annotation is
        skipped.  When the whole of charbuf has been processed, record
        CODING_RESULT_SUCCESS, update CODING->produced_char and
        CODING->produced, and return 0.  */
2688 static int
2689 encode_coding_emacs_mule (struct coding_system *coding)
2690 {
2691   int multibytep = coding->dst_multibyte;
2692   int *charbuf = coding->charbuf;
2693   int *charbuf_end = charbuf + coding->charbuf_used;
2694   unsigned char *dst = coding->destination + coding->produced;
2695   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2696   int safe_room = 8;
2697   int produced_chars = 0;
2698   Lisp_Object attrs, charset_list;
2699   int c;
     /* Charset ID taken from the most recent charset annotation, or -1
        if there is none that belongs to CHARSET_LIST.  */
2700   int preferred_charset_id = -1;
2701
2702   CODING_GET_INFO (coding, attrs, charset_list);
     /* Make the coding system use Vemacs_mule_charset_list.  */
2703   if (! EQ (charset_list, Vemacs_mule_charset_list))
2704     {
2705       CODING_ATTR_CHARSET_LIST (attrs)
2706         = charset_list = Vemacs_mule_charset_list;
2707     }
2708
2709   while (charbuf < charbuf_end)
2710     {
2711       ASSURE_DESTINATION (safe_room);
2712       c = *charbuf++;
2713
2714       if (c < 0)
2715         {
2716           /* Handle an annotation.  */
           /* C is the negated length of the annotation, and CHARBUF
              now points at the element after it.  */
2717           switch (*charbuf)
2718             {
2719             case CODING_ANNOTATE_COMPOSITION_MASK:
2720               /* Not yet implemented.  */
2721               break;
2722             case CODING_ANNOTATE_CHARSET_MASK:
               /* NOTE(review): the index is relative to CHARBUF after
                  it was advanced past the length element; verify that
                  3 matches the layout written by ADD_CHARSET_DATA.  */
2723               preferred_charset_id = charbuf[3];
2724               if (preferred_charset_id >= 0
2725                   && NILP (Fmemq (make_number (preferred_charset_id),
2726                                   charset_list)))
2727                 preferred_charset_id = -1;
2728               break;
2729             default:
2730               abort ();
2731             }
           /* Skip the remaining elements of the annotation.  */
2732           charbuf += -c - 1;
2733           continue;
2734         }
2735
2736       if (ASCII_CHAR_P (c))
2737         EMIT_ONE_ASCII_BYTE (c);
2738       else if (CHAR_BYTE8_P (c))
2739         {
2740           c = CHAR_TO_BYTE8 (c);
2741           EMIT_ONE_BYTE (c);
2742         }
2743       else
2744         {
2745           struct charset *charset;
2746           unsigned code;
2747           int dimension;
2748           int emacs_mule_id;
2749           unsigned char leading_codes[2];
2750
           /* Try the preferred charset first; fall back to searching
              CHARSET_LIST if it does not contain C.  */
2751           if (preferred_charset_id >= 0)
2752             {
2753               charset = CHARSET_FROM_ID (preferred_charset_id);
2754               if (CHAR_CHARSET_P (c, charset))
2755                 code = ENCODE_CHAR (charset, c);
2756               else
2757                 charset = char_charset (c, charset_list, &code);
2758             }
2759           else
2760             charset = char_charset (c, charset_list, &code);
           /* No charset in the list contains C: substitute the coding
              system's default character.  */
2761           if (! charset)
2762             {
2763               c = coding->default_char;
2764               if (ASCII_CHAR_P (c))
2765                 {
2766                   EMIT_ONE_ASCII_BYTE (c);
2767                   continue;
2768                 }
               /* NOTE(review): the result of this second lookup is
                  used below without a null check -- confirm that the
                  default character always belongs to CHARSET_LIST.  */
2769               charset = char_charset (c, charset_list, &code);
2770             }
2771           dimension = CHARSET_DIMENSION (charset);
2772           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2773           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2774           EMIT_ONE_BYTE (leading_codes[0]);
           /* A zero here means there is only one leading code.  */
2775           if (leading_codes[1])
2776             EMIT_ONE_BYTE (leading_codes[1]);
2777           if (dimension == 1)
2778             EMIT_ONE_BYTE (code | 0x80);
2779           else
2780             {
2781               code |= 0x8080;
2782               EMIT_ONE_BYTE (code >> 8);
2783               EMIT_ONE_BYTE (code & 0xFF);
2784             }
2785         }
2786     }
2787   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2788   coding->produced_char += produced_chars;
2789   coding->produced = dst - coding->destination;
2790   return 0;
2791 }
2792
2793 \f
2794 /*** 7. ISO2022 handlers ***/
2795
2796 /* The following note describes the coding system ISO2022 briefly.
2797    Since the intention of this note is to help understand the
2798    functions in this file, some parts are NOT ACCURATE or are OVERLY
2799    SIMPLIFIED.  For thorough understanding, please refer to the
2800    original document of ISO2022.  This is equivalent to the standard
2801    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2802
2803    ISO2022 provides many mechanisms to encode several character sets
2804    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2805    is encoded using bytes less than 128.  This may make the encoded
2806    text a little bit longer, but the text passes more easily through
2807    several types of gateway, some of which strip off the MSB (Most
2808    Significant Bit).
2809
2810    There are two kinds of character sets: control character sets and
2811    graphic character sets.  The former contain control characters such
2812    as `newline' and `escape' to provide control functions (control
2813    functions are also provided by escape sequences).  The latter
2814    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2815    two control character sets and many graphic character sets.
2816
2817    Graphic character sets are classified into one of the following
2818    four classes, according to the number of bytes (DIMENSION) and
2819    number of characters in one dimension (CHARS) of the set:
2820    - DIMENSION1_CHARS94
2821    - DIMENSION1_CHARS96
2822    - DIMENSION2_CHARS94
2823    - DIMENSION2_CHARS96
2824
2825    In addition, each character set is assigned an identification tag,
2826    unique for each set, called the "final character" (denoted as <F>
2827    hereafter).  The <F> of each character set is decided by ECMA(*)
2828    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2829    (0x30..0x3F are for private use only).
2830
2831    Note (*): ECMA = European Computer Manufacturers Association
2832
2833    Here are examples of graphic character sets [NAME(<F>)]:
2834         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2835         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2836         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2837         o DIMENSION2_CHARS96 -- none for the moment
2838
2839    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2840         C0 [0x00..0x1F] -- control character plane 0
2841         GL [0x20..0x7F] -- graphic character plane 0
2842         C1 [0x80..0x9F] -- control character plane 1
2843         GR [0xA0..0xFF] -- graphic character plane 1
2844
2845    A control character set is directly designated and invoked to C0 or
2846    C1 by an escape sequence.  The most common case is that:
2847    - ISO646's  control character set is designated/invoked to C0, and
2848    - ISO6429's control character set is designated/invoked to C1,
2849    and usually these designations/invocations are omitted in encoded
2850    text.  In a 7-bit environment, only C0 can be used, and a control
2851    character for C1 is encoded by an appropriate escape sequence to
2852    fit into the environment.  All control characters for C1 are
2853    defined to have corresponding escape sequences.
2854
2855    A graphic character set is at first designated to one of four
2856    graphic registers (G0 through G3), then these graphic registers are
2857    invoked to GL or GR.  These designations and invocations can be
2858    done independently.  The most common case is that G0 is invoked to
2859    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2860    these invocations and designations are omitted in encoded text.
2861    In a 7-bit environment, only GL can be used.
2862
2863    When a graphic character set of CHARS94 is invoked to GL, codes
2864    0x20 and 0x7F of the GL area work as control characters SPACE and
2865    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2866    be used.
2867
2868    There are two ways of invocation: locking-shift and single-shift.
2869    With locking-shift, the invocation lasts until the next different
2870    invocation, whereas with single-shift, the invocation affects the
2871    following character only and doesn't affect the locking-shift
2872    state.  Invocations are done by the following control characters or
2873    escape sequences:
2874
2875    ----------------------------------------------------------------------
2876    abbrev  function                  cntrl escape seq   description
2877    ----------------------------------------------------------------------
2878    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2879    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2880    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2881    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2882    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2883    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2884    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2885    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2886    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2887    ----------------------------------------------------------------------
2888    (*) These are not used by any known coding system.
2889
2890    Control characters for these functions are defined by macros
2891    ISO_CODE_XXX in `coding.h'.
2892
2893    Designations are done by the following escape sequences:
2894    ----------------------------------------------------------------------
2895    escape sequence      description
2896    ----------------------------------------------------------------------
2897    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2898    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2899    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2900    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2901    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2902    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2903    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2904    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2905    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2906    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2907    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2908    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2909    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2910    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2911    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2912    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2913    ----------------------------------------------------------------------
2914
2915    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2916    of dimension 1, chars 94, and final character <F>, etc...
2917
2918    Note (*): Although these designations are not allowed in ISO2022,
2919    Emacs accepts them on decoding, and produces them on encoding
2920    CHARS96 character sets in a coding system which is characterized as
2921    7-bit environment, non-locking-shift, and non-single-shift.
2922
2923    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2924    '(' must be omitted.  We refer to this as "short-form" hereafter.
2925
2926    Now you may notice that there are a lot of ways of encoding the
2927    same multilingual text in ISO2022.  Actually, there exist many
2928    coding systems such as Compound Text (used in X11's inter-client
2929    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2930    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2931    localized platforms), and all of these are variants of ISO2022.
2932
2933    In addition to the above, Emacs handles two more kinds of escape
2934    sequences: ISO6429's direction specification and Emacs' private
2935    sequence for specifying character composition.
2936
2937    ISO6429's direction specification takes the following form:
2938         o CSI ']'      -- end of the current direction
2939         o CSI '0' ']'  -- end of the current direction
2940         o CSI '1' ']'  -- start of left-to-right text
2941         o CSI '2' ']'  -- start of right-to-left text
2942    The control character CSI (0x9B: control sequence introducer) is
2943    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2944
2945    Character composition specification takes the following form:
2946         o ESC '0' -- start relative composition
2947         o ESC '1' -- end composition
2948         o ESC '2' -- start rule-base composition (*)
2949         o ESC '3' -- start relative composition with alternate chars  (**)
2950         o ESC '4' -- start rule-base composition with alternate chars  (**)
2951   Since these are not standard escape sequences of any ISO standard,
2952   the use of them with these meanings is restricted to Emacs only.
2953
2954   (*) This form is used only in Emacs 20.7 and older versions,
2955   but newer versions can safely decode it.
2956   (**) This form is used only in Emacs 21.1 and newer versions,
2957   and older versions can't decode it.
2958
2959   Here's a list of example usages of these composition escape
2960   sequences (categorized by `enum composition_method').
2961
2962   COMPOSITION_RELATIVE:
2963         ESC 0 CHAR [ CHAR ] ESC 1
2964   COMPOSITION_WITH_RULE:
2965         ESC 2 CHAR [ RULE CHAR ] ESC 1
2966   COMPOSITION_WITH_ALTCHARS:
2967         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2968   COMPOSITION_WITH_RULE_ALTCHARS:
2969         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2970 /* Table of 256 iso_code_class_type values, presumably indexed by byte
        value (TODO confirm); it is only defined here, not initialized.  */
2971 enum iso_code_class_type iso_code_class[256];
2972 /* Nonzero if charset ID does not exceed CODING's max_charset_id and
        its byte in CODING's safe_charsets table is not 255, the value
        setup_iso_safe_charsets leaves in entries it does not assign.
        Only the upper bound of ID is checked, so ID must not be
        negative.  */
2973 #define SAFE_CHARSET_P(coding, id)      \
2974   ((id) <= (coding)->max_charset_id     \
2975    && (coding)->safe_charsets[id] != 255)
2976
2977 /* Nonzero if CODING_ISO_INITIAL, applied with index 1 to the coding
        system stored in coding_categories[CATEGORY], is non-negative.
        NOTE(review): presumably index 1 is graphic register G1, the one
        that shift-out invokes -- confirm.  */
2978 #define SHIFT_OUT_OK(category)  \
2979   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2980 /* Compute the safe-charsets string for the coding system attributes
        ATTRS and store it in the coding_attr_safe_charsets slot.

        If the ISO flags of ATTRS include CODING_ISO_FLAG_FULL_SUPPORT
        and the charset list is not Viso_2022_charset_list, switch to
        that list and discard any stored string.  If a string is then
        already stored, do nothing more.

        The string has one byte per charset ID, from 0 up to the largest
        ID in the charset list, initialized to 255.  For each charset in
        the list the byte is set to the register given for it in the
        coding_attr_iso_request alist; failing that, to the default
        register for 96-character or other charsets (the car and cdr of
        coding_attr_iso_usage), provided that register is below 4.  */
2981 static void
2982 setup_iso_safe_charsets (Lisp_Object attrs)
2983 {
2984   Lisp_Object charset_list, safe_charsets;
2985   Lisp_Object request;
2986   Lisp_Object reg_usage;
2987   Lisp_Object tail;
2988   int reg94, reg96;
2989   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2990   int max_charset_id;
2991
2992   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2993   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2994       && ! EQ (charset_list, Viso_2022_charset_list))
2995     {
2996       CODING_ATTR_CHARSET_LIST (attrs)
2997         = charset_list = Viso_2022_charset_list;
       /* Force the string to be rebuilt for the new list.  */
2998       ASET (attrs, coding_attr_safe_charsets, Qnil);
2999     }
3000
     /* Already computed.  */
3001   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3002     return;
3003
     /* The largest charset ID determines the length of the string.  */
3004   max_charset_id = 0;
3005   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3006     {
3007       int id = XINT (XCAR (tail));
3008       if (max_charset_id < id)
3009         max_charset_id = id;
3010     }
3011
3012   safe_charsets = make_uninit_string (max_charset_id + 1);
3013   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3014   request = AREF (attrs, coding_attr_iso_request);
3015   reg_usage = AREF (attrs, coding_attr_iso_usage);
3016   reg94 = XINT (XCAR (reg_usage));
3017   reg96 = XINT (XCDR (reg_usage));
3018
3019   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3020     {
3021       Lisp_Object id;
3022       Lisp_Object reg;
3023       struct charset *charset;
3024
3025       id = XCAR (tail);
3026       charset = CHARSET_FROM_ID (XINT (id));
       /* An explicit request for this charset takes precedence over
          the defaults.  */
3027       reg = Fcdr (Fassq (id, request));
3028       if (! NILP (reg))
3029         SSET (safe_charsets, XINT (id), XINT (reg));
3030       else if (charset->iso_chars_96)
3031         {
3032           if (reg96 < 4)
3033             SSET (safe_charsets, XINT (id), reg96);
3034         }
3035       else
3036         {
3037           if (reg94 < 4)
3038             SSET (safe_charsets, XINT (id), reg94);
3039         }
3040     }
3041   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3042 }
3043
3044
3045 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3046    Check if a text is encoded in one of the ISO-2022 based coding systems.
3047    If it is, return 1, else return 0.  */
3048
3049 static int
3050 detect_coding_iso_2022 (struct coding_system *coding,
3051                         struct coding_detection_info *detect_info)
3052 {
3053   const unsigned char *src = coding->source, *src_base = src;
3054   const unsigned char *src_end = coding->source + coding->src_bytes;
3055   int multibytep = coding->src_multibyte;
3056   int single_shifting = 0;
3057   int id;
3058   int c, c1;
3059   int consumed_chars = 0;
3060   int i;
3061   int rejected = 0;
3062   int found = 0;
3063   int composition_count = -1;
3064
3065   detect_info->checked |= CATEGORY_MASK_ISO;
3066
3067   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3068     {
3069       struct coding_system *this = &(coding_categories[i]);
3070       Lisp_Object attrs, val;
3071
3072       if (this->id < 0)
3073         continue;
3074       attrs = CODING_ID_ATTRS (this->id);
3075       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3076           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3077         setup_iso_safe_charsets (attrs);
3078       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3079       this->max_charset_id = SCHARS (val) - 1;
3080       this->safe_charsets = SDATA (val);
3081     }
3082
3083   /* A coding system of this category is always ASCII compatible.  */
3084   src += coding->head_ascii;
3085
3086   while (rejected != CATEGORY_MASK_ISO)
3087     {
3088       src_base = src;
3089       ONE_MORE_BYTE (c);
3090       switch (c)
3091         {
3092         case ISO_CODE_ESC:
3093           if (inhibit_iso_escape_detection)
3094             break;
3095           single_shifting = 0;
3096           ONE_MORE_BYTE (c);
3097           if (c >= '(' && c <= '/')
3098             {
3099               /* Designation sequence for a charset of dimension 1.  */
3100               ONE_MORE_BYTE (c1);
3101               if (c1 < ' ' || c1 >= 0x80
3102                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3103                 /* Invalid designation sequence.  Just ignore.  */
3104                 break;
3105             }
3106           else if (c == '$')
3107             {
3108               /* Designation sequence for a charset of dimension 2.  */
3109               ONE_MORE_BYTE (c);
3110               if (c >= '@' && c <= 'B')
3111                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3112                 id = iso_charset_table[1][0][c];
3113               else if (c >= '(' && c <= '/')
3114                 {
3115                   ONE_MORE_BYTE (c1);
3116                   if (c1 < ' ' || c1 >= 0x80
3117                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3118                     /* Invalid designation sequence.  Just ignore.  */
3119                     break;
3120                 }
3121               else
3122                 /* Invalid designation sequence.  Just ignore it.  */
3123                 break;
3124             }
3125           else if (c == 'N' || c == 'O')
3126             {
3127               /* ESC <Fe> for SS2 or SS3.  */
3128               single_shifting = 1;
3129               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3130               break;
3131             }
3132           else if (c == '1')
3133             {
3134               /* End of composition.  */
3135               if (composition_count < 0
3136                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3137                 /* Invalid */
3138                 break;
3139               composition_count = -1;
3140               found |= CATEGORY_MASK_ISO;
3141             }
3142           else if (c >= '0' && c <= '4')
3143             {
3144               /* ESC <Fp> for start/end composition.  */
3145               composition_count = 0;
3146               break;
3147             }
3148           else
3149             {
3150               /* Invalid escape sequence.  Just ignore it.  */
3151               break;
3152             }
3153
3154           /* We found a valid designation sequence for CHARSET.  */
3155           rejected |= CATEGORY_MASK_ISO_8BIT;
3156           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3157                               id))
3158             found |= CATEGORY_MASK_ISO_7;
3159           else
3160             rejected |= CATEGORY_MASK_ISO_7;
3161           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3162                               id))
3163             found |= CATEGORY_MASK_ISO_7_TIGHT;
3164           else
3165             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3166           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3167                               id))
3168             found |= CATEGORY_MASK_ISO_7_ELSE;
3169           else
3170             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3171           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3172                               id))
3173             found |= CATEGORY_MASK_ISO_8_ELSE;
3174           else
3175             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3176           break;
3177
3178         case ISO_CODE_SO:
3179         case ISO_CODE_SI:
3180           /* Locking shift out/in.  */
3181           if (inhibit_iso_escape_detection)
3182             break;
3183           single_shifting = 0;
3184           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3185           break;
3186
3187         case ISO_CODE_CSI:
3188           /* Control sequence introducer.  */
3189           single_shifting = 0;
3190           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3191           found |= CATEGORY_MASK_ISO_8_ELSE;
3192           goto check_extra_latin;
3193
3194         case ISO_CODE_SS2:
3195         case ISO_CODE_SS3:
3196           /* Single shift.   */
3197           if (inhibit_iso_escape_detection)
3198             break;
3199           single_shifting = 0;
3200           rejected |= CATEGORY_MASK_ISO_7BIT;
3201           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3202               & CODING_ISO_FLAG_SINGLE_SHIFT)
3203             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3204           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3205               & CODING_ISO_FLAG_SINGLE_SHIFT)
3206             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3207           if (single_shifting)
3208             break;
3209           goto check_extra_latin;
3210
3211         default:
3212           if (c < 0)
3213             continue;
3214           if (c < 0x80)
3215             {
3216               if (composition_count >= 0)
3217                 composition_count++;
3218               single_shifting = 0;
3219               break;
3220             }
3221           if (c >= 0xA0)
3222             {
3223               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3224               found |= CATEGORY_MASK_ISO_8_1;
3225               /* Check the length of succeeding codes of the range
3226                  0xA0..0FF.  If the byte length is even, we include
3227                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3228                  only when we are not single shifting.  */
3229               if (! single_shifting
3230                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3231                 {
3232                   int i = 1;
3233                   while (src < src_end)
3234                     {
3235                       src_base = src;
3236                       ONE_MORE_BYTE (c);
3237                       if (c < 0xA0)
3238                         {
3239                           src = src_base;
3240                           break;
3241                         }
3242                       i++;
3243                     }
3244
3245                   if (i & 1 && src < src_end)
3246                     {
3247                       rejected |= CATEGORY_MASK_ISO_8_2;
3248                       if (composition_count >= 0)
3249                         composition_count += i;
3250                     }
3251                   else
3252                     {
3253                       found |= CATEGORY_MASK_ISO_8_2;
3254                       if (composition_count >= 0)
3255                         composition_count += i / 2;
3256                     }
3257                 }
3258               break;
3259             }
3260         check_extra_latin:
3261           single_shifting = 0;
3262           if (! VECTORP (Vlatin_extra_code_table)
3263               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3264             {
3265               rejected = CATEGORY_MASK_ISO;
3266               break;
3267             }
3268           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3269               & CODING_ISO_FLAG_LATIN_EXTRA)
3270             found |= CATEGORY_MASK_ISO_8_1;
3271           else
3272             rejected |= CATEGORY_MASK_ISO_8_1;
3273           rejected |= CATEGORY_MASK_ISO_8_2;
3274         }
3275     }
3276   detect_info->rejected |= CATEGORY_MASK_ISO;
3277   return 0;
3278
3279  no_more_source:
3280   detect_info->rejected |= rejected;
3281   detect_info->found |= (found & ~rejected);
3282   return 1;
3283 }
3284
3285
3286 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3287    escape sequence should be kept.

        REG selects the designation slot (CODING_ISO_DESIGNATION), and
        DIM, CHARS_96 and FINAL are the keys handed to ISO_CHARSET_TABLE
        to find the charset ID.  CHARS_96 must be an lvalue because it
        doubles as the result flag; `coding' is the variable of that
        name at the expansion site.

        If FINAL is outside '0'..127, the table yields a negative ID for
        the keys, or the charset fails SAFE_CHARSET_P for CODING, the
        slot is set to -2, CHARS_96 is set to -1, and nothing else is
        done.  Otherwise the slot receives the charset ID, after
        replacing JISX0201-Roman by ASCII when CODING_ISO_FLAG_USE_ROMAN
        is set and JISX0208.1978 by JISX0208 when
        CODING_ISO_FLAG_USE_OLDJIS is set.  */
3288 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3289   do {                                                                  \
3290     int id, prev;                                                       \
3291                                                                         \
3292     if (final < '0' || final >= 128                                     \
3293         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3294         || !SAFE_CHARSET_P (coding, id))                                \
3295       {                                                                 \
3296         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3297         chars_96 = -1;                                                  \
3298         break;                                                          \
3299       }                                                                 \
3300     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3301     if (id == charset_jisx0201_roman)                                   \
3302       {                                                                 \
3303         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3304           id = charset_ascii;                                           \
3305       }                                                                 \
3306     else if (id == charset_jisx0208_1978)                               \
3307       {                                                                 \
3308         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3309           id = charset_jisx0208;                                        \
3310       }                                                                 \
3311     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3312     /* If there was an invalid designation to REG previously, and this  \
3313        designation is ASCII to REG, we should keep this designation     \
3314        sequence.  */                                                    \
3315     if (prev == -2 && id == charset_ascii)                              \
3316       chars_96 = -1;                                                    \
3317   } while (0)
3318
3319
3320 /* Handle these composition sequences (ALT: alternate char):
3321
3322    (1) relative composition: ESC 0 CHAR ... ESC 1
3323    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3324    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3325    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3326
3327    When the start sequence (ESC 0/2/3/4) is found, this annotation
3328    header is produced.
3329
3330         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3331
3332    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3333    produced until the end sequence (ESC 1) is found:
3334
3335    (1) CHAR ... CHAR
3336    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3337    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3338    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3339
3340    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3341    annotation header are updated as below:
3342
3343    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3344    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3345    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3346    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3347
3348    If an error is found while composing, the annotation header is
3349    changed to:
3350
3351         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3352
3353    and the sequence [ -2 DECODED-RULE ] is changed to the original
3354    byte sequence as below:
3355         o the original byte sequence is B: [ B -1 ]
3356         o the original byte sequence is B1 B2: [ B1 B2 ]
3357    and the sequence [ -1 -1 ] is changed to the original byte
3358    sequence:
3359         [ ESC '0' ]
3360 */
3361
3362 /* Decode a composition rule C1 and maybe one more byte from the
3363    source, and set RULE to the encoded composition rule, NBYTES to the
3364    length of the composition rule.  If the rule is invalid, set RULE
3365    to some negative value.

        C1 is not a parameter: it is the variable of that name at the
        expansion site.  RULE and NBYTES must be lvalues.

        When C1 - 32 is below 81 the rule is in the old one-byte format:
        the value is split into GREF = value / 9 and NREF = value % 9, a
        reference of 4 is mapped to 10, and the pair is passed through
        COMPOSITION_ENCODE_RULE.  Otherwise the rule is in the new
        two-byte format: the second byte is fetched with ONE_MORE_BYTE,
        COMPOSITION_ENCODE_RULE is applied to (C1 - 32 - 81, second
        byte - 32), and 0x100 is added to a non-negative result.

        When C1 is below 32, RULE is left negative and NBYTES is not
        assigned.  NOTE(review): ONE_MORE_BYTE is defined elsewhere and
        presumably can leave the expansion site when the source runs
        out -- confirm against its definition.  */
3366
3367 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3368   do {                                                                  \
3369     rule = c1 - 32;                                                     \
3370     if (rule < 0)                                                       \
3371       break;                                                            \
3372     if (rule < 81)              /* old format (before ver.21) */        \
3373       {                                                                 \
3374         int gref = (rule) / 9;                                          \
3375         int nref = (rule) % 9;                                          \
3376         if (gref == 4) gref = 10;                                       \
3377         if (nref == 4) nref = 10;                                       \
3378         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3379         nbytes = 1;                                                     \
3380       }                                                                 \
3381     else                        /* new format (after ver.21) */         \
3382       {                                                                 \
3383         int c;                                                          \
3384                                                                         \
3385         ONE_MORE_BYTE (c);                                              \
3386         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3387         if (rule >= 0)                                                  \
3388           rule += 0x100;   /* to distinguish it from the old format */  \
3389         nbytes = 2;                                                     \
3390       }                                                                 \
3391   } while (0)
3392
/* Turn the annotation pair [ -2 DECODED-RULE ] found at charbuf[idx]
   back into the byte(s) the rule was decoded from.  RULE is the
   decoded rule, i.e. the element that follows the -2.

   A rule below 0x100 is in the old one-byte format and is stored as
   [ B -1 ]; any other rule is in the new two-byte format and is stored
   as [ B1 B2 ].  `new_chars' is increased by the number of bytes
   restored (1 or 2).  GREF and NREF are the quotient and remainder by
   12 of RULE % 0x100; the old format maps a reference of 10 back to 4.

   `charbuf', `idx' and `new_chars' are the variables of those names at
   the expansion site.  Every use of RULE is parenthesized so that an
   argument built with an operator binding less tightly than `%' or `<'
   (e.g. `x | 0x100') is taken as a whole.  RULE is still evaluated
   more than once, but only before either slot is overwritten.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int gref = ((rule) % 0x100) / 12, nref = ((rule) % 0x100) % 12;     \
                                                                        \
    if ((rule) < 0x100)         /* old format */                        \
      {                                                                 \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                        /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3412
3413 /* Finish the current composition as invalid.  */
3414
3415 static int finish_composition (int *, struct composition_status *);
3416
3417 static int
3418 finish_composition (int *charbuf, struct composition_status *cmp_status)
3419 {
3420   int idx = - cmp_status->length;
3421   int new_chars;
3422
3423   /* Recover the original ESC sequence */
3424   charbuf[idx++] = ISO_CODE_ESC;
3425   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3426                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3427                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3428                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3429                     : '4');
3430   charbuf[idx++] = -2;
3431   charbuf[idx++] = 0;
3432   charbuf[idx++] = -1;
3433   new_chars = cmp_status->nchars;
3434   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3435     for (; idx < 0; idx++)
3436       {
3437         int elt = charbuf[idx];
3438
3439         if (elt == -2)
3440           {
3441             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3442             idx++;
3443           }
3444         else if (elt == -1)
3445           {
3446             charbuf[idx++] = ISO_CODE_ESC;
3447             charbuf[idx] = '0';
3448             new_chars += 2;
3449           }
3450       }
3451   cmp_status->state = COMPOSING_NO;
3452   return new_chars;
3453 }
3454
/* When a composition is in progress, abandon it with
   finish_composition and add the count it returns to char_offset.
   `cmp_status', `charbuf' and `char_offset' are the variables of those
   names at the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3461
3462 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3463
3464    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3465    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3466    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3467    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3468
3469    Produce this annotation sequence now:
3470
3471    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (ADD_COMPOSITION_DATA is defined elsewhere; the layout above
        follows the three values 0, 0, METHOD passed to it below and the
        five header slots that finish_composition rewrites.)

        The one exception is ESC 0 arriving while the components of an
        altchar (ESC 3) or alt&rule (ESC 4) composition are being read:
        then no new header is produced; only the marker [ -1 -1 ] is
        stored, the length grows by 2, and the state becomes
        COMPOSING_CHAR.

        In every other case any composition still in progress is first
        abandoned with MAYBE_FINISH_COMPOSITION, the method is chosen
        from C1, the state becomes COMPOSING_CHAR for '0' and '2' and
        COMPOSING_COMPONENT_CHAR for '3' and '4', the counters NCHARS
        and NCOMPS are cleared, and coding->annotated is set.

        C1 is the byte following ESC.  The macro also uses `charbuf',
        `cmp_status', `coding' and, through MAYBE_FINISH_COMPOSITION,
        `char_offset' from the expansion site.
3472 */
3473
3474 #define DECODE_COMPOSITION_START(c1)                                       \
3475   do {                                                                     \
3476     if (c1 == '0'                                                          \
3477         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3478              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3479             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3480                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3481       {                                                                    \
3482         *charbuf++ = -1;                                                   \
3483         *charbuf++= -1;                                                    \
3484         cmp_status->state = COMPOSING_CHAR;                                \
3485         cmp_status->length += 2;                                           \
3486       }                                                                    \
3487     else                                                                   \
3488       {                                                                    \
3489         MAYBE_FINISH_COMPOSITION ();                                       \
3490         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3491                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3492                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3493                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3494         cmp_status->state                                                  \
3495           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3496         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3497         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3498         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3499         coding->annotated = 1;                                             \
3500       }                                                                    \
3501   } while (0)
3502
3503
3504 /* Handle composition end sequence ESC 1.

        The sequence is rejected -- the composition is abandoned with
        MAYBE_FINISH_COMPOSITION and control jumps to the label
        `invalid_code' of the expansion site -- when no composed
        character has been stored, or when the state does not fit the
        method: for COMPOSITION_WITH_RULE the state must not be
        COMPOSING_CHAR (STORE_COMPOSITION_CHAR advances the state after
        each character and STORE_COMPOSITION_RULE moves it back), and
        for every other method it must be COMPOSING_CHAR.

        Otherwise the annotation header at charbuf[-LENGTH] is completed:
        its first slot (the negated length) is decreased by NCOMPS + 2
        for altchar and by NCOMPS * 3 for alt&rule compositions, its
        third slot receives NCHARS, char_offset grows by NCHARS, and the
        state returns to COMPOSING_NO.

        Uses `charbuf', `cmp_status' and `char_offset' from the
        expansion site.  */
3505
3506 #define DECODE_COMPOSITION_END()                                        \
3507   do {                                                                  \
3508     if (cmp_status->nchars == 0                                         \
3509         || ((cmp_status->state == COMPOSING_CHAR)                       \
3510             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3511       {                                                                 \
3512         MAYBE_FINISH_COMPOSITION ();                                    \
3513         goto invalid_code;                                              \
3514       }                                                                 \
3515     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3516       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3517     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3518       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3519     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3520     char_offset += cmp_status->nchars;                                  \
3521     cmp_status->state = COMPOSING_NO;                                   \
3522   } while (0)
3523
/* Append the annotation pair [ -2 RULE ] to charbuf, add 2 to
   cmp_status->length, and move cmp_status->state one step back.
   `charbuf' and `cmp_status' are the variables of those names at the
   expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3533
/* Append C to charbuf and add 1 to cmp_status->length.  C is counted
   in cmp_status->nchars when the state is COMPOSING_CHAR and in
   cmp_status->ncomps otherwise.  The state is then advanced by one
   when the method is COMPOSITION_WITH_RULE, or when it is
   COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  `charbuf' and `cmp_status' are the
   variables of those names at the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3550
3551
3552 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3553
3554 static void
3555 decode_coding_iso_2022 (struct coding_system *coding)
3556 {
3557   const unsigned char *src = coding->source + coding->consumed;
3558   const unsigned char *src_end = coding->source + coding->src_bytes;
3559   const unsigned char *src_base;
3560   int *charbuf = coding->charbuf + coding->charbuf_used;
3561   /* We may produce two annocations (charset and composition) in one
3562      loop and one more charset annocation at the end.  */
3563   int *charbuf_end
3564     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3565   int consumed_chars = 0, consumed_chars_base;
3566   int multibytep = coding->src_multibyte;
3567   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3568   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3569   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3570   int charset_id_2, charset_id_3;
3571   struct charset *charset;
3572   int c;
3573   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3574   Lisp_Object attrs, charset_list;
3575   int char_offset = coding->produced_char;
3576   int last_offset = char_offset;
3577   int last_id = charset_ascii;
3578   int eol_crlf =
3579     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3580   int byte_after_cr = -1;
3581   int i;
3582
3583   CODING_GET_INFO (coding, attrs, charset_list);
3584   setup_iso_safe_charsets (attrs);
3585   /* Charset list may have been changed.  */
3586   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3587   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3588
3589   if (cmp_status->state != COMPOSING_NO)
3590     {
3591       for (i = 0; i < cmp_status->length; i++)
3592         *charbuf++ = cmp_status->carryover[i];
3593       coding->annotated = 1;
3594     }
3595
3596   while (1)
3597     {
3598       int c1, c2, c3;
3599
3600       src_base = src;
3601       consumed_chars_base = consumed_chars;
3602
3603       if (charbuf >= charbuf_end)
3604         {
3605           if (byte_after_cr >= 0)
3606             src_base--;
3607           break;
3608         }
3609
3610       if (byte_after_cr >= 0)
3611         c1 = byte_after_cr, byte_after_cr = -1;
3612       else
3613         ONE_MORE_BYTE (c1);
3614       if (c1 < 0)
3615         goto invalid_code;
3616
3617       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3618         {
3619           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3620           char_offset++;
3621           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3622           continue;
3623         }
3624
3625       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3626         {
3627           if (c1 == ISO_CODE_ESC)
3628             {
3629               if (src + 1 >= src_end)
3630                 goto no_more_source;
3631               *charbuf++ = ISO_CODE_ESC;
3632               char_offset++;
3633               if (src[0] == '%' && src[1] == '@')
3634                 {
3635                   src += 2;
3636                   consumed_chars += 2;
3637                   char_offset += 2;
3638                   /* We are sure charbuf can contain two more chars. */
3639                   *charbuf++ = '%';
3640                   *charbuf++ = '@';
3641                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3642                 }
3643             }
3644           else
3645             {
3646               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3647               char_offset++;
3648             }
3649           continue;
3650         }
3651
3652       if ((cmp_status->state == COMPOSING_RULE
3653            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3654           && c1 != ISO_CODE_ESC)
3655         {
3656           int rule, nbytes;
3657
3658           DECODE_COMPOSITION_RULE (rule, nbytes);
3659           if (rule < 0)
3660             goto invalid_code;
3661           STORE_COMPOSITION_RULE (rule);
3662           continue;
3663         }
3664
3665       /* We produce at most one character.  */
3666       switch (iso_code_class [c1])
3667         {
3668         case ISO_0x20_or_0x7F:
3669           if (charset_id_0 < 0
3670               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3671             /* This is SPACE or DEL.  */
3672             charset = CHARSET_FROM_ID (charset_ascii);
3673           else
3674             charset = CHARSET_FROM_ID (charset_id_0);
3675           break;
3676
3677         case ISO_graphic_plane_0:
3678           if (charset_id_0 < 0)
3679             charset = CHARSET_FROM_ID (charset_ascii);
3680           else
3681             charset = CHARSET_FROM_ID (charset_id_0);
3682           break;
3683
3684         case ISO_0xA0_or_0xFF:
3685           if (charset_id_1 < 0
3686               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3687               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3688             goto invalid_code;
3689           /* This is a graphic character, we fall down ... */
3690
3691         case ISO_graphic_plane_1:
3692           if (charset_id_1 < 0)
3693             goto invalid_code;
3694           charset = CHARSET_FROM_ID (charset_id_1);
3695           break;
3696
3697         case ISO_control_0:
3698           if (eol_crlf && c1 == '\r')
3699             ONE_MORE_BYTE (byte_after_cr);
3700           MAYBE_FINISH_COMPOSITION ();
3701           charset = CHARSET_FROM_ID (charset_ascii);
3702           break;
3703
3704         case ISO_control_1:
3705           goto invalid_code;
3706
3707         case ISO_shift_out:
3708           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3709               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3710             goto invalid_code;
3711           CODING_ISO_INVOCATION (coding, 0) = 1;
3712           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713           continue;
3714
3715         case ISO_shift_in:
3716           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3717             goto invalid_code;
3718           CODING_ISO_INVOCATION (coding, 0) = 0;
3719           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3720           continue;
3721
3722         case ISO_single_shift_2_7:
3723           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3724             goto invalid_code;
3725         case ISO_single_shift_2:
3726           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3727             goto invalid_code;
3728           /* SS2 is handled as an escape sequence of ESC 'N' */
3729           c1 = 'N';
3730           goto label_escape_sequence;
3731
3732         case ISO_single_shift_3:
3733           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3734             goto invalid_code;
3735           /* SS2 is handled as an escape sequence of ESC 'O' */
3736           c1 = 'O';
3737           goto label_escape_sequence;
3738
3739         case ISO_control_sequence_introducer:
3740           /* CSI is handled as an escape sequence of ESC '[' ...  */
3741           c1 = '[';
3742           goto label_escape_sequence;
3743
3744         case ISO_escape:
3745           ONE_MORE_BYTE (c1);
3746         label_escape_sequence:
3747           /* Escape sequences handled here are invocation,
3748              designation, direction specification, and character
3749              composition specification.  */
3750           switch (c1)
3751             {
3752             case '&':           /* revision of following character set */
3753               ONE_MORE_BYTE (c1);
3754               if (!(c1 >= '@' && c1 <= '~'))
3755                 goto invalid_code;
3756               ONE_MORE_BYTE (c1);
3757               if (c1 != ISO_CODE_ESC)
3758                 goto invalid_code;
3759               ONE_MORE_BYTE (c1);
3760               goto label_escape_sequence;
3761
3762             case '$':           /* designation of 2-byte character set */
3763               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3764                 goto invalid_code;
3765               {
3766                 int reg, chars96;
3767
3768                 ONE_MORE_BYTE (c1);
3769                 if (c1 >= '@' && c1 <= 'B')
3770                   {     /* designation of JISX0208.1978, GB2312.1980,
3771                            or JISX0208.1980 */
3772                     reg = 0, chars96 = 0;
3773                   }
3774                 else if (c1 >= 0x28 && c1 <= 0x2B)
3775                   { /* designation of DIMENSION2_CHARS94 character set */
3776                     reg = c1 - 0x28, chars96 = 0;
3777                     ONE_MORE_BYTE (c1);
3778                   }
3779                 else if (c1 >= 0x2C && c1 <= 0x2F)
3780                   { /* designation of DIMENSION2_CHARS96 character set */
3781                     reg = c1 - 0x2C, chars96 = 1;
3782                     ONE_MORE_BYTE (c1);
3783                   }
3784                 else
3785                   goto invalid_code;
3786                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3787                 /* We must update these variables now.  */
3788                 if (reg == 0)
3789                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3790                 else if (reg == 1)
3791                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3792                 if (chars96 < 0)
3793                   goto invalid_code;
3794               }
3795               continue;
3796
3797             case 'n':           /* invocation of locking-shift-2 */
3798               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3799                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3800                 goto invalid_code;
3801               CODING_ISO_INVOCATION (coding, 0) = 2;
3802               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3803               continue;
3804
3805             case 'o':           /* invocation of locking-shift-3 */
3806               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3807                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3808                 goto invalid_code;
3809               CODING_ISO_INVOCATION (coding, 0) = 3;
3810               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3811               continue;
3812
3813             case 'N':           /* invocation of single-shift-2 */
3814               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3815                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3816                 goto invalid_code;
3817               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3818               if (charset_id_2 < 0)
3819                 charset = CHARSET_FROM_ID (charset_ascii);
3820               else
3821                 charset = CHARSET_FROM_ID (charset_id_2);
3822               ONE_MORE_BYTE (c1);
3823               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3824                 goto invalid_code;
3825               break;
3826
3827             case 'O':           /* invocation of single-shift-3 */
3828               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3829                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3830                 goto invalid_code;
3831               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3832               if (charset_id_3 < 0)
3833                 charset = CHARSET_FROM_ID (charset_ascii);
3834               else
3835                 charset = CHARSET_FROM_ID (charset_id_3);
3836               ONE_MORE_BYTE (c1);
3837               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3838                 goto invalid_code;
3839               break;
3840
3841             case '0': case '2': case '3': case '4': /* start composition */
3842               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3843                 goto invalid_code;
3844               if (last_id != charset_ascii)
3845                 {
3846                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3847                   last_id = charset_ascii;
3848                   last_offset = char_offset;
3849                 }
3850               DECODE_COMPOSITION_START (c1);
3851               continue;
3852
3853             case '1':           /* end composition */
3854               if (cmp_status->state == COMPOSING_NO)
3855                 goto invalid_code;
3856               DECODE_COMPOSITION_END ();
3857               continue;
3858
3859             case '[':           /* specification of direction */
3860               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3861                 goto invalid_code;
3862               /* For the moment, nested direction is not supported.
3863                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3864                  left-to-right, and nozero means right-to-left.  */
3865               ONE_MORE_BYTE (c1);
3866               switch (c1)
3867                 {
3868                 case ']':       /* end of the current direction */
3869                   coding->mode &= ~CODING_MODE_DIRECTION;
3870
3871                 case '0':       /* end of the current direction */
3872                 case '1':       /* start of left-to-right direction */
3873                   ONE_MORE_BYTE (c1);
3874                   if (c1 == ']')
3875                     coding->mode &= ~CODING_MODE_DIRECTION;
3876                   else
3877                     goto invalid_code;
3878                   break;
3879
3880                 case '2':       /* start of right-to-left direction */
3881                   ONE_MORE_BYTE (c1);
3882                   if (c1 == ']')
3883                     coding->mode |= CODING_MODE_DIRECTION;
3884                   else
3885                     goto invalid_code;
3886                   break;
3887
3888                 default:
3889                   goto invalid_code;
3890                 }
3891               continue;
3892
3893             case '%':
3894               ONE_MORE_BYTE (c1);
3895               if (c1 == '/')
3896                 {
3897                   /* CTEXT extended segment:
3898                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3899                      We keep these bytes as is for the moment.
3900                      They may be decoded by post-read-conversion.  */
3901                   int dim, M, L;
3902                   int size;
3903
3904                   ONE_MORE_BYTE (dim);
3905                   if (dim < '0' || dim > '4')
3906                     goto invalid_code;
3907                   ONE_MORE_BYTE (M);
3908                   if (M < 128)
3909                     goto invalid_code;
3910                   ONE_MORE_BYTE (L);
3911                   if (L < 128)
3912                     goto invalid_code;
3913                   size = ((M - 128) * 128) + (L - 128);
3914                   if (charbuf + 6 > charbuf_end)
3915                     goto break_loop;
3916                   *charbuf++ = ISO_CODE_ESC;
3917                   *charbuf++ = '%';
3918                   *charbuf++ = '/';
3919                   *charbuf++ = dim;
3920                   *charbuf++ = BYTE8_TO_CHAR (M);
3921                   *charbuf++ = BYTE8_TO_CHAR (L);
3922                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3923                 }
3924               else if (c1 == 'G')
3925                 {
3926                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3927                      ESC % G --UTF-8-BYTES-- ESC % @
3928                      We keep these bytes as is for the moment.
3929                      They may be decoded by post-read-conversion.  */
3930                   if (charbuf + 3 > charbuf_end)
3931                     goto break_loop;
3932                   *charbuf++ = ISO_CODE_ESC;
3933                   *charbuf++ = '%';
3934                   *charbuf++ = 'G';
3935                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3936                 }
3937               else
3938                 goto invalid_code;
3939               continue;
3940               break;
3941
3942             default:
3943               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3944                 goto invalid_code;
3945               {
3946                 int reg, chars96;
3947
3948                 if (c1 >= 0x28 && c1 <= 0x2B)
3949                   { /* designation of DIMENSION1_CHARS94 character set */
3950                     reg = c1 - 0x28, chars96 = 0;
3951                     ONE_MORE_BYTE (c1);
3952                   }
3953                 else if (c1 >= 0x2C && c1 <= 0x2F)
3954                   { /* designation of DIMENSION1_CHARS96 character set */
3955                     reg = c1 - 0x2C, chars96 = 1;
3956                     ONE_MORE_BYTE (c1);
3957                   }
3958                 else
3959                   goto invalid_code;
3960                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3961                 /* We must update these variables now.  */
3962                 if (reg == 0)
3963                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3964                 else if (reg == 1)
3965                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3966                 if (chars96 < 0)
3967                   goto invalid_code;
3968               }
3969               continue;
3970             }
3971         }
3972
3973       if (cmp_status->state == COMPOSING_NO
3974           && charset->id != charset_ascii
3975           && last_id != charset->id)
3976         {
3977           if (last_id != charset_ascii)
3978             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3979           last_id = charset->id;
3980           last_offset = char_offset;
3981         }
3982
3983       /* Now we know CHARSET and 1st position code C1 of a character.
3984          Produce a decoded character while getting 2nd and 3rd
3985          position codes C2, C3 if necessary.  */
3986       if (CHARSET_DIMENSION (charset) > 1)
3987         {
3988           ONE_MORE_BYTE (c2);
3989           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3990               || ((c1 & 0x80) != (c2 & 0x80)))
3991             /* C2 is not in a valid range.  */
3992             goto invalid_code;
3993           if (CHARSET_DIMENSION (charset) == 2)
3994             c1 = (c1 << 8) | c2;
3995           else
3996             {
3997               ONE_MORE_BYTE (c3);
3998               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3999                   || ((c1 & 0x80) != (c3 & 0x80)))
4000                 /* C3 is not in a valid range.  */
4001                 goto invalid_code;
4002               c1 = (c1 << 16) | (c2 << 8) | c2;
4003             }
4004         }
4005       c1 &= 0x7F7F7F;
4006       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4007       if (c < 0)
4008         {
4009           MAYBE_FINISH_COMPOSITION ();
4010           for (; src_base < src; src_base++, char_offset++)
4011             {
4012               if (ASCII_BYTE_P (*src_base))
4013                 *charbuf++ = *src_base;
4014               else
4015                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4016             }
4017         }
4018       else if (cmp_status->state == COMPOSING_NO)
4019         {
4020           *charbuf++ = c;
4021           char_offset++;
4022         }
4023       else if ((cmp_status->state == COMPOSING_CHAR
4024                 ? cmp_status->nchars
4025                 : cmp_status->ncomps)
4026                >= MAX_COMPOSITION_COMPONENTS)
4027         {
4028           /* Too long composition.  */
4029           MAYBE_FINISH_COMPOSITION ();
4030           *charbuf++ = c;
4031           char_offset++;
4032         }
4033       else
4034         STORE_COMPOSITION_CHAR (c);
4035       continue;
4036
4037     invalid_code:
4038       MAYBE_FINISH_COMPOSITION ();
4039       src = src_base;
4040       consumed_chars = consumed_chars_base;
4041       ONE_MORE_BYTE (c);
4042       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4043       char_offset++;
4044       coding->errors++;
4045       continue;
4046
4047     break_loop:
4048       break;
4049     }
4050
4051  no_more_source:
4052   if (cmp_status->state != COMPOSING_NO)
4053     {
4054       if (coding->mode & CODING_MODE_LAST_BLOCK)
4055         MAYBE_FINISH_COMPOSITION ();
4056       else
4057         {
4058           charbuf -= cmp_status->length;
4059           for (i = 0; i < cmp_status->length; i++)
4060             cmp_status->carryover[i] = charbuf[i];
4061         }
4062     }
4063   else if (last_id != charset_ascii)
4064     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4065   coding->consumed_char += consumed_chars_base;
4066   coding->consumed = src_base - coding->source;
4067   coding->charbuf_used = charbuf - coding->charbuf;
4068 }
4069
4070
4071 /* ISO2022 encoding stuff.  */
4072
4073 /*
4074    It is not enough to say just "ISO2022" on encoding, we have to
4075    specify more details.  In Emacs, each coding system of ISO2022
4076    variant has the following specifications:
4077         1. Initial designation to G0 thru G3.
4078         2. Allows short-form designation?
4079         3. ASCII should be designated to G0 before control characters?
4080         4. ASCII should be designated to G0 at end of line?
4081         5. 7-bit environment or 8-bit environment?
4082         6. Use locking-shift?
4083         7. Use Single-shift?
4084    And the following two are only for Japanese:
4085         8. Use ASCII in place of JIS0201-1976-Roman?
4086         9. Use JISX0208-1983 in place of JISX0208-1978?
4087    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4088    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4089    details.
4090 */
4091
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG (advancing DST through the EMIT_* macros), then record
   the new designation in CODING.

   A revision prefix ESC & <'@' + revision> comes first when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET has a non-negative revision.
   The designation itself is ESC, then '$' for a charset whose
   dimension is not 1, then an intermediate byte selected by REG (one
   of "()*+" for a 94-charset, one of ",-./" for a 96-charset), then
   the <final-char> of CHARSET.  The intermediate byte is left out
   (short form) only for a multi-dimension 94-charset designated to
   register 0 whose <final-char> is '@', 'A', or 'B', and only when
   CODING_ISO_FLAG_LONG_FORM is not set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char designation_final = CHARSET_ISO_FINAL (charset);      \
    int designation_is_96 = CHARSET_ISO_CHARS_96 (charset) != 0;        \
    /* Intermediate bytes, indexed by graphic register number.  */      \
    const char *designation_table                                       \
      = designation_is_96 ? ",-./" : "()*+";                            \
    int designation_rev                                                 \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (designation_rev >= 0)                                           \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + designation_rev);                          \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Only the short form drops the intermediate byte.  */             \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || designation_is_96                                            \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || designation_final < '@' || designation_final > 'B')          \
      EMIT_ONE_ASCII_BYTE (designation_table[reg]);                     \
    EMIT_ONE_ASCII_BYTE (designation_final);                            \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4139
4140
/* ISO2022 single-shift functions (single-shift-2 and single-shift-3).
   Each macro below emits the shift code -- an escape sequence when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set, a single control byte
   otherwise -- and then marks a single-shift as pending in CODING.
   Both expect CODING to be in scope.  */

/* Single-shift-2: ESC N, or the byte ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


/* Single-shift-3: ESC O, or the byte ISO_CODE_SS3.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4163
4164
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one emits
   the code and then records in CODING which graphic register is now
   invoked to graphic plane 0.  They expect CODING to be in scope.  */

/* Shift-in: the byte ISO_CODE_SI; graphic register 0 becomes the one
   invoked to plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: the byte ISO_CODE_SO; graphic register 1 becomes the one
   invoked to plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: ESC n; graphic register 2 becomes the one invoked
   to plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: ESC o; graphic register 3 becomes the one invoked
   to plane 0.  The final byte must be 'o': 'n' is locking-shift-2, and
   the ISO2022 decoder in this file reads ESC o as locking-shift-3, so
   emitting 'n' here would leave a reader with register 2 invoked while
   CODING records register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4195
4196
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII, it is reassigned to the JISX0201-Roman
   charset.  The macro expects CODING, DST and PRODUCED_CHARS to be in
   scope.  C1 is parenthesized wherever it is used, as in
   ENCODE_ISO_CHARACTER_DIMENSION2, so that an expression argument is
   masked as a whole.

   The body is a loop: if CHARSET is neither single-shifted nor invoked
   to plane 0 or 1, encode_invocation_designation is called and the
   checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers this one          \
           character only.  */                                          \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4239
4240
/* Emit the two position-codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first producing any designation and invocation codes that
   are still missing.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_OLDJIS
   set and CHARSET is JISX0208, it is reassigned to JISX0208-1978.  The
   macro expects CODING, DST and PRODUCED_CHARS to be in scope.

   The body loops until the character has been emitted.  A pending
   single-shift or an invocation to plane 0 or 1 ends the loop;
   otherwise encode_invocation_designation is called and the checks are
   made again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int wanted_id = CHARSET_ID (charset);                               \
                                                                        \
    if (wanted_id == charset_jisx0208                                   \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        wanted_id = charset_jisx0208_1978;                              \
        charset = CHARSET_FROM_ID (wanted_id);                          \
      }                                                                 \
                                                                        \
    /* A single-shift was just emitted; it covers this one character    \
       only.  */                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 0: bytes with the 8th bit clear.  */    \
    if (wanted_id == CODING_ISO_INVOKED_CHARSET (coding, 0))            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 1: bytes with the 8th bit set.  */      \
    if (wanted_id == CODING_ISO_INVOKED_CHARSET (coding, 1))            \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Not invoked to either plane yet: invoke it (designating it to    \
       a graphic register first if need be), then go round again to     \
       produce the character itself.  */                                \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4283
4284
/* Encode character C of CHARSET: obtain its code with ENCODE_CHAR and
   hand it to the DIMENSION1 macro when CHARSET has dimension 1, or
   split it into a high part (code >> 8) and a low byte (code & 0xFF)
   for the DIMENSION2 macro otherwise.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int position_code = ENCODE_CHAR ((charset), (c));                      \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4294
4295
4296 /* Produce designation and invocation codes at a place pointed by DST
4297    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4298    Return new DST.  */
4299
4300 unsigned char *
4301 encode_invocation_designation (struct charset *charset,
4302                                struct coding_system *coding,
4303                                unsigned char *dst, int *p_nchars)
4304 {
4305   int multibytep = coding->dst_multibyte;
4306   int produced_chars = *p_nchars;
4307   int reg;                      /* graphic register number */
4308   int id = CHARSET_ID (charset);
4309
4310   /* At first, check designations.  */
4311   for (reg = 0; reg < 4; reg++)
4312     if (id == CODING_ISO_DESIGNATION (coding, reg))
4313       break;
4314
4315   if (reg >= 4)
4316     {
4317       /* CHARSET is not yet designated to any graphic registers.  */
4318       /* At first check the requested designation.  */
4319       reg = CODING_ISO_REQUEST (coding, id);
4320       if (reg < 0)
4321         /* Since CHARSET requests no special designation, designate it
4322            to graphic register 0.  */
4323         reg = 0;
4324
4325       ENCODE_DESIGNATION (charset, reg, coding);
4326     }
4327
4328   if (CODING_ISO_INVOCATION (coding, 0) != reg
4329       && CODING_ISO_INVOCATION (coding, 1) != reg)
4330     {
4331       /* Since the graphic register REG is not invoked to any graphic
4332          planes, invoke it to graphic plane 0.  */
4333       switch (reg)
4334         {
4335         case 0:                 /* graphic register 0 */
4336           ENCODE_SHIFT_IN;
4337           break;
4338
4339         case 1:                 /* graphic register 1 */
4340           ENCODE_SHIFT_OUT;
4341           break;
4342
4343         case 2:                 /* graphic register 2 */
4344           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4345             ENCODE_SINGLE_SHIFT_2;
4346           else
4347             ENCODE_LOCKING_SHIFT_2;
4348           break;
4349
4350         case 3:                 /* graphic register 3 */
4351           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4352             ENCODE_SINGLE_SHIFT_3;
4353           else
4354             ENCODE_LOCKING_SHIFT_3;
4355           break;
4356         }
4357     }
4358
4359   *p_nchars = produced_chars;
4360   return dst;
4361 }
4362
/* The following three macros produce codes for indicating direction
   of text.  They expect CODING to be in scope.  */

/* Emit a control sequence introducer: ESC [ when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&' because CODING_ISO_FLAGS
   holds several flag bits at once; comparing the whole word with `=='
   would select the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Emit the start of right-to-left text: CSI 2 ].
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used here without an argument list; a trailing `(dst)' would follow
   the expanded `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Emit the switch back to left-to-right text: CSI 0 ].  See the note
   on ENCODE_DIRECTION_R2L about the missing argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4386
4387
/* Bring the graphic planes and registers back to their initial state.
   A shift-in is produced first unless graphic register 0 is already
   invoked to plane 0.  Then every graphic register that has an initial
   charset (CODING_ISO_INITIAL >= 0) different from its current
   designation gets that initial charset designated again.  Expects
   CODING to be in scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr;                                                             \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, gr);               \
                                                                        \
        /* Nothing to do for a register without an initial charset,     \
           or one that still holds it.  */                              \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, gr) == initial_id)       \
          continue;                                                     \
        {                                                               \
          struct charset *initial_cs = CHARSET_FROM_ID (initial_id);    \
                                                                        \
          ENCODE_DESIGNATION (initial_cs, gr, coding);                  \
        }                                                               \
      }                                                                 \
  } while (0)
4406
4407
4408 /* Produce designation sequences of charsets in the line started from
4409    SRC to a place pointed by DST, and return updated DST.
4410
4411    If the current block ends before any end-of-line, we may fail to
4412    find all the necessary designations.  */
4413
4414 static unsigned char *
4415 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4416                            int *charbuf_end, unsigned char *dst)
4417 {
4418   struct charset *charset;
4419   /* Table of charsets to be designated to each graphic register.  */
4420   int r[4];
4421   int c, found = 0, reg;
4422   int produced_chars = 0;
4423   int multibytep = coding->dst_multibyte;
4424   Lisp_Object attrs;
4425   Lisp_Object charset_list;
4426
4427   attrs = CODING_ID_ATTRS (coding->id);
4428   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4429   if (EQ (charset_list, Qiso_2022))
4430     charset_list = Viso_2022_charset_list;
4431
4432   for (reg = 0; reg < 4; reg++)
4433     r[reg] = -1;
4434
4435   while (found < 4)
4436     {
4437       int id;
4438
4439       c = *charbuf++;
4440       if (c == '\n')
4441         break;
4442       charset = char_charset (c, charset_list, NULL);
4443       id = CHARSET_ID (charset);
4444       reg = CODING_ISO_REQUEST (coding, id);
4445       if (reg >= 0 && r[reg] < 0)
4446         {
4447           found++;
4448           r[reg] = id;
4449         }
4450     }
4451
4452   if (found)
4453     {
4454       for (reg = 0; reg < 4; reg++)
4455         if (r[reg] >= 0
4456             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4457           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4458     }
4459
4460   return dst;
4461 }
4462
4463 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf with the ISO 2022 based
        coding system CODING, writing bytes starting at
        CODING->destination + CODING->produced.  When the main loop runs
        to completion, record CODING_RESULT_SUCCESS, save the
        beginning-of-line state with CODING_ISO_BOL, add to
        CODING->produced_char, set CODING->produced, and return 0.  */
4464
4465 static int
4466 encode_coding_iso_2022 (struct coding_system *coding)
4467 {
       /* NOTE(review): multibytep, dst_end and produced_chars are not
          referenced directly below; presumably the ASSURE_DESTINATION,
          EMIT_* and ENCODE_* macros use them by name -- confirm before
          renaming or removing any local.  */
4468   int multibytep = coding->dst_multibyte;
4469   int *charbuf = coding->charbuf;
4470   int *charbuf_end = charbuf + coding->charbuf_used;
4471   unsigned char *dst = coding->destination + coding->produced;
4472   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4473   int safe_room = 16;
       /* Nonzero when designation sequences must be produced before the
          next character: needs both the DESIGNATE_AT_BOL flag and the
          beginning-of-line state saved in CODING.  */
4474   int bol_designation
4475     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4476        && CODING_ISO_BOL (coding));
4477   int produced_chars = 0;
4478   Lisp_Object attrs, eol_type, charset_list;
4479   int ascii_compatible;
4480   int c;
       /* Charset ID requested by the latest charset annotation, or -1.  */
4481   int preferred_charset_id = -1;
4482
4483   CODING_GET_INFO (coding, attrs, charset_list);
4484   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector eol type is handled here the same way as Qunix.  */
4485   if (VECTORP (eol_type))
4486     eol_type = Qunix;
4487
4488   setup_iso_safe_charsets (attrs);
4489   /* Charset list may have been changed.  */
4490   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4491   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4492
       /* ASCII may be emitted as is only if the coding system is marked
          ASCII compatible and uses neither designation nor locking
          shift.  */
4493   ascii_compatible
4494     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4495        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4496                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4497
4498   while (charbuf < charbuf_end)
4499     {
4500       ASSURE_DESTINATION (safe_room);
4501
4502       if (bol_designation)
4503         {
4504           unsigned char *dst_prev = dst;
4505
4506           /* We have to produce designation sequences if any now.  */
4507           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4508           bol_designation = 0;
4509           /* We are sure that designation sequences are all ASCII bytes.  */
4510           produced_chars += dst - dst_prev;
4511         }
4512
4513       c = *charbuf++;
4514
4515       if (c < 0)
4516         {
4517           /* Handle an annotation.  A negative element introduces it;
                  the element that follows selects the annotation kind,
                  and the remaining -C - 1 elements are skipped below.  */
4518           switch (*charbuf)
4519             {
4520             case CODING_ANNOTATE_COMPOSITION_MASK:
4521               /* Not yet implemented.  */
4522               break;
4523             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Take the charset ID from the annotation, but ignore
                      it unless it is a member of charset_list.  */
4524               preferred_charset_id = charbuf[2];
4525               if (preferred_charset_id >= 0
4526                   && NILP (Fmemq (make_number (preferred_charset_id),
4527                                   charset_list)))
4528                 preferred_charset_id = -1;
4529               break;
4530             default:
4531               abort ();
4532             }
4533           charbuf += -c - 1;
4534           continue;
4535         }
4536
4537       /* Now encode the character C.  */
4538       if (c < 0x20 || c == 0x7F)
4539         {
               /* Control character.  An end of line ('\n', or '\r' when
                  the eol type is Qmac) may reset the ISO 2022 state and
                  re-arm designation for the next line, depending on the
                  coding system flags.  */
4540           if (c == '\n'
4541               || (c == '\r' && EQ (eol_type, Qmac)))
4542             {
4543               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4544                 ENCODE_RESET_PLANE_AND_REGISTER ();
4545               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4546                 {
4547                   int i;
4548
4549                   for (i = 0; i < 4; i++)
4550                     CODING_ISO_DESIGNATION (coding, i)
4551                       = CODING_ISO_INITIAL (coding, i);
4552                 }
4553               bol_designation
4554                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4555             }
4556           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4557             ENCODE_RESET_PLANE_AND_REGISTER ();
4558           EMIT_ONE_ASCII_BYTE (c);
4559         }
4560       else if (ASCII_CHAR_P (c))
4561         {
4562           if (ascii_compatible)
4563             EMIT_ONE_ASCII_BYTE (c);
4564           else
4565             {
4566               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4567               ENCODE_ISO_CHARACTER (charset, c);
4568             }
4569         }
4570       else if (CHAR_BYTE8_P (c))
4571         {
               /* A raw 8-bit byte character is emitted as that byte.  */
4572           c = CHAR_TO_BYTE8 (c);
4573           EMIT_ONE_BYTE (c);
4574         }
4575       else
4576         {
4577           struct charset *charset;
4578
               /* Use the annotated charset if it contains C; otherwise
                  look C up in charset_list.  */
4579           if (preferred_charset_id >= 0)
4580             {
4581               charset = CHARSET_FROM_ID (preferred_charset_id);
4582               if (! CHAR_CHARSET_P (c, charset))
4583                 charset = char_charset (c, charset_list, NULL);
4584             }
4585           else
4586             charset = char_charset (c, charset_list, NULL);
4587           if (!charset)
4588             {
                   /* C is in none of the charsets.  In safe-encoding mode
                      emit CODING_INHIBIT_CHARACTER_SUBSTITUTION as ASCII;
                      otherwise substitute the default character of
                      CODING.  */
4589               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4590                 {
4591                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4592                   charset = CHARSET_FROM_ID (charset_ascii);
4593                 }
4594               else
4595                 {
4596                   c = coding->default_char;
4597                   charset = char_charset (c, charset_list, NULL);
4598                 }
4599             }
4600           ENCODE_ISO_CHARACTER (charset, c);
4601         }
4602     }
4603
       /* At the end of the last block, reset the ISO 2022 state if the
          coding system resets at end of line.  */
4604   if (coding->mode & CODING_MODE_LAST_BLOCK
4605       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4606     {
4607       ASSURE_DESTINATION (safe_room);
4608       ENCODE_RESET_PLANE_AND_REGISTER ();
4609     }
4610   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4611   CODING_ISO_BOL (coding) = bol_designation;
4612   coding->produced_char += produced_chars;
4613   coding->produced = dst - coding->destination;
4614   return 0;
4615 }
4616
4617 \f
4618 /*** 8,9. SJIS and BIG5 handlers ***/
4619
4620 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4621    quite widely.  So, for the moment, Emacs supports them in the bare
4622    C code.  But, in the future, they may be supported only by CCL.  */
4623
4624 /* SJIS is a coding system encoding three character sets: ASCII, right
4625    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4626    as is.  A character of charset katakana-jisx0201 is encoded by
4627    "position-code + 0x80".  A character of charset japanese-jisx0208
4628    is encoded in two bytes, with the two position-codes divided and
4629    shifted so that they fit in the ranges below.
4630
4631    --- CODE RANGE of SJIS ---
4632    (character set)      (range)
4633    ASCII                0x00 .. 0x7F
4634    KATAKANA-JISX0201    0xA0 .. 0xDF
4635    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4636             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4637    -------------------------------
4638
4639 */
4640
4641 /* BIG5 is a coding system encoding two character sets: ASCII and
4642    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4643    character set and is encoded in two bytes.
4644
4645    --- CODE RANGE of BIG5 ---
4646    (character set)      (range)
4647    ASCII                0x00 .. 0x7F
4648    Big5 (1st byte)      0xA1 .. 0xFE
4649         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4650    --------------------------
4651
4652   */
4653
4654 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4655    Check if a text is encoded in SJIS.  When the source is exhausted
4656    without meeting an invalid byte, add CATEGORY_MASK_SJIS to
        DETECT_INFO->found if at least one non-ASCII SJIS sequence was
        seen, and return 1.  On an invalid byte, or when the last block
        ends after part of a sequence was read, add CATEGORY_MASK_SJIS
        to DETECT_INFO->rejected and return 0.  */
4657
4658 static int
4659 detect_coding_sjis (struct coding_system *coding,
4660                     struct coding_detection_info *detect_info)
4661 {
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          referenced directly below; presumably ONE_MORE_BYTE uses them
          by name, and jumps to `no_more_source' when the source is
          exhausted -- confirm against the macro definition.  */
4662   const unsigned char *src = coding->source, *src_base;
4663   const unsigned char *src_end = coding->source + coding->src_bytes;
4664   int multibytep = coding->src_multibyte;
4665   int consumed_chars = 0;
4666   int found = 0;
4667   int c;
4668   Lisp_Object attrs, charset_list;
4669   int max_first_byte_of_2_byte_code;
4670
4671   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, lead bytes up to
          0xFC are accepted; otherwise only up to 0xEF.  */
4672   max_first_byte_of_2_byte_code
4673     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4674
4675   detect_info->checked |= CATEGORY_MASK_SJIS;
4676   /* A coding system of this category is always ASCII compatible.  */
4677   src += coding->head_ascii;
4678
4679   while (1)
4680     {
           /* Remember where this sequence starts so that a truncated
              sequence can be recognized at `no_more_source'.  */
4681       src_base = src;
4682       ONE_MORE_BYTE (c);
4683       if (c < 0x80)
4684         continue;
4685       if ((c >= 0x81 && c <= 0x9F)
4686           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4687         {
               /* Lead byte of a two-byte code; validate the trail byte.  */
4688           ONE_MORE_BYTE (c);
4689           if (c < 0x40 || c == 0x7F || c > 0xFC)
4690             break;
4691           found = CATEGORY_MASK_SJIS;
4692         }
4693       else if (c >= 0xA0 && c < 0xE0)
             /* Single byte in the range used for katakana.  */
4694         found = CATEGORY_MASK_SJIS;
4695       else
4696         break;
4697     }
       /* An invalid byte was met.  */
4698   detect_info->rejected |= CATEGORY_MASK_SJIS;
4699   return 0;
4700
4701  no_more_source:
       /* src_base < src means part of a sequence had been read when the
          source ran out; that is an error only in the last block.  */
4702   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4703     {
4704       detect_info->rejected |= CATEGORY_MASK_SJIS;
4705       return 0;
4706     }
4707   detect_info->found |= found;
4708   return 1;
4709 }
4710
4711 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4712    Check if a text is encoded in BIG5.  When the source is exhausted
4713    without meeting an invalid byte, add CATEGORY_MASK_BIG5 to
        DETECT_INFO->found if at least one two-byte BIG5 sequence was
        seen, and return 1.  On an invalid byte, or when the last block
        ends after part of a sequence was read, add CATEGORY_MASK_BIG5
        to DETECT_INFO->rejected and return 0.  */
4714
4715 static int
4716 detect_coding_big5 (struct coding_system *coding,
4717                     struct coding_detection_info *detect_info)
4718 {
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          referenced directly below; presumably ONE_MORE_BYTE uses them
          by name, and jumps to `no_more_source' when the source is
          exhausted -- confirm against the macro definition.  */
4719   const unsigned char *src = coding->source, *src_base;
4720   const unsigned char *src_end = coding->source + coding->src_bytes;
4721   int multibytep = coding->src_multibyte;
4722   int consumed_chars = 0;
4723   int found = 0;
4724   int c;
4725
4726   detect_info->checked |= CATEGORY_MASK_BIG5;
4727   /* A coding system of this category is always ASCII compatible.  */
4728   src += coding->head_ascii;
4729
4730   while (1)
4731     {
           /* Remember where this sequence starts so that a truncated
              sequence can be recognized at `no_more_source'.  */
4732       src_base = src;
4733       ONE_MORE_BYTE (c);
4734       if (c < 0x80)
4735         continue;
4736       if (c >= 0xA1)
4737         {
               /* Lead byte of a two-byte code; validate the trail byte.  */
4738           ONE_MORE_BYTE (c);
4739           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
                 /* Invalid trail byte: leave the loop so that the
                    category is recorded as rejected, as for every other
                    invalid byte.  */
4740             break;
4741           found = CATEGORY_MASK_BIG5;
4742         }
4743       else
4744         break;
4745     }
       /* An invalid byte was met.  */
4746   detect_info->rejected |= CATEGORY_MASK_BIG5;
4747   return 0;
4748
4749  no_more_source:
       /* src_base < src means part of a sequence had been read when the
          source ran out; that is an error only in the last block.  */
4750   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4751     {
4752       detect_info->rejected |= CATEGORY_MASK_BIG5;
4753       return 0;
4754     }
4755   detect_info->found |= found;
4756   return 1;
4757 }
4758
4759 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4760    Decode SJIS text from CODING->source into CODING->charbuf, and
        update CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used.  Invalid bytes are stored as raw-byte
        characters and counted in CODING->errors.  */
4761
4762 static void
4763 decode_coding_sjis (struct coding_system *coding)
4764 {
       /* NOTE(review): src_end, multibytep and consumed_chars are
          presumably used by name inside ONE_MORE_BYTE, which appears to
          jump to `no_more_source' when the source is exhausted --
          confirm against the macro definition.  */
4765   const unsigned char *src = coding->source + coding->consumed;
4766   const unsigned char *src_end = coding->source + coding->src_bytes;
4767   const unsigned char *src_base;
4768   int *charbuf = coding->charbuf + coding->charbuf_used;
4769   /* We may produce one charset annotation in one loop and one more at
4770      the end.  */
4771   int *charbuf_end
4772     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4773   int consumed_chars = 0, consumed_chars_base;
4774   int multibytep = coding->src_multibyte;
4775   struct charset *charset_roman, *charset_kanji, *charset_kana;
4776   struct charset *charset_kanji2;
4777   Lisp_Object attrs, charset_list, val;
       /* char_offset counts decoded characters; last_offset and last_id
          describe the current run of characters of one non-ASCII
          charset, for which a charset annotation is added when the run
          ends.  */
4778   int char_offset = coding->produced_char;
4779   int last_offset = char_offset;
4780   int last_id = charset_ascii;
4781   int eol_crlf =
4782     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when eol_crlf is set, or -1.  */
4783   int byte_after_cr = -1;
4784
4785   CODING_GET_INFO (coding, attrs, charset_list);
4786
       /* The charset list holds, in this order, the roman, kana and
          kanji charsets, optionally followed by a second kanji
          charset.  */
4787   val = charset_list;
4788   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4789   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4790   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4791   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4792
4793   while (1)
4794     {
4795       int c, c1;
4796       struct charset *charset;
4797
           /* Record the start of this sequence; it is where decoding
              resumes if the source or the buffer runs out, and where
              `invalid_code' rewinds to.  */
4798       src_base = src;
4799       consumed_chars_base = consumed_chars;
4800
4801       if (charbuf >= charbuf_end)
4802         {
               /* The buffer is full.  Step back over a pending
                  read-ahead byte so that it is not counted as
                  consumed.  */
4803           if (byte_after_cr >= 0)
4804             src_base--;
4805           break;
4806         }
4807
4808       if (byte_after_cr >= 0)
4809         c = byte_after_cr, byte_after_cr = -1;
4810       else
4811         ONE_MORE_BYTE (c);
4812       if (c < 0)
4813         goto invalid_code;
4814       if (c < 0x80)
4815         {
4816           if (eol_crlf && c == '\r')
4817             ONE_MORE_BYTE (byte_after_cr);
4818           charset = charset_roman;
4819         }
4820       else if (c == 0x80 || c == 0xA0)
4821         goto invalid_code;
4822       else if (c >= 0xA1 && c <= 0xDF)
4823         {
4824           /* SJIS -> JISX0201-Kana */
4825           c &= 0x7F;
4826           charset = charset_kana;
4827         }
4828       else if (c <= 0xEF)
4829         {
4830           /* SJIS -> JISX0208 */
4831           ONE_MORE_BYTE (c1);
4832           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4833             goto invalid_code;
4834           c = (c << 8) | c1;
4835           SJIS_TO_JIS (c);
4836           charset = charset_kanji;
4837         }
4838       else if (c <= 0xFC && charset_kanji2)
4839         {
4840           /* SJIS -> JISX0213-2 */
4841           ONE_MORE_BYTE (c1);
4842           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4843             goto invalid_code;
4844           c = (c << 8) | c1;
4845           SJIS_TO_JIS2 (c);
4846           charset = charset_kanji2;
4847         }
4848       else
4849         goto invalid_code;
           /* On a switch to a different non-ASCII charset, annotate the
              run that just ended (if it was non-ASCII) and start a new
              run.  */
4850       if (charset->id != charset_ascii
4851           && last_id != charset->id)
4852         {
4853           if (last_id != charset_ascii)
4854             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4855           last_id = charset->id;
4856           last_offset = char_offset;
4857         }
4858       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4859       *charbuf++ = c;
4860       char_offset++;
4861       continue;
4862
4863     invalid_code:
           /* Rewind to the start of the sequence and store just its
              first byte as a raw-byte character (or -C when the byte
              read is negative), counting one error.  */
4864       src = src_base;
4865       consumed_chars = consumed_chars_base;
4866       ONE_MORE_BYTE (c);
4867       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4868       char_offset++;
4869       coding->errors++;
4870     }
4871
4872  no_more_source:
       /* Annotate the final run, then report progress up to the start
          of the last incomplete sequence.  */
4873   if (last_id != charset_ascii)
4874     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4875   coding->consumed_char += consumed_chars_base;
4876   coding->consumed = src_base - coding->source;
4877   coding->charbuf_used = charbuf - coding->charbuf;
4878 }
4879
4880 static void
4881 decode_coding_big5 (struct coding_system *coding)
4882 {
       /* Decode BIG5 text from CODING->source into CODING->charbuf, and
          update CODING->consumed, CODING->consumed_char and
          CODING->charbuf_used.  Invalid bytes are stored as raw-byte
          characters and counted in CODING->errors.

          NOTE(review): src_end, multibytep and consumed_chars are
          presumably used by name inside ONE_MORE_BYTE, which appears to
          jump to `no_more_source' when the source is exhausted --
          confirm against the macro definition.  */
4883   const unsigned char *src = coding->source + coding->consumed;
4884   const unsigned char *src_end = coding->source + coding->src_bytes;
4885   const unsigned char *src_base;
4886   int *charbuf = coding->charbuf + coding->charbuf_used;
4887   /* We may produce one charset annotation in one loop and one more at
4888      the end.  */
4889   int *charbuf_end
4890     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4891   int consumed_chars = 0, consumed_chars_base;
4892   int multibytep = coding->src_multibyte;
4893   struct charset *charset_roman, *charset_big5;
4894   Lisp_Object attrs, charset_list, val;
       /* char_offset counts decoded characters; last_offset and last_id
          describe the current run of characters of one non-ASCII
          charset, for which a charset annotation is added when the run
          ends.  */
4895   int char_offset = coding->produced_char;
4896   int last_offset = char_offset;
4897   int last_id = charset_ascii;
4898   int eol_crlf =
4899     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when eol_crlf is set, or -1.  */
4900   int byte_after_cr = -1;
4901
4902   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list holds the roman charset followed by the Big5
          charset.  */
4903   val = charset_list;
4904   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4905   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4906
4907   while (1)
4908     {
4909       int c, c1;
4910       struct charset *charset;
4911
           /* Record the start of this sequence; it is where decoding
              resumes if the source or the buffer runs out, and where
              `invalid_code' rewinds to.  */
4912       src_base = src;
4913       consumed_chars_base = consumed_chars;
4914
4915       if (charbuf >= charbuf_end)
4916         {
               /* The buffer is full.  Step back over a pending
                  read-ahead byte so that it is not counted as
                  consumed.  */
4917           if (byte_after_cr >= 0)
4918             src_base--;
4919           break;
4920         }
4921
4922       if (byte_after_cr >= 0)
4923         c = byte_after_cr, byte_after_cr = -1;
4924       else
4925         ONE_MORE_BYTE (c);
4926
4927       if (c < 0)
4928         goto invalid_code;
4929       if (c < 0x80)
4930         {
4931           if (eol_crlf && c == '\r')
4932             ONE_MORE_BYTE (byte_after_cr);
4933           charset = charset_roman;
4934         }
4935       else
4936         {
4937           /* BIG5 -> Big5 */
4938           if (c < 0xA1 || c > 0xFE)
4939             goto invalid_code;
4940           ONE_MORE_BYTE (c1);
4941           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4942             goto invalid_code;
4943           c = c << 8 | c1;
4944           charset = charset_big5;
4945         }
           /* On a switch to a different non-ASCII charset, annotate the
              run that just ended (if it was non-ASCII) and start a new
              run.  */
4946       if (charset->id != charset_ascii
4947           && last_id != charset->id)
4948         {
4949           if (last_id != charset_ascii)
4950             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4951           last_id = charset->id;
4952           last_offset = char_offset;
4953         }
4954       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4955       *charbuf++ = c;
4956       char_offset++;
4957       continue;
4958
4959     invalid_code:
           /* Rewind to the start of the sequence and store just its
              first byte as a raw-byte character (or -C when the byte
              read is negative), counting one error.  */
4960       src = src_base;
4961       consumed_chars = consumed_chars_base;
4962       ONE_MORE_BYTE (c);
4963       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4964       char_offset++;
4965       coding->errors++;
4966     }
4967
4968  no_more_source:
       /* Annotate the final run, then report progress up to the start
          of the last incomplete sequence.  */
4969   if (last_id != charset_ascii)
4970     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4971   coding->consumed_char += consumed_chars_base;
4972   coding->consumed = src_base - coding->source;
4973   coding->charbuf_used = charbuf - coding->charbuf;
4974 }
4975
4976 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4977    Encode the characters in CODING->charbuf as SJIS text.  The
4978    charset list of CODING supplies, in this order, the roman, kana
4979    and kanji charsets, optionally followed by a second kanji
4980    charset.  A character of the kanji charsets is produced as two
4981    bytes and a character of the kana charset as one byte with the
4982    high bit set; for any other charset the low seven bits of the
        code are produced as one byte.  Record CODING_RESULT_SUCCESS,
        update CODING->produced and CODING->produced_char, and return 0
        when the loop runs to completion.  */
4983
4984 static int
4985 encode_coding_sjis (struct coding_system *coding)
4986 {
       /* NOTE(review): multibytep, dst_end and produced_chars are not
          referenced directly below; presumably the ASSURE_DESTINATION
          and EMIT_* macros use them by name -- confirm before renaming
          or removing any local.  */
4987   int multibytep = coding->dst_multibyte;
4988   int *charbuf = coding->charbuf;
4989   int *charbuf_end = charbuf + coding->charbuf_used;
4990   unsigned char *dst = coding->destination + coding->produced;
4991   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4992   int safe_room = 4;
4993   int produced_chars = 0;
4994   Lisp_Object attrs, charset_list, val;
4995   int ascii_compatible;
4996   struct charset *charset_roman, *charset_kanji, *charset_kana;
4997   struct charset *charset_kanji2;
4998   int c;
4999
5000   CODING_GET_INFO (coding, attrs, charset_list);
5001   val = charset_list;
       /* charset_roman is assigned here but not used below.  */
5002   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5003   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5004   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5005   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5006
5007   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5008
5009   while (charbuf < charbuf_end)
5010     {
5011       ASSURE_DESTINATION (safe_room);
5012       c = *charbuf++;
5013       /* Now encode the character C.  */
5014       if (ASCII_CHAR_P (c) && ascii_compatible)
5015         EMIT_ONE_ASCII_BYTE (c);
5016       else if (CHAR_BYTE8_P (c))
5017         {
               /* A raw 8-bit byte character is emitted as that byte.  */
5018           c = CHAR_TO_BYTE8 (c);
5019           EMIT_ONE_BYTE (c);
5020         }
5021       else
5022         {
5023           unsigned code;
5024           struct charset *charset = char_charset (c, charset_list, &code);
5025
5026           if (!charset)
5027             {
                   /* C is in none of the charsets.  In safe-encoding mode
                      emit CODING_INHIBIT_CHARACTER_SUBSTITUTION as ASCII;
                      otherwise substitute the default character of
                      CODING.  */
5028               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5029                 {
5030                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5031                   charset = CHARSET_FROM_ID (charset_ascii);
5032                 }
5033               else
5034                 {
5035                   c = coding->default_char;
5036                   charset = char_charset (c, charset_list, &code);
5037                 }
5038             }
5039           if (code == CHARSET_INVALID_CODE (charset))
5040             abort ();
5041           if (charset == charset_kanji)
5042             {
5043               int c1, c2;
5044               JIS_TO_SJIS (code);
5045               c1 = code >> 8, c2 = code & 0xFF;
5046               EMIT_TWO_BYTES (c1, c2);
5047             }
5048           else if (charset == charset_kana)
5049             EMIT_ONE_BYTE (code | 0x80);
5050           else if (charset_kanji2 && charset == charset_kanji2)
5051             {
5052               int c1, c2;
5053
                   /* Only codes whose high byte is in one of these
                      ranges are converted with JIS_TO_SJIS2.
                      NOTE(review): presumably these are the rows of the
                      second kanji charset that SJIS can represent --
                      confirm.  */
5054               c1 = code >> 8;
5055               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5056                   || c1 == 0x28
5057                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5058                 {
5059                   JIS_TO_SJIS2 (code);
5060                   c1 = code >> 8, c2 = code & 0xFF;
5061                   EMIT_TWO_BYTES (c1, c2);
5062                 }
5063               else
5064                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5065             }
5066           else
5067             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5068         }
5069     }
5070   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5071   coding->produced_char += produced_chars;
5072   coding->produced = dst - coding->destination;
5073   return 0;
5074 }
5075
5076 static int
5077 encode_coding_big5 (struct coding_system *coding)
5078 {
       /* Encode the characters in CODING->charbuf as BIG5 text.  The
          charset list of CODING supplies the roman charset followed by
          the Big5 charset.  A Big5 character is produced as two bytes;
          for any other charset the low seven bits of the code are
          produced as one byte.  Record CODING_RESULT_SUCCESS, update
          CODING->produced and CODING->produced_char, and return 0 when
          the loop runs to completion.

          NOTE(review): multibytep, dst_end and produced_chars are not
          referenced directly below; presumably the ASSURE_DESTINATION
          and EMIT_* macros use them by name -- confirm before renaming
          or removing any local.  */
5079   int multibytep = coding->dst_multibyte;
5080   int *charbuf = coding->charbuf;
5081   int *charbuf_end = charbuf + coding->charbuf_used;
5082   unsigned char *dst = coding->destination + coding->produced;
5083   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5084   int safe_room = 4;
5085   int produced_chars = 0;
5086   Lisp_Object attrs, charset_list, val;
5087   int ascii_compatible;
5088   struct charset *charset_roman, *charset_big5;
5089   int c;
5090
5091   CODING_GET_INFO (coding, attrs, charset_list);
5092   val = charset_list;
       /* charset_roman is assigned here but not used below.  */
5093   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5094   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5095   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5096
5097   while (charbuf < charbuf_end)
5098     {
5099       ASSURE_DESTINATION (safe_room);
5100       c = *charbuf++;
5101       /* Now encode the character C.  */
5102       if (ASCII_CHAR_P (c) && ascii_compatible)
5103         EMIT_ONE_ASCII_BYTE (c);
5104       else if (CHAR_BYTE8_P (c))
5105         {
               /* A raw 8-bit byte character is emitted as that byte.  */
5106           c = CHAR_TO_BYTE8 (c);
5107           EMIT_ONE_BYTE (c);
5108         }
5109       else
5110         {
5111           unsigned code;
5112           struct charset *charset = char_charset (c, charset_list, &code);
5113
5114           if (! charset)
5115             {
                   /* C is in none of the charsets.  In safe-encoding mode
                      emit CODING_INHIBIT_CHARACTER_SUBSTITUTION as ASCII;
                      otherwise substitute the default character of
                      CODING.  */
5116               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5117                 {
5118                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5119                   charset = CHARSET_FROM_ID (charset_ascii);
5120                 }
5121               else
5122                 {
5123                   c = coding->default_char;
5124                   charset = char_charset (c, charset_list, &code);
5125                 }
5126             }
5127           if (code == CHARSET_INVALID_CODE (charset))
5128             abort ();
5129           if (charset == charset_big5)
5130             {
5131               int c1, c2;
5132
                   /* The code is emitted high byte first.  */
5133               c1 = code >> 8, c2 = code & 0xFF;
5134               EMIT_TWO_BYTES (c1, c2);
5135             }
5136           else
5137             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5138         }
5139     }
5140   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5141   coding->produced_char += produced_chars;
5142   coding->produced = dst - coding->destination;
5143   return 0;
5144 }
5145
5146 \f
5147 /*** 10. CCL handlers ***/
5148
5149 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5150    Check if a text is encoded in a coding system of which
5151    encoder/decoder are written in CCL program.  When the source is
5152    exhausted with every byte valid, add CATEGORY_MASK_CCL to
        DETECT_INFO->found if some byte had a valid-byte table entry
        greater than 1, and return 1.  On a byte that is negative or
        whose table entry is zero, add CATEGORY_MASK_CCL to
        DETECT_INFO->rejected and return 0.  */
5153
5154 static int
5155 detect_coding_ccl (struct coding_system *coding,
5156                    struct coding_detection_info *detect_info)
5157 {
       /* NOTE(review): src_end, multibytep and consumed_chars are not
          referenced directly below; presumably ONE_MORE_BYTE uses them
          by name, and jumps to `no_more_source' when the source is
          exhausted -- confirm against the macro definition.  */
5158   const unsigned char *src = coding->source, *src_base;
5159   const unsigned char *src_end = coding->source + coding->src_bytes;
5160   int multibytep = coding->src_multibyte;
5161   int consumed_chars = 0;
5162   int found = 0;
5163   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5164   int head_ascii = coding->head_ascii;
5165   Lisp_Object attrs;
5166
5167   detect_info->checked |= CATEGORY_MASK_CCL;
5168
       /* From here on CODING is the coding system registered for the
          CCL category, not the one passed by the caller.  */
5169   coding = &coding_categories[coding_category_ccl];
5170   valids = CODING_CCL_VALIDS (coding);
5171   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes are skipped only for an ASCII
          compatible coding system.  */
5172   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5173     src += head_ascii;
5174
5175   while (1)
5176     {
5177       int c;
5178
5179       src_base = src;
5180       ONE_MORE_BYTE (c);
5181       if (c < 0 || ! valids[c])
5182         break;
5183       if ((valids[c] > 1))
5184         found = CATEGORY_MASK_CCL;
5185     }
       /* An invalid byte was met.  */
5186   detect_info->rejected |= CATEGORY_MASK_CCL;
5187   return 0;
5188
5189  no_more_source:
5190   detect_info->found |= found;
5191   return 1;
5192 }
5193
5194 static void
5195 decode_coding_ccl (struct coding_system *coding)
5196 {
       /* Decode CODING->source into CODING->charbuf by running the CCL
          program of CODING over the source in chunks of at most 1024
          elements, then translate the final CCL status into a
          conversion result and update CODING->consumed,
          CODING->consumed_char and CODING->charbuf_used.  */
5197   const unsigned char *src = coding->source + coding->consumed;
5198   const unsigned char *src_end = coding->source + coding->src_bytes;
5199   int *charbuf = coding->charbuf + coding->charbuf_used;
5200   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5201   int consumed_chars = 0;
5202   int multibytep = coding->src_multibyte;
5203   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5204   int source_charbuf[1024];
       /* For a multibyte source, source_byteidx[K] is the byte offset
          from SRC of the Kth element of source_charbuf.  There is one
          extra slot for the offset just past the last element.  */
5205   int source_byteidx[1025];
5206   Lisp_Object attrs, charset_list;
5207
5208   CODING_GET_INFO (coding, attrs, charset_list);
5209
5210   while (1)
5211     {
5212       const unsigned char *p = src;
5213       int i = 0;
5214
           /* Fill source_charbuf with up to 1024 elements: characters
              for a multibyte source, single bytes otherwise.  */
5215       if (multibytep)
5216         {
5217           while (i < 1024 && p < src_end)
5218             {
5219               source_byteidx[i] = p - src;
5220               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5221             }
5222           source_byteidx[i] = p - src;
5223         }
5224       else
5225         while (i < 1024 && p < src_end)
5226           source_charbuf[i++] = *p++;
5227
           /* Tell the CCL program when this chunk ends the last
              block.  */
5228       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5229         ccl->last_block = 1;
5230       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5231                   charset_list);
5232       charbuf += ccl->produced;
           /* Advance SRC by the bytes that correspond to the elements
              the CCL program consumed.  */
5233       if (multibytep)
5234         src += source_byteidx[ccl->consumed];
5235       else
5236         src += ccl->consumed;
5237       consumed_chars += ccl->consumed;
           /* Go on only if more source remains and the program stopped
              for lack of source.  */
5238       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5239         break;
5240     }
5241
5242   switch (ccl->status)
5243     {
5244     case CCL_STAT_SUSPEND_BY_SRC:
5245       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5246       break;
5247     case CCL_STAT_SUSPEND_BY_DST:
5248       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5249       break;
5250     case CCL_STAT_QUIT:
5251     case CCL_STAT_INVALID_CMD:
5252       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5253       break;
5254     default:
5255       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5256       break;
5257     }
5258   coding->consumed_char += consumed_chars;
5259   coding->consumed = src - coding->source;
5260   coding->charbuf_used = charbuf - coding->charbuf;
5261 }
5262
5263 static int
5264 encode_coding_ccl (struct coding_system *coding)
5265 {
       /* Encode the characters in CODING->charbuf by running the CCL
          program of CODING, which produces at most 1024 elements per
          call into a local buffer; the low 8 bits of each element are
          then written to the destination.  Translate the final CCL
          status into a conversion result, update CODING->produced and
          CODING->produced_char, and return 0.

          NOTE(review): dst_end is not referenced directly below;
          presumably ASSURE_DESTINATION and EMIT_ONE_BYTE use it, and
          produced_chars, by name -- confirm before renaming.  */
5266   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5267   int multibytep = coding->dst_multibyte;
5268   int *charbuf = coding->charbuf;
5269   int *charbuf_end = charbuf + coding->charbuf_used;
5270   unsigned char *dst = coding->destination + coding->produced;
5271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5272   int destination_charbuf[1024];
5273   int i, produced_chars = 0;
5274   Lisp_Object attrs, charset_list;
5275
5276   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5277   if (coding->consumed_char == coding->src_chars
5278       && coding->mode & CODING_MODE_LAST_BLOCK)
5279     ccl->last_block = 1;
5280
5281   while (charbuf < charbuf_end)
5282     {
5283       ccl_driver (ccl, charbuf, destination_charbuf,
5284                   charbuf_end - charbuf, 1024, charset_list);
5285       if (multibytep)
5286         {
               /* Room for two bytes per produced element is requested
                  for a multibyte destination.  */
5287           ASSURE_DESTINATION (ccl->produced * 2);
5288           for (i = 0; i < ccl->produced; i++)
5289             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5290         }
5291       else
5292         {
5293           ASSURE_DESTINATION (ccl->produced);
5294           for (i = 0; i < ccl->produced; i++)
5295             *dst++ = destination_charbuf[i] & 0xFF;
5296           produced_chars += ccl->produced;
5297         }
5298       charbuf += ccl->consumed;
5299       if (ccl->status == CCL_STAT_QUIT
5300           || ccl->status == CCL_STAT_INVALID_CMD)
5301         break;
5302     }
5303
5304   switch (ccl->status)
5305     {
5306     case CCL_STAT_SUSPEND_BY_SRC:
5307       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5308       break;
5309     case CCL_STAT_SUSPEND_BY_DST:
5310       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5311       break;
5312     case CCL_STAT_QUIT:
5313     case CCL_STAT_INVALID_CMD:
5314       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5315       break;
5316     default:
5317       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5318       break;
5319     }
5320
5321   coding->produced_char += produced_chars;
5322   coding->produced = dst - coding->destination;
5323   return 0;
5324 }
5325
5326
5327 \f
5328 /*** 10, 11. no-conversion handlers ***/
5329
5330 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Raw text needs no conversion: mark the characters as residing at
        the source and report the whole source as consumed.  If CRLF
        handling is in effect and the source ends with a CR, leave that
        CR unconsumed and record CODING_RESULT_INSUFFICIENT_SRC;
        otherwise record CODING_RESULT_SUCCESS.  */
5331
5332 static void
5333 decode_coding_raw_text (struct coding_system *coding)
5334 {
5335   int eol_crlf =
5336     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5337
5338   coding->chars_at_source = 1;
5339   coding->consumed_char = coding->src_chars;
5340   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source, index src_bytes - 1 would be out of bounds.  */
5341   if (eol_crlf
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5342     {
5343       coding->consumed_char--;
5344       coding->consumed--;
5345       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5346     }
5347   else
5348     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5349 }
5350
5351 static int
5352 encode_coding_raw_text (struct coding_system *coding)
5353 {
       /* Copy the elements of CODING->charbuf to the destination
          without charset conversion.  The four cases below are selected
          by whether the destination and the source are multibyte.
          Record CODING_RESULT_SUCCESS, update CODING->produced and
          CODING->produced_char, and return 0 when copying completes.

          NOTE(review): dst_end and, in the multibyte destination case,
          produced_chars are presumably used by name inside
          ASSURE_DESTINATION and the EMIT_* macros -- confirm before
          renaming.  */
5354   int multibytep = coding->dst_multibyte;
5355   int *charbuf = coding->charbuf;
5356   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5357   unsigned char *dst = coding->destination + coding->produced;
5358   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5359   int produced_chars = 0;
5360   int c;
5361
5362   if (multibytep)
5363     {
5364       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5365
5366       if (coding->src_multibyte)
5367         while (charbuf < charbuf_end)
5368           {
5369             ASSURE_DESTINATION (safe_room);
5370             c = *charbuf++;
5371             if (ASCII_CHAR_P (c))
5372               EMIT_ONE_ASCII_BYTE (c);
5373             else if (CHAR_BYTE8_P (c))
5374               {
5375                 c = CHAR_TO_BYTE8 (c);
5376                 EMIT_ONE_BYTE (c);
5377               }
5378             else
5379               {
                     /* Write the multibyte form of C into a scratch
                        buffer and emit it one byte at a time.  */
5380                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5381
5382                 CHAR_STRING_ADVANCE (c, p1);
5383                 while (p0 < p1)
5384                   {
5385                     EMIT_ONE_BYTE (*p0);
5386                     p0++;
5387                   }
5388               }
5389           }
5390       else
5391         while (charbuf < charbuf_end)
5392           {
5393             ASSURE_DESTINATION (safe_room);
5394             c = *charbuf++;
5395             EMIT_ONE_BYTE (c);
5396           }
5397     }
5398   else
5399     {
5400       if (coding->src_multibyte)
5401         {
5402           int safe_room = MAX_MULTIBYTE_LENGTH;
5403
5404           while (charbuf < charbuf_end)
5405             {
5406               ASSURE_DESTINATION (safe_room);
5407               c = *charbuf++;
5408               if (ASCII_CHAR_P (c))
5409                 *dst++ = c;
5410               else if (CHAR_BYTE8_P (c))
5411                 *dst++ = CHAR_TO_BYTE8 (c);
5412               else
5413                 CHAR_STRING_ADVANCE (c, dst);
5414             }
5415         }
5416       else
5417         {
               /* Unibyte to unibyte: one byte per element, so the room
                  for all remaining elements is requested at once.  */
5418           ASSURE_DESTINATION (charbuf_end - charbuf);
5419           while (charbuf < charbuf_end && dst < dst_end)
5420             *dst++ = *charbuf++;
5421         }
           /* For a unibyte destination the number of produced
              characters is the number of bytes written here.  */
5422       produced_chars = dst - (coding->destination + coding->produced);
5423     }
5424   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5425   coding->produced_char += produced_chars;
5426   coding->produced = dst - coding->destination;
5427   return 0;
5428 }
5429
5430 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5431    Check if a text is encoded in a charset-based coding system.  If it
5432    is, return 1, else return 0.  */
5433
5434 static int
5435 detect_coding_charset (struct coding_system *coding,
5436                        struct coding_detection_info *detect_info)
5437 {
5438   const unsigned char *src = coding->source, *src_base;
5439   const unsigned char *src_end = coding->source + coding->src_bytes;
5440   int multibytep = coding->src_multibyte;
5441   int consumed_chars = 0;
5442   Lisp_Object attrs, valids, name;
5443   int found = 0;
5444   int head_ascii = coding->head_ascii;
5445   int check_latin_extra = 0;
5446
5447   detect_info->checked |= CATEGORY_MASK_CHARSET;
5448
5449   coding = &coding_categories[coding_category_charset];
5450   attrs = CODING_ID_ATTRS (coding->id);
5451   valids = AREF (attrs, coding_attr_charset_valids);
5452   name = CODING_ID_NAME (coding->id);
5453   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5454                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5455       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5456                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5457     check_latin_extra = 1;
5458
5459   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5460     src += head_ascii;
5461
5462   while (1)
5463     {
5464       int c;
5465       Lisp_Object val;
5466       struct charset *charset;
5467       int dim, idx;
5468
5469       src_base = src;
5470       ONE_MORE_BYTE (c);
5471       if (c < 0)
5472         continue;
5473       val = AREF (valids, c);
5474       if (NILP (val))
5475         break;
5476       if (c >= 0x80)
5477         {
5478           if (c < 0xA0
5479               && check_latin_extra
5480               && (!VECTORP (Vlatin_extra_code_table)
5481                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5482             break;
5483           found = CATEGORY_MASK_CHARSET;
5484         }
5485       if (INTEGERP (val))
5486         {
5487           charset = CHARSET_FROM_ID (XFASTINT (val));
5488           dim = CHARSET_DIMENSION (charset);
5489           for (idx = 1; idx < dim; idx++)
5490             {
5491               if (src == src_end)
5492                 goto too_short;
5493               ONE_MORE_BYTE (c);
5494               if (c < charset->code_space[(dim - 1 - idx) * 2]
5495                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5496                 break;
5497             }
5498           if (idx < dim)
5499             break;
5500         }
5501       else
5502         {
5503           idx = 1;
5504           for (; CONSP (val); val = XCDR (val))
5505             {
5506               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5507               dim = CHARSET_DIMENSION (charset);
5508               while (idx < dim)
5509                 {
5510                   if (src == src_end)
5511                     goto too_short;
5512                   ONE_MORE_BYTE (c);
5513                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5514                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5515                     break;
5516                   idx++;
5517                 }
5518               if (idx == dim)
5519                 {
5520                   val = Qnil;
5521                   break;
5522                 }
5523             }
5524           if (CONSP (val))
5525             break;
5526         }
5527     }
5528  too_short:
5529   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5530   return 0;
5531
5532  no_more_source:
5533   detect_info->found |= found;
5534   return 1;
5535 }
5536
/* Decode the bytes of CODING->source, starting at offset
   CODING->consumed, as text of a charset-based coding system, and
   append the decoded characters to CODING->charbuf.

   Each leading byte is looked up in the `charset_valids' attribute of
   the coding system, which yields either one charset ID or a list of
   charset IDs; the remaining bytes of the code are then read according
   to the charset dimension.  A byte sequence that cannot be decoded is
   stored byte by byte and counted in CODING->errors.

   On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.

   NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA
   are defined elsewhere; ONE_MORE_BYTE presumably jumps to
   `no_more_source' when the source is exhausted and uses SRC, SRC_END,
   MULTIBYTEP and CONSUMED_CHARS by name -- confirm.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters and
     LAST_OFFSET the character offset at which that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When >= 0, a byte that was already read right after a CR and must
     be used as the next input byte.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this character starts so that the position
         reported as consumed never includes a partial code.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* CHARBUF is full.  A pending byte read after a CR has not
             been processed yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS end-of-line, read the byte following a CR right
             away and keep it for the next iteration.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* VAL is a single charset ID: collect DIM bytes into CODE,
             most significant byte first, then decode.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE persist across candidates, so only the
                 additional bytes a larger charset needs are read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* The first charset that decodes CODE wins.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* On a change to a different non-ASCII charset, close the
         annotation of the previous non-ASCII run and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this character and store a single byte
         as is: ASCII unchanged, any other byte as a raw 8-bit
         character, a negative value negated.
         NOTE(review): when the offending byte came from BYTE_AFTER_CR,
         SRC_BASE already points past it, so the byte re-read here looks
         like the following one -- confirm whether that is intended.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the annotation of the last non-ASCII run, if any, and report
     progress up to the start of the last unfinished character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5664
/* Encode the characters stored in CODING->charbuf with a charset-based
   coding system and append the bytes to CODING->destination, starting
   at byte offset CODING->produced.

   Each character is emitted as the 1 to 4 byte code point it has in
   the first suitable charset found by char_charset in the charset
   list of the coding system.  A character for which no charset is
   found is replaced by one byte: CODING_INHIBIT_CHARACTER_SUBSTITUTION
   when CODING_MODE_SAFE_ENCODING is set in CODING->mode, otherwise
   CODING->default_char.

   On exit the conversion result is recorded as CODING_RESULT_SUCCESS,
   CODING->produced_char and CODING->produced are updated, and 0 is
   returned.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere; they presumably use DST, DST_END, MULTIBYTEP and
   PRODUCED_CHARS by name -- confirm before renaming any local.  */
static int
encode_coding_charset (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* NOTE(review): up to four bytes are emitted per character below; if
     the EMIT_* macros widen 8-bit bytes for a multibyte destination,
     this amount might be too small -- confirm.  */
  int safe_room = MAX_MULTIBYTE_LENGTH;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* ASCII bypasses the charset lookup only when the coding system
         is ASCII compatible.  */
      if (ascii_compatible && ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit character: emit the byte it stands for.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          charset = char_charset (c, charset_list, &code);
          if (charset)
            {
              /* Emit CODE most significant byte first, using as many
                 bytes as the charset has dimensions.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* No charset of the list contains C: emit a one-byte
                 substitute.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5727
5728 \f
/*** 10. C library functions ***/
5730
5731 /* Setup coding context CODING from information about CODING_SYSTEM.
5732    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5733    CODING_SYSTEM is invalid, signal an error.  */
5734
5735 void
5736 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5737 {
5738   Lisp_Object attrs;
5739   Lisp_Object eol_type;
5740   Lisp_Object coding_type;
5741   Lisp_Object val;
5742
5743   if (NILP (coding_system))
5744     coding_system = Qundecided;
5745
5746   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5747
5748   attrs = CODING_ID_ATTRS (coding->id);
5749   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5750
5751   coding->mode = 0;
5752   coding->head_ascii = -1;
5753   if (VECTORP (eol_type))
5754     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5755                             | CODING_REQUIRE_DETECTION_MASK);
5756   else if (! EQ (eol_type, Qunix))
5757     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5758                             | CODING_REQUIRE_ENCODING_MASK);
5759   else
5760     coding->common_flags = 0;
5761   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5762     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5763   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5764     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5765   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5766     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5767
5768   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5769   coding->max_charset_id = SCHARS (val) - 1;
5770   coding->safe_charsets = SDATA (val);
5771   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5772   coding->carryover_bytes = 0;
5773
5774   coding_type = CODING_ATTR_TYPE (attrs);
5775   if (EQ (coding_type, Qundecided))
5776     {
5777       coding->detector = NULL;
5778       coding->decoder = decode_coding_raw_text;
5779       coding->encoder = encode_coding_raw_text;
5780       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5781     }
5782   else if (EQ (coding_type, Qiso_2022))
5783     {
5784       int i;
5785       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5786
5787       /* Invoke graphic register 0 to plane 0.  */
5788       CODING_ISO_INVOCATION (coding, 0) = 0;
5789       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5790       CODING_ISO_INVOCATION (coding, 1)
5791         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5792       /* Setup the initial status of designation.  */
5793       for (i = 0; i < 4; i++)
5794         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5795       /* Not single shifting initially.  */
5796       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5797       /* Beginning of buffer should also be regarded as bol. */
5798       CODING_ISO_BOL (coding) = 1;
5799       coding->detector = detect_coding_iso_2022;
5800       coding->decoder = decode_coding_iso_2022;
5801       coding->encoder = encode_coding_iso_2022;
5802       if (flags & CODING_ISO_FLAG_SAFE)
5803         coding->mode |= CODING_MODE_SAFE_ENCODING;
5804       coding->common_flags
5805         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5806             | CODING_REQUIRE_FLUSHING_MASK);
5807       if (flags & CODING_ISO_FLAG_COMPOSITION)
5808         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5809       if (flags & CODING_ISO_FLAG_DESIGNATION)
5810         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5811       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812         {
5813           setup_iso_safe_charsets (attrs);
5814           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5815           coding->max_charset_id = SCHARS (val) - 1;
5816           coding->safe_charsets = SDATA (val);
5817         }
5818       CODING_ISO_FLAGS (coding) = flags;
5819       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5820       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5821       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5822       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5823     }
5824   else if (EQ (coding_type, Qcharset))
5825     {
5826       coding->detector = detect_coding_charset;
5827       coding->decoder = decode_coding_charset;
5828       coding->encoder = encode_coding_charset;
5829       coding->common_flags
5830         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5831     }
5832   else if (EQ (coding_type, Qutf_8))
5833     {
5834       val = AREF (attrs, coding_attr_utf_bom);
5835       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5836                                    : EQ (val, Qt) ? utf_with_bom
5837                                    : utf_without_bom);
5838       coding->detector = detect_coding_utf_8;
5839       coding->decoder = decode_coding_utf_8;
5840       coding->encoder = encode_coding_utf_8;
5841       coding->common_flags
5842         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5843       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5844         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5845     }
5846   else if (EQ (coding_type, Qutf_16))
5847     {
5848       val = AREF (attrs, coding_attr_utf_bom);
5849       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5850                                     : EQ (val, Qt) ? utf_with_bom
5851                                     : utf_without_bom);
5852       val = AREF (attrs, coding_attr_utf_16_endian);
5853       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5854                                        : utf_16_little_endian);
5855       CODING_UTF_16_SURROGATE (coding) = 0;
5856       coding->detector = detect_coding_utf_16;
5857       coding->decoder = decode_coding_utf_16;
5858       coding->encoder = encode_coding_utf_16;
5859       coding->common_flags
5860         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5861       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5862         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5863     }
5864   else if (EQ (coding_type, Qccl))
5865     {
5866       coding->detector = detect_coding_ccl;
5867       coding->decoder = decode_coding_ccl;
5868       coding->encoder = encode_coding_ccl;
5869       coding->common_flags
5870         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5871             | CODING_REQUIRE_FLUSHING_MASK);
5872     }
5873   else if (EQ (coding_type, Qemacs_mule))
5874     {
5875       coding->detector = detect_coding_emacs_mule;
5876       coding->decoder = decode_coding_emacs_mule;
5877       coding->encoder = encode_coding_emacs_mule;
5878       coding->common_flags
5879         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880       coding->spec.emacs_mule.full_support = 1;
5881       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5882           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5883         {
5884           Lisp_Object tail, safe_charsets;
5885           int max_charset_id = 0;
5886
5887           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5888                tail = XCDR (tail))
5889             if (max_charset_id < XFASTINT (XCAR (tail)))
5890               max_charset_id = XFASTINT (XCAR (tail));
5891           safe_charsets = make_uninit_string (max_charset_id + 1);
5892           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5893           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5894                tail = XCDR (tail))
5895             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5896           coding->max_charset_id = max_charset_id;
5897           coding->safe_charsets = SDATA (safe_charsets);
5898           coding->spec.emacs_mule.full_support = 1;
5899         }
5900       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5901       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5902     }
5903   else if (EQ (coding_type, Qshift_jis))
5904     {
5905       coding->detector = detect_coding_sjis;
5906       coding->decoder = decode_coding_sjis;
5907       coding->encoder = encode_coding_sjis;
5908       coding->common_flags
5909         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5910     }
5911   else if (EQ (coding_type, Qbig5))
5912     {
5913       coding->detector = detect_coding_big5;
5914       coding->decoder = decode_coding_big5;
5915       coding->encoder = encode_coding_big5;
5916       coding->common_flags
5917         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5918     }
5919   else                          /* EQ (coding_type, Qraw_text) */
5920     {
5921       coding->detector = NULL;
5922       coding->decoder = decode_coding_raw_text;
5923       coding->encoder = encode_coding_raw_text;
5924       if (! EQ (eol_type, Qunix))
5925         {
5926           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5927           if (! VECTORP (eol_type))
5928             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5929         }
5930
5931     }
5932
5933   return;
5934 }
5935
5936 /* Return a list of charsets supported by CODING.  */
5937
5938 Lisp_Object
5939 coding_charset_list (struct coding_system *coding)
5940 {
5941   Lisp_Object attrs, charset_list;
5942
5943   CODING_GET_INFO (coding, attrs, charset_list);
5944   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5945     {
5946       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5947
5948       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5949         charset_list = Viso_2022_charset_list;
5950     }
5951   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5952     {
5953       charset_list = Vemacs_mule_charset_list;
5954     }
5955   return charset_list;
5956 }
5957
5958
5959 /* Return a list of charsets supported by CODING-SYSTEM.  */
5960
5961 Lisp_Object
5962 coding_system_charset_list (Lisp_Object coding_system)
5963 {
5964   int id;
5965   Lisp_Object attrs, charset_list;
5966
5967   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5968   attrs = CODING_ID_ATTRS (id);
5969
5970   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5971     {
5972       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5973
5974       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5975         charset_list = Viso_2022_charset_list;
5976       else
5977         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5978     }
5979   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5980     {
5981       charset_list = Vemacs_mule_charset_list;
5982     }
5983   else
5984     {
5985       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5986     }
5987   return charset_list;
5988 }
5989
5990
5991 /* Return raw-text or one of its subsidiaries that has the same
5992    eol_type as CODING-SYSTEM.  */
5993
5994 Lisp_Object
5995 raw_text_coding_system (Lisp_Object coding_system)
5996 {
5997   Lisp_Object spec, attrs;
5998   Lisp_Object eol_type, raw_text_eol_type;
5999
6000   if (NILP (coding_system))
6001     return Qraw_text;
6002   spec = CODING_SYSTEM_SPEC (coding_system);
6003   attrs = AREF (spec, 0);
6004
6005   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6006     return coding_system;
6007
6008   eol_type = AREF (spec, 2);
6009   if (VECTORP (eol_type))
6010     return Qraw_text;
6011   spec = CODING_SYSTEM_SPEC (Qraw_text);
6012   raw_text_eol_type = AREF (spec, 2);
6013   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6014           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6015           : AREF (raw_text_eol_type, 2));
6016 }
6017
6018
6019 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6020    the subsidiary that has the same eol-spec as PARENT (if it is not
6021    nil and specifies end-of-line format) or the system's setting
6022    (system_eol_type).  */
6023
6024 Lisp_Object
6025 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6026 {
6027   Lisp_Object spec, eol_type;
6028
6029   if (NILP (coding_system))
6030     coding_system = Qraw_text;
6031   spec = CODING_SYSTEM_SPEC (coding_system);
6032   eol_type = AREF (spec, 2);
6033   if (VECTORP (eol_type))
6034     {
6035       Lisp_Object parent_eol_type;
6036
6037       if (! NILP (parent))
6038         {
6039           Lisp_Object parent_spec;
6040
6041           parent_spec = CODING_SYSTEM_SPEC (parent);
6042           parent_eol_type = AREF (parent_spec, 2);
6043           if (VECTORP (parent_eol_type))
6044             parent_eol_type = system_eol_type;
6045         }
6046       else
6047         parent_eol_type = system_eol_type;
6048       if (EQ (parent_eol_type, Qunix))
6049         coding_system = AREF (eol_type, 0);
6050       else if (EQ (parent_eol_type, Qdos))
6051         coding_system = AREF (eol_type, 1);
6052       else if (EQ (parent_eol_type, Qmac))
6053         coding_system = AREF (eol_type, 2);
6054     }
6055   return coding_system;
6056 }
6057
6058
6059 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6060    decided for writing to a process.  If not, complement them, and
6061    return a new coding system.  */
6062
6063 Lisp_Object
6064 complement_process_encoding_system (Lisp_Object coding_system)
6065 {
6066   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6067   Lisp_Object spec, attrs;
6068   int i;
6069
6070   for (i = 0; i < 3; i++)
6071     {
6072       if (i == 1)
6073         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6074       else if (i == 2)
6075         coding_system = preferred_coding_system ();
6076       spec = CODING_SYSTEM_SPEC (coding_system);
6077       if (NILP (spec))
6078         continue;
6079       attrs = AREF (spec, 0);
6080       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6081         coding_base = CODING_ATTR_BASE_NAME (attrs);
6082       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6083         eol_base = coding_system;
6084       if (! NILP (coding_base) && ! NILP (eol_base))
6085         break;
6086     }
6087
6088   if (i > 0)
6089     /* The original CODING_SYSTEM didn't specify text-conversion or
6090        eol-conversion.  Be sure that we return a fully complemented
6091        coding system.  */
6092     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6093   return coding_system;
6094 }
6095
6096
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  However,
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, coding systems are
   first grouped into the categories listed below:
6102
6103    o coding-category-emacs-mule
6104
6105         The category for a coding system which has the same code range
6106         as Emacs' internal format.  Assigned the coding-system (Lisp
6107         symbol) `emacs-mule' by default.
6108
6109    o coding-category-sjis
6110
6111         The category for a coding system which has the same code range
6112         as SJIS.  Assigned the coding-system (Lisp
6113         symbol) `japanese-shift-jis' by default.
6114
6115    o coding-category-iso-7
6116
6117         The category for a coding system which has the same code range
6118         as ISO2022 of 7-bit environment.  This doesn't use any locking
6119         shift and single shift functions.  This can encode/decode all
6120         charsets.  Assigned the coding-system (Lisp symbol)
6121         `iso-2022-7bit' by default.
6122
6123    o coding-category-iso-7-tight
6124
6125         Same as coding-category-iso-7 except that this can
6126         encode/decode only the specified charsets.
6127
6128    o coding-category-iso-8-1
6129
6130         The category for a coding system which has the same code range
6131         as ISO2022 of 8-bit environment and graphic plane 1 used only
6132         for DIMENSION1 charset.  This doesn't use any locking shift
6133         and single shift functions.  Assigned the coding-system (Lisp
6134         symbol) `iso-latin-1' by default.
6135
6136    o coding-category-iso-8-2
6137
6138         The category for a coding system which has the same code range
6139         as ISO2022 of 8-bit environment and graphic plane 1 used only
6140         for DIMENSION2 charset.  This doesn't use any locking shift
6141         and single shift functions.  Assigned the coding-system (Lisp
6142         symbol) `japanese-iso-8bit' by default.
6143
6144    o coding-category-iso-7-else
6145
        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.
6150
6151    o coding-category-iso-8-else
6152
        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
6157
6158    o coding-category-big5
6159
6160         The category for a coding system which has the same code range
6161         as BIG5.  Assigned the coding-system (Lisp symbol)
6162         `cn-big5' by default.
6163
6164    o coding-category-utf-8
6165
6166         The category for a coding system which has the same code range
6167         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6168         symbol) `utf-8' by default.
6169
6170    o coding-category-utf-16-be
6171
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of BIG
        endian at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-be' by default.
6176
6177    o coding-category-utf-16-le
6178
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of
        LITTLE endian at the head.  Assigned the coding-system (Lisp
        symbol) `utf-16-le' by default.
6183
6184    o coding-category-ccl
6185
6186         The category for a coding system of which encoder/decoder is
6187         written in CCL programs.  The default value is nil, i.e., no
6188         coding system is assigned.
6189
6190    o coding-category-binary
6191
6192         The category for a coding system not categorized in any of the
6193         above.  Assigned the coding-system (Lisp symbol)
6194         `no-conversion' by default.
6195
6196    Each of them is a Lisp symbol and the value is an actual
6197    `coding-system's (this is also a Lisp symbol) assigned by a user.
6198    What Emacs does actually is to detect a category of coding system.
6199    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6200    decide only one possible category, it selects a category of the
6201    highest priority.  Priorities of categories are also specified by a
6202    user in a Lisp variable `coding-category-list'.
6203
6204 */
6205
/* Values describing which end-of-line form detect_eol has seen: none
   yet, LF, CR, or the CR LF sequence.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4

/* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  If CATEGORY is one of
   coding_category_utf_16_XXXX, assume that CR and LF are each encoded
   in two bytes, else assume that they are each encoded in one byte.

   Return one of EOL_SEEN_XXX.  */

/* detect_eol stops scanning once it has counted this many
   end-of-line sequences.  */
#define MAX_EOL_CHECK_COUNT 3
6219
6220 static int
6221 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6222             enum coding_category category)
6223 {
6224   const unsigned char *src = source, *src_end = src + src_bytes;
6225   unsigned char c;
6226   int total  = 0;
6227   int eol_seen = EOL_SEEN_NONE;
6228
6229   if ((1 << category) & CATEGORY_MASK_UTF_16)
6230     {
6231       int msb, lsb;
6232
6233       msb = category == (coding_category_utf_16_le
6234                          | coding_category_utf_16_le_nosig);
6235       lsb = 1 - msb;
6236
6237       while (src + 1 < src_end)
6238         {
6239           c = src[lsb];
6240           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6241             {
6242               int this_eol;
6243
6244               if (c == '\n')
6245                 this_eol = EOL_SEEN_LF;
6246               else if (src + 3 >= src_end
6247                        || src[msb + 2] != 0
6248                        || src[lsb + 2] != '\n')
6249                 this_eol = EOL_SEEN_CR;
6250               else
6251                 {
6252                   this_eol = EOL_SEEN_CRLF;
6253                   src += 2;
6254                 }
6255
6256               if (eol_seen == EOL_SEEN_NONE)
6257                 /* This is the first end-of-line.  */
6258                 eol_seen = this_eol;
6259               else if (eol_seen != this_eol)
6260                 {
6261                   /* The found type is different from what found before.
6262                      Allow for stray ^M characters in DOS EOL files.  */
6263                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6264                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6265                     eol_seen = EOL_SEEN_CRLF;
6266                   else
6267                     {
6268                       eol_seen = EOL_SEEN_LF;
6269                       break;
6270                     }
6271                 }
6272               if (++total == MAX_EOL_CHECK_COUNT)
6273                 break;
6274             }
6275           src += 2;
6276         }
6277     }
6278   else
6279     {
6280       while (src < src_end)
6281         {
6282           c = *src++;
6283           if (c == '\n' || c == '\r')
6284             {
6285               int this_eol;
6286
6287               if (c == '\n')
6288                 this_eol = EOL_SEEN_LF;
6289               else if (src >= src_end || *src != '\n')
6290                 this_eol = EOL_SEEN_CR;
6291               else
6292                 this_eol = EOL_SEEN_CRLF, src++;
6293
6294               if (eol_seen == EOL_SEEN_NONE)
6295                 /* This is the first end-of-line.  */
6296                 eol_seen = this_eol;
6297               else if (eol_seen != this_eol)
6298                 {
6299                   /* The found type is different from what found before.
6300                      Allow for stray ^M characters in DOS EOL files.  */
6301                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6302                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6303                     eol_seen = EOL_SEEN_CRLF;
6304                   else
6305                     {
6306                       eol_seen = EOL_SEEN_LF;
6307                       break;
6308                     }
6309                 }
6310               if (++total == MAX_EOL_CHECK_COUNT)
6311                 break;
6312             }
6313         }
6314     }
6315   return eol_seen;
6316 }
6317
6318
6319 static Lisp_Object
6320 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6321 {
6322   Lisp_Object eol_type;
6323
6324   eol_type = CODING_ID_EOL_TYPE (coding->id);
6325   if (eol_seen & EOL_SEEN_LF)
6326     {
6327       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6328       eol_type = Qunix;
6329     }
6330   else if (eol_seen & EOL_SEEN_CRLF)
6331     {
6332       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6333       eol_type = Qdos;
6334     }
6335   else if (eol_seen & EOL_SEEN_CR)
6336     {
6337       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6338       eol_type = Qmac;
6339     }
6340   return eol_type;
6341 }
6342
6343 /* Detect how a text specified in CODING is encoded.  If a coding
6344    system is detected, update fields of CODING by the detected coding
6345    system.  */
6346
6347 void
6348 detect_coding (struct coding_system *coding)
6349 {
6350   const unsigned char *src, *src_end;
6351   int saved_mode = coding->mode;
6352
6353   coding->consumed = coding->consumed_char = 0;
6354   coding->produced = coding->produced_char = 0;
6355   coding_set_source (coding);
6356
6357   src_end = coding->source + coding->src_bytes;
6358   coding->head_ascii = 0;
6359
6360   /* If we have not yet decided the text encoding type, detect it
6361      now.  */
6362   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6363     {
6364       int c, i;
6365       struct coding_detection_info detect_info;
6366       int null_byte_found = 0, eight_bit_found = 0;
6367
6368       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6369       for (src = coding->source; src < src_end; src++)
6370         {
6371           c = *src;
6372           if (c & 0x80)
6373             {
6374               eight_bit_found = 1;
6375               if (null_byte_found)
6376                 break;
6377             }
6378           else if (c < 0x20)
6379             {
6380               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6381                   && ! inhibit_iso_escape_detection
6382                   && ! detect_info.checked)
6383                 {
6384                   if (detect_coding_iso_2022 (coding, &detect_info))
6385                     {
6386                       /* We have scanned the whole data.  */
6387                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6388                         {
6389                           /* We didn't find an 8-bit code.  We may
6390                              have found a null-byte, but it's very
6391                              rare that a binary file conforms to
6392                              ISO-2022.  */
6393                           src = src_end;
6394                           coding->head_ascii = src - coding->source;
6395                         }
6396                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6397                       break;
6398                     }
6399                 }
6400               else if (! c && !inhibit_null_byte_detection)
6401                 {
6402                   null_byte_found = 1;
6403                   if (eight_bit_found)
6404                     break;
6405                 }
6406               if (! eight_bit_found)
6407                 coding->head_ascii++;
6408             }
6409           else if (! eight_bit_found)
6410             coding->head_ascii++;
6411         }
6412
6413       if (null_byte_found || eight_bit_found
6414           || coding->head_ascii < coding->src_bytes
6415           || detect_info.found)
6416         {
6417           enum coding_category category;
6418           struct coding_system *this;
6419
6420           if (coding->head_ascii == coding->src_bytes)
6421             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6422             for (i = 0; i < coding_category_raw_text; i++)
6423               {
6424                 category = coding_priorities[i];
6425                 this = coding_categories + category;
6426                 if (detect_info.found & (1 << category))
6427                   break;
6428               }
6429           else
6430             {
6431               if (null_byte_found)
6432                 {
6433                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6434                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6435                 }
6436               for (i = 0; i < coding_category_raw_text; i++)
6437                 {
6438                   category = coding_priorities[i];
6439                   this = coding_categories + category;
6440                   if (this->id < 0)
6441                     {
6442                       /* No coding system of this category is defined.  */
6443                       detect_info.rejected |= (1 << category);
6444                     }
6445                   else if (category >= coding_category_raw_text)
6446                     continue;
6447                   else if (detect_info.checked & (1 << category))
6448                     {
6449                       if (detect_info.found & (1 << category))
6450                         break;
6451                     }
6452                   else if ((*(this->detector)) (coding, &detect_info)
6453                            && detect_info.found & (1 << category))
6454                     {
6455                       if (category == coding_category_utf_16_auto)
6456                         {
6457                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6458                             category = coding_category_utf_16_le;
6459                           else
6460                             category = coding_category_utf_16_be;
6461                         }
6462                       break;
6463                     }
6464                 }
6465             }
6466
6467           if (i < coding_category_raw_text)
6468             setup_coding_system (CODING_ID_NAME (this->id), coding);
6469           else if (null_byte_found)
6470             setup_coding_system (Qno_conversion, coding);
6471           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6472                    == CATEGORY_MASK_ANY)
6473             setup_coding_system (Qraw_text, coding);
6474           else if (detect_info.rejected)
6475             for (i = 0; i < coding_category_raw_text; i++)
6476               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6477                 {
6478                   this = coding_categories + coding_priorities[i];
6479                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6480                   break;
6481                 }
6482         }
6483     }
6484   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6485            == coding_category_utf_8_auto)
6486     {
6487       Lisp_Object coding_systems;
6488       struct coding_detection_info detect_info;
6489
6490       coding_systems
6491         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6492       detect_info.found = detect_info.rejected = 0;
6493       coding->head_ascii = 0;
6494       if (CONSP (coding_systems)
6495           && detect_coding_utf_8 (coding, &detect_info))
6496         {
6497           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6498             setup_coding_system (XCAR (coding_systems), coding);
6499           else
6500             setup_coding_system (XCDR (coding_systems), coding);
6501         }
6502     }
6503   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6504            == coding_category_utf_16_auto)
6505     {
6506       Lisp_Object coding_systems;
6507       struct coding_detection_info detect_info;
6508
6509       coding_systems
6510         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6511       detect_info.found = detect_info.rejected = 0;
6512       coding->head_ascii = 0;
6513       if (CONSP (coding_systems)
6514           && detect_coding_utf_16 (coding, &detect_info))
6515         {
6516           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6517             setup_coding_system (XCAR (coding_systems), coding);
6518           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6519             setup_coding_system (XCDR (coding_systems), coding);
6520         }
6521     }
6522   coding->mode = saved_mode;
6523 }
6524
6525
6526 static void
6527 decode_eol (struct coding_system *coding)
6528 {
6529   Lisp_Object eol_type;
6530   unsigned char *p, *pbeg, *pend;
6531
6532   eol_type = CODING_ID_EOL_TYPE (coding->id);
6533   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6534     return;
6535
6536   if (NILP (coding->dst_object))
6537     pbeg = coding->destination;
6538   else
6539     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6540   pend = pbeg + coding->produced;
6541
6542   if (VECTORP (eol_type))
6543     {
6544       int eol_seen = EOL_SEEN_NONE;
6545
6546       for (p = pbeg; p < pend; p++)
6547         {
6548           if (*p == '\n')
6549             eol_seen |= EOL_SEEN_LF;
6550           else if (*p == '\r')
6551             {
6552               if (p + 1 < pend && *(p + 1) == '\n')
6553                 {
6554                   eol_seen |= EOL_SEEN_CRLF;
6555                   p++;
6556                 }
6557               else
6558                 eol_seen |= EOL_SEEN_CR;
6559             }
6560         }
6561       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6562       if ((eol_seen & EOL_SEEN_CRLF) != 0
6563           && (eol_seen & EOL_SEEN_CR) != 0
6564           && (eol_seen & EOL_SEEN_LF) == 0)
6565         eol_seen = EOL_SEEN_CRLF;
6566       else if (eol_seen != EOL_SEEN_NONE
6567           && eol_seen != EOL_SEEN_LF
6568           && eol_seen != EOL_SEEN_CRLF
6569           && eol_seen != EOL_SEEN_CR)
6570         eol_seen = EOL_SEEN_LF;
6571       if (eol_seen != EOL_SEEN_NONE)
6572         eol_type = adjust_coding_eol_type (coding, eol_seen);
6573     }
6574
6575   if (EQ (eol_type, Qmac))
6576     {
6577       for (p = pbeg; p < pend; p++)
6578         if (*p == '\r')
6579           *p = '\n';
6580     }
6581   else if (EQ (eol_type, Qdos))
6582     {
6583       int n = 0;
6584
6585       if (NILP (coding->dst_object))
6586         {
6587           /* Start deleting '\r' from the tail to minimize the memory
6588              movement.  */
6589           for (p = pend - 2; p >= pbeg; p--)
6590             if (*p == '\r')
6591               {
6592                 memmove (p, p + 1, pend-- - p - 1);
6593                 n++;
6594               }
6595         }
6596       else
6597         {
6598           int pos_byte = coding->dst_pos_byte;
6599           int pos = coding->dst_pos;
6600           int pos_end = pos + coding->produced_char - 1;
6601
6602           while (pos < pos_end)
6603             {
6604               p = BYTE_POS_ADDR (pos_byte);
6605               if (*p == '\r' && p[1] == '\n')
6606                 {
6607                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6608                   n++;
6609                   pos_end--;
6610                 }
6611               pos++;
6612               if (coding->dst_multibyte)
6613                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6614               else
6615                 pos_byte++;
6616             }
6617         }
6618       coding->produced -= n;
6619       coding->produced_char -= n;
6620     }
6621 }
6622
6623
6624 /* Return a translation table (or list of them) from coding system
6625    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6626    decoding (ENCODEP is zero). */
6627
6628 static Lisp_Object
6629 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6630 {
6631   Lisp_Object standard, translation_table;
6632   Lisp_Object val;
6633
6634   if (NILP (Venable_character_translation))
6635     {
6636       if (max_lookup)
6637         *max_lookup = 0;
6638       return Qnil;
6639     }
6640   if (encodep)
6641     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6642       standard = Vstandard_translation_table_for_encode;
6643   else
6644     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6645       standard = Vstandard_translation_table_for_decode;
6646   if (NILP (translation_table))
6647     translation_table = standard;
6648   else
6649     {
6650       if (SYMBOLP (translation_table))
6651         translation_table = Fget (translation_table, Qtranslation_table);
6652       else if (CONSP (translation_table))
6653         {
6654           translation_table = Fcopy_sequence (translation_table);
6655           for (val = translation_table; CONSP (val); val = XCDR (val))
6656             if (SYMBOLP (XCAR (val)))
6657               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6658         }
6659       if (CHAR_TABLE_P (standard))
6660         {
6661           if (CONSP (translation_table))
6662             translation_table = nconc2 (translation_table,
6663                                         Fcons (standard, Qnil));
6664           else
6665             translation_table = Fcons (translation_table,
6666                                        Fcons (standard, Qnil));
6667         }
6668     }
6669
6670   if (max_lookup)
6671     {
6672       *max_lookup = 1;
6673       if (CHAR_TABLE_P (translation_table)
6674           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6675         {
6676           val = XCHAR_TABLE (translation_table)->extras[1];
6677           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6678             *max_lookup = XFASTINT (val);
6679         }
6680       else if (CONSP (translation_table))
6681         {
6682           Lisp_Object tail, val;
6683
6684           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6685             if (CHAR_TABLE_P (XCAR (tail))
6686                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6687               {
6688                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6689                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6690                   *max_lookup = XFASTINT (val);
6691               }
6692         }
6693     }
6694   return translation_table;
6695 }
6696
/* Look up the character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order.

   Whenever a table maps C to a character, C is replaced by that
   character (and, for a list, the lookup goes on with the next table
   using the new C).  TRANS is set to Qnil in that case and when
   nothing is found; otherwise it is set to the first non-nil,
   non-character value found, and for a list the lookup stops there.

   C and TRANS must be lvalues.  TABLE is evaluated exactly once,
   after TRANS has been reset to Qnil.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_;                                          \
                                                                        \
    (trans) = Qnil;                                                     \
    lookup_table_ = (table);                                            \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6721
6722
6723 /* Return a translation of character(s) at BUF according to TRANS.
6724    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6725    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6726    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6727    translation is found, and Qnil if not found..
6728    If BUF is too short to lookup characters in FROM, return Qt.  */
6729
6730 static Lisp_Object
6731 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6732 {
6733
6734   if (INTEGERP (trans))
6735     return trans;
6736   for (; CONSP (trans); trans = XCDR (trans))
6737     {
6738       Lisp_Object val = XCAR (trans);
6739       Lisp_Object from = XCAR (val);
6740       int len = ASIZE (from);
6741       int i;
6742
6743       for (i = 0; i < len; i++)
6744         {
6745           if (buf + i == buf_end)
6746             return Qt;
6747           if (XINT (AREF (from, i)) != buf[i])
6748             break;
6749         }
6750       if (i == len)
6751         return val;
6752     }
6753   return Qnil;
6754 }
6755
6756
6757 static int
6758 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6759                int last_block)
6760 {
6761   unsigned char *dst = coding->destination + coding->produced;
6762   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6763   EMACS_INT produced;
6764   EMACS_INT produced_chars = 0;
6765   int carryover = 0;
6766
6767   if (! coding->chars_at_source)
6768     {
6769       /* Source characters are in coding->charbuf.  */
6770       int *buf = coding->charbuf;
6771       int *buf_end = buf + coding->charbuf_used;
6772
6773       if (EQ (coding->src_object, coding->dst_object))
6774         {
6775           coding_set_source (coding);
6776           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6777         }
6778
6779       while (buf < buf_end)
6780         {
6781           int c = *buf, i;
6782
6783           if (c >= 0)
6784             {
6785               int from_nchars = 1, to_nchars = 1;
6786               Lisp_Object trans = Qnil;
6787
6788               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6789               if (! NILP (trans))
6790                 {
6791                   trans = get_translation (trans, buf, buf_end);
6792                   if (INTEGERP (trans))
6793                     c = XINT (trans);
6794                   else if (CONSP (trans))
6795                     {
6796                       from_nchars = ASIZE (XCAR (trans));
6797                       trans = XCDR (trans);
6798                       if (INTEGERP (trans))
6799                         c = XINT (trans);
6800                       else
6801                         {
6802                           to_nchars = ASIZE (trans);
6803                           c = XINT (AREF (trans, 0));
6804                         }
6805                     }
6806                   else if (EQ (trans, Qt) && ! last_block)
6807                     break;
6808                 }
6809
6810               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6811                 {
6812                   dst = alloc_destination (coding,
6813                                            buf_end - buf
6814                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6815                                            dst);
6816                   if (EQ (coding->src_object, coding->dst_object))
6817                     {
6818                       coding_set_source (coding);
6819                       dst_end = (((unsigned char *) coding->source)
6820                                  + coding->consumed);
6821                     }
6822                   else
6823                     dst_end = coding->destination + coding->dst_bytes;
6824                 }
6825
6826               for (i = 0; i < to_nchars; i++)
6827                 {
6828                   if (i > 0)
6829                     c = XINT (AREF (trans, i));
6830                   if (coding->dst_multibyte
6831                       || ! CHAR_BYTE8_P (c))
6832                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6833                   else
6834                     *dst++ = CHAR_TO_BYTE8 (c);
6835                 }
6836               produced_chars += to_nchars;
6837               buf += from_nchars;
6838             }
6839           else
6840             /* This is an annotation datum.  (-C) is the length.  */
6841             buf += -c;
6842         }
6843       carryover = buf_end - buf;
6844     }
6845   else
6846     {
6847       /* Source characters are at coding->source.  */
6848       const unsigned char *src = coding->source;
6849       const unsigned char *src_end = src + coding->consumed;
6850
6851       if (EQ (coding->dst_object, coding->src_object))
6852         dst_end = (unsigned char *) src;
6853       if (coding->src_multibyte != coding->dst_multibyte)
6854         {
6855           if (coding->src_multibyte)
6856             {
6857               int multibytep = 1;
6858               EMACS_INT consumed_chars = 0;
6859
6860               while (1)
6861                 {
6862                   const unsigned char *src_base = src;
6863                   int c;
6864
6865                   ONE_MORE_BYTE (c);
6866                   if (dst == dst_end)
6867                     {
6868                       if (EQ (coding->src_object, coding->dst_object))
6869                         dst_end = (unsigned char *) src;
6870                       if (dst == dst_end)
6871                         {
6872                           EMACS_INT offset = src - coding->source;
6873
6874                           dst = alloc_destination (coding, src_end - src + 1,
6875                                                    dst);
6876                           dst_end = coding->destination + coding->dst_bytes;
6877                           coding_set_source (coding);
6878                           src = coding->source + offset;
6879                           src_end = coding->source + coding->src_bytes;
6880                           if (EQ (coding->src_object, coding->dst_object))
6881                             dst_end = (unsigned char *) src;
6882                         }
6883                     }
6884                   *dst++ = c;
6885                   produced_chars++;
6886                 }
6887             no_more_source:
6888               ;
6889             }
6890           else
6891             while (src < src_end)
6892               {
6893                 int multibytep = 1;
6894                 int c = *src++;
6895
6896                 if (dst >= dst_end - 1)
6897                   {
6898                     if (EQ (coding->src_object, coding->dst_object))
6899                       dst_end = (unsigned char *) src;
6900                     if (dst >= dst_end - 1)
6901                       {
6902                         EMACS_INT offset = src - coding->source;
6903                         EMACS_INT more_bytes;
6904
6905                         if (EQ (coding->src_object, coding->dst_object))
6906                           more_bytes = ((src_end - src) / 2) + 2;
6907                         else
6908                           more_bytes = src_end - src + 2;
6909                         dst = alloc_destination (coding, more_bytes, dst);
6910                         dst_end = coding->destination + coding->dst_bytes;
6911                         coding_set_source (coding);
6912                         src = coding->source + offset;
6913                         src_end = coding->source + coding->src_bytes;
6914                         if (EQ (coding->src_object, coding->dst_object))
6915                           dst_end = (unsigned char *) src;
6916                       }
6917                   }
6918                 EMIT_ONE_BYTE (c);
6919               }
6920         }
6921       else
6922         {
6923           if (!EQ (coding->src_object, coding->dst_object))
6924             {
6925               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6926
6927               if (require > 0)
6928                 {
6929                   EMACS_INT offset = src - coding->source;
6930
6931                   dst = alloc_destination (coding, require, dst);
6932                   coding_set_source (coding);
6933                   src = coding->source + offset;
6934                   src_end = coding->source + coding->src_bytes;
6935                 }
6936             }
6937           produced_chars = coding->consumed_char;
6938           while (src < src_end)
6939             *dst++ = *src++;
6940         }
6941     }
6942
6943   produced = dst - (coding->destination + coding->produced);
6944   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6945     insert_from_gap (produced_chars, produced);
6946   coding->produced += produced;
6947   coding->produced_char += produced_chars;
6948   return carryover;
6949 }
6950
6951 /* Compose text in CODING->object according to the annotation data at
6952    CHARBUF.  CHARBUF is an array:
6953      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6954  */
6955
6956 static INLINE void
6957 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6958 {
6959   int len;
6960   EMACS_INT to;
6961   enum composition_method method;
6962   Lisp_Object components;
6963
6964   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6965   to = pos + charbuf[2];
6966   method = (enum composition_method) (charbuf[4]);
6967
6968   if (method == COMPOSITION_RELATIVE)
6969     components = Qnil;
6970   else
6971     {
6972       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6973       int i, j;
6974
6975       if (method == COMPOSITION_WITH_RULE)
6976         len = charbuf[2] * 3 - 2;
6977       charbuf += MAX_ANNOTATION_LENGTH;
6978       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6979       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6980         {
6981           if (charbuf[i] >= 0)
6982             args[j] = make_number (charbuf[i]);
6983           else
6984             {
6985               i++;
6986               args[j] = make_number (charbuf[i] % 0x100);
6987             }
6988         }
6989       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6990     }
6991   compose_text (pos, to, components, Qnil, coding->dst_object);
6992 }
6993
6994
6995 /* Put `charset' property on text in CODING->object according to
6996    the annotation data at CHARBUF.  CHARBUF is an array:
6997      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6998  */
6999
7000 static INLINE void
7001 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
7002 {
7003   EMACS_INT from = pos - charbuf[2];
7004   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7005
7006   Fput_text_property (make_number (from), make_number (pos),
7007                       Qcharset, CHARSET_NAME (charset),
7008                       coding->dst_object);
7009 }
7010
7011
/* Number of `int' elements first requested for a conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate with alloca a work area for CODING (a pointer to struct
   coding_system) and record it in CODING->charbuf and
   CODING->charbuf_size.  CHARBUF_SIZE elements are tried first, and
   the request is halved after each failure while it stays above 1024
   elements.

   If no area was obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the macro executes `return CODING->result;'.  So it can be used
   only in a function that may return that value, and, the area being
   obtained from alloca, the area must not be used after that function
   returns.

   NOTE(review): alloca usually does not report failure by returning
   NULL, so the retry and the failure path may never be taken --
   confirm.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
7033
7034
7035 static void
7036 produce_annotation (struct coding_system *coding, EMACS_INT pos)
7037 {
7038   int *charbuf = coding->charbuf;
7039   int *charbuf_end = charbuf + coding->charbuf_used;
7040
7041   if (NILP (coding->dst_object))
7042     return;
7043
7044   while (charbuf < charbuf_end)
7045     {
7046       if (*charbuf >= 0)
7047         pos++, charbuf++;
7048       else
7049         {
7050           int len = -*charbuf;
7051
7052           if (len > 2)
7053             switch (charbuf[1])
7054               {
7055               case CODING_ANNOTATE_COMPOSITION_MASK:
7056                 produce_composition (coding, charbuf, pos);
7057                 break;
7058               case CODING_ANNOTATE_CHARSET_MASK:
7059                 produce_charset (coding, charbuf, pos);
7060                 break;
7061               }
7062           charbuf += len;
7063         }
7064     }
7065 }
7066
7067 /* Decode the data at CODING->src_object into CODING->dst_object.
7068    CODING->src_object is a buffer, a string, or nil.
7069    CODING->dst_object is a buffer.
7070
7071    If CODING->src_object is a buffer, it must be the current buffer.
7072    In this case, if CODING->src_pos is positive, it is a position of
7073    the source text in the buffer, otherwise, the source text is in the
7074    gap area of the buffer, and CODING->src_pos specifies the offset of
7075    the text from GPT (which must be the same as PT).  If this is the
7076    same buffer as CODING->dst_object, CODING->src_pos must be
7077    negative.
7078
7079    If CODING->src_object is a string, CODING->src_pos is an index to
7080    that string.
7081
7082    If CODING->src_object is nil, CODING->source must already point to
7083    the non-relocatable memory area.  In this case, CODING->src_pos is
7084    an offset from CODING->source.
7085
7086    The decoded data is inserted at the current point of the buffer
7087    CODING->dst_object.
7088 */
7089
7090 static int
7091 decode_coding (struct coding_system *coding)
7092 {
7093   Lisp_Object attrs;
7094   Lisp_Object undo_list;
7095   Lisp_Object translation_table;
7096   struct ccl_spec cclspec;
7097   int carryover;
7098   int i;
7099
7100   if (BUFFERP (coding->src_object)
7101       && coding->src_pos > 0
7102       && coding->src_pos < GPT
7103       && coding->src_pos + coding->src_chars > GPT)
7104     move_gap_both (coding->src_pos, coding->src_pos_byte);
7105
7106   undo_list = Qt;
7107   if (BUFFERP (coding->dst_object))
7108     {
7109       if (current_buffer != XBUFFER (coding->dst_object))
7110         set_buffer_internal (XBUFFER (coding->dst_object));
7111       if (GPT != PT)
7112         move_gap_both (PT, PT_BYTE);
7113       undo_list = current_buffer->undo_list;
7114       current_buffer->undo_list = Qt;
7115     }
7116
7117   coding->consumed = coding->consumed_char = 0;
7118   coding->produced = coding->produced_char = 0;
7119   coding->chars_at_source = 0;
7120   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7121   coding->errors = 0;
7122
7123   ALLOC_CONVERSION_WORK_AREA (coding);
7124
7125   attrs = CODING_ID_ATTRS (coding->id);
7126   translation_table = get_translation_table (attrs, 0, NULL);
7127
7128   carryover = 0;
7129   if (coding->decoder == decode_coding_ccl)
7130     {
7131       coding->spec.ccl = &cclspec;
7132       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7133     }
7134   do
7135     {
7136       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7137
7138       coding_set_source (coding);
7139       coding->annotated = 0;
7140       coding->charbuf_used = carryover;
7141       (*(coding->decoder)) (coding);
7142       coding_set_destination (coding);
7143       carryover = produce_chars (coding, translation_table, 0);
7144       if (coding->annotated)
7145         produce_annotation (coding, pos);
7146       for (i = 0; i < carryover; i++)
7147         coding->charbuf[i]
7148           = coding->charbuf[coding->charbuf_used - carryover + i];
7149     }
7150   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7151          || (coding->consumed < coding->src_bytes
7152              && (coding->result == CODING_RESULT_SUCCESS
7153                  || coding->result == CODING_RESULT_INVALID_SRC)));
7154
7155   if (carryover > 0)
7156     {
7157       coding_set_destination (coding);
7158       coding->charbuf_used = carryover;
7159       produce_chars (coding, translation_table, 1);
7160     }
7161
7162   coding->carryover_bytes = 0;
7163   if (coding->consumed < coding->src_bytes)
7164     {
7165       int nbytes = coding->src_bytes - coding->consumed;
7166       const unsigned char *src;
7167
7168       coding_set_source (coding);
7169       coding_set_destination (coding);
7170       src = coding->source + coding->consumed;
7171
7172       if (coding->mode & CODING_MODE_LAST_BLOCK)
7173         {
7174           /* Flush out unprocessed data as binary chars.  We are sure
7175              that the number of data is less than the size of
7176              coding->charbuf.  */
7177           coding->charbuf_used = 0;
7178           coding->chars_at_source = 0;
7179
7180           while (nbytes-- > 0)
7181             {
7182               int c = *src++;
7183
7184               if (c & 0x80)
7185                 c = BYTE8_TO_CHAR (c);
7186               coding->charbuf[coding->charbuf_used++] = c;
7187             }
7188           produce_chars (coding, Qnil, 1);
7189         }
7190       else
7191         {
7192           /* Record unprocessed bytes in coding->carryover.  We are
7193              sure that the number of data is less than the size of
7194              coding->carryover.  */
7195           unsigned char *p = coding->carryover;
7196
7197           if (nbytes > sizeof coding->carryover)
7198             nbytes = sizeof coding->carryover;
7199           coding->carryover_bytes = nbytes;
7200           while (nbytes-- > 0)
7201             *p++ = *src++;
7202         }
7203       coding->consumed = coding->src_bytes;
7204     }
7205
7206   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7207       && !inhibit_eol_conversion)
7208     decode_eol (coding);
7209   if (BUFFERP (coding->dst_object))
7210     {
7211       current_buffer->undo_list = undo_list;
7212       record_insert (coding->dst_pos, coding->produced_char);
7213     }
7214   return coding->result;
7215 }
7216
7217
7218 /* Extract an annotation datum from a composition starting at POS and
7219    ending before LIMIT of CODING->src_object (buffer or string), store
7220    the data in BUF, set *STOP to a starting position of the next
7221    composition (if any) or to LIMIT, and return the address of the
7222    next element of BUF.
7223
7224    If such an annotation is not found, set *STOP to a starting
7225    position of a composition after POS (if any) or to LIMIT, and
7226    return BUF.  */
7227
7228 static INLINE int *
7229 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7230                                struct coding_system *coding, int *buf,
7231                                EMACS_INT *stop)
7232 {
7233   EMACS_INT start, end;
7234   Lisp_Object prop;
7235
7236   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7237       || end > limit)
7238     *stop = limit;
7239   else if (start > pos)
7240     *stop = start;
7241   else
7242     {
7243       if (start == pos)
7244         {
7245           /* We found a composition.  Store the corresponding
7246              annotation data in BUF.  */
7247           int *head = buf;
7248           enum composition_method method = COMPOSITION_METHOD (prop);
7249           int nchars = COMPOSITION_LENGTH (prop);
7250
7251           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7252           if (method != COMPOSITION_RELATIVE)
7253             {
7254               Lisp_Object components;
7255               int len, i, i_byte;
7256
7257               components = COMPOSITION_COMPONENTS (prop);
7258               if (VECTORP (components))
7259                 {
7260                   len = XVECTOR (components)->size;
7261                   for (i = 0; i < len; i++)
7262                     *buf++ = XINT (AREF (components, i));
7263                 }
7264               else if (STRINGP (components))
7265                 {
7266                   len = SCHARS (components);
7267                   i = i_byte = 0;
7268                   while (i < len)
7269                     {
7270                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7271                       buf++;
7272                     }
7273                 }
7274               else if (INTEGERP (components))
7275                 {
7276                   len = 1;
7277                   *buf++ = XINT (components);
7278                 }
7279               else if (CONSP (components))
7280                 {
7281                   for (len = 0; CONSP (components);
7282                        len++, components = XCDR (components))
7283                     *buf++ = XINT (XCAR (components));
7284                 }
7285               else
7286                 abort ();
7287               *head -= len;
7288             }
7289         }
7290
7291       if (find_composition (end, limit, &start, &end, &prop,
7292                             coding->src_object)
7293           && end <= limit)
7294         *stop = start;
7295       else
7296         *stop = limit;
7297     }
7298   return buf;
7299 }
7300
7301
7302 /* Extract an annotation datum from a text property `charset' at POS of
7303    CODING->src_object (buffer of string), store the data in BUF, set
7304    *STOP to the position where the value of `charset' property changes
7305    (limiting by LIMIT), and return the address of the next element of
7306    BUF.
7307
7308    If the property value is nil, set *STOP to the position where the
7309    property value is non-nil (limiting by LIMIT), and return BUF.  */
7310
7311 static INLINE int *
7312 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7313                            struct coding_system *coding, int *buf,
7314                            EMACS_INT *stop)
7315 {
7316   Lisp_Object val, next;
7317   int id;
7318
7319   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7320   if (! NILP (val) && CHARSETP (val))
7321     id = XINT (CHARSET_SYMBOL_ID (val));
7322   else
7323     id = -1;
7324   ADD_CHARSET_DATA (buf, 0, id);
7325   next = Fnext_single_property_change (make_number (pos), Qcharset,
7326                                        coding->src_object,
7327                                        make_number (limit));
7328   *stop = XINT (next);
7329   return buf;
7330 }
7331
7332
7333 static void
7334 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7335                int max_lookup)
7336 {
7337   int *buf = coding->charbuf;
7338   int *buf_end = coding->charbuf + coding->charbuf_size;
7339   const unsigned char *src = coding->source + coding->consumed;
7340   const unsigned char *src_end = coding->source + coding->src_bytes;
7341   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7342   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7343   int multibytep = coding->src_multibyte;
7344   Lisp_Object eol_type;
7345   int c;
7346   EMACS_INT stop, stop_composition, stop_charset;
7347   int *lookup_buf = NULL;
7348
7349   if (! NILP (translation_table))
7350     lookup_buf = alloca (sizeof (int) * max_lookup);
7351
7352   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7353   if (VECTORP (eol_type))
7354     eol_type = Qunix;
7355
7356   /* Note: composition handling is not yet implemented.  */
7357   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7358
7359   if (NILP (coding->src_object))
7360     stop = stop_composition = stop_charset = end_pos;
7361   else
7362     {
7363       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7364         stop = stop_composition = pos;
7365       else
7366         stop = stop_composition = end_pos;
7367       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7368         stop = stop_charset = pos;
7369       else
7370         stop_charset = end_pos;
7371     }
7372
7373   /* Compensate for CRLF and conversion.  */
7374   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7375   while (buf < buf_end)
7376     {
7377       Lisp_Object trans;
7378
7379       if (pos == stop)
7380         {
7381           if (pos == end_pos)
7382             break;
7383           if (pos == stop_composition)
7384             buf = handle_composition_annotation (pos, end_pos, coding,
7385                                                  buf, &stop_composition);
7386           if (pos == stop_charset)
7387             buf = handle_charset_annotation (pos, end_pos, coding,
7388                                              buf, &stop_charset);
7389           stop = (stop_composition < stop_charset
7390                   ? stop_composition : stop_charset);
7391         }
7392
7393       if (! multibytep)
7394         {
7395           EMACS_INT bytes;
7396
7397           if (coding->encoder == encode_coding_raw_text
7398               || coding->encoder == encode_coding_ccl)
7399             c = *src++, pos++;
7400           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7401             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7402           else
7403             c = BYTE8_TO_CHAR (*src), src++, pos++;
7404         }
7405       else
7406         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7407       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7408         c = '\n';
7409       if (! EQ (eol_type, Qunix))
7410         {
7411           if (c == '\n')
7412             {
7413               if (EQ (eol_type, Qdos))
7414                 *buf++ = '\r';
7415               else
7416                 c = '\r';
7417             }
7418         }
7419
7420       trans = Qnil;
7421       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7422       if (NILP (trans))
7423         *buf++ = c;
7424       else
7425         {
7426           int from_nchars = 1, to_nchars = 1;
7427           int *lookup_buf_end;
7428           const unsigned char *p = src;
7429           int i;
7430
7431           lookup_buf[0] = c;
7432           for (i = 1; i < max_lookup && p < src_end; i++)
7433             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7434           lookup_buf_end = lookup_buf + i;
7435           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7436           if (INTEGERP (trans))
7437             c = XINT (trans);
7438           else if (CONSP (trans))
7439             {
7440               from_nchars = ASIZE (XCAR (trans));
7441               trans = XCDR (trans);
7442               if (INTEGERP (trans))
7443                 c = XINT (trans);
7444               else
7445                 {
7446                   to_nchars = ASIZE (trans);
7447                   if (buf + to_nchars > buf_end)
7448                     break;
7449                   c = XINT (AREF (trans, 0));
7450                 }
7451             }
7452           else
7453             break;
7454           *buf++ = c;
7455           for (i = 1; i < to_nchars; i++)
7456             *buf++ = XINT (AREF (trans, i));
7457           for (i = 1; i < from_nchars; i++, pos++)
7458             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7459         }
7460     }
7461
7462   coding->consumed = src - coding->source;
7463   coding->consumed_char = pos - coding->src_pos;
7464   coding->charbuf_used = buf - coding->charbuf;
7465   coding->chars_at_source = 0;
7466 }
7467
7468
7469 /* Encode the text at CODING->src_object into CODING->dst_object.
7470    CODING->src_object is a buffer or a string.
7471    CODING->dst_object is a buffer or nil.
7472
7473    If CODING->src_object is a buffer, it must be the current buffer.
7474    In this case, if CODING->src_pos is positive, it is a position of
7475    the source text in the buffer, otherwise. the source text is in the
7476    gap area of the buffer, and coding->src_pos specifies the offset of
7477    the text from GPT (which must be the same as PT).  If this is the
7478    same buffer as CODING->dst_object, CODING->src_pos must be
7479    negative and CODING should not have `pre-write-conversion'.
7480
7481    If CODING->src_object is a string, CODING should not have
7482    `pre-write-conversion'.
7483
7484    If CODING->dst_object is a buffer, the encoded data is inserted at
7485    the current point of that buffer.
7486
7487    If CODING->dst_object is nil, the encoded data is placed at the
7488    memory area specified by CODING->destination.  */
7489
7490 static int
7491 encode_coding (struct coding_system *coding)
7492 {
7493   Lisp_Object attrs;
7494   Lisp_Object translation_table;
7495   int max_lookup;
7496   struct ccl_spec cclspec;
7497
7498   attrs = CODING_ID_ATTRS (coding->id);
7499   if (coding->encoder == encode_coding_raw_text)
7500     translation_table = Qnil, max_lookup = 0;
7501   else
7502     translation_table = get_translation_table (attrs, 1, &max_lookup);
7503
7504   if (BUFFERP (coding->dst_object))
7505     {
7506       set_buffer_internal (XBUFFER (coding->dst_object));
7507       coding->dst_multibyte
7508         = ! NILP (current_buffer->enable_multibyte_characters);
7509     }
7510
7511   coding->consumed = coding->consumed_char = 0;
7512   coding->produced = coding->produced_char = 0;
7513   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7514   coding->errors = 0;
7515
7516   ALLOC_CONVERSION_WORK_AREA (coding);
7517
7518   if (coding->encoder == encode_coding_ccl)
7519     {
7520       coding->spec.ccl = &cclspec;
7521       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7522     }
7523   do {
7524     coding_set_source (coding);
7525     consume_chars (coding, translation_table, max_lookup);
7526     coding_set_destination (coding);
7527     (*(coding->encoder)) (coding);
7528   } while (coding->consumed_char < coding->src_chars);
7529
7530   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7531     insert_from_gap (coding->produced_char, coding->produced);
7532
7533   return (coding->result);
7534 }
7535
7536
7537 /* Name (or base name) of work buffer for code conversion.  */
7538 static Lisp_Object Vcode_conversion_workbuf_name;
7539
7540 /* A working buffer used by the top level conversion.  Once it is
7541    created, it is never destroyed.  It has the name
7542    Vcode_conversion_workbuf_name.  The other working buffers are
7543    destroyed after the use is finished, and their names are modified
7544    versions of Vcode_conversion_workbuf_name.  */
7545 static Lisp_Object Vcode_conversion_reused_workbuf;
7546
7547 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
7548 static int reused_workbuf_in_use;
7549
7550
7551 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7552    multibyteness of returning buffer.  */
7553
7554 static Lisp_Object
7555 make_conversion_work_buffer (int multibyte)
7556 {
7557   Lisp_Object name, workbuf;
7558   struct buffer *current;
7559
7560   if (reused_workbuf_in_use++)
7561     {
7562       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7563       workbuf = Fget_buffer_create (name);
7564     }
7565   else
7566     {
7567       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7568         Vcode_conversion_reused_workbuf
7569           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7570       workbuf = Vcode_conversion_reused_workbuf;
7571     }
7572   current = current_buffer;
7573   set_buffer_internal (XBUFFER (workbuf));
7574   /* We can't allow modification hooks to run in the work buffer.  For
7575      instance, directory_files_internal assumes that file decoding
7576      doesn't compile new regexps.  */
7577   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7578   Ferase_buffer ();
7579   current_buffer->undo_list = Qt;
7580   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7581   set_buffer_internal (current);
7582   return workbuf;
7583 }
7584
7585
7586 static Lisp_Object
7587 code_conversion_restore (Lisp_Object arg)
7588 {
7589   Lisp_Object current, workbuf;
7590   struct gcpro gcpro1;
7591
7592   GCPRO1 (arg);
7593   current = XCAR (arg);
7594   workbuf = XCDR (arg);
7595   if (! NILP (workbuf))
7596     {
7597       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7598         reused_workbuf_in_use = 0;
7599       else if (! NILP (Fbuffer_live_p (workbuf)))
7600         Fkill_buffer (workbuf);
7601     }
7602   set_buffer_internal (XBUFFER (current));
7603   UNGCPRO;
7604   return Qnil;
7605 }
7606
7607 Lisp_Object
7608 code_conversion_save (int with_work_buf, int multibyte)
7609 {
7610   Lisp_Object workbuf = Qnil;
7611
7612   if (with_work_buf)
7613     workbuf = make_conversion_work_buffer (multibyte);
7614   record_unwind_protect (code_conversion_restore,
7615                          Fcons (Fcurrent_buffer (), workbuf));
7616   return workbuf;
7617 }
7618
7619 int
7620 decode_coding_gap (struct coding_system *coding,
7621                    EMACS_INT chars, EMACS_INT bytes)
7622 {
7623   int count = SPECPDL_INDEX ();
7624   Lisp_Object attrs;
7625
7626   code_conversion_save (0, 0);
7627
7628   coding->src_object = Fcurrent_buffer ();
7629   coding->src_chars = chars;
7630   coding->src_bytes = bytes;
7631   coding->src_pos = -chars;
7632   coding->src_pos_byte = -bytes;
7633   coding->src_multibyte = chars < bytes;
7634   coding->dst_object = coding->src_object;
7635   coding->dst_pos = PT;
7636   coding->dst_pos_byte = PT_BYTE;
7637   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7638
7639   if (CODING_REQUIRE_DETECTION (coding))
7640     detect_coding (coding);
7641
7642   coding->mode |= CODING_MODE_LAST_BLOCK;
7643   current_buffer->text->inhibit_shrinking = 1;
7644   decode_coding (coding);
7645   current_buffer->text->inhibit_shrinking = 0;
7646
7647   attrs = CODING_ID_ATTRS (coding->id);
7648   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7649     {
7650       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7651       Lisp_Object val;
7652
7653       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7654       val = call1 (CODING_ATTR_POST_READ (attrs),
7655                    make_number (coding->produced_char));
7656       CHECK_NATNUM (val);
7657       coding->produced_char += Z - prev_Z;
7658       coding->produced += Z_BYTE - prev_Z_BYTE;
7659     }
7660
7661   unbind_to (count, Qnil);
7662   return coding->result;
7663 }
7664
7665 int
7666 encode_coding_gap (struct coding_system *coding,
7667                    EMACS_INT chars, EMACS_INT bytes)
7668 {
7669   int count = SPECPDL_INDEX ();
7670
7671   code_conversion_save (0, 0);
7672
7673   coding->src_object = Fcurrent_buffer ();
7674   coding->src_chars = chars;
7675   coding->src_bytes = bytes;
7676   coding->src_pos = -chars;
7677   coding->src_pos_byte = -bytes;
7678   coding->src_multibyte = chars < bytes;
7679   coding->dst_object = coding->src_object;
7680   coding->dst_pos = PT;
7681   coding->dst_pos_byte = PT_BYTE;
7682
7683   encode_coding (coding);
7684
7685   unbind_to (count, Qnil);
7686   return coding->result;
7687 }
7688
7689
7690 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7691    SRC_OBJECT into DST_OBJECT by coding context CODING.
7692
7693    SRC_OBJECT is a buffer, a string, or Qnil.
7694
7695    If it is a buffer, the text is at point of the buffer.  FROM and TO
7696    are positions in the buffer.
7697
7698    If it is a string, the text is at the beginning of the string.
7699    FROM and TO are indices to the string.
7700
7701    If it is nil, the text is at coding->source.  FROM and TO are
7702    indices to coding->source.
7703
7704    DST_OBJECT is a buffer, Qt, or Qnil.
7705
7706    If it is a buffer, the decoded text is inserted at point of the
7707    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7708    is deleted.
7709
7710    If it is Qt, a string is made from the decoded text, and
7711    set in CODING->dst_object.
7712
7713    If it is Qnil, the decoded text is stored at CODING->destination.
7714    The caller must allocate CODING->dst_bytes bytes at
7715    CODING->destination by xmalloc.  If the decoded text is longer than
7716    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7717  */
7718
7719 void
7720 decode_coding_object (struct coding_system *coding,
7721                       Lisp_Object src_object,
7722                       EMACS_INT from, EMACS_INT from_byte,
7723                       EMACS_INT to, EMACS_INT to_byte,
7724                       Lisp_Object dst_object)
7725 {
7726   int count = SPECPDL_INDEX ();
7727   unsigned char *destination;
7728   EMACS_INT dst_bytes;
7729   EMACS_INT chars = to - from;
7730   EMACS_INT bytes = to_byte - from_byte;
7731   Lisp_Object attrs;
7732   int saved_pt = -1, saved_pt_byte;
7733   int need_marker_adjustment = 0;
7734   Lisp_Object old_deactivate_mark;
7735
7736   old_deactivate_mark = Vdeactivate_mark;
7737
7738   if (NILP (dst_object))
7739     {
7740       destination = coding->destination;
7741       dst_bytes = coding->dst_bytes;
7742     }
7743
7744   coding->src_object = src_object;
7745   coding->src_chars = chars;
7746   coding->src_bytes = bytes;
7747   coding->src_multibyte = chars < bytes;
7748
7749   if (STRINGP (src_object))
7750     {
7751       coding->src_pos = from;
7752       coding->src_pos_byte = from_byte;
7753     }
7754   else if (BUFFERP (src_object))
7755     {
7756       set_buffer_internal (XBUFFER (src_object));
7757       if (from != GPT)
7758         move_gap_both (from, from_byte);
7759       if (EQ (src_object, dst_object))
7760         {
7761           struct Lisp_Marker *tail;
7762
7763           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7764             {
7765               tail->need_adjustment
7766                 = tail->charpos == (tail->insertion_type ? from : to);
7767               need_marker_adjustment |= tail->need_adjustment;
7768             }
7769           saved_pt = PT, saved_pt_byte = PT_BYTE;
7770           TEMP_SET_PT_BOTH (from, from_byte);
7771           current_buffer->text->inhibit_shrinking = 1;
7772           del_range_both (from, from_byte, to, to_byte, 1);
7773           coding->src_pos = -chars;
7774           coding->src_pos_byte = -bytes;
7775         }
7776       else
7777         {
7778           coding->src_pos = from;
7779           coding->src_pos_byte = from_byte;
7780         }
7781     }
7782
7783   if (CODING_REQUIRE_DETECTION (coding))
7784     detect_coding (coding);
7785   attrs = CODING_ID_ATTRS (coding->id);
7786
7787   if (EQ (dst_object, Qt)
7788       || (! NILP (CODING_ATTR_POST_READ (attrs))
7789           && NILP (dst_object)))
7790     {
7791       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7792       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7793       coding->dst_pos = BEG;
7794       coding->dst_pos_byte = BEG_BYTE;
7795     }
7796   else if (BUFFERP (dst_object))
7797     {
7798       code_conversion_save (0, 0);
7799       coding->dst_object = dst_object;
7800       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7801       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7802       coding->dst_multibyte
7803         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7804     }
7805   else
7806     {
7807       code_conversion_save (0, 0);
7808       coding->dst_object = Qnil;
7809       /* Most callers presume this will return a multibyte result, and they
7810          won't use `binary' or `raw-text' anyway, so let's not worry about
7811          CODING_FOR_UNIBYTE.  */
7812       coding->dst_multibyte = 1;
7813     }
7814
7815   decode_coding (coding);
7816
7817   if (BUFFERP (coding->dst_object))
7818     set_buffer_internal (XBUFFER (coding->dst_object));
7819
7820   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7821     {
7822       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7823       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7824       Lisp_Object val;
7825
7826       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7827       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7828               old_deactivate_mark);
7829       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7830                         make_number (coding->produced_char));
7831       UNGCPRO;
7832       CHECK_NATNUM (val);
7833       coding->produced_char += Z - prev_Z;
7834       coding->produced += Z_BYTE - prev_Z_BYTE;
7835     }
7836
7837   if (EQ (dst_object, Qt))
7838     {
7839       coding->dst_object = Fbuffer_string ();
7840     }
7841   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7842     {
7843       set_buffer_internal (XBUFFER (coding->dst_object));
7844       if (dst_bytes < coding->produced)
7845         {
7846           destination = xrealloc (destination, coding->produced);
7847           if (! destination)
7848             {
7849               record_conversion_result (coding,
7850                                         CODING_RESULT_INSUFFICIENT_MEM);
7851               unbind_to (count, Qnil);
7852               return;
7853             }
7854           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7855             move_gap_both (BEGV, BEGV_BYTE);
7856           memcpy (destination, BEGV_ADDR, coding->produced);
7857           coding->destination = destination;
7858         }
7859     }
7860
7861   if (saved_pt >= 0)
7862     {
7863       /* This is the case of:
7864          (BUFFERP (src_object) && EQ (src_object, dst_object))
7865          As we have moved PT while replacing the original buffer
7866          contents, we must recover it now.  */
7867       set_buffer_internal (XBUFFER (src_object));
7868       current_buffer->text->inhibit_shrinking = 0;
7869       if (saved_pt < from)
7870         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7871       else if (saved_pt < from + chars)
7872         TEMP_SET_PT_BOTH (from, from_byte);
7873       else if (! NILP (current_buffer->enable_multibyte_characters))
7874         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7875                           saved_pt_byte + (coding->produced - bytes));
7876       else
7877         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7878                           saved_pt_byte + (coding->produced - bytes));
7879
7880       if (need_marker_adjustment)
7881         {
7882           struct Lisp_Marker *tail;
7883
7884           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7885             if (tail->need_adjustment)
7886               {
7887                 tail->need_adjustment = 0;
7888                 if (tail->insertion_type)
7889                   {
7890                     tail->bytepos = from_byte;
7891                     tail->charpos = from;
7892                   }
7893                 else
7894                   {
7895                     tail->bytepos = from_byte + coding->produced;
7896                     tail->charpos
7897                       = (NILP (current_buffer->enable_multibyte_characters)
7898                          ? tail->bytepos : from + coding->produced_char);
7899                   }
7900               }
7901         }
7902     }
7903
7904   Vdeactivate_mark = old_deactivate_mark;
7905   unbind_to (count, coding->dst_object);
7906 }
7907
7908
7909 void
7910 encode_coding_object (struct coding_system *coding,
7911                       Lisp_Object src_object,
7912                       EMACS_INT from, EMACS_INT from_byte,
7913                       EMACS_INT to, EMACS_INT to_byte,
7914                       Lisp_Object dst_object)
7915 {
7916   int count = SPECPDL_INDEX ();
7917   EMACS_INT chars = to - from;
7918   EMACS_INT bytes = to_byte - from_byte;
7919   Lisp_Object attrs;
7920   int saved_pt = -1, saved_pt_byte;
7921   int need_marker_adjustment = 0;
7922   int kill_src_buffer = 0;
7923   Lisp_Object old_deactivate_mark;
7924
7925   old_deactivate_mark = Vdeactivate_mark;
7926
7927   coding->src_object = src_object;
7928   coding->src_chars = chars;
7929   coding->src_bytes = bytes;
7930   coding->src_multibyte = chars < bytes;
7931
7932   attrs = CODING_ID_ATTRS (coding->id);
7933
7934   if (EQ (src_object, dst_object))
7935     {
7936       struct Lisp_Marker *tail;
7937
7938       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7939         {
7940           tail->need_adjustment
7941             = tail->charpos == (tail->insertion_type ? from : to);
7942           need_marker_adjustment |= tail->need_adjustment;
7943         }
7944     }
7945
7946   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7947     {
7948       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7949       set_buffer_internal (XBUFFER (coding->src_object));
7950       if (STRINGP (src_object))
7951         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7952       else if (BUFFERP (src_object))
7953         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7954       else
7955         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7956
7957       if (EQ (src_object, dst_object))
7958         {
7959           set_buffer_internal (XBUFFER (src_object));
7960           saved_pt = PT, saved_pt_byte = PT_BYTE;
7961           del_range_both (from, from_byte, to, to_byte, 1);
7962           set_buffer_internal (XBUFFER (coding->src_object));
7963         }
7964
7965       {
7966         Lisp_Object args[3];
7967         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7968
7969         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7970                 old_deactivate_mark);
7971         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7972         args[1] = make_number (BEG);
7973         args[2] = make_number (Z);
7974         safe_call (3, args);
7975         UNGCPRO;
7976       }
7977       if (XBUFFER (coding->src_object) != current_buffer)
7978         kill_src_buffer = 1;
7979       coding->src_object = Fcurrent_buffer ();
7980       if (BEG != GPT)
7981         move_gap_both (BEG, BEG_BYTE);
7982       coding->src_chars = Z - BEG;
7983       coding->src_bytes = Z_BYTE - BEG_BYTE;
7984       coding->src_pos = BEG;
7985       coding->src_pos_byte = BEG_BYTE;
7986       coding->src_multibyte = Z < Z_BYTE;
7987     }
7988   else if (STRINGP (src_object))
7989     {
7990       code_conversion_save (0, 0);
7991       coding->src_pos = from;
7992       coding->src_pos_byte = from_byte;
7993     }
7994   else if (BUFFERP (src_object))
7995     {
7996       code_conversion_save (0, 0);
7997       set_buffer_internal (XBUFFER (src_object));
7998       if (EQ (src_object, dst_object))
7999         {
8000           saved_pt = PT, saved_pt_byte = PT_BYTE;
8001           coding->src_object = del_range_1 (from, to, 1, 1);
8002           coding->src_pos = 0;
8003           coding->src_pos_byte = 0;
8004         }
8005       else
8006         {
8007           if (from < GPT && to >= GPT)
8008             move_gap_both (from, from_byte);
8009           coding->src_pos = from;
8010           coding->src_pos_byte = from_byte;
8011         }
8012     }
8013   else
8014     code_conversion_save (0, 0);
8015
8016   if (BUFFERP (dst_object))
8017     {
8018       coding->dst_object = dst_object;
8019       if (EQ (src_object, dst_object))
8020         {
8021           coding->dst_pos = from;
8022           coding->dst_pos_byte = from_byte;
8023         }
8024       else
8025         {
8026           struct buffer *current = current_buffer;
8027
8028           set_buffer_temp (XBUFFER (dst_object));
8029           coding->dst_pos = PT;
8030           coding->dst_pos_byte = PT_BYTE;
8031           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8032           set_buffer_temp (current);
8033         }
8034       coding->dst_multibyte
8035         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8036     }
8037   else if (EQ (dst_object, Qt))
8038     {
8039       coding->dst_object = Qnil;
8040       coding->dst_bytes = coding->src_chars;
8041       if (coding->dst_bytes == 0)
8042         coding->dst_bytes = 1;
8043       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8044       coding->dst_multibyte = 0;
8045     }
8046   else
8047     {
8048       coding->dst_object = Qnil;
8049       coding->dst_multibyte = 0;
8050     }
8051
8052   encode_coding (coding);
8053
8054   if (EQ (dst_object, Qt))
8055     {
8056       if (BUFFERP (coding->dst_object))
8057         coding->dst_object = Fbuffer_string ();
8058       else
8059         {
8060           coding->dst_object
8061             = make_unibyte_string ((char *) coding->destination,
8062                                    coding->produced);
8063           xfree (coding->destination);
8064         }
8065     }
8066
8067   if (saved_pt >= 0)
8068     {
8069       /* This is the case of:
8070          (BUFFERP (src_object) && EQ (src_object, dst_object))
8071          As we have moved PT while replacing the original buffer
8072          contents, we must recover it now.  */
8073       set_buffer_internal (XBUFFER (src_object));
8074       if (saved_pt < from)
8075         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8076       else if (saved_pt < from + chars)
8077         TEMP_SET_PT_BOTH (from, from_byte);
8078       else if (! NILP (current_buffer->enable_multibyte_characters))
8079         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8080                           saved_pt_byte + (coding->produced - bytes));
8081       else
8082         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8083                           saved_pt_byte + (coding->produced - bytes));
8084
8085       if (need_marker_adjustment)
8086         {
8087           struct Lisp_Marker *tail;
8088
8089           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8090             if (tail->need_adjustment)
8091               {
8092                 tail->need_adjustment = 0;
8093                 if (tail->insertion_type)
8094                   {
8095                     tail->bytepos = from_byte;
8096                     tail->charpos = from;
8097                   }
8098                 else
8099                   {
8100                     tail->bytepos = from_byte + coding->produced;
8101                     tail->charpos
8102                       = (NILP (current_buffer->enable_multibyte_characters)
8103                          ? tail->bytepos : from + coding->produced_char);
8104                   }
8105               }
8106         }
8107     }
8108
8109   if (kill_src_buffer)
8110     Fkill_buffer (coding->src_object);
8111
8112   Vdeactivate_mark = old_deactivate_mark;
8113   unbind_to (count, Qnil);
8114 }
8115
8116
8117 Lisp_Object
8118 preferred_coding_system (void)
8119 {
8120   int id = coding_categories[coding_priorities[0]].id;
8121
8122   return CODING_ID_NAME (id);
8123 }
8124
8125 \f
8126 #ifdef emacs
8127 /*** 8. Emacs Lisp library functions ***/
8128
8129 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8130        doc: /* Return t if OBJECT is nil or a coding-system.
8131 See the documentation of `define-coding-system' for information
8132 about coding-system objects.  */)
8133   (Lisp_Object object)
8134 {
8135   if (NILP (object)
8136       || CODING_SYSTEM_ID (object) >= 0)
8137     return Qt;
8138   if (! SYMBOLP (object)
8139       || NILP (Fget (object, Qcoding_system_define_form)))
8140     return Qnil;
8141   return Qt;
8142 }
8143
8144 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8145        Sread_non_nil_coding_system, 1, 1, 0,
8146        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8147   (Lisp_Object prompt)
8148 {
8149   Lisp_Object val;
8150   do
8151     {
8152       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8153                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8154     }
8155   while (SCHARS (val) == 0);
8156   return (Fintern (val, Qnil));
8157 }
8158
8159 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8160        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8161 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8162 Ignores case when completing coding systems (all Emacs coding systems
8163 are lower-case).  */)
8164   (Lisp_Object prompt, Lisp_Object default_coding_system)
8165 {
8166   Lisp_Object val;
8167   int count = SPECPDL_INDEX ();
8168
8169   if (SYMBOLP (default_coding_system))
8170     default_coding_system = SYMBOL_NAME (default_coding_system);
8171   specbind (Qcompletion_ignore_case, Qt);
8172   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8173                           Qt, Qnil, Qcoding_system_history,
8174                           default_coding_system, Qnil);
8175   unbind_to (count, Qnil);
8176   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8177 }
8178
8179 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8180        1, 1, 0,
8181        doc: /* Check validity of CODING-SYSTEM.
8182 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8183 It is valid if it is nil or a symbol defined as a coding system by the
8184 function `define-coding-system'.  */)
8185   (Lisp_Object coding_system)
8186 {
8187   Lisp_Object define_form;
8188
8189   define_form = Fget (coding_system, Qcoding_system_define_form);
8190   if (! NILP (define_form))
8191     {
8192       Fput (coding_system, Qcoding_system_define_form, Qnil);
8193       safe_eval (define_form);
8194     }
8195   if (!NILP (Fcoding_system_p (coding_system)))
8196     return coding_system;
8197   xsignal1 (Qcoding_system_error, coding_system);
8198 }
8199
8200 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.
   SRC_CHARS is recorded as the character count of the source.  If
   HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems (or nil if no candidate
   was produced).  Otherwise return a list of detected coding systems
   sorted by their priorities.  If MULTIBYTEP is nonzero, it is
   assumed that the bytes are in correct multibyte form but contain
   only ASCII and eight-bit chars.  Otherwise, the bytes are raw
   bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      EMACS_INT src_chars, EMACS_INT src_bytes,
                      int highest, int multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* List of candidates.  It holds coding system ids (as Lisp
     integers) until the eol pass at the end replaces them by
     coding system names.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  int id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  int null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING_SYSTEM means "detect everything".  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category;
      struct coding_system *this;
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes seen before the first
         eight-bit byte.  The scan stops early once both a null byte
         and an eight-bit byte have been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, mark every category
                     except the UTF-16 ones as checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Run the detectors of the not-yet-checked categories
                 in priority order.  When HIGHEST is set, stop at the
                 first category that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the auto-detecting UTF-16 category,
                         narrow CATEGORY down to the endianness that
                         was actually found.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: the
             only candidate is `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was rejected and nothing was found: keep the
             coding system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS are the values left by the loops
                 above.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found; take the first category
               in priority order that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Build the list back to front.  First push the categories
             that were neither rejected nor found...  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* ...then push the found categories, which therefore end
             up in front of them.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between the with-signature and
         without-signature variants of UTF-8.  VAL stays nil if the
         detector fails.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  VAL stays nil if the
         detector fails.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified; use it as is.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means the eol format was not detected for that family.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format of the requested coding system is not
           fixed; detect it separately for non-UTF-16 text, UTF-16BE
           and UTF-16LE, as needed by the categories found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format was specified by the caller; use it for
           every candidate.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding system name: the eol
       variant taken from the candidate's eol_type vector (index 0
       for LF, 1 for CRLF, 2 for CR) when one was determined,
       otherwise the plain name of the coding system.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8521
8522
8523 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8524        2, 3, 0,
8525        doc: /* Detect coding system of the text in the region between START and END.
8526 Return a list of possible coding systems ordered by priority.
8527 The coding systems to try and their priorities follows what
8528 the function `coding-system-priority-list' (which see) returns.
8529
8530 If only ASCII characters are found (except for such ISO-2022 control
8531 characters as ESC), it returns a list of single element `undecided'
8532 or its subsidiary coding system according to a detected end-of-line
8533 format.
8534
8535 If optional argument HIGHEST is non-nil, return the coding system of
8536 highest priority.  */)
8537   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8538 {
8539   int from, to;
8540   int from_byte, to_byte;
8541
8542   CHECK_NUMBER_COERCE_MARKER (start);
8543   CHECK_NUMBER_COERCE_MARKER (end);
8544
8545   validate_region (&start, &end);
8546   from = XINT (start), to = XINT (end);
8547   from_byte = CHAR_TO_BYTE (from);
8548   to_byte = CHAR_TO_BYTE (to);
8549
8550   if (from < GPT && to >= GPT)
8551     move_gap_both (to, to_byte);
8552
8553   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8554                                to - from, to_byte - from_byte,
8555                                !NILP (highest),
8556                                !NILP (current_buffer
8557                                       ->enable_multibyte_characters),
8558                                Qnil);
8559 }
8560
8561 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8562        1, 2, 0,
8563        doc: /* Detect coding system of the text in STRING.
8564 Return a list of possible coding systems ordered by priority.
8565 The coding systems to try and their priorities follows what
8566 the function `coding-system-priority-list' (which see) returns.
8567
8568 If only ASCII characters are found (except for such ISO-2022 control
8569 characters as ESC), it returns a list of single element `undecided'
8570 or its subsidiary coding system according to a detected end-of-line
8571 format.
8572
8573 If optional argument HIGHEST is non-nil, return the coding system of
8574 highest priority.  */)
8575   (Lisp_Object string, Lisp_Object highest)
8576 {
8577   CHECK_STRING (string);
8578
8579   return detect_coding_system (SDATA (string),
8580                                SCHARS (string), SBYTES (string),
8581                                !NILP (highest), STRING_MULTIBYTE (string),
8582                                Qnil);
8583 }
8584
8585
8586 static INLINE int
8587 char_encodable_p (int c, Lisp_Object attrs)
8588 {
8589   Lisp_Object tail;
8590   struct charset *charset;
8591   Lisp_Object translation_table;
8592
8593   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8594   if (! NILP (translation_table))
8595     c = translate_char (translation_table, c);
8596   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8597        CONSP (tail); tail = XCDR (tail))
8598     {
8599       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8600       if (CHAR_CHARSET_P (c, charset))
8601         break;
8602     }
8603   return (! NILP (tail));
8604 }
8605
8606
/* Return a list of coding systems that safely encode the text between
   START and END.  If START is a string, the text is the contents of
   that string and END is not consulted.  If EXCLUDE is non-nil, it is
   a list of coding systems not to check.  The returned list doesn't
   contain any such coding systems.  In any case, if the text contains
   only ASCII or is unibyte, return t.  Otherwise the returned list
   always contains `raw-text' and `no-conversion'.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  EMACS_INT start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte string whose character and
         byte counts agree, can be encoded by anything.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* Same number of characters and bytes: nothing to check.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* If the gap is strictly inside the region, move it to
         whichever end of the region is nearer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of the candidates: every coding
     system in Vcoding_system_list that is not in EXCLUDE, is its own
     base coding system, and is not of type `undecided'.  The
     translation-table slot of each collected vector is overwritten
     with the value of get_translation_table.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
            && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII bytes at the head and at the tail of the text.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  /* WORK_TABLE records the characters that have been checked
     already, so that each distinct character is tested only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop from CODING_ATTRS_LIST every candidate that cannot
             encode C.  A cell is removed in place by copying its
             successor into it; the last cell cannot be removed that
             way, so its car is set to nil instead.  TAIL is not
             advanced after an in-place removal because the cell now
             holds the next candidate.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* NOTE(review): presumably loading a charset map can
                 relocate the text being scanned, hence the pointers
                 are recomputed from their offsets -- confirm.  */
              EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* `raw-text' and `no-conversion' are always included; the names of
     the surviving candidates are pushed in front of them.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
8734
8735
DEFUN ("unencodable-char-position", Funencodable_char_position,
       Sunencodable_char_position, 3, 5, 0,
       doc: /*
Return position of first un-encodable character in a region.
START and END specify the region and CODING-SYSTEM specifies the
encoding to check.  Return nil if CODING-SYSTEM does encode the region.

If optional 4th argument COUNT is non-nil, it specifies at most how
many un-encodable characters to search.  In this case, the value is a
list of positions.

If optional 5th argument STRING is non-nil, it is a string to search
for un-encodable characters.  In that case, START and END are indexes
to the string.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
{
  /* Number of unencodable positions still to be collected.  */
  int n;
  struct coding_system coding;
  Lisp_Object attrs, charset_list, translation_table;
  /* Positions found so far, most recent first.  */
  Lisp_Object positions;
  /* FROM is advanced together with P and is the character position
     (or string index) of the character P points at.  */
  int from, to;
  /* P scans the text, PEND is the end of the text, and STOP is where
     the current contiguous run ends (the gap or PEND).  */
  const unsigned char *p, *stop, *pend;
  int ascii_compatible;

  setup_coding_system (Fcheck_coding_system (coding_system), &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  /* A coding system of type raw-text never reports a position.  */
  if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
    return Qnil;
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  translation_table = get_translation_table (attrs, 1, NULL);

  if (NILP (string))
    {
      /* Scan the current buffer.  */
      validate_region (&start, &end);
      from = XINT (start);
      to = XINT (end);
      /* Nothing to report for a unibyte buffer, or when the coding
         system is ASCII compatible and the region has as many bytes
         as characters.  */
      if (NILP (current_buffer->enable_multibyte_characters)
          || (ascii_compatible
              && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
        return Qnil;
      p = CHAR_POS_ADDR (from);
      pend = CHAR_POS_ADDR (to);
      /* If the region starts before the gap and reaches it, scan up
         to the gap first; the loop below then resumes at
         GAP_END_ADDR.  */
      if (from < GPT && to >= GPT)
        stop = GPT_ADDR;
      else
        stop = pend;
    }
  else
    {
      /* Scan STRING; START and END are indexes into it.  */
      CHECK_STRING (string);
      CHECK_NATNUM (start);
      CHECK_NATNUM (end);
      from = XINT (start);
      to = XINT (end);
      if (from > to
          || to > SCHARS (string))
        args_out_of_range_3 (string, start, end);
      if (! STRING_MULTIBYTE (string))
        return Qnil;
      p = SDATA (string) + string_char_to_byte (string, from);
      stop = pend = SDATA (string) + string_char_to_byte (string, to);
      /* Same shortcut as for buffers: as many bytes as characters.  */
      if (ascii_compatible && (to - from) == (pend - p))
        return Qnil;
    }

  /* Without COUNT, stop at the first unencodable character.
     NOTE(review): a COUNT of 0 makes N start at 0, so the `n == 0'
     test below (made after decrementing) never fires and every
     unencodable position is collected -- confirm this is intended.  */
  if (NILP (count))
    n = 1;
  else
    {
      CHECK_NATNUM (count);
      n = XINT (count);
    }

  positions = Qnil;
  while (1)
    {
      int c;

      /* Skip a run of ASCII bytes, one character per byte.  */
      if (ascii_compatible)
        while (p < stop && ASCII_BYTE_P (*p))
          p++, from++;
      if (p >= stop)
        {
          if (p >= pend)
            break;
          /* We stopped at the gap; continue with the text after it.  */
          stop = pend;
          p = GAP_END_ADDR;
        }

      /* Fetch the character at P and advance P past it.  */
      c = STRING_CHAR_ADVANCE (p);
      /* C is unencodable if, after translation, char_charset finds
         no charset for it in CHARSET_LIST.  */
      if (! (ASCII_CHAR_P (c) && ascii_compatible)
          && ! char_charset (translate_char (translation_table, c),
                             charset_list, NULL))
        {
          positions = Fcons (make_number (from), positions);
          n--;
          if (n == 0)
            break;
        }

      from++;
    }

  /* A single position without COUNT, otherwise the list in
     increasing order.  */
  return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
}
8842
8843
DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
       Scheck_coding_systems_region, 3, 3, 0,
       doc: /* Check if the region is encodable by coding systems.

START and END are buffer positions specifying the region.
CODING-SYSTEM-LIST is a list of coding systems to check.

The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
whole region, POS0, POS1, ... are buffer positions where non-encodable
characters are found.

If all coding systems in CODING-SYSTEM-LIST can encode the region, the
value is nil.

START may be a string.  In that case, check if the string is
encodable, and the value contains indices to the string instead of
buffer positions.  END is ignored.

If the current buffer (or START if it is a string) is unibyte, the value
is nil.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
{
  /* Working list of (CODING-SYSTEM ATTRS POS...) entries.  */
  Lisp_Object list;
  EMACS_INT start_byte, end_byte;
  /* Character position (or string index) of the character at P.  */
  int pos;
  /* P scans from PBEG (start of the text) up to PEND.  */
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, attrs;

  if (STRINGP (start))
    {
      /* Nothing to report for a unibyte string or one with as many
         bytes as characters.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qnil;
      start_byte = 0;
      end_byte = SBYTES (start);
      pos = 0;
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qnil;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* As many bytes as characters: nothing to report.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qnil;

      /* If the gap lies strictly inside the region, move it to the
         nearer end of the region so the region can be scanned with a
         single pointer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
      pos = XINT (start);
    }

  /* Build one (CODING-SYSTEM ATTRS) entry per requested coding
     system.  Note that this stores the result of
     get_translation_table into the coding_attr_trans_tbl slot of
     ATTRS itself.  The entries end up in reverse order of
     CODING_SYSTEM_LIST.  */
  list = Qnil;
  for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
    {
      elt = XCAR (tail);
      attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
      ASET (attrs, coding_attr_trans_tbl,
            get_translation_table (attrs, 1, NULL));
      list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
    }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim ASCII bytes from both ends of the text to scan.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          /* Fetch the character at P and advance P past it.  */
          c = STRING_CHAR_ADVANCE (p);

          charset_map_loaded = 0;
          for (tail = list; CONSP (tail); tail = XCDR (tail))
            {
              /* ELT is (ATTRS POS...); a new position is pushed
                 right after ATTRS, so positions accumulate in
                 reverse order.  */
              elt = XCDR (XCAR (tail));
              if (! char_encodable_p (c, XCAR (elt)))
                XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
            }
          /* If the flag was set while checking, recompute the
             pointers from their offsets.  NOTE(review): presumably
             loading a charset map can relocate the buffer or string
             data -- confirm.  */
          if (charset_map_loaded)
            {
              EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
        }
      pos++;
    }

  /* Build the result: keep only entries that collected at least one
     position, drop ATTRS, and put the positions in increasing order.
     Consing here reverses the working list again, restoring the
     order of CODING_SYSTEM_LIST.  */
  tail = list;
  list = Qnil;
  for (; CONSP (tail); tail = XCDR (tail))
    {
      elt = XCAR (tail);
      if (CONSP (XCDR (XCDR (elt))))
        list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
                      list);
    }

  return list;
}
8967
8968
8969 Lisp_Object
8970 code_convert_region (Lisp_Object start, Lisp_Object end,
8971                      Lisp_Object coding_system, Lisp_Object dst_object,
8972                      int encodep, int norecord)
8973 {
8974   struct coding_system coding;
8975   EMACS_INT from, from_byte, to, to_byte;
8976   Lisp_Object src_object;
8977
8978   CHECK_NUMBER_COERCE_MARKER (start);
8979   CHECK_NUMBER_COERCE_MARKER (end);
8980   if (NILP (coding_system))
8981     coding_system = Qno_conversion;
8982   else
8983     CHECK_CODING_SYSTEM (coding_system);
8984   src_object = Fcurrent_buffer ();
8985   if (NILP (dst_object))
8986     dst_object = src_object;
8987   else if (! EQ (dst_object, Qt))
8988     CHECK_BUFFER (dst_object);
8989
8990   validate_region (&start, &end);
8991   from = XFASTINT (start);
8992   from_byte = CHAR_TO_BYTE (from);
8993   to = XFASTINT (end);
8994   to_byte = CHAR_TO_BYTE (to);
8995
8996   setup_coding_system (coding_system, &coding);
8997   coding.mode |= CODING_MODE_LAST_BLOCK;
8998
8999   if (encodep)
9000     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9001                           dst_object);
9002   else
9003     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9004                           dst_object);
9005   if (! norecord)
9006     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9007
9008   return (BUFFERP (dst_object)
9009           ? make_number (coding.produced_char)
9010           : coding.dst_object);
9011 }
9012
9013
9014 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9015        3, 4, "r\nzCoding system: ",
9016        doc: /* Decode the current region from the specified coding system.
9017 When called from a program, takes four arguments:
9018         START, END, CODING-SYSTEM, and DESTINATION.
9019 START and END are buffer positions.
9020
9021 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9022 If nil, the region between START and END is replaced by the decoded text.
9023 If buffer, the decoded text is inserted in that buffer after point (point
9024 does not move).
9025 In those cases, the length of the decoded text is returned.
9026 If DESTINATION is t, the decoded text is returned.
9027
9028 This function sets `last-coding-system-used' to the precise coding system
9029 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9030 not fully specified.)  */)
9031   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9032 {
9033   return code_convert_region (start, end, coding_system, destination, 0, 0);
9034 }
9035
9036 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9037        3, 4, "r\nzCoding system: ",
9038        doc: /* Encode the current region by specified coding system.
9039 When called from a program, takes four arguments:
9040         START, END, CODING-SYSTEM and DESTINATION.
9041 START and END are buffer positions.
9042
9043 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9044 If nil, the region between START and END is replace by the encoded text.
9045 If buffer, the encoded text is inserted in that buffer after point (point
9046 does not move).
9047 In those cases, the length of the encoded text is returned.
9048 If DESTINATION is t, the encoded text is returned.
9049
9050 This function sets `last-coding-system-used' to the precise coding system
9051 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9052 not fully specified.)  */)
9053   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9054 {
9055   return code_convert_region (start, end, coding_system, destination, 1, 0);
9056 }
9057
9058 Lisp_Object
9059 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9060                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9061 {
9062   struct coding_system coding;
9063   EMACS_INT chars, bytes;
9064
9065   CHECK_STRING (string);
9066   if (NILP (coding_system))
9067     {
9068       if (! norecord)
9069         Vlast_coding_system_used = Qno_conversion;
9070       if (NILP (dst_object))
9071         return (nocopy ? Fcopy_sequence (string) : string);
9072     }
9073
9074   if (NILP (coding_system))
9075     coding_system = Qno_conversion;
9076   else
9077     CHECK_CODING_SYSTEM (coding_system);
9078   if (NILP (dst_object))
9079     dst_object = Qt;
9080   else if (! EQ (dst_object, Qt))
9081     CHECK_BUFFER (dst_object);
9082
9083   setup_coding_system (coding_system, &coding);
9084   coding.mode |= CODING_MODE_LAST_BLOCK;
9085   chars = SCHARS (string);
9086   bytes = SBYTES (string);
9087   if (encodep)
9088     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9089   else
9090     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9091   if (! norecord)
9092     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9093
9094   return (BUFFERP (dst_object)
9095           ? make_number (coding.produced_char)
9096           : coding.dst_object);
9097 }
9098
9099
9100 /* Encode or decode STRING according to CODING_SYSTEM.
9101    Do not set Vlast_coding_system_used.
9102
9103    This function is called only from macros DECODE_FILE and
9104    ENCODE_FILE, thus we ignore character composition.  */
9105
9106 Lisp_Object
9107 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9108                               int encodep)
9109 {
9110   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9111 }
9112
9113
9114 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9115        2, 4, 0,
9116        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9117
9118 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9119 if the decoding operation is trivial.
9120
9121 Optional fourth arg BUFFER non-nil means that the decoded text is
9122 inserted in that buffer after point (point does not move).  In this
9123 case, the return value is the length of the decoded text.
9124
9125 This function sets `last-coding-system-used' to the precise coding system
9126 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9127 not fully specified.)  */)
9128   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9129 {
9130   return code_convert_string (string, coding_system, buffer,
9131                               0, ! NILP (nocopy), 0);
9132 }
9133
9134 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9135        2, 4, 0,
9136        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9137
9138 Optional third arg NOCOPY non-nil means it is OK to return STRING
9139 itself if the encoding operation is trivial.
9140
9141 Optional fourth arg BUFFER non-nil means that the encoded text is
9142 inserted in that buffer after point (point does not move).  In this
9143 case, the return value is the length of the encoded text.
9144
9145 This function sets `last-coding-system-used' to the precise coding system
9146 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9147 not fully specified.)  */)
9148   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9149 {
9150   return code_convert_string (string, coding_system, buffer,
9151                               1, ! NILP (nocopy), 1);
9152 }
9153
9154 \f
9155 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9156        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9157 Return the corresponding character.  */)
9158   (Lisp_Object code)
9159 {
9160   Lisp_Object spec, attrs, val;
9161   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9162   int c;
9163
9164   CHECK_NATNUM (code);
9165   c = XFASTINT (code);
9166   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9167   attrs = AREF (spec, 0);
9168
9169   if (ASCII_BYTE_P (c)
9170       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9171     return code;
9172
9173   val = CODING_ATTR_CHARSET_LIST (attrs);
9174   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9175   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9176   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9177
9178   if (c <= 0x7F)
9179     charset = charset_roman;
9180   else if (c >= 0xA0 && c < 0xDF)
9181     {
9182       charset = charset_kana;
9183       c -= 0x80;
9184     }
9185   else
9186     {
9187       int s1 = c >> 8, s2 = c & 0xFF;
9188
9189       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9190           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9191         error ("Invalid code: %d", code);
9192       SJIS_TO_JIS (c);
9193       charset = charset_kanji;
9194     }
9195   c = DECODE_CHAR (charset, c);
9196   if (c < 0)
9197     error ("Invalid code: %d", code);
9198   return make_number (c);
9199 }
9200
9201
9202 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9203        doc: /* Encode a Japanese character CH to shift_jis encoding.
9204 Return the corresponding code in SJIS.  */)
9205   (Lisp_Object ch)
9206 {
9207   Lisp_Object spec, attrs, charset_list;
9208   int c;
9209   struct charset *charset;
9210   unsigned code;
9211
9212   CHECK_CHARACTER (ch);
9213   c = XFASTINT (ch);
9214   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9215   attrs = AREF (spec, 0);
9216
9217   if (ASCII_CHAR_P (c)
9218       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9219     return ch;
9220
9221   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9222   charset = char_charset (c, charset_list, &code);
9223   if (code == CHARSET_INVALID_CODE (charset))
9224     error ("Can't encode by shift_jis encoding: %d", c);
9225   JIS_TO_SJIS (code);
9226
9227   return make_number (code);
9228 }
9229
9230 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9231        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9232 Return the corresponding character.  */)
9233   (Lisp_Object code)
9234 {
9235   Lisp_Object spec, attrs, val;
9236   struct charset *charset_roman, *charset_big5, *charset;
9237   int c;
9238
9239   CHECK_NATNUM (code);
9240   c = XFASTINT (code);
9241   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9242   attrs = AREF (spec, 0);
9243
9244   if (ASCII_BYTE_P (c)
9245       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9246     return code;
9247
9248   val = CODING_ATTR_CHARSET_LIST (attrs);
9249   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9250   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9251
9252   if (c <= 0x7F)
9253     charset = charset_roman;
9254   else
9255     {
9256       int b1 = c >> 8, b2 = c & 0x7F;
9257       if (b1 < 0xA1 || b1 > 0xFE
9258           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9259         error ("Invalid code: %d", code);
9260       charset = charset_big5;
9261     }
9262   c = DECODE_CHAR (charset, (unsigned )c);
9263   if (c < 0)
9264     error ("Invalid code: %d", code);
9265   return make_number (c);
9266 }
9267
9268 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9269        doc: /* Encode the Big5 character CH to BIG5 coding system.
9270 Return the corresponding character code in Big5.  */)
9271   (Lisp_Object ch)
9272 {
9273   Lisp_Object spec, attrs, charset_list;
9274   struct charset *charset;
9275   int c;
9276   unsigned code;
9277
9278   CHECK_CHARACTER (ch);
9279   c = XFASTINT (ch);
9280   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9281   attrs = AREF (spec, 0);
9282   if (ASCII_CHAR_P (c)
9283       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9284     return ch;
9285
9286   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9287   charset = char_charset (c, charset_list, &code);
9288   if (code == CHARSET_INVALID_CODE (charset))
9289     error ("Can't encode by Big5 encoding: %d", c);
9290
9291   return make_number (code);
9292 }
9293
9294 \f
9295 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9296        Sset_terminal_coding_system_internal, 1, 2, 0,
9297        doc: /* Internal use only.  */)
9298   (Lisp_Object coding_system, Lisp_Object terminal)
9299 {
9300   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9301   CHECK_SYMBOL (coding_system);
9302   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9303   /* We had better not send unsafe characters to terminal.  */
9304   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9305   /* Characer composition should be disabled.  */
9306   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9307   terminal_coding->src_multibyte = 1;
9308   terminal_coding->dst_multibyte = 0;
9309   return Qnil;
9310 }
9311
9312 DEFUN ("set-safe-terminal-coding-system-internal",
9313        Fset_safe_terminal_coding_system_internal,
9314        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9315        doc: /* Internal use only.  */)
9316   (Lisp_Object coding_system)
9317 {
9318   CHECK_SYMBOL (coding_system);
9319   setup_coding_system (Fcheck_coding_system (coding_system),
9320                        &safe_terminal_coding);
9321   /* Characer composition should be disabled.  */
9322   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9323   safe_terminal_coding.src_multibyte = 1;
9324   safe_terminal_coding.dst_multibyte = 0;
9325   return Qnil;
9326 }
9327
9328 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9329        Sterminal_coding_system, 0, 1, 0,
9330        doc: /* Return coding system specified for terminal output on the given terminal.
9331 TERMINAL may be a terminal object, a frame, or nil for the selected
9332 frame's terminal device.  */)
9333   (Lisp_Object terminal)
9334 {
9335   struct coding_system *terminal_coding
9336     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9337   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9338
9339   /* For backward compatibility, return nil if it is `undecided'. */
9340   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9341 }
9342
9343 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9344        Sset_keyboard_coding_system_internal, 1, 2, 0,
9345        doc: /* Internal use only.  */)
9346   (Lisp_Object coding_system, Lisp_Object terminal)
9347 {
9348   struct terminal *t = get_terminal (terminal, 1);
9349   CHECK_SYMBOL (coding_system);
9350   if (NILP (coding_system))
9351     coding_system = Qno_conversion;
9352   else
9353     Fcheck_coding_system (coding_system);
9354   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9355   /* Characer composition should be disabled.  */
9356   TERMINAL_KEYBOARD_CODING (t)->common_flags
9357     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9358   return Qnil;
9359 }
9360
9361 DEFUN ("keyboard-coding-system",
9362        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9363        doc: /* Return coding system specified for decoding keyboard input.  */)
9364   (Lisp_Object terminal)
9365 {
9366   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9367                          (get_terminal (terminal, 1))->id);
9368 }
9369
9370 \f
9371 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9372        Sfind_operation_coding_system,  1, MANY, 0,
9373        doc: /* Choose a coding system for an operation based on the target name.
9374 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9375 DECODING-SYSTEM is the coding system to use for decoding
9376 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9377 for encoding (in case OPERATION does encoding).
9378
9379 The first argument OPERATION specifies an I/O primitive:
9380   For file I/O, `insert-file-contents' or `write-region'.
9381   For process I/O, `call-process', `call-process-region', or `start-process'.
9382   For network I/O, `open-network-stream'.
9383
9384 The remaining arguments should be the same arguments that were passed
9385 to the primitive.  Depending on which primitive, one of those arguments
9386 is selected as the TARGET.  For example, if OPERATION does file I/O,
9387 whichever argument specifies the file name is TARGET.
9388
9389 TARGET has a meaning which depends on OPERATION:
9390   For file I/O, TARGET is a file name (except for the special case below).
9391   For process I/O, TARGET is a process name.
9392   For network I/O, TARGET is a service name or a port number.
9393
9394 This function looks up what is specified for TARGET in
9395 `file-coding-system-alist', `process-coding-system-alist',
9396 or `network-coding-system-alist' depending on OPERATION.
9397 They may specify a coding system, a cons of coding systems,
9398 or a function symbol to call.
9399 In the last case, we call the function with one argument,
9400 which is a list of all the arguments given to this function.
9401 If the function can't decide a coding system, it can return
9402 `undecided' so that the normal code-detection is performed.
9403
9404 If OPERATION is `insert-file-contents', the argument corresponding to
9405 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9406 file name to look up, and BUFFER is a buffer that contains the file's
9407 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9408 function to call for FILENAME, that function should examine the
9409 contents of BUFFER instead of reading the file.
9410
9411 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9412   (int nargs, Lisp_Object *args)
9413 {
9414   Lisp_Object operation, target_idx, target, val;
9415   register Lisp_Object chain;
9416
9417   if (nargs < 2)
9418     error ("Too few arguments");
9419   operation = args[0];
9420   if (!SYMBOLP (operation)
9421       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9422     error ("Invalid first argument");
9423   if (nargs < 1 + XINT (target_idx))
9424     error ("Too few arguments for operation: %s",
9425            SDATA (SYMBOL_NAME (operation)));
9426   target = args[XINT (target_idx) + 1];
9427   if (!(STRINGP (target)
9428         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9429             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9430         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9431     error ("Invalid %dth argument", XINT (target_idx) + 1);
9432   if (CONSP (target))
9433     target = XCAR (target);
9434
9435   chain = ((EQ (operation, Qinsert_file_contents)
9436             || EQ (operation, Qwrite_region))
9437            ? Vfile_coding_system_alist
9438            : (EQ (operation, Qopen_network_stream)
9439               ? Vnetwork_coding_system_alist
9440               : Vprocess_coding_system_alist));
9441   if (NILP (chain))
9442     return Qnil;
9443
9444   for (; CONSP (chain); chain = XCDR (chain))
9445     {
9446       Lisp_Object elt;
9447
9448       elt = XCAR (chain);
9449       if (CONSP (elt)
9450           && ((STRINGP (target)
9451                && STRINGP (XCAR (elt))
9452                && fast_string_match (XCAR (elt), target) >= 0)
9453               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9454         {
9455           val = XCDR (elt);
9456           /* Here, if VAL is both a valid coding system and a valid
9457              function symbol, we return VAL as a coding system.  */
9458           if (CONSP (val))
9459             return val;
9460           if (! SYMBOLP (val))
9461             return Qnil;
9462           if (! NILP (Fcoding_system_p (val)))
9463             return Fcons (val, val);
9464           if (! NILP (Ffboundp (val)))
9465             {
9466               /* We use call1 rather than safe_call1
9467                  so as to get bug reports about functions called here
9468                  which don't handle the current interface.  */
9469               val = call1 (val, Flist (nargs, args));
9470               if (CONSP (val))
9471                 return val;
9472               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9473                 return Fcons (val, val);
9474             }
9475           return Qnil;
9476         }
9477     }
9478   return Qnil;
9479 }
9480
9481 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9482        Sset_coding_system_priority, 0, MANY, 0,
9483        doc: /* Assign higher priority to the coding systems given as arguments.
9484 If multiple coding systems belong to the same category,
9485 all but the first one are ignored.
9486
9487 usage: (set-coding-system-priority &rest coding-systems)  */)
9488   (int nargs, Lisp_Object *args)
9489 {
9490   int i, j;
9491   int changed[coding_category_max];
9492   enum coding_category priorities[coding_category_max];
9493
9494   memset (changed, 0, sizeof changed);
9495
9496   for (i = j = 0; i < nargs; i++)
9497     {
9498       enum coding_category category;
9499       Lisp_Object spec, attrs;
9500
9501       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9502       attrs = AREF (spec, 0);
9503       category = XINT (CODING_ATTR_CATEGORY (attrs));
9504       if (changed[category])
9505         /* Ignore this coding system because a coding system of the
9506            same category already had a higher priority.  */
9507         continue;
9508       changed[category] = 1;
9509       priorities[j++] = category;
9510       if (coding_categories[category].id >= 0
9511           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9512         setup_coding_system (args[i], &coding_categories[category]);
9513       Fset (AREF (Vcoding_category_table, category), args[i]);
9514     }
9515
9516   /* Now we have decided top J priorities.  Reflect the order of the
9517      original priorities to the remaining priorities.  */
9518
9519   for (i = j, j = 0; i < coding_category_max; i++, j++)
9520     {
9521       while (j < coding_category_max
9522              && changed[coding_priorities[j]])
9523         j++;
9524       if (j == coding_category_max)
9525         abort ();
9526       priorities[i] = coding_priorities[j];
9527     }
9528
9529   memcpy (coding_priorities, priorities, sizeof priorities);
9530
9531   /* Update `coding-category-list'.  */
9532   Vcoding_category_list = Qnil;
9533   for (i = coding_category_max - 1; i >= 0; i--)
9534     Vcoding_category_list
9535       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9536                Vcoding_category_list);
9537
9538   return Qnil;
9539 }
9540
9541 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9542        Scoding_system_priority_list, 0, 1, 0,
9543        doc: /* Return a list of coding systems ordered by their priorities.
9544 The list contains a subset of coding systems; i.e. coding systems
9545 assigned to each coding category (see `coding-category-list').
9546
9547 HIGHESTP non-nil means just return the highest priority one.  */)
9548   (Lisp_Object highestp)
9549 {
9550   int i;
9551   Lisp_Object val;
9552
9553   for (i = 0, val = Qnil; i < coding_category_max; i++)
9554     {
9555       enum coding_category category = coding_priorities[i];
9556       int id = coding_categories[category].id;
9557       Lisp_Object attrs;
9558
9559       if (id < 0)
9560         continue;
9561       attrs = CODING_ID_ATTRS (id);
9562       if (! NILP (highestp))
9563         return CODING_ATTR_BASE_NAME (attrs);
9564       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9565     }
9566   return Fnreverse (val);
9567 }
9568
9569 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9570
9571 static Lisp_Object
9572 make_subsidiaries (Lisp_Object base)
9573 {
9574   Lisp_Object subsidiaries;
9575   int base_name_len = SBYTES (SYMBOL_NAME (base));
9576   char *buf = (char *) alloca (base_name_len + 6);
9577   int i;
9578
9579   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9580   subsidiaries = Fmake_vector (make_number (3), Qnil);
9581   for (i = 0; i < 3; i++)
9582     {
9583       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9584       ASET (subsidiaries, i, intern (buf));
9585     }
9586   return subsidiaries;
9587 }
9588
9589
/* Define a new coding system from the positional arguments ARGS
   (NARGS of them), which are addressed with the coding_arg_* indices.

   The function
     1. fills a fresh attribute vector from the arguments common to
        all coding systems, type-checking each one,
     2. handles the arguments specific to the coding type and picks
        the coding category,
     3. registers the name (and, when the eol-type argument is nil,
        the three subsidiaries made by make_subsidiaries) in
        Vcoding_system_hash_table, Vcoding_system_list and
        Vcoding_system_alist,
     4. calls setup_coding_system for the category when the category
        has no coding system yet or its coding system has this name.

   Return nil.  Signal wrong-number-of-arguments when NARGS is too
   small for the coding type, and an error for invalid values.  */
DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
       Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
       doc: /* For internal use only.
usage: (define-coding-system-internal ...)  */)
  (int nargs, Lisp_Object *args)
{
  Lisp_Object name;
  Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
  Lisp_Object attrs;            /* Vector of attributes.  */
  Lisp_Object eol_type;
  Lisp_Object aliases;
  Lisp_Object coding_type, charset_list, safe_charsets;
  enum coding_category category;
  Lisp_Object tail, val;
  int max_charset_id = 0;
  int i;

  /* The arguments common to all coding types must all be present.  */
  if (nargs < coding_arg_max)
    goto short_args;

  attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);

  name = args[coding_arg_name];
  CHECK_SYMBOL (name);
  CODING_ATTR_BASE_NAME (attrs) = name;

  /* The mnemonic is either a string or a character.  */
  val = args[coding_arg_mnemonic];
  if (! STRINGP (val))
    CHECK_CHARACTER (val);
  CODING_ATTR_MNEMONIC (attrs) = val;

  coding_type = args[coding_arg_coding_type];
  CHECK_SYMBOL (coding_type);
  CODING_ATTR_TYPE (attrs) = coding_type;

  charset_list = args[coding_arg_charset_list];
  if (SYMBOLP (charset_list))
    {
      /* A symbol (nil included) is not converted.  The symbols
         `iso-2022' and `emacs-mule' select a predefined list and are
         accepted only for the coding type of the same name.  */
      if (EQ (charset_list, Qiso_2022))
        {
          if (! EQ (coding_type, Qiso_2022))
            error ("Invalid charset-list");
          charset_list = Viso_2022_charset_list;
        }
      else if (EQ (charset_list, Qemacs_mule))
        {
          if (! EQ (coding_type, Qemacs_mule))
            error ("Invalid charset-list");
          charset_list = Vemacs_mule_charset_list;
        }
      /* The elements are read as integers here; presumably the
         predefined lists already hold charset IDs.  */
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        if (max_charset_id < XFASTINT (XCAR (tail)))
          max_charset_id = XFASTINT (XCAR (tail));
    }
  else
    {
      /* Work on a copy of the list and replace each charset by its
         ID.  For the types `iso-2022' and `emacs-mule', reject a
         charset whose CHARSET_ISO_FINAL or CHARSET_EMACS_MULE_ID
         respectively is negative.  */
      charset_list = Fcopy_sequence (charset_list);
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset;

          val = XCAR (tail);
          CHECK_CHARSET_GET_CHARSET (val, charset);
          if (EQ (coding_type, Qiso_2022)
              ? CHARSET_ISO_FINAL (charset) < 0
              : EQ (coding_type, Qemacs_mule)
              ? CHARSET_EMACS_MULE_ID (charset) < 0
              : 0)
            error ("Can't handle charset `%s'",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

          XSETCAR (tail, make_number (charset->id));
          if (max_charset_id < charset->id)
            max_charset_id = charset->id;
        }
    }
  CODING_ATTR_CHARSET_LIST (attrs) = charset_list;

  /* One byte per charset ID from 0 to max_charset_id: 255 by
     default, 0 for each charset in the list.  */
  safe_charsets = make_uninit_string (max_charset_id + 1);
  memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
    SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
  CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;

  /* This initial value may be overridden by the type-specific code
     below.  */
  CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];

  /* A translation table is a char-table, a cons, or a symbol.  */
  val = args[coding_arg_decode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_DECODE_TBL (attrs) = val;

  val = args[coding_arg_encode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_ENCODE_TBL (attrs) = val;

  val = args[coding_arg_post_read_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_POST_READ (attrs) = val;

  val = args[coding_arg_pre_write_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_PRE_WRITE (attrs) = val;

  /* The default character is a space when not specified.  */
  val = args[coding_arg_default_char];
  if (NILP (val))
    CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
  else
    {
      CHECK_CHARACTER (val);
      CODING_ATTR_DEFAULT_CHAR (attrs) = val;
    }

  /* Normalize the flag to nil or t.  */
  val = args[coding_arg_for_unibyte];
  CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;

  val = args[coding_arg_plist];
  CHECK_LIST (val);
  CODING_ATTR_PLIST (attrs) = val;

  /* Handle the arguments specific to the coding type and decide
     CATEGORY.  */
  if (EQ (coding_type, Qcharset))
    {
      /* Generate a lisp vector of 256 elements.  Each element is nil,
         integer, or a list of charset IDs.

         If Nth element is nil, the byte code N is invalid in this
         coding system.

         If Nth element is a number NUM, N is the first byte of a
         charset whose ID is NUM.

         If Nth element is a list of charset IDs, N is the first byte
         of one of them.  The list is sorted by dimensions of the
         charsets.  A charset of smaller dimension comes first. */
      val = Fmake_vector (make_number (256), Qnil);

      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
          int dim = CHARSET_DIMENSION (charset);
          /* code_space[idx] and code_space[idx + 1] bound the byte
             values iterated below.  NOTE(review): confirm the layout
             of code_space against charset.h.  */
          int idx = (dim - 1) * 4;

          if (CHARSET_ASCII_COMPATIBLE_P (charset))
            CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

          for (i = charset->code_space[idx];
               i <= charset->code_space[idx + 1]; i++)
            {
              Lisp_Object tmp, tmp2;
              int dim2;

              tmp = AREF (val, i);
              if (NILP (tmp))
                /* First charset for this byte.  */
                tmp = XCAR (tail);
              else if (NUMBERP (tmp))
                {
                  /* Second charset for this byte: make a two-element
                     list, smaller dimension first.  */
                  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
                  if (dim < dim2)
                    tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
                  else
                    tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
                }
              else
                {
                  /* Already a list: find the first element whose
                     dimension is greater than DIM.  */
                  for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
                    {
                      dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
                      if (dim < dim2)
                        break;
                    }
                  if (NILP (tmp2))
                    /* No such element; append at the end.  */
                    tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
                  else
                    {
                      /* Insert before that element in place: copy it
                         into a new following cell, then overwrite
                         the current cell's car.  */
                      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
                      XSETCAR (tmp2, XCAR (tail));
                    }
                }
              ASET (val, i, tmp);
            }
        }
      ASET (attrs, coding_attr_charset_valids, val);
      category = coding_category_charset;
    }
  else if (EQ (coding_type, Qccl))
    {
      Lisp_Object valids;

      if (nargs < coding_arg_ccl_max)
        goto short_args;

      /* A CCL program given as a vector is copied before it is
         stored.  */
      val = args[coding_arg_ccl_decoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_decoder, val);

      val = args[coding_arg_ccl_encoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_encoder, val);

      /* Build a 256-element string whose Nth element is 1 if N is
         covered by the valids argument and 0 otherwise.  Each element
         of the argument is an integer in 0..255 or a cons
         (FROM . TO) with 0 <= FROM <= TO <= 255.  */
      val = args[coding_arg_ccl_valids];
      valids = Fmake_string (make_number (256), make_number (0));
      for (tail = val; !NILP (tail); tail = Fcdr (tail))
        {
          int from, to;

          val = Fcar (tail);
          if (INTEGERP (val))
            {
              from = to = XINT (val);
              if (from < 0 || from > 255)
                args_out_of_range_3 (val, make_number (0), make_number (255));
            }
          else
            {
              CHECK_CONS (val);
              CHECK_NATNUM_CAR (val);
              CHECK_NATNUM_CDR (val);
              from = XINT (XCAR (val));
              if (from > 255)
                args_out_of_range_3 (XCAR (val),
                                     make_number (0), make_number (255));
              to = XINT (XCDR (val));
              if (to < from || to > 255)
                args_out_of_range_3 (XCDR (val),
                                     XCAR (val), make_number (255));
            }
          for (i = from; i <= to; i++)
            SSET (valids, i, 1);
        }
      ASET (attrs, coding_attr_ccl_valids, valids);

      category = coding_category_ccl;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      Lisp_Object bom, endian;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;

      if (nargs < coding_arg_utf16_max)
        goto short_args;

      /* The BOM argument is nil, t, or a cons of two coding
         systems.  */
      bom = args[coding_arg_utf16_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);

      /* The endian argument is `big' or `little'; nil means
         `big'.  */
      endian = args[coding_arg_utf16_endian];
      CHECK_SYMBOL (endian);
      if (NILP (endian))
        endian = Qbig;
      else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
        error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
      ASET (attrs, coding_attr_utf_16_endian, endian);

      /* A cons BOM selects the `auto' category, a nil BOM one of the
         `nosig' categories, and t one of the plain BE/LE
         categories.  */
      category = (CONSP (bom)
                  ? coding_category_utf_16_auto
                  : NILP (bom)
                  ? (EQ (endian, Qbig)
                     ? coding_category_utf_16_be_nosig
                     : coding_category_utf_16_le_nosig)
                  : (EQ (endian, Qbig)
                     ? coding_category_utf_16_be
                     : coding_category_utf_16_le));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      Lisp_Object initial, reg_usage, request, flags;
      int i;                    /* Shadows the function-level I.  */

      if (nargs < coding_arg_iso2022_max)
        goto short_args;

      /* In a copy of the initial vector, replace each of the first
         four elements by the ID of the charset, or by -1 if it is
         nil.  */
      initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
      CHECK_VECTOR (initial);
      for (i = 0; i < 4; i++)
        {
          val = Faref (initial, make_number (i));
          if (! NILP (val))
            {
              struct charset *charset;

              CHECK_CHARSET_GET_CHARSET (val, charset);
              ASET (initial, i, make_number (CHARSET_ID (charset)));
              if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
                CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
            }
          else
            ASET (initial, i, make_number (-1));
        }

      reg_usage = args[coding_arg_iso2022_reg_usage];
      CHECK_CONS (reg_usage);
      CHECK_NUMBER_CAR (reg_usage);
      CHECK_NUMBER_CDR (reg_usage);

      /* Each element of the request list is (CHARSET . REG) with REG
         in 0..3; replace CHARSET by its ID.
         NOTE(review): Fcopy_sequence presumably copies only the list
         spine, so the XSETCAR below would modify cons cells shared
         with the caller's list -- confirm whether that matters.  */
      request = Fcopy_sequence (args[coding_arg_iso2022_request]);
      for (tail = request; ! NILP (tail); tail = Fcdr (tail))
        {
          int id;
          Lisp_Object tmp;

          val = Fcar (tail);
          CHECK_CONS (val);
          tmp = XCAR (val);
          CHECK_CHARSET_GET_ID (tmp, id);
          CHECK_NATNUM_CDR (val);
          if (XINT (XCDR (val)) >= 4)
            error ("Invalid graphic register number: %d", XINT (XCDR (val)));
          XSETCAR (val, make_number (id));
        }

      /* I keeps the flags as given; the FULL_SUPPORT bit is added
         only to the value stored in ATTRS.  */
      flags = args[coding_arg_iso2022_flags];
      CHECK_NATNUM (flags);
      i = XINT (flags);
      if (EQ (args[coding_arg_charset_list], Qiso_2022))
        flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);

      ASET (attrs, coding_attr_iso_initial, initial);
      ASET (attrs, coding_attr_iso_usage, reg_usage);
      ASET (attrs, coding_attr_iso_request, request);
      ASET (attrs, coding_attr_iso_flags, flags);
      setup_iso_safe_charsets (attrs);

      /* Choose the category from the flags, from whether the charset
         list argument was the symbol `iso-2022', and (in the 8-bit
         case) from the charset in element 1 of INITIAL.  */
      if (i & CODING_ISO_FLAG_SEVEN_BITS)
        category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
                          | CODING_ISO_FLAG_SINGLE_SHIFT))
                    ? coding_category_iso_7_else
                    : EQ (args[coding_arg_charset_list], Qiso_2022)
                    ? coding_category_iso_7
                    : coding_category_iso_7_tight);
      else
        {
          int id = XINT (AREF (initial, 1));

          category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
                       || EQ (args[coding_arg_charset_list], Qiso_2022)
                       || id < 0)
                      ? coding_category_iso_8_else
                      : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
                      ? coding_category_iso_8_1
                      : coding_category_iso_8_2);
        }
      /* Only the iso-8-1 and iso-8-2 categories keep the
         ASCII-compatible attribute set above.  */
      if (category != coding_category_iso_8_1
          && category != coding_category_iso_8_2)
        CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      if (EQ (args[coding_arg_charset_list], Qemacs_mule))
        ASET (attrs, coding_attr_emacs_mule_full, Qt);
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
      category = coding_category_emacs_mule;
    }
  else if (EQ (coding_type, Qshift_jis))
    {

      struct charset *charset;

      /* The charset list must hold three or four charsets, of
         dimensions 1, 1, 2 and (optionally) 2.  CHARSET_LIST is
         advanced below; the attribute was stored earlier and is not
         affected.  */
      if (XINT (Flength (charset_list)) != 3
          && XINT (Flength (charset_list)) != 4)
        error ("There should be three or four charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      if (! NILP (charset_list))
        {
          charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
          if (CHARSET_DIMENSION (charset) != 2)
            error ("Dimension of charset %s is not two",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
        }

      category = coding_category_sjis;
      Vsjis_coding_system = name;
    }
  else if (EQ (coding_type, Qbig5))
    {
      struct charset *charset;

      /* The charset list must hold exactly two charsets, of
         dimensions 1 and 2.  */
      if (XINT (Flength (charset_list)) != 2)
        error ("There should be just two charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      category = coding_category_big5;
      Vbig5_coding_system = name;
    }
  else if (EQ (coding_type, Qraw_text))
    {
      category = coding_category_raw_text;
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
    }
  else if (EQ (coding_type, Qutf_8))
    {
      Lisp_Object bom;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      if (nargs < coding_arg_utf8_max)
        goto short_args;

      /* The BOM argument is nil, t, or a cons of two coding
         systems.  */
      bom = args[coding_arg_utf8_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_bom, bom);

      category = (CONSP (bom) ? coding_category_utf_8_auto
                  : NILP (bom) ? coding_category_utf_8_nosig
                  : coding_category_utf_8_sig);
    }
  else if (EQ (coding_type, Qundecided))
    category = coding_category_undecided;
  else
    error ("Invalid coding system type: %s",
           SDATA (SYMBOL_NAME (coding_type)));

  /* Record the category, and prepend :category and
     :ascii-compatible-p to the property list.  */
  CODING_ATTR_CATEGORY (attrs) = make_number (category);
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
                                CODING_ATTR_PLIST (attrs)));
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCascii_compatible_p,
             Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
                    CODING_ATTR_PLIST (attrs)));

  /* The eol-type argument is nil, `unix', `dos' or `mac'.  */
  eol_type = args[coding_arg_eol_type];
  if (! NILP (eol_type)
      && ! EQ (eol_type, Qunix)
      && ! EQ (eol_type, Qdos)
      && ! EQ (eol_type, Qmac))
    error ("Invalid eol-type");

  aliases = Fcons (name, Qnil);

  if (NILP (eol_type))
    {
      /* No fixed eol-type: EOL_TYPE becomes the vector of the three
         subsidiary names, and each subsidiary is registered with its
         own spec vector that shares ATTRS.  */
      eol_type = make_subsidiaries (name);
      for (i = 0; i < 3; i++)
        {
          Lisp_Object this_spec, this_name, this_aliases, this_eol_type;

          this_name = AREF (eol_type, i);
          this_aliases = Fcons (this_name, Qnil);
          this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
          this_spec = Fmake_vector (make_number (3), attrs);
          ASET (this_spec, 1, this_aliases);
          ASET (this_spec, 2, this_eol_type);
          Fputhash (this_name, this_spec, Vcoding_system_hash_table);
          Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
          val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
          if (NILP (val))
            Vcoding_system_alist
              = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
                       Vcoding_system_alist);
        }
    }

  /* Element 0 is ATTRS (the fill value); elements 1 and 2 are set
     explicitly.  */
  spec_vec = Fmake_vector (make_number (3), attrs);
  ASET (spec_vec, 1, aliases);
  ASET (spec_vec, 2, eol_type);

  Fputhash (name, spec_vec, Vcoding_system_hash_table);
  Vcoding_system_list = Fcons (name, Vcoding_system_list);
  val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
  if (NILP (val))
    Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
                                  Vcoding_system_alist);

  {
    /* Set up the category's coding system when the category has none
       yet, or when its coding system has this very name.  */
    int id = coding_categories[category].id;

    if (id < 0 || EQ (name, CODING_ID_NAME (id)))
      setup_coding_system (name, &coding_categories[category]);
  }

  return Qnil;

 short_args:
  return Fsignal (Qwrong_number_of_arguments,
                  Fcons (intern ("define-coding-system-internal"),
                         make_number (nargs)));
}
10118
10119
10120 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10121        3, 3, 0,
10122        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10123   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10124 {
10125   Lisp_Object spec, attrs;
10126
10127   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10128   attrs = AREF (spec, 0);
10129   if (EQ (prop, QCmnemonic))
10130     {
10131       if (! STRINGP (val))
10132         CHECK_CHARACTER (val);
10133       CODING_ATTR_MNEMONIC (attrs) = val;
10134     }
10135   else if (EQ (prop, QCdefault_char))
10136     {
10137       if (NILP (val))
10138         val = make_number (' ');
10139       else
10140         CHECK_CHARACTER (val);
10141       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10142     }
10143   else if (EQ (prop, QCdecode_translation_table))
10144     {
10145       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10146         CHECK_SYMBOL (val);
10147       CODING_ATTR_DECODE_TBL (attrs) = val;
10148     }
10149   else if (EQ (prop, QCencode_translation_table))
10150     {
10151       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10152         CHECK_SYMBOL (val);
10153       CODING_ATTR_ENCODE_TBL (attrs) = val;
10154     }
10155   else if (EQ (prop, QCpost_read_conversion))
10156     {
10157       CHECK_SYMBOL (val);
10158       CODING_ATTR_POST_READ (attrs) = val;
10159     }
10160   else if (EQ (prop, QCpre_write_conversion))
10161     {
10162       CHECK_SYMBOL (val);
10163       CODING_ATTR_PRE_WRITE (attrs) = val;
10164     }
10165   else if (EQ (prop, QCascii_compatible_p))
10166     {
10167       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10168     }
10169
10170   CODING_ATTR_PLIST (attrs)
10171     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10172   return val;
10173 }
10174
10175
10176 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10177        Sdefine_coding_system_alias, 2, 2, 0,
10178        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10179   (Lisp_Object alias, Lisp_Object coding_system)
10180 {
10181   Lisp_Object spec, aliases, eol_type, val;
10182
10183   CHECK_SYMBOL (alias);
10184   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10185   aliases = AREF (spec, 1);
10186   /* ALIASES should be a list of length more than zero, and the first
10187      element is a base coding system.  Append ALIAS at the tail of the
10188      list.  */
10189   while (!NILP (XCDR (aliases)))
10190     aliases = XCDR (aliases);
10191   XSETCDR (aliases, Fcons (alias, Qnil));
10192
10193   eol_type = AREF (spec, 2);
10194   if (VECTORP (eol_type))
10195     {
10196       Lisp_Object subsidiaries;
10197       int i;
10198
10199       subsidiaries = make_subsidiaries (alias);
10200       for (i = 0; i < 3; i++)
10201         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10202                                      AREF (eol_type, i));
10203     }
10204
10205   Fputhash (alias, spec, Vcoding_system_hash_table);
10206   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10207   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10208   if (NILP (val))
10209     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10210                                   Vcoding_system_alist);
10211
10212   return Qnil;
10213 }
10214
10215 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10216        1, 1, 0,
10217        doc: /* Return the base of CODING-SYSTEM.
10218 Any alias or subsidiary coding system is not a base coding system.  */)
10219   (Lisp_Object coding_system)
10220 {
10221   Lisp_Object spec, attrs;
10222
10223   if (NILP (coding_system))
10224     return (Qno_conversion);
10225   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10226   attrs = AREF (spec, 0);
10227   return CODING_ATTR_BASE_NAME (attrs);
10228 }
10229
10230 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10231        1, 1, 0,
10232        doc: "Return the property list of CODING-SYSTEM.")
10233   (Lisp_Object coding_system)
10234 {
10235   Lisp_Object spec, attrs;
10236
10237   if (NILP (coding_system))
10238     coding_system = Qno_conversion;
10239   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10240   attrs = AREF (spec, 0);
10241   return CODING_ATTR_PLIST (attrs);
10242 }
10243
10244
10245 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10246        1, 1, 0,
10247        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10248   (Lisp_Object coding_system)
10249 {
10250   Lisp_Object spec;
10251
10252   if (NILP (coding_system))
10253     coding_system = Qno_conversion;
10254   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10255   return AREF (spec, 1);
10256 }
10257
10258 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10259        Scoding_system_eol_type, 1, 1, 0,
10260        doc: /* Return eol-type of CODING-SYSTEM.
10261 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10262
10263 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10264 and CR respectively.
10265
10266 A vector value indicates that a format of end-of-line should be
10267 detected automatically.  Nth element of the vector is the subsidiary
10268 coding system whose eol-type is N.  */)
10269   (Lisp_Object coding_system)
10270 {
10271   Lisp_Object spec, eol_type;
10272   int n;
10273
10274   if (NILP (coding_system))
10275     coding_system = Qno_conversion;
10276   if (! CODING_SYSTEM_P (coding_system))
10277     return Qnil;
10278   spec = CODING_SYSTEM_SPEC (coding_system);
10279   eol_type = AREF (spec, 2);
10280   if (VECTORP (eol_type))
10281     return Fcopy_sequence (eol_type);
10282   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10283   return make_number (n);
10284 }
10285
10286 #endif /* emacs */
10287
10288 \f
/*** 12. Postamble ***/
10290
10291 void
10292 init_coding_once (void)
10293 {
10294   int i;
10295
10296   for (i = 0; i < coding_category_max; i++)
10297     {
10298       coding_categories[i].id = -1;
10299       coding_priorities[i] = i;
10300     }
10301
10302   /* ISO2022 specific initialize routine.  */
10303   for (i = 0; i < 0x20; i++)
10304     iso_code_class[i] = ISO_control_0;
10305   for (i = 0x21; i < 0x7F; i++)
10306     iso_code_class[i] = ISO_graphic_plane_0;
10307   for (i = 0x80; i < 0xA0; i++)
10308     iso_code_class[i] = ISO_control_1;
10309   for (i = 0xA1; i < 0xFF; i++)
10310     iso_code_class[i] = ISO_graphic_plane_1;
10311   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10312   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10313   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10314   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10315   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10316   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10317   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10318   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10319   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10320
10321   for (i = 0; i < 256; i++)
10322     {
10323       emacs_mule_bytes[i] = 1;
10324     }
10325   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10326   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10327   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10328   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10329 }
10330
10331 #ifdef emacs
10332
10333 void
10334 syms_of_coding (void)
10335 {
10336   staticpro (&Vcoding_system_hash_table);
10337   {
10338     Lisp_Object args[2];
10339     args[0] = QCtest;
10340     args[1] = Qeq;
10341     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10342   }
10343
10344   staticpro (&Vsjis_coding_system);
10345   Vsjis_coding_system = Qnil;
10346
10347   staticpro (&Vbig5_coding_system);
10348   Vbig5_coding_system = Qnil;
10349
10350   staticpro (&Vcode_conversion_reused_workbuf);
10351   Vcode_conversion_reused_workbuf = Qnil;
10352
10353   staticpro (&Vcode_conversion_workbuf_name);
10354   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10355
10356   reused_workbuf_in_use = 0;
10357
10358   DEFSYM (Qcharset, "charset");
10359   DEFSYM (Qtarget_idx, "target-idx");
10360   DEFSYM (Qcoding_system_history, "coding-system-history");
10361   Fset (Qcoding_system_history, Qnil);
10362
10363   /* Target FILENAME is the first argument.  */
10364   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10365   /* Target FILENAME is the third argument.  */
10366   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10367
10368   DEFSYM (Qcall_process, "call-process");
10369   /* Target PROGRAM is the first argument.  */
10370   Fput (Qcall_process, Qtarget_idx, make_number (0));
10371
10372   DEFSYM (Qcall_process_region, "call-process-region");
10373   /* Target PROGRAM is the third argument.  */
10374   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10375
10376   DEFSYM (Qstart_process, "start-process");
10377   /* Target PROGRAM is the third argument.  */
10378   Fput (Qstart_process, Qtarget_idx, make_number (2));
10379
10380   DEFSYM (Qopen_network_stream, "open-network-stream");
10381   /* Target SERVICE is the fourth argument.  */
10382   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10383
10384   DEFSYM (Qcoding_system, "coding-system");
10385   DEFSYM (Qcoding_aliases, "coding-aliases");
10386
10387   DEFSYM (Qeol_type, "eol-type");
10388   DEFSYM (Qunix, "unix");
10389   DEFSYM (Qdos, "dos");
10390
10391   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10392   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10393   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10394   DEFSYM (Qdefault_char, "default-char");
10395   DEFSYM (Qundecided, "undecided");
10396   DEFSYM (Qno_conversion, "no-conversion");
10397   DEFSYM (Qraw_text, "raw-text");
10398
10399   DEFSYM (Qiso_2022, "iso-2022");
10400
10401   DEFSYM (Qutf_8, "utf-8");
10402   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10403
10404   DEFSYM (Qutf_16, "utf-16");
10405   DEFSYM (Qbig, "big");
10406   DEFSYM (Qlittle, "little");
10407
10408   DEFSYM (Qshift_jis, "shift-jis");
10409   DEFSYM (Qbig5, "big5");
10410
10411   DEFSYM (Qcoding_system_p, "coding-system-p");
10412
10413   DEFSYM (Qcoding_system_error, "coding-system-error");
10414   Fput (Qcoding_system_error, Qerror_conditions,
10415         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10416   Fput (Qcoding_system_error, Qerror_message,
10417         make_pure_c_string ("Invalid coding system"));
10418
10419   /* Intern this now in case it isn't already done.
10420      Setting this variable twice is harmless.
10421      But don't staticpro it here--that is done in alloc.c.  */
10422   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10423
10424   DEFSYM (Qtranslation_table, "translation-table");
10425   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10426   DEFSYM (Qtranslation_table_id, "translation-table-id");
10427   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10428   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10429
10430   DEFSYM (Qvalid_codes, "valid-codes");
10431
10432   DEFSYM (Qemacs_mule, "emacs-mule");
10433
10434   DEFSYM (QCcategory, ":category");
10435   DEFSYM (QCmnemonic, ":mnemonic");
10436   DEFSYM (QCdefault_char, ":default-char");
10437   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10438   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10439   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10440   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10441   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10442
10443   Vcoding_category_table
10444     = Fmake_vector (make_number (coding_category_max), Qnil);
10445   staticpro (&Vcoding_category_table);
10446   /* Followings are target of code detection.  */
10447   ASET (Vcoding_category_table, coding_category_iso_7,
10448         intern_c_string ("coding-category-iso-7"));
10449   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10450         intern_c_string ("coding-category-iso-7-tight"));
10451   ASET (Vcoding_category_table, coding_category_iso_8_1,
10452         intern_c_string ("coding-category-iso-8-1"));
10453   ASET (Vcoding_category_table, coding_category_iso_8_2,
10454         intern_c_string ("coding-category-iso-8-2"));
10455   ASET (Vcoding_category_table, coding_category_iso_7_else,
10456         intern_c_string ("coding-category-iso-7-else"));
10457   ASET (Vcoding_category_table, coding_category_iso_8_else,
10458         intern_c_string ("coding-category-iso-8-else"));
10459   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10460         intern_c_string ("coding-category-utf-8-auto"));
10461   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10462         intern_c_string ("coding-category-utf-8"));
10463   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10464         intern_c_string ("coding-category-utf-8-sig"));
10465   ASET (Vcoding_category_table, coding_category_utf_16_be,
10466         intern_c_string ("coding-category-utf-16-be"));
10467   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10468         intern_c_string ("coding-category-utf-16-auto"));
10469   ASET (Vcoding_category_table, coding_category_utf_16_le,
10470         intern_c_string ("coding-category-utf-16-le"));
10471   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10472         intern_c_string ("coding-category-utf-16-be-nosig"));
10473   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10474         intern_c_string ("coding-category-utf-16-le-nosig"));
10475   ASET (Vcoding_category_table, coding_category_charset,
10476         intern_c_string ("coding-category-charset"));
10477   ASET (Vcoding_category_table, coding_category_sjis,
10478         intern_c_string ("coding-category-sjis"));
10479   ASET (Vcoding_category_table, coding_category_big5,
10480         intern_c_string ("coding-category-big5"));
10481   ASET (Vcoding_category_table, coding_category_ccl,
10482         intern_c_string ("coding-category-ccl"));
10483   ASET (Vcoding_category_table, coding_category_emacs_mule,
10484         intern_c_string ("coding-category-emacs-mule"));
  /* The following are NOT targets of code detection.  */
10486   ASET (Vcoding_category_table, coding_category_raw_text,
10487         intern_c_string ("coding-category-raw-text"));
10488   ASET (Vcoding_category_table, coding_category_undecided,
10489         intern_c_string ("coding-category-undecided"));
10490
10491   DEFSYM (Qinsufficient_source, "insufficient-source");
10492   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10493   DEFSYM (Qinvalid_source, "invalid-source");
10494   DEFSYM (Qinterrupted, "interrupted");
10495   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10496   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10497
10498   defsubr (&Scoding_system_p);
10499   defsubr (&Sread_coding_system);
10500   defsubr (&Sread_non_nil_coding_system);
10501   defsubr (&Scheck_coding_system);
10502   defsubr (&Sdetect_coding_region);
10503   defsubr (&Sdetect_coding_string);
10504   defsubr (&Sfind_coding_systems_region_internal);
10505   defsubr (&Sunencodable_char_position);
10506   defsubr (&Scheck_coding_systems_region);
10507   defsubr (&Sdecode_coding_region);
10508   defsubr (&Sencode_coding_region);
10509   defsubr (&Sdecode_coding_string);
10510   defsubr (&Sencode_coding_string);
10511   defsubr (&Sdecode_sjis_char);
10512   defsubr (&Sencode_sjis_char);
10513   defsubr (&Sdecode_big5_char);
10514   defsubr (&Sencode_big5_char);
10515   defsubr (&Sset_terminal_coding_system_internal);
10516   defsubr (&Sset_safe_terminal_coding_system_internal);
10517   defsubr (&Sterminal_coding_system);
10518   defsubr (&Sset_keyboard_coding_system_internal);
10519   defsubr (&Skeyboard_coding_system);
10520   defsubr (&Sfind_operation_coding_system);
10521   defsubr (&Sset_coding_system_priority);
10522   defsubr (&Sdefine_coding_system_internal);
10523   defsubr (&Sdefine_coding_system_alias);
10524   defsubr (&Scoding_system_put);
10525   defsubr (&Scoding_system_base);
10526   defsubr (&Scoding_system_plist);
10527   defsubr (&Scoding_system_aliases);
10528   defsubr (&Scoding_system_eol_type);
10529   defsubr (&Scoding_system_priority_list);
10530
10531   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10532                doc: /* List of coding systems.
10533
10534 Do not alter the value of this variable manually.  This variable should be
10535 updated by the functions `define-coding-system' and
10536 `define-coding-system-alias'.  */);
10537   Vcoding_system_list = Qnil;
10538
10539   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10540                doc: /* Alist of coding system names.
10541 Each element is one element list of coding system name.
10542 This variable is given to `completing-read' as COLLECTION argument.
10543
10544 Do not alter the value of this variable manually.  This variable should be
10545 updated by the functions `make-coding-system' and
10546 `define-coding-system-alias'.  */);
10547   Vcoding_system_alist = Qnil;
10548
10549   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10550                doc: /* List of coding-categories (symbols) ordered by priority.
10551
10552 On detecting a coding system, Emacs tries code detection algorithms
10553 associated with each coding-category one by one in this order.  When
10554 one algorithm agrees with a byte sequence of source text, the coding
10555 system bound to the corresponding coding-category is selected.
10556
10557 Don't modify this variable directly, but use `set-coding-priority'.  */);
10558   {
10559     int i;
10560
10561     Vcoding_category_list = Qnil;
10562     for (i = coding_category_max - 1; i >= 0; i--)
10563       Vcoding_category_list
10564         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10565                  Vcoding_category_list);
10566   }
10567
10568   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10569                doc: /* Specify the coding system for read operations.
10570 It is useful to bind this variable with `let', but do not set it globally.
10571 If the value is a coding system, it is used for decoding on read operation.
10572 If not, an appropriate element is used from one of the coding system alists.
10573 There are three such tables: `file-coding-system-alist',
10574 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10575   Vcoding_system_for_read = Qnil;
10576
10577   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10578                doc: /* Specify the coding system for write operations.
10579 Programs bind this variable with `let', but you should not set it globally.
10580 If the value is a coding system, it is used for encoding of output,
10581 when writing it to a file and when sending it to a file or subprocess.
10582
10583 If this does not specify a coding system, an appropriate element
10584 is used from one of the coding system alists.
10585 There are three such tables: `file-coding-system-alist',
10586 `process-coding-system-alist', and `network-coding-system-alist'.
10587 For output to files, if the above procedure does not specify a coding system,
10588 the value of `buffer-file-coding-system' is used.  */);
10589   Vcoding_system_for_write = Qnil;
10590
10591   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10592                doc: /*
10593 Coding system used in the latest file or process I/O.  */);
10594   Vlast_coding_system_used = Qnil;
10595
10596   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10597                doc: /*
10598 Error status of the last code conversion.
10599
10600 When an error was detected in the last code conversion, this variable
10601 is set to one of the following symbols.
10602   `insufficient-source'
10603   `inconsistent-eol'
10604   `invalid-source'
10605   `interrupted'
10606   `insufficient-memory'
10607 When no error was detected, the value doesn't change.  So, to check
10608 the error status of a code conversion by this variable, you must
10609 explicitly set this variable to nil before performing code
10610 conversion.  */);
10611   Vlast_code_conversion_error = Qnil;
10612
10613   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10614                doc: /*
10615 *Non-nil means always inhibit code conversion of end-of-line format.
10616 See info node `Coding Systems' and info node `Text and Binary' concerning
10617 such conversion.  */);
10618   inhibit_eol_conversion = 0;
10619
10620   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10621                doc: /*
10622 Non-nil means process buffer inherits coding system of process output.
10623 Bind it to t if the process output is to be treated as if it were a file
10624 read from some filesystem.  */);
10625   inherit_process_coding_system = 0;
10626
10627   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10628                doc: /*
10629 Alist to decide a coding system to use for a file I/O operation.
10630 The format is ((PATTERN . VAL) ...),
10631 where PATTERN is a regular expression matching a file name,
10632 VAL is a coding system, a cons of coding systems, or a function symbol.
10633 If VAL is a coding system, it is used for both decoding and encoding
10634 the file contents.
10635 If VAL is a cons of coding systems, the car part is used for decoding,
10636 and the cdr part is used for encoding.
10637 If VAL is a function symbol, the function must return a coding system
10638 or a cons of coding systems which are used as above.  The function is
10639 called with an argument that is a list of the arguments with which
10640 `find-operation-coding-system' was called.  If the function can't decide
10641 a coding system, it can return `undecided' so that the normal
10642 code-detection is performed.
10643
10644 See also the function `find-operation-coding-system'
10645 and the variable `auto-coding-alist'.  */);
10646   Vfile_coding_system_alist = Qnil;
10647
10648   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10649                doc: /*
10650 Alist to decide a coding system to use for a process I/O operation.
10651 The format is ((PATTERN . VAL) ...),
10652 where PATTERN is a regular expression matching a program name,
10653 VAL is a coding system, a cons of coding systems, or a function symbol.
10654 If VAL is a coding system, it is used for both decoding what received
10655 from the program and encoding what sent to the program.
10656 If VAL is a cons of coding systems, the car part is used for decoding,
10657 and the cdr part is used for encoding.
10658 If VAL is a function symbol, the function must return a coding system
10659 or a cons of coding systems which are used as above.
10660
10661 See also the function `find-operation-coding-system'.  */);
10662   Vprocess_coding_system_alist = Qnil;
10663
10664   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10665                doc: /*
10666 Alist to decide a coding system to use for a network I/O operation.
10667 The format is ((PATTERN . VAL) ...),
10668 where PATTERN is a regular expression matching a network service name
10669 or is a port number to connect to,
10670 VAL is a coding system, a cons of coding systems, or a function symbol.
10671 If VAL is a coding system, it is used for both decoding what received
10672 from the network stream and encoding what sent to the network stream.
10673 If VAL is a cons of coding systems, the car part is used for decoding,
10674 and the cdr part is used for encoding.
10675 If VAL is a function symbol, the function must return a coding system
10676 or a cons of coding systems which are used as above.
10677
10678 See also the function `find-operation-coding-system'.  */);
10679   Vnetwork_coding_system_alist = Qnil;
10680
10681   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10682                doc: /* Coding system to use with system messages.
10683 Also used for decoding keyboard input on X Window system.  */);
10684   Vlocale_coding_system = Qnil;
10685
10686   /* The eol mnemonics are reset in startup.el system-dependently.  */
10687   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10688                doc: /*
10689 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10690   eol_mnemonic_unix = make_pure_c_string (":");
10691
10692   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10693                doc: /*
10694 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10695   eol_mnemonic_dos = make_pure_c_string ("\\");
10696
10697   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10698                doc: /*
10699 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10700   eol_mnemonic_mac = make_pure_c_string ("/");
10701
10702   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10703                doc: /*
10704 *String displayed in mode line when end-of-line format is not yet determined.  */);
10705   eol_mnemonic_undecided = make_pure_c_string (":");
10706
10707   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10708                doc: /*
10709 *Non-nil enables character translation while encoding and decoding.  */);
10710   Venable_character_translation = Qt;
10711
10712   DEFVAR_LISP ("standard-translation-table-for-decode",
10713                &Vstandard_translation_table_for_decode,
10714                doc: /* Table for translating characters while decoding.  */);
10715   Vstandard_translation_table_for_decode = Qnil;
10716
10717   DEFVAR_LISP ("standard-translation-table-for-encode",
10718                &Vstandard_translation_table_for_encode,
10719                doc: /* Table for translating characters while encoding.  */);
10720   Vstandard_translation_table_for_encode = Qnil;
10721
10722   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10723                doc: /* Alist of charsets vs revision numbers.
10724 While encoding, if a charset (car part of an element) is found,
10725 designate it with the escape sequence identifying revision (cdr part
10726 of the element).  */);
10727   Vcharset_revision_table = Qnil;
10728
10729   DEFVAR_LISP ("default-process-coding-system",
10730                &Vdefault_process_coding_system,
10731                doc: /* Cons of coding systems used for process I/O by default.
10732 The car part is used for decoding a process output,
10733 the cdr part is used for encoding a text to be sent to a process.  */);
10734   Vdefault_process_coding_system = Qnil;
10735
10736   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10737                doc: /*
10738 Table of extra Latin codes in the range 128..159 (inclusive).
10739 This is a vector of length 256.
10740 If Nth element is non-nil, the existence of code N in a file
10741 \(or output of subprocess) doesn't prevent it to be detected as
10742 a coding system of ISO 2022 variant which has a flag
10743 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10744 or reading output of a subprocess.
10745 Only 128th through 159th elements have a meaning.  */);
10746   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10747
10748   DEFVAR_LISP ("select-safe-coding-system-function",
10749                &Vselect_safe_coding_system_function,
10750                doc: /*
10751 Function to call to select safe coding system for encoding a text.
10752
10753 If set, this function is called to force a user to select a proper
10754 coding system which can encode the text in the case that a default
10755 coding system used in each operation can't encode the text.  The
10756 function should take care that the buffer is not modified while
10757 the coding system is being selected.
10758
10759 The default value is `select-safe-coding-system' (which see).  */);
10760   Vselect_safe_coding_system_function = Qnil;
10761
10762   DEFVAR_BOOL ("coding-system-require-warning",
10763                &coding_system_require_warning,
10764                doc: /* Internal use only.
10765 If non-nil, on writing a file, `select-safe-coding-system-function' is
10766 called even if `coding-system-for-write' is non-nil.  The command
10767 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10768   coding_system_require_warning = 0;
10769
10770
10771   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10772                &inhibit_iso_escape_detection,
10773                doc: /*
10774 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10775
10776 When Emacs reads text, it tries to detect how the text is encoded.
10777 This code detection is sensitive to escape sequences.  If Emacs sees
10778 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10779 of the ISO2022 encodings, and decodes text by the corresponding coding
10780 system (e.g. `iso-2022-7bit').
10781
10782 However, there may be a case that you want to read escape sequences in
10783 a file as is.  In such a case, you can set this variable to non-nil.
10784 Then the code detection will ignore any escape sequences, and no text is
10785 detected as encoded in some ISO-2022 encoding.  The result is that all
10786 escape sequences become visible in a buffer.
10787
10788 The default value is nil, and it is strongly recommended not to change
10789 it.  That is because many Emacs Lisp source files that contain
10790 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10791 in Emacs's distribution, and they won't be decoded correctly on
10792 reading if you suppress escape sequence detection.
10793
10794 The other way to read escape sequences in a file without decoding is
10795 to explicitly specify some coding system that doesn't use ISO-2022
10796 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10797   inhibit_iso_escape_detection = 0;
10798
10799   DEFVAR_BOOL ("inhibit-null-byte-detection",
10800                &inhibit_null_byte_detection,
10801                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10802 By default, Emacs treats it as binary data, and does not attempt to
10803 decode it.  The effect is as if you specified `no-conversion' for
10804 reading that text.
10805
10806 Set this to non-nil when a regular text happens to include null bytes.
10807 Examples are Index nodes of Info files and null-byte delimited output
10808 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10809 decode text as usual.  */);
10810   inhibit_null_byte_detection = 0;
10811
10812   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10813                doc: /* Char table for translating self-inserting characters.
10814 This is applied to the result of input methods, not their input.
10815 See also `keyboard-translate-table'.
10816
10817 Use of this variable for character code unification was rendered
10818 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10819 internal character representation.  */);
10820     Vtranslation_table_for_input = Qnil;
10821
10822   {
10823     Lisp_Object args[coding_arg_max];
10824     Lisp_Object plist[16];
10825     int i;
10826
10827     for (i = 0; i < coding_arg_max; i++)
10828       args[i] = Qnil;
10829
10830     plist[0] = intern_c_string (":name");
10831     plist[1] = args[coding_arg_name] = Qno_conversion;
10832     plist[2] = intern_c_string (":mnemonic");
10833     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10834     plist[4] = intern_c_string (":coding-type");
10835     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10836     plist[6] = intern_c_string (":ascii-compatible-p");
10837     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10838     plist[8] = intern_c_string (":default-char");
10839     plist[9] = args[coding_arg_default_char] = make_number (0);
10840     plist[10] = intern_c_string (":for-unibyte");
10841     plist[11] = args[coding_arg_for_unibyte] = Qt;
10842     plist[12] = intern_c_string (":docstring");
10843     plist[13] = make_pure_c_string ("Do no conversion.\n\
10844 \n\
10845 When you visit a file with this coding, the file is read into a\n\
10846 unibyte buffer as is, thus each byte of a file is treated as a\n\
10847 character.");
10848     plist[14] = intern_c_string (":eol-type");
10849     plist[15] = args[coding_arg_eol_type] = Qunix;
10850     args[coding_arg_plist] = Flist (16, plist);
10851     Fdefine_coding_system_internal (coding_arg_max, args);
10852
10853     plist[1] = args[coding_arg_name] = Qundecided;
10854     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10855     plist[5] = args[coding_arg_coding_type] = Qundecided;
10856     /* This is already set.
10857        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10858     plist[8] = intern_c_string (":charset-list");
10859     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10860     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10861     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10862     plist[15] = args[coding_arg_eol_type] = Qnil;
10863     args[coding_arg_plist] = Flist (16, plist);
10864     Fdefine_coding_system_internal (coding_arg_max, args);
10865   }
10866
10867   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10868
10869   {
10870     int i;
10871
10872     for (i = 0; i < coding_category_max; i++)
10873       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10874   }
10875 #if defined (DOS_NT)
10876   system_eol_type = Qdos;
10877 #else
10878   system_eol_type = Qunix;
10879 #endif
10880   staticpro (&system_eol_type);
10881 }
10882
10883 char *
10884 emacs_strerror (int error_number)
10885 {
10886   char *str;
10887
10888   synchronize_system_messages_locale ();
10889   str = strerror (error_number);
10890
10891   if (! NILP (Vlocale_coding_system))
10892     {
10893       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10894                                                       Vlocale_coding_system,
10895                                                       0);
10896       str = (char *) SDATA (dec);
10897     }
10898
10899   return str;
10900 }
10901
10902 #endif /* emacs */
10903
10904 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10905    (do not change this comment) */