src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   int consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
  193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text only until
  256   it reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   int produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
  301 Lisp_Object Vcoding_system_hash_table;
  302
  303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
  304 Lisp_Object Qunix, Qdos;
  305 Lisp_Object Qbuffer_file_coding_system;
  306 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  307 Lisp_Object Qdefault_char;
  308 Lisp_Object Qno_conversion, Qundecided;
  309 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  310 Lisp_Object Qbig, Qlittle;
  311 Lisp_Object Qcoding_system_history;
  312 Lisp_Object Qvalid_codes;
  313 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
  314 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  315 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  316 Lisp_Object QCascii_compatible_p;
  317
  318 Lisp_Object Qcall_process, Qcall_process_region;
  319 Lisp_Object Qstart_process, Qopen_network_stream;
  320 Lisp_Object Qtarget_idx;
  321
  322 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  323 Lisp_Object Qinterrupted, Qinsufficient_memory;
  324
  325 /* If a symbol has this property, evaluate the value to define the
  326    symbol as a coding system.  */
  327 static Lisp_Object Qcoding_system_define_form;
  328
  329 /* Format of end-of-line decided by system.  This is Qunix on
  330    Unix and Mac, Qdos on DOS/Windows.
  331    This has an effect only for external encoding (i.e. for output to
  332    file and process), not for in-buffer or Lisp string encoding.  */
  333 static Lisp_Object system_eol_type;
  334
  335 #ifdef emacs
  336
  337 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  338
  339 /* Coding systems emacs-mule and raw-text are for converting only
  340    end-of-line format.  */
  341 Lisp_Object Qemacs_mule, Qraw_text;
  342 Lisp_Object Qutf_8_emacs;
  343
  344 /* Coding-systems are handed between Emacs Lisp programs and C internal
  345    routines by the following variable.  */
  346 /* Coding system to be used to encode text for terminal display when
  347    terminal coding system is nil.  */
  348 struct coding_system safe_terminal_coding;
  349
  350 #endif /* emacs */
  351
  352 Lisp_Object Qtranslation_table;
  353 Lisp_Object Qtranslation_table_id;
  354 Lisp_Object Qtranslation_table_for_decode;
  355 Lisp_Object Qtranslation_table_for_encode;
  356
  357 /* Two special coding systems.  */
  358 Lisp_Object Vsjis_coding_system;
  359 Lisp_Object Vbig5_coding_system;
 360
  361 /* ISO2022 section.  In these macros CODING is a pointer (used with `->').  */
  362
  363 #define CODING_ISO_INITIAL(coding, reg)                 \
  364   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  365                      coding_attr_iso_initial),          \
  366                reg)))
  367
  368 /* -1 if CHARSET_ID is out of range or its safe_charsets entry is 255.  */
  369 #define CODING_ISO_REQUEST(coding, charset_id)          \
  370   (((charset_id) <= (coding)->max_charset_id            \
  371     ? ((coding)->safe_charsets[charset_id] != 255       \
  372        ? (coding)->safe_charsets[charset_id]            \
  373        : -1)                                            \
  374     : -1))
  375
  376 /* Accessors for members of CODING->spec.iso_2022.  */
  377 #define CODING_ISO_FLAGS(coding)        \
  378   ((coding)->spec.iso_2022.flags)
  379 #define CODING_ISO_DESIGNATION(coding, reg)     \
  380   ((coding)->spec.iso_2022.current_designation[reg])
  381 #define CODING_ISO_INVOCATION(coding, plane)    \
  382   ((coding)->spec.iso_2022.current_invocation[plane])
  383 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  384   ((coding)->spec.iso_2022.single_shifting)
  385 #define CODING_ISO_BOL(coding)  \
  386   ((coding)->spec.iso_2022.bol)
  387 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  388   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
  389 #define CODING_ISO_CMP_STATUS(coding)   \
  390   (&(coding)->spec.iso_2022.cmp_status)
  391 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  392   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  393 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  394   ((coding)->spec.iso_2022.embedded_utf_8)
 395
  396 /* Control characters of ISO2022.  */
  397                         /* code */      /* function */
  398 #define ISO_CODE_SO     0x0E            /* shift-out */
  399 #define ISO_CODE_SI     0x0F            /* shift-in */
  400 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  401 #define ISO_CODE_ESC    0x1B            /* escape */
  402 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  403 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  404 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  405
  406 /* All codes (1-byte) of ISO2022 are classified into one of the
  407    following.  */
  408 enum iso_code_class_type
  409   {
  410     ISO_control_0,              /* Control codes in the range
  411                                    0x00..0x1F and 0x7F, except for the
  412                                    following 4 codes.  */
  413     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  414     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  415     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  416     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  417     ISO_control_1,              /* Control codes in the range
  418                                    0x80..0x9F, except for the
  419                                    following 3 codes.  */
  420     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  421     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  422     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  423     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  424     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  425     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  426     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  427   };
 428
  429 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  430     `iso-flags' attribute of an iso2022 coding system.  */
  431
  432 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  433    instead of the correct short-form sequence (e.g. ESC $ A).  */
  434 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  435
  436 /* If set, reset graphic planes and registers at end-of-line to the
  437    initial state.  */
  438 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  439
  440 /* If set, reset graphic planes and registers before any control
  441    characters to the initial state.  */
  442 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  443
  444 /* If set, encode by 7-bit environment.  */
  445 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  446
  447 /* If set, use locking-shift function.  */
  448 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  449
  450 /* If set, use single-shift function.  Overrides
  451    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  452 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  453
  454 /* If set, use designation escape sequence.  */
  455 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  456
  457 /* If set, produce revision number sequence.  */
  458 #define CODING_ISO_FLAG_REVISION        0x0080
  459
  460 /* If set, produce ISO6429's direction specifying sequence.  */
  461 #define CODING_ISO_FLAG_DIRECTION       0x0100
  462
  463 /* If set, assume designation states are reset at beginning of line on
  464    output.  */
  465 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  466
  467 /* If set, designation sequence should be placed at beginning of line
  468    on output.  */
  469 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  470
  471 /* If set, do not encode unsafe characters on output.  */
  472 #define CODING_ISO_FLAG_SAFE            0x0800
  473
  474 /* If set, extra latin codes (128..159) are accepted as a valid code
  475    on input.  */
  476 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  477
  478 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  479
  480 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  481
  482 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  483
  484 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  485
  486 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  487
  488 /* A character to be produced on output if encoding of the original
  489    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  490 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 491
  492 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
  493 #define CODING_UTF_8_BOM(coding)        \
  494   ((coding)->spec.utf_8_bom)
  495
  496 /* UTF-16 section: accessors for members of CODING->spec.utf_16.  */
  497 #define CODING_UTF_16_BOM(coding)       \
  498   ((coding)->spec.utf_16.bom)
  499
  500 #define CODING_UTF_16_ENDIAN(coding)    \
  501   ((coding)->spec.utf_16.endian)
  502
  503 #define CODING_UTF_16_SURROGATE(coding) \
  504   ((coding)->spec.utf_16.surrogate)
  505
  506
  507 /* CCL section: values looked up via CODING_ID_ATTRS of CODING->id.  */
  508 #define CODING_CCL_DECODER(coding)      \
  509   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  510 #define CODING_CCL_ENCODER(coding)      \
  511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  512 #define CODING_CCL_VALIDS(coding)                                          \
  513   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 514
  515 /* Index for each coding category in `coding_categories'.  */
  516
  517 enum coding_category
  518   {
  519     coding_category_iso_7,
  520     coding_category_iso_7_tight,
  521     coding_category_iso_8_1,
  522     coding_category_iso_8_2,
  523     coding_category_iso_7_else,
  524     coding_category_iso_8_else,
  525     coding_category_utf_8_auto,
  526     coding_category_utf_8_nosig,
  527     coding_category_utf_8_sig,
  528     coding_category_utf_16_auto,
  529     coding_category_utf_16_be,
  530     coding_category_utf_16_le,
  531     coding_category_utf_16_be_nosig,
  532     coding_category_utf_16_le_nosig,
  533     coding_category_charset,
  534     coding_category_sjis,
  535     coding_category_big5,
  536     coding_category_ccl,
  537     coding_category_emacs_mule,
  538     /* All of the above are targets of code detection.  */
  539     coding_category_raw_text,
  540     coding_category_undecided,
  541     coding_category_max         /* Number of categories; sizes the tables.  */
  542   };
 543
  544 /* Definitions of flag bits used in detect_coding_XXXX.  */
  545 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  546 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  547 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  548 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  549 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  550 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  551 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  552 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  553 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  554 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  555 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  556 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  557 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  558 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  559 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  560 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  561 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  562 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  563 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  564 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  565
  566 /* This value is returned if detect_coding_mask () finds nothing other
  567    than ASCII characters.  It omits only CATEGORY_MASK_RAW_TEXT.  */
  568 #define CATEGORY_MASK_ANY               \
  569   (CATEGORY_MASK_ISO_7                  \
  570    | CATEGORY_MASK_ISO_7_TIGHT          \
  571    | CATEGORY_MASK_ISO_8_1              \
  572    | CATEGORY_MASK_ISO_8_2              \
  573    | CATEGORY_MASK_ISO_7_ELSE           \
  574    | CATEGORY_MASK_ISO_8_ELSE           \
  575    | CATEGORY_MASK_UTF_8_AUTO           \
  576    | CATEGORY_MASK_UTF_8_NOSIG          \
  577    | CATEGORY_MASK_UTF_8_SIG            \
  578    | CATEGORY_MASK_UTF_16_AUTO          \
  579    | CATEGORY_MASK_UTF_16_BE            \
  580    | CATEGORY_MASK_UTF_16_LE            \
  581    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  582    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  583    | CATEGORY_MASK_CHARSET              \
  584    | CATEGORY_MASK_SJIS                 \
  585    | CATEGORY_MASK_BIG5                 \
  586    | CATEGORY_MASK_CCL                  \
  587    | CATEGORY_MASK_EMACS_MULE)
  588
  589 /* Convenience unions of the single-category masks above.  */
  590 #define CATEGORY_MASK_ISO_7BIT \
  591   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  592
  593 #define CATEGORY_MASK_ISO_8BIT \
  594   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  595
  596 #define CATEGORY_MASK_ISO_ELSE \
  597   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  598
  599 #define CATEGORY_MASK_ISO_ESCAPE        \
  600   (CATEGORY_MASK_ISO_7                  \
  601    | CATEGORY_MASK_ISO_7_TIGHT          \
  602    | CATEGORY_MASK_ISO_7_ELSE           \
  603    | CATEGORY_MASK_ISO_8_ELSE)
  604
  605 #define CATEGORY_MASK_ISO       \
  606   (  CATEGORY_MASK_ISO_7BIT     \
  607      | CATEGORY_MASK_ISO_8BIT   \
  608      | CATEGORY_MASK_ISO_ELSE)
  609
  610 #define CATEGORY_MASK_UTF_16            \
  611   (CATEGORY_MASK_UTF_16_AUTO            \
  612    | CATEGORY_MASK_UTF_16_BE            \
  613    | CATEGORY_MASK_UTF_16_LE            \
  614    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  615    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  616
  617 #define CATEGORY_MASK_UTF_8     \
  618   (CATEGORY_MASK_UTF_8_AUTO     \
  619    | CATEGORY_MASK_UTF_8_NOSIG  \
  620    | CATEGORY_MASK_UTF_8_SIG)
 621
  622 /* Table of coding categories (Lisp symbols).  This variable is for
  623    internal use only.  */
  624 static Lisp_Object Vcoding_category_table;
  625
  626 /* Table of coding categories ordered by priority.  */
  627 static enum coding_category coding_priorities[coding_category_max];
  628
  629 /* The Nth element is a coding context for the coding system bound to
  630    the Nth coding category (see enum coding_category).  */
  631 static struct coding_system coding_categories[coding_category_max];
 632
  633 /*** Commonly used macros and functions ***/
  634 /* Note: min and max evaluate one of their arguments twice.  */
  635 #ifndef min
  636 #define min(a, b) ((a) < (b) ? (a) : (b))
  637 #endif
  638 #ifndef max
  639 #define max(a, b) ((a) > (b) ? (a) : (b))
  640 #endif
  641 /* Set ATTRS and CHARSET_LIST (both lvalues) from CODING's id.  */
  642 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  643   do {                                                  \
  644     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  645     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  646   } while (0)
 647
 648
  649 /* Safely get one byte from the source text pointed by SRC which ends
  650    at SRC_END, and set C to that byte.  If there are not enough bytes
  651    in the source, it jumps to `no_more_source'.  If multibytep is
  652    nonzero and C starts a multibyte character, either fold a 0xC0/0xC1
  653    two-byte form into one value, or set C to the negative value of the
  654    character code and record CODING_RESULT_INVALID_SRC.  Caller must
  655    provide: src, src_end, src_base, multibytep, consumed_chars, coding */
  656
  657 #define ONE_MORE_BYTE(c)                                \
  658   do {                                                  \
  659     if (src == src_end)                                 \
  660       {                                                 \
  661         if (src_base < src)                             \
  662           record_conversion_result                      \
  663             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  664         goto no_more_source;                            \
  665       }                                                 \
  666     c = *src++;                                         \
  667     if (multibytep && (c & 0x80))                       \
  668       {                                                 \
  669         if ((c & 0xFE) == 0xC0)                         \
  670           c = ((c & 1) << 6) | *src++;                  \
  671         else                                            \
  672           {                                             \
  673             src--;                                      \
  674             c = - string_char (src, &src, NULL);        \
  675             record_conversion_result                    \
  676               (coding, CODING_RESULT_INVALID_SRC);      \
  677           }                                             \
  678       }                                                 \
  679     consumed_chars++;                                   \
  680   } while (0)
 681
  682 /* Safely get two bytes from the source text pointed by SRC which ends
  683    at SRC_END, and set C1 and C2 to those bytes while skipping the
  684    heading multibyte characters.  If there are not enough bytes in the
  685    source, it jumps to `no_more_source'.  If multibytep is nonzero and
  686    a multibyte character is found for C2, set C2 to -1 (its trailing
  687    bytes are not skipped).  The caller should declare and set these
  688    variables appropriately in advance:
  689         src, src_end, multibytep
  690    It is intended that this macro is used in detect_coding_utf_16.  */
  691
  692 #define TWO_MORE_BYTES(c1, c2)                          \
  693   do {                                                  \
  694     do {                                                \
  695       if (src == src_end)                               \
  696         goto no_more_source;                            \
  697       c1 = *src++;                                      \
  698       if (multibytep && (c1 & 0x80))                    \
  699         {                                               \
  700           if ((c1 & 0xFE) == 0xC0)                      \
  701             c1 = ((c1 & 1) << 6) | *src++;              \
  702           else                                          \
  703             {                                           \
  704               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  705               c1 = -1;                                  \
  706             }                                           \
  707         }                                               \
  708     } while (c1 < 0);                                   \
  709     if (src == src_end)                                 \
  710       goto no_more_source;                              \
  711     c2 = *src++;                                        \
  712     if (multibytep && (c2 & 0x80))                      \
  713       {                                                 \
  714         if ((c2 & 0xFE) == 0xC0)                        \
  715           c2 = ((c2 & 1) << 6) | *src++;                \
  716         else                                            \
  717           c2 = -1;                                      \
  718       }                                                 \
  719   } while (0)
 720
 721
  722 /* Store a byte C in the place pointed by DST and increment DST to the
  723    next free point, and increment PRODUCED_CHARS.  The caller should
  724    assure that C is 0..127, and declare and set the variables `dst'
  725    and `produced_chars' appropriately in advance.
  726 */
  727
  728
  729 #define EMIT_ONE_ASCII_BYTE(c)  \
  730   do {                          \
  731     produced_chars++;           \
  732     *dst++ = (c);               \
  733   } while (0)
  734
  735
  736 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 and C2.  */
  737
  738 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  739   do {                                  \
  740     produced_chars += 2;                \
  741     *dst++ = (c1), *dst++ = (c2);       \
  742   } while (0)
 743
 744
  745 /* Store a byte C in the place pointed by DST and increment DST to the
  746    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  747    nonzero, store in an appropriate multibyte form.  The caller should
  748    declare and set the variables `dst', `multibytep', and
  749    `produced_chars' appropriately in advance.  */
  750
  751 #define EMIT_ONE_BYTE(c)                \
  752   do {                                  \
  753     produced_chars++;                   \
  754     if (multibytep)                     \
  755       {                                 \
  756         int ch = (c);                   \
  757         if (ch >= 0x80)                 \
  758           ch = BYTE8_TO_CHAR (ch);      \
  759         CHAR_STRING_ADVANCE (ch, dst);  \
  760       }                                 \
  761     else                                \
  762       *dst++ = (c);                     \
  763   } while (0)
 764
 765
 766 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 767
 768 #define EMIT_TWO_BYTES(c1, c2)          \
 769   do {                                  \
 770     produced_chars += 2;                \
 771     if (multibytep)                     \
 772       {                                 \
 773         int ch;                         \
 774                                         \
 775         ch = (c1);                      \
 776         if (ch >= 0x80)                 \
 777           ch = BYTE8_TO_CHAR (ch);      \
 778         CHAR_STRING_ADVANCE (ch, dst);  \
 779         ch = (c2);                      \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       {                                 \
 786         *dst++ = (c1);                  \
 787         *dst++ = (c2);                  \
 788       }                                 \
 789   } while (0)
 790
 791
 792 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 793   do {                                  \
 794     EMIT_ONE_BYTE (c1);                 \
 795     EMIT_TWO_BYTES (c2, c3);            \
 796   } while (0)
 797
 798
 799 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 800   do {                                          \
 801     EMIT_TWO_BYTES (c1, c2);                    \
 802     EMIT_TWO_BYTES (c3, c4);                    \
 803   } while (0)
 804
 805
/* Prototypes for static functions.

   The first group is ordered by format: for each of UTF-8, UTF-16,
   ISO-2022, emacs-mule, Shift-JIS, BIG5 and CCL there is a
   detect_coding_XXX, a decode_coding_XXX and an encode_coding_XXX
   declaration; raw-text has only the decode/encode pair.  The
   remaining declarations are the shared helpers (source/destination
   bookkeeping, end-of-line handling, translation tables, annotations
   and the generic decode/encode drivers).  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Helpers that keep coding->source and coding->destination valid and
   grow the destination when it runs out of room.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        EMACS_INT, EMACS_INT);
static unsigned char *alloc_destination (struct coding_system *,
                                         EMACS_INT, unsigned char *);
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, int *,
                                                 unsigned char *);
static int detect_eol (const unsigned char *,
                       EMACS_INT, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static INLINE void produce_charset (struct coding_system *, int *,
                                    EMACS_INT);
static void produce_annotation (struct coding_system *, EMACS_INT);
static int decode_coding (struct coding_system *);
static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *);
static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
                                              struct coding_system *,
                                              int *, EMACS_INT *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static INLINE int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 881
 882 static void
 883 record_conversion_result (struct coding_system *coding,
 884                           enum coding_result_code result)
 885 {
 886   coding->result = result;
 887   switch (result)
 888     {
 889     case CODING_RESULT_INSUFFICIENT_SRC:
 890       Vlast_code_conversion_error = Qinsufficient_source;
 891       break;
 892     case CODING_RESULT_INCONSISTENT_EOL:
 893       Vlast_code_conversion_error = Qinconsistent_eol;
 894       break;
 895     case CODING_RESULT_INVALID_SRC:
 896       Vlast_code_conversion_error = Qinvalid_source;
 897       break;
 898     case CODING_RESULT_INTERRUPT:
 899       Vlast_code_conversion_error = Qinterrupted;
 900       break;
 901     case CODING_RESULT_INSUFFICIENT_MEM:
 902       Vlast_code_conversion_error = Qinsufficient_memory;
 903       break;
 904     case CODING_RESULT_INSUFFICIENT_DST:
 905       /* Don't record this error in Vlast_code_conversion_error
 906          because it happens just temporarily and is resolved when the
 907          whole conversion is finished.  */
 908       break;
 909     case CODING_RESULT_SUCCESS:
 910       break;
 911     default:
 912       Vlast_code_conversion_error = intern ("Unknown error");
 913     }
 914 }
 915
 916 /* This wrapper macro is used to preserve validity of pointers into
 917    buffer text across calls to decode_char, which could cause
 918    relocation of buffers if it loads a charset map, because loading a
 919    charset map allocates large structures.  */
 920 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 921   do {                                                                       \
 922     charset_map_loaded = 0;                                                  \
 923     c = DECODE_CHAR (charset, code);                                         \
 924     if (charset_map_loaded)                                                  \
 925       {                                                                      \
 926         const unsigned char *orig = coding->source;                          \
 927         EMACS_INT offset;                                                    \
 928                                                                              \
 929         coding_set_source (coding);                                          \
 930         offset = coding->source - orig;                                      \
 931         src += offset;                                                       \
 932         src_base += offset;                                                  \
 933         src_end += offset;                                                   \
 934       }                                                                      \
 935   } while (0)
 936
 937
 938 /* If there are at least BYTES length of room at dst, allocate memory
 939    for coding->destination and update dst and dst_end.  We don't have
 940    to take care of coding->source which will be relocated.  It is
 941    handled by calling coding_set_source in encode_coding.  */
 942
 943 #define ASSURE_DESTINATION(bytes)                               \
 944   do {                                                          \
 945     if (dst + (bytes) >= dst_end)                               \
 946       {                                                         \
 947         int more_bytes = charbuf_end - charbuf + (bytes);       \
 948                                                                 \
 949         dst = alloc_destination (coding, more_bytes, dst);      \
 950         dst_end = coding->destination + coding->dst_bytes;      \
 951       }                                                         \
 952   } while (0)
 953
 954
 955 /* Store multibyte form of the character C in P, and advance P to the
 956    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 957    never calls MAYBE_UNIFY_CHAR.  */
 958
 959 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 960   do {                                          \
 961     if ((c) <= MAX_1_BYTE_CHAR)                 \
 962       *(p)++ = (c);                             \
 963     else if ((c) <= MAX_2_BYTE_CHAR)            \
 964       *(p)++ = (0xC0 | ((c) >> 6)),             \
 965         *(p)++ = (0x80 | ((c) & 0x3F));         \
 966     else if ((c) <= MAX_3_BYTE_CHAR)            \
 967       *(p)++ = (0xE0 | ((c) >> 12)),            \
 968         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 969         *(p)++ = (0x80 | ((c) & 0x3F));         \
 970     else if ((c) <= MAX_4_BYTE_CHAR)            \
 971       *(p)++ = (0xF0 | (c >> 18)),              \
 972         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 973         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 974         *(p)++ = (0x80 | (c & 0x3F));           \
 975     else if ((c) <= MAX_5_BYTE_CHAR)            \
 976       *(p)++ = 0xF8,                            \
 977         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 978         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 979         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 980         *(p)++ = (0x80 | (c & 0x3F));           \
 981     else                                        \
 982       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 983   } while (0)
 984
 985
 986 /* Return the character code of character whose multibyte form is at
 987    P, and advance P to the end of the multibyte form.  This is like
 988    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 989
 990 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 991   (!((p)[0] & 0x80)                                             \
 992    ? *(p)++                                                     \
 993    : ! ((p)[0] & 0x20)                                          \
 994    ? ((p) += 2,                                                 \
 995       ((((p)[-2] & 0x1F) << 6)                                  \
 996        | ((p)[-1] & 0x3F)                                       \
 997        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 998    : ! ((p)[0] & 0x10)                                          \
 999    ? ((p) += 3,                                                 \
1000       ((((p)[-3] & 0x0F) << 12)                                 \
1001        | (((p)[-2] & 0x3F) << 6)                                \
1002        | ((p)[-1] & 0x3F)))                                     \
1003    : ! ((p)[0] & 0x08)                                          \
1004    ? ((p) += 4,                                                 \
1005       ((((p)[-4] & 0xF) << 18)                                  \
1006        | (((p)[-3] & 0x3F) << 12)                               \
1007        | (((p)[-2] & 0x3F) << 6)                                \
1008        | ((p)[-1] & 0x3F)))                                     \
1009    : ((p) += 5,                                                 \
1010       ((((p)[-4] & 0x3F) << 18)                                 \
1011        | (((p)[-3] & 0x3F) << 12)                               \
1012        | (((p)[-2] & 0x3F) << 6)                                \
1013        | ((p)[-1] & 0x3F))))
1014
1015
1016 static void
1017 coding_set_source (struct coding_system *coding)
1018 {
1019   if (BUFFERP (coding->src_object))
1020     {
1021       struct buffer *buf = XBUFFER (coding->src_object);
1022
1023       if (coding->src_pos < 0)
1024         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1025       else
1026         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1027     }
1028   else if (STRINGP (coding->src_object))
1029     {
1030       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1031     }
1032   else
1033     {
1034       /* Otherwise, the source is C string and is never relocated
1035          automatically.  Thus we don't have to update anything.  */
1036     }
1037 }
1038
1039 static void
1040 coding_set_destination (struct coding_system *coding)
1041 {
1042   if (BUFFERP (coding->dst_object))
1043     {
1044       if (coding->src_pos < 0)
1045         {
1046           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1047           coding->dst_bytes = (GAP_END_ADDR
1048                                - (coding->src_bytes - coding->consumed)
1049                                - coding->destination);
1050         }
1051       else
1052         {
1053           /* We are sure that coding->dst_pos_byte is before the gap
1054              of the buffer. */
1055           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1056                                  + coding->dst_pos_byte - BEG_BYTE);
1057           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1058                                - coding->destination);
1059         }
1060     }
1061   else
1062     {
1063       /* Otherwise, the destination is C string and is never relocated
1064          automatically.  Thus we don't have to update anything.  */
1065     }
1066 }
1067
1068
1069 static void
1070 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1071 {
1072   coding->destination = (unsigned char *) xrealloc (coding->destination,
1073                                                     coding->dst_bytes + bytes);
1074   coding->dst_bytes += bytes;
1075 }
1076
1077 static void
1078 coding_alloc_by_making_gap (struct coding_system *coding,
1079                             EMACS_INT gap_head_used, EMACS_INT bytes)
1080 {
1081   if (EQ (coding->src_object, coding->dst_object))
1082     {
1083       /* The gap may contain the produced data at the head and not-yet
1084          consumed data at the tail.  To preserve those data, we at
1085          first make the gap size to zero, then increase the gap
1086          size.  */
1087       EMACS_INT add = GAP_SIZE;
1088
1089       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1090       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1091       make_gap (bytes);
1092       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1093       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1094     }
1095   else
1096     {
1097       Lisp_Object this_buffer;
1098
1099       this_buffer = Fcurrent_buffer ();
1100       set_buffer_internal (XBUFFER (coding->dst_object));
1101       make_gap (bytes);
1102       set_buffer_internal (XBUFFER (this_buffer));
1103     }
1104 }
1105
1106
1107 static unsigned char *
1108 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1109                    unsigned char *dst)
1110 {
1111   EMACS_INT offset = dst - coding->destination;
1112
1113   if (BUFFERP (coding->dst_object))
1114     {
1115       struct buffer *buf = XBUFFER (coding->dst_object);
1116
1117       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1118     }
1119   else
1120     coding_alloc_by_realloc (coding, nbytes);
1121   coding_set_destination (coding);
1122   dst = coding->destination + offset;
1123   return dst;
1124 }
1125
1126 /** Macros for annotations.  */
1127
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the annotated text.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1156
/* Maximum length of the header of annotation data, counted in
   charbuf elements.  Of the headers written by the macros below, the
   composition header is the longest: ADD_COMPOSITION_DATA stores five
   elements (-LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and METHOD).  */
#define MAX_ANNOTATION_LENGTH 5
1159
/* Append the three common header elements of an annotation to BUF,
   advancing BUF past them: the negated length LEN, the annotation
   MASK, and NCHARS, the number of characters annotated.  Also mark
   the ambient `coding' as annotated.

   The expansion deliberately does not end with a semicolon; the user
   of the macro supplies it.  That keeps the macro usable as the body
   of an `if' that has an `else' branch.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1167
/* Append the header of a composition annotation to BUF, advancing BUF
   past it: the common part (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  Every argument is parenthesized in the expansion, so BUF
   may be any lvalue expression (for example `*handle') and NBYTES and
   METHOD may be arbitrary expressions.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1174
1175
/* Append a charset annotation to BUF, advancing BUF past it: the
   common part (length 4, mask CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  Every argument is parenthesized in the
   expansion, so BUF may be any lvalue expression (for example
   `*handle') and ID may be an arbitrary expression.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1181
1182 \f
1183 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1184
1185
1186
1187 \f
1188 /*** 3. UTF-8 ***/
1189
1190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1191    Check if a text is encoded in UTF-8.  If it is, return 1, else
1192    return 0.  */
1193
/* Nonzero if the bits of C selected by MASK equal TAG.  */
#define UTF_8_OCTET_TAGGED_P(c, mask, tag) (((c) & (mask)) == (tag))

/* Classification of an octet C by its high bits: a one-octet
   character (any value below 0x80), a trailing octet (10xxxxxx), or
   the leading octet of a 2-, 3-, 4- or 5-octet sequence (110xxxxx,
   1110xxxx, 11110xxx, 111110xx).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_TAGGED_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_TAGGED_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_TAGGED_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_TAGGED_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_TAGGED_P (c, 0xFC, 0xF8)

/* The three octets of the UTF-8 byte order mark (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1204
/* Check whether the source text of CODING can be UTF-8 and record the
   verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   scan starts after the first CODING->head_ascii bytes and walks the
   source one sequence at a time, accepting sequences of one to five
   octets whose trailing octets all satisfy UTF_8_EXTRA_OCTET_P.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is met, or if the source ends in the middle
   of a sequence while CODING_MODE_LAST_BLOCK is set.  Otherwise
   return 1; in that case DETECT_INFO->found gets both the SIG and
   NOSIG categories if the source begins with the byte order mark, and
   otherwise the SIG category is rejected and the NOSIG category is
   marked found only if at least one multi-octet sequence was seen.

   No statement in this function jumps to the label `no_more_source'
   directly; it is reached through ONE_MORE_BYTE (defined earlier in
   this file), which also uses the locals `src', `src_end',
   `src_base', `multibytep' and `consumed_chars' -- presumably it jumps
   there when the source is exhausted; confirm against the macro.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts; the test after
         `no_more_source' uses it to tell a clean end from a
         truncated sequence.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very start of the source counts as
             a byte order mark.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* The loop is left by `break' only on a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* Bytes were read past the start of the current sequence and this
     is the last block: the text ends with a truncated sequence.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character U+FEFF (encoded as EF BB BF) doesn't
         necessarily mean a BOM, so the category without signature is
         kept as a candidate too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1287
1288
/* Decode UTF-8 source text of CODING into character codes appended to
   CODING->charbuf.

   Decoding resumes at CODING->source + CODING->consumed and stores
   one code per decoded character, starting at index
   CODING->charbuf_used, until the charbuf is full or the source runs
   out.  On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated; the first two are based on the
   start of the last character that was not completed (`src_base' and
   `consumed_chars_base').

   A byte that cannot start or continue a valid sequence is stored by
   itself (through BYTE8_TO_CHAR unless it is ASCII) and counted in
   CODING->errors.  Overlong forms and the surrogate range
   0xD800..0xDFFF are treated as invalid.

   The label `no_more_source' is the target of ONE_MORE_BYTE (defined
   earlier in this file), which also uses the locals `src', `src_end',
   `src_base', `multibytep' and `consumed_chars' -- presumably it jumps
   there when the source is exhausted; confirm against the macro.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  Lisp_Object attr, charset_list;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR (see below), or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attr, charset_list);

  /* Unless the coding system says there is no BOM, look at the first
     three bytes.  If they are exactly the byte order mark, leave SRC
     after them so that it is skipped; in every other case rewind SRC
     to where it was.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, the BOM check is not repeated on later calls
     for this coding system.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Mark the start of the character about to be decoded; these
         values are what gets reported as consumed if decoding stops
         before the character is complete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte already read ahead after a
             CR has not been decoded yet, so back up over it.
             NOTE(review): consumed_chars_base is not adjusted here;
             confirm whether ONE_MORE_BYTE counted the read-ahead
             byte.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        /* NOTE(review): in this case SRC_BASE points just after the
           byte used as C1, so the rewinds to SRC_BASE below (at
           `invalid_code' and at `no_more_source') start one byte
           later than C1 -- confirm this is intended.  */
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is negated and stored as the character
             code (presumably ONE_MORE_BYTE returns the negated code
             of a character found in a multibyte source -- confirm).  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS-style end-of-line, read the byte following a CR
             now and keep it for the next iteration.  If the source
             ends here, the CR itself is left unconsumed because
             SRC_BASE still points at it.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the failed sequence and emit just one
         byte from there; the following bytes are decoded afresh on
         the next iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1441
1442
1443 static int
1444 encode_coding_utf_8 (struct coding_system *coding)
1445 {
1446   int multibytep = coding->dst_multibyte;
1447   int *charbuf = coding->charbuf;
1448   int *charbuf_end = charbuf + coding->charbuf_used;
1449   unsigned char *dst = coding->destination + coding->produced;
1450   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1451   int produced_chars = 0;
1452   int c;
1453
1454   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1455     {
1456       ASSURE_DESTINATION (3);
1457       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1458       CODING_UTF_8_BOM (coding) = utf_without_bom;
1459     }
1460
1461   if (multibytep)
1462     {
1463       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1464
1465       while (charbuf < charbuf_end)
1466         {
1467           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1468
1469           ASSURE_DESTINATION (safe_room);
1470           c = *charbuf++;
1471           if (CHAR_BYTE8_P (c))
1472             {
1473               c = CHAR_TO_BYTE8 (c);
1474               EMIT_ONE_BYTE (c);
1475             }
1476           else
1477             {
1478               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1479               for (p = str; p < pend; p++)
1480                 EMIT_ONE_BYTE (*p);
1481             }
1482         }
1483     }
1484   else
1485     {
1486       int safe_room = MAX_MULTIBYTE_LENGTH;
1487
1488       while (charbuf < charbuf_end)
1489         {
1490           ASSURE_DESTINATION (safe_room);
1491           c = *charbuf++;
1492           if (CHAR_BYTE8_P (c))
1493             *dst++ = CHAR_TO_BYTE8 (c);
1494           else
1495             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1496           produced_chars++;
1497         }
1498     }
1499   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1500   coding->produced_char += produced_chars;
1501   coding->produced = dst - coding->destination;
1502   return 0;
1503 }
1504
1505
1506 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1507    Check if a text is encoded in one of UTF-16 based coding systems.
1508    If it is, return 1, else return 0.  */
1509
/* Nonzero if the bits of VAL selected by 0xFC00 are those of a high
   surrogate (0xD800).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (0xD800 == ((val) & 0xFC00))

/* Nonzero if the bits of VAL selected by 0xFC00 are those of a low
   surrogate (0xDC00).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (0xDC00 == ((val) & 0xFC00))
1515
1516
/* Check whether the source text of CODING can be one of the UTF-16
   based coding systems and record the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.

   Return 0 with every UTF-16 category rejected if this is the last
   block and CODING->src_chars is odd, or if the second value of the
   first pair is negative.  If the first pair is FF FE or FE FF, the
   matching signed category and the AUTO category are marked found,
   the other categories except AUTO are rejected, and 1 is returned.
   Otherwise the signed and AUTO categories are rejected and the
   remaining pairs are examined statistically (see below); that path
   returns 0 unless the source runs out first.

   The label `no_more_source' is also the target of TWO_MORE_BYTES
   (defined earlier in this file), which uses the locals `src',
   `src_end' and `multibytep' -- presumably it jumps there when fewer
   than two more bytes are available; confirm against the macro.  In
   that case 1 is returned.  */
static int
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.  E[] and
         O[] flag the values already seen as the first and the second
         byte of a pair; E_NUM and O_NUM count the distinct values,
         starting at 1 for the pair read above.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep reading pairs until every UTF-16 category has been
         rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 distinct first-byte values: reject the big-endian
                 category without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 distinct second-byte values: reject the
                 little-endian category without signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1599
     /* Decode the UTF-16 bytes of CODING->source, starting at offset
        CODING->consumed, and append the resulting code points to
        CODING->charbuf after its first CODING->charbuf_used elements.

        A byte order mark is checked for (and skipped) only when the
        coding system is configured as utf_with_bom; afterwards the BOM
        setting of CODING is changed to utf_without_bom.  A high surrogate
        whose partner has not been read yet is kept in
        CODING_UTF_16_SURROGATE (CODING).

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the state after the last completely
        processed unit.

        ONE_MORE_BYTE is defined elsewhere; presumably it advances SRC,
        counts into CONSUMED_CHARS and jumps to `no_more_source' when the
        source is exhausted -- confirm against the macro.  */
1600 static void
1601 decode_coding_utf_16 (struct coding_system *coding)
1602 {
1603   const unsigned char *src = coding->source + coding->consumed;
1604   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Start of the unit currently being decoded; reported as the
          consumed position on exit.  */
1605   const unsigned char *src_base;
1606   int *charbuf = coding->charbuf + coding->charbuf_used;
1607   /* We may produce at most 3 chars in one loop.  */
1608   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1609   int consumed_chars = 0, consumed_chars_base = 0;
       /* Not referenced by name below; presumably read by ONE_MORE_BYTE.  */
1610   int multibytep = coding->src_multibyte;
1611   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1612   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
       /* Pending high surrogate, or 0 if none.  */
1613   int surrogate = CODING_UTF_16_SURROGATE (coding);
1614   Lisp_Object attr, charset_list;
1615   int eol_dos =
1616     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR, or -1 if there are none.  */
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   CODING_GET_INFO (coding, attr, charset_list);
1620
1621   if (bom == utf_with_bom)
1622     {
1623       int c, c1, c2;
1624
1625       src_base = src;
1626       ONE_MORE_BYTE (c1);
1627       ONE_MORE_BYTE (c2);
           /* Combine in big-endian order; a little-endian mark (FF FE)
              therefore reads as 0xFFFE.  */
1628       c = (c1 << 8) | c2;
1629
1630       if (endian == utf_16_big_endian
1631           ? c != 0xFEFF : c != 0xFFFE)
1632         {
1633           /* The first two bytes are not BOM.  Treat them as bytes
1634              for a normal character.  */
1635           src = src_base;
1636           coding->errors++;
1637         }
           /* Whether or not a mark was found, do not look for one again.  */
1638       CODING_UTF_16_BOM (coding) = utf_without_bom;
1639     }
1640   else if (bom == utf_detect_bom)
1641     {
1642       /* We have already tried to detect BOM and failed in
1643          detect_coding.  */
1644       CODING_UTF_16_BOM (coding) = utf_without_bom;
1645     }
1646
1647   while (1)
1648     {
1649       int c, c1, c2;
1650
           /* Remember where this unit starts so that an incomplete unit
              is not reported as consumed.  */
1651       src_base = src;
1652       consumed_chars_base = consumed_chars;
1653
1654       if (charbuf >= charbuf_end)
1655         {
               /* The two bytes read ahead after a CR have not been decoded
                  yet; give them back to the source.  */
1656           if (byte_after_cr1 >= 0)
1657             src_base -= 2;
1658           break;
1659         }
1660
           /* Take the first byte from the read-ahead if there is one.  */
1661       if (byte_after_cr1 >= 0)
1662         c1 = byte_after_cr1, byte_after_cr1 = -1;
1663       else
1664         ONE_MORE_BYTE (c1);
           /* A negative value is stored negated as one element.
              NOTE(review): presumably ONE_MORE_BYTE returns the negated
              raw byte for certain multibyte-source input -- confirm.  */
1665       if (c1 < 0)
1666         {
1667           *charbuf++ = -c1;
1668           continue;
1669         }
1670       if (byte_after_cr2 >= 0)
1671         c2 = byte_after_cr2, byte_after_cr2 = -1;
1672       else
1673         ONE_MORE_BYTE (c2);
           /* The second byte is not usable: emit the first byte on its own
              (as itself if ASCII, else as a raw 8-bit character), then the
              negated second value.  */
1674       if (c2 < 0)
1675         {
1676           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1677           *charbuf++ = -c2;
1678           continue;
1679         }
           /* Assemble the 16-bit code unit according to the byte order.  */
1680       c = (endian == utf_16_big_endian
1681            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1682
1683       if (surrogate)
1684         {
               /* A high surrogate is pending.  */
1685           if (! UTF_16_LOW_SURROGATE_P (c))
1686             {
                   /* No low surrogate follows: emit the two bytes of the
                      pending surrogate, in source byte order, as separate
                      elements and count an error.  */
1687               if (endian == utf_16_big_endian)
1688                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1689               else
1690                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1691               *charbuf++ = c1;
1692               *charbuf++ = c2;
1693               coding->errors++;
                   /* C either becomes the new pending high surrogate or is
                      emitted as is (the third element of this iteration).  */
1694               if (UTF_16_HIGH_SURROGATE_P (c))
1695                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1696               else
1697                 *charbuf++ = c;
1698             }
1699           else
1700             {
                   /* Combine the pair into a code point at or above
                      0x10000 and clear the pending state.  */
1701               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1702               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1703               *charbuf++ = 0x10000 + c;
1704             }
1705         }
1706       else
1707         {
               /* A high surrogate is only remembered; nothing is emitted
                  until the next code unit is seen.  */
1708           if (UTF_16_HIGH_SURROGATE_P (c))
1709             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1710           else
1711             {
                   /* With DOS end-of-line handling, read the two bytes
                      following a CR now; they are used as C1 and C2 of the
                      next iteration.  If they are not available yet, the
                      CR is not emitted in this call (assuming ONE_MORE_BYTE
                      jumps to `no_more_source').  */
1712               if (eol_dos && c == '\r')
1713                 {
1714                   ONE_MORE_BYTE (byte_after_cr1);
1715                   ONE_MORE_BYTE (byte_after_cr2);
1716                 }
1717               *charbuf++ = c;
1718             }
1719         }
1720     }
1721
       /* Report progress up to the start of the last unit begun.  */
1722  no_more_source:
1723   coding->consumed_char += consumed_chars_base;
1724   coding->consumed = src_base - coding->source;
1725   coding->charbuf_used = charbuf - coding->charbuf;
1726 }
1727
     /* Encode the first CODING->charbuf_used code points of
        CODING->charbuf as UTF-16 and store the bytes in
        CODING->destination, starting at offset CODING->produced.

        A byte order mark is written first unless the BOM setting of
        CODING is utf_without_bom; the setting is then changed to
        utf_without_bom so that the mark is written only once.  A code
        point above MAX_UNICODE_CHAR is replaced by CODING->default_char.
        Code points below 0x10000 produce two bytes, all others a
        surrogate pair of four bytes.

        On the paths visible here the function records
        CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0.  ASSURE_DESTINATION and the
        EMIT_* macros are defined elsewhere; presumably they use DST,
        DST_END, MULTIBYTEP and PRODUCED_CHARS, and ASSURE_DESTINATION may
        leave the function early -- confirm against the macros.  */
1728 static int
1729 encode_coding_utf_16 (struct coding_system *coding)
1730 {
1731   int multibytep = coding->dst_multibyte;
1732   int *charbuf = coding->charbuf;
1733   int *charbuf_end = charbuf + coding->charbuf_used;
1734   unsigned char *dst = coding->destination + coding->produced;
1735   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested before each emission; the largest emission below
          is four bytes.  */
1736   int safe_room = 8;
1737   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1738   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1739   int produced_chars = 0;
1740   Lisp_Object attrs, charset_list;
1741   int c;
1742
1743   CODING_GET_INFO (coding, attrs, charset_list);
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
           /* U+FEFF in the configured byte order.  */
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
           /* Not representable in UTF-16: substitute the default
              character.  */
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
               /* One 16-bit code unit.  */
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
               /* Surrogate pair: C1 carries the upper bits of C - 0x10000,
                  C2 the low ten bits.  */
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xFF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two-byte of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form but in this special format:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
     /* Table indexed by a byte value.  As used below,
        emacs_mule_bytes[B] is the total byte length (1 through 4) of
        the emacs-mule sequence whose first byte is B.  No initializer
        is given here; presumably the table is filled in elsewhere in
        this file -- confirm.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1867    else return 0.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        The first CODING->head_ascii bytes are skipped.  The category is
        added to DETECT_INFO->rejected (and 0 returned) when ESC, SI or SO
        is met, when a sequence starting with 0x80 is too short, when a
        multibyte sequence lacks trailing bytes >= 0xA0, or when the
        source ends inside a sequence in the last block.  Otherwise
        DETECT_INFO->found gets the category if at least one composition
        or multibyte sequence was seen.

        ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
        `no_more_source' when the source is exhausted -- confirm.  */
1868
1869 static int
1870 detect_coding_emacs_mule (struct coding_system *coding,
1871                           struct coding_detection_info *detect_info)
1872 {
1873   const unsigned char *src = coding->source, *src_base;
1874   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced by name below;
          presumably ONE_MORE_BYTE uses them.  */
1875   int multibytep = coding->src_multibyte;
1876   int consumed_chars = 0;
1877   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1878   int found = 0;
1879
1880   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1881   /* A coding system of this category is always ASCII compatible.  */
1882   src += coding->head_ascii;
1883
1884   while (1)
1885     {
           /* Start of the current sequence; compared with SRC at
              `no_more_source' to notice a truncated sequence.  */
1886       src_base = src;
1887       ONE_MORE_BYTE (c);
           /* A negative value is skipped.  NOTE(review): presumably it
              stands for a non-byte element of a multibyte source.  */
1888       if (c < 0)
1889         continue;
1890       if (c == 0x80)
1891         {
1892           /* Perhaps the start of composite character.  We simply skip
1893              it because analyzing it is too heavy for detecting.  But,
1894              at least, we check that the composite character
1895              consists of more than 4 bytes.  */
1896           const unsigned char *src_start;
1897
1898         repeat:
1899           src_start = src;
               /* Skip the bytes >= 0xA0 that follow the 0x80; C ends up as
                  the first byte below 0xA0.  */
1900           do
1901             {
1902               ONE_MORE_BYTE (c);
1903             }
1904           while (c >= 0xA0);
1905
               /* Too short for composition data: reject.  */
1906           if (src - src_start <= 4)
1907             break;
1908           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts right away.  */
1909           if (c == 0x80)
1910             goto repeat;
1911         }
1912
1913       if (c < 0x80)
1914         {
               /* ESC, SI and SO lead to rejection.  */
1915           if (c < 0x20
1916               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1917             break;
1918         }
1919       else
1920         {
               /* C starts a sequence of emacs_mule_bytes[C] bytes; every
                  byte after the first must be >= 0xA0.  */
1921           int more_bytes = emacs_mule_bytes[c] - 1;
1922
1923           while (more_bytes > 0)
1924             {
1925               ONE_MORE_BYTE (c);
1926               if (c < 0xA0)
1927                 {
1928                   src--;        /* Unread the last byte.  */
1929                   break;
1930                 }
1931               more_bytes--;
1932             }
               /* The sequence was cut short by a byte below 0xA0: reject.  */
1933           if (more_bytes != 0)
1934             break;
1935           found = CATEGORY_MASK_EMACS_MULE;
1936         }
1937     }
       /* Reached only via the `break's above.  */
1938   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1939   return 0;
1940
1941  no_more_source:
       /* Bytes were read since the start of the current sequence, and no
          more input will come: the text ends inside a sequence.  */
1942   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1943     {
1944       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945       return 0;
1946     }
1947   detect_info->found |= found;
1948   return 1;
1949 }
1950
1951
1952 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1953    character.  If CMP_STATUS indicates that we must expect MSEQ or
1954    RULE described above, decode it and return the negative value of
1955    the decoded character or rule.  If an invalid byte is found, return
1956    -1.  If SRC is too short, return -2.

        When a value other than -1 or -2 is returned, *NBYTES is set to
        the number of source bytes parsed and *NCHARS to the value of the
        local counter CONSUMED_CHARS (presumably maintained by
        ONE_MORE_BYTE -- confirm).  When a character was decoded and ID is
        non-null, *ID is set to the ID of its charset.  When an old-form
        composition is in progress but a character is not expected
        (CMP_STATUS->state is not COMPOSING_CHAR), a first byte >= 0xA0 is
        returned negated without further decoding and *ID is not set.

        ONE_MORE_BYTE presumably jumps to `no_more_source' when the source
        is exhausted; CODING_DECODE_CHAR is defined elsewhere.  */
1957
1958 static int
1959 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1960                  int *nbytes, int *nchars, int *id,
1961                  struct composition_status *cmp_status)
1962 {
1963   const unsigned char *src_end = coding->source + coding->src_bytes;
1964   const unsigned char *src_base = src;
1965   int multibytep = coding->src_multibyte;
1966   int charset_ID;
1967   unsigned code;
1968   int c;
1969   int consumed_chars = 0;
       /* Nonzero if the character was read in the old-form MSEQ format;
          the result is then returned negated.  */
1970   int mseq_found = 0;
1971
1972   ONE_MORE_BYTE (c);
1973   if (c < 0)
1974     {
           /* A negative value is negated and returned as the character,
              with the charset taken from emacs_mule_charset[0].  */
1975       c = -c;
1976       charset_ID = emacs_mule_charset[0];
1977     }
1978   else
1979     {
           /* A first byte >= 0xA0 is accepted only inside an old-form
              composition; anywhere else it is invalid.  */
1980       if (c >= 0xA0)
1981         {
1982           if (cmp_status->state != COMPOSING_NO
1983               && cmp_status->old_form)
1984             {
1985               if (cmp_status->state == COMPOSING_CHAR)
1986                 {
                       /* MSEQ: 0xA0 followed by ASCII_CODE + 0x80, or a
                          leading code increased by 0x20.  Undo the offset
                          and continue as for a normal sequence.  */
1987                   if (c == 0xA0)
1988                     {
1989                       ONE_MORE_BYTE (c);
1990                       c -= 0x80;
1991                       if (c < 0)
1992                         goto invalid_code;
1993                     }
1994                   else
1995                     c -= 0x20;
1996                   mseq_found = 1;
1997                 }
1998               else
1999                 {
                       /* Not expecting a character: hand the byte back
                          negated for the caller to interpret.  */
2000                   *nbytes = src - src_base;
2001                   *nchars = consumed_chars;
2002                   return -c;
2003                 }
2004             }
2005           else
2006             goto invalid_code;
2007         }
2008
           /* Dispatch on the total length of the sequence that starts
              with C.  Every trailing byte must be >= 0xA0; its low seven
              bits contribute to the code point within the charset.  */
2009       switch (emacs_mule_bytes[c])
2010         {
             /* Leading code plus one position code.  */
2011         case 2:
2012           if ((charset_ID = emacs_mule_charset[c]) < 0)
2013             goto invalid_code;
2014           ONE_MORE_BYTE (c);
2015           if (c < 0xA0)
2016             goto invalid_code;
2017           code = c & 0x7F;
2018           break;
2019
2020         case 3:
2021           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2022               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2023             {
                   /* Private leading code: the second byte selects the
                      charset and one position code follows.  */
2024               ONE_MORE_BYTE (c);
2025               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2026                 goto invalid_code;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code = c & 0x7F;
2031             }
2032           else
2033             {
                   /* Leading code plus two position codes.  */
2034               if ((charset_ID = emacs_mule_charset[c]) < 0)
2035                 goto invalid_code;
2036               ONE_MORE_BYTE (c);
2037               if (c < 0xA0)
2038                 goto invalid_code;
2039               code = (c & 0x7F) << 8;
2040               ONE_MORE_BYTE (c);
2041               if (c < 0xA0)
2042                 goto invalid_code;
2043               code |= c & 0x7F;
2044             }
2045           break;
2046
             /* The second byte selects the charset; two position codes
                follow.  */
2047         case 4:
2048           ONE_MORE_BYTE (c);
2049           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2050             goto invalid_code;
2051           ONE_MORE_BYTE (c);
2052           if (c < 0xA0)
2053             goto invalid_code;
2054           code = (c & 0x7F) << 8;
2055           ONE_MORE_BYTE (c);
2056           if (c < 0xA0)
2057             goto invalid_code;
2058           code |= c & 0x7F;
2059           break;
2060
             /* Single byte: ASCII, or a raw 8-bit byte otherwise.  */
2061         case 1:
2062           code = c;
2063           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2064           break;
2065
2066         default:
2067           abort ();
2068         }
           /* Map (charset, code) to a character; a negative result means
              that the code is not valid in that charset.  */
2069       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2070                           CHARSET_FROM_ID (charset_ID), code, c);
2071       if (c < 0)
2072         goto invalid_code;
2073     }
2074   *nbytes = src - src_base;
2075   *nchars = consumed_chars;
2076   if (id)
2077     *id = charset_ID;
2078   return (mseq_found ? -c : c);
2079
2080  no_more_source:
2081   return -2;
2082
2083  invalid_code:
2084   return -1;
2085 }
2086
2087
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2089
2090 /* Handle these composition sequences ('|': the end of header elements,
2091    BYTES and CHARS >= 0xA0):
2092
2093    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2094    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2095    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2096
2097    and these old forms:
2098
2099    (4) relative composition: 0x80 | MSEQ ... MSEQ
2100    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2101
2102    When the starter 0x80 and the following header elements are found,
2103    this annotation header is produced.
2104
2105         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2106
2107    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2109
2110    Then, upon reading the following elements, these codes are produced
2111    until the composition end is found:
2112
2113    (1) CHAR ... CHAR
2114    (2) ALT ... ALT CHAR ... CHAR
2115    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2116    (4) CHAR ... CHAR
2117    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2118
2119    When the composition end is found, LENGTH and NCHARS in the
2120    annotation header is updated as below:
2121
2122    (1) LENGTH: unchanged, NCHARS: unchanged
2123    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2125    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2126    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2127
2128    If an error is found while composing, the annotation header is
2129    changed to the original composition header (plus filler -1s) as
2130    below:
2131
2132    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2133    (5)          [ 0x80 0xFF -1 -1 -1 ]
2134
2135    and the sequence [ -2 DECODED-RULE ] is changed to the original
2136    byte sequence as below:
2137         o the original byte sequence is B: [ B -1 ]
2138         o the original byte sequence is B1 B2: [ B1 B2 ]
2139
2140    Most of the routines are implemented by macros because many
2141    variables and labels in the caller decode_coding_emacs_mule must be
2142    accessible, and they are usually called just once (thus doesn't
2143    increase the size of compiled object).  */
2144
2145 /* Decode a composition rule represented by C as a component of
2146    composition sequence of Emacs 20 style.  Set RULE to the decoded
2147    rule.

        C is modified: 0xA0 is subtracted from it.  If the result is not
        in 0..80, control jumps to the label `invalid_code' of the
        expanding function.  Otherwise the result is split into
        GREF = C / 9 and NREF = C % 9, a value of 4 in either is replaced
        by 10, and RULE is set to COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2148
2149 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2150   do {                                                  \
2151     int gref, nref;                                     \
2152                                                         \
2153     c -= 0xA0;                                          \
2154     if (c < 0 || c >= 81)                               \
2155       goto invalid_code;                                \
2156     gref = c / 9, nref = c % 9;                         \
2157     if (gref == 4) gref = 10;                           \
2158     if (nref == 4) nref = 10;                           \
2159     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2160   } while (0)
2161
2162
2163 /* Decode a composition rule represented by C and the following byte
2164    at SRC as a component of composition sequence of Emacs 21 style.
2165    Set RULE to the decoded rule.

        GREF is C - 0x20; NREF is the next byte, read into C with
        ONE_MORE_BYTE, minus 0x20.  If either is not in 0..80, control
        jumps to the label `invalid_code' of the expanding function.
        RULE is set to COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2166
2167 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2168   do {                                                  \
2169     int gref, nref;                                     \
2170                                                         \
2171     gref = c - 0x20;                                    \
2172     if (gref < 0 || gref >= 81)                         \
2173       goto invalid_code;                                \
2174     ONE_MORE_BYTE (c);                                  \
2175     nref = c - 0x20;                                    \
2176     if (nref < 0 || nref >= 81)                         \
2177       goto invalid_code;                                \
2178     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2179   } while (0)
2180
2181
2182 /* Start of Emacs 21 style format.  C holds the already-read METHOD
2183    byte (0xF2 + composition method); the next two bytes at SRC are
2184    BYTES (0xA0 + byte length of this composition information) and
2185    CHARS (0xA0 + number of characters composed by this composition).

        Control jumps to `invalid_code' of the expanding function if the
        BYTES byte is negative, if the decoded byte length is below 3, if
        the method is COMPOSITION_RELATIVE and the byte length is not 4,
        or if the decoded character count is not in
        1..MAX_COMPOSITION_COMPONENTS - 1.

        Otherwise *CMP_STATUS is set up for a new-form composition (the
        state is COMPOSING_CHAR for a relative composition and
        COMPOSING_COMPONENT_CHAR for the other methods, NCOMPS is the byte
        length minus 4) and ADD_COMPOSITION_DATA is invoked on CHARBUF
        with the character count, byte length and method.  Uses the
        variables C, CMP_STATUS and CHARBUF of the expanding function.  */
2186
2187 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2188   do {                                                                  \
2189     enum composition_method method = c - 0xF2;                          \
2190     int nbytes, nchars;                                                 \
2191                                                                         \
2192     ONE_MORE_BYTE (c);                                                  \
2193     if (c < 0)                                                          \
2194       goto invalid_code;                                                \
2195     nbytes = c - 0xA0;                                                  \
2196     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2197       goto invalid_code;                                                \
2198     ONE_MORE_BYTE (c);                                                  \
2199     nchars = c - 0xA0;                                                  \
2200     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2201       goto invalid_code;                                                \
2202     cmp_status->old_form = 0;                                           \
2203     cmp_status->method = method;                                        \
2204     if (method == COMPOSITION_RELATIVE)                                 \
2205       cmp_status->state = COMPOSING_CHAR;                               \
2206     else                                                                \
2207       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2209     cmp_status->nchars = nchars;                                        \
2210     cmp_status->ncomps = nbytes - 4;                                    \
2211     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2212   } while (0)
2213
2214
2215 /* Start of Emacs 20 style format for relative composition.

        Marks *CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
        in state COMPOSING_CHAR with no characters or components yet, and
        invokes ADD_COMPOSITION_DATA on CHARBUF with zero character and
        byte counts.  Uses the variables CMP_STATUS and CHARBUF of the
        expanding function.  */
2216
2217 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2218   do {                                                          \
2219     cmp_status->old_form = 1;                                   \
2220     cmp_status->method = COMPOSITION_RELATIVE;                  \
2221     cmp_status->state = COMPOSING_CHAR;                         \
2222     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2223     cmp_status->nchars = cmp_status->ncomps = 0;                \
2224     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2225   } while (0)
2226
2227
2228 /* Start of Emacs 20 style format for rule-base composition.

        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
        method recorded in *CMP_STATUS and passed to ADD_COMPOSITION_DATA
        is COMPOSITION_WITH_RULE.  */
2229
2230 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2231   do {                                                          \
2232     cmp_status->old_form = 1;                                   \
2233     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2234     cmp_status->state = COMPOSING_CHAR;                         \
2235     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2236     cmp_status->nchars = cmp_status->ncomps = 0;                \
2237     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2238   } while (0)
2239
2240
     /* Read the byte that follows a composition starter into C and start
        the kind of composition it announces:

          - 0xF2 + METHOD, with METHOD from COMPOSITION_RELATIVE through
            COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
          - 0xA0..0xBF: Emacs 20 style relative composition; SRC is moved
            back so that the byte is read again as the first component;
          - 0xFF: Emacs 20 style rule-base composition.

        A negative value or any other byte makes control jump to the
        label `invalid_code' of the expanding function.  Uses the
        variables C and SRC of the expanding function, plus whatever the
        macros invoked here use.  */
2241 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2242   do {                                                  \
2243     const unsigned char *current_src = src;             \
2244                                                         \
2245     ONE_MORE_BYTE (c);                                  \
2246     if (c < 0)                                          \
2247       goto invalid_code;                                \
2248     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2249         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2250       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2251     else if (c < 0xA0)                                  \
2252       goto invalid_code;                                \
2253     else if (c < 0xC0)                                  \
2254       {                                                 \
2255         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2256         /* Re-read C as a composition component.  */    \
2257         src = current_src;                              \
2258       }                                                 \
2259     else if (c == 0xFF)                                 \
2260       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2261     else                                                \
2262       goto invalid_code;                                \
2263   } while (0)
2264
     /* Finish the composition described by *CMP_STATUS.  The elements of
        the composition are addressed relative to CHARBUF, starting at
        index -CMP_STATUS->length.

        For an old-form composition, the element at offset 2 from that
        start is set to CMP_STATUS->nchars.  For a new-form composition
        whose method is greater than COMPOSITION_RELATIVE, the first
        element is set to the element at offset 2 minus
        CMP_STATUS->length.  In all cases the state becomes COMPOSING_NO.
        Uses the variables CMP_STATUS and CHARBUF of the expanding
        function.  */
2265 #define EMACS_MULE_COMPOSITION_END()                            \
2266   do {                                                          \
2267     int idx = - cmp_status->length;                             \
2268                                                                 \
2269     if (cmp_status->old_form)                                   \
2270       charbuf[idx + 2] = cmp_status->nchars;                    \
2271     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2272       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2273     cmp_status->state = COMPOSING_NO;                           \
2274   } while (0)
2275
2276
     /* Close the composition described by *CMP_STATUS.  The elements of
        the composition are addressed relative to CHARBUF, starting at
        index -CMP_STATUS->length.

        An old-form composition that already has at least one character
        is kept: the element at offset 2 from the start is set to the
        character count.  Any other composition is abandoned: its first
        elements are overwritten with raw-byte characters for the
        original header bytes, followed by filler.

        CMP_STATUS->state is set to COMPOSING_NO.  The return value is
        the number that the caller adds to its running character count
        (see EMACS_MULE_MAYBE_FINISH_COMPOSITION).  */
2277 static int
2278 emacs_mule_finish_composition (int *charbuf,
2279                                struct composition_status *cmp_status)
2280 {
2281   int idx = - cmp_status->length;
2282   int new_chars;
2283
2284   if (cmp_status->old_form && cmp_status->nchars > 0)
2285     {
2286       charbuf[idx + 2] = cmp_status->nchars;
2287       new_chars = 0;
2288       if (cmp_status->method == COMPOSITION_WITH_RULE
2289           && cmp_status->state == COMPOSING_CHAR)
2290         {
2291           /* The last rule was invalid.  */
               /* Turn the last two elements back into the rule's original
                  byte (stored value + 0xA0) followed by a -1 filler.  */
2292           int rule = charbuf[-1] + 0xA0;
2293
2294           charbuf[-2] = BYTE8_TO_CHAR (rule);
2295           charbuf[-1] = -1;
2296           new_chars = 1;
2297         }
2298     }
2299   else
2300     {
           /* Replace the start of the composition by the raw starter
              byte 0x80.  */
2301       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2302
2303       if (cmp_status->method == COMPOSITION_WITH_RULE)
2304         {
               /* NOTE(review): two raw-byte characters (0x80 and 0xFF) are
                  stored but 1 is returned, and the filler written here is
                  [ -3 0 ] where the format comment earlier in this file
                  shows -1s -- confirm both against the consumer of
                  CHARBUF.  */
2305           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2306           charbuf[idx++] = -3;
2307           charbuf[idx++] = 0;
2308           new_chars = 1;
2309         }
2310       else
2311         {
               /* IDX was already advanced by one above, so these two reads
                  fetch the elements at offsets 2 and 3 from the start of
                  the composition.  Adding 0xA0 restores the original
                  header bytes.  */
2312           int nchars = charbuf[idx + 1] + 0xA0;
2313           int nbytes = charbuf[idx + 2] + 0xA0;
2314
2315           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2317           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2318           charbuf[idx++] = -1;
2319           new_chars = 4;
2320         }
2321     }
2322   cmp_status->state = COMPOSING_NO;
2323   return new_chars;
2324 }
2325
     /* If a composition is in progress (CMP_STATUS->state is not
        COMPOSING_NO), close it with emacs_mule_finish_composition and
        add the value it returns to CHAR_OFFSET.  Uses the variables
        CMP_STATUS, CHARBUF and CHAR_OFFSET of the expanding function.  */
2326 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2327   do {                                                                    \
2328     if (cmp_status->state != COMPOSING_NO)                                \
2329       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2330   } while (0)
2331
2332
/* Decode the emacs-mule encoded bytes of CODING->source, from byte
   offset CODING->consumed up to CODING->src_bytes, appending the
   result to the integer buffer CODING->charbuf at index
   CODING->charbuf_used.

   Decoding stops when the buffer is nearly full (room for three
   annotations is held back), when emacs_mule_char returns -2, or on
   reaching the `no_more_source' label -- presumably the target of
   ONE_MORE_BYTE when the source is exhausted (macro defined
   elsewhere; confirm).  On exit CODING->consumed is set from SRC_BASE
   and CODING->consumed_char is advanced by CONSUMED_CHARS_BASE, i.e.
   up to the start of the last unit that was not completely
   processed, and CODING->charbuf_used is updated.

   A byte sequence that cannot be decoded (the `invalid_code' path) is
   not dropped: its first byte is emitted as a character by itself and
   CODING->errors is incremented.

   Composition state lives in CODING->spec.emacs_mule.cmp_status so
   that a composition may span calls: elements of an unfinished
   composition are saved in its `carryover' array on exit and copied
   back into CHARBUF at the start of the next call.  */
2333 static void
2334 decode_coding_emacs_mule (struct coding_system *coding)
2335 {
2336   const unsigned char *src = coding->source + coding->consumed;
2337   const unsigned char *src_end = coding->source + coding->src_bytes;
2338   const unsigned char *src_base;
2339   int *charbuf = coding->charbuf + coding->charbuf_used;
2340   /* We may produce two annotations (charset and composition) in one
2341      loop and one more charset annotation at the end.  */
2342   int *charbuf_end
2343     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2344   int consumed_chars = 0, consumed_chars_base;
2345   int multibytep = coding->src_multibyte;
2346   Lisp_Object attrs, charset_list;
2347   int char_offset = coding->produced_char;
2348   int last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   int eol_dos =
2351     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   CODING_GET_INFO (coding, attrs, charset_list);
2356
       /* A composition was still open when the previous call returned;
          put its saved elements back at the front of the output.  */
2357   if (cmp_status->state != COMPOSING_NO)
2358     {
2359       int i;
2360
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
           /* Remember where this unit starts so that it can be re-read
              (invalid code) or left unconsumed (end of source).  */
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
2373       if (charbuf >= charbuf_end)
2374         {
               /* CHARBUF is full.  A byte read ahead after CR has not
                  been processed yet, so leave it unconsumed.  */
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
           /* A negative C is stored below as the single character -C;
              0x80 starts a composition (see
              DECODE_EMACS_MULE_COMPOSITION_START, defined elsewhere).
              Either way a composition in progress is finished first.  */
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
               /* With DOS end-of-line conversion, read the byte after a
                  CR ahead of time; it is handed back as C on the next
                  iteration.  */
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           EMACS_INT offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
               /* -1 means an invalid sequence; -2 makes us stop
                  decoding here.  Other negative values are handled by
                  the composition code below.  */
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
               /* The charset changed: annotate the run of characters
                  that just ended, unless that run was ASCII.  */
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
                   /* A rule is stored as the marker -2 followed by the
                      decoded rule value.  */
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
2545     invalid_code:
           /* Finish any open composition, then re-read the first byte
              of the rejected sequence and emit it alone (ASCII as is,
              anything else via BYTE8_TO_CHAR), counting one error.  */
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552       coding->errors++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
               /* More input may follow: take the elements of the
                  unfinished composition back out of CHARBUF and save
                  them for the next call.  */
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
       /* Annotate the final run of non-ASCII characters.  */
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES the leading code(s) that introduce a character of
   the charset whose emacs-mule ID is ID.  CODES must have room for
   two bytes.

   An ID below 0xA0 is its own, single leading code; CODES[1] is then
   set to 0 to say that there is no second code.  A larger ID takes
   two codes: a prefix byte selected by the range ID falls in (0x9A
   for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D for
   0xF5 and above), followed by ID itself.

   The expansion has no trailing semicolon, so an invocation followed
   by `;' is exactly one statement and can be the unbraced body of an
   `if' that has an `else'.  Both arguments are parenthesized in the
   expansion; ID may be evaluated more than once, so it should have no
   side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2590
2591
/* Encode the characters in CODING->charbuf (CODING->charbuf_used
   elements) in emacs-mule form, writing bytes to CODING->destination
   starting at offset CODING->produced.  The charset list of the
   coding system is first forced to Vemacs_mule_charset_list.

   A negative element of the buffer introduces an annotation.  A
   charset annotation selects the charset to try first for the
   characters that follow, provided it is a member of the charset
   list; composition annotations are skipped (not implemented).  Any
   other annotation type aborts.

   ASCII characters and eight-bit characters (CHAR_BYTE8_P) are
   emitted as single bytes.  Every other character is emitted as the
   leading code(s) of its charset followed by one or two code bytes
   with the high bit set.  A character that belongs to no charset in
   the list is replaced by CODING->default_char.

   After the loop the function records CODING_RESULT_SUCCESS, updates
   CODING->produced_char and CODING->produced, and returns 0.
   ASSURE_DESTINATION and the EMIT_* macros are defined elsewhere;
   presumably they rely on the locals DST, DST_END, PRODUCED_CHARS and
   MULTIBYTEP declared here -- confirm before renaming any of them.  */
2592 static int
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
2595   int multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2600   int safe_room = 8;
2601   int produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       CODING_ATTR_CHARSET_LIST (attrs)
2610         = charset_list = Vemacs_mule_charset_list;
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points one element
                      past the length element here, so index 3 is the
                      fourth element after it.  Confirm this against
                      the layout written by ADD_CHARSET_DATA (defined
                      elsewhere).  */
2627               preferred_charset_id = charbuf[3];
                   /* Only honor a preference for a charset this coding
                      system can actually use.  */
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               abort ();
2635             }
               /* C is the negated annotation length; skip what is left
                  of the annotation (presumably the length counts the
                  length element itself, which was consumed above).  */
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
               /* Try the preferred charset first, and fall back to
                  searching the whole list.  */
2655           if (preferred_charset_id >= 0)
2656             {
2657               charset = CHARSET_FROM_ID (preferred_charset_id);
2658               if (CHAR_CHARSET_P (c, charset))
2659                 code = ENCODE_CHAR (charset, c);
2660               else
2661                 charset = char_charset (c, charset_list, &code);
2662             }
2663           else
2664             charset = char_charset (c, charset_list, &code);
2665           if (! charset)
2666             {
                   /* C cannot be encoded; substitute the default
                      character.  */
2667               c = coding->default_char;
2668               if (ASCII_CHAR_P (c))
2669                 {
2670                   EMIT_ONE_ASCII_BYTE (c);
2671                   continue;
2672                 }
                   /* NOTE(review): the result is not checked again.  If
                      the default character had no charset in
                      CHARSET_LIST either, CHARSET would be null when
                      it is used just below -- confirm that this cannot
                      happen.  */
2673               charset = char_charset (c, charset_list, &code);
2674             }
2675           dimension = CHARSET_DIMENSION (charset);
2676           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2677           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2678           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2679           if (leading_codes[1])
2680             EMIT_ONE_BYTE (leading_codes[1]);
               /* The code bytes are emitted with their high bit set.  */
2681           if (dimension == 1)
2682             EMIT_ONE_BYTE (code | 0x80);
2683           else
2684             {
2685               code |= 0x8080;
2686               EMIT_ONE_BYTE (code >> 8);
2687               EMIT_ONE_BYTE (code & 0xFF);
2688             }
2689         }
2690     }
2691   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2692   coding->produced_char += produced_chars;
2693   coding->produced = dst - coding->destination;
2694   return 0;
2695 }
2696
2697 \f
2698 /*** 7. ISO2022 handlers ***/
2699
2700 /* The following note describes the coding system ISO2022 briefly.
2701    Since the intention of this note is to help understand the
2702    functions in this file, some parts are NOT ACCURATE or are OVERLY
2703    SIMPLIFIED.  For thorough understanding, please refer to the
2704    original document of ISO2022.  This is equivalent to the standard
2705    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2706
2707    ISO2022 provides many mechanisms to encode several character sets
2708    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2709    is encoded using bytes less than 128.  This may make the encoded
2710    text a little bit longer, but the text passes more easily through
2711    several types of gateway, some of which strip off the MSB (Most
2712    Significant Bit).
2713
2714    There are two kinds of character sets: control character sets and
2715    graphic character sets.  The former contain control characters such
2716    as `newline' and `escape' to provide control functions (control
2717    functions are also provided by escape sequences).  The latter
2718    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2719    two control character sets and many graphic character sets.
2720
2721    Graphic character sets are classified into one of the following
2722    four classes, according to the number of bytes (DIMENSION) and
2723    number of characters in one dimension (CHARS) of the set:
2724    - DIMENSION1_CHARS94
2725    - DIMENSION1_CHARS96
2726    - DIMENSION2_CHARS94
2727    - DIMENSION2_CHARS96
2728
2729    In addition, each character set is assigned an identification tag,
2730    unique for each set, called the "final character" (denoted as <F>
2731    hereafter).  The <F> of each character set is decided by ECMA(*)
2732    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2733    (0x30..0x3F are for private use only).
2734
2735    Note (*): ECMA = European Computer Manufacturers Association
2736
2737    Here are examples of graphic character sets [NAME(<F>)]:
2738         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2739         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2740         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2741         o DIMENSION2_CHARS96 -- none for the moment
2742
2743    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2744         C0 [0x00..0x1F] -- control character plane 0
2745         GL [0x20..0x7F] -- graphic character plane 0
2746         C1 [0x80..0x9F] -- control character plane 1
2747         GR [0xA0..0xFF] -- graphic character plane 1
2748
2749    A control character set is directly designated and invoked to C0 or
2750    C1 by an escape sequence.  The most common case is that:
2751    - ISO646's  control character set is designated/invoked to C0, and
2752    - ISO6429's control character set is designated/invoked to C1,
2753    and usually these designations/invocations are omitted in encoded
2754    text.  In a 7-bit environment, only C0 can be used, and a control
2755    character for C1 is encoded by an appropriate escape sequence to
2756    fit into the environment.  All control characters for C1 are
2757    defined to have corresponding escape sequences.
2758
2759    A graphic character set is at first designated to one of four
2760    graphic registers (G0 through G3), then these graphic registers are
2761    invoked to GL or GR.  These designations and invocations can be
2762    done independently.  The most common case is that G0 is invoked to
2763    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2764    these invocations and designations are omitted in encoded text.
2765    In a 7-bit environment, only GL can be used.
2766
2767    When a graphic character set of CHARS94 is invoked to GL, codes
2768    0x20 and 0x7F of the GL area work as control characters SPACE and
2769    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2770    be used.
2771
2772    There are two ways of invocation: locking-shift and single-shift.
2773    With locking-shift, the invocation lasts until the next different
2774    invocation, whereas with single-shift, the invocation affects the
2775    following character only and doesn't affect the locking-shift
2776    state.  Invocations are done by the following control characters or
2777    escape sequences:
2778
2779    ----------------------------------------------------------------------
2780    abbrev  function                  cntrl escape seq   description
2781    ----------------------------------------------------------------------
2782    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2783    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2784    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2785    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2786    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2787    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2788    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2789    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2790    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2791    ----------------------------------------------------------------------
2792    (*) These are not used by any known coding system.
2793
2794    Control characters for these functions are defined by macros
2795    ISO_CODE_XXX in `coding.h'.
2796
2797    Designations are done by the following escape sequences:
2798    ----------------------------------------------------------------------
2799    escape sequence      description
2800    ----------------------------------------------------------------------
2801    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2802    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2803    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2804    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2805    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2806    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2807    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2808    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2809    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2810    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2811    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2812    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2813    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2814    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2815    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2816    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2817    ----------------------------------------------------------------------
2818
2819    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2820    of dimension 1, chars 94, and final character <F>, etc...
2821
2822    Note (*): Although these designations are not allowed in ISO2022,
2823    Emacs accepts them on decoding, and produces them on encoding
2824    CHARS96 character sets in a coding system which is characterized as
2825    7-bit environment, non-locking-shift, and non-single-shift.
2826
2827    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2828    '(' must be omitted.  We refer to this as "short-form" hereafter.
2829
2830    Now you may notice that there are a lot of ways of encoding the
2831    same multilingual text in ISO2022.  Actually, there exist many
2832    coding systems such as Compound Text (used in X11's inter-client
2833    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2834    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2835    localized platforms), and all of these are variants of ISO2022.
2836
2837    In addition to the above, Emacs handles two more kinds of escape
2838    sequences: ISO6429's direction specification and Emacs' private
2839    sequence for specifying character composition.
2840
2841    ISO6429's direction specification takes the following form:
2842         o CSI ']'      -- end of the current direction
2843         o CSI '0' ']'  -- end of the current direction
2844         o CSI '1' ']'  -- start of left-to-right text
2845         o CSI '2' ']'  -- start of right-to-left text
2846    The control character CSI (0x9B: control sequence introducer) is
2847    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2848
2849    Character composition specification takes the following form:
2850         o ESC '0' -- start relative composition
2851         o ESC '1' -- end composition
2852         o ESC '2' -- start rule-base composition (*)
2853         o ESC '3' -- start relative composition with alternate chars  (**)
2854         o ESC '4' -- start rule-base composition with alternate chars  (**)
2855   Since these are not standard escape sequences of any ISO standard,
2856   the use of them with these meanings is restricted to Emacs only.
2857
2858   (*) This form is used only in Emacs 20.7 and older versions,
2859   but newer versions can safely decode it.
2860   (**) This form is used only in Emacs 21.1 and newer versions,
2861   and older versions can't decode it.
2862
2863   Here's a list of example usages of these composition escape
2864   sequences (categorized by `enum composition_method').
2865
2866   COMPOSITION_RELATIVE:
2867         ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE:
2869         ESC 2 CHAR [ RULE CHAR ] ESC 1
2870   COMPOSITION_WITH_ALTCHARS:
2871         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2872   COMPOSITION_WITH_RULE_ALTCHARS:
2873         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2874
/* Table of 256 ISO code classes -- presumably indexed by byte value
   and filled in elsewhere in this file; TODO confirm.  */
2875 enum iso_code_class_type iso_code_class[256];
2876
/* Return nonzero if the charset whose ID is ID is safe for CODING.

   CODING->safe_charsets is a byte table with
   CODING->max_charset_id + 1 entries, one per charset ID, in which
   the value 255 marks a charset as not safe.  An ID outside the table
   is never safe.  That includes a negative ID: callers may pass on a
   charset-table lookup result without checking it for the negative
   "no such charset" value, and indexing the table with it would read
   before its start.

   ID is evaluated more than once, so it should have no side
   effects.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
2880
2881 static void
2882 setup_iso_safe_charsets (Lisp_Object attrs)
2883 {
2884   Lisp_Object charset_list, safe_charsets;
2885   Lisp_Object request;
2886   Lisp_Object reg_usage;
2887   Lisp_Object tail;
2888   int reg94, reg96;
2889   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2890   int max_charset_id;
2891
2892   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2893   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2894       && ! EQ (charset_list, Viso_2022_charset_list))
2895     {
2896       CODING_ATTR_CHARSET_LIST (attrs)
2897         = charset_list = Viso_2022_charset_list;
2898       ASET (attrs, coding_attr_safe_charsets, Qnil);
2899     }
2900
2901   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2902     return;
2903
2904   max_charset_id = 0;
2905   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2906     {
2907       int id = XINT (XCAR (tail));
2908       if (max_charset_id < id)
2909         max_charset_id = id;
2910     }
2911
2912   safe_charsets = make_uninit_string (max_charset_id + 1);
2913   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2914   request = AREF (attrs, coding_attr_iso_request);
2915   reg_usage = AREF (attrs, coding_attr_iso_usage);
2916   reg94 = XINT (XCAR (reg_usage));
2917   reg96 = XINT (XCDR (reg_usage));
2918
2919   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2920     {
2921       Lisp_Object id;
2922       Lisp_Object reg;
2923       struct charset *charset;
2924
2925       id = XCAR (tail);
2926       charset = CHARSET_FROM_ID (XINT (id));
2927       reg = Fcdr (Fassq (id, request));
2928       if (! NILP (reg))
2929         SSET (safe_charsets, XINT (id), XINT (reg));
2930       else if (charset->iso_chars_96)
2931         {
2932           if (reg96 < 4)
2933             SSET (safe_charsets, XINT (id), reg96);
2934         }
2935       else
2936         {
2937           if (reg94 < 4)
2938             SSET (safe_charsets, XINT (id), reg94);
2939         }
2940     }
2941   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2942 }
2943
2944
2945 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2946    Check if a text is encoded in one of ISO-2022 based coding systems.
2947    If it is, return 1, else return 0.

        In terms of the code below: the bytes of CODING->source after the
        first CODING->head_ascii ones are scanned, and escape sequences,
        shift codes and 8-bit bytes move ISO categories into the local
        FOUND and REJECTED masks.  If every ISO category gets rejected,
        scanning stops, CATEGORY_MASK_ISO is added to
        DETECT_INFO->rejected, and 0 is returned.  Otherwise control ends
        up at `no_more_source' -- presumably via ONE_MORE_BYTE once the
        source is exhausted (macro defined elsewhere; confirm) -- where
        REJECTED and the categories found but not rejected are merged
        into DETECT_INFO and 1 is returned.  */
2948
2949 static int
2950 detect_coding_iso_2022 (struct coding_system *coding,
2951                         struct coding_detection_info *detect_info)
2952 {
2953   const unsigned char *src = coding->source, *src_base = src;
2954   const unsigned char *src_end = coding->source + coding->src_bytes;
2955   int multibytep = coding->src_multibyte;
2956   int single_shifting = 0;
2957
2958   /* FIXME: Does ID need to be initialized here?  The "End of composition"
2959      code below does not initialize ID even though ID is used
2960      afterwards, and perhaps that is a bug.  */
2961   int id = 0;
2962
2963   int c, c1;
2964   int consumed_chars = 0;
2965   int i;
2966   int rejected = 0;
2967   int found = 0;
2968   int composition_count = -1;
2969
2970   detect_info->checked |= CATEGORY_MASK_ISO;
2971
       /* For every ISO category that has a coding system assigned, cache
          the safe-charsets table and its largest valid index, which
          SAFE_CHARSET_P consults below.  */
2972   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2973     {
2974       struct coding_system *this = &(coding_categories[i]);
2975       Lisp_Object attrs, val;
2976
2977       if (this->id < 0)
2978         continue;
2979       attrs = CODING_ID_ATTRS (this->id);
2980       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2981           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2982         setup_iso_safe_charsets (attrs);
2983       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2984       this->max_charset_id = SCHARS (val) - 1;
2985       this->safe_charsets = SDATA (val);
2986     }
2987
2988   /* A coding system of this category is always ASCII compatible.  */
2989   src += coding->head_ascii;
2990
2991   while (rejected != CATEGORY_MASK_ISO)
2992     {
2993       src_base = src;
2994       ONE_MORE_BYTE (c);
2995       switch (c)
2996         {
2997         case ISO_CODE_ESC:
2998           if (inhibit_iso_escape_detection)
2999             break;
3000           single_shifting = 0;
3001           ONE_MORE_BYTE (c);
3002           if (c >= '(' && c <= '/')
3003             {
3004               /* Designation sequence for a charset of dimension 1.  */
3005               ONE_MORE_BYTE (c1);
3006               if (c1 < ' ' || c1 >= 0x80
3007                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3008                 /* Invalid designation sequence.  Just ignore.  */
3009                 break;
3010             }
3011           else if (c == '$')
3012             {
3013               /* Designation sequence for a charset of dimension 2.  */
3014               ONE_MORE_BYTE (c);
3015               if (c >= '@' && c <= 'B')
3016                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3017                 id = iso_charset_table[1][0][c];
3018               else if (c >= '(' && c <= '/')
3019                 {
3020                   ONE_MORE_BYTE (c1);
3021                   if (c1 < ' ' || c1 >= 0x80
3022                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3023                     /* Invalid designation sequence.  Just ignore.  */
3024                     break;
3025                 }
3026               else
3027                 /* Invalid designation sequence.  Just ignore it.  */
3028                 break;
3029             }
3030           else if (c == 'N' || c == 'O')
3031             {
3032               /* ESC <Fe> for SS2 or SS3.  */
3033               single_shifting = 1;
3034               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3035               break;
3036             }
3037           else if (c == '1')
3038             {
3039               /* End of composition.  */
3040               if (composition_count < 0
3041                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3042                 /* Invalid */
3043                 break;
3044               composition_count = -1;
3045               found |= CATEGORY_MASK_ISO;
                   /* NOTE(review): there is no `break' here, so control
                      continues into the designation checks below with
                      ID left over from an earlier designation (or its
                      initial 0).  See the FIXME at the declaration of
                      ID.  */
3046             }
3047           else if (c >= '0' && c <= '4')
3048             {
3049               /* ESC <Fp> for start/end composition.  */
3050               composition_count = 0;
3051               break;
3052             }
3053           else
3054             {
3055               /* Invalid escape sequence.  Just ignore it.  */
3056               break;
3057             }
3058
3059           /* We found a valid designation sequence for CHARSET.  */
3060           rejected |= CATEGORY_MASK_ISO_8BIT;
3061           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3062                               id))
3063             found |= CATEGORY_MASK_ISO_7;
3064           else
3065             rejected |= CATEGORY_MASK_ISO_7;
3066           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3067                               id))
3068             found |= CATEGORY_MASK_ISO_7_TIGHT;
3069           else
3070             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3071           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3072                               id))
3073             found |= CATEGORY_MASK_ISO_7_ELSE;
3074           else
3075             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3076           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3077                               id))
3078             found |= CATEGORY_MASK_ISO_8_ELSE;
3079           else
3080             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3081           break;
3082
3083         case ISO_CODE_SO:
3084         case ISO_CODE_SI:
3085           /* Locking shift out/in.  */
3086           if (inhibit_iso_escape_detection)
3087             break;
3088           single_shifting = 0;
3089           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3090           break;
3091
3092         case ISO_CODE_CSI:
3093           /* Control sequence introducer.  */
3094           single_shifting = 0;
3095           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3096           found |= CATEGORY_MASK_ISO_8_ELSE;
3097           goto check_extra_latin;
3098
3099         case ISO_CODE_SS2:
3100         case ISO_CODE_SS3:
3101           /* Single shift.   */
3102           if (inhibit_iso_escape_detection)
3103             break;
3104           single_shifting = 0;
3105           rejected |= CATEGORY_MASK_ISO_7BIT;
3106           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3107               & CODING_ISO_FLAG_SINGLE_SHIFT)
3108             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3109           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3110               & CODING_ISO_FLAG_SINGLE_SHIFT)
3111             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3112           if (single_shifting)
3113             break;
3114           goto check_extra_latin;
3115
3116         default:
3117           if (c < 0)
3118             continue;
3119           if (c < 0x80)
3120             {
3121               if (composition_count >= 0)
3122                 composition_count++;
3123               single_shifting = 0;
3124               break;
3125             }
3126           if (c >= 0xA0)
3127             {
3128               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               /* Check the length of succeeding codes of the range
3131                  0xA0..0xFF.  If the byte length is even, we include
3132                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3133                  only when we are not single shifting.  */
3134               if (! single_shifting
3135                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3136                 {
3137                   int len = 1;
3138                   while (src < src_end)
3139                     {
3140                       src_base = src;
3141                       ONE_MORE_BYTE (c);
3142                       if (c < 0xA0)
3143                         {
                               /* Not part of the run; put it back.  */
3144                           src = src_base;
3145                           break;
3146                         }
3147                       len++;
3148                     }
3149
                       /* An odd-length run rules out the two-byte
                          category only if the run ended before the end
                          of the source; a run cut off by the end of the
                          source is counted like an even one.  */
3150                   if (len & 1 && src < src_end)
3151                     {
3152                       rejected |= CATEGORY_MASK_ISO_8_2;
3153                       if (composition_count >= 0)
3154                         composition_count += len;
3155                     }
3156                   else
3157                     {
3158                       found |= CATEGORY_MASK_ISO_8_2;
3159                       if (composition_count >= 0)
3160                         composition_count += len / 2;
3161                     }
3162                 }
3163               break;
3164             }
             /* Reached by falling through for the remaining bytes (those
                in 0x80..0x9F that are not handled by a case above), and
                by `goto' from the CSI and single-shift cases.  The byte
                is acceptable only if Vlatin_extra_code_table is a
                vector with a non-nil entry for it; otherwise every ISO
                category is rejected.  */
3165         check_extra_latin:
3166           single_shifting = 0;
3167           if (! VECTORP (Vlatin_extra_code_table)
3168               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3169             {
3170               rejected = CATEGORY_MASK_ISO;
3171               break;
3172             }
3173           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3174               & CODING_ISO_FLAG_LATIN_EXTRA)
3175             found |= CATEGORY_MASK_ISO_8_1;
3176           else
3177             rejected |= CATEGORY_MASK_ISO_8_1;
3178           rejected |= CATEGORY_MASK_ISO_8_2;
3179         }
3180     }
       /* The loop ended because every ISO category was rejected.  */
3181   detect_info->rejected |= CATEGORY_MASK_ISO;
3182   return 0;
3183
3184  no_more_source:
3185   detect_info->rejected |= rejected;
3186   detect_info->found |= (found & ~rejected);
3187   return 1;
3188 }
3189
3190
3191 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3192    escape sequence should be kept.

        REG selects the designation slot that is updated; DIM, CHARS_96
        and FINAL are the keys given to ISO_CHARSET_TABLE to find the
        charset ID.  The variable `coding' comes from the enclosing
        scope.  CHARS_96 must be an lvalue, and REG, FINAL and CHARS_96
        are expanded more than once, so pass side-effect-free
        expressions.

        If FINAL is outside '0'..127, if the table lookup yields a
        negative ID, or if SAFE_CHARSET_P rejects the charset, the
        designation of REG becomes -2, CHARS_96 becomes -1, and the
        macro stops there.  Otherwise the charset ID is stored as the
        designation of REG, after replacing JISX0201-Roman by ASCII when
        CODING_ISO_FLAG_USE_ROMAN is set, or JISX0208.1978 by JISX0208
        when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3193 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3194   do {                                                                  \
3195     int id, prev;                                                       \
3196                                                                         \
3197     if (final < '0' || final >= 128                                     \
3198         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3199         || !SAFE_CHARSET_P (coding, id))                                \
3200       {                                                                 \
3201         CODING_ISO_DESIGNATION (coding, reg) = -2; /* mark invalid */   \
3202         chars_96 = -1;                                                  \
3203         break;                  /* leaves the do/while, i.e. the macro */ \
3204       }                                                                 \
3205     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3206     if (id == charset_jisx0201_roman)                                   \
3207       {                                                                 \
3208         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3209           id = charset_ascii;                                           \
3210       }                                                                 \
3211     else if (id == charset_jisx0208_1978)                               \
3212       {                                                                 \
3213         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3214           id = charset_jisx0208;                                        \
3215       }                                                                 \
3216     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3217     /* If there was an invalid designation to REG previously, and this  \
3218        designation is ASCII to REG, we should keep this designation     \
3219        sequence.  */                                                    \
3220     if (prev == -2 && id == charset_ascii)                              \
3221       chars_96 = -1;                                                    \
3222   } while (0)
3223
3224
3225 /* Handle these composition sequences (ALT: alternate char):
3226
3227    (1) relative composition: ESC 0 CHAR ... ESC 1
3228    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3229    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3230    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3231
3232    When the start sequence (ESC 0/2/3/4) is found, this annotation
3233    header is produced.
3234
3235         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3236
3237    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3238    produced until the end sequence (ESC 1) is found:
3239
3240    (1) CHAR ... CHAR
3241    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3242    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3243    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3244
3245    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3246    annotation header are updated as below:
3247
3248    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3249    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3250    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3251    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3252
3253    If an error is found while composing, the annotation header is
3254    changed to:
3255
3256         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3257
3258    and the sequence [ -2 DECODED-RULE ] is changed to the original
3259    byte sequence as below:
3260         o the original byte sequence is B: [ B -1 ]
3261         o the original byte sequence is B1 B2: [ B1 B2 ]
3262    and the sequence [ -1 -1 ] is changed to the original byte
3263    sequence:
3264         [ ESC '0' ]
3265 */
3266
3267 /* Decode a composition rule C1 and maybe one more byte from the
3268    source, and set RULE to the encoded composition rule, NBYTES to the
3269    length of the composition rule.  If the rule is invalid, set RULE
3270    to some negative value.

        C1 is not a macro argument: the first byte is read from the
        variable `c1' of the enclosing scope, and the second byte, when
        one is needed, is fetched with ONE_MORE_BYTE.  RULE and NBYTES
        must be lvalues.  When C1 is below 32, RULE is left negative and
        NBYTES is not assigned at all.

        Old format: one byte.  C1 - 32 (0..80) is GREF * 9 + NREF, and a
        GREF or NREF equal to 4 is remapped to 10 before encoding.

        New format: two bytes.  C1 - 32 - 81 and the second byte minus 32
        are passed to COMPOSITION_ENCODE_RULE, and 0x100 is added to a
        non-negative result; ENCODE_COMPOSITION_RULE tests that bit to
        decide which byte format to restore.  */
3271
3272 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3273   do {                                                                  \
3274     rule = c1 - 32;                                                     \
3275     if (rule < 0)                                                       \
3276       break;                                                            \
3277     if (rule < 81)              /* old format (before ver.21) */        \
3278       {                                                                 \
3279         int gref = (rule) / 9;                                          \
3280         int nref = (rule) % 9;                                          \
3281         if (gref == 4) gref = 10;                                       \
3282         if (nref == 4) nref = 10;                                       \
3283         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3284         nbytes = 1;                                                     \
3285       }                                                                 \
3286     else                        /* new format (after ver.21) */         \
3287       {                                                                 \
3288         int b;                                                          \
3289                                                                         \
3290         ONE_MORE_BYTE (b);                                              \
3291         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3292         if (rule >= 0)                                                  \
3293           rule += 0x100;   /* to distinguish it from the old format */  \
3294         nbytes = 2;                                                     \
3295       }                                                                 \
3296   } while (0)
3297
     /* Turn the decoded composition rule RULE back into its original
        byte form, writing the result over charbuf[idx] and
        charbuf[idx + 1].  This is the reverse of DECODE_COMPOSITION_RULE.

        The macro uses `charbuf', `idx' and `new_chars' from the
        enclosing scope (finish_composition is the caller in this file),
        so those names must not be changed there.  RULE is expanded
        several times without parentheses.

        A RULE below 0x100 is restored in the old one-byte format as
        [ B -1 ] and adds 1 to new_chars; any other RULE is restored in
        the new two-byte format as [ B1 B2 ] and adds 2 to new_chars.

        GREF and NREF are computed, and RULE is tested, before either
        slot is written, because RULE may itself be charbuf[idx + 1], as
        it is in finish_composition.  */
3298 #define ENCODE_COMPOSITION_RULE(rule)                           \
3299   do {                                                          \
3300     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3301                                                                 \
3302     if (rule < 0x100)           /* old format */                \
3303       {                                                         \
3304         if (gref == 10) gref = 4;                               \
3305         if (nref == 10) nref = 4;                               \
3306         charbuf[idx] = 32 + gref * 9 + nref;                    \
3307         charbuf[idx + 1] = -1;                                  \
3308         new_chars++;                                            \
3309       }                                                         \
3310     else                                /* new format */        \
3311       {                                                         \
3312         charbuf[idx] = 32 + 81 + gref;                          \
3313         charbuf[idx + 1] = 32 + nref;                           \
3314         new_chars += 2;                                         \
3315       }                                                         \
3316   } while (0)
3317
3318 /* Finish the current composition as invalid.

        The CMP_STATUS->length elements that precede CHARBUF (indices
        -length .. -1) hold the annotation header followed by the data
        stored so far.  They are rewritten in place so that they spell
        out the original byte sequence again: the header becomes the
        start sequence ESC '0'/'2'/'3'/'4', each [ -2 DECODED-RULE ] pair
        becomes the rule's bytes, and each [ -1 -1 ] pair becomes
        ESC '0'.

        CMP_STATUS->state is reset to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus the number of elements restored for rules
        and for ESC '0'; MAYBE_FINISH_COMPOSITION adds it to
        char_offset.  */
3319
3320 static int finish_composition (int *, struct composition_status *);
3321
3322 static int
3323 finish_composition (int *charbuf, struct composition_status *cmp_status)
3324 {
       /* Index of the first header element, relative to CHARBUF.  The
          names `idx' and `new_chars' are referenced by
          ENCODE_COMPOSITION_RULE and must stay as they are.  */
3325   int idx = - cmp_status->length;
3326   int new_chars;
3327
3328   /* Recover the original ESC sequence */
3329   charbuf[idx++] = ISO_CODE_ESC;
3330   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3331                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3332                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3333                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3334                     : '4');
       /* The three remaining header slots are overwritten with -2, 0, -1.
          NOTE(review): presumably filler that the consumer of charbuf
          skips -- confirm against the code that reads charbuf.  */
3335   charbuf[idx++] = -2;
3336   charbuf[idx++] = 0;
3337   charbuf[idx++] = -1;
3338   new_chars = cmp_status->nchars;
       /* A relative composition stores plain CHARs only, so there is
          nothing more to restore for it.  */
3339   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3340     for (; idx < 0; idx++)
3341       {
3342         int elt = charbuf[idx];
3343
3344         if (elt == -2)
3345           {
                 /* [ -2 DECODED-RULE ]: rewrite both slots, then step
                    over the second one.  */
3346             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3347             idx++;
3348           }
3349         else if (elt == -1)
3350           {
                 /* [ -1 -1 ], stored for the ESC 0 that ends the ALT
                    list: put ESC '0' back.  */
3351             charbuf[idx++] = ISO_CODE_ESC;
3352             charbuf[idx] = '0';
3353             new_chars += 2;
3354           }
3355       }
3356   cmp_status->state = COMPOSING_NO;
3357   return new_chars;
3358 }
3359
3360 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which abandons the composition as invalid)
        and add its return value to char_offset.  Uses `cmp_status',
        `charbuf' and `char_offset' from the enclosing scope.  */
3361 #define MAYBE_FINISH_COMPOSITION()                              \
3362   do {                                                          \
3363     if (cmp_status->state != COMPOSING_NO)                      \
3364       char_offset += finish_composition (charbuf, cmp_status);  \
3365   } while (0)
3366
3367 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3368
3369    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3370    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3371    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3372    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3373
3374    Produce this annotation sequence now:
3375
3376    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (Five elements, matching the five header slots that
        finish_composition rewrites.  NOTE(review): this assumes
        MAX_ANNOTATION_LENGTH is 5 -- confirm at its definition.)

        C1 is the byte that followed ESC.  An ESC 0 that arrives while
        the ALT part of an altchar or alt&rule composition is being read
        does not start a new composition: it is stored as [ -1 -1 ] and
        switches the state to COMPOSING_CHAR.  In every other case a
        pending composition is first finished as invalid, and then a new
        header is written and cmp_status is reset.

        Uses `cmp_status', `charbuf', `char_offset' and `coding' from
        the enclosing scope.
3377 */
3378
3379 #define DECODE_COMPOSITION_START(c1)                                       \
3380   do {                                                                     \
3381     if (c1 == '0'                                                          \
3382         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3383              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3384             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3385                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3386       {                                                                    \
3387         *charbuf++ = -1;                                                   \
3388         *charbuf++= -1;                                                    \
3389         cmp_status->state = COMPOSING_CHAR;                                \
3390         cmp_status->length += 2;                                           \
3391       }                                                                    \
3392     else                                                                   \
3393       {                                                                    \
3394         MAYBE_FINISH_COMPOSITION ();                                       \
3395         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3396                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3397                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3398                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3399         cmp_status->state           /* '0', '2': CHARs come first */       \
3400           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3401         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3402         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3403         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3404         coding->annotated = 1;                                             \
3405       }                                                                    \
3406   } while (0)
3407
3408
3409 /* Handle composition end sequence ESC 1.

        The sequence is rejected when no CHAR has been stored yet, or
        when the state does not fit the method: a rulebase composition
        (COMPOSITION_WITH_RULE) must not end while the state is
        COMPOSING_CHAR, and every other method must end while the state
        is COMPOSING_CHAR.  On rejection the pending composition is
        finished as invalid and control jumps to `invalid_code'.

        Otherwise the annotation header at charbuf[-length] is patched:
        element 0 (-LENGTH) is decreased by ncomps + 2 for an altchar
        composition or by ncomps * 3 for an alt&rule composition, and
        element 2 (NCHARS) receives nchars.  char_offset is advanced by
        nchars and the state returns to COMPOSING_NO.

        Uses `cmp_status', `charbuf', `char_offset' and the label
        `invalid_code' from the enclosing scope.  */
3410
3411 #define DECODE_COMPOSITION_END()                                        \
3412   do {                                                                  \
3413     if (cmp_status->nchars == 0                                         \
3414         || ((cmp_status->state == COMPOSING_CHAR)                       \
3415             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3416       {                                                                 \
3417         MAYBE_FINISH_COMPOSITION ();                                    \
3418         goto invalid_code;                                              \
3419       }                                                                 \
3420     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3421       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3422     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3423       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3424     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3425     char_offset += cmp_status->nchars;                                  \
3426     cmp_status->state = COMPOSING_NO;                                   \
3427   } while (0)
3428
3429 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], cmp_status->length
        grows by 2, and cmp_status->state is decremented.
        NOTE(review): the decrement presumably moves from a *_RULE state
        back to the matching *_CHAR state; that depends on the order of
        the state enumeration, which is defined elsewhere -- confirm.
        Uses `charbuf' and `cmp_status' from the enclosing scope.  */
3430
3431 #define STORE_COMPOSITION_RULE(rule)    \
3432   do {                                  \
3433     *charbuf++ = -2;                    \
3434     *charbuf++ = rule;                  \
3435     cmp_status->length += 2;            \
3436     cmp_status->state--;                \
3437   } while (0)
3438
3439 /* Store a composed char or a component char C in charbuf, and update
3440    cmp_status.

        cmp_status->length grows by 1.  The character is counted in
        nchars when the state is COMPOSING_CHAR and in ncomps otherwise.
        The state is then incremented when a rule must follow: always
        for a rulebase composition, and for an alt&rule composition only
        while its component (ALT) characters are being read.
        NOTE(review): the increment presumably selects the matching
        *_RULE state; that depends on the order of the state
        enumeration, which is defined elsewhere -- confirm.
        Uses `charbuf' and `cmp_status' from the enclosing scope.  */
3441
3442 #define STORE_COMPOSITION_CHAR(c)                                       \
3443   do {                                                                  \
3444     *charbuf++ = (c);                                                   \
3445     cmp_status->length++;                                               \
3446     if (cmp_status->state == COMPOSING_CHAR)                            \
3447       cmp_status->nchars++;                                             \
3448     else                                                                \
3449       cmp_status->ncomps++;                                             \
3450     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3451         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3452             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3453       cmp_status->state++;                                              \
3454   } while (0)
3455
3456
3457 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3458
3459 static void
3460 decode_coding_iso_2022 (struct coding_system *coding)
3461 {
3462   const unsigned char *src = coding->source + coding->consumed;
3463   const unsigned char *src_end = coding->source + coding->src_bytes;
3464   const unsigned char *src_base;
3465   int *charbuf = coding->charbuf + coding->charbuf_used;
3466   /* We may produce two annotations (charset and composition) in one
3467      loop and one more charset annotation at the end.  */
3468   int *charbuf_end
3469     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3470   int consumed_chars = 0, consumed_chars_base;
3471   int multibytep = coding->src_multibyte;
3472   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3473   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3474   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3475   int charset_id_2, charset_id_3;
3476   struct charset *charset;
3477   int c;
3478   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3479   Lisp_Object attrs, charset_list;
3480   int char_offset = coding->produced_char;
3481   int last_offset = char_offset;
3482   int last_id = charset_ascii;
3483   int eol_dos =
3484     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3485   int byte_after_cr = -1;
3486   int i;
3487
3488   CODING_GET_INFO (coding, attrs, charset_list);
3489   setup_iso_safe_charsets (attrs);
3490   /* Charset list may have been changed.  */
3491   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3492   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3493
3494   if (cmp_status->state != COMPOSING_NO)
3495     {
3496       for (i = 0; i < cmp_status->length; i++)
3497         *charbuf++ = cmp_status->carryover[i];
3498       coding->annotated = 1;
3499     }
3500
3501   while (1)
3502     {
3503       int c1, c2, c3;
3504
3505       src_base = src;
3506       consumed_chars_base = consumed_chars;
3507
3508       if (charbuf >= charbuf_end)
3509         {
3510           if (byte_after_cr >= 0)
3511             src_base--;
3512           break;
3513         }
3514
3515       if (byte_after_cr >= 0)
3516         c1 = byte_after_cr, byte_after_cr = -1;
3517       else
3518         ONE_MORE_BYTE (c1);
3519       if (c1 < 0)
3520         goto invalid_code;
3521
3522       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3523         {
3524           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3525           char_offset++;
3526           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3527           continue;
3528         }
3529
3530       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3531         {
3532           if (c1 == ISO_CODE_ESC)
3533             {
3534               if (src + 1 >= src_end)
3535                 goto no_more_source;
3536               *charbuf++ = ISO_CODE_ESC;
3537               char_offset++;
3538               if (src[0] == '%' && src[1] == '@')
3539                 {
3540                   src += 2;
3541                   consumed_chars += 2;
3542                   char_offset += 2;
3543                   /* We are sure charbuf can contain two more chars. */
3544                   *charbuf++ = '%';
3545                   *charbuf++ = '@';
3546                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3547                 }
3548             }
3549           else
3550             {
3551               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3552               char_offset++;
3553             }
3554           continue;
3555         }
3556
3557       if ((cmp_status->state == COMPOSING_RULE
3558            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3559           && c1 != ISO_CODE_ESC)
3560         {
3561           int rule, nbytes;
3562
3563           DECODE_COMPOSITION_RULE (rule, nbytes);
3564           if (rule < 0)
3565             goto invalid_code;
3566           STORE_COMPOSITION_RULE (rule);
3567           continue;
3568         }
3569
3570       /* We produce at most one character.  */
3571       switch (iso_code_class [c1])
3572         {
3573         case ISO_0x20_or_0x7F:
3574           if (charset_id_0 < 0
3575               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3576             /* This is SPACE or DEL.  */
3577             charset = CHARSET_FROM_ID (charset_ascii);
3578           else
3579             charset = CHARSET_FROM_ID (charset_id_0);
3580           break;
3581
3582         case ISO_graphic_plane_0:
3583           if (charset_id_0 < 0)
3584             charset = CHARSET_FROM_ID (charset_ascii);
3585           else
3586             charset = CHARSET_FROM_ID (charset_id_0);
3587           break;
3588
3589         case ISO_0xA0_or_0xFF:
3590           if (charset_id_1 < 0
3591               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3592               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3593             goto invalid_code;
3594           /* This is a graphic character, we fall down ... */
3595
3596         case ISO_graphic_plane_1:
3597           if (charset_id_1 < 0)
3598             goto invalid_code;
3599           charset = CHARSET_FROM_ID (charset_id_1);
3600           break;
3601
3602         case ISO_control_0:
3603           if (eol_dos && c1 == '\r')
3604             ONE_MORE_BYTE (byte_after_cr);
3605           MAYBE_FINISH_COMPOSITION ();
3606           charset = CHARSET_FROM_ID (charset_ascii);
3607           break;
3608
3609         case ISO_control_1:
3610           goto invalid_code;
3611
3612         case ISO_shift_out:
3613           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3614               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3615             goto invalid_code;
3616           CODING_ISO_INVOCATION (coding, 0) = 1;
3617           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3618           continue;
3619
3620         case ISO_shift_in:
3621           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3622             goto invalid_code;
3623           CODING_ISO_INVOCATION (coding, 0) = 0;
3624           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3625           continue;
3626
3627         case ISO_single_shift_2_7:
3628           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3629             goto invalid_code;
3630         case ISO_single_shift_2:
3631           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3632             goto invalid_code;
3633           /* SS2 is handled as an escape sequence of ESC 'N' */
3634           c1 = 'N';
3635           goto label_escape_sequence;
3636
3637         case ISO_single_shift_3:
3638           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3639             goto invalid_code;
3640           /* SS3 is handled as an escape sequence of ESC 'O' */
3641           c1 = 'O';
3642           goto label_escape_sequence;
3643
3644         case ISO_control_sequence_introducer:
3645           /* CSI is handled as an escape sequence of ESC '[' ...  */
3646           c1 = '[';
3647           goto label_escape_sequence;
3648
3649         case ISO_escape:
3650           ONE_MORE_BYTE (c1);
3651         label_escape_sequence:
3652           /* Escape sequences handled here are invocation,
3653              designation, direction specification, and character
3654              composition specification.  */
3655           switch (c1)
3656             {
3657             case '&':           /* revision of following character set */
3658               ONE_MORE_BYTE (c1);
3659               if (!(c1 >= '@' && c1 <= '~'))
3660                 goto invalid_code;
3661               ONE_MORE_BYTE (c1);
3662               if (c1 != ISO_CODE_ESC)
3663                 goto invalid_code;
3664               ONE_MORE_BYTE (c1);
3665               goto label_escape_sequence;
3666
3667             case '$':           /* designation of 2-byte character set */
3668               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3669                 goto invalid_code;
3670               {
3671                 int reg, chars96;
3672
3673                 ONE_MORE_BYTE (c1);
3674                 if (c1 >= '@' && c1 <= 'B')
3675                   {     /* designation of JISX0208.1978, GB2312.1980,
3676                            or JISX0208.1980 */
3677                     reg = 0, chars96 = 0;
3678                   }
3679                 else if (c1 >= 0x28 && c1 <= 0x2B)
3680                   { /* designation of DIMENSION2_CHARS94 character set */
3681                     reg = c1 - 0x28, chars96 = 0;
3682                     ONE_MORE_BYTE (c1);
3683                   }
3684                 else if (c1 >= 0x2C && c1 <= 0x2F)
3685                   { /* designation of DIMENSION2_CHARS96 character set */
3686                     reg = c1 - 0x2C, chars96 = 1;
3687                     ONE_MORE_BYTE (c1);
3688                   }
3689                 else
3690                   goto invalid_code;
3691                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3692                 /* We must update these variables now.  */
3693                 if (reg == 0)
3694                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695                 else if (reg == 1)
3696                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3697                 if (chars96 < 0)
3698                   goto invalid_code;
3699               }
3700               continue;
3701
3702             case 'n':           /* invocation of locking-shift-2 */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3704                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3705                 goto invalid_code;
3706               CODING_ISO_INVOCATION (coding, 0) = 2;
3707               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3708               continue;
3709
3710             case 'o':           /* invocation of locking-shift-3 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3713                 goto invalid_code;
3714               CODING_ISO_INVOCATION (coding, 0) = 3;
3715               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3716               continue;
3717
3718             case 'N':           /* invocation of single-shift-2 */
3719               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3720                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3721                 goto invalid_code;
3722               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3723               if (charset_id_2 < 0)
3724                 charset = CHARSET_FROM_ID (charset_ascii);
3725               else
3726                 charset = CHARSET_FROM_ID (charset_id_2);
3727               ONE_MORE_BYTE (c1);
3728               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3729                 goto invalid_code;
3730               break;
3731
3732             case 'O':           /* invocation of single-shift-3 */
3733               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3734                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3735                 goto invalid_code;
3736               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3737               if (charset_id_3 < 0)
3738                 charset = CHARSET_FROM_ID (charset_ascii);
3739               else
3740                 charset = CHARSET_FROM_ID (charset_id_3);
3741               ONE_MORE_BYTE (c1);
3742               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3743                 goto invalid_code;
3744               break;
3745
3746             case '0': case '2': case '3': case '4': /* start composition */
3747               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3748                 goto invalid_code;
3749               if (last_id != charset_ascii)
3750                 {
3751                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3752                   last_id = charset_ascii;
3753                   last_offset = char_offset;
3754                 }
3755               DECODE_COMPOSITION_START (c1);
3756               continue;
3757
3758             case '1':           /* end composition */
3759               if (cmp_status->state == COMPOSING_NO)
3760                 goto invalid_code;
3761               DECODE_COMPOSITION_END ();
3762               continue;
3763
3764             case '[':           /* specification of direction */
3765               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3766                 goto invalid_code;
3767               /* For the moment, nested direction is not supported.
3768                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3769                  left-to-right, and nonzero means right-to-left.  */
3770               ONE_MORE_BYTE (c1);
3771               switch (c1)
3772                 {
3773                 case ']':       /* end of the current direction */
3774                   coding->mode &= ~CODING_MODE_DIRECTION;
3775
3776                 case '0':       /* end of the current direction */
3777                 case '1':       /* start of left-to-right direction */
3778                   ONE_MORE_BYTE (c1);
3779                   if (c1 == ']')
3780                     coding->mode &= ~CODING_MODE_DIRECTION;
3781                   else
3782                     goto invalid_code;
3783                   break;
3784
3785                 case '2':       /* start of right-to-left direction */
3786                   ONE_MORE_BYTE (c1);
3787                   if (c1 == ']')
3788                     coding->mode |= CODING_MODE_DIRECTION;
3789                   else
3790                     goto invalid_code;
3791                   break;
3792
3793                 default:
3794                   goto invalid_code;
3795                 }
3796               continue;
3797
3798             case '%':
3799               ONE_MORE_BYTE (c1);
3800               if (c1 == '/')
3801                 {
3802                   /* CTEXT extended segment:
3803                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3804                      We keep these bytes as is for the moment.
3805                      They may be decoded by post-read-conversion.  */
3806                   int dim, M, L;
3807                   int size;
3808
3809                   ONE_MORE_BYTE (dim);
3810                   if (dim < '0' || dim > '4')
3811                     goto invalid_code;
3812                   ONE_MORE_BYTE (M);
3813                   if (M < 128)
3814                     goto invalid_code;
3815                   ONE_MORE_BYTE (L);
3816                   if (L < 128)
3817                     goto invalid_code;
3818                   size = ((M - 128) * 128) + (L - 128);
3819                   if (charbuf + 6 > charbuf_end)
3820                     goto break_loop;
3821                   *charbuf++ = ISO_CODE_ESC;
3822                   *charbuf++ = '%';
3823                   *charbuf++ = '/';
3824                   *charbuf++ = dim;
3825                   *charbuf++ = BYTE8_TO_CHAR (M);
3826                   *charbuf++ = BYTE8_TO_CHAR (L);
3827                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3828                 }
3829               else if (c1 == 'G')
3830                 {
3831                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3832                      ESC % G --UTF-8-BYTES-- ESC % @
3833                      We keep these bytes as is for the moment.
3834                      They may be decoded by post-read-conversion.  */
3835                   if (charbuf + 3 > charbuf_end)
3836                     goto break_loop;
3837                   *charbuf++ = ISO_CODE_ESC;
3838                   *charbuf++ = '%';
3839                   *charbuf++ = 'G';
3840                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3841                 }
3842               else
3843                 goto invalid_code;
3844               continue;
3845               break;
3846
3847             default:
3848               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3849                 goto invalid_code;
3850               {
3851                 int reg, chars96;
3852
3853                 if (c1 >= 0x28 && c1 <= 0x2B)
3854                   { /* designation of DIMENSION1_CHARS94 character set */
3855                     reg = c1 - 0x28, chars96 = 0;
3856                     ONE_MORE_BYTE (c1);
3857                   }
3858                 else if (c1 >= 0x2C && c1 <= 0x2F)
3859                   { /* designation of DIMENSION1_CHARS96 character set */
3860                     reg = c1 - 0x2C, chars96 = 1;
3861                     ONE_MORE_BYTE (c1);
3862                   }
3863                 else
3864                   goto invalid_code;
3865                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3866                 /* We must update these variables now.  */
3867                 if (reg == 0)
3868                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3869                 else if (reg == 1)
3870                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3871                 if (chars96 < 0)
3872                   goto invalid_code;
3873               }
3874               continue;
3875             }
3876           break;
3877
3878         default:
3879           abort ();
3880         }
3881
3882       if (cmp_status->state == COMPOSING_NO
3883           && charset->id != charset_ascii
3884           && last_id != charset->id)
3885         {
3886           if (last_id != charset_ascii)
3887             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3888           last_id = charset->id;
3889           last_offset = char_offset;
3890         }
3891
3892       /* Now we know CHARSET and 1st position code C1 of a character.
3893          Produce a decoded character while getting 2nd and 3rd
3894          position codes C2, C3 if necessary.  */
3895       if (CHARSET_DIMENSION (charset) > 1)
3896         {
3897           ONE_MORE_BYTE (c2);
3898           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3899               || ((c1 & 0x80) != (c2 & 0x80)))
3900             /* C2 is not in a valid range.  */
3901             goto invalid_code;
3902           if (CHARSET_DIMENSION (charset) == 2)
3903             c1 = (c1 << 8) | c2;
3904           else
3905             {
3906               ONE_MORE_BYTE (c3);
3907               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3908                   || ((c1 & 0x80) != (c3 & 0x80)))
3909                 /* C3 is not in a valid range.  */
3910                 goto invalid_code;
3911               c1 = (c1 << 16) | (c2 << 8) | c2;
3912             }
3913         }
3914       c1 &= 0x7F7F7F;
3915       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3916       if (c < 0)
3917         {
3918           MAYBE_FINISH_COMPOSITION ();
3919           for (; src_base < src; src_base++, char_offset++)
3920             {
3921               if (ASCII_BYTE_P (*src_base))
3922                 *charbuf++ = *src_base;
3923               else
3924                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3925             }
3926         }
3927       else if (cmp_status->state == COMPOSING_NO)
3928         {
3929           *charbuf++ = c;
3930           char_offset++;
3931         }
3932       else if ((cmp_status->state == COMPOSING_CHAR
3933                 ? cmp_status->nchars
3934                 : cmp_status->ncomps)
3935                >= MAX_COMPOSITION_COMPONENTS)
3936         {
3937           /* Too long composition.  */
3938           MAYBE_FINISH_COMPOSITION ();
3939           *charbuf++ = c;
3940           char_offset++;
3941         }
3942       else
3943         STORE_COMPOSITION_CHAR (c);
3944       continue;
3945
3946     invalid_code:
3947       MAYBE_FINISH_COMPOSITION ();
3948       src = src_base;
3949       consumed_chars = consumed_chars_base;
3950       ONE_MORE_BYTE (c);
3951       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3952       char_offset++;
3953       coding->errors++;
3954       continue;
3955
3956     break_loop:
3957       break;
3958     }
3959
3960  no_more_source:
3961   if (cmp_status->state != COMPOSING_NO)
3962     {
3963       if (coding->mode & CODING_MODE_LAST_BLOCK)
3964         MAYBE_FINISH_COMPOSITION ();
3965       else
3966         {
3967           charbuf -= cmp_status->length;
3968           for (i = 0; i < cmp_status->length; i++)
3969             cmp_status->carryover[i] = charbuf[i];
3970         }
3971     }
3972   else if (last_id != charset_ascii)
3973     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3974   coding->consumed_char += consumed_chars_base;
3975   coding->consumed = src_base - coding->source;
3976   coding->charbuf_used = charbuf - coding->charbuf;
3977 }
3978
3979
3980 /* ISO2022 encoding stuff.  */
3981
3982 /*
3983    It is not enough to say just "ISO2022" on encoding, we have to
3984    specify more details.  In Emacs, each coding system of ISO2022
3985    variant has the following specifications:
3986         1. Initial designation to G0 thru G3.
3987         2. Allows short-form designation?
3988         3. ASCII should be designated to G0 before control characters?
3989         4. ASCII should be designated to G0 at end of line?
3990         5. 7-bit environment or 8-bit environment?
3991         6. Use locking-shift?
3992         7. Use Single-shift?
3993    And the following two are only for Japanese:
3994         8. Use ASCII in place of JIS0201-1976-Roman?
3995         9. Use JISX0208-1983 in place of JISX0208-1978?
3996    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3997    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3998    details.
3999 */
4000
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  When CODING uses revision numbers and CHARSET has one, the
   sequence is preceded by ESC & <revision>.  For a multibyte
   94-charset designated to register 0 whose <final-char> is '@', 'A',
   or 'B', the short form (without an intermediate character) is
   produced unless CODING asks for the long form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *desig_interm_94 = "()*+";                               \
    const char *desig_interm_96 = ",-./";                               \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision = -1;                                            \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_revision = CHARSET_ISO_REVISION (charset);                  \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        /* Revision announcement comes before the designation.  */      \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multibyte charset: ESC $ [intermediate] final.  */           \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (desig_interm_96[reg]);                   \
        else if (reg != 0                                               \
                 || desig_final < '@' || desig_final > 'B'              \
                 || (CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_LONG_FORM))                      \
          EMIT_ONE_ASCII_BYTE (desig_interm_94[reg]);                   \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (desig_interm_96[reg]);                       \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_interm_94[reg]);                       \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4048
4049
/* Produce the code for the ISO2022 single-shift-2 function: the
   escape sequence ESC N when CODING is restricted to seven bits, the
   control character SS2 otherwise.  CODING is then marked as being in
   single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4062
4063
/* Produce the code for the ISO2022 single-shift-3 function: the
   escape sequence ESC O when CODING is restricted to seven bits, the
   control character SS3 otherwise.  CODING is then marked as being in
   single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4072
4073
/* Produce the control character for the ISO2022 locking-shift
   function shift-in, and record in CODING that graphic register 0 is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4083
4084
/* Produce the control character for the ISO2022 locking-shift
   function shift-out, and record in CODING that graphic register 1 is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4090
4091
/* Produce the escape sequence ESC n for the ISO2022 function
   locking-shift-2, and record in CODING that graphic register 2 is
   now invoked to graphic plane 0.  */

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4097
4098
/* Produce the escape sequence for the ISO2022 function
   locking-shift-3, and record in CODING that graphic register 3 is
   now invoked to graphic plane 0.

   The final byte must be 'o': ISO 2022 assigns ESC n to
   locking-shift-2 and ESC o to locking-shift-3.  Emitting 'n' here
   would make a reader invoke register 2 while CODING believes
   register 3 is invoked.  */

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4104
4105
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has the flag
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the JISX0201-Roman charset.  The expansion also refers to the
   variables `coding', `dst', and `produced_chars' of the place where
   the macro is used.  C1 is parenthesized wherever it is expanded, so
   any integer expression may be given (as in
   ENCODE_ISO_CHARACTER_DIMENSION2).  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just produced; this character       \
           completes it.  */                                            \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4148
4149
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing any designation and
   invocation codes that are needed.  CHARSET must be a modifiable
   lvalue: when CODING has the flag CODING_ISO_FLAG_USE_OLDJIS and
   CHARSET is JISX0208, it is replaced by JISX0208-1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* A pending single-shift is completed by this character.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET already invoked to graphic plane 0?  */                  \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET already invoked to graphic plane 1?  */                  \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Not invoked anywhere yet: designate and/or invoke CHARSET,      \
       then go round the loop again to produce the character.  */       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4192
4193
/* Encode character C by CHARSET and produce its ISO2022 byte
   sequence.  A charset of dimension 1 yields one position-code; any
   other charset is handled as dimension 2, the high and low bytes of
   the code being the two position-codes.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int position_code = ENCODE_CHAR ((charset), (c));                      \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4203
4204
4205 /* Produce designation and invocation codes at a place pointed by DST
4206    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4207    Return new DST.  */
4208
4209 static unsigned char *
4210 encode_invocation_designation (struct charset *charset,
4211                                struct coding_system *coding,
4212                                unsigned char *dst, int *p_nchars)
4213 {
4214   int multibytep = coding->dst_multibyte;
4215   int produced_chars = *p_nchars;
4216   int reg;                      /* graphic register number */
4217   int id = CHARSET_ID (charset);
4218
4219   /* At first, check designations.  */
4220   for (reg = 0; reg < 4; reg++)
4221     if (id == CODING_ISO_DESIGNATION (coding, reg))
4222       break;
4223
4224   if (reg >= 4)
4225     {
4226       /* CHARSET is not yet designated to any graphic registers.  */
4227       /* At first check the requested designation.  */
4228       reg = CODING_ISO_REQUEST (coding, id);
4229       if (reg < 0)
4230         /* Since CHARSET requests no special designation, designate it
4231            to graphic register 0.  */
4232         reg = 0;
4233
4234       ENCODE_DESIGNATION (charset, reg, coding);
4235     }
4236
4237   if (CODING_ISO_INVOCATION (coding, 0) != reg
4238       && CODING_ISO_INVOCATION (coding, 1) != reg)
4239     {
4240       /* Since the graphic register REG is not invoked to any graphic
4241          planes, invoke it to graphic plane 0.  */
4242       switch (reg)
4243         {
4244         case 0:                 /* graphic register 0 */
4245           ENCODE_SHIFT_IN;
4246           break;
4247
4248         case 1:                 /* graphic register 1 */
4249           ENCODE_SHIFT_OUT;
4250           break;
4251
4252         case 2:                 /* graphic register 2 */
4253           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4254             ENCODE_SINGLE_SHIFT_2;
4255           else
4256             ENCODE_LOCKING_SHIFT_2;
4257           break;
4258
4259         case 3:                 /* graphic register 3 */
4260           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4261             ENCODE_SINGLE_SHIFT_3;
4262           else
4263             ENCODE_LOCKING_SHIFT_3;
4264           break;
4265         }
4266     }
4267
4268   *p_nchars = produced_chars;
4269   return dst;
4270 }
4271
4272
/* Bring the graphic planes and registers back to the initial state of
   CODING: shift in if graphic plane 0 does not hold graphic register
   0, then re-designate every register that has an initial charset
   different from the one currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *initial_charset;                                    \
    int initial_reg;                                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (initial_reg = 0; initial_reg < 4; initial_reg++)               \
      {                                                                 \
        /* Skip registers without an initial charset and those that    \
           already hold it.  */                                         \
        if (CODING_ISO_INITIAL (coding, initial_reg) < 0                \
            || (CODING_ISO_DESIGNATION (coding, initial_reg)            \
                == CODING_ISO_INITIAL (coding, initial_reg)))           \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, initial_reg)); \
        ENCODE_DESIGNATION (initial_charset, initial_reg, coding);      \
      }                                                                 \
  } while (0)
4291
4292
4293 /* Produce designation sequences of charsets in the line started from
4294    SRC to a place pointed by DST, and return updated DST.
4295
4296    If the current block ends before any end-of-line, we may fail to
4297    find all the necessary designations.  */
4298
4299 static unsigned char *
4300 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4301                            int *charbuf_end, unsigned char *dst)
4302 {
4303   struct charset *charset;
4304   /* Table of charsets to be designated to each graphic register.  */
4305   int r[4];
4306   int c, found = 0, reg;
4307   int produced_chars = 0;
4308   int multibytep = coding->dst_multibyte;
4309   Lisp_Object attrs;
4310   Lisp_Object charset_list;
4311
4312   attrs = CODING_ID_ATTRS (coding->id);
4313   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4314   if (EQ (charset_list, Qiso_2022))
4315     charset_list = Viso_2022_charset_list;
4316
4317   for (reg = 0; reg < 4; reg++)
4318     r[reg] = -1;
4319
4320   while (found < 4)
4321     {
4322       int id;
4323
4324       c = *charbuf++;
4325       if (c == '\n')
4326         break;
4327       charset = char_charset (c, charset_list, NULL);
4328       id = CHARSET_ID (charset);
4329       reg = CODING_ISO_REQUEST (coding, id);
4330       if (reg >= 0 && r[reg] < 0)
4331         {
4332           found++;
4333           r[reg] = id;
4334         }
4335     }
4336
4337   if (found)
4338     {
4339       for (reg = 0; reg < 4; reg++)
4340         if (r[reg] >= 0
4341             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4342           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4343     }
4344
4345   return dst;
4346 }
4347
4348 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4349
4350 static int
4351 encode_coding_iso_2022 (struct coding_system *coding)
4352 {
4353   int multibytep = coding->dst_multibyte;
4354   int *charbuf = coding->charbuf;
4355   int *charbuf_end = charbuf + coding->charbuf_used;
4356   unsigned char *dst = coding->destination + coding->produced;
4357   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4358   int safe_room = 16;
4359   int bol_designation
4360     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4361        && CODING_ISO_BOL (coding));
4362   int produced_chars = 0;
4363   Lisp_Object attrs, eol_type, charset_list;
4364   int ascii_compatible;
4365   int c;
4366   int preferred_charset_id = -1;
4367
4368   CODING_GET_INFO (coding, attrs, charset_list);
4369   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4370   if (VECTORP (eol_type))
4371     eol_type = Qunix;
4372
4373   setup_iso_safe_charsets (attrs);
4374   /* Charset list may have been changed.  */
4375   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4376   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4377
4378   ascii_compatible
4379     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4380        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4381                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4382
4383   while (charbuf < charbuf_end)
4384     {
4385       ASSURE_DESTINATION (safe_room);
4386
4387       if (bol_designation)
4388         {
4389           unsigned char *dst_prev = dst;
4390
4391           /* We have to produce designation sequences if any now.  */
4392           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4393           bol_designation = 0;
4394           /* We are sure that designation sequences are all ASCII bytes.  */
4395           produced_chars += dst - dst_prev;
4396         }
4397
4398       c = *charbuf++;
4399
4400       if (c < 0)
4401         {
4402           /* Handle an annotation.  */
4403           switch (*charbuf)
4404             {
4405             case CODING_ANNOTATE_COMPOSITION_MASK:
4406               /* Not yet implemented.  */
4407               break;
4408             case CODING_ANNOTATE_CHARSET_MASK:
4409               preferred_charset_id = charbuf[2];
4410               if (preferred_charset_id >= 0
4411                   && NILP (Fmemq (make_number (preferred_charset_id),
4412                                   charset_list)))
4413                 preferred_charset_id = -1;
4414               break;
4415             default:
4416               abort ();
4417             }
4418           charbuf += -c - 1;
4419           continue;
4420         }
4421
4422       /* Now encode the character C.  */
4423       if (c < 0x20 || c == 0x7F)
4424         {
4425           if (c == '\n'
4426               || (c == '\r' && EQ (eol_type, Qmac)))
4427             {
4428               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4429                 ENCODE_RESET_PLANE_AND_REGISTER ();
4430               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4431                 {
4432                   int i;
4433
4434                   for (i = 0; i < 4; i++)
4435                     CODING_ISO_DESIGNATION (coding, i)
4436                       = CODING_ISO_INITIAL (coding, i);
4437                 }
4438               bol_designation
4439                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4440             }
4441           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4442             ENCODE_RESET_PLANE_AND_REGISTER ();
4443           EMIT_ONE_ASCII_BYTE (c);
4444         }
4445       else if (ASCII_CHAR_P (c))
4446         {
4447           if (ascii_compatible)
4448             EMIT_ONE_ASCII_BYTE (c);
4449           else
4450             {
4451               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4452               ENCODE_ISO_CHARACTER (charset, c);
4453             }
4454         }
4455       else if (CHAR_BYTE8_P (c))
4456         {
4457           c = CHAR_TO_BYTE8 (c);
4458           EMIT_ONE_BYTE (c);
4459         }
4460       else
4461         {
4462           struct charset *charset;
4463
4464           if (preferred_charset_id >= 0)
4465             {
4466               charset = CHARSET_FROM_ID (preferred_charset_id);
4467               if (! CHAR_CHARSET_P (c, charset))
4468                 charset = char_charset (c, charset_list, NULL);
4469             }
4470           else
4471             charset = char_charset (c, charset_list, NULL);
4472           if (!charset)
4473             {
4474               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4475                 {
4476                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4477                   charset = CHARSET_FROM_ID (charset_ascii);
4478                 }
4479               else
4480                 {
4481                   c = coding->default_char;
4482                   charset = char_charset (c, charset_list, NULL);
4483                 }
4484             }
4485           ENCODE_ISO_CHARACTER (charset, c);
4486         }
4487     }
4488
4489   if (coding->mode & CODING_MODE_LAST_BLOCK
4490       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4491     {
4492       ASSURE_DESTINATION (safe_room);
4493       ENCODE_RESET_PLANE_AND_REGISTER ();
4494     }
4495   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4496   CODING_ISO_BOL (coding) = bol_designation;
4497   coding->produced_char += produced_chars;
4498   coding->produced = dst - coding->destination;
4499   return 0;
4500 }
4501
4502 \f
4503 /*** 8,9. SJIS and BIG5 handlers ***/
4504
4505 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4506    quite widely.  So, for the moment, Emacs supports them in the bare
4507    C code.  But, in the future, they may be supported only by CCL.  */
4508
4509 /* SJIS is a coding system encoding three character sets: ASCII, the
4510    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4511    encoded as is.  A character of charset katakana-jisx0201 is encoded
4512    as "position-code + 0x80".  A character of charset japanese-jisx0208
4513    is encoded in two bytes, with its two position-codes divided and
4514    shifted so that they fit in the ranges below.
4515
4516    --- CODE RANGE of SJIS ---
4517    (character set)      (range)
4518    ASCII                0x00 .. 0x7F
4519    KATAKANA-JISX0201    0xA0 .. 0xDF
4520    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4521             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4522    -------------------------------
4523
4524 */
4525
4526 /* BIG5 is a coding system encoding two character sets: ASCII and
4527    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4528    character set and is encoded in two bytes.
4529
4530    --- CODE RANGE of BIG5 ---
4531    (character set)      (range)
4532    ASCII                0x00 .. 0x7F
4533    Big5 (1st byte)      0xA1 .. 0xFE
4534         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4535    --------------------------
4536
4537   */
4538
4539 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4540    Check if a text is encoded in SJIS.  CATEGORY_MASK_SJIS is always
4541    added to DETECT_INFO->checked.  When a byte that does not fit the
        SJIS ranges tested below is met, CATEGORY_MASK_SJIS is added to
        DETECT_INFO->rejected and 0 is returned.  On the
        `no_more_source' exit, FOUND (CATEGORY_MASK_SJIS once any
        non-ASCII code has been accepted, else 0) is merged into
        DETECT_INFO->found and 1 is returned, except that the text is
        rejected as well when SRC_BASE < SRC in the last block.  */
4542
4543 static int
4544 detect_coding_sjis (struct coding_system *coding,
4545                     struct coding_detection_info *detect_info)
4546 {
4547   const unsigned char *src = coding->source, *src_base;
4548   const unsigned char *src_end = coding->source + coding->src_bytes;
4549   int multibytep = coding->src_multibyte;
4550   int consumed_chars = 0;
4551   int found = 0;
4552   int c;
4553   Lisp_Object attrs, charset_list;
4554   int max_first_byte_of_2_byte_code;
4555
4556   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, first bytes up to
          0xFC are accepted for two-byte codes; otherwise the limit is
          0xEF.  */
4557   max_first_byte_of_2_byte_code
4558     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4559
4560   detect_info->checked |= CATEGORY_MASK_SJIS;
4561   /* A coding system of this category is always ASCII compatible.  */
4562   src += coding->head_ascii;
4563
4564   while (1)
4565     {
           /* Remember where the current code starts.  */
4566       src_base = src;
4567       ONE_MORE_BYTE (c);
4568       if (c < 0x80)
4569         continue;
4570       if ((c >= 0x81 && c <= 0x9F)
4571           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4572         {
               /* First byte of a two-byte code; validate the second.  */
4573           ONE_MORE_BYTE (c);
4574           if (c < 0x40 || c == 0x7F || c > 0xFC)
4575             break;
4576           found = CATEGORY_MASK_SJIS;
4577         }
           /* Bytes 0xA0..0xDF are accepted as one-byte codes.  */
4578       else if (c >= 0xA0 && c < 0xE0)
4579         found = CATEGORY_MASK_SJIS;
4580       else
4581         break;
4582     }
       /* The loop was left with `break': an unacceptable byte was met.  */
4583   detect_info->rejected |= CATEGORY_MASK_SJIS;
4584   return 0;
4585
4586  no_more_source:
       /* NOTE(review): no statement visible in this function jumps here;
          presumably ONE_MORE_BYTE does so when no source byte is left,
          and SRC_BASE < SRC then means that the current code was only
          partly read -- confirm against the macro definition.  Such a
          state is treated as an error if this is the last block.  */
4587   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4588     {
4589       detect_info->rejected |= CATEGORY_MASK_SJIS;
4590       return 0;
4591     }
4592   detect_info->found |= found;
4593   return 1;
4594 }
4595
4596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4597    Check if a text is encoded in BIG5.  CATEGORY_MASK_BIG5 is always
4598    added to DETECT_INFO->checked.  When a byte outside the BIG5 code
        ranges (first byte 0xA1..0xFE, second byte 0x40..0x7E or
        0xA1..0xFE) is met, CATEGORY_MASK_BIG5 is added to
        DETECT_INFO->rejected and 0 is returned.  On the
        `no_more_source' exit, FOUND (CATEGORY_MASK_BIG5 once any
        two-byte code has been accepted, else 0) is merged into
        DETECT_INFO->found and 1 is returned, except that the text is
        rejected as well when SRC_BASE < SRC in the last block.  */
4599
4600 static int
4601 detect_coding_big5 (struct coding_system *coding,
4602                     struct coding_detection_info *detect_info)
4603 {
4604   const unsigned char *src = coding->source, *src_base;
4605   const unsigned char *src_end = coding->source + coding->src_bytes;
4606   int multibytep = coding->src_multibyte;
4607   int consumed_chars = 0;
4608   int found = 0;
4609   int c;
4610
4611   detect_info->checked |= CATEGORY_MASK_BIG5;
4612   /* A coding system of this category is always ASCII compatible.  */
4613   src += coding->head_ascii;
4614
4615   while (1)
4616     {
           /* Remember where the current code starts.  */
4617       src_base = src;
4618       ONE_MORE_BYTE (c);
4619       if (c < 0x80)
4620         continue;
           /* 0xFF is never a valid first byte; the decoder rejects it too.  */
4621       if (c >= 0xA1 && c <= 0xFE)
4622         {
4623           ONE_MORE_BYTE (c);
               /* Leave the loop (rather than returning directly) so that an
                  invalid second byte marks the category as rejected, just
                  like an invalid first byte does.  */
4624           if (c < 0x40 || (c >= 0x7F && c <= 0xA0) || c > 0xFE)
4625             break;
4626           found = CATEGORY_MASK_BIG5;
4627         }
4628       else
4629         break;
4630     }
       /* The loop was left with `break': an unacceptable byte was met.  */
4631   detect_info->rejected |= CATEGORY_MASK_BIG5;
4632   return 0;
4633
4634  no_more_source:
       /* NOTE(review): no statement visible in this function jumps here;
          presumably ONE_MORE_BYTE does so when no source byte is left,
          and SRC_BASE < SRC then means that the current code was only
          partly read -- confirm against the macro definition.  Such a
          state is treated as an error if this is the last block.  */
4635   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4636     {
4637       detect_info->rejected |= CATEGORY_MASK_BIG5;
4638       return 0;
4639     }
4640   detect_info->found |= found;
4641   return 1;
4642 }
4645    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4646
4647 static void
4648 decode_coding_sjis (struct coding_system *coding)
4649 {
4650   const unsigned char *src = coding->source + coding->consumed;
4651   const unsigned char *src_end = coding->source + coding->src_bytes;
4652   const unsigned char *src_base;
4653   int *charbuf = coding->charbuf + coding->charbuf_used;
4654   /* We may produce one charset annotation in one loop and one more at
4655      the end.  */
4656   int *charbuf_end
4657     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4658   int consumed_chars = 0, consumed_chars_base;
4659   int multibytep = coding->src_multibyte;
4660   struct charset *charset_roman, *charset_kanji, *charset_kana;
4661   struct charset *charset_kanji2;
4662   Lisp_Object attrs, charset_list, val;
4663   int char_offset = coding->produced_char;
4664   int last_offset = char_offset;
4665   int last_id = charset_ascii;
4666   int eol_dos =
4667     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4668   int byte_after_cr = -1;
4669
4670   CODING_GET_INFO (coding, attrs, charset_list);
4671
4672   val = charset_list;
4673   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4674   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4675   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4676   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4677
4678   while (1)
4679     {
4680       int c, c1;
4681       struct charset *charset;
4682
4683       src_base = src;
4684       consumed_chars_base = consumed_chars;
4685
4686       if (charbuf >= charbuf_end)
4687         {
4688           if (byte_after_cr >= 0)
4689             src_base--;
4690           break;
4691         }
4692
4693       if (byte_after_cr >= 0)
4694         c = byte_after_cr, byte_after_cr = -1;
4695       else
4696         ONE_MORE_BYTE (c);
4697       if (c < 0)
4698         goto invalid_code;
4699       if (c < 0x80)
4700         {
4701           if (eol_dos && c == '\r')
4702             ONE_MORE_BYTE (byte_after_cr);
4703           charset = charset_roman;
4704         }
4705       else if (c == 0x80 || c == 0xA0)
4706         goto invalid_code;
4707       else if (c >= 0xA1 && c <= 0xDF)
4708         {
4709           /* SJIS -> JISX0201-Kana */
4710           c &= 0x7F;
4711           charset = charset_kana;
4712         }
4713       else if (c <= 0xEF)
4714         {
4715           /* SJIS -> JISX0208 */
4716           ONE_MORE_BYTE (c1);
4717           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4718             goto invalid_code;
4719           c = (c << 8) | c1;
4720           SJIS_TO_JIS (c);
4721           charset = charset_kanji;
4722         }
4723       else if (c <= 0xFC && charset_kanji2)
4724         {
4725           /* SJIS -> JISX0213-2 */
4726           ONE_MORE_BYTE (c1);
4727           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4728             goto invalid_code;
4729           c = (c << 8) | c1;
4730           SJIS_TO_JIS2 (c);
4731           charset = charset_kanji2;
4732         }
4733       else
4734         goto invalid_code;
4735       if (charset->id != charset_ascii
4736           && last_id != charset->id)
4737         {
4738           if (last_id != charset_ascii)
4739             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4740           last_id = charset->id;
4741           last_offset = char_offset;
4742         }
4743       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4744       *charbuf++ = c;
4745       char_offset++;
4746       continue;
4747
4748     invalid_code:
4749       src = src_base;
4750       consumed_chars = consumed_chars_base;
4751       ONE_MORE_BYTE (c);
4752       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4753       char_offset++;
4754       coding->errors++;
4755     }
4756
4757  no_more_source:
4758   if (last_id != charset_ascii)
4759     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4760   coding->consumed_char += consumed_chars_base;
4761   coding->consumed = src_base - coding->source;
4762   coding->charbuf_used = charbuf - coding->charbuf;
4763 }
4764
4765 static void
4766 decode_coding_big5 (struct coding_system *coding)
4767 {
4768   const unsigned char *src = coding->source + coding->consumed;
4769   const unsigned char *src_end = coding->source + coding->src_bytes;
4770   const unsigned char *src_base;
4771   int *charbuf = coding->charbuf + coding->charbuf_used;
4772   /* We may produce one charset annotation in one loop and one more at
4773      the end.  */
4774   int *charbuf_end
4775     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4776   int consumed_chars = 0, consumed_chars_base;
4777   int multibytep = coding->src_multibyte;
4778   struct charset *charset_roman, *charset_big5;
4779   Lisp_Object attrs, charset_list, val;
4780   int char_offset = coding->produced_char;
4781   int last_offset = char_offset;
4782   int last_id = charset_ascii;
4783   int eol_dos =
4784     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4785   int byte_after_cr = -1;
4786
4787   CODING_GET_INFO (coding, attrs, charset_list);
4788   val = charset_list;
4789   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4790   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4791
4792   while (1)
4793     {
4794       int c, c1;
4795       struct charset *charset;
4796
4797       src_base = src;
4798       consumed_chars_base = consumed_chars;
4799
4800       if (charbuf >= charbuf_end)
4801         {
4802           if (byte_after_cr >= 0)
4803             src_base--;
4804           break;
4805         }
4806
4807       if (byte_after_cr >= 0)
4808         c = byte_after_cr, byte_after_cr = -1;
4809       else
4810         ONE_MORE_BYTE (c);
4811
4812       if (c < 0)
4813         goto invalid_code;
4814       if (c < 0x80)
4815         {
4816           if (eol_dos && c == '\r')
4817             ONE_MORE_BYTE (byte_after_cr);
4818           charset = charset_roman;
4819         }
4820       else
4821         {
4822           /* BIG5 -> Big5 */
4823           if (c < 0xA1 || c > 0xFE)
4824             goto invalid_code;
4825           ONE_MORE_BYTE (c1);
4826           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4827             goto invalid_code;
4828           c = c << 8 | c1;
4829           charset = charset_big5;
4830         }
4831       if (charset->id != charset_ascii
4832           && last_id != charset->id)
4833         {
4834           if (last_id != charset_ascii)
4835             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4836           last_id = charset->id;
4837           last_offset = char_offset;
4838         }
4839       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4840       *charbuf++ = c;
4841       char_offset++;
4842       continue;
4843
4844     invalid_code:
4845       src = src_base;
4846       consumed_chars = consumed_chars_base;
4847       ONE_MORE_BYTE (c);
4848       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4849       char_offset++;
4850       coding->errors++;
4851     }
4852
4853  no_more_source:
4854   if (last_id != charset_ascii)
4855     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4856   coding->consumed_char += consumed_chars_base;
4857   coding->consumed = src_base - coding->source;
4858   coding->charbuf_used = charbuf - coding->charbuf;
4859 }
4860
4861 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4862    This function can encode charsets `ascii', `katakana-jisx0201',
4863    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4864    are sure that all these charsets are registered as official charset
4865    (i.e. do not have extended leading-codes).  Characters of other
4866    charsets are produced without any encoding.  If SJIS_P is 1, encode
4867    SJIS text, else encode BIG5 text.  */
4868
4869 static int
4870 encode_coding_sjis (struct coding_system *coding)
4871 {
4872   int multibytep = coding->dst_multibyte;
4873   int *charbuf = coding->charbuf;
4874   int *charbuf_end = charbuf + coding->charbuf_used;
4875   unsigned char *dst = coding->destination + coding->produced;
4876   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4877   int safe_room = 4;
4878   int produced_chars = 0;
4879   Lisp_Object attrs, charset_list, val;
4880   int ascii_compatible;
4881   struct charset *charset_roman, *charset_kanji, *charset_kana;
4882   struct charset *charset_kanji2;
4883   int c;
4884
4885   CODING_GET_INFO (coding, attrs, charset_list);
4886   val = charset_list;
4887   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4888   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4889   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4890   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4891
4892   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4893
4894   while (charbuf < charbuf_end)
4895     {
4896       ASSURE_DESTINATION (safe_room);
4897       c = *charbuf++;
4898       /* Now encode the character C.  */
4899       if (ASCII_CHAR_P (c) && ascii_compatible)
4900         EMIT_ONE_ASCII_BYTE (c);
4901       else if (CHAR_BYTE8_P (c))
4902         {
4903           c = CHAR_TO_BYTE8 (c);
4904           EMIT_ONE_BYTE (c);
4905         }
4906       else
4907         {
4908           unsigned code;
4909           struct charset *charset = char_charset (c, charset_list, &code);
4910
4911           if (!charset)
4912             {
4913               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4914                 {
4915                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4916                   charset = CHARSET_FROM_ID (charset_ascii);
4917                 }
4918               else
4919                 {
4920                   c = coding->default_char;
4921                   charset = char_charset (c, charset_list, &code);
4922                 }
4923             }
4924           if (code == CHARSET_INVALID_CODE (charset))
4925             abort ();
4926           if (charset == charset_kanji)
4927             {
4928               int c1, c2;
4929               JIS_TO_SJIS (code);
4930               c1 = code >> 8, c2 = code & 0xFF;
4931               EMIT_TWO_BYTES (c1, c2);
4932             }
4933           else if (charset == charset_kana)
4934             EMIT_ONE_BYTE (code | 0x80);
4935           else if (charset_kanji2 && charset == charset_kanji2)
4936             {
4937               int c1, c2;
4938
4939               c1 = code >> 8;
4940               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4941                   || c1 == 0x28
4942                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4943                 {
4944                   JIS_TO_SJIS2 (code);
4945                   c1 = code >> 8, c2 = code & 0xFF;
4946                   EMIT_TWO_BYTES (c1, c2);
4947                 }
4948               else
4949                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4950             }
4951           else
4952             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4953         }
4954     }
4955   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4956   coding->produced_char += produced_chars;
4957   coding->produced = dst - coding->destination;
4958   return 0;
4959 }
4960
4961 static int
4962 encode_coding_big5 (struct coding_system *coding)
4963 {
4964   int multibytep = coding->dst_multibyte;
4965   int *charbuf = coding->charbuf;
4966   int *charbuf_end = charbuf + coding->charbuf_used;
4967   unsigned char *dst = coding->destination + coding->produced;
4968   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4969   int safe_room = 4;
4970   int produced_chars = 0;
4971   Lisp_Object attrs, charset_list, val;
4972   int ascii_compatible;
4973   struct charset *charset_roman, *charset_big5;
4974   int c;
4975
4976   CODING_GET_INFO (coding, attrs, charset_list);
4977   val = charset_list;
4978   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4979   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4980   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4981
4982   while (charbuf < charbuf_end)
4983     {
4984       ASSURE_DESTINATION (safe_room);
4985       c = *charbuf++;
4986       /* Now encode the character C.  */
4987       if (ASCII_CHAR_P (c) && ascii_compatible)
4988         EMIT_ONE_ASCII_BYTE (c);
4989       else if (CHAR_BYTE8_P (c))
4990         {
4991           c = CHAR_TO_BYTE8 (c);
4992           EMIT_ONE_BYTE (c);
4993         }
4994       else
4995         {
4996           unsigned code;
4997           struct charset *charset = char_charset (c, charset_list, &code);
4998
4999           if (! charset)
5000             {
5001               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5002                 {
5003                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5004                   charset = CHARSET_FROM_ID (charset_ascii);
5005                 }
5006               else
5007                 {
5008                   c = coding->default_char;
5009                   charset = char_charset (c, charset_list, &code);
5010                 }
5011             }
5012           if (code == CHARSET_INVALID_CODE (charset))
5013             abort ();
5014           if (charset == charset_big5)
5015             {
5016               int c1, c2;
5017
5018               c1 = code >> 8, c2 = code & 0xFF;
5019               EMIT_TWO_BYTES (c1, c2);
5020             }
5021           else
5022             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5023         }
5024     }
5025   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5026   coding->produced_char += produced_chars;
5027   coding->produced = dst - coding->destination;
5028   return 0;
5029 }
5030
5031 \f
5032 /*** 10. CCL handlers ***/
5033
5034 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5035    Check if a text is encoded in a coding system of which
5036    encoder/decoder are written in CCL program.  If it is, return
5037    CATEGORY_MASK_CCL, else return 0.  */
5038
5039 static int
5040 detect_coding_ccl (struct coding_system *coding,
5041                    struct coding_detection_info *detect_info)
5042 {
5043   const unsigned char *src = coding->source, *src_base;
5044   const unsigned char *src_end = coding->source + coding->src_bytes;
5045   int multibytep = coding->src_multibyte;
5046   int consumed_chars = 0;
5047   int found = 0;
5048   unsigned char *valids;
5049   int head_ascii = coding->head_ascii;
5050   Lisp_Object attrs;
5051
5052   detect_info->checked |= CATEGORY_MASK_CCL;
5053
5054   coding = &coding_categories[coding_category_ccl];
5055   valids = CODING_CCL_VALIDS (coding);
5056   attrs = CODING_ID_ATTRS (coding->id);
5057   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5058     src += head_ascii;
5059
5060   while (1)
5061     {
5062       int c;
5063
5064       src_base = src;
5065       ONE_MORE_BYTE (c);
5066       if (c < 0 || ! valids[c])
5067         break;
5068       if ((valids[c] > 1))
5069         found = CATEGORY_MASK_CCL;
5070     }
5071   detect_info->rejected |= CATEGORY_MASK_CCL;
5072   return 0;
5073
5074  no_more_source:
5075   detect_info->found |= found;
5076   return 1;
5077 }
5078
5079 static void
5080 decode_coding_ccl (struct coding_system *coding)
5081 {
5082   const unsigned char *src = coding->source + coding->consumed;
5083   const unsigned char *src_end = coding->source + coding->src_bytes;
5084   int *charbuf = coding->charbuf + coding->charbuf_used;
5085   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5086   int consumed_chars = 0;
5087   int multibytep = coding->src_multibyte;
5088   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5089   int source_charbuf[1024];
5090   int source_byteidx[1025];
5091   Lisp_Object attrs, charset_list;
5092
5093   CODING_GET_INFO (coding, attrs, charset_list);
5094
5095   while (1)
5096     {
5097       const unsigned char *p = src;
5098       int i = 0;
5099
5100       if (multibytep)
5101         {
5102           while (i < 1024 && p < src_end)
5103             {
5104               source_byteidx[i] = p - src;
5105               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5106             }
5107           source_byteidx[i] = p - src;
5108         }
5109       else
5110         while (i < 1024 && p < src_end)
5111           source_charbuf[i++] = *p++;
5112
5113       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5114         ccl->last_block = 1;
5115       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5116                   charset_list);
5117       charbuf += ccl->produced;
5118       if (multibytep)
5119         src += source_byteidx[ccl->consumed];
5120       else
5121         src += ccl->consumed;
5122       consumed_chars += ccl->consumed;
5123       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5124         break;
5125     }
5126
5127   switch (ccl->status)
5128     {
5129     case CCL_STAT_SUSPEND_BY_SRC:
5130       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5131       break;
5132     case CCL_STAT_SUSPEND_BY_DST:
5133       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5134       break;
5135     case CCL_STAT_QUIT:
5136     case CCL_STAT_INVALID_CMD:
5137       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5138       break;
5139     default:
5140       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5141       break;
5142     }
5143   coding->consumed_char += consumed_chars;
5144   coding->consumed = src - coding->source;
5145   coding->charbuf_used = charbuf - coding->charbuf;
5146 }
5147
5148 static int
5149 encode_coding_ccl (struct coding_system *coding)
5150 {
5151   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5152   int multibytep = coding->dst_multibyte;
5153   int *charbuf = coding->charbuf;
5154   int *charbuf_end = charbuf + coding->charbuf_used;
5155   unsigned char *dst = coding->destination + coding->produced;
5156   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5157   int destination_charbuf[1024];
5158   int i, produced_chars = 0;
5159   Lisp_Object attrs, charset_list;
5160
5161   CODING_GET_INFO (coding, attrs, charset_list);
5162   if (coding->consumed_char == coding->src_chars
5163       && coding->mode & CODING_MODE_LAST_BLOCK)
5164     ccl->last_block = 1;
5165
5166   while (charbuf < charbuf_end)
5167     {
5168       ccl_driver (ccl, charbuf, destination_charbuf,
5169                   charbuf_end - charbuf, 1024, charset_list);
5170       if (multibytep)
5171         {
5172           ASSURE_DESTINATION (ccl->produced * 2);
5173           for (i = 0; i < ccl->produced; i++)
5174             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5175         }
5176       else
5177         {
5178           ASSURE_DESTINATION (ccl->produced);
5179           for (i = 0; i < ccl->produced; i++)
5180             *dst++ = destination_charbuf[i] & 0xFF;
5181           produced_chars += ccl->produced;
5182         }
5183       charbuf += ccl->consumed;
5184       if (ccl->status == CCL_STAT_QUIT
5185           || ccl->status == CCL_STAT_INVALID_CMD)
5186         break;
5187     }
5188
5189   switch (ccl->status)
5190     {
5191     case CCL_STAT_SUSPEND_BY_SRC:
5192       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5193       break;
5194     case CCL_STAT_SUSPEND_BY_DST:
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5196       break;
5197     case CCL_STAT_QUIT:
5198     case CCL_STAT_INVALID_CMD:
5199       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5200       break;
5201     default:
5202       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5203       break;
5204     }
5205
5206   coding->produced_char += produced_chars;
5207   coding->produced = dst - coding->destination;
5208   return 0;
5209 }
5210
5211
5212 \f
5213 /*** 10, 11. no-conversion handlers ***/
5214
5215 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        No byte is converted here: CODING->chars_at_source is set and the
        whole source is reported as consumed.  The one exception is a
        non-empty source ending in CR while DOS-style EOL conversion is in
        effect; that CR is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded.  NOTE(review):
        presumably so that the CR can be examined together with the byte
        that follows it once more source arrives -- confirm.  */
5216
5217 static void
5218 decode_coding_raw_text (struct coding_system *coding)
5219 {
5220   int eol_dos =
5221     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5222
5223   coding->chars_at_source = 1;
5224   coding->consumed_char = coding->src_chars;
5225   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: indexing with SRC_BYTES - 1
          would otherwise read the byte before the start of the source.  */
5226   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5227     {
5228       coding->consumed_char--;
5229       coding->consumed--;
5230       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5231     }
5232   else
5233     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5234 }
5235
5236 static int
5237 encode_coding_raw_text (struct coding_system *coding)
5238 {
5239   int multibytep = coding->dst_multibyte;
5240   int *charbuf = coding->charbuf;
5241   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5242   unsigned char *dst = coding->destination + coding->produced;
5243   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5244   int produced_chars = 0;
5245   int c;
5246
5247   if (multibytep)
5248     {
5249       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5250
5251       if (coding->src_multibyte)
5252         while (charbuf < charbuf_end)
5253           {
5254             ASSURE_DESTINATION (safe_room);
5255             c = *charbuf++;
5256             if (ASCII_CHAR_P (c))
5257               EMIT_ONE_ASCII_BYTE (c);
5258             else if (CHAR_BYTE8_P (c))
5259               {
5260                 c = CHAR_TO_BYTE8 (c);
5261                 EMIT_ONE_BYTE (c);
5262               }
5263             else
5264               {
5265                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5266
5267                 CHAR_STRING_ADVANCE (c, p1);
5268                 while (p0 < p1)
5269                   {
5270                     EMIT_ONE_BYTE (*p0);
5271                     p0++;
5272                   }
5273               }
5274           }
5275       else
5276         while (charbuf < charbuf_end)
5277           {
5278             ASSURE_DESTINATION (safe_room);
5279             c = *charbuf++;
5280             EMIT_ONE_BYTE (c);
5281           }
5282     }
5283   else
5284     {
5285       if (coding->src_multibyte)
5286         {
5287           int safe_room = MAX_MULTIBYTE_LENGTH;
5288
5289           while (charbuf < charbuf_end)
5290             {
5291               ASSURE_DESTINATION (safe_room);
5292               c = *charbuf++;
5293               if (ASCII_CHAR_P (c))
5294                 *dst++ = c;
5295               else if (CHAR_BYTE8_P (c))
5296                 *dst++ = CHAR_TO_BYTE8 (c);
5297               else
5298                 CHAR_STRING_ADVANCE (c, dst);
5299             }
5300         }
5301       else
5302         {
5303           ASSURE_DESTINATION (charbuf_end - charbuf);
5304           while (charbuf < charbuf_end && dst < dst_end)
5305             *dst++ = *charbuf++;
5306         }
5307       produced_chars = dst - (coding->destination + coding->produced);
5308     }
5309   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5310   coding->produced_char += produced_chars;
5311   coding->produced = dst - coding->destination;
5312   return 0;
5313 }
5314
5315 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5316    Check if a text is encoded in a charset-based coding system.  If it
5317    is, return 1, else return 0.  */
5318
5319 static int
5320 detect_coding_charset (struct coding_system *coding,
5321                        struct coding_detection_info *detect_info)
5322 {
5323   const unsigned char *src = coding->source, *src_base;
5324   const unsigned char *src_end = coding->source + coding->src_bytes;
5325   int multibytep = coding->src_multibyte;
5326   int consumed_chars = 0;
5327   Lisp_Object attrs, valids, name;
5328   int found = 0;
5329   int head_ascii = coding->head_ascii;
5330   int check_latin_extra = 0;
5331
5332   detect_info->checked |= CATEGORY_MASK_CHARSET;
5333
5334   coding = &coding_categories[coding_category_charset];
5335   attrs = CODING_ID_ATTRS (coding->id);
5336   valids = AREF (attrs, coding_attr_charset_valids);
5337   name = CODING_ID_NAME (coding->id);
5338   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5339                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5340       || strncmp (SSDATA (SYMBOL_NAME (name)),
5341                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5342     check_latin_extra = 1;
5343
5344   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5345     src += head_ascii;
5346
5347   while (1)
5348     {
5349       int c;
5350       Lisp_Object val;
5351       struct charset *charset;
5352       int dim, idx;
5353
5354       src_base = src;
5355       ONE_MORE_BYTE (c);
5356       if (c < 0)
5357         continue;
5358       val = AREF (valids, c);
5359       if (NILP (val))
5360         break;
5361       if (c >= 0x80)
5362         {
5363           if (c < 0xA0
5364               && check_latin_extra
5365               && (!VECTORP (Vlatin_extra_code_table)
5366                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5367             break;
5368           found = CATEGORY_MASK_CHARSET;
5369         }
5370       if (INTEGERP (val))
5371         {
5372           charset = CHARSET_FROM_ID (XFASTINT (val));
5373           dim = CHARSET_DIMENSION (charset);
5374           for (idx = 1; idx < dim; idx++)
5375             {
5376               if (src == src_end)
5377                 goto too_short;
5378               ONE_MORE_BYTE (c);
5379               if (c < charset->code_space[(dim - 1 - idx) * 2]
5380                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5381                 break;
5382             }
5383           if (idx < dim)
5384             break;
5385         }
5386       else
5387         {
5388           idx = 1;
5389           for (; CONSP (val); val = XCDR (val))
5390             {
5391               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5392               dim = CHARSET_DIMENSION (charset);
5393               while (idx < dim)
5394                 {
5395                   if (src == src_end)
5396                     goto too_short;
5397                   ONE_MORE_BYTE (c);
5398                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5399                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5400                     break;
5401                   idx++;
5402                 }
5403               if (idx == dim)
5404                 {
5405                   val = Qnil;
5406                   break;
5407                 }
5408             }
5409           if (CONSP (val))
5410             break;
5411         }
5412     }
5413  too_short:
5414   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5415   return 0;
5416
5417  no_more_source:
5418   detect_info->found |= found;
5419   return 1;
5420 }
5421
5422 static void
5423 decode_coding_charset (struct coding_system *coding)
5424 {
5425   const unsigned char *src = coding->source + coding->consumed;
5426   const unsigned char *src_end = coding->source + coding->src_bytes;
5427   const unsigned char *src_base;
5428   int *charbuf = coding->charbuf + coding->charbuf_used;
5429   /* We may produce one charset annotation in one loop and one more at
5430      the end.  */
5431   int *charbuf_end
5432     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5433   int consumed_chars = 0, consumed_chars_base;
5434   int multibytep = coding->src_multibyte;
5435   Lisp_Object attrs, charset_list, valids;
5436   int char_offset = coding->produced_char;
5437   int last_offset = char_offset;
5438   int last_id = charset_ascii;
5439   int eol_dos =
5440     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5441   int byte_after_cr = -1;
5442
5443   CODING_GET_INFO (coding, attrs, charset_list);
5444   valids = AREF (attrs, coding_attr_charset_valids);
5445
5446   while (1)
5447     {
5448       int c;
5449       Lisp_Object val;
5450       struct charset *charset;
5451       int dim;
5452       int len = 1;
5453       unsigned code;
5454
5455       src_base = src;
5456       consumed_chars_base = consumed_chars;
5457
5458       if (charbuf >= charbuf_end)
5459         {
5460           if (byte_after_cr >= 0)
5461             src_base--;
5462           break;
5463         }
5464
5465       if (byte_after_cr >= 0)
5466         {
5467           c = byte_after_cr;
5468           byte_after_cr = -1;
5469         }
5470       else
5471         {
5472           ONE_MORE_BYTE (c);
5473           if (eol_dos && c == '\r')
5474             ONE_MORE_BYTE (byte_after_cr);
5475         }
5476       if (c < 0)
5477         goto invalid_code;
5478       code = c;
5479
5480       val = AREF (valids, c);
5481       if (! INTEGERP (val) && ! CONSP (val))
5482         goto invalid_code;
5483       if (INTEGERP (val))
5484         {
5485           charset = CHARSET_FROM_ID (XFASTINT (val));
5486           dim = CHARSET_DIMENSION (charset);
5487           while (len < dim)
5488             {
5489               ONE_MORE_BYTE (c);
5490               code = (code << 8) | c;
5491               len++;
5492             }
5493           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5494                               charset, code, c);
5495         }
5496       else
5497         {
5498           /* VAL is a list of charset IDs.  It is assured that the
5499              list is sorted by charset dimensions (smaller one
5500              comes first).  */
5501           while (CONSP (val))
5502             {
5503               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5504               dim = CHARSET_DIMENSION (charset);
5505               while (len < dim)
5506                 {
5507                   ONE_MORE_BYTE (c);
5508                   code = (code << 8) | c;
5509                   len++;
5510                 }
5511               CODING_DECODE_CHAR (coding, src, src_base,
5512                                   src_end, charset, code, c);
5513               if (c >= 0)
5514                 break;
5515               val = XCDR (val);
5516             }
5517         }
5518       if (c < 0)
5519         goto invalid_code;
5520       if (charset->id != charset_ascii
5521           && last_id != charset->id)
5522         {
5523           if (last_id != charset_ascii)
5524             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5525           last_id = charset->id;
5526           last_offset = char_offset;
5527         }
5528
5529       *charbuf++ = c;
5530       char_offset++;
5531       continue;
5532
5533     invalid_code:
5534       src = src_base;
5535       consumed_chars = consumed_chars_base;
5536       ONE_MORE_BYTE (c);
5537       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5538       char_offset++;
5539       coding->errors++;
5540     }
5541
5542  no_more_source:
5543   if (last_id != charset_ascii)
5544     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5545   coding->consumed_char += consumed_chars_base;
5546   coding->consumed = src_base - coding->source;
5547   coding->charbuf_used = charbuf - coding->charbuf;
5548 }
5549
5550 static int
5551 encode_coding_charset (struct coding_system *coding)
5552 {
5553   int multibytep = coding->dst_multibyte;
5554   int *charbuf = coding->charbuf;
5555   int *charbuf_end = charbuf + coding->charbuf_used;
5556   unsigned char *dst = coding->destination + coding->produced;
5557   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5558   int safe_room = MAX_MULTIBYTE_LENGTH;
5559   int produced_chars = 0;
5560   Lisp_Object attrs, charset_list;
5561   int ascii_compatible;
5562   int c;
5563
5564   CODING_GET_INFO (coding, attrs, charset_list);
5565   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5566
5567   while (charbuf < charbuf_end)
5568     {
5569       struct charset *charset;
5570       unsigned code;
5571
5572       ASSURE_DESTINATION (safe_room);
5573       c = *charbuf++;
5574       if (ascii_compatible && ASCII_CHAR_P (c))
5575         EMIT_ONE_ASCII_BYTE (c);
5576       else if (CHAR_BYTE8_P (c))
5577         {
5578           c = CHAR_TO_BYTE8 (c);
5579           EMIT_ONE_BYTE (c);
5580         }
5581       else
5582         {
5583           charset = char_charset (c, charset_list, &code);
5584           if (charset)
5585             {
5586               if (CHARSET_DIMENSION (charset) == 1)
5587                 EMIT_ONE_BYTE (code);
5588               else if (CHARSET_DIMENSION (charset) == 2)
5589                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5590               else if (CHARSET_DIMENSION (charset) == 3)
5591                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5592               else
5593                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5594                                  (code >> 8) & 0xFF, code & 0xFF);
5595             }
5596           else
5597             {
5598               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5599                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5600               else
5601                 c = coding->default_char;
5602               EMIT_ONE_BYTE (c);
5603             }
5604         }
5605     }
5606
5607   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5608   coding->produced_char += produced_chars;
5609   coding->produced = dst - coding->destination;
5610   return 0;
5611 }
5612
5613 \f
/*** 10. C library functions ***/
5615
5616 /* Setup coding context CODING from information about CODING_SYSTEM.
5617    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5618    CODING_SYSTEM is invalid, signal an error.  */
5619
5620 void
5621 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5622 {
5623   Lisp_Object attrs;
5624   Lisp_Object eol_type;
5625   Lisp_Object coding_type;
5626   Lisp_Object val;
5627
5628   if (NILP (coding_system))
5629     coding_system = Qundecided;
5630
5631   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5632
5633   attrs = CODING_ID_ATTRS (coding->id);
5634   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5635
5636   coding->mode = 0;
5637   coding->head_ascii = -1;
5638   if (VECTORP (eol_type))
5639     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5640                             | CODING_REQUIRE_DETECTION_MASK);
5641   else if (! EQ (eol_type, Qunix))
5642     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5643                             | CODING_REQUIRE_ENCODING_MASK);
5644   else
5645     coding->common_flags = 0;
5646   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5647     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5648   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5649     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5650   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5651     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5652
5653   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5654   coding->max_charset_id = SCHARS (val) - 1;
5655   coding->safe_charsets = SDATA (val);
5656   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5657   coding->carryover_bytes = 0;
5658
5659   coding_type = CODING_ATTR_TYPE (attrs);
5660   if (EQ (coding_type, Qundecided))
5661     {
5662       coding->detector = NULL;
5663       coding->decoder = decode_coding_raw_text;
5664       coding->encoder = encode_coding_raw_text;
5665       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5666     }
5667   else if (EQ (coding_type, Qiso_2022))
5668     {
5669       int i;
5670       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5671
5672       /* Invoke graphic register 0 to plane 0.  */
5673       CODING_ISO_INVOCATION (coding, 0) = 0;
5674       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5675       CODING_ISO_INVOCATION (coding, 1)
5676         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5677       /* Setup the initial status of designation.  */
5678       for (i = 0; i < 4; i++)
5679         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5680       /* Not single shifting initially.  */
5681       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5682       /* Beginning of buffer should also be regarded as bol. */
5683       CODING_ISO_BOL (coding) = 1;
5684       coding->detector = detect_coding_iso_2022;
5685       coding->decoder = decode_coding_iso_2022;
5686       coding->encoder = encode_coding_iso_2022;
5687       if (flags & CODING_ISO_FLAG_SAFE)
5688         coding->mode |= CODING_MODE_SAFE_ENCODING;
5689       coding->common_flags
5690         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5691             | CODING_REQUIRE_FLUSHING_MASK);
5692       if (flags & CODING_ISO_FLAG_COMPOSITION)
5693         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5694       if (flags & CODING_ISO_FLAG_DESIGNATION)
5695         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5696       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5697         {
5698           setup_iso_safe_charsets (attrs);
5699           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5700           coding->max_charset_id = SCHARS (val) - 1;
5701           coding->safe_charsets = SDATA (val);
5702         }
5703       CODING_ISO_FLAGS (coding) = flags;
5704       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5705       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5706       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5707       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5708     }
5709   else if (EQ (coding_type, Qcharset))
5710     {
5711       coding->detector = detect_coding_charset;
5712       coding->decoder = decode_coding_charset;
5713       coding->encoder = encode_coding_charset;
5714       coding->common_flags
5715         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5716     }
5717   else if (EQ (coding_type, Qutf_8))
5718     {
5719       val = AREF (attrs, coding_attr_utf_bom);
5720       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5721                                    : EQ (val, Qt) ? utf_with_bom
5722                                    : utf_without_bom);
5723       coding->detector = detect_coding_utf_8;
5724       coding->decoder = decode_coding_utf_8;
5725       coding->encoder = encode_coding_utf_8;
5726       coding->common_flags
5727         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5728       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5729         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5730     }
5731   else if (EQ (coding_type, Qutf_16))
5732     {
5733       val = AREF (attrs, coding_attr_utf_bom);
5734       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5735                                     : EQ (val, Qt) ? utf_with_bom
5736                                     : utf_without_bom);
5737       val = AREF (attrs, coding_attr_utf_16_endian);
5738       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5739                                        : utf_16_little_endian);
5740       CODING_UTF_16_SURROGATE (coding) = 0;
5741       coding->detector = detect_coding_utf_16;
5742       coding->decoder = decode_coding_utf_16;
5743       coding->encoder = encode_coding_utf_16;
5744       coding->common_flags
5745         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5746       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5747         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5748     }
5749   else if (EQ (coding_type, Qccl))
5750     {
5751       coding->detector = detect_coding_ccl;
5752       coding->decoder = decode_coding_ccl;
5753       coding->encoder = encode_coding_ccl;
5754       coding->common_flags
5755         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5756             | CODING_REQUIRE_FLUSHING_MASK);
5757     }
5758   else if (EQ (coding_type, Qemacs_mule))
5759     {
5760       coding->detector = detect_coding_emacs_mule;
5761       coding->decoder = decode_coding_emacs_mule;
5762       coding->encoder = encode_coding_emacs_mule;
5763       coding->common_flags
5764         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5765       coding->spec.emacs_mule.full_support = 1;
5766       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5767           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5768         {
5769           Lisp_Object tail, safe_charsets;
5770           int max_charset_id = 0;
5771
5772           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5773                tail = XCDR (tail))
5774             if (max_charset_id < XFASTINT (XCAR (tail)))
5775               max_charset_id = XFASTINT (XCAR (tail));
5776           safe_charsets = make_uninit_string (max_charset_id + 1);
5777           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5778           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5779                tail = XCDR (tail))
5780             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5781           coding->max_charset_id = max_charset_id;
5782           coding->safe_charsets = SDATA (safe_charsets);
5783           coding->spec.emacs_mule.full_support = 1;
5784         }
5785       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5786       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5787     }
5788   else if (EQ (coding_type, Qshift_jis))
5789     {
5790       coding->detector = detect_coding_sjis;
5791       coding->decoder = decode_coding_sjis;
5792       coding->encoder = encode_coding_sjis;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795     }
5796   else if (EQ (coding_type, Qbig5))
5797     {
5798       coding->detector = detect_coding_big5;
5799       coding->decoder = decode_coding_big5;
5800       coding->encoder = encode_coding_big5;
5801       coding->common_flags
5802         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5803     }
5804   else                          /* EQ (coding_type, Qraw_text) */
5805     {
5806       coding->detector = NULL;
5807       coding->decoder = decode_coding_raw_text;
5808       coding->encoder = encode_coding_raw_text;
5809       if (! EQ (eol_type, Qunix))
5810         {
5811           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5812           if (! VECTORP (eol_type))
5813             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5814         }
5815
5816     }
5817
5818   return;
5819 }
5820
5821 /* Return a list of charsets supported by CODING.  */
5822
5823 Lisp_Object
5824 coding_charset_list (struct coding_system *coding)
5825 {
5826   Lisp_Object attrs, charset_list;
5827
5828   CODING_GET_INFO (coding, attrs, charset_list);
5829   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5830     {
5831       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5832
5833       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5834         charset_list = Viso_2022_charset_list;
5835     }
5836   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5837     {
5838       charset_list = Vemacs_mule_charset_list;
5839     }
5840   return charset_list;
5841 }
5842
5843
5844 /* Return a list of charsets supported by CODING-SYSTEM.  */
5845
5846 Lisp_Object
5847 coding_system_charset_list (Lisp_Object coding_system)
5848 {
5849   int id;
5850   Lisp_Object attrs, charset_list;
5851
5852   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5853   attrs = CODING_ID_ATTRS (id);
5854
5855   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5856     {
5857       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5858
5859       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5860         charset_list = Viso_2022_charset_list;
5861       else
5862         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5863     }
5864   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5865     {
5866       charset_list = Vemacs_mule_charset_list;
5867     }
5868   else
5869     {
5870       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5871     }
5872   return charset_list;
5873 }
5874
5875
5876 /* Return raw-text or one of its subsidiaries that has the same
5877    eol_type as CODING-SYSTEM.  */
5878
5879 Lisp_Object
5880 raw_text_coding_system (Lisp_Object coding_system)
5881 {
5882   Lisp_Object spec, attrs;
5883   Lisp_Object eol_type, raw_text_eol_type;
5884
5885   if (NILP (coding_system))
5886     return Qraw_text;
5887   spec = CODING_SYSTEM_SPEC (coding_system);
5888   attrs = AREF (spec, 0);
5889
5890   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5891     return coding_system;
5892
5893   eol_type = AREF (spec, 2);
5894   if (VECTORP (eol_type))
5895     return Qraw_text;
5896   spec = CODING_SYSTEM_SPEC (Qraw_text);
5897   raw_text_eol_type = AREF (spec, 2);
5898   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5899           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5900           : AREF (raw_text_eol_type, 2));
5901 }
5902
5903
5904 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5905    the subsidiary that has the same eol-spec as PARENT (if it is not
5906    nil and specifies end-of-line format) or the system's setting
5907    (system_eol_type).  */
5908
5909 Lisp_Object
5910 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5911 {
5912   Lisp_Object spec, eol_type;
5913
5914   if (NILP (coding_system))
5915     coding_system = Qraw_text;
5916   spec = CODING_SYSTEM_SPEC (coding_system);
5917   eol_type = AREF (spec, 2);
5918   if (VECTORP (eol_type))
5919     {
5920       Lisp_Object parent_eol_type;
5921
5922       if (! NILP (parent))
5923         {
5924           Lisp_Object parent_spec;
5925
5926           parent_spec = CODING_SYSTEM_SPEC (parent);
5927           parent_eol_type = AREF (parent_spec, 2);
5928           if (VECTORP (parent_eol_type))
5929             parent_eol_type = system_eol_type;
5930         }
5931       else
5932         parent_eol_type = system_eol_type;
5933       if (EQ (parent_eol_type, Qunix))
5934         coding_system = AREF (eol_type, 0);
5935       else if (EQ (parent_eol_type, Qdos))
5936         coding_system = AREF (eol_type, 1);
5937       else if (EQ (parent_eol_type, Qmac))
5938         coding_system = AREF (eol_type, 2);
5939     }
5940   return coding_system;
5941 }
5942
5943
5944 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5945    decided for writing to a process.  If not, complement them, and
5946    return a new coding system.  */
5947
5948 Lisp_Object
5949 complement_process_encoding_system (Lisp_Object coding_system)
5950 {
5951   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5952   Lisp_Object spec, attrs;
5953   int i;
5954
5955   for (i = 0; i < 3; i++)
5956     {
5957       if (i == 1)
5958         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5959       else if (i == 2)
5960         coding_system = preferred_coding_system ();
5961       spec = CODING_SYSTEM_SPEC (coding_system);
5962       if (NILP (spec))
5963         continue;
5964       attrs = AREF (spec, 0);
5965       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5966         coding_base = CODING_ATTR_BASE_NAME (attrs);
5967       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5968         eol_base = coding_system;
5969       if (! NILP (coding_base) && ! NILP (eol_base))
5970         break;
5971     }
5972
5973   if (i > 0)
5974     /* The original CODING_SYSTEM didn't specify text-conversion or
5975        eol-conversion.  Be sure that we return a fully complemented
5976        coding system.  */
5977     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5978   return coding_system;
5979 }
5980
5981
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
5987
5988    o coding-category-emacs-mule
5989
5990         The category for a coding system which has the same code range
5991         as Emacs' internal format.  Assigned the coding-system (Lisp
5992         symbol) `emacs-mule' by default.
5993
5994    o coding-category-sjis
5995
5996         The category for a coding system which has the same code range
5997         as SJIS.  Assigned the coding-system (Lisp
5998         symbol) `japanese-shift-jis' by default.
5999
6000    o coding-category-iso-7
6001
6002         The category for a coding system which has the same code range
6003         as ISO2022 of 7-bit environment.  This doesn't use any locking
6004         shift and single shift functions.  This can encode/decode all
6005         charsets.  Assigned the coding-system (Lisp symbol)
6006         `iso-2022-7bit' by default.
6007
6008    o coding-category-iso-7-tight
6009
6010         Same as coding-category-iso-7 except that this can
6011         encode/decode only the specified charsets.
6012
6013    o coding-category-iso-8-1
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 8-bit environment and graphic plane 1 used only
6017         for DIMENSION1 charset.  This doesn't use any locking shift
6018         and single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `iso-latin-1' by default.
6020
6021    o coding-category-iso-8-2
6022
6023         The category for a coding system which has the same code range
6024         as ISO2022 of 8-bit environment and graphic plane 1 used only
6025         for DIMENSION2 charset.  This doesn't use any locking shift
6026         and single shift functions.  Assigned the coding-system (Lisp
6027         symbol) `japanese-iso-8bit' by default.
6028
6029    o coding-category-iso-7-else
6030
6031         The category for a coding system which has the same code range
6032         as ISO2022 of 7-bit environment but uses locking shift or
6033         single shift functions.  Assigned the coding-system (Lisp
6034         symbol) `iso-2022-7bit-lock' by default.
6035
6036    o coding-category-iso-8-else
6037
6038         The category for a coding system which has the same code range
6039         as ISO2022 of 8-bit environment but uses locking shift or
6040         single shift functions.  Assigned the coding-system (Lisp
6041         symbol) `iso-2022-8bit-ss2' by default.
6042
6043    o coding-category-big5
6044
6045         The category for a coding system which has the same code range
6046         as BIG5.  Assigned the coding-system (Lisp symbol)
6047         `cn-big5' by default.
6048
6049    o coding-category-utf-8
6050
6051         The category for a coding system which has the same code range
6052         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6053         symbol) `utf-8' by default.
6054
6055    o coding-category-utf-16-be
6056
6057         The category for a coding system in which a text has an
6058         Unicode signature (cf. Unicode Standard) in the order of BIG
6059         endian at the head.  Assigned the coding-system (Lisp symbol)
6060         `utf-16-be' by default.
6061
6062    o coding-category-utf-16-le
6063
6064         The category for a coding system in which a text has an
6065         Unicode signature (cf. Unicode Standard) in the order of
6066         LITTLE endian at the head.  Assigned the coding-system (Lisp
6067         symbol) `utf-16-le' by default.
6068
6069    o coding-category-ccl
6070
6071         The category for a coding system of which encoder/decoder is
6072         written in CCL programs.  The default value is nil, i.e., no
6073         coding system is assigned.
6074
6075    o coding-category-binary
6076
6077         The category for a coding system not categorized in any of the
6078         above.  Assigned the coding-system (Lisp symbol)
6079         `no-conversion' by default.
6080
6081    Each of them is a Lisp symbol and the value is an actual
6082    `coding-system's (this is also a Lisp symbol) assigned by a user.
6083    What Emacs does actually is to detect a category of coding system.
6084    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6085    decide only one possible category, it selects a category of the
6086    highest priority.  Priorities of categories are also specified by a
6087    user in a Lisp variable `coding-category-list'.
6088
6089 */
6090
6091 #define EOL_SEEN_NONE   0
6092 #define EOL_SEEN_LF     1
6093 #define EOL_SEEN_CR     2
6094 #define EOL_SEEN_CRLF   4
6095
6096 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6097    SOURCE is encoded.  If CATEGORY is one of
6098    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6099    two-byte, else they are encoded by one-byte.
6100
6101    Return one of EOL_SEEN_XXX.  */
6102
6103 #define MAX_EOL_CHECK_COUNT 3
6104
6105 static int
6106 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6107             enum coding_category category)
6108 {
6109   const unsigned char *src = source, *src_end = src + src_bytes;
6110   unsigned char c;
6111   int total  = 0;
6112   int eol_seen = EOL_SEEN_NONE;
6113
6114   if ((1 << category) & CATEGORY_MASK_UTF_16)
6115     {
6116       int msb, lsb;
6117
6118       msb = category == (coding_category_utf_16_le
6119                          | coding_category_utf_16_le_nosig);
6120       lsb = 1 - msb;
6121
6122       while (src + 1 < src_end)
6123         {
6124           c = src[lsb];
6125           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6126             {
6127               int this_eol;
6128
6129               if (c == '\n')
6130                 this_eol = EOL_SEEN_LF;
6131               else if (src + 3 >= src_end
6132                        || src[msb + 2] != 0
6133                        || src[lsb + 2] != '\n')
6134                 this_eol = EOL_SEEN_CR;
6135               else
6136                 {
6137                   this_eol = EOL_SEEN_CRLF;
6138                   src += 2;
6139                 }
6140
6141               if (eol_seen == EOL_SEEN_NONE)
6142                 /* This is the first end-of-line.  */
6143                 eol_seen = this_eol;
6144               else if (eol_seen != this_eol)
6145                 {
6146                   /* The found type is different from what found before.
6147                      Allow for stray ^M characters in DOS EOL files.  */
6148                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6149                       || (eol_seen == EOL_SEEN_CRLF
6150                           && this_eol == EOL_SEEN_CR))
6151                     eol_seen = EOL_SEEN_CRLF;
6152                   else
6153                     {
6154                       eol_seen = EOL_SEEN_LF;
6155                       break;
6156                     }
6157                 }
6158               if (++total == MAX_EOL_CHECK_COUNT)
6159                 break;
6160             }
6161           src += 2;
6162         }
6163     }
6164   else
6165     while (src < src_end)
6166       {
6167         c = *src++;
6168         if (c == '\n' || c == '\r')
6169           {
6170             int this_eol;
6171
6172             if (c == '\n')
6173               this_eol = EOL_SEEN_LF;
6174             else if (src >= src_end || *src != '\n')
6175               this_eol = EOL_SEEN_CR;
6176             else
6177               this_eol = EOL_SEEN_CRLF, src++;
6178
6179             if (eol_seen == EOL_SEEN_NONE)
6180               /* This is the first end-of-line.  */
6181               eol_seen = this_eol;
6182             else if (eol_seen != this_eol)
6183               {
6184                 /* The found type is different from what found before.
6185                    Allow for stray ^M characters in DOS EOL files.  */
6186                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6187                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6188                   eol_seen = EOL_SEEN_CRLF;
6189                 else
6190                   {
6191                     eol_seen = EOL_SEEN_LF;
6192                     break;
6193                   }
6194               }
6195             if (++total == MAX_EOL_CHECK_COUNT)
6196               break;
6197           }
6198       }
6199   return eol_seen;
6200 }
6201
6202
6203 static Lisp_Object
6204 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6205 {
6206   Lisp_Object eol_type;
6207
6208   eol_type = CODING_ID_EOL_TYPE (coding->id);
6209   if (eol_seen & EOL_SEEN_LF)
6210     {
6211       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6212       eol_type = Qunix;
6213     }
6214   else if (eol_seen & EOL_SEEN_CRLF)
6215     {
6216       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6217       eol_type = Qdos;
6218     }
6219   else if (eol_seen & EOL_SEEN_CR)
6220     {
6221       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6222       eol_type = Qmac;
6223     }
6224   return eol_type;
6225 }
6226
6227 /* Detect how a text specified in CODING is encoded.  If a coding
6228    system is detected, update fields of CODING by the detected coding
6229    system.  */
6230
6231 void
6232 detect_coding (struct coding_system *coding)
6233 {
6234   const unsigned char *src, *src_end;
6235   int saved_mode = coding->mode;
6236
6237   coding->consumed = coding->consumed_char = 0;
6238   coding->produced = coding->produced_char = 0;
6239   coding_set_source (coding);
6240
6241   src_end = coding->source + coding->src_bytes;
6242   coding->head_ascii = 0;
6243
6244   /* If we have not yet decided the text encoding type, detect it
6245      now.  */
6246   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6247     {
6248       int c, i;
6249       struct coding_detection_info detect_info;
6250       int null_byte_found = 0, eight_bit_found = 0;
6251
6252       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6253       for (src = coding->source; src < src_end; src++)
6254         {
6255           c = *src;
6256           if (c & 0x80)
6257             {
6258               eight_bit_found = 1;
6259               if (null_byte_found)
6260                 break;
6261             }
6262           else if (c < 0x20)
6263             {
6264               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6265                   && ! inhibit_iso_escape_detection
6266                   && ! detect_info.checked)
6267                 {
6268                   if (detect_coding_iso_2022 (coding, &detect_info))
6269                     {
6270                       /* We have scanned the whole data.  */
6271                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6272                         {
6273                           /* We didn't find an 8-bit code.  We may
6274                              have found a null-byte, but it's very
6275                              rare that a binary file conforms to
6276                              ISO-2022.  */
6277                           src = src_end;
6278                           coding->head_ascii = src - coding->source;
6279                         }
6280                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6281                       break;
6282                     }
6283                 }
6284               else if (! c && !inhibit_null_byte_detection)
6285                 {
6286                   null_byte_found = 1;
6287                   if (eight_bit_found)
6288                     break;
6289                 }
6290               if (! eight_bit_found)
6291                 coding->head_ascii++;
6292             }
6293           else if (! eight_bit_found)
6294             coding->head_ascii++;
6295         }
6296
6297       if (null_byte_found || eight_bit_found
6298           || coding->head_ascii < coding->src_bytes
6299           || detect_info.found)
6300         {
6301           enum coding_category category;
6302           struct coding_system *this;
6303
6304           if (coding->head_ascii == coding->src_bytes)
6305             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6306             for (i = 0; i < coding_category_raw_text; i++)
6307               {
6308                 category = coding_priorities[i];
6309                 this = coding_categories + category;
6310                 if (detect_info.found & (1 << category))
6311                   break;
6312               }
6313           else
6314             {
6315               if (null_byte_found)
6316                 {
6317                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6318                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6319                 }
6320               for (i = 0; i < coding_category_raw_text; i++)
6321                 {
6322                   category = coding_priorities[i];
6323                   this = coding_categories + category;
6324                   if (this->id < 0)
6325                     {
6326                       /* No coding system of this category is defined.  */
6327                       detect_info.rejected |= (1 << category);
6328                     }
6329                   else if (category >= coding_category_raw_text)
6330                     continue;
6331                   else if (detect_info.checked & (1 << category))
6332                     {
6333                       if (detect_info.found & (1 << category))
6334                         break;
6335                     }
6336                   else if ((*(this->detector)) (coding, &detect_info)
6337                            && detect_info.found & (1 << category))
6338                     {
6339                       if (category == coding_category_utf_16_auto)
6340                         {
6341                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6342                             category = coding_category_utf_16_le;
6343                           else
6344                             category = coding_category_utf_16_be;
6345                         }
6346                       break;
6347                     }
6348                 }
6349             }
6350
6351           if (i < coding_category_raw_text)
6352             setup_coding_system (CODING_ID_NAME (this->id), coding);
6353           else if (null_byte_found)
6354             setup_coding_system (Qno_conversion, coding);
6355           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6356                    == CATEGORY_MASK_ANY)
6357             setup_coding_system (Qraw_text, coding);
6358           else if (detect_info.rejected)
6359             for (i = 0; i < coding_category_raw_text; i++)
6360               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6361                 {
6362                   this = coding_categories + coding_priorities[i];
6363                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6364                   break;
6365                 }
6366         }
6367     }
6368   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6369            == coding_category_utf_8_auto)
6370     {
6371       Lisp_Object coding_systems;
6372       struct coding_detection_info detect_info;
6373
6374       coding_systems
6375         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6376       detect_info.found = detect_info.rejected = 0;
6377       coding->head_ascii = 0;
6378       if (CONSP (coding_systems)
6379           && detect_coding_utf_8 (coding, &detect_info))
6380         {
6381           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6382             setup_coding_system (XCAR (coding_systems), coding);
6383           else
6384             setup_coding_system (XCDR (coding_systems), coding);
6385         }
6386     }
6387   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6388            == coding_category_utf_16_auto)
6389     {
6390       Lisp_Object coding_systems;
6391       struct coding_detection_info detect_info;
6392
6393       coding_systems
6394         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6395       detect_info.found = detect_info.rejected = 0;
6396       coding->head_ascii = 0;
6397       if (CONSP (coding_systems)
6398           && detect_coding_utf_16 (coding, &detect_info))
6399         {
6400           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6401             setup_coding_system (XCAR (coding_systems), coding);
6402           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6403             setup_coding_system (XCDR (coding_systems), coding);
6404         }
6405     }
6406   coding->mode = saved_mode;
6407 }
6408
6409
6410 static void
6411 decode_eol (struct coding_system *coding)
6412 {
6413   Lisp_Object eol_type;
6414   unsigned char *p, *pbeg, *pend;
6415
6416   eol_type = CODING_ID_EOL_TYPE (coding->id);
6417   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6418     return;
6419
6420   if (NILP (coding->dst_object))
6421     pbeg = coding->destination;
6422   else
6423     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6424   pend = pbeg + coding->produced;
6425
6426   if (VECTORP (eol_type))
6427     {
6428       int eol_seen = EOL_SEEN_NONE;
6429
6430       for (p = pbeg; p < pend; p++)
6431         {
6432           if (*p == '\n')
6433             eol_seen |= EOL_SEEN_LF;
6434           else if (*p == '\r')
6435             {
6436               if (p + 1 < pend && *(p + 1) == '\n')
6437                 {
6438                   eol_seen |= EOL_SEEN_CRLF;
6439                   p++;
6440                 }
6441               else
6442                 eol_seen |= EOL_SEEN_CR;
6443             }
6444         }
6445       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6446       if ((eol_seen & EOL_SEEN_CRLF) != 0
6447           && (eol_seen & EOL_SEEN_CR) != 0
6448           && (eol_seen & EOL_SEEN_LF) == 0)
6449         eol_seen = EOL_SEEN_CRLF;
6450       else if (eol_seen != EOL_SEEN_NONE
6451           && eol_seen != EOL_SEEN_LF
6452           && eol_seen != EOL_SEEN_CRLF
6453           && eol_seen != EOL_SEEN_CR)
6454         eol_seen = EOL_SEEN_LF;
6455       if (eol_seen != EOL_SEEN_NONE)
6456         eol_type = adjust_coding_eol_type (coding, eol_seen);
6457     }
6458
6459   if (EQ (eol_type, Qmac))
6460     {
6461       for (p = pbeg; p < pend; p++)
6462         if (*p == '\r')
6463           *p = '\n';
6464     }
6465   else if (EQ (eol_type, Qdos))
6466     {
6467       int n = 0;
6468
6469       if (NILP (coding->dst_object))
6470         {
6471           /* Start deleting '\r' from the tail to minimize the memory
6472              movement.  */
6473           for (p = pend - 2; p >= pbeg; p--)
6474             if (*p == '\r')
6475               {
6476                 memmove (p, p + 1, pend-- - p - 1);
6477                 n++;
6478               }
6479         }
6480       else
6481         {
6482           int pos_byte = coding->dst_pos_byte;
6483           int pos = coding->dst_pos;
6484           int pos_end = pos + coding->produced_char - 1;
6485
6486           while (pos < pos_end)
6487             {
6488               p = BYTE_POS_ADDR (pos_byte);
6489               if (*p == '\r' && p[1] == '\n')
6490                 {
6491                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6492                   n++;
6493                   pos_end--;
6494                 }
6495               pos++;
6496               if (coding->dst_multibyte)
6497                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6498               else
6499                 pos_byte++;
6500             }
6501         }
6502       coding->produced -= n;
6503       coding->produced_char -= n;
6504     }
6505 }
6506
6507
6508 /* Return a translation table (or list of them) from coding system
6509    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6510    decoding (ENCODEP is zero). */
6511
6512 static Lisp_Object
6513 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6514 {
6515   Lisp_Object standard, translation_table;
6516   Lisp_Object val;
6517
6518   if (NILP (Venable_character_translation))
6519     {
6520       if (max_lookup)
6521         *max_lookup = 0;
6522       return Qnil;
6523     }
6524   if (encodep)
6525     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6526       standard = Vstandard_translation_table_for_encode;
6527   else
6528     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6529       standard = Vstandard_translation_table_for_decode;
6530   if (NILP (translation_table))
6531     translation_table = standard;
6532   else
6533     {
6534       if (SYMBOLP (translation_table))
6535         translation_table = Fget (translation_table, Qtranslation_table);
6536       else if (CONSP (translation_table))
6537         {
6538           translation_table = Fcopy_sequence (translation_table);
6539           for (val = translation_table; CONSP (val); val = XCDR (val))
6540             if (SYMBOLP (XCAR (val)))
6541               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6542         }
6543       if (CHAR_TABLE_P (standard))
6544         {
6545           if (CONSP (translation_table))
6546             translation_table = nconc2 (translation_table,
6547                                         Fcons (standard, Qnil));
6548           else
6549             translation_table = Fcons (translation_table,
6550                                        Fcons (standard, Qnil));
6551         }
6552     }
6553
6554   if (max_lookup)
6555     {
6556       *max_lookup = 1;
6557       if (CHAR_TABLE_P (translation_table)
6558           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6559         {
6560           val = XCHAR_TABLE (translation_table)->extras[1];
6561           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6562             *max_lookup = XFASTINT (val);
6563         }
6564       else if (CONSP (translation_table))
6565         {
6566           Lisp_Object tail;
6567
6568           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6569             if (CHAR_TABLE_P (XCAR (tail))
6570                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6571               {
6572                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6573                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6574                   *max_lookup = XFASTINT (tailval);
6575               }
6576         }
6577     }
6578   return translation_table;
6579 }
6580
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are consulted in order (other elements are
   skipped).

   When a table maps C to a character, C is replaced by that character
   and, for a list, the lookup continues in the next table with the
   new C.  When a table maps C to some other non-nil value, that value
   is stored in TRANS and the lookup stops.  TRANS is Qnil if no such
   value was found.

   C and TRANS must be lvalues.  All three arguments are evaluated
   more than once, so they must not have side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF ((table), (c));                        \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6605
6606
6607 /* Return a translation of character(s) at BUF according to TRANS.
6608    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6609    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6610    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6611    translation is found, and Qnil if not found..
6612    If BUF is too short to lookup characters in FROM, return Qt.  */
6613
6614 static Lisp_Object
6615 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6616 {
6617
6618   if (INTEGERP (trans))
6619     return trans;
6620   for (; CONSP (trans); trans = XCDR (trans))
6621     {
6622       Lisp_Object val = XCAR (trans);
6623       Lisp_Object from = XCAR (val);
6624       int len = ASIZE (from);
6625       int i;
6626
6627       for (i = 0; i < len; i++)
6628         {
6629           if (buf + i == buf_end)
6630             return Qt;
6631           if (XINT (AREF (from, i)) != buf[i])
6632             break;
6633         }
6634       if (i == len)
6635         return val;
6636     }
6637   return Qnil;
6638 }
6639
6640
6641 static int
6642 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6643                int last_block)
6644 {
6645   unsigned char *dst = coding->destination + coding->produced;
6646   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6647   EMACS_INT produced;
6648   EMACS_INT produced_chars = 0;
6649   int carryover = 0;
6650
6651   if (! coding->chars_at_source)
6652     {
6653       /* Source characters are in coding->charbuf.  */
6654       int *buf = coding->charbuf;
6655       int *buf_end = buf + coding->charbuf_used;
6656
6657       if (EQ (coding->src_object, coding->dst_object))
6658         {
6659           coding_set_source (coding);
6660           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6661         }
6662
6663       while (buf < buf_end)
6664         {
6665           int c = *buf, i;
6666
6667           if (c >= 0)
6668             {
6669               int from_nchars = 1, to_nchars = 1;
6670               Lisp_Object trans = Qnil;
6671
6672               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6673               if (! NILP (trans))
6674                 {
6675                   trans = get_translation (trans, buf, buf_end);
6676                   if (INTEGERP (trans))
6677                     c = XINT (trans);
6678                   else if (CONSP (trans))
6679                     {
6680                       from_nchars = ASIZE (XCAR (trans));
6681                       trans = XCDR (trans);
6682                       if (INTEGERP (trans))
6683                         c = XINT (trans);
6684                       else
6685                         {
6686                           to_nchars = ASIZE (trans);
6687                           c = XINT (AREF (trans, 0));
6688                         }
6689                     }
6690                   else if (EQ (trans, Qt) && ! last_block)
6691                     break;
6692                 }
6693
6694               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6695                 {
6696                   dst = alloc_destination (coding,
6697                                            buf_end - buf
6698                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6699                                            dst);
6700                   if (EQ (coding->src_object, coding->dst_object))
6701                     {
6702                       coding_set_source (coding);
6703                       dst_end = (((unsigned char *) coding->source)
6704                                  + coding->consumed);
6705                     }
6706                   else
6707                     dst_end = coding->destination + coding->dst_bytes;
6708                 }
6709
6710               for (i = 0; i < to_nchars; i++)
6711                 {
6712                   if (i > 0)
6713                     c = XINT (AREF (trans, i));
6714                   if (coding->dst_multibyte
6715                       || ! CHAR_BYTE8_P (c))
6716                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6717                   else
6718                     *dst++ = CHAR_TO_BYTE8 (c);
6719                 }
6720               produced_chars += to_nchars;
6721               buf += from_nchars;
6722             }
6723           else
6724             /* This is an annotation datum.  (-C) is the length.  */
6725             buf += -c;
6726         }
6727       carryover = buf_end - buf;
6728     }
6729   else
6730     {
6731       /* Source characters are at coding->source.  */
6732       const unsigned char *src = coding->source;
6733       const unsigned char *src_end = src + coding->consumed;
6734
6735       if (EQ (coding->dst_object, coding->src_object))
6736         dst_end = (unsigned char *) src;
6737       if (coding->src_multibyte != coding->dst_multibyte)
6738         {
6739           if (coding->src_multibyte)
6740             {
6741               int multibytep = 1;
6742               EMACS_INT consumed_chars = 0;
6743
6744               while (1)
6745                 {
6746                   const unsigned char *src_base = src;
6747                   int c;
6748
6749                   ONE_MORE_BYTE (c);
6750                   if (dst == dst_end)
6751                     {
6752                       if (EQ (coding->src_object, coding->dst_object))
6753                         dst_end = (unsigned char *) src;
6754                       if (dst == dst_end)
6755                         {
6756                           EMACS_INT offset = src - coding->source;
6757
6758                           dst = alloc_destination (coding, src_end - src + 1,
6759                                                    dst);
6760                           dst_end = coding->destination + coding->dst_bytes;
6761                           coding_set_source (coding);
6762                           src = coding->source + offset;
6763                           src_end = coding->source + coding->src_bytes;
6764                           if (EQ (coding->src_object, coding->dst_object))
6765                             dst_end = (unsigned char *) src;
6766                         }
6767                     }
6768                   *dst++ = c;
6769                   produced_chars++;
6770                 }
6771             no_more_source:
6772               ;
6773             }
6774           else
6775             while (src < src_end)
6776               {
6777                 int multibytep = 1;
6778                 int c = *src++;
6779
6780                 if (dst >= dst_end - 1)
6781                   {
6782                     if (EQ (coding->src_object, coding->dst_object))
6783                       dst_end = (unsigned char *) src;
6784                     if (dst >= dst_end - 1)
6785                       {
6786                         EMACS_INT offset = src - coding->source;
6787                         EMACS_INT more_bytes;
6788
6789                         if (EQ (coding->src_object, coding->dst_object))
6790                           more_bytes = ((src_end - src) / 2) + 2;
6791                         else
6792                           more_bytes = src_end - src + 2;
6793                         dst = alloc_destination (coding, more_bytes, dst);
6794                         dst_end = coding->destination + coding->dst_bytes;
6795                         coding_set_source (coding);
6796                         src = coding->source + offset;
6797                         src_end = coding->source + coding->src_bytes;
6798                         if (EQ (coding->src_object, coding->dst_object))
6799                           dst_end = (unsigned char *) src;
6800                       }
6801                   }
6802                 EMIT_ONE_BYTE (c);
6803               }
6804         }
6805       else
6806         {
6807           if (!EQ (coding->src_object, coding->dst_object))
6808             {
6809               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6810
6811               if (require > 0)
6812                 {
6813                   EMACS_INT offset = src - coding->source;
6814
6815                   dst = alloc_destination (coding, require, dst);
6816                   coding_set_source (coding);
6817                   src = coding->source + offset;
6818                   src_end = coding->source + coding->src_bytes;
6819                 }
6820             }
6821           produced_chars = coding->consumed_char;
6822           while (src < src_end)
6823             *dst++ = *src++;
6824         }
6825     }
6826
6827   produced = dst - (coding->destination + coding->produced);
6828   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6829     insert_from_gap (produced_chars, produced);
6830   coding->produced += produced;
6831   coding->produced_char += produced_chars;
6832   return carryover;
6833 }
6834
6835 /* Compose text in CODING->object according to the annotation data at
6836    CHARBUF.  CHARBUF is an array:
6837      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6838  */
6839
6840 static INLINE void
6841 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6842 {
6843   int len;
6844   EMACS_INT to;
6845   enum composition_method method;
6846   Lisp_Object components;
6847
6848   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6849   to = pos + charbuf[2];
6850   method = (enum composition_method) (charbuf[4]);
6851
6852   if (method == COMPOSITION_RELATIVE)
6853     components = Qnil;
6854   else
6855     {
6856       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6857       int i, j;
6858
6859       if (method == COMPOSITION_WITH_RULE)
6860         len = charbuf[2] * 3 - 2;
6861       charbuf += MAX_ANNOTATION_LENGTH;
6862       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6863       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6864         {
6865           if (charbuf[i] >= 0)
6866             args[j] = make_number (charbuf[i]);
6867           else
6868             {
6869               i++;
6870               args[j] = make_number (charbuf[i] % 0x100);
6871             }
6872         }
6873       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6874     }
6875   compose_text (pos, to, components, Qnil, coding->dst_object);
6876 }
6877
6878
6879 /* Put `charset' property on text in CODING->object according to
6880    the annotation data at CHARBUF.  CHARBUF is an array:
6881      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6882  */
6883
6884 static INLINE void
6885 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6886 {
6887   EMACS_INT from = pos - charbuf[2];
6888   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6889
6890   Fput_text_property (make_number (from), make_number (pos),
6891                       Qcharset, CHARSET_NAME (charset),
6892                       coding->dst_object);
6893 }
6894
6895
/* Number of ints first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its size (in ints)
   in CODING->charbuf_size.  CHARBUF_SIZE ints are tried first; each
   time alloca yields NULL the size is halved, as long as it stays
   above 1024.

   If no memory could be obtained, record
   CODING_RESULT_INSUFFICIENT_MEM and RETURN CODING->result FROM THE
   CALLING FUNCTION.  Because of that return, and because the memory
   comes from alloca, this can only be used directly in the body of a
   function returning int, and the work area is valid only until that
   function returns.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6917
6918
6919 static void
6920 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6921 {
6922   int *charbuf = coding->charbuf;
6923   int *charbuf_end = charbuf + coding->charbuf_used;
6924
6925   if (NILP (coding->dst_object))
6926     return;
6927
6928   while (charbuf < charbuf_end)
6929     {
6930       if (*charbuf >= 0)
6931         pos++, charbuf++;
6932       else
6933         {
6934           int len = -*charbuf;
6935
6936           if (len > 2)
6937             switch (charbuf[1])
6938               {
6939               case CODING_ANNOTATE_COMPOSITION_MASK:
6940                 produce_composition (coding, charbuf, pos);
6941                 break;
6942               case CODING_ANNOTATE_CHARSET_MASK:
6943                 produce_charset (coding, charbuf, pos);
6944                 break;
6945               }
6946           charbuf += len;
6947         }
6948     }
6949 }
6950
6951 /* Decode the data at CODING->src_object into CODING->dst_object.
6952    CODING->src_object is a buffer, a string, or nil.
6953    CODING->dst_object is a buffer.
6954
6955    If CODING->src_object is a buffer, it must be the current buffer.
6956    In this case, if CODING->src_pos is positive, it is a position of
6957    the source text in the buffer, otherwise, the source text is in the
6958    gap area of the buffer, and CODING->src_pos specifies the offset of
6959    the text from GPT (which must be the same as PT).  If this is the
6960    same buffer as CODING->dst_object, CODING->src_pos must be
6961    negative.
6962
6963    If CODING->src_object is a string, CODING->src_pos is an index to
6964    that string.
6965
6966    If CODING->src_object is nil, CODING->source must already point to
6967    the non-relocatable memory area.  In this case, CODING->src_pos is
6968    an offset from CODING->source.
6969
6970    The decoded data is inserted at the current point of the buffer
6971    CODING->dst_object.
6972 */
6973
6974 static int
6975 decode_coding (struct coding_system *coding)
6976 {
6977   Lisp_Object attrs;
6978   Lisp_Object undo_list;
6979   Lisp_Object translation_table;
6980   struct ccl_spec cclspec;
6981   int carryover;
6982   int i;
6983
6984   if (BUFFERP (coding->src_object)
6985       && coding->src_pos > 0
6986       && coding->src_pos < GPT
6987       && coding->src_pos + coding->src_chars > GPT)
6988     move_gap_both (coding->src_pos, coding->src_pos_byte);
6989
6990   undo_list = Qt;
6991   if (BUFFERP (coding->dst_object))
6992     {
6993       if (current_buffer != XBUFFER (coding->dst_object))
6994         set_buffer_internal (XBUFFER (coding->dst_object));
6995       if (GPT != PT)
6996         move_gap_both (PT, PT_BYTE);
6997       undo_list = BVAR (current_buffer, undo_list);
6998       BVAR (current_buffer, undo_list) = Qt;
6999     }
7000
7001   coding->consumed = coding->consumed_char = 0;
7002   coding->produced = coding->produced_char = 0;
7003   coding->chars_at_source = 0;
7004   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7005   coding->errors = 0;
7006
7007   ALLOC_CONVERSION_WORK_AREA (coding);
7008
7009   attrs = CODING_ID_ATTRS (coding->id);
7010   translation_table = get_translation_table (attrs, 0, NULL);
7011
7012   carryover = 0;
7013   if (coding->decoder == decode_coding_ccl)
7014     {
7015       coding->spec.ccl = &cclspec;
7016       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7017     }
7018   do
7019     {
7020       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7021
7022       coding_set_source (coding);
7023       coding->annotated = 0;
7024       coding->charbuf_used = carryover;
7025       (*(coding->decoder)) (coding);
7026       coding_set_destination (coding);
7027       carryover = produce_chars (coding, translation_table, 0);
7028       if (coding->annotated)
7029         produce_annotation (coding, pos);
7030       for (i = 0; i < carryover; i++)
7031         coding->charbuf[i]
7032           = coding->charbuf[coding->charbuf_used - carryover + i];
7033     }
7034   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7035          || (coding->consumed < coding->src_bytes
7036              && (coding->result == CODING_RESULT_SUCCESS
7037                  || coding->result == CODING_RESULT_INVALID_SRC)));
7038
7039   if (carryover > 0)
7040     {
7041       coding_set_destination (coding);
7042       coding->charbuf_used = carryover;
7043       produce_chars (coding, translation_table, 1);
7044     }
7045
7046   coding->carryover_bytes = 0;
7047   if (coding->consumed < coding->src_bytes)
7048     {
7049       int nbytes = coding->src_bytes - coding->consumed;
7050       const unsigned char *src;
7051
7052       coding_set_source (coding);
7053       coding_set_destination (coding);
7054       src = coding->source + coding->consumed;
7055
7056       if (coding->mode & CODING_MODE_LAST_BLOCK)
7057         {
7058           /* Flush out unprocessed data as binary chars.  We are sure
7059              that the number of data is less than the size of
7060              coding->charbuf.  */
7061           coding->charbuf_used = 0;
7062           coding->chars_at_source = 0;
7063
7064           while (nbytes-- > 0)
7065             {
7066               int c = *src++;
7067
7068               if (c & 0x80)
7069                 c = BYTE8_TO_CHAR (c);
7070               coding->charbuf[coding->charbuf_used++] = c;
7071             }
7072           produce_chars (coding, Qnil, 1);
7073         }
7074       else
7075         {
7076           /* Record unprocessed bytes in coding->carryover.  We are
7077              sure that the number of data is less than the size of
7078              coding->carryover.  */
7079           unsigned char *p = coding->carryover;
7080
7081           if (nbytes > sizeof coding->carryover)
7082             nbytes = sizeof coding->carryover;
7083           coding->carryover_bytes = nbytes;
7084           while (nbytes-- > 0)
7085             *p++ = *src++;
7086         }
7087       coding->consumed = coding->src_bytes;
7088     }
7089
7090   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7091       && !inhibit_eol_conversion)
7092     decode_eol (coding);
7093   if (BUFFERP (coding->dst_object))
7094     {
7095       BVAR (current_buffer, undo_list) = undo_list;
7096       record_insert (coding->dst_pos, coding->produced_char);
7097     }
7098   return coding->result;
7099 }
7100
7101
7102 /* Extract an annotation datum from a composition starting at POS and
7103    ending before LIMIT of CODING->src_object (buffer or string), store
7104    the data in BUF, set *STOP to a starting position of the next
7105    composition (if any) or to LIMIT, and return the address of the
7106    next element of BUF.
7107
7108    If such an annotation is not found, set *STOP to a starting
7109    position of a composition after POS (if any) or to LIMIT, and
7110    return BUF.

        When the composition method is not COMPOSITION_RELATIVE, the
        components of the composition (a vector, a string, a single
        integer or a list) are appended to BUF, one int each, right
        after the data stored by ADD_COMPOSITION_DATA.  Any other kind
        of components value aborts.  */
7111
7112 static INLINE int *
7113 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7114                                struct coding_system *coding, int *buf,
7115                                EMACS_INT *stop)
7116 {
7117   EMACS_INT start, end;
7118   Lisp_Object prop;
7119
       /* Nothing was found, or what was found extends beyond LIMIT:
          there is no reason to stop before LIMIT.  */
7120   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7121       || end > limit)
7122     *stop = limit;
       /* The composition found begins after POS; come back when we get
          there.  */
7123   else if (start > pos)
7124     *stop = start;
7125   else
7126     {
7127       if (start == pos)
7128         {
7129           /* We found a composition.  Store the corresponding
7130              annotation data in BUF.  */
7131           int *head = buf;
7132           enum composition_method method = COMPOSITION_METHOD (prop);
7133           int nchars = COMPOSITION_LENGTH (prop);
7134
7135           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7136           if (method != COMPOSITION_RELATIVE)
7137             {
7138               Lisp_Object components;
7139               int len, i, i_byte;
7140
                   /* Append every component and count them in LEN.  */
7141               components = COMPOSITION_COMPONENTS (prop);
7142               if (VECTORP (components))
7143                 {
7144                   len = XVECTOR (components)->size;
7145                   for (i = 0; i < len; i++)
7146                     *buf++ = XINT (AREF (components, i));
7147                 }
7148               else if (STRINGP (components))
7149                 {
7150                   len = SCHARS (components);
7151                   i = i_byte = 0;
7152                   while (i < len)
7153                     {
7154                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7155                       buf++;
7156                     }
7157                 }
7158               else if (INTEGERP (components))
7159                 {
7160                   len = 1;
7161                   *buf++ = XINT (components);
7162                 }
7163               else if (CONSP (components))
7164                 {
7165                   for (len = 0; CONSP (components);
7166                        len++, components = XCDR (components))
7167                     *buf++ = XINT (XCAR (components));
7168                 }
7169               else
7170                 abort ();
                   /* HEAD is the first element stored by
                      ADD_COMPOSITION_DATA.  Presumably it holds the
                      negated length of the annotation, so this extends
                      it by the LEN components just appended -- TODO
                      confirm against the macro definition.  */
7171               *head -= len;
7172             }
7173         }
7174
           /* Whether or not data was stored (START may precede POS),
              look for the composition that follows this one to decide
              where to stop next.  */
7175       if (find_composition (end, limit, &start, &end, &prop,
7176                             coding->src_object)
7177           && end <= limit)
7178         *stop = start;
7179       else
7180         *stop = limit;
7181     }
7182   return buf;
7183 }
7184
7185
7186 /* Record the `charset' text property found at POS of
7187    CODING->src_object (a buffer or a string) as a charset annotation
7188    in BUF, and return the address of the element following the
7189    stored data.  The charset ID is stored when the property value
7190    satisfies CHARSETP; otherwise -1 is stored in its place.
7191
7192    *STOP is set to the position at which the `charset' property next
7193    changes, as reported by Fnext_single_property_change with LIMIT.  */
7194
7195 static INLINE int *
7196 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7197                            struct coding_system *coding, int *buf,
7198                            EMACS_INT *stop)
7199 {
7200   Lisp_Object prop_val, boundary;
7201   int cs_id = -1;
7202
7203   prop_val = Fget_text_property (make_number (pos), Qcharset,
7204                                  coding->src_object);
7205   if (! NILP (prop_val) && CHARSETP (prop_val))
7206     cs_id = XINT (CHARSET_SYMBOL_ID (prop_val));
7207   ADD_CHARSET_DATA (buf, 0, cs_id);
7208
7209   boundary = Fnext_single_property_change (make_number (pos), Qcharset,
7210                                            coding->src_object,
7211                                            make_number (limit));
7212   *stop = XINT (boundary);
7213   return buf;
7214 }
7215
7216
     /* Read the source text of CODING, starting CODING->consumed bytes
        into CODING->source, and store it as character codes in
        CODING->charbuf until the source is exhausted or the buffer is
        nearly full.

        On the way, `\r' is turned into `\n' when
        CODING_MODE_SELECTIVE_DISPLAY is set, end-of-line conversion is
        applied (unless it is inhibited, or the EOL type is `unix' or a
        vector), annotation data is stored at the positions the
        annotation handlers asked to stop at, and every character is
        looked up in TRANSLATION_TABLE if that is non-nil.  MAX_LOOKUP is
        the number of characters made available to one translation
        lookup.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used reflect what was read and stored, and
        CODING->chars_at_source is zero.  */
7217 static void
7218 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7219                int max_lookup)
7220 {
7221   int *buf = coding->charbuf;
7222   int *buf_end = coding->charbuf + coding->charbuf_size;
7223   const unsigned char *src = coding->source + coding->consumed;
7224   const unsigned char *src_end = coding->source + coding->src_bytes;
7225   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7226   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7227   int multibytep = coding->src_multibyte;
7228   Lisp_Object eol_type;
7229   int c;
7230   EMACS_INT stop, stop_composition, stop_charset;
7231   int *lookup_buf = NULL;
7232
       /* The lookup buffer is needed only when translating.  */
7233   if (! NILP (translation_table))
7234     lookup_buf = alloca (sizeof (int) * max_lookup);
7235
7236   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7237   if (VECTORP (eol_type))
7238     eol_type = Qunix;
7239
7240   /* Note: composition handling is not yet implemented.  */
7241   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7242
       /* STOP is the next position at which annotations must be looked
          at.  Without a source object there is nothing to annotate.  */
7243   if (NILP (coding->src_object))
7244     stop = stop_composition = stop_charset = end_pos;
7245   else
7246     {
7247       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7248         stop = stop_composition = pos;
7249       else
7250         stop = stop_composition = end_pos;
7251       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7252         stop = stop_charset = pos;
7253       else
7254         stop_charset = end_pos;
7255     }
7256
7257   /* Compensate for CRLF and conversion.  */
7258   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7259   while (buf < buf_end)
7260     {
7261       Lisp_Object trans;
7262
7263       if (pos == stop)
7264         {
7265           if (pos == end_pos)
7266             break;
7267           if (pos == stop_composition)
7268             buf = handle_composition_annotation (pos, end_pos, coding,
7269                                                  buf, &stop_composition);
7270           if (pos == stop_charset)
7271             buf = handle_charset_annotation (pos, end_pos, coding,
7272                                              buf, &stop_charset);
7273           stop = (stop_composition < stop_charset
7274                   ? stop_composition : stop_charset);
7275         }
7276
           /* Fetch the next character C and advance SRC and POS.  In a
              unibyte source, positions count bytes, hence POS advances
              by the byte length of a multibyte sequence.  */
7277       if (! multibytep)
7278         {
7279           EMACS_INT bytes;
7280
7281           if (coding->encoder == encode_coding_raw_text
7282               || coding->encoder == encode_coding_ccl)
7283             c = *src++, pos++;
7284           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7285             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7286           else
7287             c = BYTE8_TO_CHAR (*src), src++, pos++;
7288         }
7289       else
7290         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7291       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7292         c = '\n';
           /* End-of-line conversion: for `dos' a `\r' is stored in front
              of the `\n', for any other non-unix type the `\n' itself
              becomes `\r'.  */
7293       if (! EQ (eol_type, Qunix))
7294         {
7295           if (c == '\n')
7296             {
7297               if (EQ (eol_type, Qdos))
7298                 *buf++ = '\r';
7299               else
7300                 c = '\r';
7301             }
7302         }
7303
7304       trans = Qnil;
7305       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7306       if (NILP (trans))
7307         *buf++ = c;
7308       else
7309         {
7310           int from_nchars = 1, to_nchars = 1;
7311           int *lookup_buf_end;
7312           const unsigned char *p = src;
7313           int i;
7314
               /* Collect C and up to MAX_LOOKUP - 1 following source
                  characters (without consuming them) for the lookup.  */
7315           lookup_buf[0] = c;
7316           for (i = 1; i < max_lookup && p < src_end; i++)
7317             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7318           lookup_buf_end = lookup_buf + i;
7319           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7320           if (INTEGERP (trans))
7321             c = XINT (trans);
7322           else if (CONSP (trans))
7323             {
7324               from_nchars = ASIZE (XCAR (trans));
7325               trans = XCDR (trans);
7326               if (INTEGERP (trans))
7327                 c = XINT (trans);
7328               else
7329                 {
7330                   to_nchars = ASIZE (trans);
                       /* Compare the room left as a pointer difference;
                          forming BUF + TO_NCHARS could yield a pointer
                          beyond the end of the buffer.
                          NOTE(review): C was already fetched from SRC
                          when we break here (and in the `else' below);
                          confirm that it is not meant to be re-read.  */
7331                   if (buf_end - buf < to_nchars)
7332                     break;
7333                   c = XINT (AREF (trans, 0));
7334                 }
7335             }
7336           else
7337             break;
               /* Store the TO_NCHARS resulting characters and skip the
                  FROM_NCHARS - 1 extra source characters that were
                  matched by the translation.  */
7338           *buf++ = c;
7339           for (i = 1; i < to_nchars; i++)
7340             *buf++ = XINT (AREF (trans, i));
7341           for (i = 1; i < from_nchars; i++, pos++)
7342             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7343         }
7344     }
7345
7346   coding->consumed = src - coding->source;
7347   coding->consumed_char = pos - coding->src_pos;
7348   coding->charbuf_used = buf - coding->charbuf;
7349   coding->chars_at_source = 0;
7350 }
7351
7352
7353 /* Encode the text at CODING->src_object into CODING->dst_object.
7354    CODING->src_object is a buffer or a string.
7355    CODING->dst_object is a buffer or nil.
7356
7357    If CODING->src_object is a buffer, it must be the current buffer.
7358    In this case, if CODING->src_pos is positive, it is a position of
7359    the source text in the buffer; otherwise, the source text is in the
7360    gap area of the buffer, and CODING->src_pos specifies the offset of
7361    the text from GPT (which must be the same as PT).  If this is the
7362    same buffer as CODING->dst_object, CODING->src_pos must be
7363    negative and CODING should not have `pre-write-conversion'.
7364
7365    If CODING->src_object is a string, CODING should not have
7366    `pre-write-conversion'.
7367
7368    If CODING->dst_object is a buffer, the encoded data is inserted at
7369    the current point of that buffer.
7370
7371    If CODING->dst_object is nil, the encoded data is placed at the
7372    memory area specified by CODING->destination.

        Return CODING->result.  */
7373
7374 static int
7375 encode_coding (struct coding_system *coding)
7376 {
7377   Lisp_Object attrs;
7378   Lisp_Object translation_table;
7379   int max_lookup;
7380   struct ccl_spec cclspec;
7381
       /* The raw-text encoder is run without any translation table.  */
7382   attrs = CODING_ID_ATTRS (coding->id);
7383   if (coding->encoder == encode_coding_raw_text)
7384     translation_table = Qnil, max_lookup = 0;
7385   else
7386     translation_table = get_translation_table (attrs, 1, &max_lookup);
7387
       /* A destination buffer becomes the current buffer and decides the
          multibyteness of the destination.  */
7388   if (BUFFERP (coding->dst_object))
7389     {
7390       set_buffer_internal (XBUFFER (coding->dst_object));
7391       coding->dst_multibyte
7392         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7393     }
7394
       /* Start from a clean state: nothing consumed, nothing produced,
          no error.  */
7395   coding->consumed = coding->consumed_char = 0;
7396   coding->produced = coding->produced_char = 0;
7397   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7398   coding->errors = 0;
7399
7400   ALLOC_CONVERSION_WORK_AREA (coding);
7401
       /* The CCL encoder keeps its state in CCLSPEC, which lives in this
          frame for the duration of the conversion.  */
7402   if (coding->encoder == encode_coding_ccl)
7403     {
7404       coding->spec.ccl = &cclspec;
7405       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7406     }
       /* Alternate between loading characters from the source and
          running the encoder until every source character has been
          consumed.  The source and destination are set again on each
          round before they are used.  */
7407   do {
7408     coding_set_source (coding);
7409     consume_chars (coding, translation_table, max_lookup);
7410     coding_set_destination (coding);
7411     (*(coding->encoder)) (coding);
7412   } while (coding->consumed_char < coding->src_chars);
7413
       /* Presumably the encoder left its output in the gap of the
          destination buffer and insert_from_gap turns it into buffer
          text -- TODO confirm.  */
7414   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7415     insert_from_gap (coding->produced_char, coding->produced);
7416
7417   return (coding->result);
7418 }
7419
7420
7421 /* Name (or base name) of work buffer for code conversion.  */
7422 static Lisp_Object Vcode_conversion_workbuf_name;
7423
7424 /* A working buffer used by the top level conversion.  Once it is
7425    created, it is never destroyed.  It has the name
7426    Vcode_conversion_workbuf_name.  The other working buffers are
7427    destroyed after the use is finished, and their names are modified
7428    versions of Vcode_conversion_workbuf_name.  */
7429 static Lisp_Object Vcode_conversion_reused_workbuf;
7430
7431 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.  It
        is incremented each time a work buffer is requested and reset to
        zero when the reused work buffer is released.  */
7432 static int reused_workbuf_in_use;
7433
7434
7435 /* Hand out a buffer to be used as scratch space by code conversion,
7436    emptied and with its multibyteness set according to MULTIBYTE.  */
7437
7438 static Lisp_Object
7439 make_conversion_work_buffer (int multibyte)
7440 {
7441   const int shared_buffer_busy = reused_workbuf_in_use++;
7442   Lisp_Object work;
7443   struct buffer *caller_buffer;
7444
7445   if (! shared_buffer_busy)
7446     {
           /* The shared work buffer is free; recreate it first if it is
              no longer live.  */
7447       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7448         Vcode_conversion_reused_workbuf
7449           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7450       work = Vcode_conversion_reused_workbuf;
7451     }
7452   else
7453     work = Fget_buffer_create (Fgenerate_new_buffer_name
7454                                (Vcode_conversion_workbuf_name, Qnil));
7455
7456   caller_buffer = current_buffer;
7457   set_buffer_internal (XBUFFER (work));
7458   /* Modification hooks must never run in the work buffer; for
7459      example, directory_files_internal relies on file decoding not
7460      compiling new regexps.  */
7461   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7462   Ferase_buffer ();
7463   BVAR (current_buffer, undo_list) = Qt;
7464   BVAR (current_buffer, enable_multibyte_characters) = ! multibyte ? Qnil : Qt;
7465   set_buffer_internal (caller_buffer);
7466   return work;
7467 }
7468
7469
     /* Unwind function for a code conversion.  ARG is a cons whose car is
        the buffer to make current again and whose cdr is a work buffer
        or nil.  Always return Qnil.  */
7470 static Lisp_Object
7471 code_conversion_restore (Lisp_Object arg)
7472 {
7473   Lisp_Object previous, work;
7474   struct gcpro gcpro1;
7475
7476   GCPRO1 (arg);
7477   previous = XCAR (arg);
7478   work = XCDR (arg);
7479
7480   /* Free the shared work buffer for reuse; kill any other live one.  */
7481   if (! NILP (work) && EQ (work, Vcode_conversion_reused_workbuf))
7482     reused_workbuf_in_use = 0;
7483   else if (! NILP (work) && ! NILP (Fbuffer_live_p (work)))
7484     Fkill_buffer (work);
7485
7486   set_buffer_internal (XBUFFER (previous));
7487   UNGCPRO;
7488   return Qnil;
7489 }
7490
     /* Arrange for code_conversion_restore to be run on unwinding, with
        the current buffer and the work buffer (if any) as its argument.
        If WITH_WORK_BUF is nonzero, first obtain a work buffer whose
        multibyteness is given by MULTIBYTE.  Return the work buffer, or
        Qnil if none was requested.  */
7491 Lisp_Object
7492 code_conversion_save (int with_work_buf, int multibyte)
7493 {
7494   Lisp_Object work_buffer, saved_state;
7495
7496   work_buffer = (with_work_buf
7497                  ? make_conversion_work_buffer (multibyte) : Qnil);
7498   saved_state = Fcons (Fcurrent_buffer (), work_buffer);
7499   record_unwind_protect (code_conversion_restore, saved_state);
7500   return work_buffer;
7501 }
7502
     /* Decode CHARS characters (BYTES bytes) of source text by CODING,
        with the current buffer as both source and destination.  The
        source position is given as the negative offsets -CHARS and
        -BYTES, and the destination position is point.

        The coding system is detected first if CODING requires it.  If
        the coding system has a post-read-conversion function, it is
        called with the number of produced characters, with point
        temporarily at the start of the decoded text, and must return a
        non-negative integer.

        Return CODING->result.  */
7503 int
7504 decode_coding_gap (struct coding_system *coding,
7505                    EMACS_INT chars, EMACS_INT bytes)
7506 {
7507   int count = SPECPDL_INDEX ();
7508   Lisp_Object attrs;
7509
       /* No work buffer is needed; this only arranges for the current
          buffer to be restored on unwinding.  */
7510   code_conversion_save (0, 0);
7511
7512   coding->src_object = Fcurrent_buffer ();
7513   coding->src_chars = chars;
7514   coding->src_bytes = bytes;
7515   coding->src_pos = -chars;
7516   coding->src_pos_byte = -bytes;
       /* Fewer characters than bytes means multibyte sequences.  */
7517   coding->src_multibyte = chars < bytes;
7518   coding->dst_object = coding->src_object;
7519   coding->dst_pos = PT;
7520   coding->dst_pos_byte = PT_BYTE;
7521   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7522
7523   if (CODING_REQUIRE_DETECTION (coding))
7524     detect_coding (coding);
7525
       /* The whole source is decoded in one go.  The buffer text is
          flagged with inhibit_shrinking for the duration of the
          decoding.  */
7526   coding->mode |= CODING_MODE_LAST_BLOCK;
7527   current_buffer->text->inhibit_shrinking = 1;
7528   decode_coding (coding);
7529   current_buffer->text->inhibit_shrinking = 0;
7530
7531   attrs = CODING_ID_ATTRS (coding->id);
7532   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7533     {
7534       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7535       Lisp_Object val;
7536
7537       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7538       val = call1 (CODING_ATTR_POST_READ (attrs),
7539                    make_number (coding->produced_char));
7540       CHECK_NATNUM (val);
           /* Account for whatever the function inserted or deleted by
              measuring how far the end of the buffer has moved.  */
7541       coding->produced_char += Z - prev_Z;
7542       coding->produced += Z_BYTE - prev_Z_BYTE;
7543     }
7544
7545   unbind_to (count, Qnil);
7546   return coding->result;
7547 }
7548
     /* Encode CHARS characters (BYTES bytes) of source text by CODING,
        with the current buffer as both source and destination.  The
        source position is given as the negative offsets -CHARS and
        -BYTES, and the destination position is point.  Return
        CODING->result.  */
7549 int
7550 encode_coding_gap (struct coding_system *coding,
7551                    EMACS_INT chars, EMACS_INT bytes)
7552 {
7553   const int unwind_mark = SPECPDL_INDEX ();
7554   Lisp_Object this_buffer;
7555
7556   code_conversion_save (0, 0);
7557   this_buffer = Fcurrent_buffer ();
7558   coding->src_object = this_buffer;
7559   coding->src_pos = -chars;
7560   coding->src_pos_byte = -bytes;
7561   coding->src_chars = chars;
7562   coding->src_bytes = bytes;
7563   coding->src_multibyte = (chars < bytes);
7564   coding->dst_object = this_buffer;
7565   coding->dst_pos = PT;
7566   coding->dst_pos_byte = PT_BYTE;
7567
7568   encode_coding (coding);
7569   unbind_to (unwind_mark, Qnil);
7570   return coding->result;
7571 }
7572
7573
7574 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7575    SRC_OBJECT into DST_OBJECT by coding context CODING.
7576
7577    SRC_OBJECT is a buffer, a string, or Qnil.
7578
7579    If it is a buffer, the text is at point of the buffer.  FROM and TO
7580    are positions in the buffer.
7581
7582    If it is a string, the text is at the beginning of the string.
7583    FROM and TO are indices to the string.
7584
7585    If it is nil, the text is at coding->source.  FROM and TO are
7586    indices to coding->source.
7587
7588    DST_OBJECT is a buffer, Qt, or Qnil.
7589
7590    If it is a buffer, the decoded text is inserted at point of the
7591    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7592    is deleted.
7593
7594    If it is Qt, a string is made from the decoded text, and
7595    set in CODING->dst_object.
7596
7597    If it is Qnil, the decoded text is stored at CODING->destination.
7598    The caller must allocate CODING->dst_bytes bytes at
7599    CODING->destination by xmalloc.  If the decoded text is longer than
7600    CODING->dst_bytes, CODING->destination is relocated by xrealloc.

        Nothing is returned; the outcome is left in CODING (for
        instance CODING->result, CODING->produced and
        CODING->produced_char).  Vdeactivate_mark is preserved across
        the call.
7601  */
7602
7603 void
7604 decode_coding_object (struct coding_system *coding,
7605                       Lisp_Object src_object,
7606                       EMACS_INT from, EMACS_INT from_byte,
7607                       EMACS_INT to, EMACS_INT to_byte,
7608                       Lisp_Object dst_object)
7609 {
7610   int count = SPECPDL_INDEX ();
7611   unsigned char *destination IF_LINT (= NULL);
7612   EMACS_INT dst_bytes IF_LINT (= 0);
7613   EMACS_INT chars = to - from;
7614   EMACS_INT bytes = to_byte - from_byte;
7615   Lisp_Object attrs;
       /* Saved point; -1 means that point was not moved.  These hold
          buffer positions, so they must be as wide as EMACS_INT.  */
7616   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7617   int need_marker_adjustment = 0;
7618   Lisp_Object old_deactivate_mark;
7619
7620   old_deactivate_mark = Vdeactivate_mark;
7621
       /* Remember the memory area supplied by the caller.  */
7622   if (NILP (dst_object))
7623     {
7624       destination = coding->destination;
7625       dst_bytes = coding->dst_bytes;
7626     }
7627
7628   coding->src_object = src_object;
7629   coding->src_chars = chars;
7630   coding->src_bytes = bytes;
       /* Fewer characters than bytes means multibyte sequences.  */
7631   coding->src_multibyte = chars < bytes;
7632
7633   if (STRINGP (src_object))
7634     {
7635       coding->src_pos = from;
7636       coding->src_pos_byte = from_byte;
7637     }
7638   else if (BUFFERP (src_object))
7639     {
7640       set_buffer_internal (XBUFFER (src_object));
7641       if (from != GPT)
7642         move_gap_both (from, from_byte);
7643       if (EQ (src_object, dst_object))
7644         {
7645           struct Lisp_Marker *tail;
7646
               /* Flag the markers sitting exactly at the edge of the
                  region (FROM for insertion-type markers, TO for the
                  others); they are repositioned by hand at the end.  */
7647           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7648             {
7649               tail->need_adjustment
7650                 = tail->charpos == (tail->insertion_type ? from : to);
7651               need_marker_adjustment |= tail->need_adjustment;
7652             }
               /* Decoding in place: delete the source text and refer
                  to it by negative offsets from now on.  */
7653           saved_pt = PT, saved_pt_byte = PT_BYTE;
7654           TEMP_SET_PT_BOTH (from, from_byte);
7655           current_buffer->text->inhibit_shrinking = 1;
7656           del_range_both (from, from_byte, to, to_byte, 1);
7657           coding->src_pos = -chars;
7658           coding->src_pos_byte = -bytes;
7659         }
7660       else
7661         {
7662           coding->src_pos = from;
7663           coding->src_pos_byte = from_byte;
7664         }
7665     }
7666
7667   if (CODING_REQUIRE_DETECTION (coding))
7668     detect_coding (coding);
7669   attrs = CODING_ID_ATTRS (coding->id);
7670
       /* Select the destination.  A work buffer is used when a string
          is requested, and also when the result goes to memory but a
          post-read-conversion function has to be run first.  */
7671   if (EQ (dst_object, Qt)
7672       || (! NILP (CODING_ATTR_POST_READ (attrs))
7673           && NILP (dst_object)))
7674     {
7675       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7676       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7677       coding->dst_pos = BEG;
7678       coding->dst_pos_byte = BEG_BYTE;
7679     }
7680   else if (BUFFERP (dst_object))
7681     {
7682       code_conversion_save (0, 0);
7683       coding->dst_object = dst_object;
7684       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7685       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7686       coding->dst_multibyte
7687         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7688     }
7689   else
7690     {
7691       code_conversion_save (0, 0);
7692       coding->dst_object = Qnil;
7693       /* Most callers presume this will return a multibyte result, and they
7694          won't use `binary' or `raw-text' anyway, so let's not worry about
7695          CODING_FOR_UNIBYTE.  */
7696       coding->dst_multibyte = 1;
7697     }
7698
7699   decode_coding (coding);
7700
7701   if (BUFFERP (coding->dst_object))
7702     set_buffer_internal (XBUFFER (coding->dst_object));
7703
       /* Run the post-read-conversion function on the decoded text and
          account for the changes it made by measuring how far the end
          of the buffer has moved.  */
7704   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7705     {
7706       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7707       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7708       Lisp_Object val;
7709
7710       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7711       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7712               old_deactivate_mark);
7713       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7714                         make_number (coding->produced_char));
7715       UNGCPRO;
7716       CHECK_NATNUM (val);
7717       coding->produced_char += Z - prev_Z;
7718       coding->produced += Z_BYTE - prev_Z_BYTE;
7719     }
7720
7721   if (EQ (dst_object, Qt))
7722     {
7723       coding->dst_object = Fbuffer_string ();
7724     }
7725   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7726     {
           /* The text was decoded into a work buffer; copy it out to
              the memory area of the caller.
              NOTE(review): the copy is made only when the area has to
              be enlarged.  Confirm what the caller is meant to find at
              CODING->destination when DST_BYTES is already large
              enough.  */
7727       set_buffer_internal (XBUFFER (coding->dst_object));
7728       if (dst_bytes < coding->produced)
7729         {
7730           destination = xrealloc (destination, coding->produced);
7731           if (! destination)
7732             {
7733               record_conversion_result (coding,
7734                                         CODING_RESULT_INSUFFICIENT_MEM);
7735               unbind_to (count, Qnil);
7736               return;
7737             }
               /* Make the text to copy contiguous in memory.  */
7738           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7739             move_gap_both (BEGV, BEGV_BYTE);
7740           memcpy (destination, BEGV_ADDR, coding->produced);
7741           coding->destination = destination;
7742         }
7743     }
7744
7745   if (saved_pt >= 0)
7746     {
7747       /* This is the case of:
7748          (BUFFERP (src_object) && EQ (src_object, dst_object))
7749          As we have moved PT while replacing the original buffer
7750          contents, we must recover it now.  */
7751       set_buffer_internal (XBUFFER (src_object));
7752       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region stays put, point inside it goes
              to FROM, and point after it is shifted by the change in
              size (in a unibyte buffer characters are bytes).  */
7753       if (saved_pt < from)
7754         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7755       else if (saved_pt < from + chars)
7756         TEMP_SET_PT_BOTH (from, from_byte);
7757       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7758         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7759                           saved_pt_byte + (coding->produced - bytes));
7760       else
7761         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7762                           saved_pt_byte + (coding->produced - bytes));
7763
7764       if (need_marker_adjustment)
7765         {
7766           struct Lisp_Marker *tail;
7767
               /* Put the flagged markers back at the edges of the new
                  text: insertion-type markers at its start, the others
                  at its end.  */
7768           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7769             if (tail->need_adjustment)
7770               {
7771                 tail->need_adjustment = 0;
7772                 if (tail->insertion_type)
7773                   {
7774                     tail->bytepos = from_byte;
7775                     tail->charpos = from;
7776                   }
7777                 else
7778                   {
7779                     tail->bytepos = from_byte + coding->produced;
7780                     tail->charpos
7781                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7782                          ? tail->bytepos : from + coding->produced_char);
7783                   }
7784               }
7785         }
7786     }
7787
7788   Vdeactivate_mark = old_deactivate_mark;
7789   unbind_to (count, coding->dst_object);
7790 }
7791
7792
     /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
        SRC_OBJECT into DST_OBJECT by coding context CODING.

        SRC_OBJECT is handled as a string, as a buffer, or otherwise as
        text located at CODING->source.

        If DST_OBJECT is a buffer, the encoded text goes into it; if it
        is the same buffer as SRC_OBJECT, the source text is deleted and
        the result takes its place at FROM.  If DST_OBJECT is Qt, a
        unibyte string is made from the encoded text and set in
        CODING->dst_object.  Otherwise CODING->dst_object is set to nil
        and CODING->destination is left as set up by the caller.

        If the coding system has a pre-write-conversion function, the
        source text is first copied to a work buffer, the function is
        called there with the bounds of that buffer, and the contents of
        whatever buffer is current afterwards are encoded instead.

        Nothing is returned; the outcome is left in CODING.
        Vdeactivate_mark is preserved across the call.  */
7793 void
7794 encode_coding_object (struct coding_system *coding,
7795                       Lisp_Object src_object,
7796                       EMACS_INT from, EMACS_INT from_byte,
7797                       EMACS_INT to, EMACS_INT to_byte,
7798                       Lisp_Object dst_object)
7799 {
7800   int count = SPECPDL_INDEX ();
7801   EMACS_INT chars = to - from;
7802   EMACS_INT bytes = to_byte - from_byte;
7803   Lisp_Object attrs;
       /* Saved point; -1 means that point was not saved.  These hold
          buffer positions, so they must be as wide as EMACS_INT.  */
7804   EMACS_INT saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7805   int need_marker_adjustment = 0;
7806   int kill_src_buffer = 0;
7807   Lisp_Object old_deactivate_mark;
7808
7809   old_deactivate_mark = Vdeactivate_mark;
7810
7811   coding->src_object = src_object;
7812   coding->src_chars = chars;
7813   coding->src_bytes = bytes;
       /* Fewer characters than bytes means multibyte sequences.  */
7814   coding->src_multibyte = chars < bytes;
7815
7816   attrs = CODING_ID_ATTRS (coding->id);
7817
       /* Encoding a buffer in place.  SRC_OBJECT and DST_OBJECT are
          also EQ when both are nil, hence the BUFFERP test.  Flag the
          markers sitting exactly at the edge of the region (FROM for
          insertion-type markers, TO for the others); they are
          repositioned by hand at the end.  */
7818   if (BUFFERP (src_object) && EQ (src_object, dst_object))
7819     {
7820       struct Lisp_Marker *tail;
7821
7822       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7823         {
7824           tail->need_adjustment
7825             = tail->charpos == (tail->insertion_type ? from : to);
7826           need_marker_adjustment |= tail->need_adjustment;
7827         }
7828     }
7829
7830   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7831     {
           /* Copy the source text into a work buffer for the
              pre-write-conversion function to operate on.  */
7832       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7833       set_buffer_internal (XBUFFER (coding->src_object));
7834       if (STRINGP (src_object))
7835         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7836       else if (BUFFERP (src_object))
7837         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7838       else
7839         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7840
           /* In-place encoding of a buffer: the original text is no
              longer needed there.  XBUFFER below requires a buffer,
              hence the BUFFERP test.  */
7841       if (BUFFERP (src_object) && EQ (src_object, dst_object))
7842         {
7843           set_buffer_internal (XBUFFER (src_object));
7844           saved_pt = PT, saved_pt_byte = PT_BYTE;
7845           del_range_both (from, from_byte, to, to_byte, 1);
7846           set_buffer_internal (XBUFFER (coding->src_object));
7847         }
7848
7849       {
7850         Lisp_Object args[3];
7851         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7852
7853         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7854                 old_deactivate_mark);
7855         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7856         args[1] = make_number (BEG);
7857         args[2] = make_number (Z);
7858         safe_call (3, args);
7859         UNGCPRO;
7860       }
           /* The function may have left another buffer current.  That
              buffer becomes the source and is killed when we are
              done.  */
7861       if (XBUFFER (coding->src_object) != current_buffer)
7862         kill_src_buffer = 1;
7863       coding->src_object = Fcurrent_buffer ();
7864       if (BEG != GPT)
7865         move_gap_both (BEG, BEG_BYTE);
7866       coding->src_chars = Z - BEG;
7867       coding->src_bytes = Z_BYTE - BEG_BYTE;
7868       coding->src_pos = BEG;
7869       coding->src_pos_byte = BEG_BYTE;
7870       coding->src_multibyte = Z < Z_BYTE;
7871     }
7872   else if (STRINGP (src_object))
7873     {
7874       code_conversion_save (0, 0);
7875       coding->src_pos = from;
7876       coding->src_pos_byte = from_byte;
7877     }
7878   else if (BUFFERP (src_object))
7879     {
7880       code_conversion_save (0, 0);
7881       set_buffer_internal (XBUFFER (src_object));
7882       if (EQ (src_object, dst_object))
7883         {
               /* Encoding in place: the deleted text itself becomes
                  the source (presumably del_range_1 returns it as a
                  string here, since the source position is reset to
                  0 -- TODO confirm).  */
7884           saved_pt = PT, saved_pt_byte = PT_BYTE;
7885           coding->src_object = del_range_1 (from, to, 1, 1);
7886           coding->src_pos = 0;
7887           coding->src_pos_byte = 0;
7888         }
7889       else
7890         {
               /* Move the gap out of the way if it lies within the
                  source text.  */
7891           if (from < GPT && to >= GPT)
7892             move_gap_both (from, from_byte);
7893           coding->src_pos = from;
7894           coding->src_pos_byte = from_byte;
7895         }
7896     }
7897   else
7898     code_conversion_save (0, 0);
7899
       /* Set up the destination.  */
7900   if (BUFFERP (dst_object))
7901     {
7902       coding->dst_object = dst_object;
7903       if (EQ (src_object, dst_object))
7904         {
7905           coding->dst_pos = from;
7906           coding->dst_pos_byte = from_byte;
7907         }
7908       else
7909         {
7910           struct buffer *current = current_buffer;
7911
               /* Insert at point of the destination buffer, with its
                  gap moved there.  */
7912           set_buffer_temp (XBUFFER (dst_object));
7913           coding->dst_pos = PT;
7914           coding->dst_pos_byte = PT_BYTE;
7915           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7916           set_buffer_temp (current);
7917         }
7918       coding->dst_multibyte
7919         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7920     }
7921   else if (EQ (dst_object, Qt))
7922     {
           /* Encode into freshly allocated memory, initially one byte
              per source character (at least one byte); the string is
              made from it after encoding.  */
7923       coding->dst_object = Qnil;
7924       coding->dst_bytes = coding->src_chars;
7925       if (coding->dst_bytes == 0)
7926         coding->dst_bytes = 1;
7927       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7928       coding->dst_multibyte = 0;
7929     }
7930   else
7931     {
7932       coding->dst_object = Qnil;
7933       coding->dst_multibyte = 0;
7934     }
7935
7936   encode_coding (coding);
7937
7938   if (EQ (dst_object, Qt))
7939     {
7940       if (BUFFERP (coding->dst_object))
7941         coding->dst_object = Fbuffer_string ();
7942       else
7943         {
7944           coding->dst_object
7945             = make_unibyte_string ((char *) coding->destination,
7946                                    coding->produced);
7947           xfree (coding->destination);
7948         }
7949     }
7950
7951   if (saved_pt >= 0)
7952     {
7953       /* This is the case of:
7954          (BUFFERP (src_object) && EQ (src_object, dst_object))
7955          As we have moved PT while replacing the original buffer
7956          contents, we must recover it now.  */
7957       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region stays put, point inside it goes
              to FROM, and point after it is shifted by the change in
              size (in a unibyte buffer characters are bytes).  */
7958       if (saved_pt < from)
7959         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7960       else if (saved_pt < from + chars)
7961         TEMP_SET_PT_BOTH (from, from_byte);
7962       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7963         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7964                           saved_pt_byte + (coding->produced - bytes));
7965       else
7966         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7967                           saved_pt_byte + (coding->produced - bytes));
7968
7969       if (need_marker_adjustment)
7970         {
7971           struct Lisp_Marker *tail;
7972
               /* Put the flagged markers back at the edges of the new
                  text: insertion-type markers at its start, the others
                  at its end.  */
7973           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7974             if (tail->need_adjustment)
7975               {
7976                 tail->need_adjustment = 0;
7977                 if (tail->insertion_type)
7978                   {
7979                     tail->bytepos = from_byte;
7980                     tail->charpos = from;
7981                   }
7982                 else
7983                   {
7984                     tail->bytepos = from_byte + coding->produced;
7985                     tail->charpos
7986                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7987                          ? tail->bytepos : from + coding->produced_char);
7988                   }
7989               }
7990         }
7991     }
7992
7993   if (kill_src_buffer)
7994     Fkill_buffer (coding->src_object);
7995
7996   Vdeactivate_mark = old_deactivate_mark;
7997   unbind_to (count, Qnil);
7998 }
7999
8000
8001 Lisp_Object
8002 preferred_coding_system (void)
8003 {
8004   int id = coding_categories[coding_priorities[0]].id;
8005
8006   return CODING_ID_NAME (id);
8007 }
8008
8009 \f
8010 #ifdef emacs
8011 /*** 11. Emacs Lisp library functions ***/
8012
8013 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8014        doc: /* Return t if OBJECT is nil or a coding-system.
8015 See the documentation of `define-coding-system' for information
8016 about coding-system objects.  */)
8017   (Lisp_Object object)
8018 {
8019   if (NILP (object)
8020       || CODING_SYSTEM_ID (object) >= 0)
8021     return Qt;
8022   if (! SYMBOLP (object)
8023       || NILP (Fget (object, Qcoding_system_define_form)))
8024     return Qnil;
8025   return Qt;
8026 }
8027
8028 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8029        Sread_non_nil_coding_system, 1, 1, 0,
8030        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8031   (Lisp_Object prompt)
8032 {
8033   Lisp_Object val;
8034   do
8035     {
8036       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8037                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8038     }
8039   while (SCHARS (val) == 0);
8040   return (Fintern (val, Qnil));
8041 }
8042
8043 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8044        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8045 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8046 Ignores case when completing coding systems (all Emacs coding systems
8047 are lower-case).  */)
8048   (Lisp_Object prompt, Lisp_Object default_coding_system)
8049 {
8050   Lisp_Object val;
8051   int count = SPECPDL_INDEX ();
8052
8053   if (SYMBOLP (default_coding_system))
8054     default_coding_system = SYMBOL_NAME (default_coding_system);
8055   specbind (Qcompletion_ignore_case, Qt);
8056   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8057                           Qt, Qnil, Qcoding_system_history,
8058                           default_coding_system, Qnil);
8059   unbind_to (count, Qnil);
8060   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8061 }
8062
8063 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8064        1, 1, 0,
8065        doc: /* Check validity of CODING-SYSTEM.
8066 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8067 It is valid if it is nil or a symbol defined as a coding system by the
8068 function `define-coding-system'.  */)
8069   (Lisp_Object coding_system)
8070 {
8071   Lisp_Object define_form;
8072
8073   define_form = Fget (coding_system, Qcoding_system_define_form);
8074   if (! NILP (define_form))
8075     {
8076       Fput (coding_system, Qcoding_system_define_form, Qnil);
8077       safe_eval (define_form);
8078     }
8079   if (!NILP (Fcoding_system_p (coding_system)))
8080     return coding_system;
8081   xsignal1 (Qcoding_system_error, coding_system);
8082 }
8083
8084 \f
8085 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8086    HIGHEST is nonzero, return the coding system of the highest
8087    priority among the detected coding systems.  Otherwise return a
8088    list of detected coding systems sorted by their priorities.  If
8089    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8090    multibyte form but contain only ASCII and eight-bit chars.
8091    Otherwise, the bytes are raw bytes.
8092
8093    CODING-SYSTEM controls the detection as below:
8094
8095    If it is nil, detect both text-format and eol-format.  If the
8096    text-format part of CODING-SYSTEM is already specified
8097    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8098    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8099    detect only text-format.

        SRC_CHARS is only recorded in the coding structure handed to the
        detectors.  The candidates are first collected in VAL as coding
        system ids and are converted to names (or eol-specific
        subsidiaries) before returning.  When HIGHEST is nonzero and VAL
        ends up empty, the value is nil.  */
8100
8101 Lisp_Object
8102 detect_coding_system (const unsigned char *src,
8103                       EMACS_INT src_chars, EMACS_INT src_bytes,
8104                       int highest, int multibytep,
8105                       Lisp_Object coding_system)
8106 {
8107   const unsigned char *src_end = src + src_bytes;
8108   Lisp_Object attrs, eol_type;
8109   Lisp_Object val = Qnil;
8110   struct coding_system coding;
8111   int id;
8112   struct coding_detection_info detect_info;
8113   enum coding_category base_category;
8114   int null_byte_found = 0, eight_bit_found = 0;
8115
       /* A nil CODING_SYSTEM is treated as `undecided'.  From here on
          CODING_SYSTEM holds the base name taken from the attributes.  */
8116   if (NILP (coding_system))
8117     coding_system = Qundecided;
8118   setup_coding_system (coding_system, &coding);
8119   attrs = CODING_ID_ATTRS (coding.id);
8120   eol_type = CODING_ID_EOL_TYPE (coding.id);
8121   coding_system = CODING_ATTR_BASE_NAME (attrs);
8122
       /* Describe the input to the detectors through CODING.  */
8123   coding.source = src;
8124   coding.src_chars = src_chars;
8125   coding.src_bytes = src_bytes;
8126   coding.src_multibyte = multibytep;
8127   coding.consumed = 0;
8128   coding.mode |= CODING_MODE_LAST_BLOCK;
8129   coding.head_ascii = 0;
8130
8131   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8132
8133   /* At first, detect text-format if necessary.  */
8134   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8135   if (base_category == coding_category_undecided)
8136     {
8137       enum coding_category category IF_LINT (= 0);
8138       struct coding_system *this IF_LINT (= NULL);
8139       int c, i;
8140
8141       /* Skip all ASCII bytes except for a few ISO2022 controls.
                coding.head_ascii counts the 7-bit bytes seen before the
                first eight-bit byte.  The scan stops early once both a
                null byte and an eight-bit byte have been seen, or when the
                ISO-2022 detector reports that it scanned the whole data.  */
8142       for (; src < src_end; src++)
8143         {
8144           c = *src;
8145           if (c & 0x80)
8146             {
8147               eight_bit_found = 1;
8148               if (null_byte_found)
8149                 break;
8150             }
8151           else if (c < 0x20)
8152             {
                    /* The ISO-2022 detector is tried at most once: it is
                       skipped as soon as detect_info.checked is nonzero.  */
8153               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8154                   && ! inhibit_iso_escape_detection
8155                   && ! detect_info.checked)
8156                 {
8157                   if (detect_coding_iso_2022 (&coding, &detect_info))
8158                     {
8159                       /* We have scanned the whole data.  */
8160                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8161                         {
8162                           /* We didn't find an 8-bit code.  We may
8163                              have found a null-byte, but it's very
8164                              rare that a binary file conforms to
8165                              ISO-2022.  */
8166                           src = src_end;
8167                           coding.head_ascii = src - coding.source;
8168                         }
8169                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8170                       break;
8171                     }
8172                 }
8173               else if (! c && !inhibit_null_byte_detection)
8174                 {
8175                   null_byte_found = 1;
8176                   if (eight_bit_found)
8177                     break;
8178                 }
8179               if (! eight_bit_found)
8180                 coding.head_ascii++;
8181             }
8182           else if (! eight_bit_found)
8183             coding.head_ascii++;
8184         }
8185
            /* Consult the per-category results only when the scan saw
               something other than plain 7-bit text, or a detector has
               already reported a match.  */
8186       if (null_byte_found || eight_bit_found
8187           || coding.head_ascii < coding.src_bytes
8188           || detect_info.found)
8189         {
8190           if (coding.head_ascii == coding.src_bytes)
8191             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8192             for (i = 0; i < coding_category_raw_text; i++)
8193               {
8194                 category = coding_priorities[i];
8195                 this = coding_categories + category;
8196                 if (detect_info.found & (1 << category))
8197                   break;
8198               }
8199           else
8200             {
8201               if (null_byte_found)
8202                 {
                        /* Mark every category outside CATEGORY_MASK_UTF_16
                           as already checked and rejected, so that only
                           the remaining detectors are run below.  */
8203                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8204                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8205                 }
                    /* Walk the categories in priority order, running the
                       detector of each category not checked yet.  With
                       HIGHEST, stop at the first category found.  */
8206               for (i = 0; i < coding_category_raw_text; i++)
8207                 {
8208                   category = coding_priorities[i];
8209                   this = coding_categories + category;
8210
8211                   if (this->id < 0)
8212                     {
8213                       /* No coding system of this category is defined.  */
8214                       detect_info.rejected |= (1 << category);
8215                     }
8216                   else if (category >= coding_category_raw_text)
8217                     continue;
8218                   else if (detect_info.checked & (1 << category))
8219                     {
8220                       if (highest
8221                           && (detect_info.found & (1 << category)))
8222                         break;
8223                     }
8224                   else if ((*(this->detector)) (&coding, &detect_info)
8225                            && highest
8226                            && (detect_info.found & (1 << category)))
8227                     {
                            /* For the auto-detecting UTF-16 category,
                               record the concrete byte order instead.
                               THIS is left pointing at the auto category.  */
8228                       if (category == coding_category_utf_16_auto)
8229                         {
8230                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8231                             category = coding_category_utf_16_le;
8232                           else
8233                             category = coding_category_utf_16_be;
8234                         }
8235                       break;
8236                     }
8237                 }
8238             }
8239         }
8240
            /* Turn the detection result into VAL, a list of coding system
               ids.  */
8241       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8242           || null_byte_found)
8243         {
                /* Everything was rejected, or a null byte was seen: answer
                   with `no-conversion' only.  */
8244           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8245           id = CODING_SYSTEM_ID (Qno_conversion);
8246           val = Fcons (make_number (id), Qnil);
8247         }
8248       else if (! detect_info.rejected && ! detect_info.found)
8249         {
                /* Nothing rejected and nothing found: answer with the
                   coding system of the undecided category.  */
8250           detect_info.found = CATEGORY_MASK_ANY;
8251           id = coding_categories[coding_category_undecided].id;
8252           val = Fcons (make_number (id), Qnil);
8253         }
8254       else if (highest)
8255         {
                /* A single answer: the category at which the loop above
                   stopped if something was found, otherwise the first
                   category in priority order that was not rejected.  */
8256           if (detect_info.found)
8257             {
8258               detect_info.found = 1 << category;
8259               val = Fcons (make_number (this->id), Qnil);
8260             }
8261           else
8262             for (i = 0; i < coding_category_raw_text; i++)
8263               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8264                 {
8265                   detect_info.found = 1 << coding_priorities[i];
8266                   id = coding_categories[coding_priorities[i]].id;
8267                   val = Fcons (make_number (id), Qnil);
8268                   break;
8269                 }
8270         }
8271       else
8272         {
8273           int mask = detect_info.rejected | detect_info.found;
8274           int found = 0;
8275
                /* Both loops walk the priorities from last to first and
                   push onto the front of VAL.  The first pushes categories
                   that were neither rejected nor found; the second then
                   pushes the found ones, which therefore end up ahead of
                   them in VAL.  */
8276           for (i = coding_category_raw_text - 1; i >= 0; i--)
8277             {
8278               category = coding_priorities[i];
8279               if (! (mask & (1 << category)))
8280                 {
8281                   found |= 1 << category;
8282                   id = coding_categories[category].id;
8283                   if (id >= 0)
8284                     val = Fcons (make_number (id), val);
8285                 }
8286             }
8287           for (i = coding_category_raw_text - 1; i >= 0; i--)
8288             {
8289               category = coding_priorities[i];
8290               if (detect_info.found & (1 << category))
8291                 {
8292                   id = coding_categories[category].id;
8293                   val = Fcons (make_number (id), val);
8294                 }
8295             }
8296           detect_info.found |= found;
8297         }
8298     }
8299   else if (base_category == coding_category_utf_8_auto)
8300     {
            /* Only the presence of a signature is undecided here.  VAL
               stays nil when the UTF-8 detector returns zero.  */
8301       if (detect_coding_utf_8 (&coding, &detect_info))
8302         {
8303           struct coding_system *this;
8304
8305           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8306             this = coding_categories + coding_category_utf_8_sig;
8307           else
8308             this = coding_categories + coding_category_utf_8_nosig;
8309           val = Fcons (make_number (this->id), Qnil);
8310         }
8311     }
8312   else if (base_category == coding_category_utf_16_auto)
8313     {
            /* Choose among the four UTF-16 variants.  VAL stays nil when
               the UTF-16 detector returns zero.  */
8314       if (detect_coding_utf_16 (&coding, &detect_info))
8315         {
8316           struct coding_system *this;
8317
8318           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8319             this = coding_categories + coding_category_utf_16_le;
8320           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8321             this = coding_categories + coding_category_utf_16_be;
8322           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8323             this = coding_categories + coding_category_utf_16_be_nosig;
8324           else
8325             this = coding_categories + coding_category_utf_16_le_nosig;
8326           val = Fcons (make_number (this->id), Qnil);
8327         }
8328     }
8329   else
8330     {
            /* The text-format is already fixed by CODING-SYSTEM.  */
8331       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8332       val = Fcons (make_number (coding.id), Qnil);
8333     }
8334
8335   /* Then, detect eol-format if necessary.  */
8336   {
         /* One eol result per family of coding systems.  Any value other
            than EOL_SEEN_LF, EOL_SEEN_CRLF or EOL_SEEN_CR (including the
            initial -1) makes the loop below use the plain coding system
            name.  */
8337     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8338     Lisp_Object tail;
8339
8340     if (VECTORP (eol_type))
8341       {
             /* The eol-format is not fixed by CODING-SYSTEM: detect it for
                each family that has a found category.  */
8342         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8343           {
8344             if (null_byte_found)
8345               normal_eol = EOL_SEEN_LF;
8346             else
8347               normal_eol = detect_eol (coding.source, src_bytes,
8348                                        coding_category_raw_text);
8349           }
8350         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8351                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8352           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8353                                       coding_category_utf_16_be);
8354         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8355                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8356           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8357                                       coding_category_utf_16_le);
8358       }
8359     else
8360       {
             /* The eol-format is fixed by CODING-SYSTEM: `unix' maps to
                LF, `dos' to CRLF, anything else to CR.  */
8361         if (EQ (eol_type, Qunix))
8362           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8363         else if (EQ (eol_type, Qdos))
8364           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8365         else
8366           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8367       }
8368
         /* Replace each id in VAL, in place, by a coding system symbol:
            the eol-specific entry of the eol-type vector when the eol
            format is known, else the coding system's own name.  */
8369     for (tail = val; CONSP (tail); tail = XCDR (tail))
8370       {
8371         enum coding_category category;
8372         int this_eol;
8373
8374         id = XINT (XCAR (tail));
8375         attrs = CODING_ID_ATTRS (id);
8376         category = XINT (CODING_ATTR_CATEGORY (attrs));
8377         eol_type = CODING_ID_EOL_TYPE (id);
8378         if (VECTORP (eol_type))
8379           {
8380             if (category == coding_category_utf_16_be
8381                 || category == coding_category_utf_16_be_nosig)
8382               this_eol = utf_16_be_eol;
8383             else if (category == coding_category_utf_16_le
8384                      || category == coding_category_utf_16_le_nosig)
8385               this_eol = utf_16_le_eol;
8386             else
8387               this_eol = normal_eol;
8388
8389             if (this_eol == EOL_SEEN_LF)
8390               XSETCAR (tail, AREF (eol_type, 0));
8391             else if (this_eol == EOL_SEEN_CRLF)
8392               XSETCAR (tail, AREF (eol_type, 1));
8393             else if (this_eol == EOL_SEEN_CR)
8394               XSETCAR (tail, AREF (eol_type, 2));
8395             else
8396               XSETCAR (tail, CODING_ID_NAME (id));
8397           }
8398         else
8399           XSETCAR (tail, CODING_ID_NAME (id));
8400       }
8401   }
8402
       /* With HIGHEST, hand back only the first element (nil if VAL is
          empty); otherwise the whole list.  */
8403   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8404 }
8405
8406
8407 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8408        2, 3, 0,
8409        doc: /* Detect coding system of the text in the region between START and END.
8410 Return a list of possible coding systems ordered by priority.
8411 The coding systems to try and their priorities follows what
8412 the function `coding-system-priority-list' (which see) returns.
8413
8414 If only ASCII characters are found (except for such ISO-2022 control
8415 characters as ESC), it returns a list of single element `undecided'
8416 or its subsidiary coding system according to a detected end-of-line
8417 format.
8418
8419 If optional argument HIGHEST is non-nil, return the coding system of
8420 highest priority.  */)
8421   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8422 {
8423   int from, to;
8424   int from_byte, to_byte;
8425
8426   CHECK_NUMBER_COERCE_MARKER (start);
8427   CHECK_NUMBER_COERCE_MARKER (end);
8428
8429   validate_region (&start, &end);
8430   from = XINT (start), to = XINT (end);
8431   from_byte = CHAR_TO_BYTE (from);
8432   to_byte = CHAR_TO_BYTE (to);
8433
8434   if (from < GPT && to >= GPT)
8435     move_gap_both (to, to_byte);
8436
8437   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8438                                to - from, to_byte - from_byte,
8439                                !NILP (highest),
8440                                !NILP (BVAR (current_buffer
8441                                       , enable_multibyte_characters)),
8442                                Qnil);
8443 }
8444
8445 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8446        1, 2, 0,
8447        doc: /* Detect coding system of the text in STRING.
8448 Return a list of possible coding systems ordered by priority.
8449 The coding systems to try and their priorities follows what
8450 the function `coding-system-priority-list' (which see) returns.
8451
8452 If only ASCII characters are found (except for such ISO-2022 control
8453 characters as ESC), it returns a list of single element `undecided'
8454 or its subsidiary coding system according to a detected end-of-line
8455 format.
8456
8457 If optional argument HIGHEST is non-nil, return the coding system of
8458 highest priority.  */)
8459   (Lisp_Object string, Lisp_Object highest)
8460 {
8461   CHECK_STRING (string);
8462
8463   return detect_coding_system (SDATA (string),
8464                                SCHARS (string), SBYTES (string),
8465                                !NILP (highest), STRING_MULTIBYTE (string),
8466                                Qnil);
8467 }
8468
8469
8470 static INLINE int
8471 char_encodable_p (int c, Lisp_Object attrs)
8472 {
8473   Lisp_Object tail;
8474   struct charset *charset;
8475   Lisp_Object translation_table;
8476
8477   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8478   if (! NILP (translation_table))
8479     c = translate_char (translation_table, c);
8480   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8481        CONSP (tail); tail = XCDR (tail))
8482     {
8483       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8484       if (CHAR_CHARSET_P (c, charset))
8485         break;
8486     }
8487   return (! NILP (tail));
8488 }
8489
8490
8491 /* Return a list of coding systems that safely encode the text between
8492    START and END.  If EXCLUDE is non-nil, it is a list of coding
8493    systems not to check.  The returned list doesn't contain any such
8494    coding systems.  In any case, if the text contains only ASCII or is
8495    unibyte, return t.  */
8496
8497 DEFUN ("find-coding-systems-region-internal",
8498        Ffind_coding_systems_region_internal,
8499        Sfind_coding_systems_region_internal, 2, 3, 0,
8500        doc: /* Internal use only.  */)
8501   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8502 {
8503   Lisp_Object coding_attrs_list, safe_codings;
8504   EMACS_INT start_byte, end_byte;
8505   const unsigned char *p, *pbeg, *pend;
8506   int c;
8507   Lisp_Object tail, elt, work_table;
8508
8509   if (STRINGP (start))
8510     {
8511       if (!STRING_MULTIBYTE (start)
8512           || SCHARS (start) == SBYTES (start))
8513         return Qt;
8514       start_byte = 0;
8515       end_byte = SBYTES (start);
8516     }
8517   else
8518     {
8519       CHECK_NUMBER_COERCE_MARKER (start);
8520       CHECK_NUMBER_COERCE_MARKER (end);
8521       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8522         args_out_of_range (start, end);
8523       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8524         return Qt;
8525       start_byte = CHAR_TO_BYTE (XINT (start));
8526       end_byte = CHAR_TO_BYTE (XINT (end));
8527       if (XINT (end) - XINT (start) == end_byte - start_byte)
8528         return Qt;
8529
8530       if (XINT (start) < GPT && XINT (end) > GPT)
8531         {
8532           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8533             move_gap_both (XINT (start), start_byte);
8534           else
8535             move_gap_both (XINT (end), end_byte);
8536         }
8537     }
8538
8539   coding_attrs_list = Qnil;
8540   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8541     if (NILP (exclude)
8542         || NILP (Fmemq (XCAR (tail), exclude)))
8543       {
8544         Lisp_Object attrs;
8545
8546         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8547         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8548             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8549           {
8550             ASET (attrs, coding_attr_trans_tbl,
8551                   get_translation_table (attrs, 1, NULL));
8552             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8553           }
8554       }
8555
8556   if (STRINGP (start))
8557     p = pbeg = SDATA (start);
8558   else
8559     p = pbeg = BYTE_POS_ADDR (start_byte);
8560   pend = p + (end_byte - start_byte);
8561
8562   while (p < pend && ASCII_BYTE_P (*p)) p++;
8563   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8564
8565   work_table = Fmake_char_table (Qnil, Qnil);
8566   while (p < pend)
8567     {
8568       if (ASCII_BYTE_P (*p))
8569         p++;
8570       else
8571         {
8572           c = STRING_CHAR_ADVANCE (p);
8573           if (!NILP (char_table_ref (work_table, c)))
8574             /* This character was already checked.  Ignore it.  */
8575             continue;
8576
8577           charset_map_loaded = 0;
8578           for (tail = coding_attrs_list; CONSP (tail);)
8579             {
8580               elt = XCAR (tail);
8581               if (NILP (elt))
8582                 tail = XCDR (tail);
8583               else if (char_encodable_p (c, elt))
8584                 tail = XCDR (tail);
8585               else if (CONSP (XCDR (tail)))
8586                 {
8587                   XSETCAR (tail, XCAR (XCDR (tail)));
8588                   XSETCDR (tail, XCDR (XCDR (tail)));
8589                 }
8590               else
8591                 {
8592                   XSETCAR (tail, Qnil);
8593                   tail = XCDR (tail);
8594                 }
8595             }
8596           if (charset_map_loaded)
8597             {
8598               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8599
8600               if (STRINGP (start))
8601                 pbeg = SDATA (start);
8602               else
8603                 pbeg = BYTE_POS_ADDR (start_byte);
8604               p = pbeg + p_offset;
8605               pend = pbeg + pend_offset;
8606             }
8607           char_table_set (work_table, c, Qt);
8608         }
8609     }
8610
8611   safe_codings = list2 (Qraw_text, Qno_conversion);
8612   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8613     if (! NILP (XCAR (tail)))
8614       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8615
8616   return safe_codings;
8617 }
8618
8619
8620 DEFUN ("unencodable-char-position", Funencodable_char_position,
8621        Sunencodable_char_position, 3, 5, 0,
8622        doc: /*
8623 Return position of first un-encodable character in a region.
8624 START and END specify the region and CODING-SYSTEM specifies the
8625 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8626
8627 If optional 4th argument COUNT is non-nil, it specifies at most how
8628 many un-encodable characters to search.  In this case, the value is a
8629 list of positions.
8630
8631 If optional 5th argument STRING is non-nil, it is a string to search
8632 for un-encodable characters.  In that case, START and END are indexes
8633 to the string.  */)
8634   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8635 {
8636   int n;
8637   struct coding_system coding;
8638   Lisp_Object attrs, charset_list, translation_table;
8639   Lisp_Object positions;
8640   int from, to;
8641   const unsigned char *p, *stop, *pend;
8642   int ascii_compatible;
8643
8644   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8645   attrs = CODING_ID_ATTRS (coding.id);
8646   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8647     return Qnil;
8648   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8649   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8650   translation_table = get_translation_table (attrs, 1, NULL);
8651
8652   if (NILP (string))
8653     {
8654       validate_region (&start, &end);
8655       from = XINT (start);
8656       to = XINT (end);
8657       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8658           || (ascii_compatible
8659               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8660         return Qnil;
8661       p = CHAR_POS_ADDR (from);
8662       pend = CHAR_POS_ADDR (to);
8663       if (from < GPT && to >= GPT)
8664         stop = GPT_ADDR;
8665       else
8666         stop = pend;
8667     }
8668   else
8669     {
8670       CHECK_STRING (string);
8671       CHECK_NATNUM (start);
8672       CHECK_NATNUM (end);
8673       from = XINT (start);
8674       to = XINT (end);
8675       if (from > to
8676           || to > SCHARS (string))
8677         args_out_of_range_3 (string, start, end);
8678       if (! STRING_MULTIBYTE (string))
8679         return Qnil;
8680       p = SDATA (string) + string_char_to_byte (string, from);
8681       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8682       if (ascii_compatible && (to - from) == (pend - p))
8683         return Qnil;
8684     }
8685
8686   if (NILP (count))
8687     n = 1;
8688   else
8689     {
8690       CHECK_NATNUM (count);
8691       n = XINT (count);
8692     }
8693
8694   positions = Qnil;
8695   while (1)
8696     {
8697       int c;
8698
8699       if (ascii_compatible)
8700         while (p < stop && ASCII_BYTE_P (*p))
8701           p++, from++;
8702       if (p >= stop)
8703         {
8704           if (p >= pend)
8705             break;
8706           stop = pend;
8707           p = GAP_END_ADDR;
8708         }
8709
8710       c = STRING_CHAR_ADVANCE (p);
8711       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8712           && ! char_charset (translate_char (translation_table, c),
8713                              charset_list, NULL))
8714         {
8715           positions = Fcons (make_number (from), positions);
8716           n--;
8717           if (n == 0)
8718             break;
8719         }
8720
8721       from++;
8722     }
8723
8724   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8725 }
8726
8727
8728 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8729        Scheck_coding_systems_region, 3, 3, 0,
8730        doc: /* Check if the region is encodable by coding systems.
8731
8732 START and END are buffer positions specifying the region.
8733 CODING-SYSTEM-LIST is a list of coding systems to check.
8734
8735 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8736 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8737 whole region, POS0, POS1, ... are buffer positions where non-encodable
8738 characters are found.
8739
8740 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8741 value is nil.
8742
8743 START may be a string.  In that case, check if the string is
8744 encodable, and the value contains indices to the string instead of
8745 buffer positions.  END is ignored.
8746
8747 If the current buffer (or START if it is a string) is unibyte, the value
8748 is nil.  */)
8749   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8750 {
8751   Lisp_Object list;
8752   EMACS_INT start_byte, end_byte;
8753   int pos;
8754   const unsigned char *p, *pbeg, *pend;
8755   int c;
8756   Lisp_Object tail, elt, attrs;
8757
8758   if (STRINGP (start))
8759     {
8760       if (!STRING_MULTIBYTE (start)
8761           || SCHARS (start) == SBYTES (start))
8762         return Qnil;
8763       start_byte = 0;
8764       end_byte = SBYTES (start);
8765       pos = 0;
8766     }
8767   else
8768     {
8769       CHECK_NUMBER_COERCE_MARKER (start);
8770       CHECK_NUMBER_COERCE_MARKER (end);
8771       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8772         args_out_of_range (start, end);
8773       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8774         return Qnil;
8775       start_byte = CHAR_TO_BYTE (XINT (start));
8776       end_byte = CHAR_TO_BYTE (XINT (end));
8777       if (XINT (end) - XINT (start) == end_byte - start_byte)
8778         return Qnil;
8779
8780       if (XINT (start) < GPT && XINT (end) > GPT)
8781         {
8782           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8783             move_gap_both (XINT (start), start_byte);
8784           else
8785             move_gap_both (XINT (end), end_byte);
8786         }
8787       pos = XINT (start);
8788     }
8789
8790   list = Qnil;
8791   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8792     {
8793       elt = XCAR (tail);
8794       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8795       ASET (attrs, coding_attr_trans_tbl,
8796             get_translation_table (attrs, 1, NULL));
8797       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8798     }
8799
8800   if (STRINGP (start))
8801     p = pbeg = SDATA (start);
8802   else
8803     p = pbeg = BYTE_POS_ADDR (start_byte);
8804   pend = p + (end_byte - start_byte);
8805
8806   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8807   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8808
8809   while (p < pend)
8810     {
8811       if (ASCII_BYTE_P (*p))
8812         p++;
8813       else
8814         {
8815           c = STRING_CHAR_ADVANCE (p);
8816
8817           charset_map_loaded = 0;
8818           for (tail = list; CONSP (tail); tail = XCDR (tail))
8819             {
8820               elt = XCDR (XCAR (tail));
8821               if (! char_encodable_p (c, XCAR (elt)))
8822                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8823             }
8824           if (charset_map_loaded)
8825             {
8826               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8827
8828               if (STRINGP (start))
8829                 pbeg = SDATA (start);
8830               else
8831                 pbeg = BYTE_POS_ADDR (start_byte);
8832               p = pbeg + p_offset;
8833               pend = pbeg + pend_offset;
8834             }
8835         }
8836       pos++;
8837     }
8838
8839   tail = list;
8840   list = Qnil;
8841   for (; CONSP (tail); tail = XCDR (tail))
8842     {
8843       elt = XCAR (tail);
8844       if (CONSP (XCDR (XCDR (elt))))
8845         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8846                       list);
8847     }
8848
8849   return list;
8850 }
8851
8852
8853 Lisp_Object
8854 code_convert_region (Lisp_Object start, Lisp_Object end,
8855                      Lisp_Object coding_system, Lisp_Object dst_object,
8856                      int encodep, int norecord)
8857 {
8858   struct coding_system coding;
8859   EMACS_INT from, from_byte, to, to_byte;
8860   Lisp_Object src_object;
8861
8862   CHECK_NUMBER_COERCE_MARKER (start);
8863   CHECK_NUMBER_COERCE_MARKER (end);
8864   if (NILP (coding_system))
8865     coding_system = Qno_conversion;
8866   else
8867     CHECK_CODING_SYSTEM (coding_system);
8868   src_object = Fcurrent_buffer ();
8869   if (NILP (dst_object))
8870     dst_object = src_object;
8871   else if (! EQ (dst_object, Qt))
8872     CHECK_BUFFER (dst_object);
8873
8874   validate_region (&start, &end);
8875   from = XFASTINT (start);
8876   from_byte = CHAR_TO_BYTE (from);
8877   to = XFASTINT (end);
8878   to_byte = CHAR_TO_BYTE (to);
8879
8880   setup_coding_system (coding_system, &coding);
8881   coding.mode |= CODING_MODE_LAST_BLOCK;
8882
8883   if (encodep)
8884     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8885                           dst_object);
8886   else
8887     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8888                           dst_object);
8889   if (! norecord)
8890     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8891
8892   return (BUFFERP (dst_object)
8893           ? make_number (coding.produced_char)
8894           : coding.dst_object);
8895 }
8896
8897
8898 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8899        3, 4, "r\nzCoding system: ",
8900        doc: /* Decode the current region from the specified coding system.
8901 When called from a program, takes four arguments:
8902         START, END, CODING-SYSTEM, and DESTINATION.
8903 START and END are buffer positions.
8904
8905 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8906 If nil, the region between START and END is replaced by the decoded text.
8907 If buffer, the decoded text is inserted in that buffer after point (point
8908 does not move).
8909 In those cases, the length of the decoded text is returned.
8910 If DESTINATION is t, the decoded text is returned.
8911
8912 This function sets `last-coding-system-used' to the precise coding system
8913 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8914 not fully specified.)  */)
8915   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8916 {
8917   return code_convert_region (start, end, coding_system, destination, 0, 0);
8918 }
8919
8920 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8921        3, 4, "r\nzCoding system: ",
8922        doc: /* Encode the current region by specified coding system.
8923 When called from a program, takes four arguments:
8924         START, END, CODING-SYSTEM and DESTINATION.
8925 START and END are buffer positions.
8926
8927 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8928 If nil, the region between START and END is replace by the encoded text.
8929 If buffer, the encoded text is inserted in that buffer after point (point
8930 does not move).
8931 In those cases, the length of the encoded text is returned.
8932 If DESTINATION is t, the encoded text is returned.
8933
8934 This function sets `last-coding-system-used' to the precise coding system
8935 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8936 not fully specified.)  */)
8937   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8938 {
8939   return code_convert_region (start, end, coding_system, destination, 1, 0);
8940 }
8941
8942 Lisp_Object
8943 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8944                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8945 {
8946   struct coding_system coding;
8947   EMACS_INT chars, bytes;
8948
8949   CHECK_STRING (string);
8950   if (NILP (coding_system))
8951     {
8952       if (! norecord)
8953         Vlast_coding_system_used = Qno_conversion;
8954       if (NILP (dst_object))
8955         return (nocopy ? Fcopy_sequence (string) : string);
8956     }
8957
8958   if (NILP (coding_system))
8959     coding_system = Qno_conversion;
8960   else
8961     CHECK_CODING_SYSTEM (coding_system);
8962   if (NILP (dst_object))
8963     dst_object = Qt;
8964   else if (! EQ (dst_object, Qt))
8965     CHECK_BUFFER (dst_object);
8966
8967   setup_coding_system (coding_system, &coding);
8968   coding.mode |= CODING_MODE_LAST_BLOCK;
8969   chars = SCHARS (string);
8970   bytes = SBYTES (string);
8971   if (encodep)
8972     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8973   else
8974     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8975   if (! norecord)
8976     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8977
8978   return (BUFFERP (dst_object)
8979           ? make_number (coding.produced_char)
8980           : coding.dst_object);
8981 }
8982
8983
8984 /* Encode or decode STRING according to CODING_SYSTEM.
8985    Do not set Vlast_coding_system_used.
8986
8987    This function is called only from macros DECODE_FILE and
8988    ENCODE_FILE, thus we ignore character composition.  */
8989
8990 Lisp_Object
8991 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8992                               int encodep)
8993 {
8994   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8995 }
8996
8997
8998 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8999        2, 4, 0,
9000        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9001
9002 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9003 if the decoding operation is trivial.
9004
9005 Optional fourth arg BUFFER non-nil means that the decoded text is
9006 inserted in that buffer after point (point does not move).  In this
9007 case, the return value is the length of the decoded text.
9008
9009 This function sets `last-coding-system-used' to the precise coding system
9010 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9011 not fully specified.)  */)
9012   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9013 {
9014   return code_convert_string (string, coding_system, buffer,
9015                               0, ! NILP (nocopy), 0);
9016 }
9017
9018 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9019        2, 4, 0,
9020        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9021
9022 Optional third arg NOCOPY non-nil means it is OK to return STRING
9023 itself if the encoding operation is trivial.
9024
9025 Optional fourth arg BUFFER non-nil means that the encoded text is
9026 inserted in that buffer after point (point does not move).  In this
9027 case, the return value is the length of the encoded text.
9028
9029 This function sets `last-coding-system-used' to the precise coding system
9030 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9031 not fully specified.)  */)
9032   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9033 {
9034   return code_convert_string (string, coding_system, buffer,
9035                               1, ! NILP (nocopy), 1);
9036 }
9037
9038 \f
9039 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9040        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9041 Return the corresponding character.  */)
9042   (Lisp_Object code)
9043 {
9044   Lisp_Object spec, attrs, val;
9045   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9046   int c;
9047
9048   CHECK_NATNUM (code);
9049   c = XFASTINT (code);
9050   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9051   attrs = AREF (spec, 0);
9052
9053   if (ASCII_BYTE_P (c)
9054       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9055     return code;
9056
9057   val = CODING_ATTR_CHARSET_LIST (attrs);
9058   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9059   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9060   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9061
9062   if (c <= 0x7F)
9063     charset = charset_roman;
9064   else if (c >= 0xA0 && c < 0xDF)
9065     {
9066       charset = charset_kana;
9067       c -= 0x80;
9068     }
9069   else
9070     {
9071       int c1 = c >> 8, c2 = c & 0xFF;
9072
9073       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9074           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9075         error ("Invalid code: %d", code);
9076       SJIS_TO_JIS (c);
9077       charset = charset_kanji;
9078     }
9079   c = DECODE_CHAR (charset, c);
9080   if (c < 0)
9081     error ("Invalid code: %d", code);
9082   return make_number (c);
9083 }
9084
9085
9086 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9087        doc: /* Encode a Japanese character CH to shift_jis encoding.
9088 Return the corresponding code in SJIS.  */)
9089   (Lisp_Object ch)
9090 {
9091   Lisp_Object spec, attrs, charset_list;
9092   int c;
9093   struct charset *charset;
9094   unsigned code;
9095
9096   CHECK_CHARACTER (ch);
9097   c = XFASTINT (ch);
9098   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9099   attrs = AREF (spec, 0);
9100
9101   if (ASCII_CHAR_P (c)
9102       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9103     return ch;
9104
9105   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9106   charset = char_charset (c, charset_list, &code);
9107   if (code == CHARSET_INVALID_CODE (charset))
9108     error ("Can't encode by shift_jis encoding: %d", c);
9109   JIS_TO_SJIS (code);
9110
9111   return make_number (code);
9112 }
9113
9114 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9115        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9116 Return the corresponding character.  */)
9117   (Lisp_Object code)
9118 {
9119   Lisp_Object spec, attrs, val;
9120   struct charset *charset_roman, *charset_big5, *charset;
9121   int c;
9122
9123   CHECK_NATNUM (code);
9124   c = XFASTINT (code);
9125   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9126   attrs = AREF (spec, 0);
9127
9128   if (ASCII_BYTE_P (c)
9129       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9130     return code;
9131
9132   val = CODING_ATTR_CHARSET_LIST (attrs);
9133   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9134   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9135
9136   if (c <= 0x7F)
9137     charset = charset_roman;
9138   else
9139     {
9140       int b1 = c >> 8, b2 = c & 0x7F;
9141       if (b1 < 0xA1 || b1 > 0xFE
9142           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9143         error ("Invalid code: %d", code);
9144       charset = charset_big5;
9145     }
9146   c = DECODE_CHAR (charset, (unsigned )c);
9147   if (c < 0)
9148     error ("Invalid code: %d", code);
9149   return make_number (c);
9150 }
9151
9152 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9153        doc: /* Encode the Big5 character CH to BIG5 coding system.
9154 Return the corresponding character code in Big5.  */)
9155   (Lisp_Object ch)
9156 {
9157   Lisp_Object spec, attrs, charset_list;
9158   struct charset *charset;
9159   int c;
9160   unsigned code;
9161
9162   CHECK_CHARACTER (ch);
9163   c = XFASTINT (ch);
9164   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9165   attrs = AREF (spec, 0);
9166   if (ASCII_CHAR_P (c)
9167       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9168     return ch;
9169
9170   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9171   charset = char_charset (c, charset_list, &code);
9172   if (code == CHARSET_INVALID_CODE (charset))
9173     error ("Can't encode by Big5 encoding: %d", c);
9174
9175   return make_number (code);
9176 }
9177
9178 \f
9179 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9180        Sset_terminal_coding_system_internal, 1, 2, 0,
9181        doc: /* Internal use only.  */)
9182   (Lisp_Object coding_system, Lisp_Object terminal)
9183 {
9184   struct terminal *term = get_terminal (terminal, 1);
9185   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9186   CHECK_SYMBOL (coding_system);
9187   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9188   /* We had better not send unsafe characters to terminal.  */
9189   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9190   /* Character composition should be disabled.  */
9191   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9192   terminal_coding->src_multibyte = 1;
9193   terminal_coding->dst_multibyte = 0;
9194   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9195     term->charset_list = coding_charset_list (terminal_coding);
9196   else
9197     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9198   return Qnil;
9199 }
9200
9201 DEFUN ("set-safe-terminal-coding-system-internal",
9202        Fset_safe_terminal_coding_system_internal,
9203        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9204        doc: /* Internal use only.  */)
9205   (Lisp_Object coding_system)
9206 {
9207   CHECK_SYMBOL (coding_system);
9208   setup_coding_system (Fcheck_coding_system (coding_system),
9209                        &safe_terminal_coding);
9210   /* Character composition should be disabled.  */
9211   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9212   safe_terminal_coding.src_multibyte = 1;
9213   safe_terminal_coding.dst_multibyte = 0;
9214   return Qnil;
9215 }
9216
9217 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9218        Sterminal_coding_system, 0, 1, 0,
9219        doc: /* Return coding system specified for terminal output on the given terminal.
9220 TERMINAL may be a terminal object, a frame, or nil for the selected
9221 frame's terminal device.  */)
9222   (Lisp_Object terminal)
9223 {
9224   struct coding_system *terminal_coding
9225     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9226   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9227
9228   /* For backward compatibility, return nil if it is `undecided'. */
9229   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9230 }
9231
9232 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9233        Sset_keyboard_coding_system_internal, 1, 2, 0,
9234        doc: /* Internal use only.  */)
9235   (Lisp_Object coding_system, Lisp_Object terminal)
9236 {
9237   struct terminal *t = get_terminal (terminal, 1);
9238   CHECK_SYMBOL (coding_system);
9239   if (NILP (coding_system))
9240     coding_system = Qno_conversion;
9241   else
9242     Fcheck_coding_system (coding_system);
9243   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9244   /* Character composition should be disabled.  */
9245   TERMINAL_KEYBOARD_CODING (t)->common_flags
9246     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9247   return Qnil;
9248 }
9249
9250 DEFUN ("keyboard-coding-system",
9251        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9252        doc: /* Return coding system specified for decoding keyboard input.  */)
9253   (Lisp_Object terminal)
9254 {
9255   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9256                          (get_terminal (terminal, 1))->id);
9257 }
9258
9259 \f
9260 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9261        Sfind_operation_coding_system,  1, MANY, 0,
9262        doc: /* Choose a coding system for an operation based on the target name.
9263 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9264 DECODING-SYSTEM is the coding system to use for decoding
9265 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9266 for encoding (in case OPERATION does encoding).
9267
9268 The first argument OPERATION specifies an I/O primitive:
9269   For file I/O, `insert-file-contents' or `write-region'.
9270   For process I/O, `call-process', `call-process-region', or `start-process'.
9271   For network I/O, `open-network-stream'.
9272
9273 The remaining arguments should be the same arguments that were passed
9274 to the primitive.  Depending on which primitive, one of those arguments
9275 is selected as the TARGET.  For example, if OPERATION does file I/O,
9276 whichever argument specifies the file name is TARGET.
9277
9278 TARGET has a meaning which depends on OPERATION:
9279   For file I/O, TARGET is a file name (except for the special case below).
9280   For process I/O, TARGET is a process name.
9281   For network I/O, TARGET is a service name or a port number.
9282
9283 This function looks up what is specified for TARGET in
9284 `file-coding-system-alist', `process-coding-system-alist',
9285 or `network-coding-system-alist' depending on OPERATION.
9286 They may specify a coding system, a cons of coding systems,
9287 or a function symbol to call.
9288 In the last case, we call the function with one argument,
9289 which is a list of all the arguments given to this function.
9290 If the function can't decide a coding system, it can return
9291 `undecided' so that the normal code-detection is performed.
9292
9293 If OPERATION is `insert-file-contents', the argument corresponding to
9294 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9295 file name to look up, and BUFFER is a buffer that contains the file's
9296 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9297 function to call for FILENAME, that function should examine the
9298 contents of BUFFER instead of reading the file.
9299
9300 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9301   (int nargs, Lisp_Object *args)
9302 {
9303   Lisp_Object operation, target_idx, target, val;
9304   register Lisp_Object chain;
9305
9306   if (nargs < 2)
9307     error ("Too few arguments");
9308   operation = args[0];
9309   if (!SYMBOLP (operation)
9310       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9311     error ("Invalid first argument");
9312   if (nargs < 1 + XINT (target_idx))
9313     error ("Too few arguments for operation: %s",
9314            SDATA (SYMBOL_NAME (operation)));
9315   target = args[XINT (target_idx) + 1];
9316   if (!(STRINGP (target)
9317         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9318             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9319         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9320     error ("Invalid %dth argument", XINT (target_idx) + 1);
9321   if (CONSP (target))
9322     target = XCAR (target);
9323
9324   chain = ((EQ (operation, Qinsert_file_contents)
9325             || EQ (operation, Qwrite_region))
9326            ? Vfile_coding_system_alist
9327            : (EQ (operation, Qopen_network_stream)
9328               ? Vnetwork_coding_system_alist
9329               : Vprocess_coding_system_alist));
9330   if (NILP (chain))
9331     return Qnil;
9332
9333   for (; CONSP (chain); chain = XCDR (chain))
9334     {
9335       Lisp_Object elt;
9336
9337       elt = XCAR (chain);
9338       if (CONSP (elt)
9339           && ((STRINGP (target)
9340                && STRINGP (XCAR (elt))
9341                && fast_string_match (XCAR (elt), target) >= 0)
9342               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9343         {
9344           val = XCDR (elt);
9345           /* Here, if VAL is both a valid coding system and a valid
9346              function symbol, we return VAL as a coding system.  */
9347           if (CONSP (val))
9348             return val;
9349           if (! SYMBOLP (val))
9350             return Qnil;
9351           if (! NILP (Fcoding_system_p (val)))
9352             return Fcons (val, val);
9353           if (! NILP (Ffboundp (val)))
9354             {
9355               /* We use call1 rather than safe_call1
9356                  so as to get bug reports about functions called here
9357                  which don't handle the current interface.  */
9358               val = call1 (val, Flist (nargs, args));
9359               if (CONSP (val))
9360                 return val;
9361               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9362                 return Fcons (val, val);
9363             }
9364           return Qnil;
9365         }
9366     }
9367   return Qnil;
9368 }
9369
9370 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9371        Sset_coding_system_priority, 0, MANY, 0,
9372        doc: /* Assign higher priority to the coding systems given as arguments.
9373 If multiple coding systems belong to the same category,
9374 all but the first one are ignored.
9375
9376 usage: (set-coding-system-priority &rest coding-systems)  */)
9377   (int nargs, Lisp_Object *args)
9378 {
9379   int i, j;
9380   int changed[coding_category_max];
9381   enum coding_category priorities[coding_category_max];
9382
9383   memset (changed, 0, sizeof changed);
9384
9385   for (i = j = 0; i < nargs; i++)
9386     {
9387       enum coding_category category;
9388       Lisp_Object spec, attrs;
9389
9390       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9391       attrs = AREF (spec, 0);
9392       category = XINT (CODING_ATTR_CATEGORY (attrs));
9393       if (changed[category])
9394         /* Ignore this coding system because a coding system of the
9395            same category already had a higher priority.  */
9396         continue;
9397       changed[category] = 1;
9398       priorities[j++] = category;
9399       if (coding_categories[category].id >= 0
9400           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9401         setup_coding_system (args[i], &coding_categories[category]);
9402       Fset (AREF (Vcoding_category_table, category), args[i]);
9403     }
9404
9405   /* Now we have decided top J priorities.  Reflect the order of the
9406      original priorities to the remaining priorities.  */
9407
9408   for (i = j, j = 0; i < coding_category_max; i++, j++)
9409     {
9410       while (j < coding_category_max
9411              && changed[coding_priorities[j]])
9412         j++;
9413       if (j == coding_category_max)
9414         abort ();
9415       priorities[i] = coding_priorities[j];
9416     }
9417
9418   memcpy (coding_priorities, priorities, sizeof priorities);
9419
9420   /* Update `coding-category-list'.  */
9421   Vcoding_category_list = Qnil;
9422   for (i = coding_category_max - 1; i >= 0; i--)
9423     Vcoding_category_list
9424       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9425                Vcoding_category_list);
9426
9427   return Qnil;
9428 }
9429
9430 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9431        Scoding_system_priority_list, 0, 1, 0,
9432        doc: /* Return a list of coding systems ordered by their priorities.
9433 The list contains a subset of coding systems; i.e. coding systems
9434 assigned to each coding category (see `coding-category-list').
9435
9436 HIGHESTP non-nil means just return the highest priority one.  */)
9437   (Lisp_Object highestp)
9438 {
9439   int i;
9440   Lisp_Object val;
9441
9442   for (i = 0, val = Qnil; i < coding_category_max; i++)
9443     {
9444       enum coding_category category = coding_priorities[i];
9445       int id = coding_categories[category].id;
9446       Lisp_Object attrs;
9447
9448       if (id < 0)
9449         continue;
9450       attrs = CODING_ID_ATTRS (id);
9451       if (! NILP (highestp))
9452         return CODING_ATTR_BASE_NAME (attrs);
9453       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9454     }
9455   return Fnreverse (val);
9456 }
9457
9458 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9459
9460 static Lisp_Object
9461 make_subsidiaries (Lisp_Object base)
9462 {
9463   Lisp_Object subsidiaries;
9464   int base_name_len = SBYTES (SYMBOL_NAME (base));
9465   char *buf = (char *) alloca (base_name_len + 6);
9466   int i;
9467
9468   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9469   subsidiaries = Fmake_vector (make_number (3), Qnil);
9470   for (i = 0; i < 3; i++)
9471     {
9472       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9473       ASET (subsidiaries, i, intern (buf));
9474     }
9475   return subsidiaries;
9476 }
9477
9478
9479 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9480        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9481        doc: /* For internal use only.
9482 usage: (define-coding-system-internal ...)  */)
9483   (int nargs, Lisp_Object *args)
9484 {
9485   Lisp_Object name;
9486   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9487   Lisp_Object attrs;            /* Vector of attributes.  */
9488   Lisp_Object eol_type;
9489   Lisp_Object aliases;
9490   Lisp_Object coding_type, charset_list, safe_charsets;
9491   enum coding_category category;
9492   Lisp_Object tail, val;
9493   int max_charset_id = 0;
9494   int i;
9495
9496   if (nargs < coding_arg_max)
9497     goto short_args;
9498
9499   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9500
9501   name = args[coding_arg_name];
9502   CHECK_SYMBOL (name);
9503   CODING_ATTR_BASE_NAME (attrs) = name;
9504
9505   val = args[coding_arg_mnemonic];
9506   if (! STRINGP (val))
9507     CHECK_CHARACTER (val);
9508   CODING_ATTR_MNEMONIC (attrs) = val;
9509
9510   coding_type = args[coding_arg_coding_type];
9511   CHECK_SYMBOL (coding_type);
9512   CODING_ATTR_TYPE (attrs) = coding_type;
9513
9514   charset_list = args[coding_arg_charset_list];
9515   if (SYMBOLP (charset_list))
9516     {
9517       if (EQ (charset_list, Qiso_2022))
9518         {
9519           if (! EQ (coding_type, Qiso_2022))
9520             error ("Invalid charset-list");
9521           charset_list = Viso_2022_charset_list;
9522         }
9523       else if (EQ (charset_list, Qemacs_mule))
9524         {
9525           if (! EQ (coding_type, Qemacs_mule))
9526             error ("Invalid charset-list");
9527           charset_list = Vemacs_mule_charset_list;
9528         }
9529       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9530         if (max_charset_id < XFASTINT (XCAR (tail)))
9531           max_charset_id = XFASTINT (XCAR (tail));
9532     }
9533   else
9534     {
9535       charset_list = Fcopy_sequence (charset_list);
9536       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9537         {
9538           struct charset *charset;
9539
9540           val = XCAR (tail);
9541           CHECK_CHARSET_GET_CHARSET (val, charset);
9542           if (EQ (coding_type, Qiso_2022)
9543               ? CHARSET_ISO_FINAL (charset) < 0
9544               : EQ (coding_type, Qemacs_mule)
9545               ? CHARSET_EMACS_MULE_ID (charset) < 0
9546               : 0)
9547             error ("Can't handle charset `%s'",
9548                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9549
9550           XSETCAR (tail, make_number (charset->id));
9551           if (max_charset_id < charset->id)
9552             max_charset_id = charset->id;
9553         }
9554     }
9555   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9556
9557   safe_charsets = make_uninit_string (max_charset_id + 1);
9558   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9559   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9560     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9561   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9562
9563   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9564
9565   val = args[coding_arg_decode_translation_table];
9566   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9567     CHECK_SYMBOL (val);
9568   CODING_ATTR_DECODE_TBL (attrs) = val;
9569
9570   val = args[coding_arg_encode_translation_table];
9571   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9572     CHECK_SYMBOL (val);
9573   CODING_ATTR_ENCODE_TBL (attrs) = val;
9574
9575   val = args[coding_arg_post_read_conversion];
9576   CHECK_SYMBOL (val);
9577   CODING_ATTR_POST_READ (attrs) = val;
9578
9579   val = args[coding_arg_pre_write_conversion];
9580   CHECK_SYMBOL (val);
9581   CODING_ATTR_PRE_WRITE (attrs) = val;
9582
9583   val = args[coding_arg_default_char];
9584   if (NILP (val))
9585     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9586   else
9587     {
9588       CHECK_CHARACTER (val);
9589       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9590     }
9591
9592   val = args[coding_arg_for_unibyte];
9593   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9594
9595   val = args[coding_arg_plist];
9596   CHECK_LIST (val);
9597   CODING_ATTR_PLIST (attrs) = val;
9598
9599   if (EQ (coding_type, Qcharset))
9600     {
9601       /* Generate a lisp vector of 256 elements.  Each element is nil,
9602          integer, or a list of charset IDs.
9603
9604          If Nth element is nil, the byte code N is invalid in this
9605          coding system.
9606
9607          If Nth element is a number NUM, N is the first byte of a
9608          charset whose ID is NUM.
9609
9610          If Nth element is a list of charset IDs, N is the first byte
9611          of one of them.  The list is sorted by dimensions of the
9612          charsets.  A charset of smaller dimension comes first. */
9613       val = Fmake_vector (make_number (256), Qnil);
9614
9615       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9616         {
9617           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9618           int dim = CHARSET_DIMENSION (charset);
9619           int idx = (dim - 1) * 4;
9620
9621           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9622             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9623
9624           for (i = charset->code_space[idx];
9625                i <= charset->code_space[idx + 1]; i++)
9626             {
9627               Lisp_Object tmp, tmp2;
9628               int dim2;
9629
9630               tmp = AREF (val, i);
9631               if (NILP (tmp))
9632                 tmp = XCAR (tail);
9633               else if (NUMBERP (tmp))
9634                 {
9635                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9636                   if (dim < dim2)
9637                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9638                   else
9639                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9640                 }
9641               else
9642                 {
9643                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9644                     {
9645                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9646                       if (dim < dim2)
9647                         break;
9648                     }
9649                   if (NILP (tmp2))
9650                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9651                   else
9652                     {
9653                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9654                       XSETCAR (tmp2, XCAR (tail));
9655                     }
9656                 }
9657               ASET (val, i, tmp);
9658             }
9659         }
9660       ASET (attrs, coding_attr_charset_valids, val);
9661       category = coding_category_charset;
9662     }
9663   else if (EQ (coding_type, Qccl))
9664     {
9665       Lisp_Object valids;
9666
9667       if (nargs < coding_arg_ccl_max)
9668         goto short_args;
9669
9670       val = args[coding_arg_ccl_decoder];
9671       CHECK_CCL_PROGRAM (val);
9672       if (VECTORP (val))
9673         val = Fcopy_sequence (val);
9674       ASET (attrs, coding_attr_ccl_decoder, val);
9675
9676       val = args[coding_arg_ccl_encoder];
9677       CHECK_CCL_PROGRAM (val);
9678       if (VECTORP (val))
9679         val = Fcopy_sequence (val);
9680       ASET (attrs, coding_attr_ccl_encoder, val);
9681
9682       val = args[coding_arg_ccl_valids];
9683       valids = Fmake_string (make_number (256), make_number (0));
9684       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9685         {
9686           int from, to;
9687
9688           val = Fcar (tail);
9689           if (INTEGERP (val))
9690             {
9691               from = to = XINT (val);
9692               if (from < 0 || from > 255)
9693                 args_out_of_range_3 (val, make_number (0), make_number (255));
9694             }
9695           else
9696             {
9697               CHECK_CONS (val);
9698               CHECK_NATNUM_CAR (val);
9699               CHECK_NATNUM_CDR (val);
9700               from = XINT (XCAR (val));
9701               if (from > 255)
9702                 args_out_of_range_3 (XCAR (val),
9703                                      make_number (0), make_number (255));
9704               to = XINT (XCDR (val));
9705               if (to < from || to > 255)
9706                 args_out_of_range_3 (XCDR (val),
9707                                      XCAR (val), make_number (255));
9708             }
9709           for (i = from; i <= to; i++)
9710             SSET (valids, i, 1);
9711         }
9712       ASET (attrs, coding_attr_ccl_valids, valids);
9713
9714       category = coding_category_ccl;
9715     }
9716   else if (EQ (coding_type, Qutf_16))
9717     {
9718       Lisp_Object bom, endian;
9719
9720       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9721
9722       if (nargs < coding_arg_utf16_max)
9723         goto short_args;
9724
9725       bom = args[coding_arg_utf16_bom];
9726       if (! NILP (bom) && ! EQ (bom, Qt))
9727         {
9728           CHECK_CONS (bom);
9729           val = XCAR (bom);
9730           CHECK_CODING_SYSTEM (val);
9731           val = XCDR (bom);
9732           CHECK_CODING_SYSTEM (val);
9733         }
9734       ASET (attrs, coding_attr_utf_bom, bom);
9735
9736       endian = args[coding_arg_utf16_endian];
9737       CHECK_SYMBOL (endian);
9738       if (NILP (endian))
9739         endian = Qbig;
9740       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9741         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9742       ASET (attrs, coding_attr_utf_16_endian, endian);
9743
9744       category = (CONSP (bom)
9745                   ? coding_category_utf_16_auto
9746                   : NILP (bom)
9747                   ? (EQ (endian, Qbig)
9748                      ? coding_category_utf_16_be_nosig
9749                      : coding_category_utf_16_le_nosig)
9750                   : (EQ (endian, Qbig)
9751                      ? coding_category_utf_16_be
9752                      : coding_category_utf_16_le));
9753     }
9754   else if (EQ (coding_type, Qiso_2022))
9755     {
9756       Lisp_Object initial, reg_usage, request, flags;
9757
9758       if (nargs < coding_arg_iso2022_max)
9759         goto short_args;
9760
9761       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9762       CHECK_VECTOR (initial);
9763       for (i = 0; i < 4; i++)
9764         {
9765           val = Faref (initial, make_number (i));
9766           if (! NILP (val))
9767             {
9768               struct charset *charset;
9769
9770               CHECK_CHARSET_GET_CHARSET (val, charset);
9771               ASET (initial, i, make_number (CHARSET_ID (charset)));
9772               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9773                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9774             }
9775           else
9776             ASET (initial, i, make_number (-1));
9777         }
9778
9779       reg_usage = args[coding_arg_iso2022_reg_usage];
9780       CHECK_CONS (reg_usage);
9781       CHECK_NUMBER_CAR (reg_usage);
9782       CHECK_NUMBER_CDR (reg_usage);
9783
9784       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9785       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9786         {
9787           int id;
9788           Lisp_Object tmp1;
9789
9790           val = Fcar (tail);
9791           CHECK_CONS (val);
9792           tmp1 = XCAR (val);
9793           CHECK_CHARSET_GET_ID (tmp1, id);
9794           CHECK_NATNUM_CDR (val);
9795           if (XINT (XCDR (val)) >= 4)
9796             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9797           XSETCAR (val, make_number (id));
9798         }
9799
9800       flags = args[coding_arg_iso2022_flags];
9801       CHECK_NATNUM (flags);
9802       i = XINT (flags);
9803       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9804         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9805
9806       ASET (attrs, coding_attr_iso_initial, initial);
9807       ASET (attrs, coding_attr_iso_usage, reg_usage);
9808       ASET (attrs, coding_attr_iso_request, request);
9809       ASET (attrs, coding_attr_iso_flags, flags);
9810       setup_iso_safe_charsets (attrs);
9811
9812       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9813         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9814                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9815                     ? coding_category_iso_7_else
9816                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9817                     ? coding_category_iso_7
9818                     : coding_category_iso_7_tight);
9819       else
9820         {
9821           int id = XINT (AREF (initial, 1));
9822
9823           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9824                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9825                        || id < 0)
9826                       ? coding_category_iso_8_else
9827                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9828                       ? coding_category_iso_8_1
9829                       : coding_category_iso_8_2);
9830         }
9831       if (category != coding_category_iso_8_1
9832           && category != coding_category_iso_8_2)
9833         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9834     }
9835   else if (EQ (coding_type, Qemacs_mule))
9836     {
9837       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9838         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9839       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9840       category = coding_category_emacs_mule;
9841     }
9842   else if (EQ (coding_type, Qshift_jis))
9843     {
9844
9845       struct charset *charset;
9846
9847       if (XINT (Flength (charset_list)) != 3
9848           && XINT (Flength (charset_list)) != 4)
9849         error ("There should be three or four charsets");
9850
9851       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9852       if (CHARSET_DIMENSION (charset) != 1)
9853         error ("Dimension of charset %s is not one",
9854                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9855       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9856         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9857
9858       charset_list = XCDR (charset_list);
9859       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9860       if (CHARSET_DIMENSION (charset) != 1)
9861         error ("Dimension of charset %s is not one",
9862                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9863
9864       charset_list = XCDR (charset_list);
9865       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9866       if (CHARSET_DIMENSION (charset) != 2)
9867         error ("Dimension of charset %s is not two",
9868                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9869
9870       charset_list = XCDR (charset_list);
9871       if (! NILP (charset_list))
9872         {
9873           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9874           if (CHARSET_DIMENSION (charset) != 2)
9875             error ("Dimension of charset %s is not two",
9876                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9877         }
9878
9879       category = coding_category_sjis;
9880       Vsjis_coding_system = name;
9881     }
9882   else if (EQ (coding_type, Qbig5))
9883     {
9884       struct charset *charset;
9885
9886       if (XINT (Flength (charset_list)) != 2)
9887         error ("There should be just two charsets");
9888
9889       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9890       if (CHARSET_DIMENSION (charset) != 1)
9891         error ("Dimension of charset %s is not one",
9892                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9893       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9894         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9895
9896       charset_list = XCDR (charset_list);
9897       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9898       if (CHARSET_DIMENSION (charset) != 2)
9899         error ("Dimension of charset %s is not two",
9900                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9901
9902       category = coding_category_big5;
9903       Vbig5_coding_system = name;
9904     }
9905   else if (EQ (coding_type, Qraw_text))
9906     {
9907       category = coding_category_raw_text;
9908       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9909     }
9910   else if (EQ (coding_type, Qutf_8))
9911     {
9912       Lisp_Object bom;
9913
9914       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9915
9916       if (nargs < coding_arg_utf8_max)
9917         goto short_args;
9918
9919       bom = args[coding_arg_utf8_bom];
9920       if (! NILP (bom) && ! EQ (bom, Qt))
9921         {
9922           CHECK_CONS (bom);
9923           val = XCAR (bom);
9924           CHECK_CODING_SYSTEM (val);
9925           val = XCDR (bom);
9926           CHECK_CODING_SYSTEM (val);
9927         }
9928       ASET (attrs, coding_attr_utf_bom, bom);
9929
9930       category = (CONSP (bom) ? coding_category_utf_8_auto
9931                   : NILP (bom) ? coding_category_utf_8_nosig
9932                   : coding_category_utf_8_sig);
9933     }
9934   else if (EQ (coding_type, Qundecided))
9935     category = coding_category_undecided;
9936   else
9937     error ("Invalid coding system type: %s",
9938            SDATA (SYMBOL_NAME (coding_type)));
9939
9940   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9941   CODING_ATTR_PLIST (attrs)
9942     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9943                                 CODING_ATTR_PLIST (attrs)));
9944   CODING_ATTR_PLIST (attrs)
9945     = Fcons (QCascii_compatible_p,
9946              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9947                     CODING_ATTR_PLIST (attrs)));
9948
9949   eol_type = args[coding_arg_eol_type];
9950   if (! NILP (eol_type)
9951       && ! EQ (eol_type, Qunix)
9952       && ! EQ (eol_type, Qdos)
9953       && ! EQ (eol_type, Qmac))
9954     error ("Invalid eol-type");
9955
9956   aliases = Fcons (name, Qnil);
9957
9958   if (NILP (eol_type))
9959     {
9960       eol_type = make_subsidiaries (name);
9961       for (i = 0; i < 3; i++)
9962         {
9963           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9964
9965           this_name = AREF (eol_type, i);
9966           this_aliases = Fcons (this_name, Qnil);
9967           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9968           this_spec = Fmake_vector (make_number (3), attrs);
9969           ASET (this_spec, 1, this_aliases);
9970           ASET (this_spec, 2, this_eol_type);
9971           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9972           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9973           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9974           if (NILP (val))
9975             Vcoding_system_alist
9976               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9977                        Vcoding_system_alist);
9978         }
9979     }
9980
9981   spec_vec = Fmake_vector (make_number (3), attrs);
9982   ASET (spec_vec, 1, aliases);
9983   ASET (spec_vec, 2, eol_type);
9984
9985   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9986   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9987   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9988   if (NILP (val))
9989     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9990                                   Vcoding_system_alist);
9991
9992   {
9993     int id = coding_categories[category].id;
9994
9995     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9996       setup_coding_system (name, &coding_categories[category]);
9997   }
9998
9999   return Qnil;
10000
10001  short_args:
10002   return Fsignal (Qwrong_number_of_arguments,
10003                   Fcons (intern ("define-coding-system-internal"),
10004                          make_number (nargs)));
10005 }
10006
10007
10008 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10009        3, 3, 0,
10010        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10011   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10012 {
10013   Lisp_Object spec, attrs;
10014
10015   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10016   attrs = AREF (spec, 0);
10017   if (EQ (prop, QCmnemonic))
10018     {
10019       if (! STRINGP (val))
10020         CHECK_CHARACTER (val);
10021       CODING_ATTR_MNEMONIC (attrs) = val;
10022     }
10023   else if (EQ (prop, QCdefault_char))
10024     {
10025       if (NILP (val))
10026         val = make_number (' ');
10027       else
10028         CHECK_CHARACTER (val);
10029       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10030     }
10031   else if (EQ (prop, QCdecode_translation_table))
10032     {
10033       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10034         CHECK_SYMBOL (val);
10035       CODING_ATTR_DECODE_TBL (attrs) = val;
10036     }
10037   else if (EQ (prop, QCencode_translation_table))
10038     {
10039       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10040         CHECK_SYMBOL (val);
10041       CODING_ATTR_ENCODE_TBL (attrs) = val;
10042     }
10043   else if (EQ (prop, QCpost_read_conversion))
10044     {
10045       CHECK_SYMBOL (val);
10046       CODING_ATTR_POST_READ (attrs) = val;
10047     }
10048   else if (EQ (prop, QCpre_write_conversion))
10049     {
10050       CHECK_SYMBOL (val);
10051       CODING_ATTR_PRE_WRITE (attrs) = val;
10052     }
10053   else if (EQ (prop, QCascii_compatible_p))
10054     {
10055       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10056     }
10057
10058   CODING_ATTR_PLIST (attrs)
10059     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10060   return val;
10061 }
10062
10063
10064 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10065        Sdefine_coding_system_alias, 2, 2, 0,
10066        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10067   (Lisp_Object alias, Lisp_Object coding_system)
10068 {
10069   Lisp_Object spec, aliases, eol_type, val;
10070
10071   CHECK_SYMBOL (alias);
10072   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10073   aliases = AREF (spec, 1);
10074   /* ALIASES should be a list of length more than zero, and the first
10075      element is a base coding system.  Append ALIAS at the tail of the
10076      list.  */
10077   while (!NILP (XCDR (aliases)))
10078     aliases = XCDR (aliases);
10079   XSETCDR (aliases, Fcons (alias, Qnil));
10080
10081   eol_type = AREF (spec, 2);
10082   if (VECTORP (eol_type))
10083     {
10084       Lisp_Object subsidiaries;
10085       int i;
10086
10087       subsidiaries = make_subsidiaries (alias);
10088       for (i = 0; i < 3; i++)
10089         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10090                                      AREF (eol_type, i));
10091     }
10092
10093   Fputhash (alias, spec, Vcoding_system_hash_table);
10094   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10095   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10096   if (NILP (val))
10097     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10098                                   Vcoding_system_alist);
10099
10100   return Qnil;
10101 }
10102
10103 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10104        1, 1, 0,
10105        doc: /* Return the base of CODING-SYSTEM.
10106 Any alias or subsidiary coding system is not a base coding system.  */)
10107   (Lisp_Object coding_system)
10108 {
10109   Lisp_Object spec, attrs;
10110
10111   if (NILP (coding_system))
10112     return (Qno_conversion);
10113   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10114   attrs = AREF (spec, 0);
10115   return CODING_ATTR_BASE_NAME (attrs);
10116 }
10117
10118 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10119        1, 1, 0,
10120        doc: "Return the property list of CODING-SYSTEM.")
10121   (Lisp_Object coding_system)
10122 {
10123   Lisp_Object spec, attrs;
10124
10125   if (NILP (coding_system))
10126     coding_system = Qno_conversion;
10127   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10128   attrs = AREF (spec, 0);
10129   return CODING_ATTR_PLIST (attrs);
10130 }
10131
10132
10133 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10134        1, 1, 0,
10135        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10136   (Lisp_Object coding_system)
10137 {
10138   Lisp_Object spec;
10139
10140   if (NILP (coding_system))
10141     coding_system = Qno_conversion;
10142   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10143   return AREF (spec, 1);
10144 }
10145
10146 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10147        Scoding_system_eol_type, 1, 1, 0,
10148        doc: /* Return eol-type of CODING-SYSTEM.
10149 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10150
10151 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10152 and CR respectively.
10153
10154 A vector value indicates that a format of end-of-line should be
10155 detected automatically.  Nth element of the vector is the subsidiary
10156 coding system whose eol-type is N.  */)
10157   (Lisp_Object coding_system)
10158 {
10159   Lisp_Object spec, eol_type;
10160   int n;
10161
10162   if (NILP (coding_system))
10163     coding_system = Qno_conversion;
10164   if (! CODING_SYSTEM_P (coding_system))
10165     return Qnil;
10166   spec = CODING_SYSTEM_SPEC (coding_system);
10167   eol_type = AREF (spec, 2);
10168   if (VECTORP (eol_type))
10169     return Fcopy_sequence (eol_type);
10170   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10171   return make_number (n);
10172 }
10173
10174 #endif /* emacs */
10175
10176 \f
/*** 12. Postamble ***/
10178
10179 void
10180 init_coding_once (void)
10181 {
10182   int i;
10183
10184   for (i = 0; i < coding_category_max; i++)
10185     {
10186       coding_categories[i].id = -1;
10187       coding_priorities[i] = i;
10188     }
10189
10190   /* ISO2022 specific initialize routine.  */
10191   for (i = 0; i < 0x20; i++)
10192     iso_code_class[i] = ISO_control_0;
10193   for (i = 0x21; i < 0x7F; i++)
10194     iso_code_class[i] = ISO_graphic_plane_0;
10195   for (i = 0x80; i < 0xA0; i++)
10196     iso_code_class[i] = ISO_control_1;
10197   for (i = 0xA1; i < 0xFF; i++)
10198     iso_code_class[i] = ISO_graphic_plane_1;
10199   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10200   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10201   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10202   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10203   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10204   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10205   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10206   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10207   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10208
10209   for (i = 0; i < 256; i++)
10210     {
10211       emacs_mule_bytes[i] = 1;
10212     }
10213   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10214   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10215   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10216   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10217 }
10218
10219 #ifdef emacs
10220
10221 void
10222 syms_of_coding (void)
10223 {
10224   staticpro (&Vcoding_system_hash_table);
10225   {
10226     Lisp_Object args[2];
10227     args[0] = QCtest;
10228     args[1] = Qeq;
10229     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10230   }
10231
10232   staticpro (&Vsjis_coding_system);
10233   Vsjis_coding_system = Qnil;
10234
10235   staticpro (&Vbig5_coding_system);
10236   Vbig5_coding_system = Qnil;
10237
10238   staticpro (&Vcode_conversion_reused_workbuf);
10239   Vcode_conversion_reused_workbuf = Qnil;
10240
10241   staticpro (&Vcode_conversion_workbuf_name);
10242   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10243
10244   reused_workbuf_in_use = 0;
10245
10246   DEFSYM (Qcharset, "charset");
10247   DEFSYM (Qtarget_idx, "target-idx");
10248   DEFSYM (Qcoding_system_history, "coding-system-history");
10249   Fset (Qcoding_system_history, Qnil);
10250
10251   /* Target FILENAME is the first argument.  */
10252   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10253   /* Target FILENAME is the third argument.  */
10254   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10255
10256   DEFSYM (Qcall_process, "call-process");
10257   /* Target PROGRAM is the first argument.  */
10258   Fput (Qcall_process, Qtarget_idx, make_number (0));
10259
10260   DEFSYM (Qcall_process_region, "call-process-region");
10261   /* Target PROGRAM is the third argument.  */
10262   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10263
10264   DEFSYM (Qstart_process, "start-process");
10265   /* Target PROGRAM is the third argument.  */
10266   Fput (Qstart_process, Qtarget_idx, make_number (2));
10267
10268   DEFSYM (Qopen_network_stream, "open-network-stream");
10269   /* Target SERVICE is the fourth argument.  */
10270   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10271
10272   DEFSYM (Qcoding_system, "coding-system");
10273   DEFSYM (Qcoding_aliases, "coding-aliases");
10274
10275   DEFSYM (Qeol_type, "eol-type");
10276   DEFSYM (Qunix, "unix");
10277   DEFSYM (Qdos, "dos");
10278
10279   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10280   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10281   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10282   DEFSYM (Qdefault_char, "default-char");
10283   DEFSYM (Qundecided, "undecided");
10284   DEFSYM (Qno_conversion, "no-conversion");
10285   DEFSYM (Qraw_text, "raw-text");
10286
10287   DEFSYM (Qiso_2022, "iso-2022");
10288
10289   DEFSYM (Qutf_8, "utf-8");
10290   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10291
10292   DEFSYM (Qutf_16, "utf-16");
10293   DEFSYM (Qbig, "big");
10294   DEFSYM (Qlittle, "little");
10295
10296   DEFSYM (Qshift_jis, "shift-jis");
10297   DEFSYM (Qbig5, "big5");
10298
10299   DEFSYM (Qcoding_system_p, "coding-system-p");
10300
10301   DEFSYM (Qcoding_system_error, "coding-system-error");
10302   Fput (Qcoding_system_error, Qerror_conditions,
10303         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10304   Fput (Qcoding_system_error, Qerror_message,
10305         make_pure_c_string ("Invalid coding system"));
10306
10307   /* Intern this now in case it isn't already done.
10308      Setting this variable twice is harmless.
10309      But don't staticpro it here--that is done in alloc.c.  */
10310   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10311
10312   DEFSYM (Qtranslation_table, "translation-table");
10313   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10314   DEFSYM (Qtranslation_table_id, "translation-table-id");
10315   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10316   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10317
10318   DEFSYM (Qvalid_codes, "valid-codes");
10319
10320   DEFSYM (Qemacs_mule, "emacs-mule");
10321
10322   DEFSYM (QCcategory, ":category");
10323   DEFSYM (QCmnemonic, ":mnemonic");
10324   DEFSYM (QCdefault_char, ":default-char");
10325   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10326   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10327   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10328   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10329   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10330
10331   Vcoding_category_table
10332     = Fmake_vector (make_number (coding_category_max), Qnil);
10333   staticpro (&Vcoding_category_table);
10334   /* Followings are target of code detection.  */
10335   ASET (Vcoding_category_table, coding_category_iso_7,
10336         intern_c_string ("coding-category-iso-7"));
10337   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10338         intern_c_string ("coding-category-iso-7-tight"));
10339   ASET (Vcoding_category_table, coding_category_iso_8_1,
10340         intern_c_string ("coding-category-iso-8-1"));
10341   ASET (Vcoding_category_table, coding_category_iso_8_2,
10342         intern_c_string ("coding-category-iso-8-2"));
10343   ASET (Vcoding_category_table, coding_category_iso_7_else,
10344         intern_c_string ("coding-category-iso-7-else"));
10345   ASET (Vcoding_category_table, coding_category_iso_8_else,
10346         intern_c_string ("coding-category-iso-8-else"));
10347   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10348         intern_c_string ("coding-category-utf-8-auto"));
10349   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10350         intern_c_string ("coding-category-utf-8"));
10351   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10352         intern_c_string ("coding-category-utf-8-sig"));
10353   ASET (Vcoding_category_table, coding_category_utf_16_be,
10354         intern_c_string ("coding-category-utf-16-be"));
10355   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10356         intern_c_string ("coding-category-utf-16-auto"));
10357   ASET (Vcoding_category_table, coding_category_utf_16_le,
10358         intern_c_string ("coding-category-utf-16-le"));
10359   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10360         intern_c_string ("coding-category-utf-16-be-nosig"));
10361   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10362         intern_c_string ("coding-category-utf-16-le-nosig"));
10363   ASET (Vcoding_category_table, coding_category_charset,
10364         intern_c_string ("coding-category-charset"));
10365   ASET (Vcoding_category_table, coding_category_sjis,
10366         intern_c_string ("coding-category-sjis"));
10367   ASET (Vcoding_category_table, coding_category_big5,
10368         intern_c_string ("coding-category-big5"));
10369   ASET (Vcoding_category_table, coding_category_ccl,
10370         intern_c_string ("coding-category-ccl"));
10371   ASET (Vcoding_category_table, coding_category_emacs_mule,
10372         intern_c_string ("coding-category-emacs-mule"));
10373   /* Followings are NOT target of code detection.  */
10374   ASET (Vcoding_category_table, coding_category_raw_text,
10375         intern_c_string ("coding-category-raw-text"));
10376   ASET (Vcoding_category_table, coding_category_undecided,
10377         intern_c_string ("coding-category-undecided"));
10378
10379   DEFSYM (Qinsufficient_source, "insufficient-source");
10380   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10381   DEFSYM (Qinvalid_source, "invalid-source");
10382   DEFSYM (Qinterrupted, "interrupted");
10383   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10384   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10385
10386   defsubr (&Scoding_system_p);
10387   defsubr (&Sread_coding_system);
10388   defsubr (&Sread_non_nil_coding_system);
10389   defsubr (&Scheck_coding_system);
10390   defsubr (&Sdetect_coding_region);
10391   defsubr (&Sdetect_coding_string);
10392   defsubr (&Sfind_coding_systems_region_internal);
10393   defsubr (&Sunencodable_char_position);
10394   defsubr (&Scheck_coding_systems_region);
10395   defsubr (&Sdecode_coding_region);
10396   defsubr (&Sencode_coding_region);
10397   defsubr (&Sdecode_coding_string);
10398   defsubr (&Sencode_coding_string);
10399   defsubr (&Sdecode_sjis_char);
10400   defsubr (&Sencode_sjis_char);
10401   defsubr (&Sdecode_big5_char);
10402   defsubr (&Sencode_big5_char);
10403   defsubr (&Sset_terminal_coding_system_internal);
10404   defsubr (&Sset_safe_terminal_coding_system_internal);
10405   defsubr (&Sterminal_coding_system);
10406   defsubr (&Sset_keyboard_coding_system_internal);
10407   defsubr (&Skeyboard_coding_system);
10408   defsubr (&Sfind_operation_coding_system);
10409   defsubr (&Sset_coding_system_priority);
10410   defsubr (&Sdefine_coding_system_internal);
10411   defsubr (&Sdefine_coding_system_alias);
10412   defsubr (&Scoding_system_put);
10413   defsubr (&Scoding_system_base);
10414   defsubr (&Scoding_system_plist);
10415   defsubr (&Scoding_system_aliases);
10416   defsubr (&Scoding_system_eol_type);
10417   defsubr (&Scoding_system_priority_list);
10418
10419   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10420                doc: /* List of coding systems.
10421
10422 Do not alter the value of this variable manually.  This variable should be
10423 updated by the functions `define-coding-system' and
10424 `define-coding-system-alias'.  */);
10425   Vcoding_system_list = Qnil;
10426
10427   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10428                doc: /* Alist of coding system names.
10429 Each element is one element list of coding system name.
10430 This variable is given to `completing-read' as COLLECTION argument.
10431
10432 Do not alter the value of this variable manually.  This variable should be
10433 updated by the functions `make-coding-system' and
10434 `define-coding-system-alias'.  */);
10435   Vcoding_system_alist = Qnil;
10436
10437   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10438                doc: /* List of coding-categories (symbols) ordered by priority.
10439
10440 On detecting a coding system, Emacs tries code detection algorithms
10441 associated with each coding-category one by one in this order.  When
10442 one algorithm agrees with a byte sequence of source text, the coding
10443 system bound to the corresponding coding-category is selected.
10444
10445 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10446   {
10447     int i;
10448
10449     Vcoding_category_list = Qnil;
10450     for (i = coding_category_max - 1; i >= 0; i--)
10451       Vcoding_category_list
10452         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10453                  Vcoding_category_list);
10454   }
10455
10456   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10457                doc: /* Specify the coding system for read operations.
10458 It is useful to bind this variable with `let', but do not set it globally.
10459 If the value is a coding system, it is used for decoding on read operation.
10460 If not, an appropriate element is used from one of the coding system alists.
10461 There are three such tables: `file-coding-system-alist',
10462 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10463   Vcoding_system_for_read = Qnil;
10464
10465   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10466                doc: /* Specify the coding system for write operations.
10467 Programs bind this variable with `let', but you should not set it globally.
10468 If the value is a coding system, it is used for encoding of output,
10469 when writing it to a file and when sending it to a file or subprocess.
10470
10471 If this does not specify a coding system, an appropriate element
10472 is used from one of the coding system alists.
10473 There are three such tables: `file-coding-system-alist',
10474 `process-coding-system-alist', and `network-coding-system-alist'.
10475 For output to files, if the above procedure does not specify a coding system,
10476 the value of `buffer-file-coding-system' is used.  */);
10477   Vcoding_system_for_write = Qnil;
10478
10479   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10480                doc: /*
10481 Coding system used in the latest file or process I/O.  */);
10482   Vlast_coding_system_used = Qnil;
10483
10484   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10485                doc: /*
10486 Error status of the last code conversion.
10487
10488 When an error was detected in the last code conversion, this variable
10489 is set to one of the following symbols.
10490   `insufficient-source'
10491   `inconsistent-eol'
10492   `invalid-source'
10493   `interrupted'
10494   `insufficient-memory'
10495 When no error was detected, the value doesn't change.  So, to check
10496 the error status of a code conversion by this variable, you must
10497 explicitly set this variable to nil before performing code
10498 conversion.  */);
10499   Vlast_code_conversion_error = Qnil;
10500
10501   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10502                doc: /*
10503 *Non-nil means always inhibit code conversion of end-of-line format.
10504 See info node `Coding Systems' and info node `Text and Binary' concerning
10505 such conversion.  */);
10506   inhibit_eol_conversion = 0;
10507
10508   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10509                doc: /*
10510 Non-nil means process buffer inherits coding system of process output.
10511 Bind it to t if the process output is to be treated as if it were a file
10512 read from some filesystem.  */);
10513   inherit_process_coding_system = 0;
10514
10515   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10516                doc: /*
10517 Alist to decide a coding system to use for a file I/O operation.
10518 The format is ((PATTERN . VAL) ...),
10519 where PATTERN is a regular expression matching a file name,
10520 VAL is a coding system, a cons of coding systems, or a function symbol.
10521 If VAL is a coding system, it is used for both decoding and encoding
10522 the file contents.
10523 If VAL is a cons of coding systems, the car part is used for decoding,
10524 and the cdr part is used for encoding.
10525 If VAL is a function symbol, the function must return a coding system
10526 or a cons of coding systems which are used as above.  The function is
10527 called with an argument that is a list of the arguments with which
10528 `find-operation-coding-system' was called.  If the function can't decide
10529 a coding system, it can return `undecided' so that the normal
10530 code-detection is performed.
10531
10532 See also the function `find-operation-coding-system'
10533 and the variable `auto-coding-alist'.  */);
10534   Vfile_coding_system_alist = Qnil;
10535
10536   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10537                doc: /*
10538 Alist to decide a coding system to use for a process I/O operation.
10539 The format is ((PATTERN . VAL) ...),
10540 where PATTERN is a regular expression matching a program name,
10541 VAL is a coding system, a cons of coding systems, or a function symbol.
10542 If VAL is a coding system, it is used for both decoding what received
10543 from the program and encoding what sent to the program.
10544 If VAL is a cons of coding systems, the car part is used for decoding,
10545 and the cdr part is used for encoding.
10546 If VAL is a function symbol, the function must return a coding system
10547 or a cons of coding systems which are used as above.
10548
10549 See also the function `find-operation-coding-system'.  */);
10550   Vprocess_coding_system_alist = Qnil;
10551
10552   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10553                doc: /*
10554 Alist to decide a coding system to use for a network I/O operation.
10555 The format is ((PATTERN . VAL) ...),
10556 where PATTERN is a regular expression matching a network service name
10557 or is a port number to connect to,
10558 VAL is a coding system, a cons of coding systems, or a function symbol.
10559 If VAL is a coding system, it is used for both decoding what received
10560 from the network stream and encoding what sent to the network stream.
10561 If VAL is a cons of coding systems, the car part is used for decoding,
10562 and the cdr part is used for encoding.
10563 If VAL is a function symbol, the function must return a coding system
10564 or a cons of coding systems which are used as above.
10565
10566 See also the function `find-operation-coding-system'.  */);
10567   Vnetwork_coding_system_alist = Qnil;
10568
10569   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10570                doc: /* Coding system to use with system messages.
10571 Also used for decoding keyboard input on X Window system.  */);
10572   Vlocale_coding_system = Qnil;
10573
10574   /* The eol mnemonics are reset in startup.el system-dependently.  */
10575   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10576                doc: /*
10577 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10578   eol_mnemonic_unix = make_pure_c_string (":");
10579
10580   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10581                doc: /*
10582 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10583   eol_mnemonic_dos = make_pure_c_string ("\\");
10584
10585   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10586                doc: /*
10587 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10588   eol_mnemonic_mac = make_pure_c_string ("/");
10589
10590   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10591                doc: /*
10592 *String displayed in mode line when end-of-line format is not yet determined.  */);
10593   eol_mnemonic_undecided = make_pure_c_string (":");
10594
10595   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10596                doc: /*
10597 *Non-nil enables character translation while encoding and decoding.  */);
10598   Venable_character_translation = Qt;
10599
10600   DEFVAR_LISP ("standard-translation-table-for-decode",
10601                Vstandard_translation_table_for_decode,
10602                doc: /* Table for translating characters while decoding.  */);
10603   Vstandard_translation_table_for_decode = Qnil;
10604
10605   DEFVAR_LISP ("standard-translation-table-for-encode",
10606                Vstandard_translation_table_for_encode,
10607                doc: /* Table for translating characters while encoding.  */);
10608   Vstandard_translation_table_for_encode = Qnil;
10609
10610   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10611                doc: /* Alist of charsets vs revision numbers.
10612 While encoding, if a charset (car part of an element) is found,
10613 designate it with the escape sequence identifying revision (cdr part
10614 of the element).  */);
10615   Vcharset_revision_table = Qnil;
10616
10617   DEFVAR_LISP ("default-process-coding-system",
10618                Vdefault_process_coding_system,
10619                doc: /* Cons of coding systems used for process I/O by default.
10620 The car part is used for decoding a process output,
10621 the cdr part is used for encoding a text to be sent to a process.  */);
10622   Vdefault_process_coding_system = Qnil;
10623
10624   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10625                doc: /*
10626 Table of extra Latin codes in the range 128..159 (inclusive).
10627 This is a vector of length 256.
10628 If Nth element is non-nil, the existence of code N in a file
10629 \(or output of subprocess) doesn't prevent it to be detected as
10630 a coding system of ISO 2022 variant which has a flag
10631 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10632 or reading output of a subprocess.
10633 Only 128th through 159th elements have a meaning.  */);
10634   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10635
10636   DEFVAR_LISP ("select-safe-coding-system-function",
10637                Vselect_safe_coding_system_function,
10638                doc: /*
10639 Function to call to select safe coding system for encoding a text.
10640
10641 If set, this function is called to force a user to select a proper
10642 coding system which can encode the text in the case that a default
10643 coding system used in each operation can't encode the text.  The
10644 function should take care that the buffer is not modified while
10645 the coding system is being selected.
10646
10647 The default value is `select-safe-coding-system' (which see).  */);
10648   Vselect_safe_coding_system_function = Qnil;
10649
10650   DEFVAR_BOOL ("coding-system-require-warning",
10651                coding_system_require_warning,
10652                doc: /* Internal use only.
10653 If non-nil, on writing a file, `select-safe-coding-system-function' is
10654 called even if `coding-system-for-write' is non-nil.  The command
10655 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10656   coding_system_require_warning = 0;
10657
10658
10659   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10660                inhibit_iso_escape_detection,
10661                doc: /*
10662 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10663
10664 When Emacs reads text, it tries to detect how the text is encoded.
10665 This code detection is sensitive to escape sequences.  If Emacs sees
10666 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10667 of the ISO2022 encodings, and decodes text by the corresponding coding
10668 system (e.g. `iso-2022-7bit').
10669
10670 However, there may be a case that you want to read escape sequences in
10671 a file as is.  In such a case, you can set this variable to non-nil.
10672 Then the code detection will ignore any escape sequences, and no text is
10673 detected as encoded in some ISO-2022 encoding.  The result is that all
10674 escape sequences become visible in a buffer.
10675
10676 The default value is nil, and it is strongly recommended not to change
10677 it.  That is because many Emacs Lisp source files that contain
10678 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10679 in Emacs's distribution, and they won't be decoded correctly on
10680 reading if you suppress escape sequence detection.
10681
10682 The other way to read escape sequences in a file without decoding is
10683 to explicitly specify some coding system that doesn't use ISO-2022
10684 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10685   inhibit_iso_escape_detection = 0;
10686
10687   DEFVAR_BOOL ("inhibit-null-byte-detection",
10688                inhibit_null_byte_detection,
10689                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10690 By default, Emacs treats it as binary data, and does not attempt to
10691 decode it.  The effect is as if you specified `no-conversion' for
10692 reading that text.
10693
10694 Set this to non-nil when a regular text happens to include null bytes.
10695 Examples are Index nodes of Info files and null-byte delimited output
10696 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10697 decode text as usual.  */);
10698   inhibit_null_byte_detection = 0;
10699
10700   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10701                doc: /* Char table for translating self-inserting characters.
10702 This is applied to the result of input methods, not their input.
10703 See also `keyboard-translate-table'.
10704
10705 Use of this variable for character code unification was rendered
10706 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10707 internal character representation.  */);
10708     Vtranslation_table_for_input = Qnil;
10709
10710   {
10711     Lisp_Object args[coding_arg_max];
10712     Lisp_Object plist[16];
10713     int i;
10714
10715     for (i = 0; i < coding_arg_max; i++)
10716       args[i] = Qnil;
10717
10718     plist[0] = intern_c_string (":name");
10719     plist[1] = args[coding_arg_name] = Qno_conversion;
10720     plist[2] = intern_c_string (":mnemonic");
10721     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10722     plist[4] = intern_c_string (":coding-type");
10723     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10724     plist[6] = intern_c_string (":ascii-compatible-p");
10725     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10726     plist[8] = intern_c_string (":default-char");
10727     plist[9] = args[coding_arg_default_char] = make_number (0);
10728     plist[10] = intern_c_string (":for-unibyte");
10729     plist[11] = args[coding_arg_for_unibyte] = Qt;
10730     plist[12] = intern_c_string (":docstring");
10731     plist[13] = make_pure_c_string ("Do no conversion.\n\
10732 \n\
10733 When you visit a file with this coding, the file is read into a\n\
10734 unibyte buffer as is, thus each byte of a file is treated as a\n\
10735 character.");
10736     plist[14] = intern_c_string (":eol-type");
10737     plist[15] = args[coding_arg_eol_type] = Qunix;
10738     args[coding_arg_plist] = Flist (16, plist);
10739     Fdefine_coding_system_internal (coding_arg_max, args);
10740
10741     plist[1] = args[coding_arg_name] = Qundecided;
10742     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10743     plist[5] = args[coding_arg_coding_type] = Qundecided;
10744     /* This is already set.
10745        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10746     plist[8] = intern_c_string (":charset-list");
10747     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10748     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10749     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10750     plist[15] = args[coding_arg_eol_type] = Qnil;
10751     args[coding_arg_plist] = Flist (16, plist);
10752     Fdefine_coding_system_internal (coding_arg_max, args);
10753   }
10754
10755   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10756
10757   {
10758     int i;
10759
10760     for (i = 0; i < coding_category_max; i++)
10761       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10762   }
10763 #if defined (DOS_NT)
10764   system_eol_type = Qdos;
10765 #else
10766   system_eol_type = Qunix;
10767 #endif
10768   staticpro (&system_eol_type);
10769 }
10770
10771 char *
10772 emacs_strerror (int error_number)
10773 {
10774   char *str;
10775
10776   synchronize_system_messages_locale ();
10777   str = strerror (error_number);
10778
10779   if (! NILP (Vlocale_coding_system))
10780     {
10781       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10782                                                       Vlocale_coding_system,
10783                                                       0);
10784       str = SSDATA (dec);
10785     }
10786
10787   return str;
10788 }
10789
10790 #endif /* emacs */