src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;  /* E.g. `c' and `src_base', which ONE_MORE_BYTE refers to.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* A nonconforming byte rejects XXX; a strongly suggestive one finds it.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302 /* Below, objects declared without `static' have external linkage.  */
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by the system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346 /* NOTE(review): "three variables" below looks stale; only one follows.  */
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371 /* safe_charsets entry for CHARSET_ID; -1 if out of range or entry is 255.  */
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379 /* The macros below access members of CODING->spec.iso_2022.  */
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overrides
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section: accessors for members of CODING->spec.utf_16.  */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509 /* The CCL macros read slots of CODING's attributes (CODING_ID_ATTRS).  */
 510 /* CCL section */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories'.  */
 519 /* Most enumerators also give the bit position of a CATEGORY_MASK_XXX.  */
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* Union of every mask above except CATEGORY_MASK_RAW_TEXT; returned if
 570    detect_coding_mask () finds nothing other than ASCII characters.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592 /* The macros below group the single-category masks by family.  */
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Coding categories ordered by priority (coding_category_max slots).  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category; the array has coding_category_max slots.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637 /* min and max are defined only if absent; an argument is evaluated twice.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, then CHARSET_LIST from ATTRS.  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed by SRC which ends
 653    at SRC_END, set C to that byte, and increment consumed_chars.  If
 654    the source is exhausted, jump to `no_more_source' (after recording
 655    CODING_RESULT_INSUFFICIENT_SRC when src_base < src).  If multibytep
 656    is nonzero, and a multibyte character is found at SRC, set C to the
 657    negative value of the character code.  The caller should declare:
 658         src, src_end, src_base, multibytep, consumed_chars, coding */
 659 /* With multibytep, a lead byte 0xC0 or 0xC1 is merged with the next byte.  */
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 689    a multibyte character is found for C2, set C2 to -1 (SRC is then
 690    left just after its leading byte).  The caller should declare and
 691    set these variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694 /* With multibytep, a lead byte 0xC0 or 0xC1 is merged with the next byte.  */
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed by DST and increment DST to the
 726    next free point, and increment PRODUCED_CHARS.  The caller should
 727    assure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 750    nonzero, store in an appropriate multibyte form (a byte >= 0x80 is
 751    first mapped by BYTE8_TO_CHAR).  The caller should declare and set
 752    `dst', `multibytep' and `produced_chars' appropriately in advance.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
/* Prototypes for static functions.  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);

/* Per-format handlers.  Each format has a detect_coding_XXX function
   taking the coding system and a detection-info record, plus a
   decode_coding_XXX / encode_coding_XXX pair (raw-text has no
   detector).  */
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        ptrdiff_t, ptrdiff_t);
static unsigned char *alloc_destination (struct coding_system *,
                                         ptrdiff_t, unsigned char *);

/* Remaining helpers (defined later in this file).  */
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, unsigned char *);
static int detect_eol (const unsigned char *,
                       ptrdiff_t, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static inline void produce_charset (struct coding_system *, int *,
                                    ptrdiff_t);
static void produce_annotation (struct coding_system *, ptrdiff_t);
static int decode_coding (struct coding_system *);
static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
                                                  struct coding_system *,
                                                  int *, ptrdiff_t *);
static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
                                              struct coding_system *,
                                              int *, ptrdiff_t *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static inline int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 883
 884 static void
 885 record_conversion_result (struct coding_system *coding,
 886                           enum coding_result_code result)
 887 {
 888   coding->result = result;
 889   switch (result)
 890     {
 891     case CODING_RESULT_INSUFFICIENT_SRC:
 892       Vlast_code_conversion_error = Qinsufficient_source;
 893       break;
 894     case CODING_RESULT_INCONSISTENT_EOL:
 895       Vlast_code_conversion_error = Qinconsistent_eol;
 896       break;
 897     case CODING_RESULT_INVALID_SRC:
 898       Vlast_code_conversion_error = Qinvalid_source;
 899       break;
 900     case CODING_RESULT_INTERRUPT:
 901       Vlast_code_conversion_error = Qinterrupted;
 902       break;
 903     case CODING_RESULT_INSUFFICIENT_MEM:
 904       Vlast_code_conversion_error = Qinsufficient_memory;
 905       break;
 906     case CODING_RESULT_INSUFFICIENT_DST:
 907       /* Don't record this error in Vlast_code_conversion_error
 908          because it happens just temporarily and is resolved when the
 909          whole conversion is finished.  */
 910       break;
 911     case CODING_RESULT_SUCCESS:
 912       break;
 913     default:
 914       Vlast_code_conversion_error = intern ("Unknown error");
 915     }
 916 }
 917
 918 /* This wrapper macro is used to preserve validity of pointers into
 919    buffer text across calls to decode_char, which could cause
 920    relocation of buffers if it loads a charset map, because loading a
 921    charset map allocates large structures.  */
 922 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 923   do {                                                                       \
 924     charset_map_loaded = 0;                                                  \
 925     c = DECODE_CHAR (charset, code);                                         \
 926     if (charset_map_loaded)                                                  \
 927       {                                                                      \
 928         const unsigned char *orig = coding->source;                          \
 929         ptrdiff_t offset;                                                    \
 930                                                                              \
 931         coding_set_source (coding);                                          \
 932         offset = coding->source - orig;                                      \
 933         src += offset;                                                       \
 934         src_base += offset;                                                  \
 935         src_end += offset;                                                   \
 936       }                                                                      \
 937   } while (0)
 938
 939
/* Make sure more than BYTES bytes of room are available at dst.  If
   they are not (that is, if dst + BYTES reaches or passes dst_end),
   allocate more memory for coding->destination and update dst and
   dst_end.  The amount requested is BYTES plus the number of elements
   still pending between charbuf and charbuf_end.  We don't have to
   take care of coding->source which will be relocated.  It is handled
   by calling coding_set_source in encode_coding.

   The caller must have the variables `coding', `dst', `dst_end',
   `charbuf' and `charbuf_end' in scope.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
 955
 956
 957 /* Store multibyte form of the character C in P, and advance P to the
 958    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 959    never calls MAYBE_UNIFY_CHAR.  */
 960
 961 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 962   do {                                          \
 963     if ((c) <= MAX_1_BYTE_CHAR)                 \
 964       *(p)++ = (c);                             \
 965     else if ((c) <= MAX_2_BYTE_CHAR)            \
 966       *(p)++ = (0xC0 | ((c) >> 6)),             \
 967         *(p)++ = (0x80 | ((c) & 0x3F));         \
 968     else if ((c) <= MAX_3_BYTE_CHAR)            \
 969       *(p)++ = (0xE0 | ((c) >> 12)),            \
 970         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 971         *(p)++ = (0x80 | ((c) & 0x3F));         \
 972     else if ((c) <= MAX_4_BYTE_CHAR)            \
 973       *(p)++ = (0xF0 | (c >> 18)),              \
 974         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 975         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 976         *(p)++ = (0x80 | (c & 0x3F));           \
 977     else if ((c) <= MAX_5_BYTE_CHAR)            \
 978       *(p)++ = 0xF8,                            \
 979         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 980         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 981         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 982         *(p)++ = (0x80 | (c & 0x3F));           \
 983     else                                        \
 984       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 985   } while (0)
 986
 987
 988 /* Return the character code of character whose multibyte form is at
 989    P, and advance P to the end of the multibyte form.  This is like
 990    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 991
 992 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 993   (!((p)[0] & 0x80)                                             \
 994    ? *(p)++                                                     \
 995    : ! ((p)[0] & 0x20)                                          \
 996    ? ((p) += 2,                                                 \
 997       ((((p)[-2] & 0x1F) << 6)                                  \
 998        | ((p)[-1] & 0x3F)                                       \
 999        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1000    : ! ((p)[0] & 0x10)                                          \
1001    ? ((p) += 3,                                                 \
1002       ((((p)[-3] & 0x0F) << 12)                                 \
1003        | (((p)[-2] & 0x3F) << 6)                                \
1004        | ((p)[-1] & 0x3F)))                                     \
1005    : ! ((p)[0] & 0x08)                                          \
1006    ? ((p) += 4,                                                 \
1007       ((((p)[-4] & 0xF) << 18)                                  \
1008        | (((p)[-3] & 0x3F) << 12)                               \
1009        | (((p)[-2] & 0x3F) << 6)                                \
1010        | ((p)[-1] & 0x3F)))                                     \
1011    : ((p) += 5,                                                 \
1012       ((((p)[-4] & 0x3F) << 18)                                 \
1013        | (((p)[-3] & 0x3F) << 12)                               \
1014        | (((p)[-2] & 0x3F) << 6)                                \
1015        | ((p)[-1] & 0x3F))))
1016
1017
1018 static void
1019 coding_set_source (struct coding_system *coding)
1020 {
1021   if (BUFFERP (coding->src_object))
1022     {
1023       struct buffer *buf = XBUFFER (coding->src_object);
1024
1025       if (coding->src_pos < 0)
1026         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1027       else
1028         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1029     }
1030   else if (STRINGP (coding->src_object))
1031     {
1032       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1033     }
1034   else
1035     {
1036       /* Otherwise, the source is C string and is never relocated
1037          automatically.  Thus we don't have to update anything.  */
1038     }
1039 }
1040
1041 static void
1042 coding_set_destination (struct coding_system *coding)
1043 {
1044   if (BUFFERP (coding->dst_object))
1045     {
1046       if (coding->src_pos < 0)
1047         {
1048           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1049           coding->dst_bytes = (GAP_END_ADDR
1050                                - (coding->src_bytes - coding->consumed)
1051                                - coding->destination);
1052         }
1053       else
1054         {
1055           /* We are sure that coding->dst_pos_byte is before the gap
1056              of the buffer. */
1057           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1058                                  + coding->dst_pos_byte - BEG_BYTE);
1059           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1060                                - coding->destination);
1061         }
1062     }
1063   else
1064     {
1065       /* Otherwise, the destination is C string and is never relocated
1066          automatically.  Thus we don't have to update anything.  */
1067     }
1068 }
1069
1070
1071 static void
1072 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1073 {
1074   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1075     string_overflow ();
1076   coding->destination = (unsigned char *) xrealloc (coding->destination,
1077                                                     coding->dst_bytes + bytes);
1078   coding->dst_bytes += bytes;
1079 }
1080
1081 static void
1082 coding_alloc_by_making_gap (struct coding_system *coding,
1083                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1084 {
1085   if (EQ (coding->src_object, coding->dst_object))
1086     {
1087       /* The gap may contain the produced data at the head and not-yet
1088          consumed data at the tail.  To preserve those data, we at
1089          first make the gap size to zero, then increase the gap
1090          size.  */
1091       ptrdiff_t add = GAP_SIZE;
1092
1093       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1094       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1095       make_gap (bytes);
1096       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1097       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1098     }
1099   else
1100     {
1101       Lisp_Object this_buffer;
1102
1103       this_buffer = Fcurrent_buffer ();
1104       set_buffer_internal (XBUFFER (coding->dst_object));
1105       make_gap (bytes);
1106       set_buffer_internal (XBUFFER (this_buffer));
1107     }
1108 }
1109
1110
1111 static unsigned char *
1112 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1113                    unsigned char *dst)
1114 {
1115   ptrdiff_t offset = dst - coding->destination;
1116
1117   if (BUFFERP (coding->dst_object))
1118     {
1119       struct buffer *buf = XBUFFER (coding->dst_object);
1120
1121       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1122     }
1123   else
1124     coding_alloc_by_realloc (coding, nbytes);
1125   coding_set_destination (coding);
1126   dst = coding->destination + offset;
1127   return dst;
1128 }
1129
1130 /** Macros for annotations.  */
1131
/* Annotation data is stored in the array coding->charbuf in this
   format:
1134      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1135    LENGTH is the number of elements in the annotation.
1136    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1137    NCHARS is the number of characters in the text annotated.
1138
1139    The format of the following elements depend on ANNOTATION_MASK.
1140
1141    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1142    follows:
1143      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1144
1145    NBYTES is the number of bytes specified in the header part of
1146    old-style emacs-mule encoding, or 0 for the other kind of
1147    composition.
1148
1149    METHOD is one of enum composition_method.
1150
1151    Optional COMPOSITION-COMPONENTS are characters and composition
1152    rules.
1153
1154    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1155    follows.
1156
1157    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1158    recover from an invalid annotation, and should be skipped by
1159    produce_annotation.  */
1160
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three common header elements of an annotation to BUF
   (-LEN, then MASK, then NCHARS), advancing BUF past them, and flag
   the coding system as annotated.  The caller must have a variable
   `coding' in scope.

   The expansion deliberately has no trailing semicolon; the caller
   supplies it, so that the macro behaves as a single statement even
   in an unbraced if/else.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1171
/* Append a composition annotation header to BUF: the common
   annotation header with length 5 and CODING_ANNOTATE_COMPOSITION_MASK
   for NCHARS characters, followed by NBYTES and METHOD.  BUF is
   advanced past everything written.  BUF is evaluated several times
   and must be a side-effect-free lvalue; it is parenthesized at every
   use so that an lvalue expression such as `*pp' works.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1178
1179
/* Append a charset annotation to BUF: the common annotation header
   with length 4 and CODING_ANNOTATE_CHARSET_MASK for NCHARS
   characters, followed by the charset ID.  BUF is advanced past
   everything written.  BUF is evaluated several times and must be a
   side-effect-free lvalue; it is parenthesized at every use so that
   an lvalue expression such as `*pp' works.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1185
1186 \f
1187 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1188
1189
1190
1191 \f
1192 /*** 3. UTF-8 ***/
1193
1194 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1195    Check if a text is encoded in UTF-8.  If it is, return 1, else
1196    return 0.  */
1197
/* Classification of a single octet C by its high bits.  The argument
   is evaluated exactly once by each predicate.  */

/* C stands alone as a one-octet sequence (value below 0x80).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a trailing octet: bit pattern 10xxxxxx.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C leads a two-octet sequence: bit pattern 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C leads a three-octet sequence: bit pattern 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C leads a four-octet sequence: bit pattern 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C leads a five-octet sequence: bit pattern 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (byte order mark), i.e. the
   UTF-8 encoding of U+FEFF.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1208
/* Scan the source text of CODING and decide whether it can be UTF-8,
   recording the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  If a
   malformed sequence is met, or if the last block ends in the middle
   of a sequence, CATEGORY_MASK_UTF_8 is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is returned,
   with DETECT_INFO->found / ->rejected updated according to whether a
   signature (BOM) started the text and whether any multi-octet
   sequence was seen.

   The label no_more_source is not jumped to explicitly in this
   function; it is reached through the ONE_MORE_BYTE macro (defined
   earlier in this file), which also uses the local variables src,
   src_end, multibytep and consumed_chars.  */

static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where the current sequence starts; used below to
         detect a sequence cut off by the end of the source, and to
         recognize a signature at the very beginning.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every following octet must be a trailing octet; the leading
         octet C then decides how many of them are expected.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a three-octet sequence located at the very start of
             the source can be the signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* Leaving the loop with `break' means a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means the source ended inside a sequence; that is
     only an error when no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean a
         BOM, so the no-signature category stays possible too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1291
1292
/* Decode UTF-8 source bytes of CODING into character codes.

   Reading starts at CODING->source + CODING->consumed and stops at
   CODING->source + CODING->src_bytes; decoded characters are appended
   to CODING->charbuf starting at index CODING->charbuf_used, up to
   CODING->charbuf_size elements.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used reflect the last
   completely processed character.

   A sequence that is malformed, overlong, a surrogate, or beyond
   MAX_CHAR is not decoded as a whole: its first byte alone is stored
   (through BYTE8_TO_CHAR unless it is ASCII), CODING->errors is
   incremented, and decoding resumes with the next byte.

   The label no_more_source is reached through the ONE_MORE_BYTE macro
   (defined earlier in this file), which also uses the local variables
   src, src_end, multibytep and consumed_chars.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when eol_dos is set, or -1 if none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look at the first three bytes.  If they are exactly the
         signature EF BB BF, src is left pointing after them;
         otherwise src is rewound to src_base so they are decoded as
         ordinary text.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found above, switch the BOM setting of CODING to
     utf_without_bom, so the check above is not repeated the next time
     this function runs on CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before src_base / consumed_chars_base has been
         fully processed; these are the values reported on exit.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* charbuf is full.  A byte already read ahead after a CR
             has not been processed yet, so step back over it.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* NOTE(review): a negative value presumably is how
             ONE_MORE_BYTE reports a byte it did not deliver as a plain
             byte; confirm against the macro's definition.  Its
             negation is stored as the character.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line handling, read one byte ahead after
             a CR; it is consumed as c1 by the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Five-octet forms must stay within MAX_CHAR
                             and must not be overlong.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the offending sequence and emit just
         its first byte; the following bytes are then examined afresh
         by the next iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress up to the last completely processed character;
     a partially read sequence is left for a later call.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1442
1443
1444 static int
1445 encode_coding_utf_8 (struct coding_system *coding)
1446 {
1447   int multibytep = coding->dst_multibyte;
1448   int *charbuf = coding->charbuf;
1449   int *charbuf_end = charbuf + coding->charbuf_used;
1450   unsigned char *dst = coding->destination + coding->produced;
1451   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1452   ptrdiff_t produced_chars = 0;
1453   int c;
1454
1455   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1456     {
1457       ASSURE_DESTINATION (3);
1458       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1459       CODING_UTF_8_BOM (coding) = utf_without_bom;
1460     }
1461
1462   if (multibytep)
1463     {
1464       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1465
1466       while (charbuf < charbuf_end)
1467         {
1468           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1469
1470           ASSURE_DESTINATION (safe_room);
1471           c = *charbuf++;
1472           if (CHAR_BYTE8_P (c))
1473             {
1474               c = CHAR_TO_BYTE8 (c);
1475               EMIT_ONE_BYTE (c);
1476             }
1477           else
1478             {
1479               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1480               for (p = str; p < pend; p++)
1481                 EMIT_ONE_BYTE (*p);
1482             }
1483         }
1484     }
1485   else
1486     {
1487       int safe_room = MAX_MULTIBYTE_LENGTH;
1488
1489       while (charbuf < charbuf_end)
1490         {
1491           ASSURE_DESTINATION (safe_room);
1492           c = *charbuf++;
1493           if (CHAR_BYTE8_P (c))
1494             *dst++ = CHAR_TO_BYTE8 (c);
1495           else
1496             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1497           produced_chars++;
1498         }
1499     }
1500   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1501   coding->produced_char += produced_chars;
1502   coding->produced = dst - coding->destination;
1503   return 0;
1504 }
1505
1506
1507 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1508    Check if a text is encoded in one of UTF-16 based coding systems.
1509    If it is, return 1, else return 0.  */
1510
/* Nonzero if the 16-bit value VAL is a high (leading) surrogate,
   i.e. its top six bits equal those of 0xD800 (range 0xD800..0xDBFF
   for a 16-bit VAL).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low (trailing) surrogate,
   i.e. its top six bits equal those of 0xDC00 (range 0xDC00..0xDFFF
   for a 16-bit VAL).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1516
1517
     /* Update DETECT_INFO according to whether the source text of CODING
        can be UTF-16.

        CATEGORY_MASK_UTF_16 is always marked as checked.  The first two
        bytes decide most of the result:

          FF FE  -> UTF-16LE with signature and UTF-16 auto are found;
                    UTF-16BE and both no-signature categories are rejected.
          FE FF  -> UTF-16BE with signature and UTF-16 auto are found;
                    UTF-16LE and both no-signature categories are rejected.

        Without a signature, the three signature-based categories are
        rejected and the remaining byte pairs are scanned; a no-signature
        category is rejected once 128 distinct byte values have been seen
        at the corresponding position of the pairs.

        Return 0 when all UTF-16 categories were rejected up front, or
        when the no-signature scan stopped by itself.  Return 1 after a
        signature was recognized, or when TWO_MORE_BYTES left through the
        `no_more_source' label (that macro is defined outside this
        function; presumably it jumps there when fewer than two bytes
        remain -- confirm against its definition).  */
1518 static int
1519 detect_coding_utf_16 (struct coding_system *coding,
1520                       struct coding_detection_info *detect_info)
1521 {
1522   const unsigned char *src = coding->source;
1523   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Not used directly below; NOTE(review): presumably read by
          TWO_MORE_BYTES -- confirm.  */
1524   int multibytep = coding->src_multibyte;
1525   int c1, c2;
1526
1527   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* On the last block, an odd coding->src_chars rules out every
          UTF-16 category at once.  */
1528   if (coding->mode & CODING_MODE_LAST_BLOCK
1529       && (coding->src_chars & 1))
1530     {
1531       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1532       return 0;
1533     }
1534
1535   TWO_MORE_BYTES (c1, c2);
1536   if ((c1 == 0xFF) && (c2 == 0xFE))
1537     {
           /* Little endian signature.  */
1538       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1539                              | CATEGORY_MASK_UTF_16_AUTO);
1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1541                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1542                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1543     }
1544   else if ((c1 == 0xFE) && (c2 == 0xFF))
1545     {
           /* Big endian signature.  */
1546       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1547                              | CATEGORY_MASK_UTF_16_AUTO);
1548       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1549                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1550                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1551     }
1552   else if (c2 < 0)
1553     {
           /* TWO_MORE_BYTES stored a negative value in C2.  NOTE(review):
              the exact meaning of a negative value is defined by that
              macro; here it rejects all UTF-16 categories.  */
1554       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1555       return 0;
1556     }
1557   else
1558     {
1559       /* We check the dispersion of the first (E) and the second (O)
1560          byte of each pair.  If both are high, we assume binary data.  */
           /* e[B] / o[B] is 1 once byte value B has been seen as the first
              / second byte of a pair; e_num / o_num count the distinct
              values seen so far (the first pair is already counted).  */
1561       unsigned char e[256], o[256];
1562       unsigned e_num = 1, o_num = 1;
1563
1564       memset (e, 0, 256);
1565       memset (o, 0, 256);
1566       e[c1] = 1;
1567       o[c2] = 1;
1568
           /* No signature was found, so only the no-signature categories
              remain possible.  */
1569       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1570                                 |CATEGORY_MASK_UTF_16_BE
1571                                 | CATEGORY_MASK_UTF_16_LE);
1572
           /* Scan until both no-signature categories are rejected too, or
              the input ends.  */
1573       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1574              != CATEGORY_MASK_UTF_16)
1575         {
1576           TWO_MORE_BYTES (c1, c2);
1577           if (c2 < 0)
1578             break;
1579           if (! e[c1])
1580             {
1581               e[c1] = 1;
1582               e_num++;
                   /* The first bytes vary too much for big endian.  */
1583               if (e_num >= 128)
1584                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1585             }
1586           if (! o[c2])
1587             {
1588               o[c2] = 1;
1589               o_num++;
                   /* The second bytes vary too much for little endian.  */
1590               if (o_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1592             }
1593         }
1594       return 0;
1595     }
1596
1597  no_more_source:
1598   return 1;
1599 }
1600
     /* Decode the UTF-16 source of CODING, starting at offset
        coding->consumed, and append the resulting character codes to
        coding->charbuf.

        The byte order comes from CODING_UTF_16_ENDIAN.  If a BOM is
        required (utf_with_bom), the first two bytes are compared with the
        BOM of that byte order; if they differ they are decoded as
        ordinary text and coding->errors is incremented.  In both BOM
        modes the coding system is switched to utf_without_bom so that
        later calls do not look for a BOM again.

        A high surrogate waiting for its low surrogate is kept in
        CODING_UTF_16_SURROGATE, so a pair may be split across two calls.
        A pending high surrogate that is not followed by a low surrogate
        is emitted as its two raw bytes and counted in coding->errors.

        On return, coding->consumed, coding->consumed_char and
        coding->charbuf_used tell how far decoding got.  The loop stops
        when the charbuf is nearly full or when one of the ONE_MORE_BYTE
        macros leaves through `no_more_source' (defined outside this
        function; presumably taken when the source is exhausted --
        confirm against the macro).  */
1601 static void
1602 decode_coding_utf_16 (struct coding_system *coding)
1603 {
1604   const unsigned char *src = coding->source + coding->consumed;
1605   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Start of the code unit being decoded; everything before it
          counts as consumed when the loop is left.  */
1606   const unsigned char *src_base;
1607   int *charbuf = coding->charbuf + coding->charbuf_used;
1608   /* We may produce at most 3 chars in one loop.  */
1609   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1610   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1611   int multibytep = coding->src_multibyte;
1612   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1613   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
       /* Pending high surrogate, or 0 if there is none.  */
1614   int surrogate = CODING_UTF_16_SURROGATE (coding);
1615   int eol_dos =
1616     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR (only with EOL_DOS); -1 when
          nothing has been read ahead.  */
1617   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1618
1619   if (bom == utf_with_bom)
1620     {
1621       int c, c1, c2;
1622
1623       src_base = src;
1624       ONE_MORE_BYTE (c1);
1625       ONE_MORE_BYTE (c2);
1626       c = (c1 << 8) | c2;
1627
           /* C is built in big endian order, so a little endian BOM
              (FF FE) shows up as 0xFFFE.  */
1628       if (endian == utf_16_big_endian
1629           ? c != 0xFEFF : c != 0xFFFE)
1630         {
1631           /* The first two bytes are not BOM.  Treat them as bytes
1632              for a normal character.  */
1633           src = src_base;
1634           coding->errors++;
1635         }
1636       CODING_UTF_16_BOM (coding) = utf_without_bom;
1637     }
1638   else if (bom == utf_detect_bom)
1639     {
1640       /* We have already tried to detect BOM and failed in
1641          detect_coding.  */
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644
1645   while (1)
1646     {
1647       int c, c1, c2;
1648
1649       src_base = src;
1650       consumed_chars_base = consumed_chars;
1651
1652       if (charbuf >= charbuf_end)
1653         {
               /* Give back the code unit that was read ahead after a CR,
                  so that it is decoded by the next call.  */
1654           if (byte_after_cr1 >= 0)
1655             src_base -= 2;
1656           break;
1657         }
1658
1659       if (byte_after_cr1 >= 0)
1660         c1 = byte_after_cr1, byte_after_cr1 = -1;
1661       else
1662         ONE_MORE_BYTE (c1);
1663       if (c1 < 0)
1664         {
               /* A negative value is stored negated and takes no part in
                  the two-byte decoding.  NOTE(review): presumably a
                  character found in a multibyte source -- confirm against
                  ONE_MORE_BYTE.  */
1665           *charbuf++ = -c1;
1666           continue;
1667         }
1668       if (byte_after_cr2 >= 0)
1669         c2 = byte_after_cr2, byte_after_cr2 = -1;
1670       else
1671         ONE_MORE_BYTE (c2);
1672       if (c2 < 0)
1673         {
               /* Same as above for the second byte; the first byte is
                  kept as a single byte.  */
1674           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1675           *charbuf++ = -c2;
1676           continue;
1677         }
1678       c = (endian == utf_16_big_endian
1679            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1680
1681       if (surrogate)
1682         {
1683           if (! UTF_16_LOW_SURROGATE_P (c))
1684             {
                   /* Unpaired high surrogate: emit its two bytes in
                      source order and count an error.  */
1685               if (endian == utf_16_big_endian)
1686                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1687               else
1688                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1689               *charbuf++ = c1;
1690               *charbuf++ = c2;
1691               coding->errors++;
                   /* The pending surrogate has now been flushed.  Keep a
                      pending state only if C is itself a new high
                      surrogate; otherwise clear it, so that the flushed
                      surrogate is not emitted again (and counted as
                      another error) for every following code unit.  */
1692               surrogate = UTF_16_HIGH_SURROGATE_P (c) ? c : 0;
1693               CODING_UTF_16_SURROGATE (coding) = surrogate;
1694               if (! surrogate)
1695                 *charbuf++ = c;
1696             }
1697           else
1698             {
                   /* Complete pair: combine both halves into a code point
                      at or above 0x10000.  */
1699               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1700               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1701               *charbuf++ = 0x10000 + c;
1702             }
1703         }
1704       else
1705         {
1706           if (UTF_16_HIGH_SURROGATE_P (c))
1707             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1708           else
1709             {
1710               if (eol_dos && c == '\r')
1711                 {
                       /* Read the next code unit before storing the CR.
                          If the macros leave through `no_more_source'
                          here, SRC_BASE still points at the CR, so the CR
                          is not consumed by this call.  */
1712                   ONE_MORE_BYTE (byte_after_cr1);
1713                   ONE_MORE_BYTE (byte_after_cr2);
1714                 }
1715               *charbuf++ = c;
1716             }
1717         }
1718     }
1719
1720  no_more_source:
1721   coding->consumed_char += consumed_chars_base;
1722   coding->consumed = src_base - coding->source;
1723   coding->charbuf_used = charbuf - coding->charbuf;
1724 }
1725
     /* Encode the character codes in coding->charbuf (charbuf_used of
        them) as UTF-16 and append the bytes to coding->destination,
        starting at offset coding->produced.

        Unless the coding system is utf_without_bom, a BOM in the byte
        order of CODING_UTF_16_ENDIAN is written first, and the coding
        system is then switched to utf_without_bom so that the BOM is
        written only once.  Characters above MAX_UNICODE_CHAR are replaced
        by coding->default_char.  Characters below 0x10000 are written as
        one 16-bit unit, the others as a surrogate pair.

        Record CODING_RESULT_SUCCESS, update coding->produced and
        coding->produced_char, and return 0.  */
1726 static int
1727 encode_coding_utf_16 (struct coding_system *coding)
1728 {
       /* MULTIBYTEP, DST, DST_END and PRODUCED_CHARS are not all used
          directly below; NOTE(review): presumably they are read and
          updated by ASSURE_DESTINATION and the EMIT_* macros -- confirm.  */
1729   int multibytep = coding->dst_multibyte;
1730   int *charbuf = coding->charbuf;
1731   int *charbuf_end = charbuf + coding->charbuf_used;
1732   unsigned char *dst = coding->destination + coding->produced;
1733   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested before each write.  At most four bytes are
          emitted per step; NOTE(review): presumably 8 allows for two
          destination bytes per emitted byte in a multibyte destination --
          confirm against EMIT_FOUR_BYTES.  */
1734   int safe_room = 8;
1735   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1736   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1737   ptrdiff_t produced_chars = 0;
1738   int c;
1739
1740   if (bom != utf_without_bom)
1741     {
1742       ASSURE_DESTINATION (safe_room);
1743       if (big_endian)
1744         EMIT_TWO_BYTES (0xFE, 0xFF);
1745       else
1746         EMIT_TWO_BYTES (0xFF, 0xFE);
1747       CODING_UTF_16_BOM (coding) = utf_without_bom;
1748     }
1749
1750   while (charbuf < charbuf_end)
1751     {
1752       ASSURE_DESTINATION (safe_room);
1753       c = *charbuf++;
1754       if (c > MAX_UNICODE_CHAR)
1755         c = coding->default_char;
1756
1757       if (c < 0x10000)
1758         {
               /* One 16-bit unit.  */
1759           if (big_endian)
1760             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1761           else
1762             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1763         }
1764       else
1765         {
               /* Surrogate pair: C1 is the high surrogate carrying the
                  upper 10 bits of C - 0x10000, C2 the low surrogate
                  carrying the lower 10 bits.  */
1766           int c1, c2;
1767
1768           c -= 0x10000;
1769           c1 = (c >> 10) + 0xD800;
1770           c2 = (c & 0x3FF) + 0xDC00;
1771           if (big_endian)
1772             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1773           else
1774             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1775         }
1776     }
1777   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1778   coding->produced = dst - coding->destination;
1779   coding->produced_char += produced_chars;
1780   return 0;
1781 }
1782
1783 \f
1784 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1785
1786 /* Emacs' internal format for representation of multiple character
1787    sets is a kind of multi-byte encoding, i.e. characters are
1788    represented by variable-length sequences of one-byte codes.
1789
1790    ASCII characters and control characters (e.g. `tab', `newline') are
1791    represented by one-byte sequences which are their ASCII codes, in
1792    the range 0x00 through 0x7F.
1793
1794    8-bit characters of the range 0x80..0x9F are represented by
1795    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1796    code + 0x20).
1797
1798    8-bit characters of the range 0xA0..0xFF are represented by
1799    one-byte sequences which are their 8-bit code.
1800
1801    The other characters are represented by a sequence of `base
1802    leading-code', optional `extended leading-code', and one or two
1803    `position-code's.  The length of the sequence is determined by the
1804    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1805    whereas extended leading-code and position-code take the range 0xA0
1806    through 0xFF.  See `charset.h' for more details about leading-code
1807    and position-code.
1808
1809    --- CODE RANGE of Emacs' internal format ---
1810    character set        range
1811    -------------        -----
1812    ascii                0x00..0x7F
1813    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1814    eight-bit-graphic    0xA0..0xFF
1815    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1816    ---------------------------------------------
1817
1818    As this is the internal character representation, the format is
1819    usually not used externally (i.e. in a file or in data sent to a
1820    process).  However, it is possible to have text in this format
1821    externally (e.g. by encoding with the coding system `emacs-mule').
1822
1823    In that case, a sequence of one-byte codes has a slightly different
1824    form.
1825
1826    At first, all characters in eight-bit-control are represented by
1827    one-byte sequences which are their 8-bit code.
1828
1829    Next, character composition data are represented by the byte
1830    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1831    where,
1832         METHOD is 0xF2 plus one of composition method (enum
1833         composition_method),
1834
1835         BYTES is 0xA0 plus a byte length of this composition data,
1836
1837         CHARS is 0xA0 plus a number of characters composed by this
1838         data,
1839
1840         COMPONENTs are characters of multibyte form or composition
1841         rules encoded by two-byte of ASCII codes.
1842
1843    In addition, for backward compatibility, the following formats are
1844    also recognized as composition data on decoding.
1845
1846    0x80 MSEQ ...
1847    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1848
1849    Here,
1850         MSEQ is a multibyte form, but in one of these special formats:
1851           ASCII: 0xA0 ASCII_CODE+0x80,
1852           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1853         RULE is a one byte code of the range 0xA0..0xF0 that
1854         represents a composition rule.
1855   */
1856
     /* Table indexed by a byte value.  The code in this file reads
        emacs_mule_bytes[C] as the total length in bytes (1 to 4) of the
        emacs-mule sequence whose first byte is C.  The table is filled
        in outside this part of the file.  */
1857 char emacs_mule_bytes[256];
1858
1859
1860 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1861    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1862    else return 0.

        The leading coding->head_ascii bytes are skipped.  The text is
        rejected (CATEGORY_MASK_EMACS_MULE is added to
        DETECT_INFO->rejected) when it contains ESC, SI or SO, a
        composition starter 0x80 that is not followed by enough bytes
        >= 0xA0, or a leading byte that is not followed by as many bytes
        >= 0xA0 as emacs_mule_bytes announces.  It is also rejected when
        the last block ends in the middle of a sequence.  Otherwise,
        CATEGORY_MASK_EMACS_MULE is added to DETECT_INFO->found if at
        least one multibyte or composition sequence was seen.  */
1863
1864 static int
1865 detect_coding_emacs_mule (struct coding_system *coding,
1866                           struct coding_detection_info *detect_info)
1867 {
1868   const unsigned char *src = coding->source, *src_base;
1869   const unsigned char *src_end = coding->source + coding->src_bytes;
1870   int multibytep = coding->src_multibyte;
1871   ptrdiff_t consumed_chars = 0;
1872   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a valid multibyte or
          composition sequence has been seen.  */
1873   int found = 0;
1874
1875   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1876   /* A coding system of this category is always ASCII compatible.  */
1877   src += coding->head_ascii;
1878
       /* The loop is left by `break' (reject) or through the
          `no_more_source' label.  NOTE(review): that label is presumably
          reached from ONE_MORE_BYTE when the source is exhausted --
          confirm against the macro.  */
1879   while (1)
1880     {
1881       src_base = src;
1882       ONE_MORE_BYTE (c);
           /* Negative values from ONE_MORE_BYTE are simply skipped.  */
1883       if (c < 0)
1884         continue;
1885       if (c == 0x80)
1886         {
1887           /* Perhaps the start of composite character.  We simply skip
1888              it because analyzing it is too heavy for detecting.  But,
1889              at least, we check that the composite character
1890              consists of more than 4 bytes.  */
1891           const unsigned char *src_start;
1892
1893         repeat:
1894           src_start = src;
1895           do
1896             {
1897               ONE_MORE_BYTE (c);
1898             }
1899           while (c >= 0xA0);
1900
               /* The distance includes the byte that ended the run.  */
1901           if (src - src_start <= 4)
1902             break;
1903           found = CATEGORY_MASK_EMACS_MULE;
               /* The run was ended by the starter of another composition.  */
1904           if (c == 0x80)
1905             goto repeat;
1906         }
1907
           /* C is now either the byte read at the top of the loop or the
              byte that ended a composition run.  */
1908       if (c < 0x80)
1909         {
               /* ESC, SI and SO make the text be rejected.  */
1910           if (c < 0x20
1911               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1912             break;
1913         }
1914       else
1915         {
               /* Number of bytes that must follow the leading byte C.  */
1916           int more_bytes = emacs_mule_bytes[c] - 1;
1917
1918           while (more_bytes > 0)
1919             {
1920               ONE_MORE_BYTE (c);
1921               if (c < 0xA0)
1922                 {
1923                   src--;        /* Unread the last byte.  */
1924                   break;
1925                 }
1926               more_bytes--;
1927             }
               /* Nonzero here means the sequence was cut short.  */
1928           if (more_bytes != 0)
1929             break;
1930           found = CATEGORY_MASK_EMACS_MULE;
1931         }
1932     }
1933   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934   return 0;
1935
1936  no_more_source:
       /* Bytes were read past SRC_BASE: the source ended inside a
          sequence, which is an error only in the last block.  */
1937   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1938     {
1939       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1940       return 0;
1941     }
1942   detect_info->found |= found;
1943   return 1;
1944 }
1945
1946
1947 /* Parse the emacs-mule multibyte sequence at SRC and return the
1948    decoded character.  If CMP_STATUS indicates that we must expect an
1949    MSEQ or a RULE as described above, decode it and return the negated
1950    value of the decoded character or rule.  If an invalid byte is
1951    found, return -1.  If SRC is too short, return -2.

        On success, store the number of bytes read in *NBYTES and the
        value of the local counter consumed_chars in *NCHARS; if ID is
        non-NULL, also store the charset ID in *ID.  When a RULE byte is
        returned, *NBYTES and *NCHARS are set but *ID is left untouched.
        Nothing is stored on the -1 and -2 returns.  */
1952
1953 static int
1954 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1955                  int *nbytes, int *nchars, int *id,
1956                  struct composition_status *cmp_status)
1957 {
1958   const unsigned char *src_end = coding->source + coding->src_bytes;
1959   const unsigned char *src_base = src;
1960   int multibytep = coding->src_multibyte;
1961   int charset_ID;
1962   unsigned code;
1963   int c;
       /* Never modified directly below; NOTE(review): presumably
          incremented by ONE_MORE_BYTE -- confirm.  */
1964   int consumed_chars = 0;
       /* Set when the character was read in the old-form MSEQ format,
          in which case the result is returned negated.  */
1965   int mseq_found = 0;
1966
1967   ONE_MORE_BYTE (c);
1968   if (c < 0)
1969     {
           /* ONE_MORE_BYTE stored a negative value: its magnitude is
              returned as the character, with the charset taken from
              emacs_mule_charset[0].  */
1970       c = -c;
1971       charset_ID = emacs_mule_charset[0];
1972     }
1973   else
1974     {
1975       if (c >= 0xA0)
1976         {
               /* Such a first byte is valid only inside an old-form
                  composition.  */
1977           if (cmp_status->state != COMPOSING_NO
1978               && cmp_status->old_form)
1979             {
1980               if (cmp_status->state == COMPOSING_CHAR)
1981                 {
                       /* An MSEQ is expected.  */
1982                   if (c == 0xA0)
1983                     {
                           /* ASCII: 0xA0 followed by ASCII_CODE + 0x80.  */
1984                       ONE_MORE_BYTE (c);
1985                       c -= 0x80;
1986                       if (c < 0)
1987                         goto invalid_code;
1988                     }
1989                   else
                         /* Other: the leading code was stored + 0x20.  */
1990                     c -= 0x20;
1991                   mseq_found = 1;
1992                 }
1993               else
1994                 {
                       /* Not expecting a character: hand the byte back
                          negated for the caller to treat as a rule.  */
1995                   *nbytes = src - src_base;
1996                   *nchars = consumed_chars;
1997                   return -c;
1998                 }
1999             }
2000           else
2001             goto invalid_code;
2002         }
2003
           /* C is now a leading code (or a single-byte character).  The
              table gives the length of the whole sequence; all the
              following bytes must be >= 0xA0.  */
2004       switch (emacs_mule_bytes[c])
2005         {
2006         case 2:
               /* Leading code that selects the charset, then one
                  position code.  */
2007           if ((charset_ID = emacs_mule_charset[c]) < 0)
2008             goto invalid_code;
2009           ONE_MORE_BYTE (c);
2010           if (c < 0xA0)
2011             goto invalid_code;
2012           code = c & 0x7F;
2013           break;
2014
2015         case 3:
2016           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2017               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2018             {
                   /* Private leading code, then a byte that selects the
                      charset, then one position code.  */
2019               ONE_MORE_BYTE (c);
2020               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2021                 goto invalid_code;
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0)
2024                 goto invalid_code;
2025               code = c & 0x7F;
2026             }
2027           else
2028             {
                   /* Leading code that selects the charset, then two
                      position codes.  */
2029               if ((charset_ID = emacs_mule_charset[c]) < 0)
2030                 goto invalid_code;
2031               ONE_MORE_BYTE (c);
2032               if (c < 0xA0)
2033                 goto invalid_code;
2034               code = (c & 0x7F) << 8;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code |= c & 0x7F;
2039             }
2040           break;
2041
2042         case 4:
               /* The second byte selects the charset; two position codes
                  follow.  */
2043           ONE_MORE_BYTE (c);
2044           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2045             goto invalid_code;
2046           ONE_MORE_BYTE (c);
2047           if (c < 0xA0)
2048             goto invalid_code;
2049           code = (c & 0x7F) << 8;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code |= c & 0x7F;
2054           break;
2055
2056         case 1:
               /* A single byte stands for itself.  */
2057           code = c;
2058           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2059           break;
2060
2061         default:
2062           abort ();
2063         }
           /* Convert CODE in the selected charset to a character, stored
              in C; a negative C means the conversion failed.  */
2064       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2065                           CHARSET_FROM_ID (charset_ID), code, c);
2066       if (c < 0)
2067         goto invalid_code;
2068     }
2069   *nbytes = src - src_base;
2070   *nchars = consumed_chars;
2071   if (id)
2072     *id = charset_ID;
2073   return (mseq_found ? -c : c);
2074
2075  no_more_source:
2076   return -2;
2077
2078  invalid_code:
2079   return -1;
2080 }
2081
2082
2083 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2084
2085 /* Handle these composition sequences ('|': the end of header elements,
2086    BYTES and CHARS >= 0xA0):
2087
2088    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2089    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2090    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2091
2092    and these old forms:
2093
2094    (4) relative composition: 0x80 | MSEQ ... MSEQ
2095    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2096
2097    When the starter 0x80 and the following header elements are found,
2098    this annotation header is produced.
2099
2100         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2101
2102    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2103    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2104
2105    Then, upon reading the following elements, these codes are produced
2106    until the composition end is found:
2107
2108    (1) CHAR ... CHAR
2109    (2) ALT ... ALT CHAR ... CHAR
2110    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2111    (4) CHAR ... CHAR
2112    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2113
2114    When the composition end is found, LENGTH and NCHARS in the
2115    annotation header are updated as below:
2116
2117    (1) LENGTH: unchanged, NCHARS: unchanged
2118    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2119    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2120    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2121    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2122
2123    If an error is found while composing, the annotation header is
2124    changed to the original composition header (plus filler -1s) as
2125    below:
2126
2127    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2128    (5)          [ 0x80 0xFF -1 -1- -1 ]
2129
2130    and the sequence [ -2 DECODED-RULE ] is changed to the original
2131    byte sequence as below:
2132         o the original byte sequence is B: [ B -1 ]
2133         o the original byte sequence is B1 B2: [ B1 B2 ]
2134
2135    Most of the routines are implemented by macros because many
2136    variables and labels in the caller decode_coding_emacs_mule must be
2137    accessible, and they are usually called just once (thus they don't
2138    increase the size of the compiled object).  */
2139
2140 /* Decode a composition rule represented by C as a component of
2141    composition sequence of Emacs 20 style.  Set RULE to the decoded
2142    rule.

        C is modified: 0xA0 is subtracted from it, and the result must be
        in the range 0..80, otherwise control goes to the label
        `invalid_code' of the expanding function.  The result is split
        into GREF = C / 9 and NREF = C % 9, a value of 4 in either being
        replaced by 10, and RULE is set to
        COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2143
2144 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2145   do {                                                  \
2146     int gref, nref;                                     \
2147                                                         \
2148     c -= 0xA0;                                          \
2149     if (c < 0 || c >= 81)                               \
2150       goto invalid_code;                                \
2151     gref = c / 9, nref = c % 9;                         \
2152     if (gref == 4) gref = 10;                           \
2153     if (nref == 4) nref = 10;                           \
2154     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2155   } while (0)
2156
2157
2158 /* Decode a composition rule represented by C and the following byte
2159    at SRC as a component of composition sequence of Emacs 21 style.
2160    Set RULE to the decoded rule.

        GREF is C - 0x20.  The next byte is then read into C with
        ONE_MORE_BYTE, and NREF is that byte - 0x20.  Each of them must be
        in the range 0..80, otherwise control goes to the label
        `invalid_code' of the expanding function.  RULE is set to
        COMPOSITION_ENCODE_RULE (GREF, NREF).  */
2161
2162 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2163   do {                                                  \
2164     int gref, nref;                                     \
2165                                                         \
2166     gref = c - 0x20;                                    \
2167     if (gref < 0 || gref >= 81)                         \
2168       goto invalid_code;                                \
2169     ONE_MORE_BYTE (c);                                  \
2170     nref = c - 0x20;                                    \
2171     if (nref < 0 || nref >= 81)                         \
2172       goto invalid_code;                                \
2173     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2174   } while (0)
2175
2176
2177 /* Start of Emacs 21 style format.  On entry, C holds the method byte
2178    (0xF2 + METHOD).  The next two bytes at SRC are (0xA0 + BYTES) and
2179    (0xA0 + CHARS), where BYTES is the byte length of this composition
2180    information and CHARS is the number of characters composed by it.

        The header is rejected (goto invalid_code) if the BYTES byte is
        negative, if BYTES is less than 3, if the method is
        COMPOSITION_RELATIVE and BYTES is not 4, or if CHARS is outside
        1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise CMP_STATUS is set up
        for the new composition (waiting for a character for a relative
        composition, for a component character otherwise, with
        BYTES - 4 as the number of components) and the annotation header
        is added to CHARBUF with ADD_COMPOSITION_DATA.

        This macro uses the variables C, CMP_STATUS and CHARBUF of the
        expanding function.  */
2181
2182 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2183   do {                                                                  \
2184     enum composition_method method = c - 0xF2;                          \
2185     int nbytes, nchars;                                                 \
2186                                                                         \
2187     ONE_MORE_BYTE (c);                                                  \
2188     if (c < 0)                                                          \
2189       goto invalid_code;                                                \
2190     nbytes = c - 0xA0;                                                  \
2191     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2192       goto invalid_code;                                                \
2193     ONE_MORE_BYTE (c);                                                  \
2194     nchars = c - 0xA0;                                                  \
2195     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2196       goto invalid_code;                                                \
2197     cmp_status->old_form = 0;                                           \
2198     cmp_status->method = method;                                        \
2199     if (method == COMPOSITION_RELATIVE)                                 \
2200       cmp_status->state = COMPOSING_CHAR;                               \
2201     else                                                                \
2202       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2203     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2204     cmp_status->nchars = nchars;                                        \
2205     cmp_status->ncomps = nbytes - 4;                                    \
2206     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2207   } while (0)
2208
2209
2210 /* Start of Emacs 20 style format for relative composition.
        Mark CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
        waiting for a character, with no character or component counted
        yet, and add an annotation header whose NCHARS and NBYTES are 0
        to CHARBUF.  Uses the variables CMP_STATUS and CHARBUF of the
        expanding function.  */
2211
2212 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2213   do {                                                          \
2214     cmp_status->old_form = 1;                                   \
2215     cmp_status->method = COMPOSITION_RELATIVE;                  \
2216     cmp_status->state = COMPOSING_CHAR;                         \
2217     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2218     cmp_status->nchars = cmp_status->ncomps = 0;                \
2219     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2220   } while (0)
2221
2222
2223 /* Start of Emacs 20 style format for rule-base composition.
        Mark CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition
        waiting for a character, with no character or component counted
        yet, and add an annotation header whose NCHARS and NBYTES are 0
        to CHARBUF.  Uses the variables CMP_STATUS and CHARBUF of the
        expanding function.  */
2224
2225 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2226   do {                                                          \
2227     cmp_status->old_form = 1;                                   \
2228     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2229     cmp_status->state = COMPOSING_CHAR;                         \
2230     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2231     cmp_status->nchars = cmp_status->ncomps = 0;                \
2232     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2233   } while (0)
2234
2235
     /* Handle the byte that follows a composition starter 0x80.  The
        byte is read into C and selects the format:

          0xF2 + METHOD, for METHOD from COMPOSITION_RELATIVE to
            COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
          0xA0..0xBF: Emacs 20 style relative composition; the byte is
            the start of the first component, so SRC is moved back to
            read it again;
          0xFF: Emacs 20 style rule-base composition.

        Any other byte, and a negative value from ONE_MORE_BYTE, sends
        control to the label `invalid_code' of the expanding function.
        Uses the variables SRC and C of the expanding function, plus
        those needed by the macros it expands.  */
2236 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2237   do {                                                  \
2238     const unsigned char *current_src = src;             \
2239                                                         \
2240     ONE_MORE_BYTE (c);                                  \
2241     if (c < 0)                                          \
2242       goto invalid_code;                                \
2243     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2244         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2245       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2246     else if (c < 0xA0)                                  \
2247       goto invalid_code;                                \
2248     else if (c < 0xC0)                                  \
2249       {                                                 \
2250         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2251         /* Re-read C as a composition component.  */    \
2252         src = current_src;                              \
2253       }                                                 \
2254     else if (c == 0xFF)                                 \
2255       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2256     else                                                \
2257       goto invalid_code;                                \
2258   } while (0)
2259
     /* Finish the composition described by CMP_STATUS.  The annotation
        header starts CMP_STATUS->length elements before CHARBUF.  For an
        old-form composition, store the number of characters read so far
        in element 2 of the header.  Otherwise, if the method is beyond
        COMPOSITION_RELATIVE, set element 0 of the header to element 2
        minus CMP_STATUS->length.  In all cases, reset the state to
        COMPOSING_NO.  Uses the variables CMP_STATUS and CHARBUF of the
        expanding function.  */
2260 #define EMACS_MULE_COMPOSITION_END()                            \
2261   do {                                                          \
2262     int idx = - cmp_status->length;                             \
2263                                                                 \
2264     if (cmp_status->old_form)                                   \
2265       charbuf[idx + 2] = cmp_status->nchars;                    \
2266     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2267       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2268     cmp_status->state = COMPOSING_NO;                           \
2269   } while (0)
2270
2271
     /* Close the composition described by CMP_STATUS.  CHARBUF points
        just after the elements produced for it, so the annotation header
        starts CMP_STATUS->length elements before CHARBUF.

        An old-form composition that has read at least one character is
        kept: the number of characters is stored in element 2 of the
        header.  If it is rule-based and still in state COMPOSING_CHAR,
        the last two elements before CHARBUF are replaced by the rule
        byte (as an eight-bit character) and a -1 filler.

        In the other cases the header is overwritten with the original
        bytes of the composition header, as eight-bit characters.

        Reset the state to COMPOSING_NO.  Return a count of characters,
        which the macro EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to
        char_offset.  */
2272 static int
2273 emacs_mule_finish_composition (int *charbuf,
2274                                struct composition_status *cmp_status)
2275 {
       /* Negative index of the annotation header relative to CHARBUF.  */
2276   int idx = - cmp_status->length;
2277   int new_chars;
2278
2279   if (cmp_status->old_form && cmp_status->nchars > 0)
2280     {
2281       charbuf[idx + 2] = cmp_status->nchars;
2282       new_chars = 0;
2283       if (cmp_status->method == COMPOSITION_WITH_RULE
2284           && cmp_status->state == COMPOSING_CHAR)
2285         {
2286           /* The last rule was invalid.  */
               /* charbuf[-1] is taken as the rule byte minus 0xA0; the
                  byte is rebuilt and stored as an eight-bit character.  */
2287           int rule = charbuf[-1] + 0xA0;
2288
2289           charbuf[-2] = BYTE8_TO_CHAR (rule);
2290           charbuf[-1] = -1;
2291           new_chars = 1;
2292         }
2293     }
2294   else
2295     {
           /* Turn the header back into the bytes it was made from,
              beginning with the starter 0x80.  */
2296       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2297
2298       if (cmp_status->method == COMPOSITION_WITH_RULE)
2299         {
               /* NOTE(review): two eight-bit characters (0x80 and 0xFF)
                  are stored here but 1 is returned -- confirm that this
                  is intended.  */
2300           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2301           charbuf[idx++] = -3;
2302           charbuf[idx++] = 0;
2303           new_chars = 1;
2304         }
2305       else
2306         {
               /* IDX has already been advanced once, so these read
                  elements 2 and 3 of the header before they are
                  overwritten below.  */
2307           int nchars = charbuf[idx + 1] + 0xA0;
2308           int nbytes = charbuf[idx + 2] + 0xA0;
2309
2310           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2311           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2312           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2313           charbuf[idx++] = -1;
2314           new_chars = 4;
2315         }
2316     }
2317   cmp_status->state = COMPOSING_NO;
2318   return new_chars;
2319 }
2320
     /* If a composition is in progress, close it with
        emacs_mule_finish_composition and add the returned count to
        CHAR_OFFSET.  Uses the variables CMP_STATUS, CHARBUF and
        CHAR_OFFSET of the expanding function.  */
2321 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2322   do {                                                                    \
2323     if (cmp_status->state != COMPOSING_NO)                                \
2324       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2325   } while (0)
2326
2327
/* Decode the emacs-mule bytes of CODING->source, starting at offset
   CODING->consumed, into character codes and annotations appended to
   CODING->charbuf.  On exit CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated.  A composition that is still
   open when decoding stops is either finished (last block) or saved
   in the carryover buffer of the composition status so that the next
   call can resume it.  */
2328 static void
2329 decode_coding_emacs_mule (struct coding_system *coding)
2330 {
2331   const unsigned char *src = coding->source + coding->consumed;
2332   const unsigned char *src_end = coding->source + coding->src_bytes;
2333   const unsigned char *src_base;
2334   int *charbuf = coding->charbuf + coding->charbuf_used;
2335   /* We may produce two annotations (charset and composition) in one
2336      loop and one more charset annotation at the end.  */
2337   int *charbuf_end
2338     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2339       /* We can produce up to 2 characters in a loop.  */
2340       - 1;
2341   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2342   int multibytep = coding->src_multibyte;
2343   ptrdiff_t char_offset = coding->produced_char;
2344   ptrdiff_t last_offset = char_offset;
2345   int last_id = charset_ascii;
2346   int eol_dos =
2347     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2348   int byte_after_cr = -1;
2349   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2350
       /* A composition was still open when the previous call ended: put
          its saved elements back at the head of the output.  */
2351   if (cmp_status->state != COMPOSING_NO)
2352     {
2353       int i;
2354
2355       if (charbuf_end - charbuf < cmp_status->length)
2356         abort ();
2357       for (i = 0; i < cmp_status->length; i++)
2358         *charbuf++ = cmp_status->carryover[i];
2359       coding->annotated = 1;
2360     }
2361
2362   while (1)
2363     {
2364       int c, id IF_LINT (= 0);
2365
           /* Remember where this unit starts.  SRC_BASE is what is finally
              reported as consumed, and where invalid_code rewinds to.  */
2366       src_base = src;
2367       consumed_chars_base = consumed_chars;
2368
2369       if (charbuf >= charbuf_end)
2370         {
               /* Output is full.  Do not count as consumed a byte that was
                  read ahead after a CR but has not been processed yet.  */
2371           if (byte_after_cr >= 0)
2372             src_base--;
2373           break;
2374         }
2375
           /* A byte read ahead after a CR is used before any new input.  */
2376       if (byte_after_cr >= 0)
2377         c = byte_after_cr, byte_after_cr = -1;
2378       else
2379         ONE_MORE_BYTE (c);
2380
           /* Byte 0x80 starts a composition.  A negative C is stored
              negated as a character.  NOTE(review): ONE_MORE_BYTE is defined
              outside this chunk; a negative value presumably stands for a
              raw 8-bit byte found in multibyte text -- confirm.  */
2381       if (c < 0 || c == 0x80)
2382         {
2383           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2384           if (c < 0)
2385             {
2386               *charbuf++ = -c;
2387               char_offset++;
2388             }
2389           else
2390             DECODE_EMACS_MULE_COMPOSITION_START ();
2391           continue;
2392         }
2393
2394       if (c < 0x80)
2395         {
               /* With DOS end-of-line, fetch the byte after a CR right away;
                  it becomes C at the top of the next iteration.  */
2396           if (eol_dos && c == '\r')
2397             ONE_MORE_BYTE (byte_after_cr);
2398           id = charset_ascii;
2399           if (cmp_status->state != COMPOSING_NO)
2400             {
2401               if (cmp_status->old_form)
2402                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2403               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2404                 cmp_status->ncomps--;
2405             }
2406         }
2407       else
2408         {
2409           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2410           /* emacs_mule_char can load a charset map from a file, which
2411              allocates a large structure and might cause buffer text
2412              to be relocated as result.  Thus, we need to remember the
2413              original pointer to buffer text, and fix up all related
2414              pointers after the call.  */
2415           const unsigned char *orig = coding->source;
2416           ptrdiff_t offset;
2417
2418           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2419                                cmp_status);
2420           offset = coding->source - orig;
2421           if (offset)
2422             {
2423               src += offset;
2424               src_base += offset;
2425               src_end += offset;
2426             }
               /* -1 is treated as an invalid sequence and -2 stops the
                  loop; any other negative value falls through to the
                  composition handling below.  NOTE(review): -2 presumably
                  means that the source ended inside the sequence --
                  confirm against emacs_mule_char.  */
2427           if (c < 0)
2428             {
2429               if (c == -1)
2430                 goto invalid_code;
2431               if (c == -2)
2432                 break;
2433             }
2434           src = src_base + nbytes;
2435           consumed_chars = consumed_chars_base + nchars;
2436           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2437             cmp_status->ncomps -= nchars;
2438         }
2439
2440       /* Now if C >= 0, we found a normally encoded character, if C <
2441          0, we found an old-style composition component character or
2442          rule.  */
2443
2444       if (cmp_status->state == COMPOSING_NO)
2445         {
               /* Not composing.  When the charset changes, emit the pending
                  charset data for the previous non-ASCII run and start a
                  new run at the current character offset.  */
2446           if (last_id != id)
2447             {
2448               if (last_id != charset_ascii)
2449                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2450                                   last_id);
2451               last_id = id;
2452               last_offset = char_offset;
2453             }
2454           *charbuf++ = c;
2455           char_offset++;
2456         }
2457       else if (cmp_status->state == COMPOSING_CHAR)
2458         {
2459           if (cmp_status->old_form)
2460             {
                   /* Old form: a normally encoded character ends the
                      composition; a negated one is a component.  */
2461               if (c >= 0)
2462                 {
2463                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464                   *charbuf++ = c;
2465                   char_offset++;
2466                 }
2467               else
2468                 {
2469                   *charbuf++ = -c;
2470                   cmp_status->nchars++;
2471                   cmp_status->length++;
2472                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2473                     EMACS_MULE_COMPOSITION_END ();
2474                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2475                     cmp_status->state = COMPOSING_RULE;
2476                 }
2477             }
2478           else
2479             {
                   /* New form: nchars counts down the characters still
                      expected; the composition ends when it reaches 0.  */
2480               *charbuf++ = c;
2481               cmp_status->length++;
2482               cmp_status->nchars--;
2483               if (cmp_status->nchars == 0)
2484                 EMACS_MULE_COMPOSITION_END ();
2485             }
2486         }
2487       else if (cmp_status->state == COMPOSING_RULE)
2488         {
2489           int rule;
2490
2491           if (c >= 0)
2492             {
2493               EMACS_MULE_COMPOSITION_END ();
2494               *charbuf++ = c;
2495               char_offset++;
2496             }
2497           else
2498             {
                   /* A rule is stored as the pair -2, RULE.  */
2499               c = -c;
2500               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2501               if (rule < 0)
2502                 goto invalid_code;
2503               *charbuf++ = -2;
2504               *charbuf++ = rule;
2505               cmp_status->length += 2;
2506               cmp_status->state = COMPOSING_CHAR;
2507             }
2508         }
2509       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2510         {
2511           *charbuf++ = c;
2512           cmp_status->length++;
2513           if (cmp_status->ncomps == 0)
2514             cmp_status->state = COMPOSING_CHAR;
2515           else if (cmp_status->ncomps > 0)
2516             {
2517               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2518                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2519             }
2520           else
2521             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2522         }
2523       else                      /* COMPOSING_COMPONENT_RULE */
2524         {
2525           int rule;
2526
2527           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2528           if (rule < 0)
2529             goto invalid_code;
2530           *charbuf++ = -2;
2531           *charbuf++ = rule;
2532           cmp_status->length += 2;
2533           cmp_status->ncomps--;
2534           if (cmp_status->ncomps > 0)
2535             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2536           else
2537             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2538         }
2539       continue;
2540
2541     invalid_code:
           /* Give up on the current unit: finish any open composition,
              rewind to the start of the unit, and emit its first byte
              as is (ASCII) or as a raw-byte character, counting one
              error.  */
2542       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543       src = src_base;
2544       consumed_chars = consumed_chars_base;
2545       ONE_MORE_BYTE (c);
2546       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2547       char_offset++;
2548       coding->errors++;
2549     }
2550
       /* Reached when the loop above breaks.  NOTE(review): presumably
          also the jump target of ONE_MORE_BYTE when the source is
          exhausted -- confirm against the macro definition.  */
2551  no_more_source:
2552   if (cmp_status->state != COMPOSING_NO)
2553     {
2554       if (coding->mode & CODING_MODE_LAST_BLOCK)
2555         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2556       else
2557         {
2558           int i;
2559
               /* More input may follow: take the unfinished composition
                  back out of CHARBUF and save it for the next call.  */
2560           charbuf -= cmp_status->length;
2561           for (i = 0; i < cmp_status->length; i++)
2562             cmp_status->carryover[i] = charbuf[i];
2563         }
2564     }
       /* Emit the charset data of the non-ASCII run that is still open.  */
2565   if (last_id != charset_ascii)
2566     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2567   coding->consumed_char += consumed_chars_base;
2568   coding->consumed = src_base - coding->source;
2569   coding->charbuf_used = charbuf - coding->charbuf;
2570 }
2571
2572
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that emacs-mule uses for the charset whose emacs-mule ID is ID.
   An ID below 0xA0 is its own single leading code, and CODES[1] is
   set to 0 to say that there is no second code.  Any other ID is
   stored in CODES[1] behind a prefix byte in CODES[0]: 0x9A for
   0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4 and 0x9D for
   everything above.

   The expansion deliberately has no trailing semicolon, so that
   `EMACS_MULE_LEADING_CODES (id, codes);' is exactly one statement
   and can be used in an unbraced if/else.  ID and CODES are
   evaluated more than once and must be free of side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2586
2587
/* Encode the character codes in CODING->charbuf (CODING->charbuf_used
   elements) as emacs-mule bytes appended to CODING->destination at
   offset CODING->produced.  Negative elements start annotations,
   which are consumed without producing any output.  When the whole
   buffer has been processed, record CODING_RESULT_SUCCESS, update
   CODING->produced_char and CODING->produced, and return 0.
   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   outside this chunk; they presumably advance DST and PRODUCED_CHARS
   and may leave the function early when the destination is full --
   confirm against their definitions.  */
2588 static int
2589 encode_coding_emacs_mule (struct coding_system *coding)
2590 {
2591   int multibytep = coding->dst_multibyte;
2592   int *charbuf = coding->charbuf;
2593   int *charbuf_end = charbuf + coding->charbuf_used;
2594   unsigned char *dst = coding->destination + coding->produced;
2595   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2596   int safe_room = 8;
2597   ptrdiff_t produced_chars = 0;
2598   Lisp_Object attrs, charset_list;
2599   int c;
2600   int preferred_charset_id = -1;
2601
2602   CODING_GET_INFO (coding, attrs, charset_list);
       /* Make the coding system use the emacs-mule charset list, both
          in its attributes and in the local copy used below.  */
2603   if (! EQ (charset_list, Vemacs_mule_charset_list))
2604     {
2605       CODING_ATTR_CHARSET_LIST (attrs)
2606         = charset_list = Vemacs_mule_charset_list;
2607     }
2608
2609   while (charbuf < charbuf_end)
2610     {
2611       ASSURE_DESTINATION (safe_room);
2612       c = *charbuf++;
2613
2614       if (c < 0)
2615         {
2616           /* Handle an annotation.  */
               /* C is the negated length of the annotation, counting C
                  itself; the element after it tells the annotation kind.  */
2617           switch (*charbuf)
2618             {
2619             case CODING_ANNOTATE_COMPOSITION_MASK:
2620               /* Not yet implemented.  */
2621               break;
2622             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Remember the annotated charset as the preferred one,
                      but only if it belongs to CHARSET_LIST.  */
2623               preferred_charset_id = charbuf[3];
2624               if (preferred_charset_id >= 0
2625                   && NILP (Fmemq (make_number (preferred_charset_id),
2626                                   charset_list)))
2627                 preferred_charset_id = -1;
2628               break;
2629             default:
2630               abort ();
2631             }
               /* Skip the rest of the annotation.  */
2632           charbuf += -c - 1;
2633           continue;
2634         }
2635
2636       if (ASCII_CHAR_P (c))
2637         EMIT_ONE_ASCII_BYTE (c);
2638       else if (CHAR_BYTE8_P (c))
2639         {
2640           c = CHAR_TO_BYTE8 (c);
2641           EMIT_ONE_BYTE (c);
2642         }
2643       else
2644         {
2645           struct charset *charset;
2646           unsigned code;
2647           int dimension;
2648           int emacs_mule_id;
2649           unsigned char leading_codes[2];
2650
               /* Try the preferred charset first, then fall back to
                  searching CHARSET_LIST.  */
2651           if (preferred_charset_id >= 0)
2652             {
2653               charset = CHARSET_FROM_ID (preferred_charset_id);
2654               if (CHAR_CHARSET_P (c, charset))
2655                 code = ENCODE_CHAR (charset, c);
2656               else
2657                 charset = char_charset (c, charset_list, &code);
2658             }
2659           else
2660             charset = char_charset (c, charset_list, &code);
               /* No charset can encode C: encode the coding system's
                  default character instead.  NOTE(review): the result of
                  the second char_charset call is used without a null
                  check -- confirm that default_char is always
                  encodable.  */
2661           if (! charset)
2662             {
2663               c = coding->default_char;
2664               if (ASCII_CHAR_P (c))
2665                 {
2666                   EMIT_ONE_ASCII_BYTE (c);
2667                   continue;
2668                 }
2669               charset = char_charset (c, charset_list, &code);
2670             }
2671           dimension = CHARSET_DIMENSION (charset);
2672           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
               /* Emit one or two leading codes, then the code point with
                  the high bit of each of its bytes set.  */
2674           EMIT_ONE_BYTE (leading_codes[0]);
2675           if (leading_codes[1])
2676             EMIT_ONE_BYTE (leading_codes[1]);
2677           if (dimension == 1)
2678             EMIT_ONE_BYTE (code | 0x80);
2679           else
2680             {
2681               code |= 0x8080;
2682               EMIT_ONE_BYTE (code >> 8);
2683               EMIT_ONE_BYTE (code & 0xFF);
2684             }
2685         }
2686     }
2687   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688   coding->produced_char += produced_chars;
2689   coding->produced = dst - coding->destination;
2690   return 0;
2691 }
2692
2693 \f
2694 /*** 7. ISO2022 handlers ***/
2695
2696 /* The following note describes the coding system ISO2022 briefly.
2697    Since the intention of this note is to help understand the
2698    functions in this file, some parts are NOT ACCURATE or are OVERLY
2699    SIMPLIFIED.  For thorough understanding, please refer to the
2700    original document of ISO2022.  This is equivalent to the standard
2701    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2702
2703    ISO2022 provides many mechanisms to encode several character sets
2704    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2705    is encoded using bytes less than 128.  This may make the encoded
2706    text a little bit longer, but the text passes more easily through
2707    several types of gateway, some of which strip off the MSB (Most
2708    Significant Bit).
2709
2710    There are two kinds of character sets: control character sets and
2711    graphic character sets.  The former contain control characters such
2712    as `newline' and `escape' to provide control functions (control
2713    functions are also provided by escape sequences).  The latter
2714    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2715    two control character sets and many graphic character sets.
2716
2717    Graphic character sets are classified into one of the following
2718    four classes, according to the number of bytes (DIMENSION) and
2719    number of characters in one dimension (CHARS) of the set:
2720    - DIMENSION1_CHARS94
2721    - DIMENSION1_CHARS96
2722    - DIMENSION2_CHARS94
2723    - DIMENSION2_CHARS96
2724
2725    In addition, each character set is assigned an identification tag,
2726    unique for each set, called the "final character" (denoted as <F>
2727    hereafter).  The <F> of each character set is decided by ECMA(*)
2728    when it is registered in ISO.  The code range of <F> is 0x30..0x7E
2729    (0x30..0x3F are for private use only).
2730
2731    Note (*): ECMA = European Computer Manufacturers Association
2732
2733    Here are examples of graphic character sets [NAME(<F>)]:
2734         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737         o DIMENSION2_CHARS96 -- none for the moment
2738
2739    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740         C0 [0x00..0x1F] -- control character plane 0
2741         GL [0x20..0x7F] -- graphic character plane 0
2742         C1 [0x80..0x9F] -- control character plane 1
2743         GR [0xA0..0xFF] -- graphic character plane 1
2744
2745    A control character set is directly designated and invoked to C0 or
2746    C1 by an escape sequence.  The most common case is that:
2747    - ISO646's  control character set is designated/invoked to C0, and
2748    - ISO6429's control character set is designated/invoked to C1,
2749    and usually these designations/invocations are omitted in encoded
2750    text.  In a 7-bit environment, only C0 can be used, and a control
2751    character for C1 is encoded by an appropriate escape sequence to
2752    fit into the environment.  All control characters for C1 are
2753    defined to have corresponding escape sequences.
2754
2755    A graphic character set is at first designated to one of four
2756    graphic registers (G0 through G3), then these graphic registers are
2757    invoked to GL or GR.  These designations and invocations can be
2758    done independently.  The most common case is that G0 is invoked to
2759    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2760    these invocations and designations are omitted in encoded text.
2761    In a 7-bit environment, only GL can be used.
2762
2763    When a graphic character set of CHARS94 is invoked to GL, codes
2764    0x20 and 0x7F of the GL area work as control characters SPACE and
2765    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766    be used.
2767
2768    There are two ways of invocation: locking-shift and single-shift.
2769    With locking-shift, the invocation lasts until the next different
2770    invocation, whereas with single-shift, the invocation affects the
2771    following character only and doesn't affect the locking-shift
2772    state.  Invocations are done by the following control characters or
2773    escape sequences:
2774
2775    ----------------------------------------------------------------------
2776    abbrev  function                  cntrl escape seq   description
2777    ----------------------------------------------------------------------
2778    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2779    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2780    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2781    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2782    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2783    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2784    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2785    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2786    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2787    ----------------------------------------------------------------------
2788    (*) These are not used by any known coding system.
2789
2790    Control characters for these functions are defined by macros
2791    ISO_CODE_XXX in `coding.h'.
2792
2793    Designations are done by the following escape sequences:
2794    ----------------------------------------------------------------------
2795    escape sequence      description
2796    ----------------------------------------------------------------------
2797    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2798    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2799    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2800    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2801    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2802    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2803    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2804    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2805    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2806    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2807    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2808    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2809    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2810    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2811    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2812    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2813    ----------------------------------------------------------------------
2814
2815    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816    of dimension 1, chars 94, and final character <F>, etc...
2817
2818    Note (*): Although these designations are not allowed in ISO2022,
2819    Emacs accepts them on decoding, and produces them on encoding
2820    CHARS96 character sets in a coding system which is characterized as
2821    7-bit environment, non-locking-shift, and non-single-shift.
2822
2823    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824    '(' must be omitted.  We refer to this as "short-form" hereafter.
2825
2826    Now you may notice that there are a lot of ways of encoding the
2827    same multilingual text in ISO2022.  Actually, there exist many
2828    coding systems such as Compound Text (used in X11's inter client
2829    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831    localized platforms), and all of these are variants of ISO2022.
2832
2833    In addition to the above, Emacs handles two more kinds of escape
2834    sequences: ISO6429's direction specification and Emacs' private
2835    sequence for specifying character composition.
2836
2837    ISO6429's direction specification takes the following form:
2838         o CSI ']'      -- end of the current direction
2839         o CSI '0' ']'  -- end of the current direction
2840         o CSI '1' ']'  -- start of left-to-right text
2841         o CSI '2' ']'  -- start of right-to-left text
2842    The control character CSI (0x9B: control sequence introducer) is
2843    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2844
2845    Character composition specification takes the following form:
2846         o ESC '0' -- start relative composition
2847         o ESC '1' -- end composition
2848         o ESC '2' -- start rule-base composition (*)
2849         o ESC '3' -- start relative composition with alternate chars  (**)
2850         o ESC '4' -- start rule-base composition with alternate chars  (**)
2851   Since these are not standard escape sequences of any ISO standard,
2852   the use of them with these meanings is restricted to Emacs only.
2853
2854   (*) This form is used only in Emacs 20.7 and older versions,
2855   but newer versions can safely decode it.
2856   (**) This form is used only in Emacs 21.1 and newer versions,
2857   and older versions can't decode it.
2858
2859   Here's a list of example usages of these composition escape
2860   sequences (categorized by `enum composition_method').
2861
2862   COMPOSITION_RELATIVE:
2863         ESC 0 CHAR [ CHAR ] ESC 1
2864   COMPOSITION_WITH_RULE:
2865         ESC 2 CHAR [ RULE CHAR ] ESC 1
2866   COMPOSITION_WITH_ALTCHARS:
2867         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868   COMPOSITION_WITH_RULE_ALTCHARS:
2869         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2870
/* File-local table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value and filled in
   elsewhere in this file -- confirm against its initialization.  */
2871 static enum iso_code_class_type iso_code_class[256];
2872
/* Return nonzero if the charset whose id is ID is usable with CODING:
   ID must lie in 0..CODING->max_charset_id and its byte in
   CODING->safe_charsets must not be 255, the value that marks a
   charset as not supported.  A negative ID is rejected before the
   table is touched, so that an id taken from a lookup table that can
   hold negative "no charset" values never indexes safe_charsets out
   of bounds.  CODING and ID are evaluated more than once and must be
   free of side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2876
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2879 {
2880   Lisp_Object charset_list, safe_charsets;
2881   Lisp_Object request;
2882   Lisp_Object reg_usage;
2883   Lisp_Object tail;
2884   EMACS_INT reg94, reg96;
2885   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886   int max_charset_id;
2887
2888   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2889   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890       && ! EQ (charset_list, Viso_2022_charset_list))
2891     {
2892       CODING_ATTR_CHARSET_LIST (attrs)
2893         = charset_list = Viso_2022_charset_list;
2894       ASET (attrs, coding_attr_safe_charsets, Qnil);
2895     }
2896
2897   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898     return;
2899
2900   max_charset_id = 0;
2901   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2902     {
2903       int id = XINT (XCAR (tail));
2904       if (max_charset_id < id)
2905         max_charset_id = id;
2906     }
2907
2908   safe_charsets = make_uninit_string (max_charset_id + 1);
2909   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910   request = AREF (attrs, coding_attr_iso_request);
2911   reg_usage = AREF (attrs, coding_attr_iso_usage);
2912   reg94 = XINT (XCAR (reg_usage));
2913   reg96 = XINT (XCDR (reg_usage));
2914
2915   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2916     {
2917       Lisp_Object id;
2918       Lisp_Object reg;
2919       struct charset *charset;
2920
2921       id = XCAR (tail);
2922       charset = CHARSET_FROM_ID (XINT (id));
2923       reg = Fcdr (Fassq (id, request));
2924       if (! NILP (reg))
2925         SSET (safe_charsets, XINT (id), XINT (reg));
2926       else if (charset->iso_chars_96)
2927         {
2928           if (reg96 < 4)
2929             SSET (safe_charsets, XINT (id), reg96);
2930         }
2931       else
2932         {
2933           if (reg94 < 4)
2934             SSET (safe_charsets, XINT (id), reg94);
2935         }
2936     }
2937   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2938 }
2939
2940
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942    Check if a text is encoded in one of ISO-2022 based coding systems.
2943    If it is, return 1, else return 0.  */
2944
2945 static int
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947                         struct coding_detection_info *detect_info)
2948 {
2949   const unsigned char *src = coding->source, *src_base = src;
2950   const unsigned char *src_end = coding->source + coding->src_bytes;
2951   int multibytep = coding->src_multibyte;
2952   int single_shifting = 0;
2953   int id;
2954   int c, c1;
2955   ptrdiff_t consumed_chars = 0;
2956   int i;
2957   int rejected = 0;
2958   int found = 0;
2959   int composition_count = -1;
2960
2961   detect_info->checked |= CATEGORY_MASK_ISO;
2962
2963   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2964     {
2965       struct coding_system *this = &(coding_categories[i]);
2966       Lisp_Object attrs, val;
2967
2968       if (this->id < 0)
2969         continue;
2970       attrs = CODING_ID_ATTRS (this->id);
2971       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973         setup_iso_safe_charsets (attrs);
2974       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975       this->max_charset_id = SCHARS (val) - 1;
2976       this->safe_charsets = SDATA (val);
2977     }
2978
2979   /* A coding system of this category is always ASCII compatible.  */
2980   src += coding->head_ascii;
2981
2982   while (rejected != CATEGORY_MASK_ISO)
2983     {
2984       src_base = src;
2985       ONE_MORE_BYTE (c);
2986       switch (c)
2987         {
2988         case ISO_CODE_ESC:
2989           if (inhibit_iso_escape_detection)
2990             break;
2991           single_shifting = 0;
2992           ONE_MORE_BYTE (c);
2993           if (c == 'N' || c == 'O')
2994             {
2995               /* ESC <Fe> for SS2 or SS3.  */
2996               single_shifting = 1;
2997               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2998             }
2999           else if (c == '1')
3000             {
3001               /* End of composition.  */
3002               if (composition_count < 0
3003                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3004                 /* Invalid */
3005                 break;
3006               composition_count = -1;
3007               found |= CATEGORY_MASK_ISO;
3008             }
3009           else if (c >= '0' && c <= '4')
3010             {
3011               /* ESC <Fp> for start/end composition.  */
3012               composition_count = 0;
3013             }
3014           else
3015             {
3016               if (c >= '(' && c <= '/')
3017                 {
3018                   /* Designation sequence for a charset of dimension 1.  */
3019                   ONE_MORE_BYTE (c1);
3020                   if (c1 < ' ' || c1 >= 0x80
3021                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022                     /* Invalid designation sequence.  Just ignore.  */
3023                     break;
3024                 }
3025               else if (c == '$')
3026                 {
3027                   /* Designation sequence for a charset of dimension 2.  */
3028                   ONE_MORE_BYTE (c);
3029                   if (c >= '@' && c <= 'B')
3030                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3031                     id = iso_charset_table[1][0][c];
3032                   else if (c >= '(' && c <= '/')
3033                     {
3034                       ONE_MORE_BYTE (c1);
3035                       if (c1 < ' ' || c1 >= 0x80
3036                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037                         /* Invalid designation sequence.  Just ignore.  */
3038                         break;
3039                     }
3040                   else
3041                     /* Invalid designation sequence.  Just ignore it.  */
3042                     break;
3043                 }
3044               else
3045                 {
3046                   /* Invalid escape sequence.  Just ignore it.  */
3047                   break;
3048                 }
3049
3050               /* We found a valid designation sequence for CHARSET.  */
3051               rejected |= CATEGORY_MASK_ISO_8BIT;
3052               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053                                   id))
3054                 found |= CATEGORY_MASK_ISO_7;
3055               else
3056                 rejected |= CATEGORY_MASK_ISO_7;
3057               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058                                   id))
3059                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060               else
3061                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063                                   id))
3064                 found |= CATEGORY_MASK_ISO_7_ELSE;
3065               else
3066                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068                                   id))
3069                 found |= CATEGORY_MASK_ISO_8_ELSE;
3070               else
3071                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3072             }
3073           break;
3074
3075         case ISO_CODE_SO:
3076         case ISO_CODE_SI:
3077           /* Locking shift out/in.  */
3078           if (inhibit_iso_escape_detection)
3079             break;
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082           break;
3083
3084         case ISO_CODE_CSI:
3085           /* Control sequence introducer.  */
3086           single_shifting = 0;
3087           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088           found |= CATEGORY_MASK_ISO_8_ELSE;
3089           goto check_extra_latin;
3090
3091         case ISO_CODE_SS2:
3092         case ISO_CODE_SS3:
3093           /* Single shift.   */
3094           if (inhibit_iso_escape_detection)
3095             break;
3096           single_shifting = 0;
3097           rejected |= CATEGORY_MASK_ISO_7BIT;
3098           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099               & CODING_ISO_FLAG_SINGLE_SHIFT)
3100             {
3101               found |= CATEGORY_MASK_ISO_8_1;
3102               single_shifting = 1;
3103             }
3104           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105               & CODING_ISO_FLAG_SINGLE_SHIFT)
3106             {
3107               found |= CATEGORY_MASK_ISO_8_2;
3108               single_shifting = 1;
3109             }
3110           if (single_shifting)
3111             break;
3112         check_extra_latin:
3113           if (! VECTORP (Vlatin_extra_code_table)
3114               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3115             {
3116               rejected = CATEGORY_MASK_ISO;
3117               break;
3118             }
3119           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3120               & CODING_ISO_FLAG_LATIN_EXTRA)
3121             found |= CATEGORY_MASK_ISO_8_1;
3122           else
3123             rejected |= CATEGORY_MASK_ISO_8_1;
3124           rejected |= CATEGORY_MASK_ISO_8_2;
3125           break;
3126
3127         default:
3128           if (c < 0)
3129             continue;
3130           if (c < 0x80)
3131             {
3132               if (composition_count >= 0)
3133                 composition_count++;
3134               single_shifting = 0;
3135               break;
3136             }
3137           if (c >= 0xA0)
3138             {
3139               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3140               found |= CATEGORY_MASK_ISO_8_1;
3141               /* Check the length of succeeding codes of the range
3142                  0xA0..0xFF.  If the byte length is even, we include
3143                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3144                  only when we are not single shifting.  */
3145               if (! single_shifting
3146                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3147                 {
3148                   int len = 1;
3149                   while (src < src_end)
3150                     {
3151                       src_base = src;
3152                       ONE_MORE_BYTE (c);
3153                       if (c < 0xA0)
3154                         {
3155                           src = src_base;
3156                           break;
3157                         }
3158                       len++;
3159                     }
3160
3161                   if (len & 1 && src < src_end)
3162                     {
3163                       rejected |= CATEGORY_MASK_ISO_8_2;
3164                       if (composition_count >= 0)
3165                         composition_count += len;
3166                     }
3167                   else
3168                     {
3169                       found |= CATEGORY_MASK_ISO_8_2;
3170                       if (composition_count >= 0)
3171                         composition_count += len / 2;
3172                     }
3173                 }
3174               break;
3175             }
3176         }
3177     }
3178   detect_info->rejected |= CATEGORY_MASK_ISO;
3179   return 0;
3180
3181  no_more_source:
3182   detect_info->rejected |= rejected;
3183   detect_info->found |= (found & ~rejected);
3184   return 1;
3185 }
3186
3187
3188 /* Set designation state into CODING: designate the charset selected
3189    by DIM, CHARS_96 and FINAL to graphic register REG.

        The macro uses the variable `coding' of the expansion site.
        CHARS_96 must be an lvalue; it is set to -1 if the escape
        sequence should be kept, which happens in two cases:

        o FINAL is below '0' or not below 128, ISO_CHARSET_TABLE gives
          a negative id, or SAFE_CHARSET_P is false for the id.  The
          designation of REG is then set to -2 and the macro stops
          there (the `break' only leaves the macro's own do-while).
        o The designation of REG was -2 before, and the charset now
          designated is charset_ascii.

        Before the id is stored, charset_jisx0201_roman is replaced by
        charset_ascii if CODING_ISO_FLAG_USE_ROMAN is set, and
        charset_jisx0208_1978 by charset_jisx0208 if
        CODING_ISO_FLAG_USE_OLDJIS is set.  */
3190 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3191   do {                                                                  \
3192     int id, prev;                                                       \
3193                                                                         \
3194     if (final < '0' || final >= 128                                     \
3195         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3196         || !SAFE_CHARSET_P (coding, id))                                \
3197       {                                                                 \
3198         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3199         chars_96 = -1;                                                  \
3200         break;                                                          \
3201       }                                                                 \
3202     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3203     if (id == charset_jisx0201_roman)                                   \
3204       {                                                                 \
3205         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3206           id = charset_ascii;                                           \
3207       }                                                                 \
3208     else if (id == charset_jisx0208_1978)                               \
3209       {                                                                 \
3210         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3211           id = charset_jisx0208;                                        \
3212       }                                                                 \
3213     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3214     /* If there was an invalid designation to REG previously, and this  \
3215        designation is ASCII to REG, we should keep this designation     \
3216        sequence.  */                                                    \
3217     if (prev == -2 && id == charset_ascii)                              \
3218       chars_96 = -1;                                                    \
3219   } while (0)
3220
3221
3222 /* Handle these composition sequences (ALT: alternate char):
3223
3224    (1) relative composition: ESC 0 CHAR ... ESC 1
3225    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3226    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3227    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3228
3229    When the start sequence (ESC 0/2/3/4) is found, this annotation
3230    header is produced.
3231
3232         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3233
3234    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3235    produced until the end sequence (ESC 1) is found:
3236
3237    (1) CHAR ... CHAR
3238    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3239    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3240    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3241
3242    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3243    annotation header are updated as below:
3244
3245    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3246    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3247    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3248    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3249
3250    If an error is found while composing, the annotation header is
3251    changed to:
3252
3253         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3254
3255    and the sequence [ -2 DECODED-RULE ] is changed to the original
3256    byte sequence as below:
3257         o the original byte sequence is B: [ B -1 ]
3258         o the original byte sequence is B1 B2: [ B1 B2 ]
3259    and the sequence [ -1 -1 ] is changed to the original byte
3260    sequence:
3261         [ ESC '0' ]
3262 */
3263
3264 /* Decode a composition rule whose first byte is C1, reading one more
3265    byte from the source if needed, and set RULE to the encoded
3266    composition rule.  If the rule is invalid, goto invalid_code.

        `c1' and the label `invalid_code' are those of the expansion
        site; RULE must be an lvalue.  The value C1 - 32 selects the
        format:

        o below 0: invalid.
        o 0..80 (old format): the value is GREF * 9 + NREF, and a GREF
          or NREF of 4 is replaced by 10 before the rule is encoded.
        o 81 or more (new format): GREF is that value minus 81, and
          NREF is the next source byte minus 32.  The pair is checked
          with COMPOSITION_ENCODE_RULE_VALID, and 0x100 is added to
          the encoded rule to distinguish it from the old format.  */
3267
3268 #define DECODE_COMPOSITION_RULE(rule)                                   \
3269   do {                                                                  \
3270     rule = c1 - 32;                                                     \
3271     if (rule < 0)                                                       \
3272       goto invalid_code;                                                \
3273     if (rule < 81)              /* old format (before ver.21) */        \
3274       {                                                                 \
3275         int gref = (rule) / 9;                                          \
3276         int nref = (rule) % 9;                                          \
3277         if (gref == 4) gref = 10;                                       \
3278         if (nref == 4) nref = 10;                                       \
3279         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3280       }                                                                 \
3281     else                        /* new format (after ver.21) */         \
3282       {                                                                 \
3283         int b;                                                          \
3284                                                                         \
3285         ONE_MORE_BYTE (b);                                              \
3286         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3287           goto invalid_code;                                            \
3288         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3289         rule += 0x100;   /* Distinguish it from the old format.  */     \
3290       }                                                                 \
3291   } while (0)
3292
     /* Turn the decoded composition rule RULE back into the code(s) it
        was read from, and store them in charbuf[idx] and
        charbuf[idx + 1].  `charbuf', `idx' and `new_chars' are those of
        the expansion site.

        GREF and NREF are (RULE % 0x100) / 12 and (RULE % 0x100) % 12.

        o RULE < 0x100 (old format): store the single code
          32 + GREF * 9 + NREF, with a GREF or NREF of 10 replaced by 4,
          followed by -1, and add 1 to new_chars.
        o otherwise (new format): store the two codes 32 + 81 + GREF and
          32 + NREF, and add 2 to new_chars.

        RULE is evaluated several times, but always before the two
        stores, so it may name charbuf[idx + 1] itself.  */
3293 #define ENCODE_COMPOSITION_RULE(rule)                           \
3294   do {                                                          \
3295     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3296                                                                 \
3297     if (rule < 0x100)           /* old format */                \
3298       {                                                         \
3299         if (gref == 10) gref = 4;                               \
3300         if (nref == 10) nref = 4;                               \
3301         charbuf[idx] = 32 + gref * 9 + nref;                    \
3302         charbuf[idx + 1] = -1;                                  \
3303         new_chars++;                                            \
3304       }                                                         \
3305     else                                /* new format */        \
3306       {                                                         \
3307         charbuf[idx] = 32 + 81 + gref;                          \
3308         charbuf[idx + 1] = 32 + nref;                           \
3309         new_chars += 2;                                         \
3310       }                                                         \
3311   } while (0)
3312
3313 /* Finish the current composition as invalid.

        The codes of the composition being built are
        charbuf[-cmp_status->length] .. charbuf[-1], beginning with the
        annotation header.  They are overwritten in place:

        o the first five elements become ISO_CODE_ESC, the digit for
          cmp_status->method ('0', '2', '3' or '4'), -2, 0 and -1;
        o if cmp_status->method >= COMPOSITION_WITH_RULE, each
          [ -2 RULE ] pair in the rest is rewritten by
          ENCODE_COMPOSITION_RULE, and each [ -1 -1 ] pair becomes
          [ ISO_CODE_ESC '0' ].

        cmp_status->state is set to COMPOSING_NO.  The value returned is
        cmp_status->nchars, plus 1 or 2 for each rewritten rule (old or
        new format), plus 2 for each rewritten [ -1 -1 ] pair.  */
3314
3315 static int finish_composition (int *, struct composition_status *);
3316
3317 static int
3318 finish_composition (int *charbuf, struct composition_status *cmp_status)
3319 {
3320   int idx = - cmp_status->length;	/* index of the annotation header */
3321   int new_chars;
3322
3323   /* Recover the original ESC sequence */
3324   charbuf[idx++] = ISO_CODE_ESC;
3325   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3326                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3327                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3328                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3329                     : '4');
       /* Fill the other three header elements.  NOTE(review): -2 0 and
          -1 presumably read as empty annotations downstream -- confirm
          against the code that consumes charbuf.  */
3330   charbuf[idx++] = -2;
3331   charbuf[idx++] = 0;
3332   charbuf[idx++] = -1;
3333   new_chars = cmp_status->nchars;
       /* Only these methods are scanned for [ -2 RULE ] and [ -1 -1 ]
          pairs.  */
3334   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3335     for (; idx < 0; idx++)
3336       {
3337         int elt = charbuf[idx];
3338
3339         if (elt == -2)
3340           {
                 /* The macro refers to `charbuf', `idx' and `new_chars'
                    by name and rewrites both elements of the pair; step
                    over the second one.  */
3341             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3342             idx++;
3343           }
3344         else if (elt == -1)
3345           {
                 /* [ -1 -1 ] stands for the ESC 0 that ended the ALT part.  */
3346             charbuf[idx++] = ISO_CODE_ESC;
3347             charbuf[idx] = '0';
3348             new_chars += 2;
3349           }
3350       }
3351   cmp_status->state = COMPOSING_NO;
3352   return new_chars;
3353 }
3354
3355 /* If characters are under composition, i.e. cmp_status->state is
        not COMPOSING_NO, finish the composition by calling
        finish_composition, and add the value it returns to char_offset.
        `cmp_status', `charbuf' and `char_offset' are those of the
        expansion site.  */
3356 #define MAYBE_FINISH_COMPOSITION()                              \
3357   do {                                                          \
3358     if (cmp_status->state != COMPOSING_NO)                      \
3359       char_offset += finish_composition (charbuf, cmp_status);  \
3360   } while (0)
3361
3362 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3363    C1 is the byte that followed ESC.
3364    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3365    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3366    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3367    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3368
3369    If C1 is '0' while the components of an altchar composition are
        being read (state COMPOSING_COMPONENT_CHAR), or while a rule of
        an alt&rule composition is expected (state
        COMPOSING_COMPONENT_RULE), this ESC 0 only ends the component
        part: store [ -1 -1 ], add 2 to cmp_status->length, and change
        the state to COMPOSING_CHAR.

        Otherwise, finish any composition in progress with
        MAYBE_FINISH_COMPOSITION, set cmp_status->method from C1, set the
        state to COMPOSING_CHAR for ESC 0 and ESC 2 and to
        COMPOSING_COMPONENT_CHAR for ESC 3 and ESC 4, reset nchars and
        ncomps to 0, set cmp_status->length to MAX_ANNOTATION_LENGTH,
        set coding->annotated, and produce this annotation sequence now:
3370
3371    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the header is written by ADD_COMPOSITION_DATA,
        which is defined elsewhere; the five-element form is the one
        finish_composition overwrites -- confirm against that macro.

        `cmp_status', `charbuf', `char_offset' and `coding' are those of
        the expansion site.
3372 */
3373
3374 #define DECODE_COMPOSITION_START(c1)                                       \
3375   do {                                                                     \
3376     if (c1 == '0'                                                          \
3377         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3378              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3379             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3380                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3381       {                                                                    \
3382         *charbuf++ = -1;                                                   \
3383         *charbuf++= -1;                                                    \
3384         cmp_status->state = COMPOSING_CHAR;                                \
3385         cmp_status->length += 2;                                           \
3386       }                                                                    \
3387     else                                                                   \
3388       {                                                                    \
3389         MAYBE_FINISH_COMPOSITION ();                                       \
3390         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3391                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3392                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3393                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3394         cmp_status->state                                                  \
3395           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3396         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3397         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3398         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3399         coding->annotated = 1;                                             \
3400       }                                                                    \
3401   } while (0)
3402
3403
3404 /* Handle composition end sequence ESC 1.

        The sequence is invalid if no composed char has been stored
        (cmp_status->nchars == 0), if the method is
        COMPOSITION_WITH_RULE and the state is COMPOSING_CHAR, or if
        the method is any other one and the state is not
        COMPOSING_CHAR.  In that case finish the composition with
        MAYBE_FINISH_COMPOSITION and goto invalid_code.

        Otherwise update the annotation header at
        charbuf[-cmp_status->length]: its first element, which holds
        -LENGTH, is decreased by ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS and by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, and its third element is set to
        nchars.  Then add nchars to char_offset and set the state to
        COMPOSING_NO.

        `cmp_status', `charbuf', `char_offset' and the label
        `invalid_code' are those of the expansion site.  */
3405
3406 #define DECODE_COMPOSITION_END()                                        \
3407   do {                                                                  \
3408     if (cmp_status->nchars == 0                                         \
3409         || ((cmp_status->state == COMPOSING_CHAR)                       \
3410             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3411       {                                                                 \
3412         MAYBE_FINISH_COMPOSITION ();                                    \
3413         goto invalid_code;                                              \
3414       }                                                                 \
3415     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3416       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3417     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3418       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3419     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3420     char_offset += cmp_status->nchars;                                  \
3421     cmp_status->state = COMPOSING_NO;                                   \
3422   } while (0)
3423
3424 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], and
        cmp_status->length grows by 2.  cmp_status->state is
        decremented.  NOTE(review): this presumably steps from
        COMPOSING_RULE to COMPOSING_CHAR and from
        COMPOSING_COMPONENT_RULE to COMPOSING_COMPONENT_CHAR -- confirm
        against the order of the state enumerators.

        `charbuf' and `cmp_status' are those of the expansion site.  */
3425
3426 #define STORE_COMPOSITION_RULE(rule)    \
3427   do {                                  \
3428     *charbuf++ = -2;                    \
3429     *charbuf++ = rule;                  \
3430     cmp_status->length += 2;            \
3431     cmp_status->state--;                \
3432   } while (0)
3433
3434 /* Store a composed char or a component char C in charbuf, and update
3435    cmp_status.

        cmp_status->length grows by 1.  C is counted in
        cmp_status->nchars if the state is COMPOSING_CHAR, and in
        cmp_status->ncomps otherwise.

        The state is then incremented if the method is
        COMPOSITION_WITH_RULE, or if the method is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  NOTE(review): the incremented state
        is presumably the matching rule state (COMPOSING_RULE or
        COMPOSING_COMPONENT_RULE) -- confirm against the order of the
        state enumerators.

        `charbuf' and `cmp_status' are those of the expansion site.  */
3436
3437 #define STORE_COMPOSITION_CHAR(c)                                       \
3438   do {                                                                  \
3439     *charbuf++ = (c);                                                   \
3440     cmp_status->length++;                                               \
3441     if (cmp_status->state == COMPOSING_CHAR)                            \
3442       cmp_status->nchars++;                                             \
3443     else                                                                \
3444       cmp_status->ncomps++;                                             \
3445     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3446         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3447             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3448       cmp_status->state++;                                              \
3449   } while (0)
3450
3451
3452 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3453
3454 static void
3455 decode_coding_iso_2022 (struct coding_system *coding)
3456 {
3457   const unsigned char *src = coding->source + coding->consumed;
3458   const unsigned char *src_end = coding->source + coding->src_bytes;
3459   const unsigned char *src_base;
3460   int *charbuf = coding->charbuf + coding->charbuf_used;
3461   /* We may produce two annotations (charset and composition) in one
3462      loop and one more charset annotation at the end.  */
3463   int *charbuf_end
3464     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3465   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3466   int multibytep = coding->src_multibyte;
3467   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3468   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3469   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3470   int charset_id_2, charset_id_3;
3471   struct charset *charset;
3472   int c;
3473   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3474   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3475   ptrdiff_t char_offset = coding->produced_char;
3476   ptrdiff_t last_offset = char_offset;
3477   int last_id = charset_ascii;
3478   int eol_dos =
3479     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3480   int byte_after_cr = -1;
3481   int i;
3482
3483   setup_iso_safe_charsets (attrs);
3484   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3485
3486   if (cmp_status->state != COMPOSING_NO)
3487     {
3488       if (charbuf_end - charbuf < cmp_status->length)
3489         abort ();
3490       for (i = 0; i < cmp_status->length; i++)
3491         *charbuf++ = cmp_status->carryover[i];
3492       coding->annotated = 1;
3493     }
3494
3495   while (1)
3496     {
3497       int c1, c2, c3;
3498
3499       src_base = src;
3500       consumed_chars_base = consumed_chars;
3501
3502       if (charbuf >= charbuf_end)
3503         {
3504           if (byte_after_cr >= 0)
3505             src_base--;
3506           break;
3507         }
3508
3509       if (byte_after_cr >= 0)
3510         c1 = byte_after_cr, byte_after_cr = -1;
3511       else
3512         ONE_MORE_BYTE (c1);
3513       if (c1 < 0)
3514         goto invalid_code;
3515
3516       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3517         {
3518           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3519           char_offset++;
3520           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3521           continue;
3522         }
3523
3524       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3525         {
3526           if (c1 == ISO_CODE_ESC)
3527             {
3528               if (src + 1 >= src_end)
3529                 goto no_more_source;
3530               *charbuf++ = ISO_CODE_ESC;
3531               char_offset++;
3532               if (src[0] == '%' && src[1] == '@')
3533                 {
3534                   src += 2;
3535                   consumed_chars += 2;
3536                   char_offset += 2;
3537                   /* We are sure charbuf can contain two more chars. */
3538                   *charbuf++ = '%';
3539                   *charbuf++ = '@';
3540                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3541                 }
3542             }
3543           else
3544             {
3545               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546               char_offset++;
3547             }
3548           continue;
3549         }
3550
3551       if ((cmp_status->state == COMPOSING_RULE
3552            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3553           && c1 != ISO_CODE_ESC)
3554         {
3555           int rule;
3556
3557           DECODE_COMPOSITION_RULE (rule);
3558           STORE_COMPOSITION_RULE (rule);
3559           continue;
3560         }
3561
3562       /* We produce at most one character.  */
3563       switch (iso_code_class [c1])
3564         {
3565         case ISO_0x20_or_0x7F:
3566           if (charset_id_0 < 0
3567               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3568             /* This is SPACE or DEL.  */
3569             charset = CHARSET_FROM_ID (charset_ascii);
3570           else
3571             charset = CHARSET_FROM_ID (charset_id_0);
3572           break;
3573
3574         case ISO_graphic_plane_0:
3575           if (charset_id_0 < 0)
3576             charset = CHARSET_FROM_ID (charset_ascii);
3577           else
3578             charset = CHARSET_FROM_ID (charset_id_0);
3579           break;
3580
3581         case ISO_0xA0_or_0xFF:
3582           if (charset_id_1 < 0
3583               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3584               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3585             goto invalid_code;
3586           /* This is a graphic character, we fall down ... */
3587
3588         case ISO_graphic_plane_1:
3589           if (charset_id_1 < 0)
3590             goto invalid_code;
3591           charset = CHARSET_FROM_ID (charset_id_1);
3592           break;
3593
3594         case ISO_control_0:
3595           if (eol_dos && c1 == '\r')
3596             ONE_MORE_BYTE (byte_after_cr);
3597           MAYBE_FINISH_COMPOSITION ();
3598           charset = CHARSET_FROM_ID (charset_ascii);
3599           break;
3600
3601         case ISO_control_1:
3602           goto invalid_code;
3603
3604         case ISO_shift_out:
3605           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3606               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3607             goto invalid_code;
3608           CODING_ISO_INVOCATION (coding, 0) = 1;
3609           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3610           continue;
3611
3612         case ISO_shift_in:
3613           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3614             goto invalid_code;
3615           CODING_ISO_INVOCATION (coding, 0) = 0;
3616           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3617           continue;
3618
3619         case ISO_single_shift_2_7:
3620           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3621             goto invalid_code;
3622         case ISO_single_shift_2:
3623           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3624             goto invalid_code;
3625           /* SS2 is handled as an escape sequence of ESC 'N' */
3626           c1 = 'N';
3627           goto label_escape_sequence;
3628
3629         case ISO_single_shift_3:
3630           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3631             goto invalid_code;
3632           /* SS3 is handled as an escape sequence of ESC 'O' */
3633           c1 = 'O';
3634           goto label_escape_sequence;
3635
3636         case ISO_control_sequence_introducer:
3637           /* CSI is handled as an escape sequence of ESC '[' ...  */
3638           c1 = '[';
3639           goto label_escape_sequence;
3640
3641         case ISO_escape:
3642           ONE_MORE_BYTE (c1);
3643         label_escape_sequence:
3644           /* Escape sequences handled here are invocation,
3645              designation, direction specification, and character
3646              composition specification.  */
3647           switch (c1)
3648             {
3649             case '&':           /* revision of following character set */
3650               ONE_MORE_BYTE (c1);
3651               if (!(c1 >= '@' && c1 <= '~'))
3652                 goto invalid_code;
3653               ONE_MORE_BYTE (c1);
3654               if (c1 != ISO_CODE_ESC)
3655                 goto invalid_code;
3656               ONE_MORE_BYTE (c1);
3657               goto label_escape_sequence;
3658
3659             case '$':           /* designation of 2-byte character set */
3660               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3661                 goto invalid_code;
3662               {
3663                 int reg, chars96;
3664
3665                 ONE_MORE_BYTE (c1);
3666                 if (c1 >= '@' && c1 <= 'B')
3667                   {     /* designation of JISX0208.1978, GB2312.1980,
3668                            or JISX0208.1980 */
3669                     reg = 0, chars96 = 0;
3670                   }
3671                 else if (c1 >= 0x28 && c1 <= 0x2B)
3672                   { /* designation of DIMENSION2_CHARS94 character set */
3673                     reg = c1 - 0x28, chars96 = 0;
3674                     ONE_MORE_BYTE (c1);
3675                   }
3676                 else if (c1 >= 0x2C && c1 <= 0x2F)
3677                   { /* designation of DIMENSION2_CHARS96 character set */
3678                     reg = c1 - 0x2C, chars96 = 1;
3679                     ONE_MORE_BYTE (c1);
3680                   }
3681                 else
3682                   goto invalid_code;
3683                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3684                 /* We must update these variables now.  */
3685                 if (reg == 0)
3686                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3687                 else if (reg == 1)
3688                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3689                 if (chars96 < 0)
3690                   goto invalid_code;
3691               }
3692               continue;
3693
3694             case 'n':           /* invocation of locking-shift-2 */
3695               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3696                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3697                 goto invalid_code;
3698               CODING_ISO_INVOCATION (coding, 0) = 2;
3699               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3700               continue;
3701
3702             case 'o':           /* invocation of locking-shift-3 */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3704                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3705                 goto invalid_code;
3706               CODING_ISO_INVOCATION (coding, 0) = 3;
3707               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3708               continue;
3709
3710             case 'N':           /* invocation of single-shift-2 */
3711               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3712                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3713                 goto invalid_code;
3714               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3715               if (charset_id_2 < 0)
3716                 charset = CHARSET_FROM_ID (charset_ascii);
3717               else
3718                 charset = CHARSET_FROM_ID (charset_id_2);
3719               ONE_MORE_BYTE (c1);
3720               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3721                 goto invalid_code;
3722               break;
3723
3724             case 'O':           /* invocation of single-shift-3 */
3725               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3726                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3727                 goto invalid_code;
3728               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3729               if (charset_id_3 < 0)
3730                 charset = CHARSET_FROM_ID (charset_ascii);
3731               else
3732                 charset = CHARSET_FROM_ID (charset_id_3);
3733               ONE_MORE_BYTE (c1);
3734               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3735                 goto invalid_code;
3736               break;
3737
3738             case '0': case '2': case '3': case '4': /* start composition */
3739               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3740                 goto invalid_code;
3741               if (last_id != charset_ascii)
3742                 {
3743                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3744                   last_id = charset_ascii;
3745                   last_offset = char_offset;
3746                 }
3747               DECODE_COMPOSITION_START (c1);
3748               continue;
3749
3750             case '1':           /* end composition */
3751               if (cmp_status->state == COMPOSING_NO)
3752                 goto invalid_code;
3753               DECODE_COMPOSITION_END ();
3754               continue;
3755
3756             case '[':           /* specification of direction */
3757               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3758                 goto invalid_code;
3759               /* For the moment, nested direction is not supported.
3760                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3761                  left-to-right, and nonzero means right-to-left.  */
3762               ONE_MORE_BYTE (c1);
3763               switch (c1)
3764                 {
3765                 case ']':       /* end of the current direction */
3766                   coding->mode &= ~CODING_MODE_DIRECTION;
3767
3768                 case '0':       /* end of the current direction */
3769                 case '1':       /* start of left-to-right direction */
3770                   ONE_MORE_BYTE (c1);
3771                   if (c1 == ']')
3772                     coding->mode &= ~CODING_MODE_DIRECTION;
3773                   else
3774                     goto invalid_code;
3775                   break;
3776
3777                 case '2':       /* start of right-to-left direction */
3778                   ONE_MORE_BYTE (c1);
3779                   if (c1 == ']')
3780                     coding->mode |= CODING_MODE_DIRECTION;
3781                   else
3782                     goto invalid_code;
3783                   break;
3784
3785                 default:
3786                   goto invalid_code;
3787                 }
3788               continue;
3789
3790             case '%':
3791               ONE_MORE_BYTE (c1);
3792               if (c1 == '/')
3793                 {
3794                   /* CTEXT extended segment:
3795                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3796                      We keep these bytes as is for the moment.
3797                      They may be decoded by post-read-conversion.  */
3798                   int dim, M, L;
3799                   int size;
3800
3801                   ONE_MORE_BYTE (dim);
3802                   if (dim < '0' || dim > '4')
3803                     goto invalid_code;
3804                   ONE_MORE_BYTE (M);
3805                   if (M < 128)
3806                     goto invalid_code;
3807                   ONE_MORE_BYTE (L);
3808                   if (L < 128)
3809                     goto invalid_code;
3810                   size = ((M - 128) * 128) + (L - 128);
3811                   if (charbuf + 6 > charbuf_end)
3812                     goto break_loop;
3813                   *charbuf++ = ISO_CODE_ESC;
3814                   *charbuf++ = '%';
3815                   *charbuf++ = '/';
3816                   *charbuf++ = dim;
3817                   *charbuf++ = BYTE8_TO_CHAR (M);
3818                   *charbuf++ = BYTE8_TO_CHAR (L);
3819                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3820                 }
3821               else if (c1 == 'G')
3822                 {
3823                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3824                      ESC % G --UTF-8-BYTES-- ESC % @
3825                      We keep these bytes as is for the moment.
3826                      They may be decoded by post-read-conversion.  */
3827                   if (charbuf + 3 > charbuf_end)
3828                     goto break_loop;
3829                   *charbuf++ = ISO_CODE_ESC;
3830                   *charbuf++ = '%';
3831                   *charbuf++ = 'G';
3832                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3833                 }
3834               else
3835                 goto invalid_code;
3836               continue;
3837               break;
3838
3839             default:
3840               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3841                 goto invalid_code;
3842               {
3843                 int reg, chars96;
3844
3845                 if (c1 >= 0x28 && c1 <= 0x2B)
3846                   { /* designation of DIMENSION1_CHARS94 character set */
3847                     reg = c1 - 0x28, chars96 = 0;
3848                     ONE_MORE_BYTE (c1);
3849                   }
3850                 else if (c1 >= 0x2C && c1 <= 0x2F)
3851                   { /* designation of DIMENSION1_CHARS96 character set */
3852                     reg = c1 - 0x2C, chars96 = 1;
3853                     ONE_MORE_BYTE (c1);
3854                   }
3855                 else
3856                   goto invalid_code;
3857                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3858                 /* We must update these variables now.  */
3859                 if (reg == 0)
3860                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3861                 else if (reg == 1)
3862                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3863                 if (chars96 < 0)
3864                   goto invalid_code;
3865               }
3866               continue;
3867             }
3868           break;
3869
3870         default:
3871           abort ();
3872         }
3873
3874       if (cmp_status->state == COMPOSING_NO
3875           && charset->id != charset_ascii
3876           && last_id != charset->id)
3877         {
3878           if (last_id != charset_ascii)
3879             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3880           last_id = charset->id;
3881           last_offset = char_offset;
3882         }
3883
3884       /* Now we know CHARSET and 1st position code C1 of a character.
3885          Produce a decoded character while getting 2nd and 3rd
3886          position codes C2, C3 if necessary.  */
3887       if (CHARSET_DIMENSION (charset) > 1)
3888         {
3889           ONE_MORE_BYTE (c2);
3890           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3891               || ((c1 & 0x80) != (c2 & 0x80)))
3892             /* C2 is not in a valid range.  */
3893             goto invalid_code;
3894           if (CHARSET_DIMENSION (charset) == 2)
3895             c1 = (c1 << 8) | c2;
3896           else
3897             {
3898               ONE_MORE_BYTE (c3);
3899               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3900                   || ((c1 & 0x80) != (c3 & 0x80)))
3901                 /* C3 is not in a valid range.  */
3902                 goto invalid_code;
3903               c1 = (c1 << 16) | (c2 << 8) | c2;
3904             }
3905         }
3906       c1 &= 0x7F7F7F;
3907       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3908       if (c < 0)
3909         {
3910           MAYBE_FINISH_COMPOSITION ();
3911           for (; src_base < src; src_base++, char_offset++)
3912             {
3913               if (ASCII_BYTE_P (*src_base))
3914                 *charbuf++ = *src_base;
3915               else
3916                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3917             }
3918         }
3919       else if (cmp_status->state == COMPOSING_NO)
3920         {
3921           *charbuf++ = c;
3922           char_offset++;
3923         }
3924       else if ((cmp_status->state == COMPOSING_CHAR
3925                 ? cmp_status->nchars
3926                 : cmp_status->ncomps)
3927                >= MAX_COMPOSITION_COMPONENTS)
3928         {
3929           /* Too long composition.  */
3930           MAYBE_FINISH_COMPOSITION ();
3931           *charbuf++ = c;
3932           char_offset++;
3933         }
3934       else
3935         STORE_COMPOSITION_CHAR (c);
3936       continue;
3937
3938     invalid_code:
3939       MAYBE_FINISH_COMPOSITION ();
3940       src = src_base;
3941       consumed_chars = consumed_chars_base;
3942       ONE_MORE_BYTE (c);
3943       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3944       char_offset++;
3945       coding->errors++;
3946       continue;
3947
3948     break_loop:
3949       break;
3950     }
3951
3952  no_more_source:
3953   if (cmp_status->state != COMPOSING_NO)
3954     {
3955       if (coding->mode & CODING_MODE_LAST_BLOCK)
3956         MAYBE_FINISH_COMPOSITION ();
3957       else
3958         {
3959           charbuf -= cmp_status->length;
3960           for (i = 0; i < cmp_status->length; i++)
3961             cmp_status->carryover[i] = charbuf[i];
3962         }
3963     }
3964   else if (last_id != charset_ascii)
3965     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3966   coding->consumed_char += consumed_chars_base;
3967   coding->consumed = src_base - coding->source;
3968   coding->charbuf_used = charbuf - coding->charbuf;
3969 }
3970
3971
3972 /* ISO2022 encoding stuff.  */
3973
3974 /*
3975    It is not enough to say just "ISO2022" on encoding, we have to
3976    specify more details.  In Emacs, each coding system of ISO2022
3977    variant has the following specifications:
3978         1. Initial designation to G0 thru G3.
3979         2. Allows short-form designation?
3980         3. ASCII should be designated to G0 before control characters?
3981         4. ASCII should be designated to G0 at end of line?
3982         5. 7-bit environment or 8-bit environment?
3983         6. Use locking-shift?
3984         7. Use Single-shift?
3985    And the following two are only for Japanese:
3986         8. Use ASCII in place of JIS0201-1976-Roman?
3987         9. Use JISX0208-1983 in place of JISX0208-1978?
3988    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3989    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3990    details.
3991 */
3992
/* Emit at DST (advancing DST) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, the sequence is preceded by ESC '&'
   and the byte '@' + revision.

   For a charset whose dimension is not 1, the intermediate character
   is omitted (short form) when CHARSET is a 94-charset, REG is 0,
   <final-char> of CHARSET is '@', 'A', or 'B', and CODING does not
   have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *desig_inter_94 = "()*+";                                \
    const char *desig_inter_96 = ",-./";                                \
    const char *desig_inter;                                            \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision = -1;                                            \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_revision = CHARSET_ISO_REVISION (charset);                  \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    desig_inter = (CHARSET_ISO_CHARS_96 (charset)                       \
                   ? desig_inter_96 : desig_inter_94);                  \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    else                                                                \
      {                                                                 \
        int desig_short_form                                            \
          = (! CHARSET_ISO_CHARS_96 (charset)                           \
             && ! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_LONG_FORM)                         \
             && (reg) == 0                                              \
             && desig_final >= '@' && desig_final <= 'B');              \
                                                                        \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! desig_short_form)                                         \
          EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                       \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4040
4041
4042 /* The following two macros produce codes (control character or escape
4043    sequence) for ISO2022 single-shift functions (single-shift-2 and
4044    single-shift-3).  */
4045
/* Emit single-shift-2: the single byte ISO_CODE_SS2, or ESC 'N' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as
   being in the middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4054
4055
/* Emit single-shift-3: the single byte ISO_CODE_SS3, or ESC 'O' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as
   being in the middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4064
4065
4066 /* The following four macros produce codes (control character or
4067    escape sequence) for ISO2022 locking-shift functions (shift-in,
4068    shift-out, locking-shift-2, and locking-shift-3).  */
4069
/* Shift-in: emit ISO_CODE_SI and record that graphic register 0 is
   now the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
4075
4076
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   now the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
4082
4083
/* Locking-shift-2: emit ESC 'n' and record that graphic register 2 is
   now the one invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
4089
4090
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3 is
   now the one invoked to graphic plane 0.  The final byte must be 'o';
   ESC 'n' is locking-shift-2 and would make a reader invoke graphic
   register 2 instead of 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4096
4097
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose ID is charset_jisx0201_roman.

   The body is a loop: each pass either emits the byte and leaves the
   loop, or calls encode_invocation_designation to make CHARSET usable
   and tries again.  C1 is parenthesized at every use so that an
   expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers this one          \
           character only.  */                                          \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4140
4141
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET's ID is
   charset_jisx0208, it is replaced by the charset whose ID is
   charset_jisx0208_1978.

   The body loops until one of the three emitting cases is reached;
   otherwise encode_invocation_designation is called and the checks
   are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* Case 1: a single-shift is pending for this character.  */        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 2: CHARSET is invoked to graphic plane 0.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 3: CHARSET is invoked to graphic plane 1.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet: invoke       \
       it (designating it to a graphic register first if needed)        \
       and go around again to emit the character.  */                   \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4184
4185
/* Encode character C of CHARSET: compute its code with ENCODE_CHAR,
   then hand it to the DIMENSION1 emitter if CHARSET's dimension is 1,
   and otherwise to the DIMENSION2 emitter as (code >> 8, code & 0xFF).
   CHARSET must be a modifiable lvalue because the emitters may
   reassign it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code = ENCODE_CHAR ((charset), (c));                          \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4195
4196
4197 /* Produce designation and invocation codes at a place pointed by DST
4198    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4199    Return new DST.  */
4200
4201 static unsigned char *
4202 encode_invocation_designation (struct charset *charset,
4203                                struct coding_system *coding,
4204                                unsigned char *dst, ptrdiff_t *p_nchars)
4205 {
4206   int multibytep = coding->dst_multibyte;
4207   ptrdiff_t produced_chars = *p_nchars;
4208   int reg;                      /* graphic register number */
4209   int id = CHARSET_ID (charset);
4210
4211   /* At first, check designations.  */
4212   for (reg = 0; reg < 4; reg++)
4213     if (id == CODING_ISO_DESIGNATION (coding, reg))
4214       break;
4215
4216   if (reg >= 4)
4217     {
4218       /* CHARSET is not yet designated to any graphic registers.  */
4219       /* At first check the requested designation.  */
4220       reg = CODING_ISO_REQUEST (coding, id);
4221       if (reg < 0)
4222         /* Since CHARSET requests no special designation, designate it
4223            to graphic register 0.  */
4224         reg = 0;
4225
4226       ENCODE_DESIGNATION (charset, reg, coding);
4227     }
4228
4229   if (CODING_ISO_INVOCATION (coding, 0) != reg
4230       && CODING_ISO_INVOCATION (coding, 1) != reg)
4231     {
4232       /* Since the graphic register REG is not invoked to any graphic
4233          planes, invoke it to graphic plane 0.  */
4234       switch (reg)
4235         {
4236         case 0:                 /* graphic register 0 */
4237           ENCODE_SHIFT_IN;
4238           break;
4239
4240         case 1:                 /* graphic register 1 */
4241           ENCODE_SHIFT_OUT;
4242           break;
4243
4244         case 2:                 /* graphic register 2 */
4245           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4246             ENCODE_SINGLE_SHIFT_2;
4247           else
4248             ENCODE_LOCKING_SHIFT_2;
4249           break;
4250
4251         case 3:                 /* graphic register 3 */
4252           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4253             ENCODE_SINGLE_SHIFT_3;
4254           else
4255             ENCODE_LOCKING_SHIFT_3;
4256           break;
4257         }
4258     }
4259
4260   *p_nchars = produced_chars;
4261   return dst;
4262 }
4263
4264
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift in if graphic plane 0 does not have
   graphic register 0 invoked, then re-designate every register whose
   initial charset is set (>= 0) and differs from its current
   designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        struct charset *charset;                                        \
                                                                        \
        /* No initial charset for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        /* Already holds its initial charset.  */                       \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
                                                                        \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4283
4284
4285 /* Produce designation sequences of charsets in the line started from
4286    SRC to a place pointed by DST, and return updated DST.
4287
4288    If the current block ends before any end-of-line, we may fail to
4289    find all the necessary designations.  */
4290
4291 static unsigned char *
4292 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4293                            unsigned char *dst)
4294 {
4295   struct charset *charset;
4296   /* Table of charsets to be designated to each graphic register.  */
4297   int r[4];
4298   int c, found = 0, reg;
4299   ptrdiff_t produced_chars = 0;
4300   int multibytep = coding->dst_multibyte;
4301   Lisp_Object attrs;
4302   Lisp_Object charset_list;
4303
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4306   if (EQ (charset_list, Qiso_2022))
4307     charset_list = Viso_2022_charset_list;
4308
4309   for (reg = 0; reg < 4; reg++)
4310     r[reg] = -1;
4311
4312   while (found < 4)
4313     {
4314       int id;
4315
4316       c = *charbuf++;
4317       if (c == '\n')
4318         break;
4319       charset = char_charset (c, charset_list, NULL);
4320       id = CHARSET_ID (charset);
4321       reg = CODING_ISO_REQUEST (coding, id);
4322       if (reg >= 0 && r[reg] < 0)
4323         {
4324           found++;
4325           r[reg] = id;
4326         }
4327     }
4328
4329   if (found)
4330     {
4331       for (reg = 0; reg < 4; reg++)
4332         if (r[reg] >= 0
4333             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4334           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4335     }
4336
4337   return dst;
4338 }
4339
4340 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4341
4342 static int
4343 encode_coding_iso_2022 (struct coding_system *coding)
4344 {
4345   int multibytep = coding->dst_multibyte;
4346   int *charbuf = coding->charbuf;
4347   int *charbuf_end = charbuf + coding->charbuf_used;
4348   unsigned char *dst = coding->destination + coding->produced;
4349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350   int safe_room = 16;
4351   int bol_designation
4352     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4353        && CODING_ISO_BOL (coding));
4354   ptrdiff_t produced_chars = 0;
4355   Lisp_Object attrs, eol_type, charset_list;
4356   int ascii_compatible;
4357   int c;
4358   int preferred_charset_id = -1;
4359
4360   CODING_GET_INFO (coding, attrs, charset_list);
4361   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4362   if (VECTORP (eol_type))
4363     eol_type = Qunix;
4364
4365   setup_iso_safe_charsets (attrs);
4366   /* Charset list may have been changed.  */
4367   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4368   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4369
4370   ascii_compatible
4371     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4372        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4373                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4374
4375   while (charbuf < charbuf_end)
4376     {
4377       ASSURE_DESTINATION (safe_room);
4378
4379       if (bol_designation)
4380         {
4381           unsigned char *dst_prev = dst;
4382
4383           /* We have to produce designation sequences if any now.  */
4384           dst = encode_designation_at_bol (coding, charbuf, dst);
4385           bol_designation = 0;
4386           /* We are sure that designation sequences are all ASCII bytes.  */
4387           produced_chars += dst - dst_prev;
4388         }
4389
4390       c = *charbuf++;
4391
4392       if (c < 0)
4393         {
4394           /* Handle an annotation.  */
4395           switch (*charbuf)
4396             {
4397             case CODING_ANNOTATE_COMPOSITION_MASK:
4398               /* Not yet implemented.  */
4399               break;
4400             case CODING_ANNOTATE_CHARSET_MASK:
4401               preferred_charset_id = charbuf[2];
4402               if (preferred_charset_id >= 0
4403                   && NILP (Fmemq (make_number (preferred_charset_id),
4404                                   charset_list)))
4405                 preferred_charset_id = -1;
4406               break;
4407             default:
4408               abort ();
4409             }
4410           charbuf += -c - 1;
4411           continue;
4412         }
4413
4414       /* Now encode the character C.  */
4415       if (c < 0x20 || c == 0x7F)
4416         {
4417           if (c == '\n'
4418               || (c == '\r' && EQ (eol_type, Qmac)))
4419             {
4420               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4421                 ENCODE_RESET_PLANE_AND_REGISTER ();
4422               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4423                 {
4424                   int i;
4425
4426                   for (i = 0; i < 4; i++)
4427                     CODING_ISO_DESIGNATION (coding, i)
4428                       = CODING_ISO_INITIAL (coding, i);
4429                 }
4430               bol_designation
4431                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4432             }
4433           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4434             ENCODE_RESET_PLANE_AND_REGISTER ();
4435           EMIT_ONE_ASCII_BYTE (c);
4436         }
4437       else if (ASCII_CHAR_P (c))
4438         {
4439           if (ascii_compatible)
4440             EMIT_ONE_ASCII_BYTE (c);
4441           else
4442             {
4443               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4444               ENCODE_ISO_CHARACTER (charset, c);
4445             }
4446         }
4447       else if (CHAR_BYTE8_P (c))
4448         {
4449           c = CHAR_TO_BYTE8 (c);
4450           EMIT_ONE_BYTE (c);
4451         }
4452       else
4453         {
4454           struct charset *charset;
4455
4456           if (preferred_charset_id >= 0)
4457             {
4458               charset = CHARSET_FROM_ID (preferred_charset_id);
4459               if (! CHAR_CHARSET_P (c, charset))
4460                 charset = char_charset (c, charset_list, NULL);
4461             }
4462           else
4463             charset = char_charset (c, charset_list, NULL);
4464           if (!charset)
4465             {
4466               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4467                 {
4468                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4469                   charset = CHARSET_FROM_ID (charset_ascii);
4470                 }
4471               else
4472                 {
4473                   c = coding->default_char;
4474                   charset = char_charset (c, charset_list, NULL);
4475                 }
4476             }
4477           ENCODE_ISO_CHARACTER (charset, c);
4478         }
4479     }
4480
4481   if (coding->mode & CODING_MODE_LAST_BLOCK
4482       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4483     {
4484       ASSURE_DESTINATION (safe_room);
4485       ENCODE_RESET_PLANE_AND_REGISTER ();
4486     }
4487   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4488   CODING_ISO_BOL (coding) = bol_designation;
4489   coding->produced_char += produced_chars;
4490   coding->produced = dst - coding->destination;
4491   return 0;
4492 }
4493
4494 \f
4495 /*** 8. SJIS and BIG5 handlers ***/
4496
4497 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4498    quite widely.  So, for the moment, Emacs supports them in the bare
4499    C code.  But, in the future, they may be supported only by CCL.  */
4500
4501 /* SJIS is a coding system encoding three character sets: ASCII, right
4502    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4503    as is.  A character of charset katakana-jisx0201 is encoded by
4504    "position-code + 0x80".  A character of charset japanese-jisx0208
4505    is encoded in two bytes, but its two position-codes are divided and
4506    shifted so that they fit in the ranges below.
4507
4508    --- CODE RANGE of SJIS ---
4509    (character set)      (range)
4510    ASCII                0x00 .. 0x7F
4511    KATAKANA-JISX0201    0xA0 .. 0xDF
4512    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4513             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4514    -------------------------------
4515
4516 */
4517
4518 /* BIG5 is a coding system encoding two character sets: ASCII and
4519    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4520    character set and is encoded in two bytes.
4521
4522    --- CODE RANGE of BIG5 ---
4523    (character set)      (range)
4524    ASCII                0x00 .. 0x7F
4525    Big5 (1st byte)      0xA1 .. 0xFE
4526         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4527    --------------------------
4528
4529   */
4530
4531 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4532    Check if a text is encoded in SJIS.  Return 1 if the end of the
4533    source is reached without meeting a byte sequence that is invalid
        for SJIS; in that case CATEGORY_MASK_SJIS is also added to
        DETECT_INFO->found when at least one non-ASCII SJIS code was
        seen.  Otherwise (an invalid sequence, or a code cut short at the
        end of the last block) add CATEGORY_MASK_SJIS to
        DETECT_INFO->rejected and return 0.  CATEGORY_MASK_SJIS is always
        added to DETECT_INFO->checked.  */
4534
4535 static int
4536 detect_coding_sjis (struct coding_system *coding,
4537                     struct coding_detection_info *detect_info)
4538 {
4539   const unsigned char *src = coding->source, *src_base;
4540   const unsigned char *src_end = coding->source + coding->src_bytes;
4541   int multibytep = coding->src_multibyte;
4542   ptrdiff_t consumed_chars = 0;
4543   int found = 0;
4544   int c;
4545   Lisp_Object attrs, charset_list;
4546   int max_first_byte_of_2_byte_code;
4547
4548   CODING_GET_INFO (coding, attrs, charset_list);
       /* Lead bytes above 0xEF (up to 0xFC) are accepted only when the
          charset list has more than three elements.  */
4549   max_first_byte_of_2_byte_code
4550     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4551
4552   detect_info->checked |= CATEGORY_MASK_SJIS;
4553   /* A coding system of this category is always ASCII compatible.  */
4554   src += coding->head_ascii;
4555
4556   while (1)
4557     {
           /* Remember where the current code starts; the end-of-source
              handler below compares this against SRC.  */
4558       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
              fetches the next byte into C and jumps to no_more_source when
              the source is exhausted -- confirm against the macro.  */
4559       ONE_MORE_BYTE (c);
4560       if (c < 0x80)
4561         continue;
           /* Lead byte of a two-byte code: the trail byte must be in
              0x40..0xFC and must not be 0x7F.  */
4562       if ((c >= 0x81 && c <= 0x9F)
4563           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4564         {
4565           ONE_MORE_BYTE (c);
4566           if (c < 0x40 || c == 0x7F || c > 0xFC)
4567             break;
4568           found = CATEGORY_MASK_SJIS;
4569         }
4570       else if (c >= 0xA0 && c < 0xE0)   /* One-byte code.  */
4571         found = CATEGORY_MASK_SJIS;
4572       else
4573         break;
4574     }
       /* The loop was left by `break': an invalid byte was seen.  */
4575   detect_info->rejected |= CATEGORY_MASK_SJIS;
4576   return 0;
4577
4578  no_more_source:
       /* SRC_BASE < SRC means some byte of the current code had already
          been read when the source ran out; in the last block such a
          truncated code is a reason to reject.  */
4579   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4580     {
4581       detect_info->rejected |= CATEGORY_MASK_SJIS;
4582       return 0;
4583     }
4584   detect_info->found |= found;
4585   return 1;
4586 }
4587
4588 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4589    Check if a text is encoded in BIG5.  Return 1 if the end of the
4590    source is reached without meeting a byte sequence that is invalid
        for BIG5; in that case CATEGORY_MASK_BIG5 is also added to
        DETECT_INFO->found when at least one two-byte code was seen.
        Otherwise (an invalid lead or trail byte, or a code cut short at
        the end of the last block) add CATEGORY_MASK_BIG5 to
        DETECT_INFO->rejected and return 0.  CATEGORY_MASK_BIG5 is always
        added to DETECT_INFO->checked.  */
4591
4592 static int
4593 detect_coding_big5 (struct coding_system *coding,
4594                     struct coding_detection_info *detect_info)
4595 {
4596   const unsigned char *src = coding->source, *src_base;
4597   const unsigned char *src_end = coding->source + coding->src_bytes;
4598   int multibytep = coding->src_multibyte;
4599   ptrdiff_t consumed_chars = 0;
4600   int found = 0;
4601   int c;
4602
4603   detect_info->checked |= CATEGORY_MASK_BIG5;
4604   /* A coding system of this category is always ASCII compatible.  */
4605   src += coding->head_ascii;
4606
4607   while (1)
4608     {
           /* Remember where the current code starts; the end-of-source
              handler below compares this against SRC.  */
4609       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
              fetches the next byte into C and jumps to no_more_source when
              the source is exhausted -- confirm against the macro.  */
4610       ONE_MORE_BYTE (c);
4611       if (c < 0x80)
4612         continue;
4613       if (c >= 0xA1)
4614         {
4615           ONE_MORE_BYTE (c);
               /* An invalid trail byte must reject the category exactly
                  like an invalid lead byte does, so leave the loop and
                  fall into the rejection code below instead of returning
                  with DETECT_INFO->rejected untouched.  */
4616           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4617             break;
4618           found = CATEGORY_MASK_BIG5;
4619         }
4620       else
4621         break;
4622     }
       /* The loop was left by `break': an invalid byte was seen.  */
4623   detect_info->rejected |= CATEGORY_MASK_BIG5;
4624   return 0;
4625
4626  no_more_source:
       /* SRC_BASE < SRC means some byte of the current code had already
          been read when the source ran out; in the last block such a
          truncated code is a reason to reject.  */
4627   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4628     {
4629       detect_info->rejected |= CATEGORY_MASK_BIG5;
4630       return 0;
4631     }
4632   detect_info->found |= found;
4633   return 1;
4634 }
4635
4636 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4637    Decode the SJIS text at CODING->source, starting from offset
        CODING->consumed, into CODING->charbuf, and update
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        accordingly.  A byte that cannot be decoded is stored in the
        character buffer by itself and counted in CODING->errors.  */
4638
4639 static void
4640 decode_coding_sjis (struct coding_system *coding)
4641 {
4642   const unsigned char *src = coding->source + coding->consumed;
4643   const unsigned char *src_end = coding->source + coding->src_bytes;
4644   const unsigned char *src_base;
4645   int *charbuf = coding->charbuf + coding->charbuf_used;
4646   /* We may produce one charset annotation in one loop and one more at
4647      the end.  */
4648   int *charbuf_end
4649     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4650   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4651   int multibytep = coding->src_multibyte;
4652   struct charset *charset_roman, *charset_kanji, *charset_kana;
4653   struct charset *charset_kanji2;
4654   Lisp_Object attrs, charset_list, val;
4655   ptrdiff_t char_offset = coding->produced_char;
4656   ptrdiff_t last_offset = char_offset;
4657   int last_id = charset_ascii;
       /* Nonzero when EOL conversion is not inhibited and the EOL type of
          the coding system is dos.  */
4658   int eol_dos =
4659     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR when EOL_DOS, or -1 if no such
          byte is pending.  */
4660   int byte_after_cr = -1;
4661
4662   CODING_GET_INFO (coding, attrs, charset_list);
4663
       /* The charset list is taken in the order roman, kana, kanji, and
          an optional second kanji charset.  */
4664   val = charset_list;
4665   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4666   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4667   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4668   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4669
4670   while (1)
4671     {
4672       int c, c1;
4673       struct charset *charset;
4674
           /* Checkpoint the start of the current code so that it can be
              re-read on error and reported as the consumed position.  */
4675       src_base = src;
4676       consumed_chars_base = consumed_chars;
4677
           /* Stop when the character buffer has no room left.  If a byte
              was read ahead after a CR, back SRC_BASE up by one so that
              the byte is not reported as consumed.  */
4678       if (charbuf >= charbuf_end)
4679         {
4680           if (byte_after_cr >= 0)
4681             src_base--;
4682           break;
4683         }
4684
           /* Use the pending read-ahead byte first, if any.
              NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source when the source is
              exhausted and may yield a negative value for a source
              character that is not a plain byte -- confirm.  */
4685       if (byte_after_cr >= 0)
4686         c = byte_after_cr, byte_after_cr = -1;
4687       else
4688         ONE_MORE_BYTE (c);
4689       if (c < 0)
4690         goto invalid_code;
4691       if (c < 0x80)
4692         {
               /* After a CR, read one byte ahead when EOL_DOS.  */
4693           if (eol_dos && c == '\r')
4694             ONE_MORE_BYTE (byte_after_cr);
4695           charset = charset_roman;
4696         }
4697       else if (c == 0x80 || c == 0xA0)
4698         goto invalid_code;
4699       else if (c >= 0xA1 && c <= 0xDF)
4700         {
4701           /* SJIS -> JISX0201-Kana */
4702           c &= 0x7F;
4703           charset = charset_kana;
4704         }
4705       else if (c <= 0xEF)
4706         {
4707           /* SJIS -> JISX0208 */
4708           ONE_MORE_BYTE (c1);
4709           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4710             goto invalid_code;
4711           c = (c << 8) | c1;
4712           SJIS_TO_JIS (c);
4713           charset = charset_kanji;
4714         }
4715       else if (c <= 0xFC && charset_kanji2)
4716         {
4717           /* SJIS -> JISX0213-2 */
4718           ONE_MORE_BYTE (c1);
4719           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4720             goto invalid_code;
4721           c = (c << 8) | c1;
4722           SJIS_TO_JIS2 (c);
4723           charset = charset_kanji2;
4724         }
4725       else
4726         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the
              annotation of the previous non-ASCII run (if any) and start
              tracking a new run from the current character offset.  */
4727       if (charset->id != charset_ascii
4728           && last_id != charset->id)
4729         {
4730           if (last_id != charset_ascii)
4731             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4732           last_id = charset->id;
4733           last_offset = char_offset;
4734         }
4735       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4736       *charbuf++ = c;
4737       char_offset++;
4738       continue;
4739
4740     invalid_code:
           /* Rewind to the start of the offending code and consume just
              one byte of it.  A negative value is stored negated;
              otherwise the byte is stored via BYTE8_TO_CHAR.  */
4741       src = src_base;
4742       consumed_chars = consumed_chars_base;
4743       ONE_MORE_BYTE (c);
4744       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4745       char_offset++;
4746       coding->errors++;
4747     }
4748
4749  no_more_source:
       /* Also reached by falling out of the loop above.  Close the
          annotation of a pending non-ASCII run, then report progress up
          to the last checkpoint.  */
4750   if (last_id != charset_ascii)
4751     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4752   coding->consumed_char += consumed_chars_base;
4753   coding->consumed = src_base - coding->source;
4754   coding->charbuf_used = charbuf - coding->charbuf;
4755 }
4756
     /* Decode the BIG5 text at CODING->source, starting from offset
        CODING->consumed, into CODING->charbuf, and update
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        accordingly.  A byte that cannot be decoded is stored in the
        character buffer by itself and counted in CODING->errors.  */
4757 static void
4758 decode_coding_big5 (struct coding_system *coding)
4759 {
4760   const unsigned char *src = coding->source + coding->consumed;
4761   const unsigned char *src_end = coding->source + coding->src_bytes;
4762   const unsigned char *src_base;
4763   int *charbuf = coding->charbuf + coding->charbuf_used;
4764   /* We may produce one charset annotation in one loop and one more at
4765      the end.  */
4766   int *charbuf_end
4767     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4768   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4769   int multibytep = coding->src_multibyte;
4770   struct charset *charset_roman, *charset_big5;
4771   Lisp_Object attrs, charset_list, val;
4772   ptrdiff_t char_offset = coding->produced_char;
4773   ptrdiff_t last_offset = char_offset;
4774   int last_id = charset_ascii;
       /* Nonzero when EOL conversion is not inhibited and the EOL type of
          the coding system is dos.  */
4775   int eol_dos =
4776     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR when EOL_DOS, or -1 if no such
          byte is pending.  */
4777   int byte_after_cr = -1;
4778
4779   CODING_GET_INFO (coding, attrs, charset_list);
       /* The first two elements of the charset list are taken as the
          roman charset and the Big5 charset, in this order.  */
4780   val = charset_list;
4781   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4782   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4783
4784   while (1)
4785     {
4786       int c, c1;
4787       struct charset *charset;
4788
           /* Checkpoint the start of the current code so that it can be
              re-read on error and reported as the consumed position.  */
4789       src_base = src;
4790       consumed_chars_base = consumed_chars;
4791
           /* Stop when the character buffer has no room left.  If a byte
              was read ahead after a CR, back SRC_BASE up by one so that
              the byte is not reported as consumed.  */
4792       if (charbuf >= charbuf_end)
4793         {
4794           if (byte_after_cr >= 0)
4795             src_base--;
4796           break;
4797         }
4798
           /* Use the pending read-ahead byte first, if any.
              NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source when the source is
              exhausted and may yield a negative value for a source
              character that is not a plain byte -- confirm.  */
4799       if (byte_after_cr >= 0)
4800         c = byte_after_cr, byte_after_cr = -1;
4801       else
4802         ONE_MORE_BYTE (c);
4803
4804       if (c < 0)
4805         goto invalid_code;
4806       if (c < 0x80)
4807         {
               /* After a CR, read one byte ahead when EOL_DOS.  */
4808           if (eol_dos && c == '\r')
4809             ONE_MORE_BYTE (byte_after_cr);
4810           charset = charset_roman;
4811         }
4812       else
4813         {
4814           /* BIG5 -> Big5 */
               /* The lead byte must be in 0xA1..0xFE and the trail byte
                  in 0x40..0x7E or 0xA1..0xFE.  */
4815           if (c < 0xA1 || c > 0xFE)
4816             goto invalid_code;
4817           ONE_MORE_BYTE (c1);
4818           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4819             goto invalid_code;
4820           c = c << 8 | c1;
4821           charset = charset_big5;
4822         }
           /* On a switch to a different non-ASCII charset, close the
              annotation of the previous non-ASCII run (if any) and start
              tracking a new run from the current character offset.  */
4823       if (charset->id != charset_ascii
4824           && last_id != charset->id)
4825         {
4826           if (last_id != charset_ascii)
4827             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4828           last_id = charset->id;
4829           last_offset = char_offset;
4830         }
4831       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4832       *charbuf++ = c;
4833       char_offset++;
4834       continue;
4835
4836     invalid_code:
           /* Rewind to the start of the offending code and consume just
              one byte of it.  A negative value is stored negated;
              otherwise the byte is stored via BYTE8_TO_CHAR.  */
4837       src = src_base;
4838       consumed_chars = consumed_chars_base;
4839       ONE_MORE_BYTE (c);
4840       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4841       char_offset++;
4842       coding->errors++;
4843     }
4844
4845  no_more_source:
       /* Also reached by falling out of the loop above.  Close the
          annotation of a pending non-ASCII run, then report progress up
          to the last checkpoint.  */
4846   if (last_id != charset_ascii)
4847     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4848   coding->consumed_char += consumed_chars_base;
4849   coding->consumed = src_base - coding->source;
4850   coding->charbuf_used = charbuf - coding->charbuf;
4851 }
4852
4853 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4854    Encode the characters in CODING->charbuf as SJIS text written to
4855    CODING->destination after the CODING->produced bytes already there.
4856    An ASCII character is emitted as is when the coding system is ASCII
4857    compatible, and a raw 8-bit character is emitted as its byte.  Any
4858    other character is looked up in the charset list of CODING and
4859    emitted according to the charset found.  Return 0.  */
4860
4861 static int
4862 encode_coding_sjis (struct coding_system *coding)
4863 {
4864   int multibytep = coding->dst_multibyte;
4865   int *charbuf = coding->charbuf;
4866   int *charbuf_end = charbuf + coding->charbuf_used;
4867   unsigned char *dst = coding->destination + coding->produced;
4868   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character.
          NOTE(review): the macro is defined elsewhere; presumably it
          makes sure that many bytes are available at DST -- confirm.  */
4869   int safe_room = 4;
4870   ptrdiff_t produced_chars = 0;
4871   Lisp_Object attrs, charset_list, val;
4872   int ascii_compatible;
4873   struct charset *charset_kanji, *charset_kana;
4874   struct charset *charset_kanji2;
4875   int c;
4876
4877   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the charset list; the following ones
          are kana, kanji, and an optional second kanji charset.  */
4878   val = XCDR (charset_list);
4879   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4880   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4881   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4882
4883   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4884
4885   while (charbuf < charbuf_end)
4886     {
4887       ASSURE_DESTINATION (safe_room);
4888       c = *charbuf++;
4889       /* Now encode the character C.  */
4890       if (ASCII_CHAR_P (c) && ascii_compatible)
4891         EMIT_ONE_ASCII_BYTE (c);
4892       else if (CHAR_BYTE8_P (c))
4893         {
4894           c = CHAR_TO_BYTE8 (c);
4895           EMIT_ONE_BYTE (c);
4896         }
4897       else
4898         {
4899           unsigned code;
4900           struct charset *charset = char_charset (c, charset_list, &code);
4901
               /* C belongs to none of the charsets.  In safe-encoding
                  mode use CODING_INHIBIT_CHARACTER_SUBSTITUTION as the
                  code in charset ASCII; otherwise encode
                  CODING->default_char in place of C.  */
4902           if (!charset)
4903             {
4904               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4905                 {
4906                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4907                   charset = CHARSET_FROM_ID (charset_ascii);
4908                 }
4909               else
4910                 {
4911                   c = coding->default_char;
4912                   charset = char_charset (c, charset_list, &code);
4913                 }
4914             }
4915           if (code == CHARSET_INVALID_CODE (charset))
4916             abort ();
4917           if (charset == charset_kanji)
4918             {
                   /* Convert the code with JIS_TO_SJIS and emit its high
                      and low bytes.  */
4919               int c1, c2;
4920               JIS_TO_SJIS (code);
4921               c1 = code >> 8, c2 = code & 0xFF;
4922               EMIT_TWO_BYTES (c1, c2);
4923             }
4924           else if (charset == charset_kana)
4925             EMIT_ONE_BYTE (code | 0x80);
4926           else if (charset_kanji2 && charset == charset_kanji2)
4927             {
4928               int c1, c2;
4929
                   /* Only codes whose high byte is one of the values
                      tested below are converted with JIS_TO_SJIS2; any
                      other code is emitted as the single byte
                      CODE & 0x7F.  */
4930               c1 = code >> 8;
4931               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4932                   || c1 == 0x28
4933                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4934                 {
4935                   JIS_TO_SJIS2 (code);
4936                   c1 = code >> 8, c2 = code & 0xFF;
4937                   EMIT_TWO_BYTES (c1, c2);
4938                 }
4939               else
4940                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4941             }
4942           else
4943             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4944         }
4945     }
4946   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4947   coding->produced_char += produced_chars;
4948   coding->produced = dst - coding->destination;
4949   return 0;
4950 }
4951
     /* Encode the characters in CODING->charbuf as BIG5 text written to
        CODING->destination after the CODING->produced bytes already
        there.  An ASCII character is emitted as is when the coding
        system is ASCII compatible, and a raw 8-bit character is emitted
        as its byte.  A character of the Big5 charset is emitted as the
        two bytes of its code; a character of any other charset in the
        list is emitted as the single byte CODE & 0x7F.  Return 0.  */
4952 static int
4953 encode_coding_big5 (struct coding_system *coding)
4954 {
4955   int multibytep = coding->dst_multibyte;
4956   int *charbuf = coding->charbuf;
4957   int *charbuf_end = charbuf + coding->charbuf_used;
4958   unsigned char *dst = coding->destination + coding->produced;
4959   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character.
          NOTE(review): the macro is defined elsewhere; presumably it
          makes sure that many bytes are available at DST -- confirm.  */
4960   int safe_room = 4;
4961   ptrdiff_t produced_chars = 0;
4962   Lisp_Object attrs, charset_list, val;
4963   int ascii_compatible;
4964   struct charset *charset_big5;
4965   int c;
4966
4967   CODING_GET_INFO (coding, attrs, charset_list);
       /* The Big5 charset is the second element of the charset list.  */
4968   val = XCDR (charset_list);
4969   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4970   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4971
4972   while (charbuf < charbuf_end)
4973     {
4974       ASSURE_DESTINATION (safe_room);
4975       c = *charbuf++;
4976       /* Now encode the character C.  */
4977       if (ASCII_CHAR_P (c) && ascii_compatible)
4978         EMIT_ONE_ASCII_BYTE (c);
4979       else if (CHAR_BYTE8_P (c))
4980         {
4981           c = CHAR_TO_BYTE8 (c);
4982           EMIT_ONE_BYTE (c);
4983         }
4984       else
4985         {
4986           unsigned code;
4987           struct charset *charset = char_charset (c, charset_list, &code);
4988
               /* C belongs to none of the charsets.  In safe-encoding
                  mode use CODING_INHIBIT_CHARACTER_SUBSTITUTION as the
                  code in charset ASCII; otherwise encode
                  CODING->default_char in place of C.  */
4989           if (! charset)
4990             {
4991               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4992                 {
4993                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4994                   charset = CHARSET_FROM_ID (charset_ascii);
4995                 }
4996               else
4997                 {
4998                   c = coding->default_char;
4999                   charset = char_charset (c, charset_list, &code);
5000                 }
5001             }
5002           if (code == CHARSET_INVALID_CODE (charset))
5003             abort ();
5004           if (charset == charset_big5)
5005             {
5006               int c1, c2;
5007
                   /* Emit the high and low bytes of the code.  */
5008               c1 = code >> 8, c2 = code & 0xFF;
5009               EMIT_TWO_BYTES (c1, c2);
5010             }
5011           else
5012             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5013         }
5014     }
5015   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5016   coding->produced_char += produced_chars;
5017   coding->produced = dst - coding->destination;
5018   return 0;
5019 }
5020
5021 \f
5022 /*** 9. CCL handlers ***/
5023
5024 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5025    Check if a text is encoded in a coding system of which
5026    encoder/decoder are written in CCL program.  Return 1 if every
5027    source byte has a nonzero entry in the table of valid bytes of the
        CCL category; in that case CATEGORY_MASK_CCL is also added to
        DETECT_INFO->found when some byte had an entry greater than 1.
        Otherwise add CATEGORY_MASK_CCL to DETECT_INFO->rejected and
        return 0.  CATEGORY_MASK_CCL is always added to
        DETECT_INFO->checked.  */
5028
5029 static int
5030 detect_coding_ccl (struct coding_system *coding,
5031                    struct coding_detection_info *detect_info)
5032 {
5033   const unsigned char *src = coding->source, *src_base;
5034   const unsigned char *src_end = coding->source + coding->src_bytes;
5035   int multibytep = coding->src_multibyte;
5036   ptrdiff_t consumed_chars = 0;
5037   int found = 0;
5038   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5039   ptrdiff_t head_ascii = coding->head_ascii;
5040   Lisp_Object attrs;
5041
5042   detect_info->checked |= CATEGORY_MASK_CCL;
5043
       /* From here on CODING refers to the coding system registered for
          the CCL category, not to the argument.  */
5044   coding = &coding_categories[coding_category_ccl];
5045   valids = CODING_CCL_VALIDS (coding);
5046   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the first HEAD_ASCII bytes only if that coding system is
          ASCII compatible.  */
5047   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5048     src += head_ascii;
5049
5050   while (1)
5051     {
5052       int c;
5053
5054       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
              jumps to no_more_source when the source is exhausted --
              confirm against the macro.  */
5055       ONE_MORE_BYTE (c);
           /* A negative value or a byte whose table entry is zero rejects
              the category.  */
5056       if (c < 0 || ! valids[c])
5057         break;
5058       if ((valids[c] > 1))
5059         found = CATEGORY_MASK_CCL;
5060     }
5061   detect_info->rejected |= CATEGORY_MASK_CCL;
5062   return 0;
5063
5064  no_more_source:
5065   detect_info->found |= found;
5066   return 1;
5067 }
5068
     /* Decode the text at CODING->source, starting from offset
        CODING->consumed, by running the CCL program CODING->spec.ccl->ccl
        through ccl_driver, which stores its output in CODING->charbuf.
        The source is handed to the driver in chunks of at most 1024
        elements.  The final CCL status is translated into a conversion
        result, and CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  */
5069 static void
5070 decode_coding_ccl (struct coding_system *coding)
5071 {
5072   const unsigned char *src = coding->source + coding->consumed;
5073   const unsigned char *src_end = coding->source + coding->src_bytes;
5074   int *charbuf = coding->charbuf + coding->charbuf_used;
5075   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5076   ptrdiff_t consumed_chars = 0;
5077   int multibytep = coding->src_multibyte;
5078   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for ccl_driver.  */
5079   int source_charbuf[1024];
       /* For a multibyte source, SOURCE_BYTEIDX[i] is the byte offset
          from SRC of the i-th element of SOURCE_CHARBUF; one extra slot
          holds the offset just past the last element.  */
5080   int source_byteidx[1025];
5081   Lisp_Object attrs, charset_list;
5082
5083   CODING_GET_INFO (coding, attrs, charset_list);
5084
5085   while (1)
5086     {
5087       const unsigned char *p = src;
5088       int i = 0;
5089
           /* Fill SOURCE_CHARBUF: one character per element for a
              multibyte source, one byte per element otherwise.  */
5090       if (multibytep)
5091         {
5092           while (i < 1024 && p < src_end)
5093             {
5094               source_byteidx[i] = p - src;
5095               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5096             }
5097           source_byteidx[i] = p - src;
5098         }
5099       else
5100         while (i < 1024 && p < src_end)
5101           source_charbuf[i++] = *p++;
5102
           /* Tell the CCL program when this chunk reaches the end of the
              last block.  */
5103       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5104         ccl->last_block = 1;
5105       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5106                   charset_list);
5107       charbuf += ccl->produced;
           /* ccl->consumed counts elements; for a multibyte source
              translate it into a byte offset.  */
5108       if (multibytep)
5109         src += source_byteidx[ccl->consumed];
5110       else
5111         src += ccl->consumed;
5112       consumed_chars += ccl->consumed;
           /* Go on only if the whole source was not yet handed over and
              the program stopped just because the chunk ran out.  */
5113       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5114         break;
5115     }
5116
       /* Translate the CCL status into a conversion result.  */
5117   switch (ccl->status)
5118     {
5119     case CCL_STAT_SUSPEND_BY_SRC:
5120       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5121       break;
5122     case CCL_STAT_SUSPEND_BY_DST:
5123       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5124       break;
5125     case CCL_STAT_QUIT:
5126     case CCL_STAT_INVALID_CMD:
5127       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5128       break;
5129     default:
5130       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5131       break;
5132     }
5133   coding->consumed_char += consumed_chars;
5134   coding->consumed = src - coding->source;
5135   coding->charbuf_used = charbuf - coding->charbuf;
5136 }
5137
     /* Encode the characters in CODING->charbuf by running the CCL
        program CODING->spec.ccl->ccl through ccl_driver, and write the
        low 8 bits of each element it produces to CODING->destination
        after the CODING->produced bytes already there.  The final CCL
        status is translated into a conversion result.  Return 0.  */
5138 static int
5139 encode_coding_ccl (struct coding_system *coding)
5140 {
5141   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5142   int multibytep = coding->dst_multibyte;
5143   int *charbuf = coding->charbuf;
5144   int *charbuf_end = charbuf + coding->charbuf_used;
5145   unsigned char *dst = coding->destination + coding->produced;
5146   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call.  */
5147   int destination_charbuf[1024];
5148   ptrdiff_t produced_chars = 0;
5149   int i;
5150   Lisp_Object attrs, charset_list;
5151
5152   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5153   if (coding->consumed_char == coding->src_chars
5154       && coding->mode & CODING_MODE_LAST_BLOCK)
5155     ccl->last_block = 1;
5156
5157   while (charbuf < charbuf_end)
5158     {
5159       ccl_driver (ccl, charbuf, destination_charbuf,
5160                   charbuf_end - charbuf, 1024, charset_list);
           /* NOTE(review): ASSURE_DESTINATION and EMIT_ONE_BYTE are
              defined elsewhere.  Twice the produced count is requested
              for a multibyte destination, presumably because one byte
              may take two bytes there, and EMIT_ONE_BYTE presumably
              counts produced characters itself -- confirm.  */
5161       if (multibytep)
5162         {
5163           ASSURE_DESTINATION (ccl->produced * 2);
5164           for (i = 0; i < ccl->produced; i++)
5165             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5166         }
5167       else
5168         {
5169           ASSURE_DESTINATION (ccl->produced);
5170           for (i = 0; i < ccl->produced; i++)
5171             *dst++ = destination_charbuf[i] & 0xFF;
5172           produced_chars += ccl->produced;
5173         }
5174       charbuf += ccl->consumed;
5175       if (ccl->status == CCL_STAT_QUIT
5176           || ccl->status == CCL_STAT_INVALID_CMD)
5177         break;
5178     }
5179
       /* Translate the CCL status into a conversion result.  */
5180   switch (ccl->status)
5181     {
5182     case CCL_STAT_SUSPEND_BY_SRC:
5183       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5184       break;
5185     case CCL_STAT_SUSPEND_BY_DST:
5186       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5187       break;
5188     case CCL_STAT_QUIT:
5189     case CCL_STAT_INVALID_CMD:
5190       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5191       break;
5192     default:
5193       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5194       break;
5195     }
5196
5197   coding->produced_char += produced_chars;
5198   coding->produced = dst - coding->destination;
5199   return 0;
5200 }
5201
5202
5203 \f
5204 /*** 10, 11. no-conversion handlers ***/
5205
5206 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Set CODING->chars_at_source to 1 and mark the whole source of
        CODING as consumed.  The only exception: when the EOL type is dos
        (and EOL conversion is not inhibited) and the source ends with a
        CR, that CR is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC
        is recorded instead of CODING_RESULT_SUCCESS.  */
5207
5208 static void
5209 decode_coding_raw_text (struct coding_system *coding)
5210 {
5211   int eol_dos =
5212     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5213
5214   coding->chars_at_source = 1;
5215   coding->consumed_char = coding->src_chars;
5216   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only if there is one; with an empty
          source, index SRC_BYTES - 1 would be out of bounds.  */
5217   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5218     {
5219       coding->consumed_char--;
5220       coding->consumed--;
5221       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5222     }
5223   else
5224     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5225 }
5226
     /* Write the elements of CODING->charbuf to CODING->destination
        after the CODING->produced bytes already there, without
        converting them to another encoding.  The four combinations of
        multibyte/unibyte source and destination are handled separately
        below.  Return 0.  */
5227 static int
5228 encode_coding_raw_text (struct coding_system *coding)
5229 {
5230   int multibytep = coding->dst_multibyte;
5231   int *charbuf = coding->charbuf;
5232   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5233   unsigned char *dst = coding->destination + coding->produced;
5234   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5235   ptrdiff_t produced_chars = 0;
5236   int c;
5237
5238   if (multibytep)
5239     {
5240       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5241
           /* Multibyte source, multibyte destination: an ASCII character
              and a raw 8-bit character yield one byte each; any other
              character is expanded with CHAR_STRING_ADVANCE and each
              byte of the result is emitted with EMIT_ONE_BYTE.  */
5242       if (coding->src_multibyte)
5243         while (charbuf < charbuf_end)
5244           {
5245             ASSURE_DESTINATION (safe_room);
5246             c = *charbuf++;
5247             if (ASCII_CHAR_P (c))
5248               EMIT_ONE_ASCII_BYTE (c);
5249             else if (CHAR_BYTE8_P (c))
5250               {
5251                 c = CHAR_TO_BYTE8 (c);
5252                 EMIT_ONE_BYTE (c);
5253               }
5254             else
5255               {
5256                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5257
5258                 CHAR_STRING_ADVANCE (c, p1);
5259                 do
5260                   {
5261                     EMIT_ONE_BYTE (*p0);
5262                     p0++;
5263                   }
5264                 while (p0 < p1);
5265               }
5266           }
           /* Unibyte source, multibyte destination: every element is
              emitted with EMIT_ONE_BYTE.  */
5267       else
5268         while (charbuf < charbuf_end)
5269           {
5270             ASSURE_DESTINATION (safe_room);
5271             c = *charbuf++;
5272             EMIT_ONE_BYTE (c);
5273           }
5274     }
5275   else
5276     {
           /* Multibyte source, unibyte destination: bytes are stored at
              DST directly; a character that is neither ASCII nor a raw
              8-bit character is stored with CHAR_STRING_ADVANCE.  */
5277       if (coding->src_multibyte)
5278         {
5279           int safe_room = MAX_MULTIBYTE_LENGTH;
5280
5281           while (charbuf < charbuf_end)
5282             {
5283               ASSURE_DESTINATION (safe_room);
5284               c = *charbuf++;
5285               if (ASCII_CHAR_P (c))
5286                 *dst++ = c;
5287               else if (CHAR_BYTE8_P (c))
5288                 *dst++ = CHAR_TO_BYTE8 (c);
5289               else
5290                 CHAR_STRING_ADVANCE (c, dst);
5291             }
5292         }
           /* Unibyte source, unibyte destination: copy the elements one
              by one, never writing at or beyond DST_END.  */
5293       else
5294         {
5295           ASSURE_DESTINATION (charbuf_end - charbuf);
5296           while (charbuf < charbuf_end && dst < dst_end)
5297             *dst++ = *charbuf++;
5298         }
           /* For a unibyte destination the count of produced characters
              is the number of bytes written by this call.  */
5299       produced_chars = dst - (coding->destination + coding->produced);
5300     }
5301   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5302   coding->produced_char += produced_chars;
5303   coding->produced = dst - coding->destination;
5304   return 0;
5305 }
5306
5307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5308    Check if a text is encoded in a charset-based coding system.  If it
5309    is, return 1, else return 0.  On success CATEGORY_MASK_CHARSET is
        added to DETECT_INFO->found when some accepted byte was 0x80 or
        above; on failure it is added to DETECT_INFO->rejected.  It is
        always added to DETECT_INFO->checked.  The check uses the coding
        system registered for the charset category, not the argument
        CODING itself.  */
5310
5311 static int
5312 detect_coding_charset (struct coding_system *coding,
5313                        struct coding_detection_info *detect_info)
5314 {
5315   const unsigned char *src = coding->source, *src_base;
5316   const unsigned char *src_end = coding->source + coding->src_bytes;
5317   int multibytep = coding->src_multibyte;
5318   ptrdiff_t consumed_chars = 0;
5319   Lisp_Object attrs, valids, name;
5320   int found = 0;
       /* Saved now because CODING is redirected below.  */
5321   ptrdiff_t head_ascii = coding->head_ascii;
5322   int check_latin_extra = 0;
5323
5324   detect_info->checked |= CATEGORY_MASK_CHARSET;
5325
       /* From here on CODING refers to the coding system registered for
          the charset category, not to the argument.  */
5326   coding = &coding_categories[coding_category_charset];
5327   attrs = CODING_ID_ATTRS (coding->id);
5328   valids = AREF (attrs, coding_attr_charset_valids);
5329   name = CODING_ID_NAME (coding->id);
       /* Coding systems whose name starts with "iso-8859-" or
          "iso-latin-" get the extra check of bytes 0x80..0x9F below.  */
5330   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5331                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5332       || strncmp (SSDATA (SYMBOL_NAME (name)),
5333                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5334     check_latin_extra = 1;
5335
5336   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5337     src += head_ascii;
5338
5339   while (1)
5340     {
5341       int c;
5342       Lisp_Object val;
5343       struct charset *charset;
5344       int dim, idx;
5345
5346       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
              jumps to no_more_source when the source is exhausted --
              confirm against the macro.  */
5347       ONE_MORE_BYTE (c);
           /* A negative value is skipped without any check.  */
5348       if (c < 0)
5349         continue;
           /* VALIDS maps a first byte to nil (invalid), to a charset ID,
              or to a list of charset IDs.  */
5350       val = AREF (valids, c);
5351       if (NILP (val))
5352         break;
5353       if (c >= 0x80)
5354         {
               /* With CHECK_LATIN_EXTRA, a byte in 0x80..0x9F is accepted
                  only if Vlatin_extra_code_table is a vector with a
                  non-nil entry for it.  */
5355           if (c < 0xA0
5356               && check_latin_extra
5357               && (!VECTORP (Vlatin_extra_code_table)
5358                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5359             break;
5360           found = CATEGORY_MASK_CHARSET;
5361         }
5362       if (INTEGERP (val))
5363         {
               /* A single candidate charset: each remaining byte of the
                  code must lie within the corresponding range of the
                  charset's code space.  */
5364           charset = CHARSET_FROM_ID (XFASTINT (val));
5365           dim = CHARSET_DIMENSION (charset);
5366           for (idx = 1; idx < dim; idx++)
5367             {
                   /* The source ends inside a multi-byte code.  */
5368               if (src == src_end)
5369                 goto too_short;
5370               ONE_MORE_BYTE (c);
5371               if (c < charset->code_space[(dim - 1 - idx) * 4]
5372                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5373                 break;
5374             }
               /* The inner loop stopped early: a byte was out of range.  */
5375           if (idx < dim)
5376             break;
5377         }
5378       else
5379         {
               /* Several candidate charsets: try them in list order.  IDX
                  is not reset between candidates, so bytes already read
                  are not re-checked against the next candidate.  */
5380           idx = 1;
5381           for (; CONSP (val); val = XCDR (val))
5382             {
5383               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5384               dim = CHARSET_DIMENSION (charset);
5385               while (idx < dim)
5386                 {
5387                   if (src == src_end)
5388                     goto too_short;
5389                   ONE_MORE_BYTE (c);
5390                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5391                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5392                     break;
5393                   idx++;
5394                 }
                   /* All bytes fit this candidate; clearing VAL marks
                      success for the test after the loop.  */
5395               if (idx == dim)
5396                 {
5397                   val = Qnil;
5398                   break;
5399                 }
5400             }
               /* VAL is still a cons only if the loop above was left
                  without a matching candidate.  */
5401           if (CONSP (val))
5402             break;
5403         }
5404     }
       /* Reached by `break' from the main loop or by `goto too_short'.  */
5405  too_short:
5406   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5407   return 0;
5408
5409  no_more_source:
5410   detect_info->found |= found;
5411   return 1;
5412 }
5413
5414 static void
5415 decode_coding_charset (struct coding_system *coding)
5416 {
5417   const unsigned char *src = coding->source + coding->consumed;
5418   const unsigned char *src_end = coding->source + coding->src_bytes;
5419   const unsigned char *src_base;
5420   int *charbuf = coding->charbuf + coding->charbuf_used;
5421   /* We may produce one charset annotation in one loop and one more at
5422      the end.  */
5423   int *charbuf_end
5424     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5425   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5426   int multibytep = coding->src_multibyte;
5427   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5428   Lisp_Object valids;
5429   ptrdiff_t char_offset = coding->produced_char;
5430   ptrdiff_t last_offset = char_offset;
5431   int last_id = charset_ascii;
5432   int eol_dos =
5433     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5434   int byte_after_cr = -1;
5435
5436   valids = AREF (attrs, coding_attr_charset_valids);
5437
5438   while (1)
5439     {
5440       int c;
5441       Lisp_Object val;
5442       struct charset *charset;
5443       int dim;
5444       int len = 1;
5445       unsigned code;
5446
5447       src_base = src;
5448       consumed_chars_base = consumed_chars;
5449
5450       if (charbuf >= charbuf_end)
5451         {
5452           if (byte_after_cr >= 0)
5453             src_base--;
5454           break;
5455         }
5456
5457       if (byte_after_cr >= 0)
5458         {
5459           c = byte_after_cr;
5460           byte_after_cr = -1;
5461         }
5462       else
5463         {
5464           ONE_MORE_BYTE (c);
5465           if (eol_dos && c == '\r')
5466             ONE_MORE_BYTE (byte_after_cr);
5467         }
5468       if (c < 0)
5469         goto invalid_code;
5470       code = c;
5471
5472       val = AREF (valids, c);
5473       if (! INTEGERP (val) && ! CONSP (val))
5474         goto invalid_code;
5475       if (INTEGERP (val))
5476         {
5477           charset = CHARSET_FROM_ID (XFASTINT (val));
5478           dim = CHARSET_DIMENSION (charset);
5479           while (len < dim)
5480             {
5481               ONE_MORE_BYTE (c);
5482               code = (code << 8) | c;
5483               len++;
5484             }
5485           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5486                               charset, code, c);
5487         }
5488       else
5489         {
5490           /* VAL is a list of charset IDs.  It is assured that the
5491              list is sorted by charset dimensions (smaller one
5492              comes first).  */
5493           while (CONSP (val))
5494             {
5495               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5496               dim = CHARSET_DIMENSION (charset);
5497               while (len < dim)
5498                 {
5499                   ONE_MORE_BYTE (c);
5500                   code = (code << 8) | c;
5501                   len++;
5502                 }
5503               CODING_DECODE_CHAR (coding, src, src_base,
5504                                   src_end, charset, code, c);
5505               if (c >= 0)
5506                 break;
5507               val = XCDR (val);
5508             }
5509         }
5510       if (c < 0)
5511         goto invalid_code;
5512       if (charset->id != charset_ascii
5513           && last_id != charset->id)
5514         {
5515           if (last_id != charset_ascii)
5516             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5517           last_id = charset->id;
5518           last_offset = char_offset;
5519         }
5520
5521       *charbuf++ = c;
5522       char_offset++;
5523       continue;
5524
5525     invalid_code:
5526       src = src_base;
5527       consumed_chars = consumed_chars_base;
5528       ONE_MORE_BYTE (c);
5529       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5530       char_offset++;
5531       coding->errors++;
5532     }
5533
5534  no_more_source:
5535   if (last_id != charset_ascii)
5536     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5537   coding->consumed_char += consumed_chars_base;
5538   coding->consumed = src_base - coding->source;
5539   coding->charbuf_used = charbuf - coding->charbuf;
5540 }
5541
5542 static int
5543 encode_coding_charset (struct coding_system *coding)
5544 {
5545   int multibytep = coding->dst_multibyte;
5546   int *charbuf = coding->charbuf;
5547   int *charbuf_end = charbuf + coding->charbuf_used;
5548   unsigned char *dst = coding->destination + coding->produced;
5549   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5550   int safe_room = MAX_MULTIBYTE_LENGTH;
5551   ptrdiff_t produced_chars = 0;
5552   Lisp_Object attrs, charset_list;
5553   int ascii_compatible;
5554   int c;
5555
5556   CODING_GET_INFO (coding, attrs, charset_list);
5557   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5558
5559   while (charbuf < charbuf_end)
5560     {
5561       struct charset *charset;
5562       unsigned code;
5563
5564       ASSURE_DESTINATION (safe_room);
5565       c = *charbuf++;
5566       if (ascii_compatible && ASCII_CHAR_P (c))
5567         EMIT_ONE_ASCII_BYTE (c);
5568       else if (CHAR_BYTE8_P (c))
5569         {
5570           c = CHAR_TO_BYTE8 (c);
5571           EMIT_ONE_BYTE (c);
5572         }
5573       else
5574         {
5575           charset = char_charset (c, charset_list, &code);
5576           if (charset)
5577             {
5578               if (CHARSET_DIMENSION (charset) == 1)
5579                 EMIT_ONE_BYTE (code);
5580               else if (CHARSET_DIMENSION (charset) == 2)
5581                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5582               else if (CHARSET_DIMENSION (charset) == 3)
5583                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5584               else
5585                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5586                                  (code >> 8) & 0xFF, code & 0xFF);
5587             }
5588           else
5589             {
5590               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5591                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5592               else
5593                 c = coding->default_char;
5594               EMIT_ONE_BYTE (c);
5595             }
5596         }
5597     }
5598
5599   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5600   coding->produced_char += produced_chars;
5601   coding->produced = dst - coding->destination;
5602   return 0;
5603 }
5604
5605 \f
5606 /*** 10. C library functions ***/
5607
5608 /* Setup coding context CODING from information about CODING_SYSTEM.
5609    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5610    CODING_SYSTEM is invalid, signal an error.  */
5611
5612 void
5613 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5614 {
5615   Lisp_Object attrs;
5616   Lisp_Object eol_type;
5617   Lisp_Object coding_type;
5618   Lisp_Object val;
5619
5620   if (NILP (coding_system))
5621     coding_system = Qundecided;
5622
5623   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5624
5625   attrs = CODING_ID_ATTRS (coding->id);
5626   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5627
5628   coding->mode = 0;
5629   coding->head_ascii = -1;
5630   if (VECTORP (eol_type))
5631     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632                             | CODING_REQUIRE_DETECTION_MASK);
5633   else if (! EQ (eol_type, Qunix))
5634     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5635                             | CODING_REQUIRE_ENCODING_MASK);
5636   else
5637     coding->common_flags = 0;
5638   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5639     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5640   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5641     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5642   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5643     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5644
5645   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5646   coding->max_charset_id = SCHARS (val) - 1;
5647   coding->safe_charsets = SDATA (val);
5648   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5649   coding->carryover_bytes = 0;
5650
5651   coding_type = CODING_ATTR_TYPE (attrs);
5652   if (EQ (coding_type, Qundecided))
5653     {
5654       coding->detector = NULL;
5655       coding->decoder = decode_coding_raw_text;
5656       coding->encoder = encode_coding_raw_text;
5657       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5658     }
5659   else if (EQ (coding_type, Qiso_2022))
5660     {
5661       int i;
5662       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5663
5664       /* Invoke graphic register 0 to plane 0.  */
5665       CODING_ISO_INVOCATION (coding, 0) = 0;
5666       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5667       CODING_ISO_INVOCATION (coding, 1)
5668         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5669       /* Setup the initial status of designation.  */
5670       for (i = 0; i < 4; i++)
5671         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5672       /* Not single shifting initially.  */
5673       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5674       /* Beginning of buffer should also be regarded as bol. */
5675       CODING_ISO_BOL (coding) = 1;
5676       coding->detector = detect_coding_iso_2022;
5677       coding->decoder = decode_coding_iso_2022;
5678       coding->encoder = encode_coding_iso_2022;
5679       if (flags & CODING_ISO_FLAG_SAFE)
5680         coding->mode |= CODING_MODE_SAFE_ENCODING;
5681       coding->common_flags
5682         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5683             | CODING_REQUIRE_FLUSHING_MASK);
5684       if (flags & CODING_ISO_FLAG_COMPOSITION)
5685         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5686       if (flags & CODING_ISO_FLAG_DESIGNATION)
5687         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5688       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5689         {
5690           setup_iso_safe_charsets (attrs);
5691           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5692           coding->max_charset_id = SCHARS (val) - 1;
5693           coding->safe_charsets = SDATA (val);
5694         }
5695       CODING_ISO_FLAGS (coding) = flags;
5696       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5697       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5698       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5699       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5700     }
5701   else if (EQ (coding_type, Qcharset))
5702     {
5703       coding->detector = detect_coding_charset;
5704       coding->decoder = decode_coding_charset;
5705       coding->encoder = encode_coding_charset;
5706       coding->common_flags
5707         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5708     }
5709   else if (EQ (coding_type, Qutf_8))
5710     {
5711       val = AREF (attrs, coding_attr_utf_bom);
5712       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5713                                    : EQ (val, Qt) ? utf_with_bom
5714                                    : utf_without_bom);
5715       coding->detector = detect_coding_utf_8;
5716       coding->decoder = decode_coding_utf_8;
5717       coding->encoder = encode_coding_utf_8;
5718       coding->common_flags
5719         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5720       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5721         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5722     }
5723   else if (EQ (coding_type, Qutf_16))
5724     {
5725       val = AREF (attrs, coding_attr_utf_bom);
5726       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5727                                     : EQ (val, Qt) ? utf_with_bom
5728                                     : utf_without_bom);
5729       val = AREF (attrs, coding_attr_utf_16_endian);
5730       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5731                                        : utf_16_little_endian);
5732       CODING_UTF_16_SURROGATE (coding) = 0;
5733       coding->detector = detect_coding_utf_16;
5734       coding->decoder = decode_coding_utf_16;
5735       coding->encoder = encode_coding_utf_16;
5736       coding->common_flags
5737         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5738       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5739         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740     }
5741   else if (EQ (coding_type, Qccl))
5742     {
5743       coding->detector = detect_coding_ccl;
5744       coding->decoder = decode_coding_ccl;
5745       coding->encoder = encode_coding_ccl;
5746       coding->common_flags
5747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5748             | CODING_REQUIRE_FLUSHING_MASK);
5749     }
5750   else if (EQ (coding_type, Qemacs_mule))
5751     {
5752       coding->detector = detect_coding_emacs_mule;
5753       coding->decoder = decode_coding_emacs_mule;
5754       coding->encoder = encode_coding_emacs_mule;
5755       coding->common_flags
5756         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5757       coding->spec.emacs_mule.full_support = 1;
5758       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5759           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5760         {
5761           Lisp_Object tail, safe_charsets;
5762           int max_charset_id = 0;
5763
5764           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5765                tail = XCDR (tail))
5766             if (max_charset_id < XFASTINT (XCAR (tail)))
5767               max_charset_id = XFASTINT (XCAR (tail));
5768           safe_charsets = make_uninit_string (max_charset_id + 1);
5769           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5770           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5771                tail = XCDR (tail))
5772             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5773           coding->max_charset_id = max_charset_id;
5774           coding->safe_charsets = SDATA (safe_charsets);
5775           coding->spec.emacs_mule.full_support = 1;
5776         }
5777       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5778       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5779     }
5780   else if (EQ (coding_type, Qshift_jis))
5781     {
5782       coding->detector = detect_coding_sjis;
5783       coding->decoder = decode_coding_sjis;
5784       coding->encoder = encode_coding_sjis;
5785       coding->common_flags
5786         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5787     }
5788   else if (EQ (coding_type, Qbig5))
5789     {
5790       coding->detector = detect_coding_big5;
5791       coding->decoder = decode_coding_big5;
5792       coding->encoder = encode_coding_big5;
5793       coding->common_flags
5794         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5795     }
5796   else                          /* EQ (coding_type, Qraw_text) */
5797     {
5798       coding->detector = NULL;
5799       coding->decoder = decode_coding_raw_text;
5800       coding->encoder = encode_coding_raw_text;
5801       if (! EQ (eol_type, Qunix))
5802         {
5803           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5804           if (! VECTORP (eol_type))
5805             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5806         }
5807
5808     }
5809
5810   return;
5811 }
5812
5813 /* Return a list of charsets supported by CODING.  */
5814
5815 Lisp_Object
5816 coding_charset_list (struct coding_system *coding)
5817 {
5818   Lisp_Object attrs, charset_list;
5819
5820   CODING_GET_INFO (coding, attrs, charset_list);
5821   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5822     {
5823       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5824
5825       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5826         charset_list = Viso_2022_charset_list;
5827     }
5828   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5829     {
5830       charset_list = Vemacs_mule_charset_list;
5831     }
5832   return charset_list;
5833 }
5834
5835
5836 /* Return a list of charsets supported by CODING-SYSTEM.  */
5837
5838 Lisp_Object
5839 coding_system_charset_list (Lisp_Object coding_system)
5840 {
5841   ptrdiff_t id;
5842   Lisp_Object attrs, charset_list;
5843
5844   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5845   attrs = CODING_ID_ATTRS (id);
5846
5847   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5848     {
5849       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5850
5851       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5852         charset_list = Viso_2022_charset_list;
5853       else
5854         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5855     }
5856   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5857     {
5858       charset_list = Vemacs_mule_charset_list;
5859     }
5860   else
5861     {
5862       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5863     }
5864   return charset_list;
5865 }
5866
5867
5868 /* Return raw-text or one of its subsidiaries that has the same
5869    eol_type as CODING-SYSTEM.  */
5870
5871 Lisp_Object
5872 raw_text_coding_system (Lisp_Object coding_system)
5873 {
5874   Lisp_Object spec, attrs;
5875   Lisp_Object eol_type, raw_text_eol_type;
5876
5877   if (NILP (coding_system))
5878     return Qraw_text;
5879   spec = CODING_SYSTEM_SPEC (coding_system);
5880   attrs = AREF (spec, 0);
5881
5882   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5883     return coding_system;
5884
5885   eol_type = AREF (spec, 2);
5886   if (VECTORP (eol_type))
5887     return Qraw_text;
5888   spec = CODING_SYSTEM_SPEC (Qraw_text);
5889   raw_text_eol_type = AREF (spec, 2);
5890   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5891           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5892           : AREF (raw_text_eol_type, 2));
5893 }
5894
5895
5896 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5897    the subsidiary that has the same eol-spec as PARENT (if it is not
5898    nil and specifies end-of-line format) or the system's setting
5899    (system_eol_type).  */
5900
5901 Lisp_Object
5902 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5903 {
5904   Lisp_Object spec, eol_type;
5905
5906   if (NILP (coding_system))
5907     coding_system = Qraw_text;
5908   spec = CODING_SYSTEM_SPEC (coding_system);
5909   eol_type = AREF (spec, 2);
5910   if (VECTORP (eol_type))
5911     {
5912       Lisp_Object parent_eol_type;
5913
5914       if (! NILP (parent))
5915         {
5916           Lisp_Object parent_spec;
5917
5918           parent_spec = CODING_SYSTEM_SPEC (parent);
5919           parent_eol_type = AREF (parent_spec, 2);
5920           if (VECTORP (parent_eol_type))
5921             parent_eol_type = system_eol_type;
5922         }
5923       else
5924         parent_eol_type = system_eol_type;
5925       if (EQ (parent_eol_type, Qunix))
5926         coding_system = AREF (eol_type, 0);
5927       else if (EQ (parent_eol_type, Qdos))
5928         coding_system = AREF (eol_type, 1);
5929       else if (EQ (parent_eol_type, Qmac))
5930         coding_system = AREF (eol_type, 2);
5931     }
5932   return coding_system;
5933 }
5934
5935
5936 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5937    decided for writing to a process.  If not, complement them, and
5938    return a new coding system.  */
5939
5940 Lisp_Object
5941 complement_process_encoding_system (Lisp_Object coding_system)
5942 {
5943   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5944   Lisp_Object spec, attrs;
5945   int i;
5946
5947   for (i = 0; i < 3; i++)
5948     {
5949       if (i == 1)
5950         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5951       else if (i == 2)
5952         coding_system = preferred_coding_system ();
5953       spec = CODING_SYSTEM_SPEC (coding_system);
5954       if (NILP (spec))
5955         continue;
5956       attrs = AREF (spec, 0);
5957       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5958         coding_base = CODING_ATTR_BASE_NAME (attrs);
5959       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5960         eol_base = coding_system;
5961       if (! NILP (coding_base) && ! NILP (eol_base))
5962         break;
5963     }
5964
5965   if (i > 0)
5966     /* The original CODING_SYSTEM didn't specify text-conversion or
5967        eol-conversion.  Be sure that we return a fully complemented
5968        coding system.  */
5969     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5970   return coding_system;
5971 }
5972
5973
5974 /* Emacs has a mechanism to automatically detect a coding system if it
5975    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5976    it's impossible to distinguish some coding systems accurately
5977    because they use the same range of codes.  So, at first, coding
5978    systems are grouped into the following categories:
5979
5980    o coding-category-emacs-mule
5981
5982         The category for a coding system which has the same code range
5983         as Emacs' internal format.  Assigned the coding-system (Lisp
5984         symbol) `emacs-mule' by default.
5985
5986    o coding-category-sjis
5987
5988         The category for a coding system which has the same code range
5989         as SJIS.  Assigned the coding-system (Lisp
5990         symbol) `japanese-shift-jis' by default.
5991
5992    o coding-category-iso-7
5993
5994         The category for a coding system which has the same code range
5995         as ISO2022 of 7-bit environment.  This doesn't use any locking
5996         shift and single shift functions.  This can encode/decode all
5997         charsets.  Assigned the coding-system (Lisp symbol)
5998         `iso-2022-7bit' by default.
5999
6000    o coding-category-iso-7-tight
6001
6002         Same as coding-category-iso-7 except that this can
6003         encode/decode only the specified charsets.
6004
6005    o coding-category-iso-8-1
6006
6007         The category for a coding system which has the same code range
6008         as ISO2022 of 8-bit environment and graphic plane 1 used only
6009         for DIMENSION1 charset.  This doesn't use any locking shift
6010         and single shift functions.  Assigned the coding-system (Lisp
6011         symbol) `iso-latin-1' by default.
6012
6013    o coding-category-iso-8-2
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 8-bit environment and graphic plane 1 used only
6017         for DIMENSION2 charset.  This doesn't use any locking shift
6018         and single shift functions.  Assigned the coding-system (Lisp
6019         symbol) `japanese-iso-8bit' by default.
6020
6021    o coding-category-iso-7-else
6022
6023         The category for a coding system which has the same code range
6024         as ISO2022 of 7-bit environment but uses locking shift or
6025         single shift functions.  Assigned the coding-system (Lisp
6026         symbol) `iso-2022-7bit-lock' by default.
6027
6028    o coding-category-iso-8-else
6029
6030         The category for a coding system which has the same code range
6031         as ISO2022 of 8-bit environment but uses locking shift or
6032         single shift functions.  Assigned the coding-system (Lisp
6033         symbol) `iso-2022-8bit-ss2' by default.
6034
6035    o coding-category-big5
6036
6037         The category for a coding system which has the same code range
6038         as BIG5.  Assigned the coding-system (Lisp symbol)
6039         `cn-big5' by default.
6040
6041    o coding-category-utf-8
6042
6043         The category for a coding system which has the same code range
6044         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6045         symbol) `utf-8' by default.
6046
6047    o coding-category-utf-16-be
6048
6049         The category for a coding system in which a text has an
6050         Unicode signature (cf. Unicode Standard) in the order of BIG
6051         endian at the head.  Assigned the coding-system (Lisp symbol)
6052         `utf-16-be' by default.
6053
6054    o coding-category-utf-16-le
6055
6056         The category for a coding system in which a text has an
6057         Unicode signature (cf. Unicode Standard) in the order of
6058         LITTLE endian at the head.  Assigned the coding-system (Lisp
6059         symbol) `utf-16-le' by default.
6060
6061    o coding-category-ccl
6062
6063         The category for a coding system of which encoder/decoder is
6064         written in CCL programs.  The default value is nil, i.e., no
6065         coding system is assigned.
6066
6067    o coding-category-binary
6068
6069         The category for a coding system not categorized in any of the
6070         above.  Assigned the coding-system (Lisp symbol)
6071         `no-conversion' by default.
6072
6073    Each of them is a Lisp symbol and the value is an actual
6074    `coding-system's (this is also a Lisp symbol) assigned by a user.
6075    What Emacs does actually is to detect a category of coding system.
6076    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6077    decide only one possible category, it selects a category of the
6078    highest priority.  Priorities of categories are also specified by a
6079    user in a Lisp variable `coding-category-list'.
6080
6081 */
6082
6083 #define EOL_SEEN_NONE   0
6084 #define EOL_SEEN_LF     1
6085 #define EOL_SEEN_CR     2
6086 #define EOL_SEEN_CRLF   4
6087
6088 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6089    SOURCE is encoded.  If CATEGORY is one of
6090    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6091    two-byte, else they are encoded by one-byte.
6092
6093    Return one of EOL_SEEN_XXX.  */
6094
6095 #define MAX_EOL_CHECK_COUNT 3
6096
6097 static int
6098 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6099             enum coding_category category)
6100 {
6101   const unsigned char *src = source, *src_end = src + src_bytes;
6102   unsigned char c;
6103   int total  = 0;
6104   int eol_seen = EOL_SEEN_NONE;
6105
6106   if ((1 << category) & CATEGORY_MASK_UTF_16)
6107     {
6108       int msb, lsb;
6109
6110       msb = category == (coding_category_utf_16_le
6111                          | coding_category_utf_16_le_nosig);
6112       lsb = 1 - msb;
6113
6114       while (src + 1 < src_end)
6115         {
6116           c = src[lsb];
6117           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6118             {
6119               int this_eol;
6120
6121               if (c == '\n')
6122                 this_eol = EOL_SEEN_LF;
6123               else if (src + 3 >= src_end
6124                        || src[msb + 2] != 0
6125                        || src[lsb + 2] != '\n')
6126                 this_eol = EOL_SEEN_CR;
6127               else
6128                 {
6129                   this_eol = EOL_SEEN_CRLF;
6130                   src += 2;
6131                 }
6132
6133               if (eol_seen == EOL_SEEN_NONE)
6134                 /* This is the first end-of-line.  */
6135                 eol_seen = this_eol;
6136               else if (eol_seen != this_eol)
6137                 {
6138                   /* The found type is different from what found before.
6139                      Allow for stray ^M characters in DOS EOL files.  */
6140                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6141                       || (eol_seen == EOL_SEEN_CRLF
6142                           && this_eol == EOL_SEEN_CR))
6143                     eol_seen = EOL_SEEN_CRLF;
6144                   else
6145                     {
6146                       eol_seen = EOL_SEEN_LF;
6147                       break;
6148                     }
6149                 }
6150               if (++total == MAX_EOL_CHECK_COUNT)
6151                 break;
6152             }
6153           src += 2;
6154         }
6155     }
6156   else
6157     while (src < src_end)
6158       {
6159         c = *src++;
6160         if (c == '\n' || c == '\r')
6161           {
6162             int this_eol;
6163
6164             if (c == '\n')
6165               this_eol = EOL_SEEN_LF;
6166             else if (src >= src_end || *src != '\n')
6167               this_eol = EOL_SEEN_CR;
6168             else
6169               this_eol = EOL_SEEN_CRLF, src++;
6170
6171             if (eol_seen == EOL_SEEN_NONE)
6172               /* This is the first end-of-line.  */
6173               eol_seen = this_eol;
6174             else if (eol_seen != this_eol)
6175               {
6176                 /* The found type is different from what found before.
6177                    Allow for stray ^M characters in DOS EOL files.  */
6178                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6179                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6180                   eol_seen = EOL_SEEN_CRLF;
6181                 else
6182                   {
6183                     eol_seen = EOL_SEEN_LF;
6184                     break;
6185                   }
6186               }
6187             if (++total == MAX_EOL_CHECK_COUNT)
6188               break;
6189           }
6190       }
6191   return eol_seen;
6192 }
6193
6194
6195 static Lisp_Object
6196 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6197 {
6198   Lisp_Object eol_type;
6199
6200   eol_type = CODING_ID_EOL_TYPE (coding->id);
6201   if (eol_seen & EOL_SEEN_LF)
6202     {
6203       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6204       eol_type = Qunix;
6205     }
6206   else if (eol_seen & EOL_SEEN_CRLF)
6207     {
6208       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6209       eol_type = Qdos;
6210     }
6211   else if (eol_seen & EOL_SEEN_CR)
6212     {
6213       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6214       eol_type = Qmac;
6215     }
6216   return eol_type;
6217 }
6218
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   The bytes examined are the CODING->src_bytes bytes starting at
   CODING->source (after a call to coding_set_source).  What is done
   depends on the coding system CODING->id currently names:

   - type `undecided': scan the text and try the detectors of the
     coding categories in the order given by coding_priorities;
   - category utf-8-auto or utf-16-auto: run only the matching UTF
     detector and pick one of the two coding systems stored in the
     coding_attr_utf_bom attribute;
   - anything else: nothing is detected.

   In all cases CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char are reset to zero and
   the value CODING->mode had on entry is stored back before
   returning.  */

static void
detect_coding (struct coding_system *coding)
{
  const unsigned char *src, *src_end;
  int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Scan the source.  CODING->head_ascii counts the bytes seen
         before the first 8-bit byte.  The scan is abandoned as soon
         as both a null byte and an 8-bit byte have been seen, or
         when detect_coding_iso_2022 returns nonzero.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried at most once (it is
                 skipped when detect_info.checked is already
                 nonzero).  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Choose a coding system only if the scan saw a null byte or
         an 8-bit byte, stopped before the end of the source, or a
         detector already reported a category; otherwise CODING is
         left as it is.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category outside CATEGORY_MASK_UTF_16
                     as already checked and rejected, so that only
                     the remaining detectors are run below.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Walk the categories in priority order and leave the
                 loop at the first one that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is remapped here, but
                         THIS still points at the utf-16-auto entry,
                         and THIS is what the setup_coding_system
                         call below uses; CATEGORY is not read again
                         in this function.  Confirm this is
                         intended.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* I is less than coding_category_raw_text only when one of
             the loops above was left by `break', i.e. THIS is the
             category that was found.  Otherwise fall back, in order,
             to no-conversion (a null byte was seen), raw-text (all
             categories in CATEGORY_MASK_ANY were rejected), or the
             highest-priority category that was not rejected.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* The coding_attr_utf_bom attribute is used here as a cons:
         its car is selected when the detector reports
         CATEGORY_MASK_UTF_8_SIG, its cdr otherwise.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* Same layout as above: the car is selected for
         CATEGORY_MASK_UTF_16_LE, the cdr for CATEGORY_MASK_UTF_16_BE.
         If neither is reported, CODING is left unchanged.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6400
6401
6402 static void
6403 decode_eol (struct coding_system *coding)
6404 {
6405   Lisp_Object eol_type;
6406   unsigned char *p, *pbeg, *pend;
6407
6408   eol_type = CODING_ID_EOL_TYPE (coding->id);
6409   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6410     return;
6411
6412   if (NILP (coding->dst_object))
6413     pbeg = coding->destination;
6414   else
6415     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6416   pend = pbeg + coding->produced;
6417
6418   if (VECTORP (eol_type))
6419     {
6420       int eol_seen = EOL_SEEN_NONE;
6421
6422       for (p = pbeg; p < pend; p++)
6423         {
6424           if (*p == '\n')
6425             eol_seen |= EOL_SEEN_LF;
6426           else if (*p == '\r')
6427             {
6428               if (p + 1 < pend && *(p + 1) == '\n')
6429                 {
6430                   eol_seen |= EOL_SEEN_CRLF;
6431                   p++;
6432                 }
6433               else
6434                 eol_seen |= EOL_SEEN_CR;
6435             }
6436         }
6437       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6438       if ((eol_seen & EOL_SEEN_CRLF) != 0
6439           && (eol_seen & EOL_SEEN_CR) != 0
6440           && (eol_seen & EOL_SEEN_LF) == 0)
6441         eol_seen = EOL_SEEN_CRLF;
6442       else if (eol_seen != EOL_SEEN_NONE
6443           && eol_seen != EOL_SEEN_LF
6444           && eol_seen != EOL_SEEN_CRLF
6445           && eol_seen != EOL_SEEN_CR)
6446         eol_seen = EOL_SEEN_LF;
6447       if (eol_seen != EOL_SEEN_NONE)
6448         eol_type = adjust_coding_eol_type (coding, eol_seen);
6449     }
6450
6451   if (EQ (eol_type, Qmac))
6452     {
6453       for (p = pbeg; p < pend; p++)
6454         if (*p == '\r')
6455           *p = '\n';
6456     }
6457   else if (EQ (eol_type, Qdos))
6458     {
6459       ptrdiff_t n = 0;
6460
6461       if (NILP (coding->dst_object))
6462         {
6463           /* Start deleting '\r' from the tail to minimize the memory
6464              movement.  */
6465           for (p = pend - 2; p >= pbeg; p--)
6466             if (*p == '\r')
6467               {
6468                 memmove (p, p + 1, pend-- - p - 1);
6469                 n++;
6470               }
6471         }
6472       else
6473         {
6474           ptrdiff_t pos_byte = coding->dst_pos_byte;
6475           ptrdiff_t pos = coding->dst_pos;
6476           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6477
6478           while (pos < pos_end)
6479             {
6480               p = BYTE_POS_ADDR (pos_byte);
6481               if (*p == '\r' && p[1] == '\n')
6482                 {
6483                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6484                   n++;
6485                   pos_end--;
6486                 }
6487               pos++;
6488               if (coding->dst_multibyte)
6489                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6490               else
6491                 pos_byte++;
6492             }
6493         }
6494       coding->produced -= n;
6495       coding->produced_char -= n;
6496     }
6497 }
6498
6499
6500 /* Return a translation table (or list of them) from coding system
6501    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6502    decoding (ENCODEP is zero). */
6503
6504 static Lisp_Object
6505 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6506 {
6507   Lisp_Object standard, translation_table;
6508   Lisp_Object val;
6509
6510   if (NILP (Venable_character_translation))
6511     {
6512       if (max_lookup)
6513         *max_lookup = 0;
6514       return Qnil;
6515     }
6516   if (encodep)
6517     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6518       standard = Vstandard_translation_table_for_encode;
6519   else
6520     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6521       standard = Vstandard_translation_table_for_decode;
6522   if (NILP (translation_table))
6523     translation_table = standard;
6524   else
6525     {
6526       if (SYMBOLP (translation_table))
6527         translation_table = Fget (translation_table, Qtranslation_table);
6528       else if (CONSP (translation_table))
6529         {
6530           translation_table = Fcopy_sequence (translation_table);
6531           for (val = translation_table; CONSP (val); val = XCDR (val))
6532             if (SYMBOLP (XCAR (val)))
6533               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6534         }
6535       if (CHAR_TABLE_P (standard))
6536         {
6537           if (CONSP (translation_table))
6538             translation_table = nconc2 (translation_table,
6539                                         Fcons (standard, Qnil));
6540           else
6541             translation_table = Fcons (translation_table,
6542                                        Fcons (standard, Qnil));
6543         }
6544     }
6545
6546   if (max_lookup)
6547     {
6548       *max_lookup = 1;
6549       if (CHAR_TABLE_P (translation_table)
6550           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6551         {
6552           val = XCHAR_TABLE (translation_table)->extras[1];
6553           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6554             *max_lookup = XFASTINT (val);
6555         }
6556       else if (CONSP (translation_table))
6557         {
6558           Lisp_Object tail;
6559
6560           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6561             if (CHAR_TABLE_P (XCAR (tail))
6562                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6563               {
6564                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6565                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6566                   *max_lookup = XFASTINT (tailval);
6567               }
6568         }
6569     }
6570   return translation_table;
6571 }
6572
/* Look up character C in TABLE, which is a char-table or a list whose
   elements may be char-tables, and leave the result in C and TRANS.

   TRANS is first set to nil.  When TABLE is a char-table and its
   entry for C is a character, C is replaced by that character and
   TRANS stays nil; any other entry is left in TRANS.  When TABLE is
   a list, its char-table elements are tried in order (other elements
   are skipped): a character entry replaces C and the search goes on
   with the new C, and the search stops at the first entry that is
   neither nil nor a character, which is left in TRANS.

   C and TRANS must be assignable, and all three arguments are
   evaluated more than once.  The expansion declares a local named
   `tail'.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6597
6598
6599 /* Return a translation of character(s) at BUF according to TRANS.
6600    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6601    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6602    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6603    translation is found, and Qnil if not found..
6604    If BUF is too short to lookup characters in FROM, return Qt.  */
6605
6606 static Lisp_Object
6607 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6608 {
6609
6610   if (INTEGERP (trans))
6611     return trans;
6612   for (; CONSP (trans); trans = XCDR (trans))
6613     {
6614       Lisp_Object val = XCAR (trans);
6615       Lisp_Object from = XCAR (val);
6616       ptrdiff_t len = ASIZE (from);
6617       ptrdiff_t i;
6618
6619       for (i = 0; i < len; i++)
6620         {
6621           if (buf + i == buf_end)
6622             return Qt;
6623           if (XINT (AREF (from, i)) != buf[i])
6624             break;
6625         }
6626       if (i == len)
6627         return val;
6628     }
6629   return Qnil;
6630 }
6631
6632
/* Store what CODING has decoded so far at CODING->destination +
   CODING->produced, and return the number of elements of
   CODING->charbuf that were left unprocessed (always 0 when the
   characters come from CODING->source).

   If CODING->chars_at_source is zero, the characters are the first
   CODING->charbuf_used elements of CODING->charbuf.  Each
   non-negative element is translated through TRANSLATION_TABLE and
   stored; a negative element -N starts an annotation datum of N
   elements, which is skipped.  Otherwise the CODING->consumed bytes
   at CODING->source are copied, byte by byte via ONE_MORE_BYTE or
   EMIT_ONE_BYTE when CODING->src_multibyte and CODING->dst_multibyte
   differ, and verbatim when they are equal.

   LAST_BLOCK nonzero means that a translation which needs more
   characters than remain in charbuf (get_translation returned Qt)
   must not be postponed: the character is then stored as looked up.
   When LAST_BLOCK is zero, processing stops at such a character and
   the rest of charbuf is reported as the return value.

   The destination is enlarged with alloc_destination when it is too
   small.  On return CODING->produced and CODING->produced_char have
   been increased by the bytes and characters stored, and
   insert_from_gap has been called if CODING->dst_object is a buffer
   and at least one character was produced.  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      /* When source and destination are the same object, the
         destination may extend only up to the part of the source
         that has already been consumed.  */
      if (EQ (coding->src_object, coding->dst_object))
        {
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* FROM_NCHARS elements of charbuf are replaced by
                 TO_NCHARS characters.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is ([FROM-CHAR ...] . TO).  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* Not enough characters to decide; leave the
                       rest of charbuf to the caller as carryover.  */
                    break;
                }

              /* Make sure there is room for TO_NCHARS characters of
                 MAX_MULTIBYTE_LENGTH bytes each.  */
              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Guard the size computation below against
                     overflow.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              /* Store C, then the remaining elements of the TO
                 vector if TO_NCHARS is greater than 1.  */
              for (i = 0; i < to_nchars; i++)
                {
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): multibytep, consumed_chars and
                 src_base are not referenced directly below;
                 presumably ONE_MORE_BYTE (defined elsewhere in this
                 file) uses them and jumps to no_more_source when
                 SRC reaches SRC_END.  Confirm against the macro.  */
              int multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* In-place conversion: the bytes just read
                         have become available as destination.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* alloc_destination may move the text, so
                             remember SRC as an offset and recompute
                             the pointers afterwards.  */
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): multibytep is presumably read by
                   EMIT_ONE_BYTE (defined elsewhere in this file);
                   confirm against the macro.  */
                int multibytep = 1;
                int c = *src++;

                /* Keep at least two bytes of room at DST.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination have the same multibyteness:
             copy the bytes verbatim.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6830
6831 /* Compose text in CODING->object according to the annotation data at
6832    CHARBUF.  CHARBUF is an array:
6833      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6834  */
6835
6836 static inline void
6837 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6838 {
6839   int len;
6840   ptrdiff_t to;
6841   enum composition_method method;
6842   Lisp_Object components;
6843
6844   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6845   to = pos + charbuf[2];
6846   method = (enum composition_method) (charbuf[4]);
6847
6848   if (method == COMPOSITION_RELATIVE)
6849     components = Qnil;
6850   else
6851     {
6852       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6853       int i, j;
6854
6855       if (method == COMPOSITION_WITH_RULE)
6856         len = charbuf[2] * 3 - 2;
6857       charbuf += MAX_ANNOTATION_LENGTH;
6858       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6859       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6860         {
6861           if (charbuf[i] >= 0)
6862             args[j] = make_number (charbuf[i]);
6863           else
6864             {
6865               i++;
6866               args[j] = make_number (charbuf[i] % 0x100);
6867             }
6868         }
6869       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6870     }
6871   compose_text (pos, to, components, Qnil, coding->dst_object);
6872 }
6873
6874
6875 /* Put `charset' property on text in CODING->object according to
6876    the annotation data at CHARBUF.  CHARBUF is an array:
6877      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6878  */
6879
6880 static inline void
6881 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6882 {
6883   ptrdiff_t from = pos - charbuf[2];
6884   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6885
6886   Fput_text_property (make_number (from), make_number (pos),
6887                       Qcharset, CHARSET_NAME (charset),
6888                       coding->dst_object);
6889 }
6890
6891
/* Number of `int' elements first tried for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work buffer CODING->charbuf with alloca, and record
   its size, counted in `int' elements, in CODING->charbuf_size.

   The first attempt uses CHARBUF_SIZE elements; after a NULL result
   the size is halved and the allocation retried while the size is
   above 1024.  If CODING->charbuf is still NULL afterwards, the
   result CODING_RESULT_INSUFFICIENT_MEM is recorded and the macro
   executes `return coding->result;' in the function that uses it.

   Because of the alloca and the `return', this can only be used
   directly in the body of a function that returns the conversion
   result, and the buffer is only valid until that function returns.

   NOTE(review): alloca normally gives no failure indication, so the
   NULL checks and the halving loop may never take effect; confirm
   for the supported platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6913
6914
6915 static void
6916 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6917 {
6918   int *charbuf = coding->charbuf;
6919   int *charbuf_end = charbuf + coding->charbuf_used;
6920
6921   if (NILP (coding->dst_object))
6922     return;
6923
6924   while (charbuf < charbuf_end)
6925     {
6926       if (*charbuf >= 0)
6927         pos++, charbuf++;
6928       else
6929         {
6930           int len = -*charbuf;
6931
6932           if (len > 2)
6933             switch (charbuf[1])
6934               {
6935               case CODING_ANNOTATE_COMPOSITION_MASK:
6936                 produce_composition (coding, charbuf, pos);
6937                 break;
6938               case CODING_ANNOTATE_CHARSET_MASK:
6939                 produce_charset (coding, charbuf, pos);
6940                 break;
6941               }
6942           charbuf += len;
6943         }
6944     }
6945 }
6946
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   Source bytes that the decoder leaves unconsumed are either stored
   as they are (when CODING->mode has CODING_MODE_LAST_BLOCK) or
   saved in CODING->carryover, with their count in
   CODING->carryover_bytes.

   Return CODING->result.
*/

static int
decode_coding (struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the source region of a buffer straddles the gap, move the gap
     to the start of the region.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      /* Set the buffer's undo list to t for the duration of the
         conversion.  It is put back at the end, followed by a single
         record_insert call that covers all the produced text.  */
      undo_list = BVAR (current_buffer, undo_list);
      BVAR (current_buffer, undo_list) = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* This macro may return from this function.
     NOTE(review): on that path the undo list saved above would not
     be restored.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Each round calls the decoder and then produce_chars, which
     stores the contents of charbuf in the destination.  The CARRYOVER
     elements that produce_chars could not process yet are moved to
     the head of charbuf for the next round.  */
  do
    {
      /* Position of the first character produced in this round; used
         for the annotations.  */
      ptrdiff_t pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Store the characters still pending in charbuf, this time as the
     last block so that nothing is postponed.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to these bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Nevertheless, never copy more than the array holds.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      BVAR (current_buffer, undo_list) = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7096
7097
7098 /* Extract an annotation datum from a composition starting at POS and
7099    ending before LIMIT of CODING->src_object (buffer or string), store
7100    the data in BUF, set *STOP to a starting position of the next
7101    composition (if any) or to LIMIT, and return the address of the
7102    next element of BUF.
7103
7104    If such an annotation is not found, set *STOP to a starting
7105    position of a composition after POS (if any) or to LIMIT, and
7106    return BUF.  */
7107
7108 static inline int *
7109 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7110                                struct coding_system *coding, int *buf,
7111                                ptrdiff_t *stop)
7112 {
7113   ptrdiff_t start, end;
7114   Lisp_Object prop;
7115
7116   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7117       || end > limit)
7118     *stop = limit;
7119   else if (start > pos)
7120     *stop = start;
7121   else
7122     {
7123       if (start == pos)
7124         {
7125           /* We found a composition.  Store the corresponding
7126              annotation data in BUF.  */
7127           int *head = buf;
7128           enum composition_method method = COMPOSITION_METHOD (prop);
7129           int nchars = COMPOSITION_LENGTH (prop);
7130
7131           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7132           if (method != COMPOSITION_RELATIVE)
7133             {
7134               Lisp_Object components;
7135               ptrdiff_t i, len, i_byte;
7136
7137               components = COMPOSITION_COMPONENTS (prop);
7138               if (VECTORP (components))
7139                 {
7140                   len = ASIZE (components);
7141                   for (i = 0; i < len; i++)
7142                     *buf++ = XINT (AREF (components, i));
7143                 }
7144               else if (STRINGP (components))
7145                 {
7146                   len = SCHARS (components);
7147                   i = i_byte = 0;
7148                   while (i < len)
7149                     {
7150                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7151                       buf++;
7152                     }
7153                 }
7154               else if (INTEGERP (components))
7155                 {
7156                   len = 1;
7157                   *buf++ = XINT (components);
7158                 }
7159               else if (CONSP (components))
7160                 {
7161                   for (len = 0; CONSP (components);
7162                        len++, components = XCDR (components))
7163                     *buf++ = XINT (XCAR (components));
7164                 }
7165               else
7166                 abort ();
7167               *head -= len;
7168             }
7169         }
7170
7171       if (find_composition (end, limit, &start, &end, &prop,
7172                             coding->src_object)
7173           && end <= limit)
7174         *stop = start;
7175       else
7176         *stop = limit;
7177     }
7178   return buf;
7179 }
7180
7181
7182 /* Extract an annotation datum from a text property `charset' at POS of
7183    CODING->src_object (buffer of string), store the data in BUF, set
7184    *STOP to the position where the value of `charset' property changes
7185    (limiting by LIMIT), and return the address of the next element of
7186    BUF.
7187
7188    If the property value is nil, set *STOP to the position where the
7189    property value is non-nil (limiting by LIMIT), and return BUF.  */
7190
7191 static inline int *
7192 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7193                            struct coding_system *coding, int *buf,
7194                            ptrdiff_t *stop)
7195 {
7196   Lisp_Object val, next;
7197   int id;
7198
7199   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7200   if (! NILP (val) && CHARSETP (val))
7201     id = XINT (CHARSET_SYMBOL_ID (val));
7202   else
7203     id = -1;
7204   ADD_CHARSET_DATA (buf, 0, id);
7205   next = Fnext_single_property_change (make_number (pos), Qcharset,
7206                                        coding->src_object,
7207                                        make_number (limit));
7208   *stop = XINT (next);
7209   return buf;
7210 }
7211
7212
7213 static void
7214 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7215                int max_lookup)
7216 {
7217   int *buf = coding->charbuf;
7218   int *buf_end = coding->charbuf + coding->charbuf_size;
7219   const unsigned char *src = coding->source + coding->consumed;
7220   const unsigned char *src_end = coding->source + coding->src_bytes;
7221   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7222   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7223   int multibytep = coding->src_multibyte;
7224   Lisp_Object eol_type;
7225   int c;
7226   ptrdiff_t stop, stop_composition, stop_charset;
7227   int *lookup_buf = NULL;
7228
7229   if (! NILP (translation_table))
7230     lookup_buf = alloca (sizeof (int) * max_lookup);
7231
7232   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7233   if (VECTORP (eol_type))
7234     eol_type = Qunix;
7235
7236   /* Note: composition handling is not yet implemented.  */
7237   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7238
7239   if (NILP (coding->src_object))
7240     stop = stop_composition = stop_charset = end_pos;
7241   else
7242     {
7243       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7244         stop = stop_composition = pos;
7245       else
7246         stop = stop_composition = end_pos;
7247       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7248         stop = stop_charset = pos;
7249       else
7250         stop_charset = end_pos;
7251     }
7252
7253   /* Compensate for CRLF and conversion.  */
7254   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7255   while (buf < buf_end)
7256     {
7257       Lisp_Object trans;
7258
7259       if (pos == stop)
7260         {
7261           if (pos == end_pos)
7262             break;
7263           if (pos == stop_composition)
7264             buf = handle_composition_annotation (pos, end_pos, coding,
7265                                                  buf, &stop_composition);
7266           if (pos == stop_charset)
7267             buf = handle_charset_annotation (pos, end_pos, coding,
7268                                              buf, &stop_charset);
7269           stop = (stop_composition < stop_charset
7270                   ? stop_composition : stop_charset);
7271         }
7272
7273       if (! multibytep)
7274         {
7275           int bytes;
7276
7277           if (coding->encoder == encode_coding_raw_text
7278               || coding->encoder == encode_coding_ccl)
7279             c = *src++, pos++;
7280           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7281             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7282           else
7283             c = BYTE8_TO_CHAR (*src), src++, pos++;
7284         }
7285       else
7286         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7287       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7288         c = '\n';
7289       if (! EQ (eol_type, Qunix))
7290         {
7291           if (c == '\n')
7292             {
7293               if (EQ (eol_type, Qdos))
7294                 *buf++ = '\r';
7295               else
7296                 c = '\r';
7297             }
7298         }
7299
7300       trans = Qnil;
7301       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7302       if (NILP (trans))
7303         *buf++ = c;
7304       else
7305         {
7306           ptrdiff_t from_nchars = 1, to_nchars = 1;
7307           int *lookup_buf_end;
7308           const unsigned char *p = src;
7309           int i;
7310
7311           lookup_buf[0] = c;
7312           for (i = 1; i < max_lookup && p < src_end; i++)
7313             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7314           lookup_buf_end = lookup_buf + i;
7315           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7316           if (INTEGERP (trans))
7317             c = XINT (trans);
7318           else if (CONSP (trans))
7319             {
7320               from_nchars = ASIZE (XCAR (trans));
7321               trans = XCDR (trans);
7322               if (INTEGERP (trans))
7323                 c = XINT (trans);
7324               else
7325                 {
7326                   to_nchars = ASIZE (trans);
7327                   if (buf_end - buf < to_nchars)
7328                     break;
7329                   c = XINT (AREF (trans, 0));
7330                 }
7331             }
7332           else
7333             break;
7334           *buf++ = c;
7335           for (i = 1; i < to_nchars; i++)
7336             *buf++ = XINT (AREF (trans, i));
7337           for (i = 1; i < from_nchars; i++, pos++)
7338             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7339         }
7340     }
7341
7342   coding->consumed = src - coding->source;
7343   coding->consumed_char = pos - coding->src_pos;
7344   coding->charbuf_used = buf - coding->charbuf;
7345   coding->chars_at_source = 0;
7346 }
7347
7348
7349 /* Encode the text at CODING->src_object into CODING->dst_object.
7350    CODING->src_object is a buffer or a string.
7351    CODING->dst_object is a buffer or nil.
7352
7353    If CODING->src_object is a buffer, it must be the current buffer.
7354    In this case, if CODING->src_pos is positive, it is a position of
7355    the source text in the buffer, otherwise. the source text is in the
7356    gap area of the buffer, and coding->src_pos specifies the offset of
7357    the text from GPT (which must be the same as PT).  If this is the
7358    same buffer as CODING->dst_object, CODING->src_pos must be
7359    negative and CODING should not have `pre-write-conversion'.
7360
7361    If CODING->src_object is a string, CODING should not have
7362    `pre-write-conversion'.
7363
7364    If CODING->dst_object is a buffer, the encoded data is inserted at
7365    the current point of that buffer.
7366
7367    If CODING->dst_object is nil, the encoded data is placed at the
7368    memory area specified by CODING->destination.  */
7369
7370 static int
7371 encode_coding (struct coding_system *coding)
7372 {
7373   Lisp_Object attrs;
7374   Lisp_Object translation_table;
7375   int max_lookup;
7376   struct ccl_spec cclspec;
7377
7378   attrs = CODING_ID_ATTRS (coding->id);
7379   if (coding->encoder == encode_coding_raw_text)
7380     translation_table = Qnil, max_lookup = 0;
7381   else
7382     translation_table = get_translation_table (attrs, 1, &max_lookup);
7383
7384   if (BUFFERP (coding->dst_object))
7385     {
7386       set_buffer_internal (XBUFFER (coding->dst_object));
7387       coding->dst_multibyte
7388         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7389     }
7390
7391   coding->consumed = coding->consumed_char = 0;
7392   coding->produced = coding->produced_char = 0;
7393   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7394   coding->errors = 0;
7395
7396   ALLOC_CONVERSION_WORK_AREA (coding);
7397
7398   if (coding->encoder == encode_coding_ccl)
7399     {
7400       coding->spec.ccl = &cclspec;
7401       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7402     }
7403   do {
7404     coding_set_source (coding);
7405     consume_chars (coding, translation_table, max_lookup);
7406     coding_set_destination (coding);
7407     (*(coding->encoder)) (coding);
7408   } while (coding->consumed_char < coding->src_chars);
7409
7410   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7411     insert_from_gap (coding->produced_char, coding->produced);
7412
7413   return (coding->result);
7414 }
7415
7416
/* Name (or base name) of work buffer for code conversion.  Names of
   additional work buffers are generated from it.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed (it is re-created if found dead).
   It has the name Vcode_conversion_workbuf_name.  The other working
   buffers are destroyed after the use is finished, and their names
   are modified versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   Incremented each time a work buffer is requested, and reset to 0
   when the reused work buffer is released.  */
static int reused_workbuf_in_use;
7429
7430
7431 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7432    multibyteness of returning buffer.  */
7433
7434 static Lisp_Object
7435 make_conversion_work_buffer (int multibyte)
7436 {
7437   Lisp_Object name, workbuf;
7438   struct buffer *current;
7439
7440   if (reused_workbuf_in_use++)
7441     {
7442       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7443       workbuf = Fget_buffer_create (name);
7444     }
7445   else
7446     {
7447       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7448         Vcode_conversion_reused_workbuf
7449           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7450       workbuf = Vcode_conversion_reused_workbuf;
7451     }
7452   current = current_buffer;
7453   set_buffer_internal (XBUFFER (workbuf));
7454   /* We can't allow modification hooks to run in the work buffer.  For
7455      instance, directory_files_internal assumes that file decoding
7456      doesn't compile new regexps.  */
7457   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7458   Ferase_buffer ();
7459   BVAR (current_buffer, undo_list) = Qt;
7460   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7461   set_buffer_internal (current);
7462   return workbuf;
7463 }
7464
7465
7466 static Lisp_Object
7467 code_conversion_restore (Lisp_Object arg)
7468 {
7469   Lisp_Object current, workbuf;
7470   struct gcpro gcpro1;
7471
7472   GCPRO1 (arg);
7473   current = XCAR (arg);
7474   workbuf = XCDR (arg);
7475   if (! NILP (workbuf))
7476     {
7477       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7478         reused_workbuf_in_use = 0;
7479       else if (! NILP (Fbuffer_live_p (workbuf)))
7480         Fkill_buffer (workbuf);
7481     }
7482   set_buffer_internal (XBUFFER (current));
7483   UNGCPRO;
7484   return Qnil;
7485 }
7486
7487 Lisp_Object
7488 code_conversion_save (int with_work_buf, int multibyte)
7489 {
7490   Lisp_Object workbuf = Qnil;
7491
7492   if (with_work_buf)
7493     workbuf = make_conversion_work_buffer (multibyte);
7494   record_unwind_protect (code_conversion_restore,
7495                          Fcons (Fcurrent_buffer (), workbuf));
7496   return workbuf;
7497 }
7498
7499 int
7500 decode_coding_gap (struct coding_system *coding,
7501                    ptrdiff_t chars, ptrdiff_t bytes)
7502 {
7503   ptrdiff_t count = SPECPDL_INDEX ();
7504   Lisp_Object attrs;
7505
7506   code_conversion_save (0, 0);
7507
7508   coding->src_object = Fcurrent_buffer ();
7509   coding->src_chars = chars;
7510   coding->src_bytes = bytes;
7511   coding->src_pos = -chars;
7512   coding->src_pos_byte = -bytes;
7513   coding->src_multibyte = chars < bytes;
7514   coding->dst_object = coding->src_object;
7515   coding->dst_pos = PT;
7516   coding->dst_pos_byte = PT_BYTE;
7517   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7518
7519   if (CODING_REQUIRE_DETECTION (coding))
7520     detect_coding (coding);
7521
7522   coding->mode |= CODING_MODE_LAST_BLOCK;
7523   current_buffer->text->inhibit_shrinking = 1;
7524   decode_coding (coding);
7525   current_buffer->text->inhibit_shrinking = 0;
7526
7527   attrs = CODING_ID_ATTRS (coding->id);
7528   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7529     {
7530       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7531       Lisp_Object val;
7532
7533       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7534       val = call1 (CODING_ATTR_POST_READ (attrs),
7535                    make_number (coding->produced_char));
7536       CHECK_NATNUM (val);
7537       coding->produced_char += Z - prev_Z;
7538       coding->produced += Z_BYTE - prev_Z_BYTE;
7539     }
7540
7541   unbind_to (count, Qnil);
7542   return coding->result;
7543 }
7544
7545
7546 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7547    SRC_OBJECT into DST_OBJECT by coding context CODING.
7548
7549    SRC_OBJECT is a buffer, a string, or Qnil.
7550
7551    If it is a buffer, the text is at point of the buffer.  FROM and TO
7552    are positions in the buffer.
7553
7554    If it is a string, the text is at the beginning of the string.
7555    FROM and TO are indices to the string.
7556
7557    If it is nil, the text is at coding->source.  FROM and TO are
7558    indices to coding->source.
7559
7560    DST_OBJECT is a buffer, Qt, or Qnil.
7561
7562    If it is a buffer, the decoded text is inserted at point of the
7563    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7564    is deleted.
7565
7566    If it is Qt, a string is made from the decoded text, and
7567    set in CODING->dst_object.
7568
7569    If it is Qnil, the decoded text is stored at CODING->destination.
7570    The caller must allocate CODING->dst_bytes bytes at
7571    CODING->destination by xmalloc.  If the decoded text is longer than
7572    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7573  */
7574
7575 void
7576 decode_coding_object (struct coding_system *coding,
7577                       Lisp_Object src_object,
7578                       ptrdiff_t from, ptrdiff_t from_byte,
7579                       ptrdiff_t to, ptrdiff_t to_byte,
7580                       Lisp_Object dst_object)
7581 {
7582   ptrdiff_t count = SPECPDL_INDEX ();
7583   unsigned char *destination IF_LINT (= NULL);
7584   ptrdiff_t dst_bytes IF_LINT (= 0);
7585   ptrdiff_t chars = to - from;
7586   ptrdiff_t bytes = to_byte - from_byte;
7587   Lisp_Object attrs;
7588   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7589   int need_marker_adjustment = 0;
7590   Lisp_Object old_deactivate_mark;
7591
7592   old_deactivate_mark = Vdeactivate_mark;
7593
7594   if (NILP (dst_object))
7595     {
7596       destination = coding->destination;
7597       dst_bytes = coding->dst_bytes;
7598     }
7599
7600   coding->src_object = src_object;
7601   coding->src_chars = chars;
7602   coding->src_bytes = bytes;
7603   coding->src_multibyte = chars < bytes;
7604
7605   if (STRINGP (src_object))
7606     {
7607       coding->src_pos = from;
7608       coding->src_pos_byte = from_byte;
7609     }
7610   else if (BUFFERP (src_object))
7611     {
7612       set_buffer_internal (XBUFFER (src_object));
7613       if (from != GPT)
7614         move_gap_both (from, from_byte);
7615       if (EQ (src_object, dst_object))
7616         {
7617           struct Lisp_Marker *tail;
7618
7619           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7620             {
7621               tail->need_adjustment
7622                 = tail->charpos == (tail->insertion_type ? from : to);
7623               need_marker_adjustment |= tail->need_adjustment;
7624             }
7625           saved_pt = PT, saved_pt_byte = PT_BYTE;
7626           TEMP_SET_PT_BOTH (from, from_byte);
7627           current_buffer->text->inhibit_shrinking = 1;
7628           del_range_both (from, from_byte, to, to_byte, 1);
7629           coding->src_pos = -chars;
7630           coding->src_pos_byte = -bytes;
7631         }
7632       else
7633         {
7634           coding->src_pos = from;
7635           coding->src_pos_byte = from_byte;
7636         }
7637     }
7638
7639   if (CODING_REQUIRE_DETECTION (coding))
7640     detect_coding (coding);
7641   attrs = CODING_ID_ATTRS (coding->id);
7642
7643   if (EQ (dst_object, Qt)
7644       || (! NILP (CODING_ATTR_POST_READ (attrs))
7645           && NILP (dst_object)))
7646     {
7647       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7648       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7649       coding->dst_pos = BEG;
7650       coding->dst_pos_byte = BEG_BYTE;
7651     }
7652   else if (BUFFERP (dst_object))
7653     {
7654       code_conversion_save (0, 0);
7655       coding->dst_object = dst_object;
7656       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7657       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7658       coding->dst_multibyte
7659         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7660     }
7661   else
7662     {
7663       code_conversion_save (0, 0);
7664       coding->dst_object = Qnil;
7665       /* Most callers presume this will return a multibyte result, and they
7666          won't use `binary' or `raw-text' anyway, so let's not worry about
7667          CODING_FOR_UNIBYTE.  */
7668       coding->dst_multibyte = 1;
7669     }
7670
7671   decode_coding (coding);
7672
7673   if (BUFFERP (coding->dst_object))
7674     set_buffer_internal (XBUFFER (coding->dst_object));
7675
7676   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7677     {
7678       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7679       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7680       Lisp_Object val;
7681
7682       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7683       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7684               old_deactivate_mark);
7685       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7686                         make_number (coding->produced_char));
7687       UNGCPRO;
7688       CHECK_NATNUM (val);
7689       coding->produced_char += Z - prev_Z;
7690       coding->produced += Z_BYTE - prev_Z_BYTE;
7691     }
7692
7693   if (EQ (dst_object, Qt))
7694     {
7695       coding->dst_object = Fbuffer_string ();
7696     }
7697   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7698     {
7699       set_buffer_internal (XBUFFER (coding->dst_object));
7700       if (dst_bytes < coding->produced)
7701         {
7702           destination = xrealloc (destination, coding->produced);
7703           if (! destination)
7704             {
7705               record_conversion_result (coding,
7706                                         CODING_RESULT_INSUFFICIENT_MEM);
7707               unbind_to (count, Qnil);
7708               return;
7709             }
7710           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7711             move_gap_both (BEGV, BEGV_BYTE);
7712           memcpy (destination, BEGV_ADDR, coding->produced);
7713           coding->destination = destination;
7714         }
7715     }
7716
7717   if (saved_pt >= 0)
7718     {
7719       /* This is the case of:
7720          (BUFFERP (src_object) && EQ (src_object, dst_object))
7721          As we have moved PT while replacing the original buffer
7722          contents, we must recover it now.  */
7723       set_buffer_internal (XBUFFER (src_object));
7724       current_buffer->text->inhibit_shrinking = 0;
7725       if (saved_pt < from)
7726         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7727       else if (saved_pt < from + chars)
7728         TEMP_SET_PT_BOTH (from, from_byte);
7729       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7730         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7731                           saved_pt_byte + (coding->produced - bytes));
7732       else
7733         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7734                           saved_pt_byte + (coding->produced - bytes));
7735
7736       if (need_marker_adjustment)
7737         {
7738           struct Lisp_Marker *tail;
7739
7740           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7741             if (tail->need_adjustment)
7742               {
7743                 tail->need_adjustment = 0;
7744                 if (tail->insertion_type)
7745                   {
7746                     tail->bytepos = from_byte;
7747                     tail->charpos = from;
7748                   }
7749                 else
7750                   {
7751                     tail->bytepos = from_byte + coding->produced;
7752                     tail->charpos
7753                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7754                          ? tail->bytepos : from + coding->produced_char);
7755                   }
7756               }
7757         }
7758     }
7759
7760   Vdeactivate_mark = old_deactivate_mark;
7761   unbind_to (count, coding->dst_object);
7762 }
7763
7764
7765 void
7766 encode_coding_object (struct coding_system *coding,
7767                       Lisp_Object src_object,
7768                       ptrdiff_t from, ptrdiff_t from_byte,
7769                       ptrdiff_t to, ptrdiff_t to_byte,
7770                       Lisp_Object dst_object)
7771 {
7772   ptrdiff_t count = SPECPDL_INDEX ();
7773   ptrdiff_t chars = to - from;
7774   ptrdiff_t bytes = to_byte - from_byte;
7775   Lisp_Object attrs;
7776   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7777   int need_marker_adjustment = 0;
7778   int kill_src_buffer = 0;
7779   Lisp_Object old_deactivate_mark;
7780
7781   old_deactivate_mark = Vdeactivate_mark;
7782
7783   coding->src_object = src_object;
7784   coding->src_chars = chars;
7785   coding->src_bytes = bytes;
7786   coding->src_multibyte = chars < bytes;
7787
7788   attrs = CODING_ID_ATTRS (coding->id);
7789
7790   if (EQ (src_object, dst_object))
7791     {
7792       struct Lisp_Marker *tail;
7793
7794       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7795         {
7796           tail->need_adjustment
7797             = tail->charpos == (tail->insertion_type ? from : to);
7798           need_marker_adjustment |= tail->need_adjustment;
7799         }
7800     }
7801
7802   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7803     {
7804       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7805       set_buffer_internal (XBUFFER (coding->src_object));
7806       if (STRINGP (src_object))
7807         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7808       else if (BUFFERP (src_object))
7809         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7810       else
7811         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7812
7813       if (EQ (src_object, dst_object))
7814         {
7815           set_buffer_internal (XBUFFER (src_object));
7816           saved_pt = PT, saved_pt_byte = PT_BYTE;
7817           del_range_both (from, from_byte, to, to_byte, 1);
7818           set_buffer_internal (XBUFFER (coding->src_object));
7819         }
7820
7821       {
7822         Lisp_Object args[3];
7823         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7824
7825         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7826                 old_deactivate_mark);
7827         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7828         args[1] = make_number (BEG);
7829         args[2] = make_number (Z);
7830         safe_call (3, args);
7831         UNGCPRO;
7832       }
7833       if (XBUFFER (coding->src_object) != current_buffer)
7834         kill_src_buffer = 1;
7835       coding->src_object = Fcurrent_buffer ();
7836       if (BEG != GPT)
7837         move_gap_both (BEG, BEG_BYTE);
7838       coding->src_chars = Z - BEG;
7839       coding->src_bytes = Z_BYTE - BEG_BYTE;
7840       coding->src_pos = BEG;
7841       coding->src_pos_byte = BEG_BYTE;
7842       coding->src_multibyte = Z < Z_BYTE;
7843     }
7844   else if (STRINGP (src_object))
7845     {
7846       code_conversion_save (0, 0);
7847       coding->src_pos = from;
7848       coding->src_pos_byte = from_byte;
7849     }
7850   else if (BUFFERP (src_object))
7851     {
7852       code_conversion_save (0, 0);
7853       set_buffer_internal (XBUFFER (src_object));
7854       if (EQ (src_object, dst_object))
7855         {
7856           saved_pt = PT, saved_pt_byte = PT_BYTE;
7857           coding->src_object = del_range_1 (from, to, 1, 1);
7858           coding->src_pos = 0;
7859           coding->src_pos_byte = 0;
7860         }
7861       else
7862         {
7863           if (from < GPT && to >= GPT)
7864             move_gap_both (from, from_byte);
7865           coding->src_pos = from;
7866           coding->src_pos_byte = from_byte;
7867         }
7868     }
7869   else
7870     code_conversion_save (0, 0);
7871
7872   if (BUFFERP (dst_object))
7873     {
7874       coding->dst_object = dst_object;
7875       if (EQ (src_object, dst_object))
7876         {
7877           coding->dst_pos = from;
7878           coding->dst_pos_byte = from_byte;
7879         }
7880       else
7881         {
7882           struct buffer *current = current_buffer;
7883
7884           set_buffer_temp (XBUFFER (dst_object));
7885           coding->dst_pos = PT;
7886           coding->dst_pos_byte = PT_BYTE;
7887           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7888           set_buffer_temp (current);
7889         }
7890       coding->dst_multibyte
7891         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7892     }
7893   else if (EQ (dst_object, Qt))
7894     {
7895       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7896       coding->dst_object = Qnil;
7897       coding->destination = (unsigned char *) xmalloc (dst_bytes);
7898       coding->dst_bytes = dst_bytes;
7899       coding->dst_multibyte = 0;
7900     }
7901   else
7902     {
7903       coding->dst_object = Qnil;
7904       coding->dst_multibyte = 0;
7905     }
7906
7907   encode_coding (coding);
7908
7909   if (EQ (dst_object, Qt))
7910     {
7911       if (BUFFERP (coding->dst_object))
7912         coding->dst_object = Fbuffer_string ();
7913       else
7914         {
7915           coding->dst_object
7916             = make_unibyte_string ((char *) coding->destination,
7917                                    coding->produced);
7918           xfree (coding->destination);
7919         }
7920     }
7921
7922   if (saved_pt >= 0)
7923     {
7924       /* This is the case of:
7925          (BUFFERP (src_object) && EQ (src_object, dst_object))
7926          As we have moved PT while replacing the original buffer
7927          contents, we must recover it now.  */
7928       set_buffer_internal (XBUFFER (src_object));
7929       if (saved_pt < from)
7930         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7931       else if (saved_pt < from + chars)
7932         TEMP_SET_PT_BOTH (from, from_byte);
7933       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7934         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7935                           saved_pt_byte + (coding->produced - bytes));
7936       else
7937         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7938                           saved_pt_byte + (coding->produced - bytes));
7939
7940       if (need_marker_adjustment)
7941         {
7942           struct Lisp_Marker *tail;
7943
7944           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7945             if (tail->need_adjustment)
7946               {
7947                 tail->need_adjustment = 0;
7948                 if (tail->insertion_type)
7949                   {
7950                     tail->bytepos = from_byte;
7951                     tail->charpos = from;
7952                   }
7953                 else
7954                   {
7955                     tail->bytepos = from_byte + coding->produced;
7956                     tail->charpos
7957                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7958                          ? tail->bytepos : from + coding->produced_char);
7959                   }
7960               }
7961         }
7962     }
7963
7964   if (kill_src_buffer)
7965     Fkill_buffer (coding->src_object);
7966
7967   Vdeactivate_mark = old_deactivate_mark;
7968   unbind_to (count, Qnil);
7969 }
7970
7971
7972 Lisp_Object
7973 preferred_coding_system (void)
7974 {
7975   int id = coding_categories[coding_priorities[0]].id;
7976
7977   return CODING_ID_NAME (id);
7978 }
7979
7980 \f
7981 #ifdef emacs
7982 /*** 8. Emacs Lisp library functions ***/
7983
7984 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7985        doc: /* Return t if OBJECT is nil or a coding-system.
7986 See the documentation of `define-coding-system' for information
7987 about coding-system objects.  */)
7988   (Lisp_Object object)
7989 {
7990   if (NILP (object)
7991       || CODING_SYSTEM_ID (object) >= 0)
7992     return Qt;
7993   if (! SYMBOLP (object)
7994       || NILP (Fget (object, Qcoding_system_define_form)))
7995     return Qnil;
7996   return Qt;
7997 }
7998
7999 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8000        Sread_non_nil_coding_system, 1, 1, 0,
8001        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8002   (Lisp_Object prompt)
8003 {
8004   Lisp_Object val;
8005   do
8006     {
8007       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8008                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8009     }
8010   while (SCHARS (val) == 0);
8011   return (Fintern (val, Qnil));
8012 }
8013
8014 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8015        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8016 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8017 Ignores case when completing coding systems (all Emacs coding systems
8018 are lower-case).  */)
8019   (Lisp_Object prompt, Lisp_Object default_coding_system)
8020 {
8021   Lisp_Object val;
8022   ptrdiff_t count = SPECPDL_INDEX ();
8023
8024   if (SYMBOLP (default_coding_system))
8025     default_coding_system = SYMBOL_NAME (default_coding_system);
8026   specbind (Qcompletion_ignore_case, Qt);
8027   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8028                           Qt, Qnil, Qcoding_system_history,
8029                           default_coding_system, Qnil);
8030   unbind_to (count, Qnil);
8031   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8032 }
8033
8034 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8035        1, 1, 0,
8036        doc: /* Check validity of CODING-SYSTEM.
8037 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8038 It is valid if it is nil or a symbol defined as a coding system by the
8039 function `define-coding-system'.  */)
8040   (Lisp_Object coding_system)
8041 {
8042   Lisp_Object define_form;
8043
8044   define_form = Fget (coding_system, Qcoding_system_define_form);
8045   if (! NILP (define_form))
8046     {
8047       Fput (coding_system, Qcoding_system_define_form, Qnil);
8048       safe_eval (define_form);
8049     }
8050   if (!NILP (Fcoding_system_p (coding_system)))
8051     return coding_system;
8052   xsignal1 (Qcoding_system_error, coding_system);
8053 }
8054
8055 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is recorded in the working coding structure as the
   number of characters of the source.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   When HIGHEST is nonzero and no candidate was collected, the value
   is nil.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      int highest, int multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* Candidates found so far.  Until the eol-format step below this
     is a list of coding system IDs (as Lisp integers); that step
     replaces each ID by a coding system name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  int null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING-SYSTEM means `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the source to the detector functions.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      /* CATEGORY and THIS keep the values they had when the priority
         loops below stopped; the HIGHEST branch further down reads
         them.  */
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the 7-bit bytes seen before the
         first 8-bit byte.  The scan stops early once both a null
         byte and an 8-bit byte have been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried only while no category
                 has been checked yet.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      /* Reject every category outside the
                         ISO-escape mask.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, mark every category
                     other than the UTF-16 ones as already checked
                     and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Run the detectors in priority order.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector; when
                         only the best match is wanted, stop at the
                         first category known to be found.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the UTF-16 auto category, narrow CATEGORY
                         to the little- or big-endian one.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: the
             only candidate is `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was rejected and nothing was found: the only
             candidate is the coding system of the undecided
             category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* Use the category at which the loop above stopped.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found: take the first category
               in priority order that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops walk the priorities from last to first and
             push onto the front of VAL, so each group ends up in
             priority order.  First collect the categories that were
             neither rejected nor found ...  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* ... then put the categories that were found in front of
             them.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between UTF-8 with and without signature.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified by CODING-SYSTEM.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* Detected EOL_SEEN_* values for non-UTF-16 text and for the two
       UTF-16 byte orders; -1 while not determined.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format of the requested coding system is not
           fixed, so inspect the bytes, once per family of categories
           that was found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is fixed by the requested coding system.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each coding system ID in VAL by a coding system name:
       the subsidiary for the detected eol-format when the candidate
       has a vector of subsidiaries and an eol-format was seen,
       otherwise the name registered for the ID itself.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, return just the first candidate (or nil if there
     is none); otherwise return the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8376
8377
8378 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8379        2, 3, 0,
8380        doc: /* Detect coding system of the text in the region between START and END.
8381 Return a list of possible coding systems ordered by priority.
8382 The coding systems to try and their priorities follows what
8383 the function `coding-system-priority-list' (which see) returns.
8384
8385 If only ASCII characters are found (except for such ISO-2022 control
8386 characters as ESC), it returns a list of single element `undecided'
8387 or its subsidiary coding system according to a detected end-of-line
8388 format.
8389
8390 If optional argument HIGHEST is non-nil, return the coding system of
8391 highest priority.  */)
8392   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8393 {
8394   ptrdiff_t from, to;
8395   ptrdiff_t from_byte, to_byte;
8396
8397   CHECK_NUMBER_COERCE_MARKER (start);
8398   CHECK_NUMBER_COERCE_MARKER (end);
8399
8400   validate_region (&start, &end);
8401   from = XINT (start), to = XINT (end);
8402   from_byte = CHAR_TO_BYTE (from);
8403   to_byte = CHAR_TO_BYTE (to);
8404
8405   if (from < GPT && to >= GPT)
8406     move_gap_both (to, to_byte);
8407
8408   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8409                                to - from, to_byte - from_byte,
8410                                !NILP (highest),
8411                                !NILP (BVAR (current_buffer
8412                                       , enable_multibyte_characters)),
8413                                Qnil);
8414 }
8415
8416 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8417        1, 2, 0,
8418        doc: /* Detect coding system of the text in STRING.
8419 Return a list of possible coding systems ordered by priority.
8420 The coding systems to try and their priorities follows what
8421 the function `coding-system-priority-list' (which see) returns.
8422
8423 If only ASCII characters are found (except for such ISO-2022 control
8424 characters as ESC), it returns a list of single element `undecided'
8425 or its subsidiary coding system according to a detected end-of-line
8426 format.
8427
8428 If optional argument HIGHEST is non-nil, return the coding system of
8429 highest priority.  */)
8430   (Lisp_Object string, Lisp_Object highest)
8431 {
8432   CHECK_STRING (string);
8433
8434   return detect_coding_system (SDATA (string),
8435                                SCHARS (string), SBYTES (string),
8436                                !NILP (highest), STRING_MULTIBYTE (string),
8437                                Qnil);
8438 }
8439
8440
8441 static inline int
8442 char_encodable_p (int c, Lisp_Object attrs)
8443 {
8444   Lisp_Object tail;
8445   struct charset *charset;
8446   Lisp_Object translation_table;
8447
8448   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8449   if (! NILP (translation_table))
8450     c = translate_char (translation_table, c);
8451   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8452        CONSP (tail); tail = XCDR (tail))
8453     {
8454       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8455       if (CHAR_CHARSET_P (c, charset))
8456         break;
8457     }
8458   return (! NILP (tail));
8459 }
8460
8461
/* Return a list of coding systems that safely encode the text between
   START and END.  If EXCLUDE is non-nil, it is a list of coding
   systems not to check.  The returned list doesn't contain any such
   coding systems.  In any case, if the text contains only ASCII or is
   unibyte, return t.

   START may also be a string; in that case the whole string is
   examined and END is not used.  The returned list always ends with
   `raw-text' and `no-conversion'.  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
{
  Lisp_Object coding_attrs_list, safe_codings;
  ptrdiff_t start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte string whose character and
         byte counts agree, needs no check.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* Same number of characters and bytes: nothing to check.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* If the gap lies strictly inside the region, move it to
         whichever end of the region is nearer, since the scan below
         walks the region with a plain pointer.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of the candidate coding systems:
     those in `coding-system-list' that are not in EXCLUDE, whose name
     equals their own base name, and whose type is not `undecided'.
     The translation table obtained from get_translation_table is
     stored into each attribute vector for use by char_encodable_p.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
            && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII bytes at both ends of the text.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  /* WORK_TABLE records the characters that have been checked, so
     that each distinct character is tested only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop from CODING_ATTRS_LIST every candidate that cannot
             encode C.  The list is edited in place: a cell that has a
             successor takes over the successor's car and cdr (and is
             then examined again, as TAIL is not advanced); the last
             cell just gets its car set to nil.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          /* If charset_map_loaded was set during the checks above,
             recompute the text pointers from their offsets.
             NOTE(review): presumably because loading a charset map
             can relocate the text -- confirm.  */
          if (charset_map_loaded)
            {
              ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* The result is the base names of the surviving candidates,
     followed by `raw-text' and `no-conversion'.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
8589
8590
8591 DEFUN ("unencodable-char-position", Funencodable_char_position,
8592        Sunencodable_char_position, 3, 5, 0,
8593        doc: /*
8594 Return position of first un-encodable character in a region.
8595 START and END specify the region and CODING-SYSTEM specifies the
8596 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8597
8598 If optional 4th argument COUNT is non-nil, it specifies at most how
8599 many un-encodable characters to search.  In this case, the value is a
8600 list of positions.
8601
8602 If optional 5th argument STRING is non-nil, it is a string to search
8603 for un-encodable characters.  In that case, START and END are indexes
8604 to the string.  */)
8605   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8606 {
8607   EMACS_INT n;
8608   struct coding_system coding;
8609   Lisp_Object attrs, charset_list, translation_table;
8610   Lisp_Object positions;
8611   ptrdiff_t from, to;
8612   const unsigned char *p, *stop, *pend;
8613   int ascii_compatible;
8614
8615   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8616   attrs = CODING_ID_ATTRS (coding.id);
8617   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8618     return Qnil;
8619   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8620   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8621   translation_table = get_translation_table (attrs, 1, NULL);
8622
8623   if (NILP (string))
8624     {
8625       validate_region (&start, &end);
8626       from = XINT (start);
8627       to = XINT (end);
8628       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8629           || (ascii_compatible
8630               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8631         return Qnil;
8632       p = CHAR_POS_ADDR (from);
8633       pend = CHAR_POS_ADDR (to);
8634       if (from < GPT && to >= GPT)
8635         stop = GPT_ADDR;
8636       else
8637         stop = pend;
8638     }
8639   else
8640     {
8641       CHECK_STRING (string);
8642       CHECK_NATNUM (start);
8643       CHECK_NATNUM (end);
8644       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8645         args_out_of_range_3 (string, start, end);
8646       from = XINT (start);
8647       to = XINT (end);
8648       if (! STRING_MULTIBYTE (string))
8649         return Qnil;
8650       p = SDATA (string) + string_char_to_byte (string, from);
8651       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8652       if (ascii_compatible && (to - from) == (pend - p))
8653         return Qnil;
8654     }
8655
8656   if (NILP (count))
8657     n = 1;
8658   else
8659     {
8660       CHECK_NATNUM (count);
8661       n = XINT (count);
8662     }
8663
8664   positions = Qnil;
8665   while (1)
8666     {
8667       int c;
8668
8669       if (ascii_compatible)
8670         while (p < stop && ASCII_BYTE_P (*p))
8671           p++, from++;
8672       if (p >= stop)
8673         {
8674           if (p >= pend)
8675             break;
8676           stop = pend;
8677           p = GAP_END_ADDR;
8678         }
8679
8680       c = STRING_CHAR_ADVANCE (p);
8681       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8682           && ! char_charset (translate_char (translation_table, c),
8683                              charset_list, NULL))
8684         {
8685           positions = Fcons (make_number (from), positions);
8686           n--;
8687           if (n == 0)
8688             break;
8689         }
8690
8691       from++;
8692     }
8693
8694   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8695 }
8696
8697
8698 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8699        Scheck_coding_systems_region, 3, 3, 0,
8700        doc: /* Check if the region is encodable by coding systems.
8701
8702 START and END are buffer positions specifying the region.
8703 CODING-SYSTEM-LIST is a list of coding systems to check.
8704
8705 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8706 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8707 whole region, POS0, POS1, ... are buffer positions where non-encodable
8708 characters are found.
8709
8710 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8711 value is nil.
8712
8713 START may be a string.  In that case, check if the string is
8714 encodable, and the value contains indices to the string instead of
8715 buffer positions.  END is ignored.
8716
8717 If the current buffer (or START if it is a string) is unibyte, the value
8718 is nil.  */)
8719   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8720 {
8721   Lisp_Object list;
8722   ptrdiff_t start_byte, end_byte;
8723   ptrdiff_t pos;
8724   const unsigned char *p, *pbeg, *pend;
8725   int c;
8726   Lisp_Object tail, elt, attrs;
8727
8728   if (STRINGP (start))
8729     {
8730       if (!STRING_MULTIBYTE (start)
8731           || SCHARS (start) == SBYTES (start))
8732         return Qnil;
8733       start_byte = 0;
8734       end_byte = SBYTES (start);
8735       pos = 0;
8736     }
8737   else
8738     {
8739       CHECK_NUMBER_COERCE_MARKER (start);
8740       CHECK_NUMBER_COERCE_MARKER (end);
8741       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8742         args_out_of_range (start, end);
8743       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8744         return Qnil;
8745       start_byte = CHAR_TO_BYTE (XINT (start));
8746       end_byte = CHAR_TO_BYTE (XINT (end));
8747       if (XINT (end) - XINT (start) == end_byte - start_byte)
8748         return Qnil;
8749
8750       if (XINT (start) < GPT && XINT (end) > GPT)
8751         {
8752           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8753             move_gap_both (XINT (start), start_byte);
8754           else
8755             move_gap_both (XINT (end), end_byte);
8756         }
8757       pos = XINT (start);
8758     }
8759
8760   list = Qnil;
8761   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8762     {
8763       elt = XCAR (tail);
8764       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8765       ASET (attrs, coding_attr_trans_tbl,
8766             get_translation_table (attrs, 1, NULL));
8767       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8768     }
8769
8770   if (STRINGP (start))
8771     p = pbeg = SDATA (start);
8772   else
8773     p = pbeg = BYTE_POS_ADDR (start_byte);
8774   pend = p + (end_byte - start_byte);
8775
8776   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8777   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8778
8779   while (p < pend)
8780     {
8781       if (ASCII_BYTE_P (*p))
8782         p++;
8783       else
8784         {
8785           c = STRING_CHAR_ADVANCE (p);
8786
8787           charset_map_loaded = 0;
8788           for (tail = list; CONSP (tail); tail = XCDR (tail))
8789             {
8790               elt = XCDR (XCAR (tail));
8791               if (! char_encodable_p (c, XCAR (elt)))
8792                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8793             }
8794           if (charset_map_loaded)
8795             {
8796               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8797
8798               if (STRINGP (start))
8799                 pbeg = SDATA (start);
8800               else
8801                 pbeg = BYTE_POS_ADDR (start_byte);
8802               p = pbeg + p_offset;
8803               pend = pbeg + pend_offset;
8804             }
8805         }
8806       pos++;
8807     }
8808
8809   tail = list;
8810   list = Qnil;
8811   for (; CONSP (tail); tail = XCDR (tail))
8812     {
8813       elt = XCAR (tail);
8814       if (CONSP (XCDR (XCDR (elt))))
8815         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8816                       list);
8817     }
8818
8819   return list;
8820 }
8821
8822
8823 static Lisp_Object
8824 code_convert_region (Lisp_Object start, Lisp_Object end,
8825                      Lisp_Object coding_system, Lisp_Object dst_object,
8826                      int encodep, int norecord)
8827 {
8828   struct coding_system coding;
8829   ptrdiff_t from, from_byte, to, to_byte;
8830   Lisp_Object src_object;
8831
8832   CHECK_NUMBER_COERCE_MARKER (start);
8833   CHECK_NUMBER_COERCE_MARKER (end);
8834   if (NILP (coding_system))
8835     coding_system = Qno_conversion;
8836   else
8837     CHECK_CODING_SYSTEM (coding_system);
8838   src_object = Fcurrent_buffer ();
8839   if (NILP (dst_object))
8840     dst_object = src_object;
8841   else if (! EQ (dst_object, Qt))
8842     CHECK_BUFFER (dst_object);
8843
8844   validate_region (&start, &end);
8845   from = XFASTINT (start);
8846   from_byte = CHAR_TO_BYTE (from);
8847   to = XFASTINT (end);
8848   to_byte = CHAR_TO_BYTE (to);
8849
8850   setup_coding_system (coding_system, &coding);
8851   coding.mode |= CODING_MODE_LAST_BLOCK;
8852
8853   if (encodep)
8854     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8855                           dst_object);
8856   else
8857     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8858                           dst_object);
8859   if (! norecord)
8860     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8861
8862   return (BUFFERP (dst_object)
8863           ? make_number (coding.produced_char)
8864           : coding.dst_object);
8865 }
8866
8867
8868 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8869        3, 4, "r\nzCoding system: ",
8870        doc: /* Decode the current region from the specified coding system.
8871 When called from a program, takes four arguments:
8872         START, END, CODING-SYSTEM, and DESTINATION.
8873 START and END are buffer positions.
8874
8875 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8876 If nil, the region between START and END is replaced by the decoded text.
8877 If buffer, the decoded text is inserted in that buffer after point (point
8878 does not move).
8879 In those cases, the length of the decoded text is returned.
8880 If DESTINATION is t, the decoded text is returned.
8881
8882 This function sets `last-coding-system-used' to the precise coding system
8883 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8884 not fully specified.)  */)
8885   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8886 {
8887   return code_convert_region (start, end, coding_system, destination, 0, 0);
8888 }
8889
8890 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8891        3, 4, "r\nzCoding system: ",
8892        doc: /* Encode the current region by specified coding system.
8893 When called from a program, takes four arguments:
8894         START, END, CODING-SYSTEM and DESTINATION.
8895 START and END are buffer positions.
8896
8897 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8898 If nil, the region between START and END is replace by the encoded text.
8899 If buffer, the encoded text is inserted in that buffer after point (point
8900 does not move).
8901 In those cases, the length of the encoded text is returned.
8902 If DESTINATION is t, the encoded text is returned.
8903
8904 This function sets `last-coding-system-used' to the precise coding system
8905 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8906 not fully specified.)  */)
8907   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8908 {
8909   return code_convert_region (start, end, coding_system, destination, 1, 0);
8910 }
8911
8912 Lisp_Object
8913 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8914                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8915 {
8916   struct coding_system coding;
8917   ptrdiff_t chars, bytes;
8918
8919   CHECK_STRING (string);
8920   if (NILP (coding_system))
8921     {
8922       if (! norecord)
8923         Vlast_coding_system_used = Qno_conversion;
8924       if (NILP (dst_object))
8925         return (nocopy ? Fcopy_sequence (string) : string);
8926     }
8927
8928   if (NILP (coding_system))
8929     coding_system = Qno_conversion;
8930   else
8931     CHECK_CODING_SYSTEM (coding_system);
8932   if (NILP (dst_object))
8933     dst_object = Qt;
8934   else if (! EQ (dst_object, Qt))
8935     CHECK_BUFFER (dst_object);
8936
8937   setup_coding_system (coding_system, &coding);
8938   coding.mode |= CODING_MODE_LAST_BLOCK;
8939   chars = SCHARS (string);
8940   bytes = SBYTES (string);
8941   if (encodep)
8942     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8943   else
8944     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8945   if (! norecord)
8946     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8947
8948   return (BUFFERP (dst_object)
8949           ? make_number (coding.produced_char)
8950           : coding.dst_object);
8951 }
8952
8953
8954 /* Encode or decode STRING according to CODING_SYSTEM.
8955    Do not set Vlast_coding_system_used.
8956
8957    This function is called only from macros DECODE_FILE and
8958    ENCODE_FILE, thus we ignore character composition.  */
8959
8960 Lisp_Object
8961 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8962                               int encodep)
8963 {
8964   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8965 }
8966
8967
8968 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8969        2, 4, 0,
8970        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8971
8972 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8973 if the decoding operation is trivial.
8974
8975 Optional fourth arg BUFFER non-nil means that the decoded text is
8976 inserted in that buffer after point (point does not move).  In this
8977 case, the return value is the length of the decoded text.
8978
8979 This function sets `last-coding-system-used' to the precise coding system
8980 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8981 not fully specified.)  */)
8982   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8983 {
8984   return code_convert_string (string, coding_system, buffer,
8985                               0, ! NILP (nocopy), 0);
8986 }
8987
8988 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8989        2, 4, 0,
8990        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8991
8992 Optional third arg NOCOPY non-nil means it is OK to return STRING
8993 itself if the encoding operation is trivial.
8994
8995 Optional fourth arg BUFFER non-nil means that the encoded text is
8996 inserted in that buffer after point (point does not move).  In this
8997 case, the return value is the length of the encoded text.
8998
8999 This function sets `last-coding-system-used' to the precise coding system
9000 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9001 not fully specified.)  */)
9002   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9003 {
9004   return code_convert_string (string, coding_system, buffer,
9005                               1, ! NILP (nocopy), 0);
9006 }
9007
9008 \f
9009 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9010        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9011 Return the corresponding character.  */)
9012   (Lisp_Object code)
9013 {
9014   Lisp_Object spec, attrs, val;
9015   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9016   EMACS_INT ch;
9017   int c;
9018
9019   CHECK_NATNUM (code);
9020   ch = XFASTINT (code);
9021   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9022   attrs = AREF (spec, 0);
9023
9024   if (ASCII_BYTE_P (ch)
9025       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9026     return code;
9027
9028   val = CODING_ATTR_CHARSET_LIST (attrs);
9029   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9030   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9031   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9032
9033   if (ch <= 0x7F)
9034     {
9035       c = ch;
9036       charset = charset_roman;
9037     }
9038   else if (ch >= 0xA0 && ch < 0xDF)
9039     {
9040       c = ch - 0x80;
9041       charset = charset_kana;
9042     }
9043   else
9044     {
9045       EMACS_INT c1 = ch >> 8;
9046       int c2 = ch & 0xFF;
9047
9048       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9049           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9050         error ("Invalid code: %"pI"d", ch);
9051       c = ch;
9052       SJIS_TO_JIS (c);
9053       charset = charset_kanji;
9054     }
9055   c = DECODE_CHAR (charset, c);
9056   if (c < 0)
9057     error ("Invalid code: %"pI"d", ch);
9058   return make_number (c);
9059 }
9060
9061
9062 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9063        doc: /* Encode a Japanese character CH to shift_jis encoding.
9064 Return the corresponding code in SJIS.  */)
9065   (Lisp_Object ch)
9066 {
9067   Lisp_Object spec, attrs, charset_list;
9068   int c;
9069   struct charset *charset;
9070   unsigned code;
9071
9072   CHECK_CHARACTER (ch);
9073   c = XFASTINT (ch);
9074   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9075   attrs = AREF (spec, 0);
9076
9077   if (ASCII_CHAR_P (c)
9078       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9079     return ch;
9080
9081   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9082   charset = char_charset (c, charset_list, &code);
9083   if (code == CHARSET_INVALID_CODE (charset))
9084     error ("Can't encode by shift_jis encoding: %c", c);
9085   JIS_TO_SJIS (code);
9086
9087   return make_number (code);
9088 }
9089
9090 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9091        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9092 Return the corresponding character.  */)
9093   (Lisp_Object code)
9094 {
9095   Lisp_Object spec, attrs, val;
9096   struct charset *charset_roman, *charset_big5, *charset;
9097   EMACS_INT ch;
9098   int c;
9099
9100   CHECK_NATNUM (code);
9101   ch = XFASTINT (code);
9102   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9103   attrs = AREF (spec, 0);
9104
9105   if (ASCII_BYTE_P (ch)
9106       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9107     return code;
9108
9109   val = CODING_ATTR_CHARSET_LIST (attrs);
9110   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9111   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9112
9113   if (ch <= 0x7F)
9114     {
9115       c = ch;
9116       charset = charset_roman;
9117     }
9118   else
9119     {
9120       EMACS_INT b1 = ch >> 8;
9121       int b2 = ch & 0x7F;
9122       if (b1 < 0xA1 || b1 > 0xFE
9123           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9124         error ("Invalid code: %"pI"d", ch);
9125       c = ch;
9126       charset = charset_big5;
9127     }
9128   c = DECODE_CHAR (charset, c);
9129   if (c < 0)
9130     error ("Invalid code: %"pI"d", ch);
9131   return make_number (c);
9132 }
9133
9134 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9135        doc: /* Encode the Big5 character CH to BIG5 coding system.
9136 Return the corresponding character code in Big5.  */)
9137   (Lisp_Object ch)
9138 {
9139   Lisp_Object spec, attrs, charset_list;
9140   struct charset *charset;
9141   int c;
9142   unsigned code;
9143
9144   CHECK_CHARACTER (ch);
9145   c = XFASTINT (ch);
9146   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9147   attrs = AREF (spec, 0);
9148   if (ASCII_CHAR_P (c)
9149       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9150     return ch;
9151
9152   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9153   charset = char_charset (c, charset_list, &code);
9154   if (code == CHARSET_INVALID_CODE (charset))
9155     error ("Can't encode by Big5 encoding: %c", c);
9156
9157   return make_number (code);
9158 }
9159
9160 \f
9161 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9162        Sset_terminal_coding_system_internal, 1, 2, 0,
9163        doc: /* Internal use only.  */)
9164   (Lisp_Object coding_system, Lisp_Object terminal)
9165 {
9166   struct terminal *term = get_terminal (terminal, 1);
9167   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9168   CHECK_SYMBOL (coding_system);
9169   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9170   /* We had better not send unsafe characters to terminal.  */
9171   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9172   /* Character composition should be disabled.  */
9173   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9174   terminal_coding->src_multibyte = 1;
9175   terminal_coding->dst_multibyte = 0;
9176   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9177     term->charset_list = coding_charset_list (terminal_coding);
9178   else
9179     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9180   return Qnil;
9181 }
9182
9183 DEFUN ("set-safe-terminal-coding-system-internal",
9184        Fset_safe_terminal_coding_system_internal,
9185        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9186        doc: /* Internal use only.  */)
9187   (Lisp_Object coding_system)
9188 {
9189   CHECK_SYMBOL (coding_system);
9190   setup_coding_system (Fcheck_coding_system (coding_system),
9191                        &safe_terminal_coding);
9192   /* Character composition should be disabled.  */
9193   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9194   safe_terminal_coding.src_multibyte = 1;
9195   safe_terminal_coding.dst_multibyte = 0;
9196   return Qnil;
9197 }
9198
9199 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9200        Sterminal_coding_system, 0, 1, 0,
9201        doc: /* Return coding system specified for terminal output on the given terminal.
9202 TERMINAL may be a terminal object, a frame, or nil for the selected
9203 frame's terminal device.  */)
9204   (Lisp_Object terminal)
9205 {
9206   struct coding_system *terminal_coding
9207     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9208   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9209
9210   /* For backward compatibility, return nil if it is `undecided'. */
9211   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9212 }
9213
9214 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9215        Sset_keyboard_coding_system_internal, 1, 2, 0,
9216        doc: /* Internal use only.  */)
9217   (Lisp_Object coding_system, Lisp_Object terminal)
9218 {
9219   struct terminal *t = get_terminal (terminal, 1);
9220   CHECK_SYMBOL (coding_system);
9221   if (NILP (coding_system))
9222     coding_system = Qno_conversion;
9223   else
9224     Fcheck_coding_system (coding_system);
9225   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9226   /* Character composition should be disabled.  */
9227   TERMINAL_KEYBOARD_CODING (t)->common_flags
9228     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9229   return Qnil;
9230 }
9231
9232 DEFUN ("keyboard-coding-system",
9233        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9234        doc: /* Return coding system specified for decoding keyboard input.  */)
9235   (Lisp_Object terminal)
9236 {
9237   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9238                          (get_terminal (terminal, 1))->id);
9239 }
9240
9241 \f
9242 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9243        Sfind_operation_coding_system,  1, MANY, 0,
9244        doc: /* Choose a coding system for an operation based on the target name.
9245 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9246 DECODING-SYSTEM is the coding system to use for decoding
9247 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9248 for encoding (in case OPERATION does encoding).
9249
9250 The first argument OPERATION specifies an I/O primitive:
9251   For file I/O, `insert-file-contents' or `write-region'.
9252   For process I/O, `call-process', `call-process-region', or `start-process'.
9253   For network I/O, `open-network-stream'.
9254
9255 The remaining arguments should be the same arguments that were passed
9256 to the primitive.  Depending on which primitive, one of those arguments
9257 is selected as the TARGET.  For example, if OPERATION does file I/O,
9258 whichever argument specifies the file name is TARGET.
9259
9260 TARGET has a meaning which depends on OPERATION:
9261   For file I/O, TARGET is a file name (except for the special case below).
9262   For process I/O, TARGET is a process name.
9263   For network I/O, TARGET is a service name or a port number.
9264
9265 This function looks up what is specified for TARGET in
9266 `file-coding-system-alist', `process-coding-system-alist',
9267 or `network-coding-system-alist' depending on OPERATION.
9268 They may specify a coding system, a cons of coding systems,
9269 or a function symbol to call.
9270 In the last case, we call the function with one argument,
9271 which is a list of all the arguments given to this function.
9272 If the function can't decide a coding system, it can return
9273 `undecided' so that the normal code-detection is performed.
9274
9275 If OPERATION is `insert-file-contents', the argument corresponding to
9276 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9277 file name to look up, and BUFFER is a buffer that contains the file's
9278 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9279 function to call for FILENAME, that function should examine the
9280 contents of BUFFER instead of reading the file.
9281
9282 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9283   (ptrdiff_t nargs, Lisp_Object *args)
9284 {
9285   Lisp_Object operation, target_idx, target, val;
9286   register Lisp_Object chain;
9287
9288   if (nargs < 2)
9289     error ("Too few arguments");
9290   operation = args[0];
9291   if (!SYMBOLP (operation)
9292       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9293     error ("Invalid first argument");
9294   if (nargs < 1 + XFASTINT (target_idx))
9295     error ("Too few arguments for operation `%s'",
9296            SDATA (SYMBOL_NAME (operation)));
9297   target = args[XFASTINT (target_idx) + 1];
9298   if (!(STRINGP (target)
9299         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9300             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9301         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9302     error ("Invalid argument %"pI"d of operation `%s'",
9303            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9304   if (CONSP (target))
9305     target = XCAR (target);
9306
9307   chain = ((EQ (operation, Qinsert_file_contents)
9308             || EQ (operation, Qwrite_region))
9309            ? Vfile_coding_system_alist
9310            : (EQ (operation, Qopen_network_stream)
9311               ? Vnetwork_coding_system_alist
9312               : Vprocess_coding_system_alist));
9313   if (NILP (chain))
9314     return Qnil;
9315
9316   for (; CONSP (chain); chain = XCDR (chain))
9317     {
9318       Lisp_Object elt;
9319
9320       elt = XCAR (chain);
9321       if (CONSP (elt)
9322           && ((STRINGP (target)
9323                && STRINGP (XCAR (elt))
9324                && fast_string_match (XCAR (elt), target) >= 0)
9325               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9326         {
9327           val = XCDR (elt);
9328           /* Here, if VAL is both a valid coding system and a valid
9329              function symbol, we return VAL as a coding system.  */
9330           if (CONSP (val))
9331             return val;
9332           if (! SYMBOLP (val))
9333             return Qnil;
9334           if (! NILP (Fcoding_system_p (val)))
9335             return Fcons (val, val);
9336           if (! NILP (Ffboundp (val)))
9337             {
9338               /* We use call1 rather than safe_call1
9339                  so as to get bug reports about functions called here
9340                  which don't handle the current interface.  */
9341               val = call1 (val, Flist (nargs, args));
9342               if (CONSP (val))
9343                 return val;
9344               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9345                 return Fcons (val, val);
9346             }
9347           return Qnil;
9348         }
9349     }
9350   return Qnil;
9351 }
9352
9353 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9354        Sset_coding_system_priority, 0, MANY, 0,
9355        doc: /* Assign higher priority to the coding systems given as arguments.
9356 If multiple coding systems belong to the same category,
9357 all but the first one are ignored.
9358
9359 usage: (set-coding-system-priority &rest coding-systems)  */)
9360   (ptrdiff_t nargs, Lisp_Object *args)
9361 {
9362   ptrdiff_t i, j;
9363   int changed[coding_category_max];
9364   enum coding_category priorities[coding_category_max];
9365
9366   memset (changed, 0, sizeof changed);
9367
9368   for (i = j = 0; i < nargs; i++)
9369     {
9370       enum coding_category category;
9371       Lisp_Object spec, attrs;
9372
9373       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9374       attrs = AREF (spec, 0);
9375       category = XINT (CODING_ATTR_CATEGORY (attrs));
9376       if (changed[category])
9377         /* Ignore this coding system because a coding system of the
9378            same category already had a higher priority.  */
9379         continue;
9380       changed[category] = 1;
9381       priorities[j++] = category;
9382       if (coding_categories[category].id >= 0
9383           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9384         setup_coding_system (args[i], &coding_categories[category]);
9385       Fset (AREF (Vcoding_category_table, category), args[i]);
9386     }
9387
9388   /* Now we have decided top J priorities.  Reflect the order of the
9389      original priorities to the remaining priorities.  */
9390
9391   for (i = j, j = 0; i < coding_category_max; i++, j++)
9392     {
9393       while (j < coding_category_max
9394              && changed[coding_priorities[j]])
9395         j++;
9396       if (j == coding_category_max)
9397         abort ();
9398       priorities[i] = coding_priorities[j];
9399     }
9400
9401   memcpy (coding_priorities, priorities, sizeof priorities);
9402
9403   /* Update `coding-category-list'.  */
9404   Vcoding_category_list = Qnil;
9405   for (i = coding_category_max; i-- > 0; )
9406     Vcoding_category_list
9407       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9408                Vcoding_category_list);
9409
9410   return Qnil;
9411 }
9412
9413 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9414        Scoding_system_priority_list, 0, 1, 0,
9415        doc: /* Return a list of coding systems ordered by their priorities.
9416 The list contains a subset of coding systems; i.e. coding systems
9417 assigned to each coding category (see `coding-category-list').
9418
9419 HIGHESTP non-nil means just return the highest priority one.  */)
9420   (Lisp_Object highestp)
9421 {
9422   int i;
9423   Lisp_Object val;
9424
9425   for (i = 0, val = Qnil; i < coding_category_max; i++)
9426     {
9427       enum coding_category category = coding_priorities[i];
9428       int id = coding_categories[category].id;
9429       Lisp_Object attrs;
9430
9431       if (id < 0)
9432         continue;
9433       attrs = CODING_ID_ATTRS (id);
9434       if (! NILP (highestp))
9435         return CODING_ATTR_BASE_NAME (attrs);
9436       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9437     }
9438   return Fnreverse (val);
9439 }
9440
9441 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9442
9443 static Lisp_Object
9444 make_subsidiaries (Lisp_Object base)
9445 {
9446   Lisp_Object subsidiaries;
9447   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9448   char *buf = (char *) alloca (base_name_len + 6);
9449   int i;
9450
9451   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9452   subsidiaries = Fmake_vector (make_number (3), Qnil);
9453   for (i = 0; i < 3; i++)
9454     {
9455       strcpy (buf + base_name_len, suffixes[i]);
9456       ASET (subsidiaries, i, intern (buf));
9457     }
9458   return subsidiaries;
9459 }
9460
9461
9462 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9463        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9464        doc: /* For internal use only.
9465 usage: (define-coding-system-internal ...)  */)
9466   (ptrdiff_t nargs, Lisp_Object *args)
9467 {
9468   Lisp_Object name;
9469   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9470   Lisp_Object attrs;            /* Vector of attributes.  */
9471   Lisp_Object eol_type;
9472   Lisp_Object aliases;
9473   Lisp_Object coding_type, charset_list, safe_charsets;
9474   enum coding_category category;
9475   Lisp_Object tail, val;
9476   int max_charset_id = 0;
9477   int i;
9478
9479   if (nargs < coding_arg_max)
9480     goto short_args;
9481
9482   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9483
9484   name = args[coding_arg_name];
9485   CHECK_SYMBOL (name);
9486   CODING_ATTR_BASE_NAME (attrs) = name;
9487
9488   val = args[coding_arg_mnemonic];
9489   if (! STRINGP (val))
9490     CHECK_CHARACTER (val);
9491   CODING_ATTR_MNEMONIC (attrs) = val;
9492
9493   coding_type = args[coding_arg_coding_type];
9494   CHECK_SYMBOL (coding_type);
9495   CODING_ATTR_TYPE (attrs) = coding_type;
9496
9497   charset_list = args[coding_arg_charset_list];
9498   if (SYMBOLP (charset_list))
9499     {
9500       if (EQ (charset_list, Qiso_2022))
9501         {
9502           if (! EQ (coding_type, Qiso_2022))
9503             error ("Invalid charset-list");
9504           charset_list = Viso_2022_charset_list;
9505         }
9506       else if (EQ (charset_list, Qemacs_mule))
9507         {
9508           if (! EQ (coding_type, Qemacs_mule))
9509             error ("Invalid charset-list");
9510           charset_list = Vemacs_mule_charset_list;
9511         }
9512       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9513         {
9514           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9515             error ("Invalid charset-list");
9516           if (max_charset_id < XFASTINT (XCAR (tail)))
9517             max_charset_id = XFASTINT (XCAR (tail));
9518         }
9519     }
9520   else
9521     {
9522       charset_list = Fcopy_sequence (charset_list);
9523       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9524         {
9525           struct charset *charset;
9526
9527           val = XCAR (tail);
9528           CHECK_CHARSET_GET_CHARSET (val, charset);
9529           if (EQ (coding_type, Qiso_2022)
9530               ? CHARSET_ISO_FINAL (charset) < 0
9531               : EQ (coding_type, Qemacs_mule)
9532               ? CHARSET_EMACS_MULE_ID (charset) < 0
9533               : 0)
9534             error ("Can't handle charset `%s'",
9535                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9536
9537           XSETCAR (tail, make_number (charset->id));
9538           if (max_charset_id < charset->id)
9539             max_charset_id = charset->id;
9540         }
9541     }
9542   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9543
9544   safe_charsets = make_uninit_string (max_charset_id + 1);
9545   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9546   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9547     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9548   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9549
9550   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9551
9552   val = args[coding_arg_decode_translation_table];
9553   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9554     CHECK_SYMBOL (val);
9555   CODING_ATTR_DECODE_TBL (attrs) = val;
9556
9557   val = args[coding_arg_encode_translation_table];
9558   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9559     CHECK_SYMBOL (val);
9560   CODING_ATTR_ENCODE_TBL (attrs) = val;
9561
9562   val = args[coding_arg_post_read_conversion];
9563   CHECK_SYMBOL (val);
9564   CODING_ATTR_POST_READ (attrs) = val;
9565
9566   val = args[coding_arg_pre_write_conversion];
9567   CHECK_SYMBOL (val);
9568   CODING_ATTR_PRE_WRITE (attrs) = val;
9569
9570   val = args[coding_arg_default_char];
9571   if (NILP (val))
9572     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9573   else
9574     {
9575       CHECK_CHARACTER (val);
9576       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9577     }
9578
9579   val = args[coding_arg_for_unibyte];
9580   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9581
9582   val = args[coding_arg_plist];
9583   CHECK_LIST (val);
9584   CODING_ATTR_PLIST (attrs) = val;
9585
9586   if (EQ (coding_type, Qcharset))
9587     {
9588       /* Generate a lisp vector of 256 elements.  Each element is nil,
9589          integer, or a list of charset IDs.
9590
9591          If Nth element is nil, the byte code N is invalid in this
9592          coding system.
9593
9594          If Nth element is a number NUM, N is the first byte of a
9595          charset whose ID is NUM.
9596
9597          If Nth element is a list of charset IDs, N is the first byte
9598          of one of them.  The list is sorted by dimensions of the
9599          charsets.  A charset of smaller dimension comes first. */
9600       val = Fmake_vector (make_number (256), Qnil);
9601
9602       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9603         {
9604           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9605           int dim = CHARSET_DIMENSION (charset);
9606           int idx = (dim - 1) * 4;
9607
9608           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9609             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9610
9611           for (i = charset->code_space[idx];
9612                i <= charset->code_space[idx + 1]; i++)
9613             {
9614               Lisp_Object tmp, tmp2;
9615               int dim2;
9616
9617               tmp = AREF (val, i);
9618               if (NILP (tmp))
9619                 tmp = XCAR (tail);
9620               else if (NUMBERP (tmp))
9621                 {
9622                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9623                   if (dim < dim2)
9624                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9625                   else
9626                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9627                 }
9628               else
9629                 {
9630                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9631                     {
9632                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9633                       if (dim < dim2)
9634                         break;
9635                     }
9636                   if (NILP (tmp2))
9637                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9638                   else
9639                     {
9640                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9641                       XSETCAR (tmp2, XCAR (tail));
9642                     }
9643                 }
9644               ASET (val, i, tmp);
9645             }
9646         }
9647       ASET (attrs, coding_attr_charset_valids, val);
9648       category = coding_category_charset;
9649     }
9650   else if (EQ (coding_type, Qccl))
9651     {
9652       Lisp_Object valids;
9653
9654       if (nargs < coding_arg_ccl_max)
9655         goto short_args;
9656
9657       val = args[coding_arg_ccl_decoder];
9658       CHECK_CCL_PROGRAM (val);
9659       if (VECTORP (val))
9660         val = Fcopy_sequence (val);
9661       ASET (attrs, coding_attr_ccl_decoder, val);
9662
9663       val = args[coding_arg_ccl_encoder];
9664       CHECK_CCL_PROGRAM (val);
9665       if (VECTORP (val))
9666         val = Fcopy_sequence (val);
9667       ASET (attrs, coding_attr_ccl_encoder, val);
9668
9669       val = args[coding_arg_ccl_valids];
9670       valids = Fmake_string (make_number (256), make_number (0));
9671       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9672         {
9673           int from, to;
9674
9675           val = Fcar (tail);
9676           if (INTEGERP (val))
9677             {
9678               if (! (0 <= XINT (val) && XINT (val) <= 255))
9679                 args_out_of_range_3 (val, make_number (0), make_number (255));
9680               from = to = XINT (val);
9681             }
9682           else
9683             {
9684               CHECK_CONS (val);
9685               CHECK_NATNUM_CAR (val);
9686               CHECK_NUMBER_CDR (val);
9687               if (XINT (XCAR (val)) > 255)
9688                 args_out_of_range_3 (XCAR (val),
9689                                      make_number (0), make_number (255));
9690               from = XINT (XCAR (val));
9691               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9692                 args_out_of_range_3 (XCDR (val),
9693                                      XCAR (val), make_number (255));
9694               to = XINT (XCDR (val));
9695             }
9696           for (i = from; i <= to; i++)
9697             SSET (valids, i, 1);
9698         }
9699       ASET (attrs, coding_attr_ccl_valids, valids);
9700
9701       category = coding_category_ccl;
9702     }
9703   else if (EQ (coding_type, Qutf_16))
9704     {
9705       Lisp_Object bom, endian;
9706
9707       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9708
9709       if (nargs < coding_arg_utf16_max)
9710         goto short_args;
9711
9712       bom = args[coding_arg_utf16_bom];
9713       if (! NILP (bom) && ! EQ (bom, Qt))
9714         {
9715           CHECK_CONS (bom);
9716           val = XCAR (bom);
9717           CHECK_CODING_SYSTEM (val);
9718           val = XCDR (bom);
9719           CHECK_CODING_SYSTEM (val);
9720         }
9721       ASET (attrs, coding_attr_utf_bom, bom);
9722
9723       endian = args[coding_arg_utf16_endian];
9724       CHECK_SYMBOL (endian);
9725       if (NILP (endian))
9726         endian = Qbig;
9727       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9728         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9729       ASET (attrs, coding_attr_utf_16_endian, endian);
9730
9731       category = (CONSP (bom)
9732                   ? coding_category_utf_16_auto
9733                   : NILP (bom)
9734                   ? (EQ (endian, Qbig)
9735                      ? coding_category_utf_16_be_nosig
9736                      : coding_category_utf_16_le_nosig)
9737                   : (EQ (endian, Qbig)
9738                      ? coding_category_utf_16_be
9739                      : coding_category_utf_16_le));
9740     }
9741   else if (EQ (coding_type, Qiso_2022))
9742     {
9743       Lisp_Object initial, reg_usage, request, flags;
9744
9745       if (nargs < coding_arg_iso2022_max)
9746         goto short_args;
9747
9748       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9749       CHECK_VECTOR (initial);
9750       for (i = 0; i < 4; i++)
9751         {
9752           val = Faref (initial, make_number (i));
9753           if (! NILP (val))
9754             {
9755               struct charset *charset;
9756
9757               CHECK_CHARSET_GET_CHARSET (val, charset);
9758               ASET (initial, i, make_number (CHARSET_ID (charset)));
9759               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9760                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9761             }
9762           else
9763             ASET (initial, i, make_number (-1));
9764         }
9765
9766       reg_usage = args[coding_arg_iso2022_reg_usage];
9767       CHECK_CONS (reg_usage);
9768       CHECK_NUMBER_CAR (reg_usage);
9769       CHECK_NUMBER_CDR (reg_usage);
9770
9771       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9772       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9773         {
9774           int id;
9775           Lisp_Object tmp1;
9776
9777           val = Fcar (tail);
9778           CHECK_CONS (val);
9779           tmp1 = XCAR (val);
9780           CHECK_CHARSET_GET_ID (tmp1, id);
9781           CHECK_NATNUM_CDR (val);
9782           if (XINT (XCDR (val)) >= 4)
9783             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9784           XSETCAR (val, make_number (id));
9785         }
9786
9787       flags = args[coding_arg_iso2022_flags];
9788       CHECK_NATNUM (flags);
9789       i = XINT (flags) & INT_MAX;
9790       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9791         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9792       flags = make_number (i);
9793
9794       ASET (attrs, coding_attr_iso_initial, initial);
9795       ASET (attrs, coding_attr_iso_usage, reg_usage);
9796       ASET (attrs, coding_attr_iso_request, request);
9797       ASET (attrs, coding_attr_iso_flags, flags);
9798       setup_iso_safe_charsets (attrs);
9799
9800       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9801         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9802                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9803                     ? coding_category_iso_7_else
9804                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9805                     ? coding_category_iso_7
9806                     : coding_category_iso_7_tight);
9807       else
9808         {
9809           int id = XINT (AREF (initial, 1));
9810
9811           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9812                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9813                        || id < 0)
9814                       ? coding_category_iso_8_else
9815                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9816                       ? coding_category_iso_8_1
9817                       : coding_category_iso_8_2);
9818         }
9819       if (category != coding_category_iso_8_1
9820           && category != coding_category_iso_8_2)
9821         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9822     }
9823   else if (EQ (coding_type, Qemacs_mule))
9824     {
9825       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9826         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9827       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9828       category = coding_category_emacs_mule;
9829     }
9830   else if (EQ (coding_type, Qshift_jis))
9831     {
9832
9833       struct charset *charset;
9834
9835       if (XINT (Flength (charset_list)) != 3
9836           && XINT (Flength (charset_list)) != 4)
9837         error ("There should be three or four charsets");
9838
9839       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9840       if (CHARSET_DIMENSION (charset) != 1)
9841         error ("Dimension of charset %s is not one",
9842                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9843       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9844         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9845
9846       charset_list = XCDR (charset_list);
9847       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9848       if (CHARSET_DIMENSION (charset) != 1)
9849         error ("Dimension of charset %s is not one",
9850                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9851
9852       charset_list = XCDR (charset_list);
9853       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9854       if (CHARSET_DIMENSION (charset) != 2)
9855         error ("Dimension of charset %s is not two",
9856                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9857
9858       charset_list = XCDR (charset_list);
9859       if (! NILP (charset_list))
9860         {
9861           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9862           if (CHARSET_DIMENSION (charset) != 2)
9863             error ("Dimension of charset %s is not two",
9864                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9865         }
9866
9867       category = coding_category_sjis;
9868       Vsjis_coding_system = name;
9869     }
9870   else if (EQ (coding_type, Qbig5))
9871     {
9872       struct charset *charset;
9873
9874       if (XINT (Flength (charset_list)) != 2)
9875         error ("There should be just two charsets");
9876
9877       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9878       if (CHARSET_DIMENSION (charset) != 1)
9879         error ("Dimension of charset %s is not one",
9880                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9881       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9882         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9883
9884       charset_list = XCDR (charset_list);
9885       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9886       if (CHARSET_DIMENSION (charset) != 2)
9887         error ("Dimension of charset %s is not two",
9888                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9889
9890       category = coding_category_big5;
9891       Vbig5_coding_system = name;
9892     }
9893   else if (EQ (coding_type, Qraw_text))
9894     {
9895       category = coding_category_raw_text;
9896       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9897     }
9898   else if (EQ (coding_type, Qutf_8))
9899     {
9900       Lisp_Object bom;
9901
9902       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9903
9904       if (nargs < coding_arg_utf8_max)
9905         goto short_args;
9906
9907       bom = args[coding_arg_utf8_bom];
9908       if (! NILP (bom) && ! EQ (bom, Qt))
9909         {
9910           CHECK_CONS (bom);
9911           val = XCAR (bom);
9912           CHECK_CODING_SYSTEM (val);
9913           val = XCDR (bom);
9914           CHECK_CODING_SYSTEM (val);
9915         }
9916       ASET (attrs, coding_attr_utf_bom, bom);
9917
9918       category = (CONSP (bom) ? coding_category_utf_8_auto
9919                   : NILP (bom) ? coding_category_utf_8_nosig
9920                   : coding_category_utf_8_sig);
9921     }
9922   else if (EQ (coding_type, Qundecided))
9923     category = coding_category_undecided;
9924   else
9925     error ("Invalid coding system type: %s",
9926            SDATA (SYMBOL_NAME (coding_type)));
9927
9928   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9929   CODING_ATTR_PLIST (attrs)
9930     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9931                                 CODING_ATTR_PLIST (attrs)));
9932   CODING_ATTR_PLIST (attrs)
9933     = Fcons (QCascii_compatible_p,
9934              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9935                     CODING_ATTR_PLIST (attrs)));
9936
9937   eol_type = args[coding_arg_eol_type];
9938   if (! NILP (eol_type)
9939       && ! EQ (eol_type, Qunix)
9940       && ! EQ (eol_type, Qdos)
9941       && ! EQ (eol_type, Qmac))
9942     error ("Invalid eol-type");
9943
9944   aliases = Fcons (name, Qnil);
9945
9946   if (NILP (eol_type))
9947     {
9948       eol_type = make_subsidiaries (name);
9949       for (i = 0; i < 3; i++)
9950         {
9951           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9952
9953           this_name = AREF (eol_type, i);
9954           this_aliases = Fcons (this_name, Qnil);
9955           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9956           this_spec = Fmake_vector (make_number (3), attrs);
9957           ASET (this_spec, 1, this_aliases);
9958           ASET (this_spec, 2, this_eol_type);
9959           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9960           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9961           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9962           if (NILP (val))
9963             Vcoding_system_alist
9964               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9965                        Vcoding_system_alist);
9966         }
9967     }
9968
9969   spec_vec = Fmake_vector (make_number (3), attrs);
9970   ASET (spec_vec, 1, aliases);
9971   ASET (spec_vec, 2, eol_type);
9972
9973   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9974   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9975   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9976   if (NILP (val))
9977     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9978                                   Vcoding_system_alist);
9979
9980   {
9981     int id = coding_categories[category].id;
9982
9983     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9984       setup_coding_system (name, &coding_categories[category]);
9985   }
9986
9987   return Qnil;
9988
9989  short_args:
9990   return Fsignal (Qwrong_number_of_arguments,
9991                   Fcons (intern ("define-coding-system-internal"),
9992                          make_number (nargs)));
9993 }
9994
9995
9996 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9997        3, 3, 0,
9998        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9999   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10000 {
10001   Lisp_Object spec, attrs;
10002
10003   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10004   attrs = AREF (spec, 0);
10005   if (EQ (prop, QCmnemonic))
10006     {
10007       if (! STRINGP (val))
10008         CHECK_CHARACTER (val);
10009       CODING_ATTR_MNEMONIC (attrs) = val;
10010     }
10011   else if (EQ (prop, QCdefault_char))
10012     {
10013       if (NILP (val))
10014         val = make_number (' ');
10015       else
10016         CHECK_CHARACTER (val);
10017       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10018     }
10019   else if (EQ (prop, QCdecode_translation_table))
10020     {
10021       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10022         CHECK_SYMBOL (val);
10023       CODING_ATTR_DECODE_TBL (attrs) = val;
10024     }
10025   else if (EQ (prop, QCencode_translation_table))
10026     {
10027       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10028         CHECK_SYMBOL (val);
10029       CODING_ATTR_ENCODE_TBL (attrs) = val;
10030     }
10031   else if (EQ (prop, QCpost_read_conversion))
10032     {
10033       CHECK_SYMBOL (val);
10034       CODING_ATTR_POST_READ (attrs) = val;
10035     }
10036   else if (EQ (prop, QCpre_write_conversion))
10037     {
10038       CHECK_SYMBOL (val);
10039       CODING_ATTR_PRE_WRITE (attrs) = val;
10040     }
10041   else if (EQ (prop, QCascii_compatible_p))
10042     {
10043       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10044     }
10045
10046   CODING_ATTR_PLIST (attrs)
10047     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10048   return val;
10049 }
10050
10051
10052 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10053        Sdefine_coding_system_alias, 2, 2, 0,
10054        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10055   (Lisp_Object alias, Lisp_Object coding_system)
10056 {
10057   Lisp_Object spec, aliases, eol_type, val;
10058
10059   CHECK_SYMBOL (alias);
10060   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10061   aliases = AREF (spec, 1);
10062   /* ALIASES should be a list of length more than zero, and the first
10063      element is a base coding system.  Append ALIAS at the tail of the
10064      list.  */
10065   while (!NILP (XCDR (aliases)))
10066     aliases = XCDR (aliases);
10067   XSETCDR (aliases, Fcons (alias, Qnil));
10068
10069   eol_type = AREF (spec, 2);
10070   if (VECTORP (eol_type))
10071     {
10072       Lisp_Object subsidiaries;
10073       int i;
10074
10075       subsidiaries = make_subsidiaries (alias);
10076       for (i = 0; i < 3; i++)
10077         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10078                                      AREF (eol_type, i));
10079     }
10080
10081   Fputhash (alias, spec, Vcoding_system_hash_table);
10082   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10083   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10084   if (NILP (val))
10085     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10086                                   Vcoding_system_alist);
10087
10088   return Qnil;
10089 }
10090
10091 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10092        1, 1, 0,
10093        doc: /* Return the base of CODING-SYSTEM.
10094 Any alias or subsidiary coding system is not a base coding system.  */)
10095   (Lisp_Object coding_system)
10096 {
10097   Lisp_Object spec, attrs;
10098
10099   if (NILP (coding_system))
10100     return (Qno_conversion);
10101   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10102   attrs = AREF (spec, 0);
10103   return CODING_ATTR_BASE_NAME (attrs);
10104 }
10105
10106 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10107        1, 1, 0,
10108        doc: "Return the property list of CODING-SYSTEM.")
10109   (Lisp_Object coding_system)
10110 {
10111   Lisp_Object spec, attrs;
10112
10113   if (NILP (coding_system))
10114     coding_system = Qno_conversion;
10115   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10116   attrs = AREF (spec, 0);
10117   return CODING_ATTR_PLIST (attrs);
10118 }
10119
10120
10121 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10122        1, 1, 0,
10123        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10124   (Lisp_Object coding_system)
10125 {
10126   Lisp_Object spec;
10127
10128   if (NILP (coding_system))
10129     coding_system = Qno_conversion;
10130   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10131   return AREF (spec, 1);
10132 }
10133
10134 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10135        Scoding_system_eol_type, 1, 1, 0,
10136        doc: /* Return eol-type of CODING-SYSTEM.
10137 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10138
10139 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10140 and CR respectively.
10141
10142 A vector value indicates that a format of end-of-line should be
10143 detected automatically.  Nth element of the vector is the subsidiary
10144 coding system whose eol-type is N.  */)
10145   (Lisp_Object coding_system)
10146 {
10147   Lisp_Object spec, eol_type;
10148   int n;
10149
10150   if (NILP (coding_system))
10151     coding_system = Qno_conversion;
10152   if (! CODING_SYSTEM_P (coding_system))
10153     return Qnil;
10154   spec = CODING_SYSTEM_SPEC (coding_system);
10155   eol_type = AREF (spec, 2);
10156   if (VECTORP (eol_type))
10157     return Fcopy_sequence (eol_type);
10158   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10159   return make_number (n);
10160 }
10161
10162 #endif /* emacs */
10163
10164 \f
/*** 12. Postamble ***/
10166
10167 void
10168 init_coding_once (void)
10169 {
10170   int i;
10171
10172   for (i = 0; i < coding_category_max; i++)
10173     {
10174       coding_categories[i].id = -1;
10175       coding_priorities[i] = i;
10176     }
10177
10178   /* ISO2022 specific initialize routine.  */
10179   for (i = 0; i < 0x20; i++)
10180     iso_code_class[i] = ISO_control_0;
10181   for (i = 0x21; i < 0x7F; i++)
10182     iso_code_class[i] = ISO_graphic_plane_0;
10183   for (i = 0x80; i < 0xA0; i++)
10184     iso_code_class[i] = ISO_control_1;
10185   for (i = 0xA1; i < 0xFF; i++)
10186     iso_code_class[i] = ISO_graphic_plane_1;
10187   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10188   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10189   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10190   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10191   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10192   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10193   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10194   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10195   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10196
10197   for (i = 0; i < 256; i++)
10198     {
10199       emacs_mule_bytes[i] = 1;
10200     }
10201   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10202   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10203   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10204   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10205 }
10206
10207 #ifdef emacs
10208
10209 void
10210 syms_of_coding (void)
10211 {
10212   staticpro (&Vcoding_system_hash_table);
10213   {
10214     Lisp_Object args[2];
10215     args[0] = QCtest;
10216     args[1] = Qeq;
10217     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10218   }
10219
10220   staticpro (&Vsjis_coding_system);
10221   Vsjis_coding_system = Qnil;
10222
10223   staticpro (&Vbig5_coding_system);
10224   Vbig5_coding_system = Qnil;
10225
10226   staticpro (&Vcode_conversion_reused_workbuf);
10227   Vcode_conversion_reused_workbuf = Qnil;
10228
10229   staticpro (&Vcode_conversion_workbuf_name);
10230   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10231
10232   reused_workbuf_in_use = 0;
10233
10234   DEFSYM (Qcharset, "charset");
10235   DEFSYM (Qtarget_idx, "target-idx");
10236   DEFSYM (Qcoding_system_history, "coding-system-history");
10237   Fset (Qcoding_system_history, Qnil);
10238
10239   /* Target FILENAME is the first argument.  */
10240   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10241   /* Target FILENAME is the third argument.  */
10242   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10243
10244   DEFSYM (Qcall_process, "call-process");
10245   /* Target PROGRAM is the first argument.  */
10246   Fput (Qcall_process, Qtarget_idx, make_number (0));
10247
10248   DEFSYM (Qcall_process_region, "call-process-region");
10249   /* Target PROGRAM is the third argument.  */
10250   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10251
10252   DEFSYM (Qstart_process, "start-process");
10253   /* Target PROGRAM is the third argument.  */
10254   Fput (Qstart_process, Qtarget_idx, make_number (2));
10255
10256   DEFSYM (Qopen_network_stream, "open-network-stream");
10257   /* Target SERVICE is the fourth argument.  */
10258   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10259
10260   DEFSYM (Qcoding_system, "coding-system");
10261   DEFSYM (Qcoding_aliases, "coding-aliases");
10262
10263   DEFSYM (Qeol_type, "eol-type");
10264   DEFSYM (Qunix, "unix");
10265   DEFSYM (Qdos, "dos");
10266
10267   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10268   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10269   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10270   DEFSYM (Qdefault_char, "default-char");
10271   DEFSYM (Qundecided, "undecided");
10272   DEFSYM (Qno_conversion, "no-conversion");
10273   DEFSYM (Qraw_text, "raw-text");
10274
10275   DEFSYM (Qiso_2022, "iso-2022");
10276
10277   DEFSYM (Qutf_8, "utf-8");
10278   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10279
10280   DEFSYM (Qutf_16, "utf-16");
10281   DEFSYM (Qbig, "big");
10282   DEFSYM (Qlittle, "little");
10283
10284   DEFSYM (Qshift_jis, "shift-jis");
10285   DEFSYM (Qbig5, "big5");
10286
10287   DEFSYM (Qcoding_system_p, "coding-system-p");
10288
10289   DEFSYM (Qcoding_system_error, "coding-system-error");
10290   Fput (Qcoding_system_error, Qerror_conditions,
10291         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10292   Fput (Qcoding_system_error, Qerror_message,
10293         make_pure_c_string ("Invalid coding system"));
10294
10295   /* Intern this now in case it isn't already done.
10296      Setting this variable twice is harmless.
10297      But don't staticpro it here--that is done in alloc.c.  */
10298   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10299
10300   DEFSYM (Qtranslation_table, "translation-table");
10301   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10302   DEFSYM (Qtranslation_table_id, "translation-table-id");
10303   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10304   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10305
10306   DEFSYM (Qvalid_codes, "valid-codes");
10307
10308   DEFSYM (Qemacs_mule, "emacs-mule");
10309
10310   DEFSYM (QCcategory, ":category");
10311   DEFSYM (QCmnemonic, ":mnemonic");
10312   DEFSYM (QCdefault_char, ":default-char");
10313   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10314   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10315   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10316   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10317   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10318
10319   Vcoding_category_table
10320     = Fmake_vector (make_number (coding_category_max), Qnil);
10321   staticpro (&Vcoding_category_table);
10322   /* Followings are target of code detection.  */
10323   ASET (Vcoding_category_table, coding_category_iso_7,
10324         intern_c_string ("coding-category-iso-7"));
10325   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10326         intern_c_string ("coding-category-iso-7-tight"));
10327   ASET (Vcoding_category_table, coding_category_iso_8_1,
10328         intern_c_string ("coding-category-iso-8-1"));
10329   ASET (Vcoding_category_table, coding_category_iso_8_2,
10330         intern_c_string ("coding-category-iso-8-2"));
10331   ASET (Vcoding_category_table, coding_category_iso_7_else,
10332         intern_c_string ("coding-category-iso-7-else"));
10333   ASET (Vcoding_category_table, coding_category_iso_8_else,
10334         intern_c_string ("coding-category-iso-8-else"));
10335   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10336         intern_c_string ("coding-category-utf-8-auto"));
10337   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10338         intern_c_string ("coding-category-utf-8"));
10339   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10340         intern_c_string ("coding-category-utf-8-sig"));
10341   ASET (Vcoding_category_table, coding_category_utf_16_be,
10342         intern_c_string ("coding-category-utf-16-be"));
10343   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10344         intern_c_string ("coding-category-utf-16-auto"));
10345   ASET (Vcoding_category_table, coding_category_utf_16_le,
10346         intern_c_string ("coding-category-utf-16-le"));
10347   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10348         intern_c_string ("coding-category-utf-16-be-nosig"));
10349   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10350         intern_c_string ("coding-category-utf-16-le-nosig"));
10351   ASET (Vcoding_category_table, coding_category_charset,
10352         intern_c_string ("coding-category-charset"));
10353   ASET (Vcoding_category_table, coding_category_sjis,
10354         intern_c_string ("coding-category-sjis"));
10355   ASET (Vcoding_category_table, coding_category_big5,
10356         intern_c_string ("coding-category-big5"));
10357   ASET (Vcoding_category_table, coding_category_ccl,
10358         intern_c_string ("coding-category-ccl"));
10359   ASET (Vcoding_category_table, coding_category_emacs_mule,
10360         intern_c_string ("coding-category-emacs-mule"));
10361   /* Followings are NOT target of code detection.  */
10362   ASET (Vcoding_category_table, coding_category_raw_text,
10363         intern_c_string ("coding-category-raw-text"));
10364   ASET (Vcoding_category_table, coding_category_undecided,
10365         intern_c_string ("coding-category-undecided"));
10366
10367   DEFSYM (Qinsufficient_source, "insufficient-source");
10368   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10369   DEFSYM (Qinvalid_source, "invalid-source");
10370   DEFSYM (Qinterrupted, "interrupted");
10371   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10372   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10373
10374   defsubr (&Scoding_system_p);
10375   defsubr (&Sread_coding_system);
10376   defsubr (&Sread_non_nil_coding_system);
10377   defsubr (&Scheck_coding_system);
10378   defsubr (&Sdetect_coding_region);
10379   defsubr (&Sdetect_coding_string);
10380   defsubr (&Sfind_coding_systems_region_internal);
10381   defsubr (&Sunencodable_char_position);
10382   defsubr (&Scheck_coding_systems_region);
10383   defsubr (&Sdecode_coding_region);
10384   defsubr (&Sencode_coding_region);
10385   defsubr (&Sdecode_coding_string);
10386   defsubr (&Sencode_coding_string);
10387   defsubr (&Sdecode_sjis_char);
10388   defsubr (&Sencode_sjis_char);
10389   defsubr (&Sdecode_big5_char);
10390   defsubr (&Sencode_big5_char);
10391   defsubr (&Sset_terminal_coding_system_internal);
10392   defsubr (&Sset_safe_terminal_coding_system_internal);
10393   defsubr (&Sterminal_coding_system);
10394   defsubr (&Sset_keyboard_coding_system_internal);
10395   defsubr (&Skeyboard_coding_system);
10396   defsubr (&Sfind_operation_coding_system);
10397   defsubr (&Sset_coding_system_priority);
10398   defsubr (&Sdefine_coding_system_internal);
10399   defsubr (&Sdefine_coding_system_alias);
10400   defsubr (&Scoding_system_put);
10401   defsubr (&Scoding_system_base);
10402   defsubr (&Scoding_system_plist);
10403   defsubr (&Scoding_system_aliases);
10404   defsubr (&Scoding_system_eol_type);
10405   defsubr (&Scoding_system_priority_list);
10406
10407   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10408                doc: /* List of coding systems.
10409
10410 Do not alter the value of this variable manually.  This variable should be
10411 updated by the functions `define-coding-system' and
10412 `define-coding-system-alias'.  */);
10413   Vcoding_system_list = Qnil;
10414
10415   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10416                doc: /* Alist of coding system names.
10417 Each element is a one-element list containing a coding system name.
10418 This variable is given to `completing-read' as COLLECTION argument.
10419
10420 Do not alter the value of this variable manually.  This variable should be
10421 updated by the functions `define-coding-system' and
10422 `define-coding-system-alias'.  */);
10423   Vcoding_system_alist = Qnil;
10424
10425   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10426                doc: /* List of coding-categories (symbols) ordered by priority.
10427
10428 On detecting a coding system, Emacs tries code detection algorithms
10429 associated with each coding-category one by one in this order.  When
10430 one algorithm agrees with a byte sequence of source text, the coding
10431 system bound to the corresponding coding-category is selected.
10432
10433 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10434   {
10435     int i;
10436
10437     Vcoding_category_list = Qnil;
10438     for (i = coding_category_max - 1; i >= 0; i--)
10439       Vcoding_category_list
10440         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10441                  Vcoding_category_list);
10442   }
10443
10444   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10445                doc: /* Specify the coding system for read operations.
10446 It is useful to bind this variable with `let', but do not set it globally.
10447 If the value is a coding system, it is used for decoding on read operation.
10448 If not, an appropriate element is used from one of the coding system alists.
10449 There are three such tables: `file-coding-system-alist',
10450 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10451   Vcoding_system_for_read = Qnil;
10452
10453   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10454                doc: /* Specify the coding system for write operations.
10455 Programs bind this variable with `let', but you should not set it globally.
10456 If the value is a coding system, it is used for encoding of output,
10457 when writing it to a file and when sending it to a file or subprocess.
10458
10459 If this does not specify a coding system, an appropriate element
10460 is used from one of the coding system alists.
10461 There are three such tables: `file-coding-system-alist',
10462 `process-coding-system-alist', and `network-coding-system-alist'.
10463 For output to files, if the above procedure does not specify a coding system,
10464 the value of `buffer-file-coding-system' is used.  */);
10465   Vcoding_system_for_write = Qnil;
10466
10467   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10468                doc: /*
10469 Coding system used in the latest file or process I/O.  */);
10470   Vlast_coding_system_used = Qnil;
10471
10472   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10473                doc: /*
10474 Error status of the last code conversion.
10475
10476 When an error was detected in the last code conversion, this variable
10477 is set to one of the following symbols.
10478   `insufficient-source'
10479   `inconsistent-eol'
10480   `invalid-source'
10481   `interrupted'
10482   `insufficient-memory'
10483 When no error was detected, the value doesn't change.  So, to check
10484 the error status of a code conversion by this variable, you must
10485 explicitly set this variable to nil before performing code
10486 conversion.  */);
10487   Vlast_code_conversion_error = Qnil;
10488
10489   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10490                doc: /*
10491 *Non-nil means always inhibit code conversion of end-of-line format.
10492 See info node `Coding Systems' and info node `Text and Binary' concerning
10493 such conversion.  */);
10494   inhibit_eol_conversion = 0;
10495
10496   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10497                doc: /*
10498 Non-nil means process buffer inherits coding system of process output.
10499 Bind it to t if the process output is to be treated as if it were a file
10500 read from some filesystem.  */);
10501   inherit_process_coding_system = 0;
10502
10503   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10504                doc: /*
10505 Alist to decide a coding system to use for a file I/O operation.
10506 The format is ((PATTERN . VAL) ...),
10507 where PATTERN is a regular expression matching a file name,
10508 VAL is a coding system, a cons of coding systems, or a function symbol.
10509 If VAL is a coding system, it is used for both decoding and encoding
10510 the file contents.
10511 If VAL is a cons of coding systems, the car part is used for decoding,
10512 and the cdr part is used for encoding.
10513 If VAL is a function symbol, the function must return a coding system
10514 or a cons of coding systems which are used as above.  The function is
10515 called with an argument that is a list of the arguments with which
10516 `find-operation-coding-system' was called.  If the function can't decide
10517 a coding system, it can return `undecided' so that the normal
10518 code-detection is performed.
10519
10520 See also the function `find-operation-coding-system'
10521 and the variable `auto-coding-alist'.  */);
10522   Vfile_coding_system_alist = Qnil;
10523
10524   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10525                doc: /*
10526 Alist to decide a coding system to use for a process I/O operation.
10527 The format is ((PATTERN . VAL) ...),
10528 where PATTERN is a regular expression matching a program name,
10529 VAL is a coding system, a cons of coding systems, or a function symbol.
10530 If VAL is a coding system, it is used for both decoding what received
10531 from the program and encoding what sent to the program.
10532 If VAL is a cons of coding systems, the car part is used for decoding,
10533 and the cdr part is used for encoding.
10534 If VAL is a function symbol, the function must return a coding system
10535 or a cons of coding systems which are used as above.
10536
10537 See also the function `find-operation-coding-system'.  */);
10538   Vprocess_coding_system_alist = Qnil;
10539
10540   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10541                doc: /*
10542 Alist to decide a coding system to use for a network I/O operation.
10543 The format is ((PATTERN . VAL) ...),
10544 where PATTERN is a regular expression matching a network service name
10545 or is a port number to connect to,
10546 VAL is a coding system, a cons of coding systems, or a function symbol.
10547 If VAL is a coding system, it is used for both decoding what received
10548 from the network stream and encoding what sent to the network stream.
10549 If VAL is a cons of coding systems, the car part is used for decoding,
10550 and the cdr part is used for encoding.
10551 If VAL is a function symbol, the function must return a coding system
10552 or a cons of coding systems which are used as above.
10553
10554 See also the function `find-operation-coding-system'.  */);
10555   Vnetwork_coding_system_alist = Qnil;
10556
10557   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10558                doc: /* Coding system to use with system messages.
10559 Also used for decoding keyboard input on X Window system.  */);
10560   Vlocale_coding_system = Qnil;
10561
10562   /* The eol mnemonics are reset in startup.el system-dependently.  */
10563   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10564                doc: /*
10565 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10566   eol_mnemonic_unix = make_pure_c_string (":");
10567
10568   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10569                doc: /*
10570 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10571   eol_mnemonic_dos = make_pure_c_string ("\\");
10572
10573   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10574                doc: /*
10575 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10576   eol_mnemonic_mac = make_pure_c_string ("/");
10577
10578   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10579                doc: /*
10580 *String displayed in mode line when end-of-line format is not yet determined.  */);
10581   eol_mnemonic_undecided = make_pure_c_string (":");
10582
10583   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10584                doc: /*
10585 *Non-nil enables character translation while encoding and decoding.  */);
10586   Venable_character_translation = Qt;
10587
10588   DEFVAR_LISP ("standard-translation-table-for-decode",
10589                Vstandard_translation_table_for_decode,
10590                doc: /* Table for translating characters while decoding.  */);
10591   Vstandard_translation_table_for_decode = Qnil;
10592
10593   DEFVAR_LISP ("standard-translation-table-for-encode",
10594                Vstandard_translation_table_for_encode,
10595                doc: /* Table for translating characters while encoding.  */);
10596   Vstandard_translation_table_for_encode = Qnil;
10597
10598   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10599                doc: /* Alist of charsets vs revision numbers.
10600 While encoding, if a charset (car part of an element) is found,
10601 designate it with the escape sequence identifying revision (cdr part
10602 of the element).  */);
10603   Vcharset_revision_table = Qnil;
10604
10605   DEFVAR_LISP ("default-process-coding-system",
10606                Vdefault_process_coding_system,
10607                doc: /* Cons of coding systems used for process I/O by default.
10608 The car part is used for decoding a process output,
10609 the cdr part is used for encoding a text to be sent to a process.  */);
10610   Vdefault_process_coding_system = Qnil;
10611
10612   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10613                doc: /*
10614 Table of extra Latin codes in the range 128..159 (inclusive).
10615 This is a vector of length 256.
10616 If Nth element is non-nil, the existence of code N in a file
10617 \(or output of subprocess) doesn't prevent it from being detected as
10618 a coding system of ISO 2022 variant which has a flag
10619 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10620 or reading output of a subprocess.
10621 Only 128th through 159th elements have a meaning.  */);
10622   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10623
10624   DEFVAR_LISP ("select-safe-coding-system-function",
10625                Vselect_safe_coding_system_function,
10626                doc: /*
10627 Function to call to select safe coding system for encoding a text.
10628
10629 If set, this function is called to force a user to select a proper
10630 coding system which can encode the text in the case that a default
10631 coding system used in each operation can't encode the text.  The
10632 function should take care that the buffer is not modified while
10633 the coding system is being selected.
10634
10635 The default value is `select-safe-coding-system' (which see).  */);
10636   Vselect_safe_coding_system_function = Qnil;
10637
10638   DEFVAR_BOOL ("coding-system-require-warning",
10639                coding_system_require_warning,
10640                doc: /* Internal use only.
10641 If non-nil, on writing a file, `select-safe-coding-system-function' is
10642 called even if `coding-system-for-write' is non-nil.  The command
10643 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10644   coding_system_require_warning = 0;
10645
10646
10647   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10648                inhibit_iso_escape_detection,
10649                doc: /*
10650 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10651
10652 When Emacs reads text, it tries to detect how the text is encoded.
10653 This code detection is sensitive to escape sequences.  If Emacs sees
10654 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10655 of the ISO2022 encodings, and decodes text by the corresponding coding
10656 system (e.g. `iso-2022-7bit').
10657
10658 However, there may be a case that you want to read escape sequences in
10659 a file as is.  In such a case, you can set this variable to non-nil.
10660 Then the code detection will ignore any escape sequences, and no text is
10661 detected as encoded in some ISO-2022 encoding.  The result is that all
10662 escape sequences become visible in a buffer.
10663
10664 The default value is nil, and it is strongly recommended not to change
10665 it.  That is because many Emacs Lisp source files that contain
10666 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10667 in Emacs's distribution, and they won't be decoded correctly on
10668 reading if you suppress escape sequence detection.
10669
10670 The other way to read escape sequences in a file without decoding is
10671 to explicitly specify some coding system that doesn't use ISO-2022
10672 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10673   inhibit_iso_escape_detection = 0;
10674
10675   DEFVAR_BOOL ("inhibit-null-byte-detection",
10676                inhibit_null_byte_detection,
10677                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10678 By default, Emacs treats it as binary data, and does not attempt to
10679 decode it.  The effect is as if you specified `no-conversion' for
10680 reading that text.
10681
10682 Set this to non-nil when a regular text happens to include null bytes.
10683 Examples are Index nodes of Info files and null-byte delimited output
10684 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10685 decode text as usual.  */);
10686   inhibit_null_byte_detection = 0;
10687
10688   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10689                doc: /* Char table for translating self-inserting characters.
10690 This is applied to the result of input methods, not their input.
10691 See also `keyboard-translate-table'.
10692
10693 Use of this variable for character code unification was rendered
10694 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10695 internal character representation.  */);
10696     Vtranslation_table_for_input = Qnil;
10697
10698   {
10699     Lisp_Object args[coding_arg_max];
10700     Lisp_Object plist[16];
10701     int i;
10702
10703     for (i = 0; i < coding_arg_max; i++)
10704       args[i] = Qnil;
10705
10706     plist[0] = intern_c_string (":name");
10707     plist[1] = args[coding_arg_name] = Qno_conversion;
10708     plist[2] = intern_c_string (":mnemonic");
10709     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10710     plist[4] = intern_c_string (":coding-type");
10711     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10712     plist[6] = intern_c_string (":ascii-compatible-p");
10713     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10714     plist[8] = intern_c_string (":default-char");
10715     plist[9] = args[coding_arg_default_char] = make_number (0);
10716     plist[10] = intern_c_string (":for-unibyte");
10717     plist[11] = args[coding_arg_for_unibyte] = Qt;
10718     plist[12] = intern_c_string (":docstring");
10719     plist[13] = make_pure_c_string ("Do no conversion.\n\
10720 \n\
10721 When you visit a file with this coding, the file is read into a\n\
10722 unibyte buffer as is, thus each byte of a file is treated as a\n\
10723 character.");
10724     plist[14] = intern_c_string (":eol-type");
10725     plist[15] = args[coding_arg_eol_type] = Qunix;
10726     args[coding_arg_plist] = Flist (16, plist);
10727     Fdefine_coding_system_internal (coding_arg_max, args);
10728
10729     plist[1] = args[coding_arg_name] = Qundecided;
10730     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10731     plist[5] = args[coding_arg_coding_type] = Qundecided;
10732     /* This is already set.
10733        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10734     plist[8] = intern_c_string (":charset-list");
10735     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10736     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10737     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10738     plist[15] = args[coding_arg_eol_type] = Qnil;
10739     args[coding_arg_plist] = Flist (16, plist);
10740     Fdefine_coding_system_internal (coding_arg_max, args);
10741   }
10742
10743   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10744
10745   {
10746     int i;
10747
10748     for (i = 0; i < coding_category_max; i++)
10749       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10750   }
10751 #if defined (DOS_NT)
10752   system_eol_type = Qdos;
10753 #else
10754   system_eol_type = Qunix;
10755 #endif
10756   staticpro (&system_eol_type);
10757 }
10758
10759 char *
10760 emacs_strerror (int error_number)
10761 {
10762   char *str;
10763
10764   synchronize_system_messages_locale ();
10765   str = strerror (error_number);
10766
10767   if (! NILP (Vlocale_coding_system))
10768     {
10769       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10770                                                       Vlocale_coding_system,
10771                                                       0);
10772       str = SSDATA (dec);
10773     }
10774
10775   return str;
10776 }
10777
10778 #endif /* emacs */