src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
   58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
   59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from a coding system symbol to its attributes vector is done by
   62   looking up the symbol in Vcoding_system_hash_table.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
  122   How end-of-line is encoded depends on the operating system.  For
  123   instance, Unix's format is just one byte of LF (line-feed) code,
  124   whereas DOS's format is a two-byte sequence of `carriage-return' and
  125   `line-feed' codes.  MacOS's format is usually one byte of
  126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that the source area and destination area
  255   overlap, which means that we can produce encoded text only until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 /* Coding-systems are handed between Emacs Lisp programs and C internal
 347    routines by the following three variables.  */
 348 /* Coding system to be used to encode text for terminal display when
 349    terminal coding system is nil.  */
 350 struct coding_system safe_terminal_coding;
 351
 352 #endif /* emacs */
 353
 354 Lisp_Object Qtranslation_table;
 355 Lisp_Object Qtranslation_table_id;
 356 static Lisp_Object Qtranslation_table_for_decode;
 357 static Lisp_Object Qtranslation_table_for_encode;
 358
 359 /* Two special coding systems.  */
 360 static Lisp_Object Vsjis_coding_system;
 361 static Lisp_Object Vbig5_coding_system;
 362
 363 /* ISO2022 section */
 364
 365 #define CODING_ISO_INITIAL(coding, reg)                 \
 366   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 367                      coding_attr_iso_initial),          \
 368                reg)))
 369
  370 /* safe_charsets[CHARSET_ID], or -1 if CHARSET_ID > max_charset_id or the entry is 255.  */
  371 #define CODING_ISO_REQUEST(coding, charset_id)          \
  372   (((charset_id) <= (coding)->max_charset_id            \
  373     ? ((coding)->safe_charsets[charset_id] != 255       \
  374        ? (coding)->safe_charsets[charset_id]            \
  375        : -1)                                            \
  376     : -1))
 377
 378
 379 #define CODING_ISO_FLAGS(coding)        \
 380   ((coding)->spec.iso_2022.flags)
 381 #define CODING_ISO_DESIGNATION(coding, reg)     \
 382   ((coding)->spec.iso_2022.current_designation[reg])
 383 #define CODING_ISO_INVOCATION(coding, plane)    \
 384   ((coding)->spec.iso_2022.current_invocation[plane])
 385 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 386   ((coding)->spec.iso_2022.single_shifting)
 387 #define CODING_ISO_BOL(coding)  \
 388   ((coding)->spec.iso_2022.bol)
 389 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 390   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 391 #define CODING_ISO_CMP_STATUS(coding)   \
 392   (&(coding)->spec.iso_2022.cmp_status)
 393 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 394   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 395 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 396   ((coding)->spec.iso_2022.embedded_utf_8)
 397
 398 /* Control characters of ISO2022.  */
 399                         /* code */      /* function */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
  408 /* Each 1-byte code of ISO2022 is classified into one of the
  409    following classes.  */
  410 enum iso_code_class_type
  411   {
  412     ISO_control_0,              /* Control codes in the range
  413                                    0x00..0x1F and 0x7F, except for the
  414                                    following 4 codes.  */
  415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  418     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  419     ISO_control_1,              /* Control codes in the range
  420                                    0x80..0x9F, except for the
  421                                    following 3 codes.  */
  422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  429   };
 430
  431 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overwrite
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481
 482 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories' */
 518
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Definitions of flag bits used in detect_coding_XXXX.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
  568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority.  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category.  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
  636 /* Fallback definitions; each evaluates one of its arguments twice.  */
  637 #ifndef min
  638 #define min(a, b) ((a) < (b) ? (a) : (b))
  639 #endif
  640 #ifndef max
  641 #define max(a, b) ((a) > (b) ? (a) : (b))
  642 #endif
  643 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
  644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  645   do {                                                  \
  646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  648   } while (0)
 649
 650
  651 /* Safely get one byte from the source text pointed to by SRC, which
  652    ends at SRC_END, and set C to it.  If the source is exhausted, jump
  653    to 'no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC
  654    when SRC_BASE < SRC.  If MULTIBYTEP, a two-byte sequence led by 0xC0
  655    or 0xC1 is combined into one value; any other multibyte character
  656    sets C to its negated code and records CODING_RESULT_INVALID_SRC.
  657    Needs: coding, src, src_base, src_end, multibytep, consumed_chars.  */
  658
  659 #define ONE_MORE_BYTE(c)                                \
  660   do {                                                  \
  661     if (src == src_end)                                 \
  662       {                                                 \
  663         if (src_base < src)                             \
  664           record_conversion_result                      \
  665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  666         goto no_more_source;                            \
  667       }                                                 \
  668     c = *src++;                                         \
  669     if (multibytep && (c & 0x80))                       \
  670       {                                                 \
  671         if ((c & 0xFE) == 0xC0)                         \
  672           c = ((c & 1) << 6) | *src++;                  \
  673         else                                            \
  674           {                                             \
  675             src--;                                      \
  676             c = - string_char (src, &src, NULL);        \
  677             record_conversion_result                    \
  678               (coding, CODING_RESULT_INVALID_SRC);      \
  679           }                                             \
  680       }                                                 \
  681     consumed_chars++;                                   \
  682   } while (0)
 683
  684 /* Safely get two bytes from the source text pointed to by SRC, which
  685    ends at SRC_END, and set C1 and C2 to those bytes while skipping
  686    leading multibyte characters.  If there are not enough bytes in the
  687    source, jump to 'no_more_source'.  If MULTIBYTEP and a multibyte
  688    character (other than a two-byte sequence led by 0xC0 or 0xC1) is
  689    found for C2, set C2 to -1.  The caller should declare and set
  690    these variables appropriately in advance:
  691         src, src_end, multibytep
  692    It is intended that this macro is used in detect_coding_utf_16.  */
  693
  694 #define TWO_MORE_BYTES(c1, c2)                          \
  695   do {                                                  \
  696     do {                                                \
  697       if (src == src_end)                               \
  698         goto no_more_source;                            \
  699       c1 = *src++;                                      \
  700       if (multibytep && (c1 & 0x80))                    \
  701         {                                               \
  702           if ((c1 & 0xFE) == 0xC0)                      \
  703             c1 = ((c1 & 1) << 6) | *src++;              \
  704           else                                          \
  705             {                                           \
  706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  707               c1 = -1;                                  \
  708             }                                           \
  709         }                                               \
  710     } while (c1 < 0);                                   \
  711     if (src == src_end)                                 \
  712       goto no_more_source;                              \
  713     c2 = *src++;                                        \
  714     if (multibytep && (c2 & 0x80))                      \
  715       {                                                 \
  716         if ((c2 & 0xFE) == 0xC0)                        \
  717           c2 = ((c2 & 1) << 6) | *src++;                \
  718         else                                            \
  719           c2 = -1;                                      \
  720       }                                                 \
  721   } while (0)
 722
 723
  724 /* Store a byte C in the place pointed to by DST, advance DST to the
  725    next free point, and increment PRODUCED_CHARS.  The caller should
  726    ensure that C is 0..127, and declare and set the variables `dst'
  727    and `produced_chars' appropriately in advance.
  728 */
  729
  730
  731 #define EMIT_ONE_ASCII_BYTE(c)  \
  732   do {                          \
  733     produced_chars++;           \
  734     *dst++ = (c);               \
  735   } while (0)
 736
 737
 738 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 739
 740 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 741   do {                                  \
 742     produced_chars += 2;                \
 743     *dst++ = (c1), *dst++ = (c2);       \
 744   } while (0)
 745
 746
  747 /* Store a byte C in the place pointed to by DST, advance DST to the
  748    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
  749    store C in multibyte form (a byte >= 0x80 goes through
  750    BYTE8_TO_CHAR first).  The caller should declare and set the
  751    variables `dst', `multibytep' and `produced_chars' in advance.  */
  752
  753 #define EMIT_ONE_BYTE(c)                \
  754   do {                                  \
  755     produced_chars++;                   \
  756     if (multibytep)                     \
  757       {                                 \
  758         unsigned ch = (c);              \
  759         if (ch >= 0x80)                 \
  760           ch = BYTE8_TO_CHAR (ch);      \
  761         CHAR_STRING_ADVANCE (ch, dst);  \
  762       }                                 \
  763     else                                \
  764       *dst++ = (c);                     \
  765   } while (0)
 766
 767
 768 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 769
 770 #define EMIT_TWO_BYTES(c1, c2)          \
 771   do {                                  \
 772     produced_chars += 2;                \
 773     if (multibytep)                     \
 774       {                                 \
 775         unsigned ch;                    \
 776                                         \
 777         ch = (c1);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781         ch = (c2);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785       }                                 \
 786     else                                \
 787       {                                 \
 788         *dst++ = (c1);                  \
 789         *dst++ = (c2);                  \
 790       }                                 \
 791   } while (0)
 792
 793
 794 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 795   do {                                  \
 796     EMIT_ONE_BYTE (c1);                 \
 797     EMIT_TWO_BYTES (c2, c3);            \
 798   } while (0)
 799
 800
 801 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 802   do {                                          \
 803     EMIT_TWO_BYTES (c1, c2);                    \
 804     EMIT_TWO_BYTES (c3, c4);                    \
 805   } while (0)
 806
 807
 808 static void
 809 record_conversion_result (struct coding_system *coding,
 810                           enum coding_result_code result)
 811 {
 812   coding->result = result;
 813   switch (result)
 814     {
 815     case CODING_RESULT_INSUFFICIENT_SRC:
 816       Vlast_code_conversion_error = Qinsufficient_source;
 817       break;
 818     case CODING_RESULT_INCONSISTENT_EOL:
 819       Vlast_code_conversion_error = Qinconsistent_eol;
 820       break;
 821     case CODING_RESULT_INVALID_SRC:
 822       Vlast_code_conversion_error = Qinvalid_source;
 823       break;
 824     case CODING_RESULT_INTERRUPT:
 825       Vlast_code_conversion_error = Qinterrupted;
 826       break;
 827     case CODING_RESULT_INSUFFICIENT_MEM:
 828       Vlast_code_conversion_error = Qinsufficient_memory;
 829       break;
 830     case CODING_RESULT_INSUFFICIENT_DST:
 831       /* Don't record this error in Vlast_code_conversion_error
 832          because it happens just temporarily and is resolved when the
 833          whole conversion is finished.  */
 834       break;
 835     case CODING_RESULT_SUCCESS:
 836       break;
 837     default:
 838       Vlast_code_conversion_error = intern ("Unknown error");
 839     }
 840 }
 841
 842 /* These wrapper macros are used to preserve validity of pointers into
 843    buffer text across calls to decode_char, encode_char, etc, which
 844    could cause relocation of buffers if it loads a charset map,
 845    because loading a charset map allocates large structures.  */
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     ptrdiff_t offset;                                                        \
 850                                                                              \
 851     charset_map_loaded = 0;                                                  \
 852     c = DECODE_CHAR (charset, code);                                         \
 853     if (charset_map_loaded                                                   \
 854         && (offset = coding_change_source (coding)))                         \
 855       {                                                                      \
 856         src += offset;                                                       \
 857         src_base += offset;                                                  \
 858         src_end += offset;                                                   \
 859       }                                                                      \
 860   } while (0)
 861
 862 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 863   do {                                                                  \
 864     ptrdiff_t offset;                                                   \
 865                                                                         \
 866     charset_map_loaded = 0;                                             \
 867     code = ENCODE_CHAR (charset, c);                                    \
 868     if (charset_map_loaded                                              \
 869         && (offset = coding_change_destination (coding)))               \
 870       {                                                                 \
 871         dst += offset;                                                  \
 872         dst_end += offset;                                              \
 873       }                                                                 \
 874   } while (0)
 875
 876 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 877   do {                                                                  \
 878     ptrdiff_t offset;                                                   \
 879                                                                         \
 880     charset_map_loaded = 0;                                             \
 881     charset = char_charset (c, charset_list, code_return);              \
 882     if (charset_map_loaded                                              \
 883         && (offset = coding_change_destination (coding)))               \
 884       {                                                                 \
 885         dst += offset;                                                  \
 886         dst_end += offset;                                              \
 887       }                                                                 \
 888   } while (0)
 889
 890 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 891   do {                                                                  \
 892     ptrdiff_t offset;                                                   \
 893                                                                         \
 894     charset_map_loaded = 0;                                             \
 895     result = CHAR_CHARSET_P (c, charset);                               \
 896     if (charset_map_loaded                                              \
 897         && (offset = coding_change_destination (coding)))               \
 898       {                                                                 \
 899         dst += offset;                                                  \
 900         dst_end += offset;                                              \
 901       }                                                                 \
 902   } while (0)
 903
 904
 905 /* If there are at least BYTES length of room at dst, allocate memory
 906    for coding->destination and update dst and dst_end.  We don't have
 907    to take care of coding->source which will be relocated.  It is
 908    handled by calling coding_set_source in encode_coding.  */
 909
 910 #define ASSURE_DESTINATION(bytes)                               \
 911   do {                                                          \
 912     if (dst + (bytes) >= dst_end)                               \
 913       {                                                         \
 914         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 915                                                                 \
 916         dst = alloc_destination (coding, more_bytes, dst);      \
 917         dst_end = coding->destination + coding->dst_bytes;      \
 918       }                                                         \
 919   } while (0)
 920
 921
 922 /* Store multibyte form of the character C in P, and advance P to the
 923    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 924    never calls MAYBE_UNIFY_CHAR.  */
 925
 926 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 927   do {                                          \
 928     if ((c) <= MAX_1_BYTE_CHAR)                 \
 929       *(p)++ = (c);                             \
 930     else if ((c) <= MAX_2_BYTE_CHAR)            \
 931       *(p)++ = (0xC0 | ((c) >> 6)),             \
 932         *(p)++ = (0x80 | ((c) & 0x3F));         \
 933     else if ((c) <= MAX_3_BYTE_CHAR)            \
 934       *(p)++ = (0xE0 | ((c) >> 12)),            \
 935         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 936         *(p)++ = (0x80 | ((c) & 0x3F));         \
 937     else if ((c) <= MAX_4_BYTE_CHAR)            \
 938       *(p)++ = (0xF0 | (c >> 18)),              \
 939         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 940         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 941         *(p)++ = (0x80 | (c & 0x3F));           \
 942     else if ((c) <= MAX_5_BYTE_CHAR)            \
 943       *(p)++ = 0xF8,                            \
 944         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 945         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 946         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 947         *(p)++ = (0x80 | (c & 0x3F));           \
 948     else                                        \
 949       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 950   } while (0)
 951
 952
 953 /* Return the character code of character whose multibyte form is at
 954    P, and advance P to the end of the multibyte form.  This is like
 955    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 956
 957 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 958   (!((p)[0] & 0x80)                                             \
 959    ? *(p)++                                                     \
 960    : ! ((p)[0] & 0x20)                                          \
 961    ? ((p) += 2,                                                 \
 962       ((((p)[-2] & 0x1F) << 6)                                  \
 963        | ((p)[-1] & 0x3F)                                       \
 964        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 965    : ! ((p)[0] & 0x10)                                          \
 966    ? ((p) += 3,                                                 \
 967       ((((p)[-3] & 0x0F) << 12)                                 \
 968        | (((p)[-2] & 0x3F) << 6)                                \
 969        | ((p)[-1] & 0x3F)))                                     \
 970    : ! ((p)[0] & 0x08)                                          \
 971    ? ((p) += 4,                                                 \
 972       ((((p)[-4] & 0xF) << 18)                                  \
 973        | (((p)[-3] & 0x3F) << 12)                               \
 974        | (((p)[-2] & 0x3F) << 6)                                \
 975        | ((p)[-1] & 0x3F)))                                     \
 976    : ((p) += 5,                                                 \
 977       ((((p)[-4] & 0x3F) << 18)                                 \
 978        | (((p)[-3] & 0x3F) << 12)                               \
 979        | (((p)[-2] & 0x3F) << 6)                                \
 980        | ((p)[-1] & 0x3F))))
 981
 982
 983 /* Set coding->source from coding->src_object.  */
 984
 985 static void
 986 coding_set_source (struct coding_system *coding)
 987 {
 988   if (BUFFERP (coding->src_object))
 989     {
 990       struct buffer *buf = XBUFFER (coding->src_object);
 991
 992       if (coding->src_pos < 0)
 993         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 994       else
 995         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 996     }
 997   else if (STRINGP (coding->src_object))
 998     {
 999       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1000     }
1001   else
1002     {
1003       /* Otherwise, the source is C string and is never relocated
1004          automatically.  Thus we don't have to update anything.  */
1005     }
1006 }
1007
1008
1009 /* Set coding->source from coding->src_object, and return how many
1010    bytes coding->source was changed.  */
1011
1012 static ptrdiff_t
1013 coding_change_source (struct coding_system *coding)
1014 {
1015   const unsigned char *orig = coding->source;
1016   coding_set_source (coding);
1017   return coding->source - orig;
1018 }
1019
1020
1021 /* Set coding->destination from coding->dst_object.  */
1022
1023 static void
1024 coding_set_destination (struct coding_system *coding)
1025 {
1026   if (BUFFERP (coding->dst_object))
1027     {
1028       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1029         {
1030           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1031           coding->dst_bytes = (GAP_END_ADDR
1032                                - (coding->src_bytes - coding->consumed)
1033                                - coding->destination);
1034         }
1035       else
1036         {
1037           /* We are sure that coding->dst_pos_byte is before the gap
1038              of the buffer. */
1039           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1040                                  + coding->dst_pos_byte - BEG_BYTE);
1041           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1042                                - coding->destination);
1043         }
1044     }
1045   else
1046     {
1047       /* Otherwise, the destination is C string and is never relocated
1048          automatically.  Thus we don't have to update anything.  */
1049     }
1050 }
1051
1052
1053 /* Set coding->destination from coding->dst_object, and return how
1054    many bytes coding->destination was changed.  */
1055
1056 static ptrdiff_t
1057 coding_change_destination (struct coding_system *coding)
1058 {
1059   const unsigned char *orig = coding->destination;
1060   coding_set_destination (coding);
1061   return coding->destination - orig;
1062 }
1063
1064
1065 static void
1066 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1067 {
1068   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1069     string_overflow ();
1070   coding->destination = xrealloc (coding->destination,
1071                                   coding->dst_bytes + bytes);
1072   coding->dst_bytes += bytes;
1073 }
1074
1075 static void
1076 coding_alloc_by_making_gap (struct coding_system *coding,
1077                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1078 {
1079   if (EQ (coding->src_object, coding->dst_object))
1080     {
1081       /* The gap may contain the produced data at the head and not-yet
1082          consumed data at the tail.  To preserve those data, we at
1083          first make the gap size to zero, then increase the gap
1084          size.  */
1085       ptrdiff_t add = GAP_SIZE;
1086
1087       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1088       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1089       make_gap (bytes);
1090       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1091       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1092     }
1093   else
1094     {
1095       Lisp_Object this_buffer;
1096
1097       this_buffer = Fcurrent_buffer ();
1098       set_buffer_internal (XBUFFER (coding->dst_object));
1099       make_gap (bytes);
1100       set_buffer_internal (XBUFFER (this_buffer));
1101     }
1102 }
1103
1104
1105 static unsigned char *
1106 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1107                    unsigned char *dst)
1108 {
1109   ptrdiff_t offset = dst - coding->destination;
1110
1111   if (BUFFERP (coding->dst_object))
1112     {
1113       struct buffer *buf = XBUFFER (coding->dst_object);
1114
1115       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1116     }
1117   else
1118     coding_alloc_by_realloc (coding, nbytes);
1119   coding_set_destination (coding);
1120   dst = coding->destination + offset;
1121   return dst;
1122 }
1123
1124 /** Macros for annotations.  */
1125
1126 /* An annotation data is stored in the array coding->charbuf in this
1127    format:
1128      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1129    LENGTH is the number of elements in the annotation.
1130    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1131    NCHARS is the number of characters in the text annotated.
1132
1133    The format of the following elements depends on ANNOTATION_MASK.
1134
1135    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1136    follow:
1137      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1138
1139    NBYTES is the number of bytes specified in the header part of
1140    old-style emacs-mule encoding, or 0 for the other kind of
1141    composition.
1142
1143    METHOD is one of enum composition_method.
1144
1145    Optional COMPOSITION-COMPONENTS are characters and composition
1146    rules.
1147
1148    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1149    follows.
1150
1151    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1152    recover from an invalid annotation, and should be skipped by
1153    produce_annotation.  */
1154
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, advance
   BUF past them, and set coding->annotated.  The elements are -LEN,
   MASK and NCHARS, in that order.  The variable `coding' is taken
   from the expansion site.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is a single statement and can be used
   as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1165
/* Store a composition annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA (with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and METHOD.
   NCHARS is the number of characters in the annotated text.

   BUF is parenthesized in every use, as in ADD_ANNOTATION_DATA, so
   that an argument such as `*pp' advances the same pointer
   throughout.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1172
1173
/* Store a charset annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK) followed by the charset ID.  NCHARS
   is the number of characters in the annotated text.

   BUF is parenthesized in every use, as in ADD_ANNOTATION_DATA, so
   that an argument such as `*pp' advances the same pointer
   throughout.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1179
1180 \f
1181 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1182
1183
1184
1185 \f
1186 /*** 3. UTF-8 ***/
1187
1188 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1189    Return true if a text is encoded in UTF-8.  */
1190
     /* Classification of one octet C by its high bits.  The argument is
        evaluated once.  Note that UTF_8_1_OCTET_P is also true for a
        negative C, so callers that may hold a negative value test for
        that first.  */

     /* C is below 0x80: a character that occupies a single octet.  */
1191 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
     /* C has the bit pattern 10xxxxxx: a trailing octet.  */
1192 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
     /* C has the bit pattern 110xxxxx: leads a 2-octet sequence.  */
1193 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
     /* C has the bit pattern 1110xxxx: leads a 3-octet sequence.  */
1194 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
     /* C has the bit pattern 11110xxx: leads a 4-octet sequence.  */
1195 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
     /* C has the bit pattern 111110xx: leads a 5-octet sequence.  */
1196 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1197
     /* The three octets of the UTF-8 signature (byte order mark), in
        the order in which they appear in the text.  */
1198 #define UTF_8_BOM_1 0xEF
1199 #define UTF_8_BOM_2 0xBB
1200 #define UTF_8_BOM_3 0xBF
1201
1202 static bool
1203 detect_coding_utf_8 (struct coding_system *coding,
1204                      struct coding_detection_info *detect_info)
1205 {
       /* Check whether the source bytes of CODING look like UTF-8 and
          record the verdict in DETECT_INFO.

          Return 0, with CATEGORY_MASK_UTF_8 added to
          DETECT_INFO->rejected, when a malformed sequence is seen or
          when the last block ends in the middle of a sequence.
          Otherwise return 1 once the source is exhausted, with
          DETECT_INFO->found and DETECT_INFO->rejected updated for the
          with-signature and without-signature categories.  */
1206   const unsigned char *src = coding->source, *src_base;
1207   const unsigned char *src_end = coding->source + coding->src_bytes;
1208   bool multibytep = coding->src_multibyte;
1209   ptrdiff_t consumed_chars = 0;
1210   bool bom_found = 0;
1211   bool found = 0;
1212
1213   detect_info->checked |= CATEGORY_MASK_UTF_8;
1214   /* A coding system of this category is always ASCII compatible.  */
1215   src += coding->head_ascii;
1216
       /* Each iteration checks one sequence.  The loop is left by
          `break' on a malformed sequence; the only other exit is the
          label no_more_source below, presumably reached from
          ONE_MORE_BYTE when the source runs out (the macro is defined
          outside this function -- confirm there).  */
1217   while (1)
1218     {
1219       int c, c1, c2, c3, c4;
1220
1221       src_base = src;
1222       ONE_MORE_BYTE (c);
           /* Negative values and single octets need no further check.  */
1223       if (c < 0 || UTF_8_1_OCTET_P (c))
1224         continue;
1225       ONE_MORE_BYTE (c1);
1226       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1227         break;
1228       if (UTF_8_2_OCTET_LEADING_P (c))
1229         {
1230           found = 1;
1231           continue;
1232         }
1233       ONE_MORE_BYTE (c2);
1234       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1235         break;
1236       if (UTF_8_3_OCTET_LEADING_P (c))
1237         {
1238           found = 1;
               /* The signature counts only at the very start of the
                  source.  Because SRC was advanced by
                  coding->head_ascii above, this can match only when
                  that value is zero.  */
1239           if (src_base == coding->source
1240               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1241             bom_found = 1;
1242           continue;
1243         }
1244       ONE_MORE_BYTE (c3);
1245       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1246         break;
1247       if (UTF_8_4_OCTET_LEADING_P (c))
1248         {
1249           found = 1;
1250           continue;
1251         }
1252       ONE_MORE_BYTE (c4);
1253       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1254         break;
1255       if (UTF_8_5_OCTET_LEADING_P (c))
1256         {
1257           found = 1;
1258           continue;
1259         }
           /* Four valid trailing octets after an octet that leads no
              known sequence length: malformed.  */
1260       break;
1261     }
       /* A malformed sequence was seen.  */
1262   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1263   return 0;
1264
1265  no_more_source:
       /* SRC_BASE < SRC means the source ended in the middle of a
          sequence; that is acceptable only if more blocks follow.  */
1266   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1267     {
1268       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1269       return 0;
1270     }
1271   if (bom_found)
1272     {
1273       /* The first character U+FEFF (the octets EF BB BF) doesn't
              necessarily mean a BOM, so both categories stay possible.  */
1274       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1275     }
1276   else
1277     {
1278       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
           /* Text without any multi-octet sequence gives no positive
              evidence for UTF-8.  */
1279       if (found)
1280         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1281     }
1282   return 1;
1283 }
1284
1285
1286 static void
1287 decode_coding_utf_8 (struct coding_system *coding)
1288 {
       /* Decode the not yet consumed source bytes of CODING as UTF-8,
          appending one element per character to coding->charbuf.

          Decoding stops when the source is exhausted or charbuf is
          full.  On return coding->consumed, coding->consumed_char and
          coding->charbuf_used are updated; a sequence that is cut off
          by the end of the source is left unconsumed.  A byte that
          starts no valid sequence is stored by itself (as is if ASCII,
          otherwise through BYTE8_TO_CHAR) and counted in
          coding->errors.  */
1289   const unsigned char *src = coding->source + coding->consumed;
1290   const unsigned char *src_end = coding->source + coding->src_bytes;
1291   const unsigned char *src_base;
1292   int *charbuf = coding->charbuf + coding->charbuf_used;
1293   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1294   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1295   bool multibytep = coding->src_multibyte;
1296   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1297   bool eol_dos
1298     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* A byte read ahead after a CR, or -1 if there is none.  */
1299   int byte_after_cr = -1;
1300
       /* Unless the coding system is known to have no signature, look
          at the first three bytes.  If they are exactly EF BB BF they
          stay consumed; in every other case SRC is rewound to SRC_BASE
          so that the bytes are decoded as ordinary text.  */
1301   if (bom != utf_without_bom)
1302     {
1303       int c1, c2, c3;
1304
1305       src_base = src;
1306       ONE_MORE_BYTE (c1);
1307       if (! UTF_8_3_OCTET_LEADING_P (c1))
1308         src = src_base;
1309       else
1310         {
1311           ONE_MORE_BYTE (c2);
1312           if (! UTF_8_EXTRA_OCTET_P (c2))
1313             src = src_base;
1314           else
1315             {
1316               ONE_MORE_BYTE (c3);
1317               if (! UTF_8_EXTRA_OCTET_P (c3))
1318                 src = src_base;
1319               else
1320                 {
1321                   if ((c1 != UTF_8_BOM_1)
1322                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1323                     src = src_base;
1324                   else
1325                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1326                 }
1327             }
1328         }
1329     }
       /* From here on no signature is looked for again.  The assignment
          is unconditional, so the one in the branch above is redundant
          with it.  */
1330   CODING_UTF_8_BOM (coding) = utf_without_bom;
1331
1332   while (1)
1333     {
1334       int c, c1, c2, c3, c4, c5;
1335
           /* Remember where this character starts so that an invalid or
              incomplete sequence can be rewound.  */
1336       src_base = src;
1337       consumed_chars_base = consumed_chars;
1338
1339       if (charbuf >= charbuf_end)
1340         {
               /* The byte read ahead after a CR has not been stored
                  yet; step back so that it is not reported as
                  consumed.  */
1341           if (byte_after_cr >= 0)
1342             src_base--;
1343           break;
1344         }
1345
1346       if (byte_after_cr >= 0)
1347         c1 = byte_after_cr, byte_after_cr = -1;
1348       else
1349         ONE_MORE_BYTE (c1);
1350       if (c1 < 0)
1351         {
               /* A negative value from ONE_MORE_BYTE is stored negated.
                  NOTE(review): the meaning of negative values is
                  defined by ONE_MORE_BYTE, outside this function --
                  confirm there.  */
1352           c = - c1;
1353         }
1354       else if (UTF_8_1_OCTET_P (c1))
1355         {
               /* With DOS end-of-line conversion, read the byte after a
                  CR before storing the CR.  Presumably this keeps a CR
                  from being consumed when the source ends right after
                  it -- confirm against ONE_MORE_BYTE.  */
1356           if (eol_dos && c1 == '\r')
1357             ONE_MORE_BYTE (byte_after_cr);
1358           c = c1;
1359         }
1360       else
1361         {
1362           ONE_MORE_BYTE (c2);
1363           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1364             goto invalid_code;
1365           if (UTF_8_2_OCTET_LEADING_P (c1))
1366             {
1367               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1368               /* Reject overlong sequences here and below.  Encoders
1369                  producing them are incorrect, they can be misleading,
1370                  and they mess up read/write invariance.  */
1371               if (c < 128)
1372                 goto invalid_code;
1373             }
1374           else
1375             {
1376               ONE_MORE_BYTE (c3);
1377               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1378                 goto invalid_code;
1379               if (UTF_8_3_OCTET_LEADING_P (c1))
1380                 {
1381                   c = (((c1 & 0xF) << 12)
1382                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1383                   if (c < 0x800
1384                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1385                     goto invalid_code;
1386                 }
1387               else
1388                 {
1389                   ONE_MORE_BYTE (c4);
1390                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1391                     goto invalid_code;
1392                   if (UTF_8_4_OCTET_LEADING_P (c1))
1393                     {
1394                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1395                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1396                     if (c < 0x10000)
1397                       goto invalid_code;
1398                     }
1399                   else
1400                     {
1401                       ONE_MORE_BYTE (c5);
1402                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1403                         goto invalid_code;
1404                       if (UTF_8_5_OCTET_LEADING_P (c1))
1405                         {
1406                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1407                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1408                                | (c5 & 0x3F));
                               /* Five octets are accepted only for
                                  codes from 0x200000 up to MAX_CHAR.  */
1409                           if ((c > MAX_CHAR) || (c < 0x200000))
1410                             goto invalid_code;
1411                         }
1412                       else
1413                         goto invalid_code;
1414                     }
1415                 }
1416             }
1417         }
1418
1419       *charbuf++ = c;
1420       continue;
1421
1422     invalid_code:
           /* Rewind to the start of the bad sequence and store only its
              first byte; the following bytes are examined again by the
              next iterations.  */
1423       src = src_base;
1424       consumed_chars = consumed_chars_base;
1425       ONE_MORE_BYTE (c);
1426       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1427       coding->errors++;
1428     }
1429
1430  no_more_source:
       /* Only whole characters count as consumed: both values are taken
          from the start of the character that was being read.  */
1431   coding->consumed_char += consumed_chars_base;
1432   coding->consumed = src_base - coding->source;
1433   coding->charbuf_used = charbuf - coding->charbuf;
1434 }
1435
1436
1437 static bool
1438 encode_coding_utf_8 (struct coding_system *coding)
1439 {
1440   bool multibytep = coding->dst_multibyte;
1441   int *charbuf = coding->charbuf;
1442   int *charbuf_end = charbuf + coding->charbuf_used;
1443   unsigned char *dst = coding->destination + coding->produced;
1444   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1445   ptrdiff_t produced_chars = 0;
1446   int c;
1447
1448   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1449     {
1450       ASSURE_DESTINATION (3);
1451       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1452       CODING_UTF_8_BOM (coding) = utf_without_bom;
1453     }
1454
1455   if (multibytep)
1456     {
1457       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1458
1459       while (charbuf < charbuf_end)
1460         {
1461           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1462
1463           ASSURE_DESTINATION (safe_room);
1464           c = *charbuf++;
1465           if (CHAR_BYTE8_P (c))
1466             {
1467               c = CHAR_TO_BYTE8 (c);
1468               EMIT_ONE_BYTE (c);
1469             }
1470           else
1471             {
1472               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1473               for (p = str; p < pend; p++)
1474                 EMIT_ONE_BYTE (*p);
1475             }
1476         }
1477     }
1478   else
1479     {
1480       int safe_room = MAX_MULTIBYTE_LENGTH;
1481
1482       while (charbuf < charbuf_end)
1483         {
1484           ASSURE_DESTINATION (safe_room);
1485           c = *charbuf++;
1486           if (CHAR_BYTE8_P (c))
1487             *dst++ = CHAR_TO_BYTE8 (c);
1488           else
1489             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1490           produced_chars++;
1491         }
1492     }
1493   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1494   coding->produced_char += produced_chars;
1495   coding->produced = dst - coding->destination;
1496   return 0;
1497 }
1498
1499
1500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1501    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1502
     /* True if the 16-bit value VAL is in the range 0xD800..0xDBFF.  */
1503 #define UTF_16_HIGH_SURROGATE_P(val) \
1504   (((val) & 0xFC00) == 0xD800)
1505
     /* True if the 16-bit value VAL is in the range 0xDC00..0xDFFF.  */
1506 #define UTF_16_LOW_SURROGATE_P(val) \
1507   (((val) & 0xFC00) == 0xDC00)
1508
1509
1510 static bool
1511 detect_coding_utf_16 (struct coding_system *coding,
1512                       struct coding_detection_info *detect_info)
1513 {
       /* Check whether the source bytes of CODING may be UTF-16 and
          record the verdict in DETECT_INFO.

          Return 1 when the text starts with a byte order mark, or when
          the source runs out while a byte pair is being read.  Return 0
          in all other cases; DETECT_INFO->rejected then tells which
          UTF-16 categories were ruled out.  */
1514   const unsigned char *src = coding->source;
1515   const unsigned char *src_end = coding->source + coding->src_bytes;
1516   bool multibytep = coding->src_multibyte;
1517   int c1, c2;
1518
1519   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A complete text with an odd coding->src_chars is rejected
          outright.  */
1520   if (coding->mode & CODING_MODE_LAST_BLOCK
1521       && (coding->src_chars & 1))
1522     {
1523       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1524       return 0;
1525     }
1526
1527   TWO_MORE_BYTES (c1, c2);
       /* FF FE at the start: little endian with signature.  */
1528   if ((c1 == 0xFF) && (c2 == 0xFE))
1529     {
1530       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1531                              | CATEGORY_MASK_UTF_16_AUTO);
1532       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1533                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1534                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1535     }
       /* FE FF at the start: big endian with signature.  */
1536   else if ((c1 == 0xFE) && (c2 == 0xFF))
1537     {
1538       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1539                              | CATEGORY_MASK_UTF_16_AUTO);
1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1541                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1542                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1543     }
1544   else if (c2 < 0)
1545     {
1546       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1547       return 0;
1548     }
1549   else
1550     {
1551       /* We check the dispersion of Eth and Oth bytes where E is even and
1552          O is odd.  If both are high, we assume binary data.*/
           /* E and O flag the byte values seen so far at even and odd
              positions; E_NUM and O_NUM count the distinct values.
              NOTE(review): only C2 is tested for a negative value
              before C1 and C2 are used as indices -- confirm that
              TWO_MORE_BYTES cannot leave C1 negative while C2 is not.  */
1553       unsigned char e[256], o[256];
1554       unsigned e_num = 1, o_num = 1;
1555
1556       memset (e, 0, 256);
1557       memset (o, 0, 256);
1558       e[c1] = 1;
1559       o[c2] = 1;
1560
           /* No signature was found, so the categories that require one
              are out.  */
1561       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1562                                 |CATEGORY_MASK_UTF_16_BE
1563                                 | CATEGORY_MASK_UTF_16_LE);
1564
           /* Keep scanning until every UTF-16 category is rejected, a
              negative C2 is seen, or the source runs out.  */
1565       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1566              != CATEGORY_MASK_UTF_16)
1567         {
1568           TWO_MORE_BYTES (c1, c2);
1569           if (c2 < 0)
1570             break;
1571           if (! e[c1])
1572             {
1573               e[c1] = 1;
1574               e_num++;
                   /* 128 or more distinct even-position bytes rule out
                      big endian without signature.  */
1575               if (e_num >= 128)
1576                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1577             }
1578           if (! o[c2])
1579             {
1580               o[c2] = 1;
1581               o_num++;
                   /* 128 or more distinct odd-position bytes rule out
                      little endian without signature.  */
1582               if (o_num >= 128)
1583                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1584             }
1585         }
1586       return 0;
1587     }
1588
       /* Reached by falling through after a signature was found, or by
          a jump from TWO_MORE_BYTES when the source runs out.  */
1589  no_more_source:
1590   return 1;
1591 }
1592
1593 static void
1594 decode_coding_utf_16 (struct coding_system *coding)
1595 {
1596   const unsigned char *src = coding->source + coding->consumed;
1597   const unsigned char *src_end = coding->source + coding->src_bytes;
1598   const unsigned char *src_base;
1599   int *charbuf = coding->charbuf + coding->charbuf_used;
1600   /* We may produces at most 3 chars in one loop.  */
1601   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1602   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1603   bool multibytep = coding->src_multibyte;
1604   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1605   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1606   int surrogate = CODING_UTF_16_SURROGATE (coding);
1607   bool eol_dos
1608     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1609   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1610
1611   if (bom == utf_with_bom)
1612     {
1613       int c, c1, c2;
1614
1615       src_base = src;
1616       ONE_MORE_BYTE (c1);
1617       ONE_MORE_BYTE (c2);
1618       c = (c1 << 8) | c2;
1619
1620       if (endian == utf_16_big_endian
1621           ? c != 0xFEFF : c != 0xFFFE)
1622         {
1623           /* The first two bytes are not BOM.  Treat them as bytes
1624              for a normal character.  */
1625           src = src_base;
1626           coding->errors++;
1627         }
1628       CODING_UTF_16_BOM (coding) = utf_without_bom;
1629     }
1630   else if (bom == utf_detect_bom)
1631     {
1632       /* We have already tried to detect BOM and failed in
1633          detect_coding.  */
1634       CODING_UTF_16_BOM (coding) = utf_without_bom;
1635     }
1636
1637   while (1)
1638     {
1639       int c, c1, c2;
1640
1641       src_base = src;
1642       consumed_chars_base = consumed_chars;
1643
1644       if (charbuf >= charbuf_end)
1645         {
1646           if (byte_after_cr1 >= 0)
1647             src_base -= 2;
1648           break;
1649         }
1650
1651       if (byte_after_cr1 >= 0)
1652         c1 = byte_after_cr1, byte_after_cr1 = -1;
1653       else
1654         ONE_MORE_BYTE (c1);
1655       if (c1 < 0)
1656         {
1657           *charbuf++ = -c1;
1658           continue;
1659         }
1660       if (byte_after_cr2 >= 0)
1661         c2 = byte_after_cr2, byte_after_cr2 = -1;
1662       else
1663         ONE_MORE_BYTE (c2);
1664       if (c2 < 0)
1665         {
1666           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1667           *charbuf++ = -c2;
1668           continue;
1669         }
1670       c = (endian == utf_16_big_endian
1671            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1672
1673       if (surrogate)
1674         {
1675           if (! UTF_16_LOW_SURROGATE_P (c))
1676             {
1677               if (endian == utf_16_big_endian)
1678                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1679               else
1680                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1681               *charbuf++ = c1;
1682               *charbuf++ = c2;
1683               coding->errors++;
1684               if (UTF_16_HIGH_SURROGATE_P (c))
1685                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1686               else
1687                 *charbuf++ = c;
1688             }
1689           else
1690             {
1691               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1692               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1693               *charbuf++ = 0x10000 + c;
1694             }
1695         }
1696       else
1697         {
1698           if (UTF_16_HIGH_SURROGATE_P (c))
1699             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1700           else
1701             {
1702               if (eol_dos && c == '\r')
1703                 {
1704                   ONE_MORE_BYTE (byte_after_cr1);
1705                   ONE_MORE_BYTE (byte_after_cr2);
1706                 }
1707               *charbuf++ = c;
1708             }
1709         }
1710     }
1711
1712  no_more_source:
1713   coding->consumed_char += consumed_chars_base;
1714   coding->consumed = src_base - coding->source;
1715   coding->charbuf_used = charbuf - coding->charbuf;
1716 }
1717
/* Encode the character codes in CODING->charbuf (CODING->charbuf_used
   of them) as UTF-16 into CODING->destination, starting at
   CODING->produced.

   Unless the coding system's BOM setting is utf_without_bom, a byte
   order mark is emitted first and the setting is then changed to
   utf_without_bom, so that it is written only once.  A character above
   MAX_UNICODE_CHAR is replaced by CODING->default_char.  Characters
   below 0x10000 become one 16-bit unit; the others become a surrogate
   pair.  Byte order follows CODING_UTF_16_ENDIAN.

   Always return 0.  MULTIBYTEP, DST, DST_END and PRODUCED_CHARS are
   presumably used by ASSURE_DESTINATION and the EMIT_* macros, which
   are defined outside this chunk.  */
1718 static bool
1719 encode_coding_utf_16 (struct coding_system *coding)
1720 {
1721   bool multibytep = coding->dst_multibyte;
1722   int *charbuf = coding->charbuf;
1723   int *charbuf_end = charbuf + coding->charbuf_used;
1724   unsigned char *dst = coding->destination + coding->produced;
1725   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Passed to ASSURE_DESTINATION before each emission; one iteration
     below emits at most four bytes.  */
1726   int safe_room = 8;
1727   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1728   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1729   ptrdiff_t produced_chars = 0;
1730   int c;
1731
1732   if (bom != utf_without_bom)
1733     {
1734       ASSURE_DESTINATION (safe_room);
1735       if (big_endian)
1736         EMIT_TWO_BYTES (0xFE, 0xFF);
1737       else
1738         EMIT_TWO_BYTES (0xFF, 0xFE);
1739       CODING_UTF_16_BOM (coding) = utf_without_bom;
1740     }
1741
1742   while (charbuf < charbuf_end)
1743     {
1744       ASSURE_DESTINATION (safe_room);
1745       c = *charbuf++;
1746       if (c > MAX_UNICODE_CHAR)
1747         c = coding->default_char;
1748
1749       if (c < 0x10000)
1750         {
1751           if (big_endian)
1752             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1753           else
1754             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1755         }
1756       else
1757         {
          /* Split into a high (C1) and a low (C2) surrogate.  */
1758           int c1, c2;
1759
1760           c -= 0x10000;
1761           c1 = (c >> 10) + 0xD800;
1762           c2 = (c & 0x3FF) + 0xDC00;
1763           if (big_endian)
1764             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1765           else
1766             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1767         }
1768     }
1769   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1770   coding->produced = dst - coding->destination;
1771   coding->produced_char += produced_chars;
1772   return 0;
1773 }
1774
1775 \f
1776 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1777
1778 /* Emacs' internal format for representation of multiple character
1779    sets is a kind of multi-byte encoding, i.e. characters are
1780    represented by variable-length sequences of one-byte codes.
1781
1782    ASCII characters and control characters (e.g. `tab', `newline') are
1783    represented by one-byte sequences which are their ASCII codes, in
1784    the range 0x00 through 0x7F.
1785
1786    8-bit characters of the range 0x80..0x9F are represented by
1787    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1788    code + 0x20).
1789
1790    8-bit characters of the range 0xA0..0xFF are represented by
1791    one-byte sequences which are their 8-bit code.
1792
1793    The other characters are represented by a sequence of `base
1794    leading-code', optional `extended leading-code', and one or two
1795    `position-code's.  The length of the sequence is determined by the
1796    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1797    whereas extended leading-code and position-code take the range 0xA0
1798    through 0xFF.  See `charset.h' for more details about leading-code
1799    and position-code.
1800
1801    --- CODE RANGE of Emacs' internal format ---
1802    character set        range
1803    -------------        -----
1804    ascii                0x00..0x7F
1805    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1806    eight-bit-graphic    0xA0..0xFF
1807    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1808    ---------------------------------------------
1809
1810    As this is the internal character representation, the format is
1811    usually not used externally (i.e. in a file or in data sent to a
1812    process).  But, it is possible to have a text externally in this
1813    format (i.e. by encoding by the coding system `emacs-mule').
1814
1815    In that case, a sequence of one-byte codes has a slightly different
1816    form.
1817
1818    At first, all characters in eight-bit-control are represented by
1819    one-byte sequences which are their 8-bit code.
1820
1821    Next, character composition data are represented by the byte
1822    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1823    where,
1824         METHOD is 0xF2 plus one of the composition methods (enum
1825         composition_method),
1826
1827         BYTES is 0xA0 plus a byte length of this composition data,
1828
1829         CHARS is 0xA0 plus a number of characters composed by this
1830         data,
1831
1832         COMPONENTs are characters of multibyte form or composition
1833         rules encoded by two-byte of ASCII codes.
1834
1835    In addition, for backward compatibility, the following formats are
1836    also recognized as composition data on decoding.
1837
1838    0x80 MSEQ ...
1839    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1840
1841    Here,
1842         MSEQ is a multibyte form, but in one of these special formats:
1843           ASCII: 0xA0 ASCII_CODE+0x80,
1844           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1845         RULE is a one byte code of the range 0xA0..0xF0 that
1846         represents a composition rule.
1847   */
1848
/* Table indexed by a byte value.  As used by the emacs-mule detector
   and parser in this file, the entry for a leading byte is the total
   byte length (1 to 4) of the sequence that byte starts.  The table is
   filled in elsewhere, not in this part of the file.  */
1849 char emacs_mule_bytes[256];
1850
1851
1852 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1853    Return true if a text is encoded in 'emacs-mule'.  */
/* Details: CATEGORY_MASK_EMACS_MULE is always added to
   DETECT_INFO->checked, and the first CODING->head_ascii bytes are
   skipped.  Leaving the scan loop with `break' rejects the category
   and returns 0.  Reaching `no_more_source' (presumably from
   ONE_MORE_BYTE, defined outside this chunk, at end of input) returns
   1 and adds FOUND to DETECT_INFO->found, unless this is the last
   block and a sequence was left unfinished, in which case the category
   is rejected and 0 is returned.  */
1854
1855 static bool
1856 detect_coding_emacs_mule (struct coding_system *coding,
1857                           struct coding_detection_info *detect_info)
1858 {
1859   const unsigned char *src = coding->source, *src_base;
1860   const unsigned char *src_end = coding->source + coding->src_bytes;
1861   bool multibytep = coding->src_multibyte;
1862   ptrdiff_t consumed_chars = 0;
1863   int c;
  /* Becomes CATEGORY_MASK_EMACS_MULE once a well-formed multibyte
     sequence or a long enough composition run has been seen.  */
1864   int found = 0;
1865
1866   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1867   /* A coding system of this category is always ASCII compatible.  */
1868   src += coding->head_ascii;
1869
1870   while (1)
1871     {
1872       src_base = src;
1873       ONE_MORE_BYTE (c);
1874       if (c < 0)
1875         continue;
1876       if (c == 0x80)
1877         {
1878           /* Perhaps the start of composite character.  We simply skip
1879              it because analyzing it is too heavy for detecting.  But,
1880              at least, we check that the composite character
1881              consists of more than 4 bytes.  */
1882           const unsigned char *src_start;
1883
1884         repeat:
1885           src_start = src;
1886           do
1887             {
1888               ONE_MORE_BYTE (c);
1889             }
1890           while (c >= 0xA0);
1891
1892           if (src - src_start <= 4)
1893             break;
1894           found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition starts right after this one.  */
1895           if (c == 0x80)
1896             goto repeat;
1897         }
1898
1899       if (c < 0x80)
1900         {
          /* ESC, SI and SO rule out emacs-mule.  */
1901           if (c < 0x20
1902               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1903             break;
1904         }
1905       else
1906         {
          /* Number of bytes that must follow the leading byte C.  */
1907           int more_bytes = emacs_mule_bytes[c] - 1;
1908
1909           while (more_bytes > 0)
1910             {
1911               ONE_MORE_BYTE (c);
1912               if (c < 0xA0)
1913                 {
1914                   src--;        /* Unread the last byte.  */
1915                   break;
1916                 }
1917               more_bytes--;
1918             }
          /* The sequence was cut short by a byte below 0xA0.  */
1919           if (more_bytes != 0)
1920             break;
1921           found = CATEGORY_MASK_EMACS_MULE;
1922         }
1923     }
1924   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1925   return 0;
1926
1927  no_more_source:
  /* SRC_BASE < SRC means the input ended in the middle of a
     sequence.  */
1928   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1929     {
1930       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1931       return 0;
1932     }
1933   detect_info->found |= found;
1934   return 1;
1935 }
1936
1937
1938 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1939    character.  If CMP_STATUS indicates that we must expect MSEQ or
1940    RULE described above, decode it and return the negative value of
1941    the decoded character or rule.  If an invalid byte is found, return
1942    -1.  If SRC is too short, return -2.  */
/* Outputs: unless -1 or -2 is returned, *NBYTES is set to the number
   of source bytes consumed and *NCHARS to the local CONSUMED_CHARS
   count (presumably advanced by ONE_MORE_BYTE, which is defined
   outside this chunk).  *ID receives the charset id when ID is
   non-null, except on the early return for an old-form RULE byte,
   which leaves *ID untouched.  If the very first value read is
   negative, it is negated and returned as is, with
   emacs_mule_charset[0] as its charset.  */
1943
1944 static int
1945 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1946                  int *nbytes, int *nchars, int *id,
1947                  struct composition_status *cmp_status)
1948 {
1949   const unsigned char *src_end = coding->source + coding->src_bytes;
1950   const unsigned char *src_base = src;
1951   bool multibytep = coding->src_multibyte;
1952   int charset_ID;
1953   unsigned code;
1954   int c;
1955   int consumed_chars = 0;
  /* Set when the character was read in the old-form MSEQ notation; the
     result is then returned negated.  */
1956   bool mseq_found = 0;
1957
1958   ONE_MORE_BYTE (c);
1959   if (c < 0)
1960     {
1961       c = -c;
1962       charset_ID = emacs_mule_charset[0];
1963     }
1964   else
1965     {
      /* A first byte of 0xA0 or more is accepted only while an old-form
         composition is being decoded.  */
1966       if (c >= 0xA0)
1967         {
1968           if (cmp_status->state != COMPOSING_NO
1969               && cmp_status->old_form)
1970             {
1971               if (cmp_status->state == COMPOSING_CHAR)
1972                 {
                  /* MSEQ: 0xA0 is followed by ASCII_CODE + 0x80;
                     otherwise the byte is LEADING_CODE + 0x20.  */
1973                   if (c == 0xA0)
1974                     {
1975                       ONE_MORE_BYTE (c);
1976                       c -= 0x80;
1977                       if (c < 0)
1978                         goto invalid_code;
1979                     }
1980                   else
1981                     c -= 0x20;
1982                   mseq_found = 1;
1983                 }
1984               else
1985                 {
                  /* Not expecting a character: hand the byte back
                     negated, as a RULE.  */
1986                   *nbytes = src - src_base;
1987                   *nchars = consumed_chars;
1988                   return -c;
1989                 }
1990             }
1991           else
1992             goto invalid_code;
1993         }
1994
      /* Dispatch on the total length of the sequence started by C.  */
1995       switch (emacs_mule_bytes[c])
1996         {
1997         case 2:
          /* Leading code followed by one position code.  */
1998           if ((charset_ID = emacs_mule_charset[c]) < 0)
1999             goto invalid_code;
2000           ONE_MORE_BYTE (c);
2001           if (c < 0xA0)
2002             goto invalid_code;
2003           code = c & 0x7F;
2004           break;
2005
2006         case 3:
          /* Either a private leading code, a second byte that selects
             the charset, and one position code; or a leading code and
             two position codes.  */
2007           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2008               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2009             {
2010               ONE_MORE_BYTE (c);
2011               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2012                 goto invalid_code;
2013               ONE_MORE_BYTE (c);
2014               if (c < 0xA0)
2015                 goto invalid_code;
2016               code = c & 0x7F;
2017             }
2018           else
2019             {
2020               if ((charset_ID = emacs_mule_charset[c]) < 0)
2021                 goto invalid_code;
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0)
2024                 goto invalid_code;
2025               code = (c & 0x7F) << 8;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code |= c & 0x7F;
2030             }
2031           break;
2032
2033         case 4:
          /* The second byte selects the charset; two position codes
             follow.  */
2034           ONE_MORE_BYTE (c);
2035           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2036             goto invalid_code;
2037           ONE_MORE_BYTE (c);
2038           if (c < 0xA0)
2039             goto invalid_code;
2040           code = (c & 0x7F) << 8;
2041           ONE_MORE_BYTE (c);
2042           if (c < 0xA0)
2043             goto invalid_code;
2044           code |= c & 0x7F;
2045           break;
2046
2047         case 1:
          /* A single byte stands for itself.  */
2048           code = c;
2049           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2050           break;
2051
2052         default:
2053           emacs_abort ();
2054         }
2055       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2056                           CHARSET_FROM_ID (charset_ID), code, c);
2057       if (c < 0)
2058         goto invalid_code;
2059     }
2060   *nbytes = src - src_base;
2061   *nchars = consumed_chars;
2062   if (id)
2063     *id = charset_ID;
2064   return (mseq_found ? -c : c);
2065
2066  no_more_source:
2067   return -2;
2068
2069  invalid_code:
2070   return -1;
2071 }
2072
2073
2074 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2075
2076 /* Handle these composition sequences ('|': the end of header elements,
2077    BYTES and CHARS >= 0xA0):
2078
2079    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2080    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2081    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2082
2083    and these old forms:
2084
2085    (4) relative composition: 0x80 | MSEQ ... MSEQ
2086    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2087
2088    When the starter 0x80 and the following header elements are found,
2089    this annotation header is produced.
2090
2091         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2092
2093    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2094    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2095
2096    Then, upon reading the following elements, these codes are produced
2097    until the composition end is found:
2098
2099    (1) CHAR ... CHAR
2100    (2) ALT ... ALT CHAR ... CHAR
2101    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2102    (4) CHAR ... CHAR
2103    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2104
2105    When the composition end is found, LENGTH and NCHARS in the
2106    annotation header are updated as below:
2107
2108    (1) LENGTH: unchanged, NCHARS: unchanged
2109    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2110    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2111    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2112    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2113
2114    If an error is found while composing, the annotation header is
2115    changed to the original composition header (plus filler -1s) as
2116    below:
2117
2118    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2119    (5)          [ 0x80 0xFF -1 -1- -1 ]
2120
2121    and the sequence [ -2 DECODED-RULE ] is changed to the original
2122    byte sequence as below:
2123         o the original byte sequence is B: [ B -1 ]
2124         o the original byte sequence is B1 B2: [ B1 B2 ]
2125
2126    Most of the routines are implemented by macros because many
2127    variables and labels in the caller decode_coding_emacs_mule must be
2128    accessible, and they are usually called just once (thus they don't
2129    increase the size of the compiled object).  */
2130
2131 /* Decode a composition rule represented by C as a component of
2132    composition sequence of Emacs 20 style.  Set RULE to the decoded
2133    rule. */
/* C is modified in place: 0xA0 is subtracted and the result must lie
   in 0..80, otherwise control jumps to the `invalid_code' label of the
   expanding function.  The value is split into GREF = C / 9 and
   NREF = C % 9, and a reference value of 4 is replaced by 10 before
   the two are combined with COMPOSITION_ENCODE_RULE.  */
2134
2135 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2136   do {                                                  \
2137     int gref, nref;                                     \
2138                                                         \
2139     c -= 0xA0;                                          \
2140     if (c < 0 || c >= 81)                               \
2141       goto invalid_code;                                \
2142     gref = c / 9, nref = c % 9;                         \
2143     if (gref == 4) gref = 10;                           \
2144     if (nref == 4) nref = 10;                           \
2145     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2146   } while (0)
2147
2148
2149 /* Decode a composition rule represented by C and the following byte
2150    at SRC as a component of composition sequence of Emacs 21 style.
2151    Set RULE to the decoded rule.  */
/* GREF is C - 0x20; NREF is the next byte (fetched into C with
   ONE_MORE_BYTE) minus 0x20.  Each must lie in 0..80, otherwise control
   jumps to the `invalid_code' label of the expanding function.  */
2152
2153 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2154   do {                                                  \
2155     int gref, nref;                                     \
2156                                                         \
2157     gref = c - 0x20;                                    \
2158     if (gref < 0 || gref >= 81)                         \
2159       goto invalid_code;                                \
2160     ONE_MORE_BYTE (c);                                  \
2161     nref = c - 0x20;                                    \
2162     if (nref < 0 || nref >= 81)                         \
2163       goto invalid_code;                                \
2164     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2165   } while (0)
2166
2167
2168 /* Start of Emacs 21 style format.  On entry C holds METHOD (0xF2 +
2169    composition method); the next two bytes at SRC are BYTES (0xA0 +
2170    byte length of this composition information) and CHARS (0xA0 +
2171    number of characters composed by this composition).  */
/* Control jumps to the `invalid_code' label of the expanding function
   when the BYTES value is negative, when the byte length is below 3
   (or, for COMPOSITION_RELATIVE, differs from 4), or when the
   character count is not in 1..MAX_COMPOSITION_COMPONENTS - 1.
   Otherwise CMP_STATUS is set up for a new-form composition: state
   COMPOSING_CHAR for the relative method and COMPOSING_COMPONENT_CHAR
   for the others, length MAX_ANNOTATION_LENGTH, ncomps = byte length
   minus 4.  ADD_COMPOSITION_DATA is then called on CHARBUF; it
   presumably appends the annotation header.  Uses `c', `cmp_status'
   and `charbuf' from the expanding scope.  */
2172
2173 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2174   do {                                                                  \
2175     enum composition_method method = c - 0xF2;                          \
2176     int nbytes, nchars;                                                 \
2177                                                                         \
2178     ONE_MORE_BYTE (c);                                                  \
2179     if (c < 0)                                                          \
2180       goto invalid_code;                                                \
2181     nbytes = c - 0xA0;                                                  \
2182     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2183       goto invalid_code;                                                \
2184     ONE_MORE_BYTE (c);                                                  \
2185     nchars = c - 0xA0;                                                  \
2186     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2187       goto invalid_code;                                                \
2188     cmp_status->old_form = 0;                                           \
2189     cmp_status->method = method;                                        \
2190     if (method == COMPOSITION_RELATIVE)                                 \
2191       cmp_status->state = COMPOSING_CHAR;                               \
2192     else                                                                \
2193       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2194     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2195     cmp_status->nchars = nchars;                                        \
2196     cmp_status->ncomps = nbytes - 4;                                    \
2197     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2198   } while (0)
2199
2200
2201 /* Start of Emacs 20 style format for relative composition.  */
/* Marks CMP_STATUS as an old-form composition (old_form = 1) with
   method COMPOSITION_RELATIVE, state COMPOSING_CHAR, length
   MAX_ANNOTATION_LENGTH and zero nchars and ncomps, then calls
   ADD_COMPOSITION_DATA on CHARBUF with both counts 0; that macro
   presumably appends the annotation header.  Uses `cmp_status' and
   `charbuf' from the expanding scope.  */
2202
2203 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2204   do {                                                          \
2205     cmp_status->old_form = 1;                                   \
2206     cmp_status->method = COMPOSITION_RELATIVE;                  \
2207     cmp_status->state = COMPOSING_CHAR;                         \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2209     cmp_status->nchars = cmp_status->ncomps = 0;                \
2210     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2211   } while (0)
2212
2213
2214 /* Start of Emacs 20 style format for rule-base composition.  */
/* Same setup as the Emacs 20 relative case, except that the method is
   COMPOSITION_WITH_RULE: old_form = 1, state COMPOSING_CHAR, length
   MAX_ANNOTATION_LENGTH, zero nchars and ncomps, followed by a call to
   ADD_COMPOSITION_DATA on CHARBUF with both counts 0.  Uses
   `cmp_status' and `charbuf' from the expanding scope.  */
2215
2216 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2217   do {                                                          \
2218     cmp_status->old_form = 1;                                   \
2219     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2220     cmp_status->state = COMPOSING_CHAR;                         \
2221     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2222     cmp_status->nchars = cmp_status->ncomps = 0;                \
2223     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2224   } while (0)
2225
2226
/* Handle the byte that follows a composition starter.  The byte is
   read into C with ONE_MORE_BYTE and dispatched as follows:

     negative                                  -> invalid_code
     0xF2 + COMPOSITION_RELATIVE
       .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS -> Emacs 21 style header
     below 0xA0                                -> invalid_code
     0xA0 .. 0xBF                              -> Emacs 20 relative
                                                  composition; SRC is
                                                  rewound so that the
                                                  byte is read again as
                                                  a component
     0xFF                                      -> Emacs 20 rule-base
                                                  composition
     anything else                             -> invalid_code

   Uses `c', `src' and the `invalid_code' label of the expanding
   function.  */
2227 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2228   do {                                                  \
2229     const unsigned char *current_src = src;             \
2230                                                         \
2231     ONE_MORE_BYTE (c);                                  \
2232     if (c < 0)                                          \
2233       goto invalid_code;                                \
2234     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2235         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2236       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2237     else if (c < 0xA0)                                  \
2238       goto invalid_code;                                \
2239     else if (c < 0xC0)                                  \
2240       {                                                 \
2241         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2242         /* Re-read C as a composition component.  */    \
2243         src = current_src;                              \
2244       }                                                 \
2245     else if (c == 0xFF)                                 \
2246       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2247     else                                                \
2248       goto invalid_code;                                \
2249   } while (0)
2250
/* Close the composition described by CMP_STATUS.  The annotation
   header is located CMP_STATUS->length elements before CHARBUF.  For
   an old-form composition its element 2 is set to CMP_STATUS->nchars;
   otherwise, for any method above COMPOSITION_RELATIVE, its element 0
   is set to element 2 minus CMP_STATUS->length.  The state is then
   reset to COMPOSING_NO.  Uses `cmp_status' and `charbuf' from the
   expanding scope.  */
2251 #define EMACS_MULE_COMPOSITION_END()                            \
2252   do {                                                          \
2253     int idx = - cmp_status->length;                             \
2254                                                                 \
2255     if (cmp_status->old_form)                                   \
2256       charbuf[idx + 2] = cmp_status->nchars;                    \
2257     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2258       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2259     cmp_status->state = COMPOSING_NO;                           \
2260   } while (0)
2261
2262
2263 static int
2264 emacs_mule_finish_composition (int *charbuf,
2265                                struct composition_status *cmp_status)
2266 {
2267   int idx = - cmp_status->length;
2268   int new_chars;
2269
2270   if (cmp_status->old_form && cmp_status->nchars > 0)
2271     {
2272       charbuf[idx + 2] = cmp_status->nchars;
2273       new_chars = 0;
2274       if (cmp_status->method == COMPOSITION_WITH_RULE
2275           && cmp_status->state == COMPOSING_CHAR)
2276         {
2277           /* The last rule was invalid.  */
2278           int rule = charbuf[-1] + 0xA0;
2279
2280           charbuf[-2] = BYTE8_TO_CHAR (rule);
2281           charbuf[-1] = -1;
2282           new_chars = 1;
2283         }
2284     }
2285   else
2286     {
2287       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2288
2289       if (cmp_status->method == COMPOSITION_WITH_RULE)
2290         {
2291           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2292           charbuf[idx++] = -3;
2293           charbuf[idx++] = 0;
2294           new_chars = 1;
2295         }
2296       else
2297         {
2298           int nchars = charbuf[idx + 1] + 0xA0;
2299           int nbytes = charbuf[idx + 2] + 0xA0;
2300
2301           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2302           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2303           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2304           charbuf[idx++] = -1;
2305           new_chars = 4;
2306         }
2307     }
2308   cmp_status->state = COMPOSING_NO;
2309   return new_chars;
2310 }
2311
/* If a composition is in progress (state other than COMPOSING_NO),
   finish it with emacs_mule_finish_composition and add the value it
   returns to CHAR_OFFSET.  Uses `cmp_status', `charbuf' and
   `char_offset' from the expanding scope.  */
2312 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2313   do {                                                                    \
2314     if (cmp_status->state != COMPOSING_NO)                                \
2315       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2316   } while (0)
2317
2318
2319 static void
2320 decode_coding_emacs_mule (struct coding_system *coding)
2321 {
2322   const unsigned char *src = coding->source + coding->consumed;
2323   const unsigned char *src_end = coding->source + coding->src_bytes;
2324   const unsigned char *src_base;
2325   int *charbuf = coding->charbuf + coding->charbuf_used;
2326   /* We may produce two annotations (charset and composition) in one
2327      loop and one more charset annotation at the end.  */
2328   int *charbuf_end
2329     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2330       /* We can produce up to 2 characters in a loop.  */
2331       - 1;
2332   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2333   bool multibytep = coding->src_multibyte;
2334   ptrdiff_t char_offset = coding->produced_char;
2335   ptrdiff_t last_offset = char_offset;
2336   int last_id = charset_ascii;
2337   bool eol_dos
2338     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2339   int byte_after_cr = -1;
2340   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2341
2342   if (cmp_status->state != COMPOSING_NO)
2343     {
2344       int i;
2345
2346       if (charbuf_end - charbuf < cmp_status->length)
2347         emacs_abort ();
2348       for (i = 0; i < cmp_status->length; i++)
2349         *charbuf++ = cmp_status->carryover[i];
2350       coding->annotated = 1;
2351     }
2352
2353   while (1)
2354     {
2355       int c, id IF_LINT (= 0);
2356
2357       src_base = src;
2358       consumed_chars_base = consumed_chars;
2359
2360       if (charbuf >= charbuf_end)
2361         {
2362           if (byte_after_cr >= 0)
2363             src_base--;
2364           break;
2365         }
2366
2367       if (byte_after_cr >= 0)
2368         c = byte_after_cr, byte_after_cr = -1;
2369       else
2370         ONE_MORE_BYTE (c);
2371
2372       if (c < 0 || c == 0x80)
2373         {
2374           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2375           if (c < 0)
2376             {
2377               *charbuf++ = -c;
2378               char_offset++;
2379             }
2380           else
2381             DECODE_EMACS_MULE_COMPOSITION_START ();
2382           continue;
2383         }
2384
2385       if (c < 0x80)
2386         {
2387           if (eol_dos && c == '\r')
2388             ONE_MORE_BYTE (byte_after_cr);
2389           id = charset_ascii;
2390           if (cmp_status->state != COMPOSING_NO)
2391             {
2392               if (cmp_status->old_form)
2393                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2394               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2395                 cmp_status->ncomps--;
2396             }
2397         }
2398       else
2399         {
2400           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2401           /* emacs_mule_char can load a charset map from a file, which
2402              allocates a large structure and might cause buffer text
2403              to be relocated as result.  Thus, we need to remember the
2404              original pointer to buffer text, and fix up all related
2405              pointers after the call.  */
2406           const unsigned char *orig = coding->source;
2407           ptrdiff_t offset;
2408
2409           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2410                                cmp_status);
2411           offset = coding->source - orig;
2412           if (offset)
2413             {
2414               src += offset;
2415               src_base += offset;
2416               src_end += offset;
2417             }
2418           if (c < 0)
2419             {
2420               if (c == -1)
2421                 goto invalid_code;
2422               if (c == -2)
2423                 break;
2424             }
2425           src = src_base + nbytes;
2426           consumed_chars = consumed_chars_base + nchars;
2427           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2428             cmp_status->ncomps -= nchars;
2429         }
2430
2431       /* Now if C >= 0, we found a normally encoded character, if C <
2432          0, we found an old-style composition component character or
2433          rule.  */
2434
2435       if (cmp_status->state == COMPOSING_NO)
2436         {
2437           if (last_id != id)
2438             {
2439               if (last_id != charset_ascii)
2440                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2441                                   last_id);
2442               last_id = id;
2443               last_offset = char_offset;
2444             }
2445           *charbuf++ = c;
2446           char_offset++;
2447         }
2448       else if (cmp_status->state == COMPOSING_CHAR)
2449         {
2450           if (cmp_status->old_form)
2451             {
2452               if (c >= 0)
2453                 {
2454                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2455                   *charbuf++ = c;
2456                   char_offset++;
2457                 }
2458               else
2459                 {
2460                   *charbuf++ = -c;
2461                   cmp_status->nchars++;
2462                   cmp_status->length++;
2463                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2464                     EMACS_MULE_COMPOSITION_END ();
2465                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2466                     cmp_status->state = COMPOSING_RULE;
2467                 }
2468             }
2469           else
2470             {
2471               *charbuf++ = c;
2472               cmp_status->length++;
2473               cmp_status->nchars--;
2474               if (cmp_status->nchars == 0)
2475                 EMACS_MULE_COMPOSITION_END ();
2476             }
2477         }
2478       else if (cmp_status->state == COMPOSING_RULE)
2479         {
2480           int rule;
2481
2482           if (c >= 0)
2483             {
2484               EMACS_MULE_COMPOSITION_END ();
2485               *charbuf++ = c;
2486               char_offset++;
2487             }
2488           else
2489             {
2490               c = -c;
2491               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2492               if (rule < 0)
2493                 goto invalid_code;
2494               *charbuf++ = -2;
2495               *charbuf++ = rule;
2496               cmp_status->length += 2;
2497               cmp_status->state = COMPOSING_CHAR;
2498             }
2499         }
2500       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2501         {
2502           *charbuf++ = c;
2503           cmp_status->length++;
2504           if (cmp_status->ncomps == 0)
2505             cmp_status->state = COMPOSING_CHAR;
2506           else if (cmp_status->ncomps > 0)
2507             {
2508               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2509                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2510             }
2511           else
2512             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513         }
2514       else                      /* COMPOSING_COMPONENT_RULE */
2515         {
2516           int rule;
2517
2518           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2519           if (rule < 0)
2520             goto invalid_code;
2521           *charbuf++ = -2;
2522           *charbuf++ = rule;
2523           cmp_status->length += 2;
2524           cmp_status->ncomps--;
2525           if (cmp_status->ncomps > 0)
2526             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2527           else
2528             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2529         }
2530       continue;
2531
2532     invalid_code:
2533       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534       src = src_base;
2535       consumed_chars = consumed_chars_base;
2536       ONE_MORE_BYTE (c);
2537       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2538       char_offset++;
2539       coding->errors++;
2540     }
2541
2542  no_more_source:
2543   if (cmp_status->state != COMPOSING_NO)
2544     {
2545       if (coding->mode & CODING_MODE_LAST_BLOCK)
2546         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       else
2548         {
2549           int i;
2550
2551           charbuf -= cmp_status->length;
2552           for (i = 0; i < cmp_status->length; i++)
2553             cmp_status->carryover[i] = charbuf[i];
2554         }
2555     }
2556   if (last_id != charset_ascii)
2557     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2558   coding->consumed_char += consumed_chars_base;
2559   coding->consumed = src_base - coding->source;
2560   coding->charbuf_used = charbuf - coding->charbuf;
2561 }
2562
2563
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the emacs-mule charset identifier ID:

     ID < 0xA0            CODES = { ID,   0  }
     0xA0 <= ID < 0xE0    CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0    CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5    CODES = { 0x9C, ID }
     0xF5 <= ID           CODES = { 0x9D, ID }

   A zero in CODES[1] means that there is no second leading code.

   Both arguments are parenthesized in the expansion, so arbitrary
   expressions may be passed, but they are evaluated more than once
   and must therefore be free of side effects.  The expansion is a
   single statement without a trailing semicolon; the caller supplies
   it, which keeps the macro usable in an unbraced `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2577
2578
/* Encode the characters held in CODING->charbuf (CODING->charbuf_used
   entries) into emacs-mule byte sequences, writing them to
   CODING->destination after the CODING->produced bytes already there.
   A negative charbuf entry introduces an annotation instead of a
   character.  When the whole buffer has been handled, the result is
   recorded as CODING_RESULT_SUCCESS, CODING->produced_char and
   CODING->produced are updated, and 0 is returned.

   NOTE(review): `multibytep', `dst_end' and `produced_chars' are
   hardly referenced by name in the body; presumably the
   ASSURE_DESTINATION / EMIT_* macros (defined elsewhere) use them, so
   these locals should not be renamed without checking the macros.  */
2579 static bool
2580 encode_coding_emacs_mule (struct coding_system *coding)
2581 {
2582   bool multibytep = coding->dst_multibyte;
2583   int *charbuf = coding->charbuf;
2584   int *charbuf_end = charbuf + coding->charbuf_used;
2585   unsigned char *dst = coding->destination + coding->produced;
2586   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each charbuf entry.  */
2587   int safe_room = 8;
2588   ptrdiff_t produced_chars = 0;
2589   Lisp_Object attrs, charset_list;
2590   int c;
       /* Charset id requested by the latest charset annotation, or -1.  */
2591   int preferred_charset_id = -1;
2592
2593   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works from Vemacs_mule_charset_list; store
          it into ATTRS if the coding system carries a different list.  */
2594   if (! EQ (charset_list, Vemacs_mule_charset_list))
2595     {
2596       charset_list = Vemacs_mule_charset_list;
2597       ASET (attrs, coding_attr_charset_list, charset_list);
2598     }
2599
2600   while (charbuf < charbuf_end)
2601     {
2602       ASSURE_DESTINATION (safe_room);
2603       c = *charbuf++;
2604
2605       if (c < 0)
2606         {
2607           /* Handle an annotation.  */
               /* C is the negative word that introduced the annotation;
                  *CHARBUF is now the slot selecting the annotation kind.  */
2608           switch (*charbuf)
2609             {
2610             case CODING_ANNOTATE_COMPOSITION_MASK:
2611               /* Not yet implemented.  */
2612               break;
2613             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points past the
                      introducing word, so this reads three slots beyond
                      the kind slot; confirm that this matches the layout
                      written by ADD_CHARSET_DATA.  */
2614               preferred_charset_id = charbuf[3];
                   /* A preferred charset that is not a member of
                      CHARSET_LIST is ignored.  */
2615               if (preferred_charset_id >= 0
2616                   && NILP (Fmemq (make_number (preferred_charset_id),
2617                                   charset_list)))
2618                 preferred_charset_id = -1;
2619               break;
2620             default:
2621               emacs_abort ();
2622             }
               /* Skip the rest of the annotation: -C is treated as its
                  total length, one slot of which was consumed above.  */
2623           charbuf += -c - 1;
2624           continue;
2625         }
2626
2627       if (ASCII_CHAR_P (c))
2628         EMIT_ONE_ASCII_BYTE (c);
2629       else if (CHAR_BYTE8_P (c))
2630         {
2631           c = CHAR_TO_BYTE8 (c);
2632           EMIT_ONE_BYTE (c);
2633         }
2634       else
2635         {
2636           struct charset *charset;
2637           unsigned code;
2638           int dimension;
2639           int emacs_mule_id;
2640           unsigned char leading_codes[2];
2641
               /* Try the annotated charset first and fall back to a
                  search of CHARSET_LIST.  NOTE(review): the
                  CODING_CHAR_CHARSET* macros are defined elsewhere;
                  presumably they set RESULT, CODE and CHARSET.  */
2642           if (preferred_charset_id >= 0)
2643             {
2644               bool result;
2645
2646               charset = CHARSET_FROM_ID (preferred_charset_id);
2647               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2648               if (result)
2649                 code = ENCODE_CHAR (charset, c);
2650               else
2651                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2652                                      &code, charset);
2653             }
2654           else
2655             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2656                                  &code, charset);
2657           if (! charset)
2658             {
                   /* No charset was found for C: encode the coding
                      system's default character in its place.  */
2659               c = coding->default_char;
2660               if (ASCII_CHAR_P (c))
2661                 {
2662                   EMIT_ONE_ASCII_BYTE (c);
2663                   continue;
2664                 }
                   /* NOTE(review): CHARSET is dereferenced below without
                      being re-checked after this second lookup.  */
2665               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2666                                    &code, charset);
2667             }
2668           dimension = CHARSET_DIMENSION (charset);
2669           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading codes, then the code point with
                  the high bit of each byte set: one byte for a
                  dimension-1 charset, two bytes otherwise.  */
2670           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2671           EMIT_ONE_BYTE (leading_codes[0]);
2672           if (leading_codes[1])
2673             EMIT_ONE_BYTE (leading_codes[1]);
2674           if (dimension == 1)
2675             EMIT_ONE_BYTE (code | 0x80);
2676           else
2677             {
2678               code |= 0x8080;
2679               EMIT_ONE_BYTE (code >> 8);
2680               EMIT_ONE_BYTE (code & 0xFF);
2681             }
2682         }
2683     }
2684   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2685   coding->produced_char += produced_chars;
2686   coding->produced = dst - coding->destination;
2687   return 0;
2688 }
2689
2690 \f
2691 /*** 7. ISO2022 handlers ***/
2692
2693 /* The following note describes the coding system ISO2022 briefly.
2694    Since the intention of this note is to help understand the
2695    functions in this file, some parts are NOT ACCURATE or are OVERLY
2696    SIMPLIFIED.  For thorough understanding, please refer to the
2697    original document of ISO2022.  This is equivalent to the standard
2698    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2699
2700    ISO2022 provides many mechanisms to encode several character sets
2701    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2702    is encoded using bytes less than 128.  This may make the encoded
2703    text a little bit longer, but the text passes more easily through
2704    several types of gateway, some of which strip off the MSB (Most
2705    Significant Bit).
2706
2707    There are two kinds of character sets: control character sets and
2708    graphic character sets.  The former contain control characters such
2709    as `newline' and `escape' to provide control functions (control
2710    functions are also provided by escape sequences).  The latter
2711    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2712    two control character sets and many graphic character sets.
2713
2714    Graphic character sets are classified into one of the following
2715    four classes, according to the number of bytes (DIMENSION) and
2716    number of characters in one dimension (CHARS) of the set:
2717    - DIMENSION1_CHARS94
2718    - DIMENSION1_CHARS96
2719    - DIMENSION2_CHARS94
2720    - DIMENSION2_CHARS96
2721
2722    In addition, each character set is assigned an identification tag,
2723    unique for each set, called the "final character" (denoted as <F>
2724    hereafter).  The <F> of each character set is decided by ECMA(*)
2725    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2726    (0x30..0x3F are for private use only).
2727
2728    Note (*): ECMA = European Computer Manufacturers Association
2729
2730    Here are examples of graphic character sets [NAME(<F>)]:
2731         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2732         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2733         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2734         o DIMENSION2_CHARS96 -- none for the moment
2735
2736    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2737         C0 [0x00..0x1F] -- control character plane 0
2738         GL [0x20..0x7F] -- graphic character plane 0
2739         C1 [0x80..0x9F] -- control character plane 1
2740         GR [0xA0..0xFF] -- graphic character plane 1
2741
2742    A control character set is directly designated and invoked to C0 or
2743    C1 by an escape sequence.  The most common case is that:
2744    - ISO646's  control character set is designated/invoked to C0, and
2745    - ISO6429's control character set is designated/invoked to C1,
2746    and usually these designations/invocations are omitted in encoded
2747    text.  In a 7-bit environment, only C0 can be used, and a control
2748    character for C1 is encoded by an appropriate escape sequence to
2749    fit into the environment.  All control characters for C1 are
2750    defined to have corresponding escape sequences.
2751
2752    A graphic character set is at first designated to one of four
2753    graphic registers (G0 through G3), then these graphic registers are
2754    invoked to GL or GR.  These designations and invocations can be
2755    done independently.  The most common case is that G0 is invoked to
2756    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2757    these invocations and designations are omitted in encoded text.
2758    In a 7-bit environment, only GL can be used.
2759
2760    When a graphic character set of CHARS94 is invoked to GL, codes
2761    0x20 and 0x7F of the GL area work as control characters SPACE and
2762    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2763    be used.
2764
2765    There are two ways of invocation: locking-shift and single-shift.
2766    With locking-shift, the invocation lasts until the next different
2767    invocation, whereas with single-shift, the invocation affects the
2768    following character only and doesn't affect the locking-shift
2769    state.  Invocations are done by the following control characters or
2770    escape sequences:
2771
2772    ----------------------------------------------------------------------
2773    abbrev  function                  cntrl escape seq   description
2774    ----------------------------------------------------------------------
2775    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2776    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2777    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2778    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2779    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2780    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2781    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2782    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2783    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2784    ----------------------------------------------------------------------
2785    (*) These are not used by any known coding system.
2786
2787    Control characters for these functions are defined by macros
2788    ISO_CODE_XXX in `coding.h'.
2789
2790    Designations are done by the following escape sequences:
2791    ----------------------------------------------------------------------
2792    escape sequence      description
2793    ----------------------------------------------------------------------
2794    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2795    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2796    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2797    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2798    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2799    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2800    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2801    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2802    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2803    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2804    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2805    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2806    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2807    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2808    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2809    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2810    ----------------------------------------------------------------------
2811
2812    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2813    of dimension 1, chars 94, and final character <F>, etc...
2814
2815    Note (*): Although these designations are not allowed in ISO2022,
2816    Emacs accepts them on decoding, and produces them on encoding
2817    CHARS96 character sets in a coding system which is characterized as
2818    7-bit environment, non-locking-shift, and non-single-shift.
2819
2820    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2821    '(' must be omitted.  We refer to this as "short-form" hereafter.
2822
2823    Now you may notice that there are a lot of ways of encoding the
2824    same multilingual text in ISO2022.  Actually, there exist many
2825    coding systems such as Compound Text (used in X11's inter client
2826    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2827    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2828    localized platforms), and all of these are variants of ISO2022.
2829
2830    In addition to the above, Emacs handles two more kinds of escape
2831    sequences: ISO6429's direction specification and Emacs' private
2832    sequence for specifying character composition.
2833
2834    ISO6429's direction specification takes the following form:
2835         o CSI ']'      -- end of the current direction
2836         o CSI '0' ']'  -- end of the current direction
2837         o CSI '1' ']'  -- start of left-to-right text
2838         o CSI '2' ']'  -- start of right-to-left text
2839    The control character CSI (0x9B: control sequence introducer) is
2840    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2841
2842    Character composition specification takes the following form:
2843         o ESC '0' -- start relative composition
2844         o ESC '1' -- end composition
2845         o ESC '2' -- start rule-base composition (*)
2846         o ESC '3' -- start relative composition with alternate chars  (**)
2847         o ESC '4' -- start rule-base composition with alternate chars  (**)
2848   Since these are not standard escape sequences of any ISO standard,
2849   the use of them with these meanings is restricted to Emacs only.
2850
2851   (*) This form is used only in Emacs 20.7 and older versions,
2852   but newer versions can safely decode it.
2853   (**) This form is used only in Emacs 21.1 and newer versions,
2854   and older versions can't decode it.
2855
2856   Here's a list of example usages of these composition escape
2857   sequences (categorized by `enum composition_method').
2858
2859   COMPOSITION_RELATIVE:
2860         ESC 0 CHAR [ CHAR ] ESC 1
2861   COMPOSITION_WITH_RULE:
2862         ESC 2 CHAR [ RULE CHAR ] ESC 1
2863   COMPOSITION_WITH_ALTCHARS:
2864         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2865   COMPOSITION_WITH_RULE_ALTCHARS:
2866         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2867
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value and filled in elsewhere in this
   file -- confirm against the code that initializes it.  */
2868 static enum iso_code_class_type iso_code_class[256];
2869
/* Return nonzero if the charset whose id is ID is usable with CODING:
   ID must lie in the range 0..CODING->max_charset_id and the byte for
   it in CODING->safe_charsets must not be 255, the value with which
   that table is filled before supported charsets are entered.

   A negative ID (the charset lookup tables use negative entries for
   "no such charset") is never safe; it is rejected before the table
   is indexed, so no byte in front of the table is ever read.  ID is
   evaluated more than once and must be free of side effects.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
2873
2874 static void
2875 setup_iso_safe_charsets (Lisp_Object attrs)
2876 {
2877   Lisp_Object charset_list, safe_charsets;
2878   Lisp_Object request;
2879   Lisp_Object reg_usage;
2880   Lisp_Object tail;
2881   EMACS_INT reg94, reg96;
2882   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2883   int max_charset_id;
2884
2885   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2886   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2887       && ! EQ (charset_list, Viso_2022_charset_list))
2888     {
2889       charset_list = Viso_2022_charset_list;
2890       ASET (attrs, coding_attr_charset_list, charset_list);
2891       ASET (attrs, coding_attr_safe_charsets, Qnil);
2892     }
2893
2894   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2895     return;
2896
2897   max_charset_id = 0;
2898   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2899     {
2900       int id = XINT (XCAR (tail));
2901       if (max_charset_id < id)
2902         max_charset_id = id;
2903     }
2904
2905   safe_charsets = make_uninit_string (max_charset_id + 1);
2906   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2907   request = AREF (attrs, coding_attr_iso_request);
2908   reg_usage = AREF (attrs, coding_attr_iso_usage);
2909   reg94 = XINT (XCAR (reg_usage));
2910   reg96 = XINT (XCDR (reg_usage));
2911
2912   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2913     {
2914       Lisp_Object id;
2915       Lisp_Object reg;
2916       struct charset *charset;
2917
2918       id = XCAR (tail);
2919       charset = CHARSET_FROM_ID (XINT (id));
2920       reg = Fcdr (Fassq (id, request));
2921       if (! NILP (reg))
2922         SSET (safe_charsets, XINT (id), XINT (reg));
2923       else if (charset->iso_chars_96)
2924         {
2925           if (reg96 < 4)
2926             SSET (safe_charsets, XINT (id), reg96);
2927         }
2928       else
2929         {
2930           if (reg94 < 4)
2931             SSET (safe_charsets, XINT (id), reg94);
2932         }
2933     }
2934   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2935 }
2936
2937
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939    Return true if a text is encoded in one of ISO-2022 based coding
2940    systems.  */
2941
/* Scan the source text of CODING for ISO-2022 features (escape
   sequences, shift codes, 8-bit bytes) and accumulate two bit masks
   over the ISO coding categories: FOUND for categories positively
   identified and REJECTED for categories ruled out.

   If every ISO category gets rejected while scanning, all of
   CATEGORY_MASK_ISO is marked rejected in DETECT_INFO and 0 is
   returned.  Otherwise control reaches the label `no_more_source'
   (NOTE(review): presumably via ONE_MORE_BYTE once the source is
   exhausted -- that macro is not defined in this part of the file),
   REJECTED and the not-rejected part of FOUND are merged into
   DETECT_INFO, and 1 is returned.  */
2942 static bool
2943 detect_coding_iso_2022 (struct coding_system *coding,
2944                         struct coding_detection_info *detect_info)
2945 {
2946   const unsigned char *src = coding->source, *src_base = src;
2947   const unsigned char *src_end = coding->source + coding->src_bytes;
2948   bool multibytep = coding->src_multibyte;
2949   bool single_shifting = 0;
2950   int id;
2951   int c, c1;
2952   ptrdiff_t consumed_chars = 0;
2953   int i;
2954   int rejected = 0;
2955   int found = 0;
       /* -1 while no composition sequence is open; otherwise the number
          of characters counted since the composition start.  */
2956   int composition_count = -1;
2957
2958   detect_info->checked |= CATEGORY_MASK_ISO;
2959
       /* Refresh the cached safe-charsets table of every ISO category
          that is in use (id >= 0), rebuilding the table first where
          full support is flagged but the charset list is not
          Viso_2022_charset_list.  */
2960   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2961     {
2962       struct coding_system *this = &(coding_categories[i]);
2963       Lisp_Object attrs, val;
2964
2965       if (this->id < 0)
2966         continue;
2967       attrs = CODING_ID_ATTRS (this->id);
2968       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2969           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2970         setup_iso_safe_charsets (attrs);
2971       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2972       this->max_charset_id = SCHARS (val) - 1;
2973       this->safe_charsets = SDATA (val);
2974     }
2975
2976   /* A coding system of this category is always ASCII compatible.  */
2977   src += coding->head_ascii;
2978
2979   while (rejected != CATEGORY_MASK_ISO)
2980     {
2981       src_base = src;
2982       ONE_MORE_BYTE (c);
2983       switch (c)
2984         {
2985         case ISO_CODE_ESC:
2986           if (inhibit_iso_escape_detection)
2987             break;
2988           single_shifting = 0;
2989           ONE_MORE_BYTE (c);
2990           if (c == 'N' || c == 'O')
2991             {
2992               /* ESC <Fe> for SS2 or SS3.  */
2993               single_shifting = 1;
2994               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2995             }
2996           else if (c == '1')
2997             {
2998               /* End of composition.  */
2999               if (composition_count < 0
3000                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3001                 /* Invalid */
3002                 break;
3003               composition_count = -1;
3004               found |= CATEGORY_MASK_ISO;
3005             }
3006           else if (c >= '0' && c <= '4')
3007             {
3008               /* ESC <Fp> for start/end composition.  */
                   /* '1' (end) was handled above, so this is a start.  */
3009               composition_count = 0;
3010             }
3011           else
3012             {
3013               if (c >= '(' && c <= '/')
3014                 {
3015                   /* Designation sequence for a charset of dimension 1.  */
3016                   ONE_MORE_BYTE (c1);
3017                   if (c1 < ' ' || c1 >= 0x80
3018                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3019                     /* Invalid designation sequence.  Just ignore.  */
3020                     break;
3021                 }
3022               else if (c == '$')
3023                 {
3024                   /* Designation sequence for a charset of dimension 2.  */
3025                   ONE_MORE_BYTE (c);
3026                   if (c >= '@' && c <= 'B')
3027                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the two lookups that test
                            the table entry for < 0, ID is not checked
                            here before it is passed to SAFE_CHARSET_P
                            below.  */
3028                     id = iso_charset_table[1][0][c];
3029                   else if (c >= '(' && c <= '/')
3030                     {
3031                       ONE_MORE_BYTE (c1);
3032                       if (c1 < ' ' || c1 >= 0x80
3033                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3034                         /* Invalid designation sequence.  Just ignore.  */
3035                         break;
3036                     }
3037                   else
3038                     /* Invalid designation sequence.  Just ignore it.  */
3039                     break;
3040                 }
3041               else
3042                 {
3043                   /* Invalid escape sequence.  Just ignore it.  */
3044                   break;
3045                 }
3046
3047               /* We found a valid designation sequence for CHARSET.  */
                   /* Each 7-bit style category and iso_8_else is marked
                      found or rejected according to whether the
                      designated charset is safe for it.  */
3048               rejected |= CATEGORY_MASK_ISO_8BIT;
3049               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3050                                   id))
3051                 found |= CATEGORY_MASK_ISO_7;
3052               else
3053                 rejected |= CATEGORY_MASK_ISO_7;
3054               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3055                                   id))
3056                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3057               else
3058                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3059               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3060                                   id))
3061                 found |= CATEGORY_MASK_ISO_7_ELSE;
3062               else
3063                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3064               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3065                                   id))
3066                 found |= CATEGORY_MASK_ISO_8_ELSE;
3067               else
3068                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3069             }
3070           break;
3071
3072         case ISO_CODE_SO:
3073         case ISO_CODE_SI:
3074           /* Locking shift out/in.  */
3075           if (inhibit_iso_escape_detection)
3076             break;
3077           single_shifting = 0;
3078           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3079           break;
3080
3081         case ISO_CODE_CSI:
3082           /* Control sequence introducer.  */
3083           single_shifting = 0;
3084           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3085           found |= CATEGORY_MASK_ISO_8_ELSE;
3086           goto check_extra_latin;
3087
3088         case ISO_CODE_SS2:
3089         case ISO_CODE_SS3:
3090           /* Single shift.   */
3091           if (inhibit_iso_escape_detection)
3092             break;
3093           single_shifting = 0;
3094           rejected |= CATEGORY_MASK_ISO_7BIT;
3095           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3096               & CODING_ISO_FLAG_SINGLE_SHIFT)
3097             {
3098               found |= CATEGORY_MASK_ISO_8_1;
3099               single_shifting = 1;
3100             }
3101           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3102               & CODING_ISO_FLAG_SINGLE_SHIFT)
3103             {
3104               found |= CATEGORY_MASK_ISO_8_2;
3105               single_shifting = 1;
3106             }
3107           if (single_shifting)
3108             break;
               /* Reached for CSI, and for SS2/SS3 when neither 8-bit
                  category uses single shift: the byte is tolerated only
                  if Vlatin_extra_code_table has a non-nil entry for it;
                  otherwise every ISO category is rejected.  */
3109         check_extra_latin:
3110           if (! VECTORP (Vlatin_extra_code_table)
3111               || NILP (AREF (Vlatin_extra_code_table, c)))
3112             {
3113               rejected = CATEGORY_MASK_ISO;
3114               break;
3115             }
3116           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3117               & CODING_ISO_FLAG_LATIN_EXTRA)
3118             found |= CATEGORY_MASK_ISO_8_1;
3119           else
3120             rejected |= CATEGORY_MASK_ISO_8_1;
3121           rejected |= CATEGORY_MASK_ISO_8_2;
3122           break;
3123
3124         default:
               /* NOTE(review): a negative C presumably comes from
                  ONE_MORE_BYTE for a source element that is not a plain
                  byte; it is skipped.  */
3125           if (c < 0)
3126             continue;
3127           if (c < 0x80)
3128             {
3129               if (composition_count >= 0)
3130                 composition_count++;
3131               single_shifting = 0;
3132               break;
3133             }
3134           if (c >= 0xA0)
3135             {
3136               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3137               found |= CATEGORY_MASK_ISO_8_1;
3138               /* Check the length of succeeding codes of the range
3139                  0xA0..0xFF.  If the byte length is even, we include
3140                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3141                  only when we are not single shifting.  */
3142               if (! single_shifting
3143                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3144                 {
3145                   int len = 1;
3146                   while (src < src_end)
3147                     {
3148                       src_base = src;
3149                       ONE_MORE_BYTE (c);
3150                       if (c < 0xA0)
3151                         {
                               /* Push the terminating byte back so the
                                  outer loop examines it.  */
3152                           src = src_base;
3153                           break;
3154                         }
3155                       len++;
3156                     }
3157
                       /* An odd-length run that stopped before the end of
                          the source rules out the two-byte category.  */
3158                   if (len & 1 && src < src_end)
3159                     {
3160                       rejected |= CATEGORY_MASK_ISO_8_2;
3161                       if (composition_count >= 0)
3162                         composition_count += len;
3163                     }
3164                   else
3165                     {
3166                       found |= CATEGORY_MASK_ISO_8_2;
3167                       if (composition_count >= 0)
3168                         composition_count += len / 2;
3169                     }
3170                 }
3171               break;
3172             }
               /* Any other byte in 0x80..0x9F needs no action.  */
3173         }
3174     }
       /* The loop ended because every ISO category was rejected.  */
3175   detect_info->rejected |= CATEGORY_MASK_ISO;
3176   return 0;
3177
3178  no_more_source:
       /* Report only those found categories that were not also
          rejected.  */
3179   detect_info->rejected |= rejected;
3180   detect_info->found |= (found & ~rejected);
3181   return 1;
3182 }
3183
3184
3185 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3186    escape sequence should be kept.

        REG selects the slot accessed through CODING_ISO_DESIGNATION;
        DIM, CHARS_96 and FINAL are handed to ISO_CHARSET_TABLE to look
        up the charset ID being designated.

        The macro reads the variable `coding' of the enclosing function.
        CHARS_96 must be an assignable variable: it doubles as the result
        flag, and the callers test it for a negative value afterwards.
        FINAL is evaluated more than once.

        When FINAL is outside '0'..127, the table has no charset for it,
        or the charset fails SAFE_CHARSET_P, the slot is set to -2 and
        CHARS_96 to -1.  Otherwise the (possibly substituted) charset ID
        is stored in the slot.  */
3187 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3188   do {                                                                  \
3189     int id, prev;                                                       \
3190                                                                         \
3191     if (final < '0' || final >= 128                                     \
3192         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3193         || !SAFE_CHARSET_P (coding, id))                                \
3194       {                                                                 \
3195         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3196         chars_96 = -1;                                                  \
              /* This `break' only leaves the macro's own do/while.  */      \
3197         break;                                                          \
3198       }                                                                 \
3199     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
          /* Honor the flags that substitute ASCII for JIS X 0201 Roman     \
             and JIS X 0208 for the 1978 edition.  */                       \
3200     if (id == charset_jisx0201_roman)                                   \
3201       {                                                                 \
3202         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3203           id = charset_ascii;                                           \
3204       }                                                                 \
3205     else if (id == charset_jisx0208_1978)                               \
3206       {                                                                 \
3207         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3208           id = charset_jisx0208;                                        \
3209       }                                                                 \
3210     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3211     /* If there was an invalid designation to REG previously, and this  \
3212        designation is ASCII to REG, we should keep this designation     \
3213        sequence.  */                                                    \
3214     if (prev == -2 && id == charset_ascii)                              \
3215       chars_96 = -1;                                                    \
3216   } while (0)
3217
3218
3219 /* Handle these composition sequences (ALT: alternate char):
3220
3221    (1) relative composition: ESC 0 CHAR ... ESC 1
3222    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3223    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3224    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3225
3226    When the start sequence (ESC 0/2/3/4) is found, this annotation
3227    header is produced.
3228
3229         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3230
3231    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3232    produced until the end sequence (ESC 1) is found:
3233
3234    (1) CHAR ... CHAR
3235    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3236    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3237    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3238
3239    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3240    annotation header are updated as below:
3241
3242    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3243    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3244    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3245    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3246
3247    If an error is found while composing, the annotation header is
3248    changed to:
3249
3250         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3251
3252    and the sequence [ -2 DECODED-RULE ] is changed to the original
3253    byte sequence as below:
3254         o the original byte sequence is B: [ B -1 ]
3255         o the original byte sequence is B1 B2: [ B1 B2 ]
3256    and the sequence [ -1 -1 ] is changed to the original byte
3257    sequence:
3258         [ ESC '0' ]
3259 */
3260
3261 /* Decode a composition rule C1 and maybe one more byte from the
3262    source, and set RULE to the encoded composition rule.  If the rule
3263    is invalid, goto invalid_code.

        C1 is not a macro parameter: the variable `c1' of the enclosing
        function is read directly.  The macro also needs ONE_MORE_BYTE
        and the label `invalid_code' of that function.  RULE must be an
        assignable int variable; it is used as scratch space while
        decoding.

        Two byte formats are accepted:

          old: one byte.  (C1 - 32) is in 0..80 and holds GREF * 9 + NREF,
               where a reference value of 4 stands for 10.
          new: two bytes.  (C1 - 32 - 81) is GREF and (next byte - 32) is
               NREF; the pair is checked by COMPOSITION_ENCODE_RULE_VALID.

        A new-format result has 0x100 added so that it can later be told
        apart from an old-format one (ENCODE_COMPOSITION_RULE tests
        RULE < 0x100).  */
3264
3265 #define DECODE_COMPOSITION_RULE(rule)                                   \
3266   do {                                                                  \
3267     rule = c1 - 32;                                                     \
3268     if (rule < 0)                                                       \
3269       goto invalid_code;                                                \
3270     if (rule < 81)              /* old format (before ver.21) */        \
3271       {                                                                 \
3272         int gref = (rule) / 9;                                          \
3273         int nref = (rule) % 9;                                          \
3274         if (gref == 4) gref = 10;                                       \
3275         if (nref == 4) nref = 10;                                       \
3276         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3277       }                                                                 \
3278     else                        /* new format (after ver.21) */         \
3279       {                                                                 \
3280         int b;                                                          \
3281                                                                         \
3282         ONE_MORE_BYTE (b);                                              \
3283         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3284           goto invalid_code;                                            \
3285         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3286         rule += 0x100;   /* Distinguish it from the old format.  */     \
3287       }                                                                 \
3288   } while (0)
3289
     /* Turn the decoded composition rule RULE back into its source byte
        form, storing the result in charbuf[idx] and charbuf[idx + 1].

        The macro uses the variables `charbuf', `idx' and `new_chars' of
        the enclosing function by name.  RULE is evaluated several times;
        GREF and NREF and the format test all read it before either slot
        is written, so RULE may be charbuf[idx + 1] itself.

          old format (RULE < 0x100): one byte 32 + GREF * 9 + NREF, with a
            reference value of 10 stored as 4; the second slot is set to
            -1 and new_chars grows by one.
          new format: two bytes, 32 + 81 + GREF and 32 + NREF; new_chars
            grows by two.

        NOTE(review): the divisor 12 presumably mirrors the packing done
        by COMPOSITION_ENCODE_RULE -- confirm against its definition.  */
3290 #define ENCODE_COMPOSITION_RULE(rule)                           \
3291   do {                                                          \
3292     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3293                                                                 \
3294     if (rule < 0x100)           /* old format */                \
3295       {                                                         \
3296         if (gref == 10) gref = 4;                               \
3297         if (nref == 10) nref = 4;                               \
3298         charbuf[idx] = 32 + gref * 9 + nref;                    \
3299         charbuf[idx + 1] = -1;                                  \
3300         new_chars++;                                            \
3301       }                                                         \
3302     else                                /* new format */        \
3303       {                                                         \
3304         charbuf[idx] = 32 + 81 + gref;                          \
3305         charbuf[idx + 1] = 32 + nref;                           \
3306         new_chars += 2;                                         \
3307       }                                                         \
3308   } while (0)
3309
3310 /* Finish the current composition as invalid.

        CHARBUF must point just past the last element stored for the
        composition described by CMP_STATUS, so that the annotation
        header sits at CHARBUF[-CMP_STATUS->length].  The five header
        slots are overwritten with the original start sequence
        [ ESC '0'/'2'/'3'/'4' ] followed by [ -2 0 -1 ].  For every method
        other than the relative one, each [ -2 DECODED-RULE ] pair and the
        [ -1 -1 ] separator are then turned back into their source bytes.

        Set CMP_STATUS->state to COMPOSING_NO.  Return CMP_STATUS->nchars
        plus the number of rule and separator bytes restored.

        NOTE(review): the restored ESC + method byte and the component
        (ALT) characters are not included in the returned count -- confirm
        that this is what the caller's character offset expects.  */
3311
3312 static int
3313 finish_composition (int *charbuf, struct composition_status *cmp_status)
3314 {
          /* Negative index of the annotation header relative to CHARBUF.
             The names `charbuf', `idx' and `new_chars' are used by
             ENCODE_COMPOSITION_RULE and must not be renamed.  */
3315   int idx = - cmp_status->length;
3316   int new_chars;
3317
3318   /* Recover the original ESC sequence */
3319   charbuf[idx++] = ISO_CODE_ESC;
3320   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3321                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3322                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3323                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3324                     : '4');
          /* NOTE(review): [ -2 0 ] presumably reads as an empty annotation
             and -1 as a filler for the consumer of charbuf -- confirm.  */
3325   charbuf[idx++] = -2;
3326   charbuf[idx++] = 0;
3327   charbuf[idx++] = -1;
3328   new_chars = cmp_status->nchars;
          /* Relies on COMPOSITION_RELATIVE being the only method that
             compares below COMPOSITION_WITH_RULE.  */
3329   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3330     for (; idx < 0; idx++)
3331       {
3332         int elt = charbuf[idx];
3333
3334         if (elt == -2)
3335           {
                  /* [ -2 RULE ]: rewrite both slots, then step over the
                     second one.  */
3336             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3337             idx++;
3338           }
3339         else if (elt == -1)
3340           {
                  /* [ -1 -1 ]: the ESC 0 that separated ALTs from CHARs.  */
3341             charbuf[idx++] = ISO_CODE_ESC;
3342             charbuf[idx] = '0';
3343             new_chars += 2;
3344           }
3345       }
3346   cmp_status->state = COMPOSING_NO;
3347   return new_chars;
3348 }
3349
3350 /* If characters are under composition, finish the composition.
        The unfinished composition is closed as invalid through
        finish_composition, and the count it returns is added to
        `char_offset'.  Uses `cmp_status', `charbuf' and `char_offset' of
        the enclosing function.  */
3351 #define MAYBE_FINISH_COMPOSITION()                              \
3352   do {                                                          \
3353     if (cmp_status->state != COMPOSING_NO)                      \
3354       char_offset += finish_composition (charbuf, cmp_status);  \
3355   } while (0)
3356
3357 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3358
3359    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3360    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3361    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3362    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3363
3364    Produce this annotation sequence now:
3365
3366    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        C1 is the byte that followed ESC.  An ESC 0 that arrives while the
        alternate characters of an ESC 3 / ESC 4 composition are being
        read does not start a new composition: it is recorded as the
        separator [ -1 -1 ] and the state becomes COMPOSING_CHAR.

        In every other case a composition still in progress is finished
        as invalid first, then the method and state are set from C1, the
        annotation header is emitted, and the length and counters in
        cmp_status are reset.

        Uses `cmp_status', `charbuf', `char_offset' and `coding' of the
        enclosing function.
3367 */
3368
3369 #define DECODE_COMPOSITION_START(c1)                                       \
3370   do {                                                                     \
3371     if (c1 == '0'                                                          \
3372         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3373              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3374             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3375                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3376       {                                                                    \
3377         *charbuf++ = -1;                                                   \
3378         *charbuf++= -1;                                                    \
3379         cmp_status->state = COMPOSING_CHAR;                                \
3380         cmp_status->length += 2;                                           \
3381       }                                                                    \
3382     else                                                                   \
3383       {                                                                    \
3384         MAYBE_FINISH_COMPOSITION ();                                       \
3385         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3386                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3387                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3388                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
              /* '0' and '2' begin with CHARs; '3' and '4' begin with ALTs.  */ \
3389         cmp_status->state                                                  \
3390           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3391         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3392         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3393         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3394         coding->annotated = 1;                                             \
3395       }                                                                    \
3396   } while (0)
3397
3398
3399 /* Handle composition end sequence ESC 1.

        The sequence is rejected (the composition is finished as invalid
        and control jumps to `invalid_code') when no CHAR has been read,
        or when the state does not fit the method:

          - for COMPOSITION_WITH_RULE, ESC 1 must not arrive while the
            state is COMPOSING_CHAR, i.e. while a CHAR is still expected;
          - for every other method, ESC 1 must arrive while the state is
            COMPOSING_CHAR, i.e. not while ALTs are still being read.

        On success the annotation header at charbuf[-length] is updated:
        the stored -LENGTH is extended to cover the ALT part for the two
        altchar methods, and NCHARS is filled in.  `char_offset' is
        advanced by NCHARS and the state returns to COMPOSING_NO.

        Uses `cmp_status', `charbuf' and `char_offset' of the enclosing
        function.  */
3400
3401 #define DECODE_COMPOSITION_END()                                        \
3402   do {                                                                  \
3403     if (cmp_status->nchars == 0                                         \
3404         || ((cmp_status->state == COMPOSING_CHAR)                       \
3405             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3406       {                                                                 \
3407         MAYBE_FINISH_COMPOSITION ();                                    \
3408         goto invalid_code;                                              \
3409       }                                                                 \
          /* The slot holds -LENGTH, so subtracting makes LENGTH larger.  */ \
3410     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3411       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3412     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3413       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3414     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3415     char_offset += cmp_status->nchars;                                  \
3416     cmp_status->state = COMPOSING_NO;                                   \
3417   } while (0)
3418
3419 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], so the recorded length
        grows by two.  Uses `charbuf' and `cmp_status' of the enclosing
        function.
        NOTE(review): the state decrement presumably moves from a
        rule-expecting state back to the matching char-expecting state and
        so depends on the order of the state enumerators -- confirm.  */
3420
3421 #define STORE_COMPOSITION_RULE(rule)    \
3422   do {                                  \
3423     *charbuf++ = -2;                    \
3424     *charbuf++ = rule;                  \
3425     cmp_status->length += 2;            \
3426     cmp_status->state--;                \
3427   } while (0)
3428
3429 /* Store a composed char or a component char C in charbuf, and update
3430    cmp_status.

        The character is counted in `nchars' when the state is
        COMPOSING_CHAR and in `ncomps' otherwise.  The state is then
        incremented when the method is COMPOSITION_WITH_RULE, or when it
        is COMPOSITION_WITH_RULE_ALTCHARS and a component char was just
        read.  Uses `charbuf' and `cmp_status' of the enclosing function.
        NOTE(review): the increment presumably selects the state in which
        a RULE is expected next and so depends on the order of the state
        enumerators -- confirm.  */
3431
3432 #define STORE_COMPOSITION_CHAR(c)                                       \
3433   do {                                                                  \
3434     *charbuf++ = (c);                                                   \
3435     cmp_status->length++;                                               \
3436     if (cmp_status->state == COMPOSING_CHAR)                            \
3437       cmp_status->nchars++;                                             \
3438     else                                                                \
3439       cmp_status->ncomps++;                                             \
3440     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3441         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3442             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3443       cmp_status->state++;                                              \
3444   } while (0)
3445
3446
3447 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3448
3449 static void
3450 decode_coding_iso_2022 (struct coding_system *coding)
3451 {
3452   const unsigned char *src = coding->source + coding->consumed;
3453   const unsigned char *src_end = coding->source + coding->src_bytes;
3454   const unsigned char *src_base;
3455   int *charbuf = coding->charbuf + coding->charbuf_used;
3456   /* We may produce two annotations (charset and composition) in one
3457      loop and one more charset annotation at the end.  */
3458   int *charbuf_end
3459     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3460   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3461   bool multibytep = coding->src_multibyte;
3462   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3463   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3464   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3465   int charset_id_2, charset_id_3;
3466   struct charset *charset;
3467   int c;
3468   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3469   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3470   ptrdiff_t char_offset = coding->produced_char;
3471   ptrdiff_t last_offset = char_offset;
3472   int last_id = charset_ascii;
3473   bool eol_dos
3474     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3475   int byte_after_cr = -1;
3476   int i;
3477
3478   setup_iso_safe_charsets (attrs);
3479   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3480
3481   if (cmp_status->state != COMPOSING_NO)
3482     {
3483       if (charbuf_end - charbuf < cmp_status->length)
3484         emacs_abort ();
3485       for (i = 0; i < cmp_status->length; i++)
3486         *charbuf++ = cmp_status->carryover[i];
3487       coding->annotated = 1;
3488     }
3489
3490   while (1)
3491     {
3492       int c1, c2, c3;
3493
3494       src_base = src;
3495       consumed_chars_base = consumed_chars;
3496
3497       if (charbuf >= charbuf_end)
3498         {
3499           if (byte_after_cr >= 0)
3500             src_base--;
3501           break;
3502         }
3503
3504       if (byte_after_cr >= 0)
3505         c1 = byte_after_cr, byte_after_cr = -1;
3506       else
3507         ONE_MORE_BYTE (c1);
3508       if (c1 < 0)
3509         goto invalid_code;
3510
3511       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3512         {
3513           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3514           char_offset++;
3515           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3516           continue;
3517         }
3518
3519       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3520         {
3521           if (c1 == ISO_CODE_ESC)
3522             {
3523               if (src + 1 >= src_end)
3524                 goto no_more_source;
3525               *charbuf++ = ISO_CODE_ESC;
3526               char_offset++;
3527               if (src[0] == '%' && src[1] == '@')
3528                 {
3529                   src += 2;
3530                   consumed_chars += 2;
3531                   char_offset += 2;
3532                   /* We are sure charbuf can contain two more chars. */
3533                   *charbuf++ = '%';
3534                   *charbuf++ = '@';
3535                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3536                 }
3537             }
3538           else
3539             {
3540               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3541               char_offset++;
3542             }
3543           continue;
3544         }
3545
3546       if ((cmp_status->state == COMPOSING_RULE
3547            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3548           && c1 != ISO_CODE_ESC)
3549         {
3550           int rule;
3551
3552           DECODE_COMPOSITION_RULE (rule);
3553           STORE_COMPOSITION_RULE (rule);
3554           continue;
3555         }
3556
3557       /* We produce at most one character.  */
3558       switch (iso_code_class [c1])
3559         {
3560         case ISO_0x20_or_0x7F:
3561           if (charset_id_0 < 0
3562               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3563             /* This is SPACE or DEL.  */
3564             charset = CHARSET_FROM_ID (charset_ascii);
3565           else
3566             charset = CHARSET_FROM_ID (charset_id_0);
3567           break;
3568
3569         case ISO_graphic_plane_0:
3570           if (charset_id_0 < 0)
3571             charset = CHARSET_FROM_ID (charset_ascii);
3572           else
3573             charset = CHARSET_FROM_ID (charset_id_0);
3574           break;
3575
3576         case ISO_0xA0_or_0xFF:
3577           if (charset_id_1 < 0
3578               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3579               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3580             goto invalid_code;
3581           /* This is a graphic character, we fall down ... */
3582
3583         case ISO_graphic_plane_1:
3584           if (charset_id_1 < 0)
3585             goto invalid_code;
3586           charset = CHARSET_FROM_ID (charset_id_1);
3587           break;
3588
3589         case ISO_control_0:
3590           if (eol_dos && c1 == '\r')
3591             ONE_MORE_BYTE (byte_after_cr);
3592           MAYBE_FINISH_COMPOSITION ();
3593           charset = CHARSET_FROM_ID (charset_ascii);
3594           break;
3595
3596         case ISO_control_1:
3597           goto invalid_code;
3598
3599         case ISO_shift_out:
3600           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3601               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3602             goto invalid_code;
3603           CODING_ISO_INVOCATION (coding, 0) = 1;
3604           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3605           continue;
3606
3607         case ISO_shift_in:
3608           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3609             goto invalid_code;
3610           CODING_ISO_INVOCATION (coding, 0) = 0;
3611           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3612           continue;
3613
3614         case ISO_single_shift_2_7:
3615           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3616             goto invalid_code;
3617         case ISO_single_shift_2:
3618           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3619             goto invalid_code;
3620           /* SS2 is handled as an escape sequence of ESC 'N' */
3621           c1 = 'N';
3622           goto label_escape_sequence;
3623
3624         case ISO_single_shift_3:
3625           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3626             goto invalid_code;
3627           /* SS3 is handled as an escape sequence of ESC 'O' */
3628           c1 = 'O';
3629           goto label_escape_sequence;
3630
3631         case ISO_control_sequence_introducer:
3632           /* CSI is handled as an escape sequence of ESC '[' ...  */
3633           c1 = '[';
3634           goto label_escape_sequence;
3635
3636         case ISO_escape:
3637           ONE_MORE_BYTE (c1);
3638         label_escape_sequence:
3639           /* Escape sequences handled here are invocation,
3640              designation, direction specification, and character
3641              composition specification.  */
3642           switch (c1)
3643             {
3644             case '&':           /* revision of following character set */
3645               ONE_MORE_BYTE (c1);
3646               if (!(c1 >= '@' && c1 <= '~'))
3647                 goto invalid_code;
3648               ONE_MORE_BYTE (c1);
3649               if (c1 != ISO_CODE_ESC)
3650                 goto invalid_code;
3651               ONE_MORE_BYTE (c1);
3652               goto label_escape_sequence;
3653
3654             case '$':           /* designation of 2-byte character set */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3656                 goto invalid_code;
3657               {
3658                 int reg, chars96;
3659
3660                 ONE_MORE_BYTE (c1);
3661                 if (c1 >= '@' && c1 <= 'B')
3662                   {     /* designation of JISX0208.1978, GB2312.1980,
3663                            or JISX0208.1980 */
3664                     reg = 0, chars96 = 0;
3665                   }
3666                 else if (c1 >= 0x28 && c1 <= 0x2B)
3667                   { /* designation of DIMENSION2_CHARS94 character set */
3668                     reg = c1 - 0x28, chars96 = 0;
3669                     ONE_MORE_BYTE (c1);
3670                   }
3671                 else if (c1 >= 0x2C && c1 <= 0x2F)
3672                   { /* designation of DIMENSION2_CHARS96 character set */
3673                     reg = c1 - 0x2C, chars96 = 1;
3674                     ONE_MORE_BYTE (c1);
3675                   }
3676                 else
3677                   goto invalid_code;
3678                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3679                 /* We must update these variables now.  */
3680                 if (reg == 0)
3681                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3682                 else if (reg == 1)
3683                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3684                 if (chars96 < 0)
3685                   goto invalid_code;
3686               }
3687               continue;
3688
3689             case 'n':           /* invocation of locking-shift-2 */
3690               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3691                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3692                 goto invalid_code;
3693               CODING_ISO_INVOCATION (coding, 0) = 2;
3694               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695               continue;
3696
3697             case 'o':           /* invocation of locking-shift-3 */
3698               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3699                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3700                 goto invalid_code;
3701               CODING_ISO_INVOCATION (coding, 0) = 3;
3702               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703               continue;
3704
3705             case 'N':           /* invocation of single-shift-2 */
3706               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3707                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3708                 goto invalid_code;
3709               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3710               if (charset_id_2 < 0)
3711                 charset = CHARSET_FROM_ID (charset_ascii);
3712               else
3713                 charset = CHARSET_FROM_ID (charset_id_2);
3714               ONE_MORE_BYTE (c1);
3715               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3716                 goto invalid_code;
3717               break;
3718
3719             case 'O':           /* invocation of single-shift-3 */
3720               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3721                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3722                 goto invalid_code;
3723               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3724               if (charset_id_3 < 0)
3725                 charset = CHARSET_FROM_ID (charset_ascii);
3726               else
3727                 charset = CHARSET_FROM_ID (charset_id_3);
3728               ONE_MORE_BYTE (c1);
3729               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3730                 goto invalid_code;
3731               break;
3732
3733             case '0': case '2': case '3': case '4': /* start composition */
3734               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3735                 goto invalid_code;
3736               if (last_id != charset_ascii)
3737                 {
3738                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3739                   last_id = charset_ascii;
3740                   last_offset = char_offset;
3741                 }
3742               DECODE_COMPOSITION_START (c1);
3743               continue;
3744
3745             case '1':           /* end composition */
3746               if (cmp_status->state == COMPOSING_NO)
3747                 goto invalid_code;
3748               DECODE_COMPOSITION_END ();
3749               continue;
3750
3751             case '[':           /* specification of direction */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3753                 goto invalid_code;
3754               /* For the moment, nested direction is not supported.
3755                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3756                  left-to-right, and nonzero means right-to-left.  */
3757               ONE_MORE_BYTE (c1);
3758               switch (c1)
3759                 {
3760                 case ']':       /* end of the current direction */
3761                   coding->mode &= ~CODING_MODE_DIRECTION;
3762
3763                 case '0':       /* end of the current direction */
3764                 case '1':       /* start of left-to-right direction */
3765                   ONE_MORE_BYTE (c1);
3766                   if (c1 == ']')
3767                     coding->mode &= ~CODING_MODE_DIRECTION;
3768                   else
3769                     goto invalid_code;
3770                   break;
3771
3772                 case '2':       /* start of right-to-left direction */
3773                   ONE_MORE_BYTE (c1);
3774                   if (c1 == ']')
3775                     coding->mode |= CODING_MODE_DIRECTION;
3776                   else
3777                     goto invalid_code;
3778                   break;
3779
3780                 default:
3781                   goto invalid_code;
3782                 }
3783               continue;
3784
3785             case '%':
3786               ONE_MORE_BYTE (c1);
3787               if (c1 == '/')
3788                 {
3789                   /* CTEXT extended segment:
3790                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3791                      We keep these bytes as is for the moment.
3792                      They may be decoded by post-read-conversion.  */
3793                   int dim, M, L;
3794                   int size;
3795
3796                   ONE_MORE_BYTE (dim);
3797                   if (dim < '0' || dim > '4')
3798                     goto invalid_code;
3799                   ONE_MORE_BYTE (M);
3800                   if (M < 128)
3801                     goto invalid_code;
3802                   ONE_MORE_BYTE (L);
3803                   if (L < 128)
3804                     goto invalid_code;
3805                   size = ((M - 128) * 128) + (L - 128);
3806                   if (charbuf + 6 > charbuf_end)
3807                     goto break_loop;
3808                   *charbuf++ = ISO_CODE_ESC;
3809                   *charbuf++ = '%';
3810                   *charbuf++ = '/';
3811                   *charbuf++ = dim;
3812                   *charbuf++ = BYTE8_TO_CHAR (M);
3813                   *charbuf++ = BYTE8_TO_CHAR (L);
3814                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3815                 }
3816               else if (c1 == 'G')
3817                 {
3818                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3819                      ESC % G --UTF-8-BYTES-- ESC % @
3820                      We keep these bytes as is for the moment.
3821                      They may be decoded by post-read-conversion.  */
3822                   if (charbuf + 3 > charbuf_end)
3823                     goto break_loop;
3824                   *charbuf++ = ISO_CODE_ESC;
3825                   *charbuf++ = '%';
3826                   *charbuf++ = 'G';
3827                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3828                 }
3829               else
3830                 goto invalid_code;
3831               continue;
3832               break;
3833
3834             default:
3835               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3836                 goto invalid_code;
3837               {
3838                 int reg, chars96;
3839
3840                 if (c1 >= 0x28 && c1 <= 0x2B)
3841                   { /* designation of DIMENSION1_CHARS94 character set */
3842                     reg = c1 - 0x28, chars96 = 0;
3843                     ONE_MORE_BYTE (c1);
3844                   }
3845                 else if (c1 >= 0x2C && c1 <= 0x2F)
3846                   { /* designation of DIMENSION1_CHARS96 character set */
3847                     reg = c1 - 0x2C, chars96 = 1;
3848                     ONE_MORE_BYTE (c1);
3849                   }
3850                 else
3851                   goto invalid_code;
3852                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3853                 /* We must update these variables now.  */
3854                 if (reg == 0)
3855                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3856                 else if (reg == 1)
3857                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3858                 if (chars96 < 0)
3859                   goto invalid_code;
3860               }
3861               continue;
3862             }
3863           break;
3864
3865         default:
3866           emacs_abort ();
3867         }
3868
3869       if (cmp_status->state == COMPOSING_NO
3870           && charset->id != charset_ascii
3871           && last_id != charset->id)
3872         {
3873           if (last_id != charset_ascii)
3874             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3875           last_id = charset->id;
3876           last_offset = char_offset;
3877         }
3878
3879       /* Now we know CHARSET and 1st position code C1 of a character.
3880          Produce a decoded character while getting 2nd and 3rd
3881          position codes C2, C3 if necessary.  */
3882       if (CHARSET_DIMENSION (charset) > 1)
3883         {
3884           ONE_MORE_BYTE (c2);
3885           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3886               || ((c1 & 0x80) != (c2 & 0x80)))
3887             /* C2 is not in a valid range.  */
3888             goto invalid_code;
3889           if (CHARSET_DIMENSION (charset) == 2)
3890             c1 = (c1 << 8) | c2;
3891           else
3892             {
3893               ONE_MORE_BYTE (c3);
3894               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3895                   || ((c1 & 0x80) != (c3 & 0x80)))
3896                 /* C3 is not in a valid range.  */
3897                 goto invalid_code;
3898               c1 = (c1 << 16) | (c2 << 8) | c2;
3899             }
3900         }
3901       c1 &= 0x7F7F7F;
3902       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3903       if (c < 0)
3904         {
3905           MAYBE_FINISH_COMPOSITION ();
3906           for (; src_base < src; src_base++, char_offset++)
3907             {
3908               if (ASCII_BYTE_P (*src_base))
3909                 *charbuf++ = *src_base;
3910               else
3911                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3912             }
3913         }
3914       else if (cmp_status->state == COMPOSING_NO)
3915         {
3916           *charbuf++ = c;
3917           char_offset++;
3918         }
3919       else if ((cmp_status->state == COMPOSING_CHAR
3920                 ? cmp_status->nchars
3921                 : cmp_status->ncomps)
3922                >= MAX_COMPOSITION_COMPONENTS)
3923         {
3924           /* Too long composition.  */
3925           MAYBE_FINISH_COMPOSITION ();
3926           *charbuf++ = c;
3927           char_offset++;
3928         }
3929       else
3930         STORE_COMPOSITION_CHAR (c);
3931       continue;
3932
3933     invalid_code:
3934       MAYBE_FINISH_COMPOSITION ();
3935       src = src_base;
3936       consumed_chars = consumed_chars_base;
3937       ONE_MORE_BYTE (c);
3938       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3939       char_offset++;
3940       coding->errors++;
3941       continue;
3942
3943     break_loop:
3944       break;
3945     }
3946
3947  no_more_source:
3948   if (cmp_status->state != COMPOSING_NO)
3949     {
3950       if (coding->mode & CODING_MODE_LAST_BLOCK)
3951         MAYBE_FINISH_COMPOSITION ();
3952       else
3953         {
3954           charbuf -= cmp_status->length;
3955           for (i = 0; i < cmp_status->length; i++)
3956             cmp_status->carryover[i] = charbuf[i];
3957         }
3958     }
3959   else if (last_id != charset_ascii)
3960     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3961   coding->consumed_char += consumed_chars_base;
3962   coding->consumed = src_base - coding->source;
3963   coding->charbuf_used = charbuf - coding->charbuf;
3964 }
3965
3966
3967 /* ISO2022 encoding stuff.  */
3968
3969 /*
3970    It is not enough to say just "ISO2022" on encoding, we have to
3971    specify more details.  In Emacs, each coding system of ISO2022
3972    variant has the following specifications:
3973         1. Initial designation to G0 thru G3.
3974         2. Allows short-form designation?
3975         3. ASCII should be designated to G0 before control characters?
3976         4. ASCII should be designated to G0 at end of line?
3977         5. 7-bit environment or 8-bit environment?
3978         6. Use locking-shift?
3979         7. Use Single-shift?
3980    And the following two are only for Japanese:
3981         8. Use ASCII in place of JIS0201-1976-Roman?
3982         9. Use JISX0208-1983 in place of JISX0208-1978?
3983    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3984    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3985    details.
3986 */
3987
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, and record the new designation in CODING.

   The sequence is laid out as follows:
     [ESC & <'@' + revision>]  only when CODING has
                               CODING_ISO_FLAG_REVISION set and CHARSET
                               has a non-negative revision number
     ESC
     ['$']                     for charsets whose dimension is not 1
     [<intermediate>]          one of "()*+" (94-char sets) or ",-./"
                               (96-char sets), selected by REG
     <final-char>

   The intermediate byte is left out (the short form) only for a
   multi-dimensional 94-char set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', and only when CODING does not
   have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes, indexed by [96-char set?][register].  */     \
    const char *const intermediates[2] = { "()*+", ",-./" };            \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int set96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;                 \
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)   \
               ? CHARSET_ISO_REVISION (charset)                         \
               : -1);                                                   \
                                                                        \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Everything but the short form carries an intermediate byte.  */  \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || set96                                                        \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || final_byte < '@' || final_byte > 'B')                        \
      EMIT_ONE_ASCII_BYTE (intermediates[set96][reg]);                  \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4035
4036
4037 /* The following two macros produce codes (control character or escape
4038    sequence) for ISO2022 single-shift functions (single-shift-2 and
4039    single-shift-3).  */
4040
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    /* SS2 is a two-byte escape sequence when the coding system is      \
       restricted to seven bits, and a single control byte              \
       otherwise.  */                                                   \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    /* Flag that the next character is to be sent single-shifted.  */   \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4049
4050
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    /* SS3 is a two-byte escape sequence when the coding system is      \
       restricted to seven bits, and a single control byte              \
       otherwise.  */                                                   \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    /* Flag that the next character is to be sent single-shifted.  */   \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4059
4060
4061 /* The following four macros produce codes (control character or
4062    escape sequence) for ISO2022 locking-shift functions (shift-in,
4063    shift-out, locking-shift-2, and locking-shift-3).  */
4064
/* Shift-in: graphic register 0 becomes the one invoked to graphic
   plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4070
4071
/* Shift-out: graphic register 1 becomes the one invoked to graphic
   plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4077
4078
/* Locking-shift-2 (ESC n): graphic register 2 becomes the one invoked
   to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4084
4085
/* Locking-shift-3: graphic register 3 becomes the one invoked to
   graphic plane 0.  The escape sequence for LS3 is ESC o; ESC n is
   locking-shift-2 and would make a decoder invoke register 2
   instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4091
4092
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII, it is replaced by JISX0201-Roman.  C1 may
   be any integer expression; it is parenthesized wherever it is used.
   The expansion refers to the variables `coding', `dst' and
   `produced_chars' of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code has just been sent; this one character   \
           follows it and ends the single-shift state.  */              \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0: high bit clear.  */   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1: high bit set.  */     \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4135
4136
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are still needed.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208-1978.  The expansion refers to the variables
   `coding', `dst' and `produced_chars' of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
    /* 0: emit with the high bits clear, 1: emit with the high bits     \
       set, -1: CHARSET cannot be emitted yet.  */                      \
    int high_bit = -1;                                                  \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* This character follows a single-shift code and ends the      \
           single-shift state.  */                                      \
        high_bit = ((CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_SEVEN_BITS)                      \
                    ? 0 : 1);                                           \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      high_bit = 0;                                                     \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      high_bit = 1;                                                     \
                                                                        \
    if (high_bit == 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (high_bit == 1)                                                  \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke it   \
       (designating it to a graphic register first if need be), then    \
       go round the loop again to emit the character itself.  */        \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4179
4180
/* Produce codes for character C of CHARSET: obtain its code in
   CHARSET, then hand it to the DIMENSION1 or the DIMENSION2 macro
   according to the dimension of CHARSET.  For anything but a
   one-dimensional charset, the code is split into (code >> 8) and its
   lowest byte.  CHARSET must be an lvalue, since those macros may
   replace it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4191
4192
4193 /* Produce designation and invocation codes at a place pointed by DST
4194    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4195    Return new DST.  */
4196
4197 static unsigned char *
4198 encode_invocation_designation (struct charset *charset,
4199                                struct coding_system *coding,
4200                                unsigned char *dst, ptrdiff_t *p_nchars)
4201 {
4202   bool multibytep = coding->dst_multibyte;
4203   ptrdiff_t produced_chars = *p_nchars;
4204   int reg;                      /* graphic register number */
4205   int id = CHARSET_ID (charset);
4206
4207   /* At first, check designations.  */
4208   for (reg = 0; reg < 4; reg++)
4209     if (id == CODING_ISO_DESIGNATION (coding, reg))
4210       break;
4211
4212   if (reg >= 4)
4213     {
4214       /* CHARSET is not yet designated to any graphic registers.  */
4215       /* At first check the requested designation.  */
4216       reg = CODING_ISO_REQUEST (coding, id);
4217       if (reg < 0)
4218         /* Since CHARSET requests no special designation, designate it
4219            to graphic register 0.  */
4220         reg = 0;
4221
4222       ENCODE_DESIGNATION (charset, reg, coding);
4223     }
4224
4225   if (CODING_ISO_INVOCATION (coding, 0) != reg
4226       && CODING_ISO_INVOCATION (coding, 1) != reg)
4227     {
4228       /* Since the graphic register REG is not invoked to any graphic
4229          planes, invoke it to graphic plane 0.  */
4230       switch (reg)
4231         {
4232         case 0:                 /* graphic register 0 */
4233           ENCODE_SHIFT_IN;
4234           break;
4235
4236         case 1:                 /* graphic register 1 */
4237           ENCODE_SHIFT_OUT;
4238           break;
4239
4240         case 2:                 /* graphic register 2 */
4241           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4242             ENCODE_SINGLE_SHIFT_2;
4243           else
4244             ENCODE_LOCKING_SHIFT_2;
4245           break;
4246
4247         case 3:                 /* graphic register 3 */
4248           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4249             ENCODE_SINGLE_SHIFT_3;
4250           else
4251             ENCODE_LOCKING_SHIFT_3;
4252           break;
4253         }
4254     }
4255
4256   *p_nchars = produced_chars;
4257   return dst;
4258 }
4259
4260
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for each register whose
   current charset differs from its initial one.  Registers with no
   initial charset (a negative value) are left alone.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4279
4280
4281 /* Produce designation sequences of charsets in the line started from
4282    CHARBUF to a place pointed by DST, and return the number of
4283    produced bytes.  DST should not directly point a buffer text area
4284    which may be relocated by char_charset call.
4285
4286    If the current block ends before any end-of-line, we may fail to
4287    find all the necessary designations.  */
4288
4289 static ptrdiff_t
4290 encode_designation_at_bol (struct coding_system *coding,
4291                            int *charbuf, int *charbuf_end,
4292                            unsigned char *dst)
4293 {
4294   unsigned char *orig = dst;
4295   struct charset *charset;
4296   /* Table of charsets to be designated to each graphic register.  */
4297   int r[4];
4298   int c, found = 0, reg;
4299   ptrdiff_t produced_chars = 0;
4300   bool multibytep = coding->dst_multibyte;
4301   Lisp_Object attrs;
4302   Lisp_Object charset_list;
4303
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4306   if (EQ (charset_list, Qiso_2022))
4307     charset_list = Viso_2022_charset_list;
4308
4309   for (reg = 0; reg < 4; reg++)
4310     r[reg] = -1;
4311
4312   while (charbuf < charbuf_end && found < 4)
4313     {
4314       int id;
4315
4316       c = *charbuf++;
4317       if (c == '\n')
4318         break;
4319       charset = char_charset (c, charset_list, NULL);
4320       id = CHARSET_ID (charset);
4321       reg = CODING_ISO_REQUEST (coding, id);
4322       if (reg >= 0 && r[reg] < 0)
4323         {
4324           found++;
4325           r[reg] = id;
4326         }
4327     }
4328
4329   if (found)
4330     {
4331       for (reg = 0; reg < 4; reg++)
4332         if (r[reg] >= 0
4333             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4334           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4335     }
4336
4337   return dst - orig;
4338 }
4339
4340 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4341
4342 static bool
4343 encode_coding_iso_2022 (struct coding_system *coding)
4344 {
4345   bool multibytep = coding->dst_multibyte;
4346   int *charbuf = coding->charbuf;
4347   int *charbuf_end = charbuf + coding->charbuf_used;
4348   unsigned char *dst = coding->destination + coding->produced;
4349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350   int safe_room = 16;
4351   bool bol_designation
4352     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4353        && CODING_ISO_BOL (coding));
4354   ptrdiff_t produced_chars = 0;
4355   Lisp_Object attrs, eol_type, charset_list;
4356   bool ascii_compatible;
4357   int c;
4358   int preferred_charset_id = -1;
4359
4360   CODING_GET_INFO (coding, attrs, charset_list);
4361   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4362   if (VECTORP (eol_type))
4363     eol_type = Qunix;
4364
4365   setup_iso_safe_charsets (attrs);
4366   /* Charset list may have been changed.  */
4367   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4368   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4369
4370   ascii_compatible
4371     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4372        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4373                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4374
4375   while (charbuf < charbuf_end)
4376     {
4377       ASSURE_DESTINATION (safe_room);
4378
4379       if (bol_designation)
4380         {
4381           /* We have to produce designation sequences if any now.  */
4382           unsigned char desig_buf[16];
4383           int nbytes;
4384           ptrdiff_t offset;
4385
4386           charset_map_loaded = 0;
4387           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4388                                               desig_buf);
4389           if (charset_map_loaded
4390               && (offset = coding_change_destination (coding)))
4391             {
4392               dst += offset;
4393               dst_end += offset;
4394             }
4395           memcpy (dst, desig_buf, nbytes);
4396           dst += nbytes;
4397           /* We are sure that designation sequences are all ASCII bytes.  */
4398           produced_chars += nbytes;
4399           bol_designation = 0;
4400           ASSURE_DESTINATION (safe_room);
4401         }
4402
4403       c = *charbuf++;
4404
4405       if (c < 0)
4406         {
4407           /* Handle an annotation.  */
4408           switch (*charbuf)
4409             {
4410             case CODING_ANNOTATE_COMPOSITION_MASK:
4411               /* Not yet implemented.  */
4412               break;
4413             case CODING_ANNOTATE_CHARSET_MASK:
4414               preferred_charset_id = charbuf[2];
4415               if (preferred_charset_id >= 0
4416                   && NILP (Fmemq (make_number (preferred_charset_id),
4417                                   charset_list)))
4418                 preferred_charset_id = -1;
4419               break;
4420             default:
4421               emacs_abort ();
4422             }
4423           charbuf += -c - 1;
4424           continue;
4425         }
4426
4427       /* Now encode the character C.  */
4428       if (c < 0x20 || c == 0x7F)
4429         {
4430           if (c == '\n'
4431               || (c == '\r' && EQ (eol_type, Qmac)))
4432             {
4433               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4434                 ENCODE_RESET_PLANE_AND_REGISTER ();
4435               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4436                 {
4437                   int i;
4438
4439                   for (i = 0; i < 4; i++)
4440                     CODING_ISO_DESIGNATION (coding, i)
4441                       = CODING_ISO_INITIAL (coding, i);
4442                 }
4443               bol_designation = ((CODING_ISO_FLAGS (coding)
4444                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4445                                  != 0);
4446             }
4447           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4448             ENCODE_RESET_PLANE_AND_REGISTER ();
4449           EMIT_ONE_ASCII_BYTE (c);
4450         }
4451       else if (ASCII_CHAR_P (c))
4452         {
4453           if (ascii_compatible)
4454             EMIT_ONE_ASCII_BYTE (c);
4455           else
4456             {
4457               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4458               ENCODE_ISO_CHARACTER (charset, c);
4459             }
4460         }
4461       else if (CHAR_BYTE8_P (c))
4462         {
4463           c = CHAR_TO_BYTE8 (c);
4464           EMIT_ONE_BYTE (c);
4465         }
4466       else
4467         {
4468           struct charset *charset;
4469
4470           if (preferred_charset_id >= 0)
4471             {
4472               bool result;
4473
4474               charset = CHARSET_FROM_ID (preferred_charset_id);
4475               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4476               if (! result)
4477                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4478                                      NULL, charset);
4479             }
4480           else
4481             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4482                                  NULL, charset);
4483           if (!charset)
4484             {
4485               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4486                 {
4487                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4488                   charset = CHARSET_FROM_ID (charset_ascii);
4489                 }
4490               else
4491                 {
4492                   c = coding->default_char;
4493                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4494                                        charset_list, NULL, charset);
4495                 }
4496             }
4497           ENCODE_ISO_CHARACTER (charset, c);
4498         }
4499     }
4500
4501   if (coding->mode & CODING_MODE_LAST_BLOCK
4502       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4503     {
4504       ASSURE_DESTINATION (safe_room);
4505       ENCODE_RESET_PLANE_AND_REGISTER ();
4506     }
4507   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4508   CODING_ISO_BOL (coding) = bol_designation;
4509   coding->produced_char += produced_chars;
4510   coding->produced = dst - coding->destination;
4511   return 0;
4512 }
4513
4514 \f
/*** 8. SJIS and BIG5 handlers ***/
4516
4517 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4518    quite widely.  So, for the moment, Emacs supports them in the bare
4519    C code.  But, in the future, they may be supported only by CCL.  */
4520
4521 /* SJIS is a coding system encoding three character sets: ASCII, right
4522    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4523    as is.  A character of charset katakana-jisx0201 is encoded by
4524    "position-code + 0x80".  A character of charset japanese-jisx0208
4525    is encoded in two bytes, but the two position-codes are divided and
4526    shifted so that they fit in the ranges below.
4527
4528    --- CODE RANGE of SJIS ---
4529    (character set)      (range)
4530    ASCII                0x00 .. 0x7F
4531    KATAKANA-JISX0201    0xA0 .. 0xDF
4532    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4533             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4534    -------------------------------
4535
4536 */
4537
4538 /* BIG5 is a coding system encoding two character sets: ASCII and
4539    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4540    character set and is encoded in two bytes.
4541
4542    --- CODE RANGE of BIG5 ---
4543    (character set)      (range)
4544    ASCII                0x00 .. 0x7F
4545    Big5 (1st byte)      0xA1 .. 0xFE
4546         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4547    --------------------------
4548
4549   */
4550
4551 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4552    Return true if a text is encoded in SJIS.
        More precisely: CATEGORY_MASK_SJIS is always recorded in
        DETECT_INFO->checked.  The scan starts CODING->head_ascii bytes
        into the source.  If the source runs out without an invalid
        sequence having been seen, return 1 and OR into
        DETECT_INFO->found the SJIS mask (only when at least one
        non-ASCII SJIS sequence was seen).  On an invalid byte, or when
        the last block ends in the middle of a character, set
        CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return 0.  */
4553
4554 static bool
4555 detect_coding_sjis (struct coding_system *coding,
4556                     struct coding_detection_info *detect_info)
4557 {
4558   const unsigned char *src = coding->source, *src_base;
4559   const unsigned char *src_end = coding->source + coding->src_bytes;
4560   bool multibytep = coding->src_multibyte;
4561   ptrdiff_t consumed_chars = 0;
4562   int found = 0;
4563   int c;
4564   Lisp_Object attrs, charset_list;
4565   int max_first_byte_of_2_byte_code;
4566
4567   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list with more than three entries widens the range of
          accepted lead bytes from 0xE0..0xEF to 0xE0..0xFC.  */
4568   max_first_byte_of_2_byte_code
4569     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4570
4571   detect_info->checked |= CATEGORY_MASK_SJIS;
4572   /* A coding system of this category is always ASCII compatible.  */
4573   src += coding->head_ascii;
4574
4575   while (1)
4576     {
           /* Remember where this character starts; `no_more_source' uses
              it to tell a clean end of input from a truncated character.
              That label is reached only through ONE_MORE_BYTE, which is
              defined outside this function.  */
4577       src_base = src;
4578       ONE_MORE_BYTE (c);
4579       if (c < 0x80)
4580         continue;
4581       if ((c >= 0x81 && c <= 0x9F)
4582           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4583         {
               /* Lead byte of a two-byte code: the trail byte must lie in
                  0x40..0xFC and must not be 0x7F.  */
4584           ONE_MORE_BYTE (c);
4585           if (c < 0x40 || c == 0x7F || c > 0xFC)
4586             break;
4587           found = CATEGORY_MASK_SJIS;
4588         }
           /* Single-byte range 0xA0..0xDF.  NOTE(review): decode_coding_sjis
              treats 0xA0 as invalid while this accepts it -- confirm
              that the difference is intended.  */
4589       else if (c >= 0xA0 && c < 0xE0)
4590         found = CATEGORY_MASK_SJIS;
4591       else
4592         break;
4593     }
       /* Every `break' above means an invalid byte was met.  */
4594   detect_info->rejected |= CATEGORY_MASK_SJIS;
4595   return 0;
4596
4597  no_more_source:
       /* SRC beyond SRC_BASE means the input ended after part of a
          character had been read; that is an error only in the last
          block.  */
4598   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4599     {
4600       detect_info->rejected |= CATEGORY_MASK_SJIS;
4601       return 0;
4602     }
4603   detect_info->found |= found;
4604   return 1;
4605 }
4606
4607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4608    Return true if a text is encoded in BIG5.
        More precisely: CATEGORY_MASK_BIG5 is always recorded in
        DETECT_INFO->checked.  The scan starts CODING->head_ascii bytes
        into the source.  If the source runs out without an invalid
        sequence having been seen, return 1 and OR into
        DETECT_INFO->found the BIG5 mask (only when at least one
        two-byte sequence was seen).  On an invalid lead or trail byte,
        or when the last block ends in the middle of a character, set
        CATEGORY_MASK_BIG5 in DETECT_INFO->rejected and return 0.  */
4609
4610 static bool
4611 detect_coding_big5 (struct coding_system *coding,
4612                     struct coding_detection_info *detect_info)
4613 {
4614   const unsigned char *src = coding->source, *src_base;
4615   const unsigned char *src_end = coding->source + coding->src_bytes;
4616   bool multibytep = coding->src_multibyte;
4617   ptrdiff_t consumed_chars = 0;
4618   int found = 0;
4619   int c;
4620
4621   detect_info->checked |= CATEGORY_MASK_BIG5;
4622   /* A coding system of this category is always ASCII compatible.  */
4623   src += coding->head_ascii;
4624
4625   while (1)
4626     {
           /* Remember where this character starts; `no_more_source' uses
              it to tell a clean end of input from a truncated character.
              That label is reached only through ONE_MORE_BYTE, which is
              defined outside this function.  */
4627       src_base = src;
4628       ONE_MORE_BYTE (c);
4629       if (c < 0x80)
4630         continue;
4631       if (c >= 0xA1)
4632         {
               /* Lead byte of a two-byte code: a trail byte below 0x40
                  or in 0x7F..0xA0 is invalid.  Leave the loop so that
                  the category is marked as rejected, exactly as for an
                  invalid lead byte; returning directly here would leave
                  BIG5 unrejected for text that is not BIG5.  */
4633           ONE_MORE_BYTE (c);
4634           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4635             break;
4636           found = CATEGORY_MASK_BIG5;
4637         }
4638       else
4639         break;
4640     }
       /* Every `break' above means an invalid byte was met.  */
4641   detect_info->rejected |= CATEGORY_MASK_BIG5;
4642   return 0;
4643
4644  no_more_source:
       /* SRC beyond SRC_BASE means the input ended after part of a
          character had been read; that is an error only in the last
          block.  */
4645   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4646     {
4647       detect_info->rejected |= CATEGORY_MASK_BIG5;
4648       return 0;
4649     }
4650   detect_info->found |= found;
4651   return 1;
4652 }
4653
4654 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode SJIS bytes, starting CODING->consumed bytes into
        CODING->source, and append the resulting character codes to
        CODING->charbuf.  Decoding stops when the source is exhausted or
        when the output position reaches CHARBUF_END.  On exit
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated; each byte that cannot be decoded is stored
        individually and counted in CODING->errors.  */
4655
4656 static void
4657 decode_coding_sjis (struct coding_system *coding)
4658 {
4659   const unsigned char *src = coding->source + coding->consumed;
4660   const unsigned char *src_end = coding->source + coding->src_bytes;
4661   const unsigned char *src_base;
4662   int *charbuf = coding->charbuf + coding->charbuf_used;
4663   /* We may produce one charset annotation in one loop and one more at
4664      the end.  */
4665   int *charbuf_end
4666     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4667   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4668   bool multibytep = coding->src_multibyte;
4669   struct charset *charset_roman, *charset_kanji, *charset_kana;
4670   struct charset *charset_kanji2;
4671   Lisp_Object attrs, charset_list, val;
4672   ptrdiff_t char_offset = coding->produced_char;
4673   ptrdiff_t last_offset = char_offset;
4674   int last_id = charset_ascii;
4675   bool eol_dos
4676     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4677   int byte_after_cr = -1;
4678
4679   CODING_GET_INFO (coding, attrs, charset_list);
4680
       /* The charset list is read in order: the charset for bytes below
          0x80, the kana charset, the kanji charset and, if present, a
          second kanji charset.  */
4681   val = charset_list;
4682   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4683   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4684   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4685   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4686
4687   while (1)
4688     {
4689       int c, c1;
4690       struct charset *charset;
4691
           /* Record the start of this character so that it can be
              re-read (invalid_code) or left unconsumed
              (no_more_source).  */
4692       src_base = src;
4693       consumed_chars_base = consumed_chars;
4694
4695       if (charbuf >= charbuf_end)
4696         {
               /* Output buffer full.  If a byte was pre-fetched after a
                  CR, step SRC_BASE back so that the byte is not reported
                  as consumed.  */
4697           if (byte_after_cr >= 0)
4698             src_base--;
4699           break;
4700         }
4701
4702       if (byte_after_cr >= 0)
4703         c = byte_after_cr, byte_after_cr = -1;
4704       else
4705         ONE_MORE_BYTE (c);
4706       if (c < 0)
4707         goto invalid_code;
4708       if (c < 0x80)
4709         {
               /* With DOS end-of-line decoding, pre-fetch the byte that
                  follows a CR; it is processed on the next iteration.
                  NOTE(review): presumably this keeps a CR from being the
                  last byte consumed from a block -- confirm.  */
4710           if (eol_dos && c == '\r')
4711             ONE_MORE_BYTE (byte_after_cr);
4712           charset = charset_roman;
4713         }
4714       else if (c == 0x80 || c == 0xA0)
4715         goto invalid_code;
4716       else if (c >= 0xA1 && c <= 0xDF)
4717         {
4718           /* SJIS -> JISX0201-Kana */
4719           c &= 0x7F;
4720           charset = charset_kana;
4721         }
           /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
4722       else if (c <= 0xEF)
4723         {
4724           /* SJIS -> JISX0208 */
4725           ONE_MORE_BYTE (c1);
4726           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4727             goto invalid_code;
4728           c = (c << 8) | c1;
4729           SJIS_TO_JIS (c);
4730           charset = charset_kanji;
4731         }
           /* Lead bytes 0xF0..0xFC are valid only when the charset list
              provides a fourth charset.  */
4732       else if (c <= 0xFC && charset_kanji2)
4733         {
4734           /* SJIS -> JISX0213-2 */
4735           ONE_MORE_BYTE (c1);
4736           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4737             goto invalid_code;
4738           c = (c << 8) | c1;
4739           SJIS_TO_JIS2 (c);
4740           charset = charset_kanji2;
4741         }
4742       else
4743         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the run
              of the previous non-ASCII charset (if any) and start a new
              run at the current character offset.  */
4744       if (charset->id != charset_ascii
4745           && last_id != charset->id)
4746         {
4747           if (last_id != charset_ascii)
4748             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4749           last_id = charset->id;
4750           last_offset = char_offset;
4751         }
4752       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4753       *charbuf++ = c;
4754       char_offset++;
4755       continue;
4756
4757     invalid_code:
           /* Rewind to the start of the offending sequence and store only
              its first element: a byte as BYTE8_TO_CHAR of it, a negative
              value from ONE_MORE_BYTE as its negation.  The remaining
              bytes are decoded again from the next iteration.  */
4758       src = src_base;
4759       consumed_chars = consumed_chars_base;
4760       ONE_MORE_BYTE (c);
4761       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4762       char_offset++;
4763       coding->errors++;
4764     }
4765
4766  no_more_source:
       /* Flush the pending charset run and report progress up to
          SRC_BASE, the start of the first character not yet decoded.  */
4767   if (last_id != charset_ascii)
4768     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4769   coding->consumed_char += consumed_chars_base;
4770   coding->consumed = src_base - coding->source;
4771   coding->charbuf_used = charbuf - coding->charbuf;
4772 }
4773
     /* Decode BIG5 bytes, starting CODING->consumed bytes into
        CODING->source, and append the resulting character codes to
        CODING->charbuf.  Decoding stops when the source is exhausted or
        when the output position reaches CHARBUF_END.  On exit
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated; each byte that cannot be decoded is stored
        individually and counted in CODING->errors.  */
4774 static void
4775 decode_coding_big5 (struct coding_system *coding)
4776 {
4777   const unsigned char *src = coding->source + coding->consumed;
4778   const unsigned char *src_end = coding->source + coding->src_bytes;
4779   const unsigned char *src_base;
4780   int *charbuf = coding->charbuf + coding->charbuf_used;
4781   /* We may produce one charset annotation in one loop and one more at
4782      the end.  */
4783   int *charbuf_end
4784     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4785   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4786   bool multibytep = coding->src_multibyte;
4787   struct charset *charset_roman, *charset_big5;
4788   Lisp_Object attrs, charset_list, val;
4789   ptrdiff_t char_offset = coding->produced_char;
4790   ptrdiff_t last_offset = char_offset;
4791   int last_id = charset_ascii;
4792   bool eol_dos
4793     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4794   int byte_after_cr = -1;
4795
4796   CODING_GET_INFO (coding, attrs, charset_list);
       /* The first charset in the list is used for bytes below 0x80,
          the second for two-byte codes.  */
4797   val = charset_list;
4798   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4799   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4800
4801   while (1)
4802     {
4803       int c, c1;
4804       struct charset *charset;
4805
           /* Record the start of this character so that it can be
              re-read (invalid_code) or left unconsumed
              (no_more_source).  */
4806       src_base = src;
4807       consumed_chars_base = consumed_chars;
4808
4809       if (charbuf >= charbuf_end)
4810         {
               /* Output buffer full.  If a byte was pre-fetched after a
                  CR, step SRC_BASE back so that the byte is not reported
                  as consumed.  */
4811           if (byte_after_cr >= 0)
4812             src_base--;
4813           break;
4814         }
4815
4816       if (byte_after_cr >= 0)
4817         c = byte_after_cr, byte_after_cr = -1;
4818       else
4819         ONE_MORE_BYTE (c);
4820
4821       if (c < 0)
4822         goto invalid_code;
4823       if (c < 0x80)
4824         {
               /* With DOS end-of-line decoding, pre-fetch the byte that
                  follows a CR; it is processed on the next iteration.  */
4825           if (eol_dos && c == '\r')
4826             ONE_MORE_BYTE (byte_after_cr);
4827           charset = charset_roman;
4828         }
4829       else
4830         {
4831           /* BIG5 -> Big5 */
               /* Lead byte must be in 0xA1..0xFE; trail byte in
                  0x40..0x7E or 0xA1..0xFE.  */
4832           if (c < 0xA1 || c > 0xFE)
4833             goto invalid_code;
4834           ONE_MORE_BYTE (c1);
4835           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4836             goto invalid_code;
4837           c = c << 8 | c1;
4838           charset = charset_big5;
4839         }
           /* On a switch to a different non-ASCII charset, close the run
              of the previous non-ASCII charset (if any) and start a new
              run at the current character offset.  */
4840       if (charset->id != charset_ascii
4841           && last_id != charset->id)
4842         {
4843           if (last_id != charset_ascii)
4844             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845           last_id = charset->id;
4846           last_offset = char_offset;
4847         }
4848       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4849       *charbuf++ = c;
4850       char_offset++;
4851       continue;
4852
4853     invalid_code:
           /* Rewind to the start of the offending sequence and store only
              its first element: a byte as BYTE8_TO_CHAR of it, a negative
              value from ONE_MORE_BYTE as its negation.  The remaining
              bytes are decoded again from the next iteration.  */
4854       src = src_base;
4855       consumed_chars = consumed_chars_base;
4856       ONE_MORE_BYTE (c);
4857       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4858       char_offset++;
4859       coding->errors++;
4860     }
4861
4862  no_more_source:
       /* Flush the pending charset run and report progress up to
          SRC_BASE, the start of the first character not yet decoded.  */
4863   if (last_id != charset_ascii)
4864     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4865   coding->consumed_char += consumed_chars_base;
4866   coding->consumed = src_base - coding->source;
4867   coding->charbuf_used = charbuf - coding->charbuf;
4868 }
4869
4870 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4871    This function can encode charsets `ascii', `katakana-jisx0201',
4872    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4873    are sure that all these charsets are registered as official charset
4874    (i.e. do not have extended leading-codes).  Characters of other
4875    charsets are produced without any encoding.
        NOTE(review): the charsets actually handled below are taken from
        the coding system's charset list (second entry: kana, third:
        kanji, optional fourth: a second kanji charset); the big5 names
        above look stale for this function -- confirm.

        Encode the characters in CODING->charbuf and append the bytes at
        CODING->destination + CODING->produced.  CODING->produced and
        CODING->produced_char are updated on the normal exit.  Always
        return 0.  */
4876
4877 static bool
4878 encode_coding_sjis (struct coding_system *coding)
4879 {
4880   bool multibytep = coding->dst_multibyte;
4881   int *charbuf = coding->charbuf;
4882   int *charbuf_end = charbuf + coding->charbuf_used;
4883   unsigned char *dst = coding->destination + coding->produced;
4884   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4885   int safe_room = 4;
4886   ptrdiff_t produced_chars = 0;
4887   Lisp_Object attrs, charset_list, val;
4888   bool ascii_compatible;
4889   struct charset *charset_kanji, *charset_kana;
4890   struct charset *charset_kanji2;
4891   int c;
4892
4893   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the following entries are
          kana, kanji and, if present, a second kanji charset.  */
4894   val = XCDR (charset_list);
4895   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4896   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4897   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4898
4899   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4900
4901   while (charbuf < charbuf_end)
4902     {
4903       ASSURE_DESTINATION (safe_room);
4904       c = *charbuf++;
4905       /* Now encode the character C.  */
4906       if (ASCII_CHAR_P (c) && ascii_compatible)
4907         EMIT_ONE_ASCII_BYTE (c);
4908       else if (CHAR_BYTE8_P (c))
4909         {
               /* A raw 8-bit character is written back as its byte.  */
4910           c = CHAR_TO_BYTE8 (c);
4911           EMIT_ONE_BYTE (c);
4912         }
4913       else
4914         {
4915           unsigned code;
4916           struct charset *charset;
4917           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4918                                &code, charset);
4919
               /* No charset of the list can represent C: in safe-encoding
                  mode use the CODING_INHIBIT_CHARACTER_SUBSTITUTION code
                  with the ASCII charset, otherwise substitute
                  CODING->default_char and look it up again.  */
4920           if (!charset)
4921             {
4922               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4923                 {
4924                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4925                   charset = CHARSET_FROM_ID (charset_ascii);
4926                 }
4927               else
4928                 {
4929                   c = coding->default_char;
4930                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4931                                        charset_list, &code, charset);
4932                 }
4933             }
4934           if (code == CHARSET_INVALID_CODE (charset))
4935             emacs_abort ();
4936           if (charset == charset_kanji)
4937             {
4938               int c1, c2;
4939               JIS_TO_SJIS (code);
4940               c1 = code >> 8, c2 = code & 0xFF;
4941               EMIT_TWO_BYTES (c1, c2);
4942             }
4943           else if (charset == charset_kana)
4944             EMIT_ONE_BYTE (code | 0x80);
4945           else if (charset_kanji2 && charset == charset_kanji2)
4946             {
4947               int c1, c2;
4948
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F, or at least 0x6E are converted
                      with JIS_TO_SJIS2; any other code is emitted as a
                      single 7-bit byte.  */
4949               c1 = code >> 8;
4950               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4951                   || c1 == 0x28
4952                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4953                 {
4954                   JIS_TO_SJIS2 (code);
4955                   c1 = code >> 8, c2 = code & 0xFF;
4956                   EMIT_TWO_BYTES (c1, c2);
4957                 }
4958               else
4959                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4960             }
4961           else
4962             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4963         }
4964     }
4965   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4966   coding->produced_char += produced_chars;
4967   coding->produced = dst - coding->destination;
4968   return 0;
4969 }
4970
     /* Encode the characters in CODING->charbuf as BIG5 and append the
        bytes at CODING->destination + CODING->produced.  A character of
        the second charset in the coding system's charset list is
        written as the two bytes of its code; see the body for the other
        cases.  CODING->produced and CODING->produced_char are updated on
        the normal exit.  Always return 0.  */
4971 static bool
4972 encode_coding_big5 (struct coding_system *coding)
4973 {
4974   bool multibytep = coding->dst_multibyte;
4975   int *charbuf = coding->charbuf;
4976   int *charbuf_end = charbuf + coding->charbuf_used;
4977   unsigned char *dst = coding->destination + coding->produced;
4978   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4979   int safe_room = 4;
4980   ptrdiff_t produced_chars = 0;
4981   Lisp_Object attrs, charset_list, val;
4982   bool ascii_compatible;
4983   struct charset *charset_big5;
4984   int c;
4985
4986   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the second one is the
          two-byte charset.  */
4987   val = XCDR (charset_list);
4988   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4989   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4990
4991   while (charbuf < charbuf_end)
4992     {
4993       ASSURE_DESTINATION (safe_room);
4994       c = *charbuf++;
4995       /* Now encode the character C.  */
4996       if (ASCII_CHAR_P (c) && ascii_compatible)
4997         EMIT_ONE_ASCII_BYTE (c);
4998       else if (CHAR_BYTE8_P (c))
4999         {
               /* A raw 8-bit character is written back as its byte.  */
5000           c = CHAR_TO_BYTE8 (c);
5001           EMIT_ONE_BYTE (c);
5002         }
5003       else
5004         {
5005           unsigned code;
5006           struct charset *charset;
5007           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5008                                &code, charset);
5009
               /* No charset of the list can represent C: in safe-encoding
                  mode use the CODING_INHIBIT_CHARACTER_SUBSTITUTION code
                  with the ASCII charset, otherwise substitute
                  CODING->default_char and look it up again.  */
5010           if (! charset)
5011             {
5012               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5013                 {
5014                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5015                   charset = CHARSET_FROM_ID (charset_ascii);
5016                 }
5017               else
5018                 {
5019                   c = coding->default_char;
5020                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5021                                        charset_list, &code, charset);
5022                 }
5023             }
5024           if (code == CHARSET_INVALID_CODE (charset))
5025             emacs_abort ();
5026           if (charset == charset_big5)
5027             {
5028               int c1, c2;
5029
                   /* The code already holds the two BIG5 bytes: high byte
                      first, then low byte.  */
5030               c1 = code >> 8, c2 = code & 0xFF;
5031               EMIT_TWO_BYTES (c1, c2);
5032             }
5033           else
5034             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5035         }
5036     }
5037   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5038   coding->produced_char += produced_chars;
5039   coding->produced = dst - coding->destination;
5040   return 0;
5041 }
5042
5043 \f
5044 /*** 9. CCL handlers ***/
5045
5046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5047    Return true if a text is encoded in a coding system of which
5048    encoder/decoder are written in CCL program.
        More precisely: CATEGORY_MASK_CCL is always recorded in
        DETECT_INFO->checked.  Each source byte is looked up in the
        table of valid bytes of the coding system registered for the CCL
        category.  A negative value from ONE_MORE_BYTE, or a byte whose
        entry is zero, sets CATEGORY_MASK_CCL in DETECT_INFO->rejected
        and returns 0.  If the source runs out first, return 1 and OR
        the CCL mask into DETECT_INFO->found when at least one byte with
        an entry greater than 1 was seen.  */
5049
5050 static bool
5051 detect_coding_ccl (struct coding_system *coding,
5052                    struct coding_detection_info *detect_info)
5053 {
5054   const unsigned char *src = coding->source, *src_base;
5055   const unsigned char *src_end = coding->source + coding->src_bytes;
5056   bool multibytep = coding->src_multibyte;
5057   ptrdiff_t consumed_chars = 0;
5058   int found = 0;
5059   unsigned char *valids;
5060   ptrdiff_t head_ascii = coding->head_ascii;
5061   Lisp_Object attrs;
5062
5063   detect_info->checked |= CATEGORY_MASK_CCL;
5064
       /* From here on CODING refers to the coding system of the CCL
          category, not to the argument; everything needed from the
          argument was copied into locals above.  */
5065   coding = &coding_categories[coding_category_ccl];
5066   valids = CODING_CCL_VALIDS (coding);
5067   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes are skipped only for an
          ASCII-compatible coding system.  */
5068   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5069     src += head_ascii;
5070
5071   while (1)
5072     {
5073       int c;
5074
5075       src_base = src;
5076       ONE_MORE_BYTE (c);
5077       if (c < 0 || ! valids[c])
5078         break;
           /* An entry of exactly 1 accepts the byte without counting it
              as evidence for this category.  */
5079       if ((valids[c] > 1))
5080         found = CATEGORY_MASK_CCL;
5081     }
5082   detect_info->rejected |= CATEGORY_MASK_CCL;
5083   return 0;
5084
       /* Reached only through ONE_MORE_BYTE (defined outside this
          function).  */
5085  no_more_source:
5086   detect_info->found |= found;
5087   return 1;
5088 }
5089
     /* Decode the source, starting CODING->consumed bytes into
        CODING->source, by running the CCL program CODING->spec.ccl->ccl
        through ccl_driver, which writes to CODING->charbuf.  The source
        is fed in batches of at most 1024 elements: characters when the
        source is multibyte, bytes otherwise.  The final CCL status is
        translated into a conversion result, and CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated.  */
5090 static void
5091 decode_coding_ccl (struct coding_system *coding)
5092 {
5093   const unsigned char *src = coding->source + coding->consumed;
5094   const unsigned char *src_end = coding->source + coding->src_bytes;
5095   int *charbuf = coding->charbuf + coding->charbuf_used;
5096   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5097   ptrdiff_t consumed_chars = 0;
5098   bool multibytep = coding->src_multibyte;
5099   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One batch of input for ccl_driver.  In the multibyte case
          source_byteidx[K] is the byte offset from SRC of the K-th
          element of the batch, with one extra entry for the end of the
          batch.  */
5100   int source_charbuf[1024];
5101   int source_byteidx[1025];
5102   Lisp_Object attrs, charset_list;
5103
5104   CODING_GET_INFO (coding, attrs, charset_list);
5105
5106   while (1)
5107     {
5108       const unsigned char *p = src;
5109       ptrdiff_t offset;
5110       int i = 0;
5111
5112       if (multibytep)
5113         {
5114           while (i < 1024 && p < src_end)
5115             {
5116               source_byteidx[i] = p - src;
5117               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5118             }
5119           source_byteidx[i] = p - src;
5120         }
5121       else
5122         while (i < 1024 && p < src_end)
5123           source_charbuf[i++] = *p++;
5124
           /* This batch reaches the end of the source and no further
              block will follow.  */
5125       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5126         ccl->last_block = 1;
5127       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5128       charset_map_loaded = 0;
5129       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5130                   charset_list);
           /* If a charset map was loaded and coding_change_source
              reports a non-zero offset, shift the source pointers by
              it.  */
5131       if (charset_map_loaded
5132           && (offset = coding_change_source (coding)))
5133         {
5134           p += offset;
5135           src += offset;
5136           src_end += offset;
5137         }
5138       charbuf += ccl->produced;
           /* ccl->consumed counts batch elements; convert it to a byte
              count when the elements were multibyte characters.  */
5139       if (multibytep)
5140         src += source_byteidx[ccl->consumed];
5141       else
5142         src += ccl->consumed;
5143       consumed_chars += ccl->consumed;
           /* Continue only while there is more source to feed and the
              program stopped for lack of input.  */
5144       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5145         break;
5146     }
5147
5148   switch (ccl->status)
5149     {
5150     case CCL_STAT_SUSPEND_BY_SRC:
5151       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5152       break;
5153     case CCL_STAT_SUSPEND_BY_DST:
5154       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5155       break;
5156     case CCL_STAT_QUIT:
5157     case CCL_STAT_INVALID_CMD:
5158       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5159       break;
5160     default:
5161       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5162       break;
5163     }
5164   coding->consumed_char += consumed_chars;
5165   coding->consumed = src - coding->source;
5166   coding->charbuf_used = charbuf - coding->charbuf;
5167 }
5168
     /* Encode the characters in CODING->charbuf by running the CCL
        program CODING->spec.ccl->ccl through ccl_driver, at most 1024
        output elements per call, and append the low 8 bits of each
        produced element at CODING->destination + CODING->produced.  The
        final CCL status is translated into a conversion result;
        CODING->produced and CODING->produced_char are updated.  Always
        return 0.  */
5169 static bool
5170 encode_coding_ccl (struct coding_system *coding)
5171 {
5172   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5173   bool multibytep = coding->dst_multibyte;
5174   int *charbuf = coding->charbuf;
5175   int *charbuf_end = charbuf + coding->charbuf_used;
5176   unsigned char *dst = coding->destination + coding->produced;
5177   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5178   int destination_charbuf[1024];
5179   ptrdiff_t produced_chars = 0;
5180   int i;
5181   Lisp_Object attrs, charset_list;
5182
5183   CODING_GET_INFO (coding, attrs, charset_list);
       /* All source characters have been consumed and no further block
          will follow.  */
5184   if (coding->consumed_char == coding->src_chars
5185       && coding->mode & CODING_MODE_LAST_BLOCK)
5186     ccl->last_block = 1;
5187
5188   do
5189     {
5190       ptrdiff_t offset;
5191
5192       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5193       charset_map_loaded = 0;
5194       ccl_driver (ccl, charbuf, destination_charbuf,
5195                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and coding_change_destination
              reports a non-zero offset, shift DST by it.  */
5196       if (charset_map_loaded
5197           && (offset = coding_change_destination (coding)))
5198         dst += offset;
5199       if (multibytep)
5200         {
               /* Twice as much room as elements is requested here;
                  EMIT_ONE_BYTE (defined elsewhere) presumably can write
                  two bytes per element in a multibyte destination and
                  keeps PRODUCED_CHARS up to date -- confirm.  */
5201           ASSURE_DESTINATION (ccl->produced * 2);
5202           for (i = 0; i < ccl->produced; i++)
5203             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5204         }
5205       else
5206         {
5207           ASSURE_DESTINATION (ccl->produced);
5208           for (i = 0; i < ccl->produced; i++)
5209             *dst++ = destination_charbuf[i] & 0xFF;
5210           produced_chars += ccl->produced;
5211         }
5212       charbuf += ccl->consumed;
           /* The program quit or met an invalid command: stop even if
              input remains.  */
5213       if (ccl->status == CCL_STAT_QUIT
5214           || ccl->status == CCL_STAT_INVALID_CMD)
5215         break;
5216     }
5217   while (charbuf < charbuf_end);
5218
5219   switch (ccl->status)
5220     {
5221     case CCL_STAT_SUSPEND_BY_SRC:
5222       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5223       break;
5224     case CCL_STAT_SUSPEND_BY_DST:
5225       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5226       break;
5227     case CCL_STAT_QUIT:
5228     case CCL_STAT_INVALID_CMD:
5229       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5230       break;
5231     default:
5232       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5233       break;
5234     }
5235
5236   coding->produced_char += produced_chars;
5237   coding->produced = dst - coding->destination;
5238   return 0;
5239 }
5240
5241 \f
5242 /*** 10, 11. no-conversion handlers ***/
5243
5244 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Nothing is copied: CODING->chars_at_source is set and the whole
        source is reported as consumed.  The one exception is DOS
        end-of-line decoding with a source that ends in CR: that CR is
        left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is recorded.
        An empty source is reported as a plain success.  */
5245
5246 static void
5247 decode_coding_raw_text (struct coding_system *coding)
5248 {
5249   bool eol_dos
5250     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5251
5252   coding->chars_at_source = 1;
5253   coding->consumed_char = coding->src_chars;
5254   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source the index SRC_BYTES - 1 would be out of bounds.  */
5255   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5256     {
5257       coding->consumed_char--;
5258       coding->consumed--;
5259       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5260     }
5261   else
5262     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5263 }
5264
     /* Encode the characters in CODING->charbuf as raw text and append
        the bytes at CODING->destination + CODING->produced.  Four cases
        are distinguished by whether the destination and the source are
        multibyte.  CODING->produced and CODING->produced_char are
        updated on the normal exit.  Always return 0.  */
5265 static bool
5266 encode_coding_raw_text (struct coding_system *coding)
5267 {
5268   bool multibytep = coding->dst_multibyte;
5269   int *charbuf = coding->charbuf;
5270   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5271   unsigned char *dst = coding->destination + coding->produced;
5272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5273   ptrdiff_t produced_chars = 0;
5274   int c;
5275
5276   if (multibytep)
5277     {
5278       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5279
5280       if (coding->src_multibyte)
5281         while (charbuf < charbuf_end)
5282           {
5283             ASSURE_DESTINATION (safe_room);
5284             c = *charbuf++;
5285             if (ASCII_CHAR_P (c))
5286               EMIT_ONE_ASCII_BYTE (c);
5287             else if (CHAR_BYTE8_P (c))
5288               {
5289                 c = CHAR_TO_BYTE8 (c);
5290                 EMIT_ONE_BYTE (c);
5291               }
5292             else
5293               {
                     /* Write C's multibyte form into STR, then emit
                        those bytes one at a time.  */
5294                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5295
5296                 CHAR_STRING_ADVANCE (c, p1);
5297                 do
5298                   {
5299                     EMIT_ONE_BYTE (*p0);
5300                     p0++;
5301                   }
5302                 while (p0 < p1);
5303               }
5304           }
5305       else
             /* Unibyte source: every charbuf element is emitted as one
                byte.  */
5306         while (charbuf < charbuf_end)
5307           {
5308             ASSURE_DESTINATION (safe_room);
5309             c = *charbuf++;
5310             EMIT_ONE_BYTE (c);
5311           }
5312     }
5313   else
5314     {
5315       if (coding->src_multibyte)
5316         {
5317           int safe_room = MAX_MULTIBYTE_LENGTH;
5318
5319           while (charbuf < charbuf_end)
5320             {
5321               ASSURE_DESTINATION (safe_room);
5322               c = *charbuf++;
5323               if (ASCII_CHAR_P (c))
5324                 *dst++ = c;
5325               else if (CHAR_BYTE8_P (c))
5326                 *dst++ = CHAR_TO_BYTE8 (c);
5327               else
5328                 CHAR_STRING_ADVANCE (c, dst);
5329             }
5330         }
5331       else
5332         {
               /* Unibyte to unibyte: request room for all remaining
                  elements at once, then copy them until the input or the
                  destination is exhausted.  */
5333           ASSURE_DESTINATION (charbuf_end - charbuf);
5334           while (charbuf < charbuf_end && dst < dst_end)
5335             *dst++ = *charbuf++;
5336         }
           /* For a unibyte destination the character count is the number
              of bytes written by this call.  */
5337       produced_chars = dst - (coding->destination + coding->produced);
5338     }
5339   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5340   coding->produced_char += produced_chars;
5341   coding->produced = dst - coding->destination;
5342   return 0;
5343 }
5344
5345 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5346    Return true if a text is encoded in a charset-based coding system.
        More precisely: CATEGORY_MASK_CHARSET is always recorded in
        DETECT_INFO->checked.  Each leading byte is looked up in the
        `charset_valids' attribute of the coding system registered for
        the charset category; the entry is nil (byte not valid), a
        charset id, or a list of charset ids.  Trailing bytes are
        checked against the code space of the charset(s).  On an invalid
        byte, or when the source ends inside a multi-byte character, set
        CATEGORY_MASK_CHARSET in DETECT_INFO->rejected and return 0.
        Otherwise return 1 and OR the charset mask into
        DETECT_INFO->found when at least one accepted leading byte was
        0x80 or greater.  */
5347
5348 static bool
5349 detect_coding_charset (struct coding_system *coding,
5350                        struct coding_detection_info *detect_info)
5351 {
5352   const unsigned char *src = coding->source, *src_base;
5353   const unsigned char *src_end = coding->source + coding->src_bytes;
5354   bool multibytep = coding->src_multibyte;
5355   ptrdiff_t consumed_chars = 0;
5356   Lisp_Object attrs, valids, name;
5357   int found = 0;
5358   ptrdiff_t head_ascii = coding->head_ascii;
5359   bool check_latin_extra = 0;
5360
5361   detect_info->checked |= CATEGORY_MASK_CHARSET;
5362
       /* From here on CODING refers to the coding system of the charset
          category, not to the argument.  */
5363   coding = &coding_categories[coding_category_charset];
5364   attrs = CODING_ID_ATTRS (coding->id);
5365   valids = AREF (attrs, coding_attr_charset_valids);
5366   name = CODING_ID_NAME (coding->id);
       /* Coding systems whose name starts with "iso-8859-" or
          "iso-latin-" get an additional check on bytes 0x80..0x9F
          below.  */
5367   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5368                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5369       || strncmp (SSDATA (SYMBOL_NAME (name)),
5370                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5371     check_latin_extra = 1;
5372
5373   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5374     src += head_ascii;
5375
5376   while (1)
5377     {
5378       int c;
5379       Lisp_Object val;
5380       struct charset *charset;
5381       int dim, idx;
5382
5383       src_base = src;
5384       ONE_MORE_BYTE (c);
           /* A negative value is skipped without being judged.  */
5385       if (c < 0)
5386         continue;
5387       val = AREF (valids, c);
5388       if (NILP (val))
5389         break;
5390       if (c >= 0x80)
5391         {
               /* For the Latin coding systems selected above, a byte in
                  0x80..0x9F is accepted only if Vlatin_extra_code_table
                  is a vector with a non-nil entry for it.  */
5392           if (c < 0xA0
5393               && check_latin_extra
5394               && (!VECTORP (Vlatin_extra_code_table)
5395                   || NILP (AREF (Vlatin_extra_code_table, c))))
5396             break;
5397           found = CATEGORY_MASK_CHARSET;
5398         }
5399       if (INTEGERP (val))
5400         {
               /* A single charset: each of its remaining DIM - 1 bytes
                  must fall within the corresponding code-space range.  */
5401           charset = CHARSET_FROM_ID (XFASTINT (val));
5402           dim = CHARSET_DIMENSION (charset);
5403           for (idx = 1; idx < dim; idx++)
5404             {
5405               if (src == src_end)
5406                 goto too_short;
5407               ONE_MORE_BYTE (c);
5408               if (c < charset->code_space[(dim - 1 - idx) * 4]
5409                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5410                 break;
5411             }
5412           if (idx < dim)
5413             break;
5414         }
5415       else
5416         {
               /* A list of candidate charsets, tried in order.  IDX is
                  not reset between candidates, so trailing bytes already
                  read are not read again.  VAL is set to nil as soon as
                  one candidate matches; VAL still being a cons after the
                  loop means that a candidate failed on a trailing
                  byte.  */
5417           idx = 1;
5418           for (; CONSP (val); val = XCDR (val))
5419             {
5420               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5421               dim = CHARSET_DIMENSION (charset);
5422               while (idx < dim)
5423                 {
5424                   if (src == src_end)
5425                     goto too_short;
5426                   ONE_MORE_BYTE (c);
5427                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5428                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5429                     break;
5430                   idx++;
5431                 }
5432               if (idx == dim)
5433                 {
5434                   val = Qnil;
5435                   break;
5436                 }
5437             }
5438           if (CONSP (val))
5439             break;
5440         }
5441     }
       /* Reached both by leaving the loop on an invalid byte and by the
          explicit jumps taken when the source ends in the middle of a
          multi-byte character.  */
5442  too_short:
5443   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5444   return 0;
5445
5446  no_more_source:
5447   detect_info->found |= found;
5448   return 1;
5449 }
5450
5451 static void
5452 decode_coding_charset (struct coding_system *coding)
5453 {
5454   const unsigned char *src = coding->source + coding->consumed;
5455   const unsigned char *src_end = coding->source + coding->src_bytes;
5456   const unsigned char *src_base;
5457   int *charbuf = coding->charbuf + coding->charbuf_used;
5458   /* We may produce one charset annotation in one loop and one more at
5459      the end.  */
5460   int *charbuf_end
5461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5462   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5463   bool multibytep = coding->src_multibyte;
5464   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5465   Lisp_Object valids;
5466   ptrdiff_t char_offset = coding->produced_char;
5467   ptrdiff_t last_offset = char_offset;
5468   int last_id = charset_ascii;
5469   bool eol_dos
5470     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5471   int byte_after_cr = -1;
5472
5473   valids = AREF (attrs, coding_attr_charset_valids);
5474
5475   while (1)
5476     {
5477       int c;
5478       Lisp_Object val;
5479       struct charset *charset;
5480       int dim;
5481       int len = 1;
5482       unsigned code;
5483
5484       src_base = src;
5485       consumed_chars_base = consumed_chars;
5486
5487       if (charbuf >= charbuf_end)
5488         {
5489           if (byte_after_cr >= 0)
5490             src_base--;
5491           break;
5492         }
5493
5494       if (byte_after_cr >= 0)
5495         {
5496           c = byte_after_cr;
5497           byte_after_cr = -1;
5498         }
5499       else
5500         {
5501           ONE_MORE_BYTE (c);
5502           if (eol_dos && c == '\r')
5503             ONE_MORE_BYTE (byte_after_cr);
5504         }
5505       if (c < 0)
5506         goto invalid_code;
5507       code = c;
5508
5509       val = AREF (valids, c);
5510       if (! INTEGERP (val) && ! CONSP (val))
5511         goto invalid_code;
5512       if (INTEGERP (val))
5513         {
5514           charset = CHARSET_FROM_ID (XFASTINT (val));
5515           dim = CHARSET_DIMENSION (charset);
5516           while (len < dim)
5517             {
5518               ONE_MORE_BYTE (c);
5519               code = (code << 8) | c;
5520               len++;
5521             }
5522           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5523                               charset, code, c);
5524         }
5525       else
5526         {
5527           /* VAL is a list of charset IDs.  It is assured that the
5528              list is sorted by charset dimensions (smaller one
5529              comes first).  */
5530           while (CONSP (val))
5531             {
5532               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5533               dim = CHARSET_DIMENSION (charset);
5534               while (len < dim)
5535                 {
5536                   ONE_MORE_BYTE (c);
5537                   code = (code << 8) | c;
5538                   len++;
5539                 }
5540               CODING_DECODE_CHAR (coding, src, src_base,
5541                                   src_end, charset, code, c);
5542               if (c >= 0)
5543                 break;
5544               val = XCDR (val);
5545             }
5546         }
5547       if (c < 0)
5548         goto invalid_code;
5549       if (charset->id != charset_ascii
5550           && last_id != charset->id)
5551         {
5552           if (last_id != charset_ascii)
5553             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5554           last_id = charset->id;
5555           last_offset = char_offset;
5556         }
5557
5558       *charbuf++ = c;
5559       char_offset++;
5560       continue;
5561
5562     invalid_code:
5563       src = src_base;
5564       consumed_chars = consumed_chars_base;
5565       ONE_MORE_BYTE (c);
5566       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5567       char_offset++;
5568       coding->errors++;
5569     }
5570
5571  no_more_source:
5572   if (last_id != charset_ascii)
5573     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5574   coding->consumed_char += consumed_chars_base;
5575   coding->consumed = src_base - coding->source;
5576   coding->charbuf_used = charbuf - coding->charbuf;
5577 }
5578
5579 static bool
5580 encode_coding_charset (struct coding_system *coding)
5581 {
5582   bool multibytep = coding->dst_multibyte;
5583   int *charbuf = coding->charbuf;
5584   int *charbuf_end = charbuf + coding->charbuf_used;
5585   unsigned char *dst = coding->destination + coding->produced;
5586   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5587   int safe_room = MAX_MULTIBYTE_LENGTH;
5588   ptrdiff_t produced_chars = 0;
5589   Lisp_Object attrs, charset_list;
5590   bool ascii_compatible;
5591   int c;
5592
5593   CODING_GET_INFO (coding, attrs, charset_list);
5594   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5595
5596   while (charbuf < charbuf_end)
5597     {
5598       struct charset *charset;
5599       unsigned code;
5600
5601       ASSURE_DESTINATION (safe_room);
5602       c = *charbuf++;
5603       if (ascii_compatible && ASCII_CHAR_P (c))
5604         EMIT_ONE_ASCII_BYTE (c);
5605       else if (CHAR_BYTE8_P (c))
5606         {
5607           c = CHAR_TO_BYTE8 (c);
5608           EMIT_ONE_BYTE (c);
5609         }
5610       else
5611         {
5612           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5613                                &code, charset);
5614
5615           if (charset)
5616             {
5617               if (CHARSET_DIMENSION (charset) == 1)
5618                 EMIT_ONE_BYTE (code);
5619               else if (CHARSET_DIMENSION (charset) == 2)
5620                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5621               else if (CHARSET_DIMENSION (charset) == 3)
5622                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5623               else
5624                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5625                                  (code >> 8) & 0xFF, code & 0xFF);
5626             }
5627           else
5628             {
5629               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5630                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5631               else
5632                 c = coding->default_char;
5633               EMIT_ONE_BYTE (c);
5634             }
5635         }
5636     }
5637
5638   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5639   coding->produced_char += produced_chars;
5640   coding->produced = dst - coding->destination;
5641   return 0;
5642 }
5643
5644 \f
/*** 10. C library functions ***/
5646
5647 /* Setup coding context CODING from information about CODING_SYSTEM.
5648    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5649    CODING_SYSTEM is invalid, signal an error.  */
5650
5651 void
5652 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5653 {
5654   Lisp_Object attrs;
5655   Lisp_Object eol_type;
5656   Lisp_Object coding_type;
5657   Lisp_Object val;
5658
5659   if (NILP (coding_system))
5660     coding_system = Qundecided;
5661
5662   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5663
5664   attrs = CODING_ID_ATTRS (coding->id);
5665   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5666
5667   coding->mode = 0;
5668   coding->head_ascii = -1;
5669   if (VECTORP (eol_type))
5670     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5671                             | CODING_REQUIRE_DETECTION_MASK);
5672   else if (! EQ (eol_type, Qunix))
5673     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5674                             | CODING_REQUIRE_ENCODING_MASK);
5675   else
5676     coding->common_flags = 0;
5677   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5678     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5679   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5680     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5681   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5682     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5683
5684   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5685   coding->max_charset_id = SCHARS (val) - 1;
5686   coding->safe_charsets = SDATA (val);
5687   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5688   coding->carryover_bytes = 0;
5689
5690   coding_type = CODING_ATTR_TYPE (attrs);
5691   if (EQ (coding_type, Qundecided))
5692     {
5693       coding->detector = NULL;
5694       coding->decoder = decode_coding_raw_text;
5695       coding->encoder = encode_coding_raw_text;
5696       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5697     }
5698   else if (EQ (coding_type, Qiso_2022))
5699     {
5700       int i;
5701       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5702
5703       /* Invoke graphic register 0 to plane 0.  */
5704       CODING_ISO_INVOCATION (coding, 0) = 0;
5705       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5706       CODING_ISO_INVOCATION (coding, 1)
5707         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5708       /* Setup the initial status of designation.  */
5709       for (i = 0; i < 4; i++)
5710         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5711       /* Not single shifting initially.  */
5712       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5713       /* Beginning of buffer should also be regarded as bol. */
5714       CODING_ISO_BOL (coding) = 1;
5715       coding->detector = detect_coding_iso_2022;
5716       coding->decoder = decode_coding_iso_2022;
5717       coding->encoder = encode_coding_iso_2022;
5718       if (flags & CODING_ISO_FLAG_SAFE)
5719         coding->mode |= CODING_MODE_SAFE_ENCODING;
5720       coding->common_flags
5721         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5722             | CODING_REQUIRE_FLUSHING_MASK);
5723       if (flags & CODING_ISO_FLAG_COMPOSITION)
5724         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5725       if (flags & CODING_ISO_FLAG_DESIGNATION)
5726         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5727       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5728         {
5729           setup_iso_safe_charsets (attrs);
5730           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5731           coding->max_charset_id = SCHARS (val) - 1;
5732           coding->safe_charsets = SDATA (val);
5733         }
5734       CODING_ISO_FLAGS (coding) = flags;
5735       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5736       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5737       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5738       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5739     }
5740   else if (EQ (coding_type, Qcharset))
5741     {
5742       coding->detector = detect_coding_charset;
5743       coding->decoder = decode_coding_charset;
5744       coding->encoder = encode_coding_charset;
5745       coding->common_flags
5746         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5747     }
5748   else if (EQ (coding_type, Qutf_8))
5749     {
5750       val = AREF (attrs, coding_attr_utf_bom);
5751       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5752                                    : EQ (val, Qt) ? utf_with_bom
5753                                    : utf_without_bom);
5754       coding->detector = detect_coding_utf_8;
5755       coding->decoder = decode_coding_utf_8;
5756       coding->encoder = encode_coding_utf_8;
5757       coding->common_flags
5758         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5759       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5760         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5761     }
5762   else if (EQ (coding_type, Qutf_16))
5763     {
5764       val = AREF (attrs, coding_attr_utf_bom);
5765       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5766                                     : EQ (val, Qt) ? utf_with_bom
5767                                     : utf_without_bom);
5768       val = AREF (attrs, coding_attr_utf_16_endian);
5769       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5770                                        : utf_16_little_endian);
5771       CODING_UTF_16_SURROGATE (coding) = 0;
5772       coding->detector = detect_coding_utf_16;
5773       coding->decoder = decode_coding_utf_16;
5774       coding->encoder = encode_coding_utf_16;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5777       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5778         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5779     }
5780   else if (EQ (coding_type, Qccl))
5781     {
5782       coding->detector = detect_coding_ccl;
5783       coding->decoder = decode_coding_ccl;
5784       coding->encoder = encode_coding_ccl;
5785       coding->common_flags
5786         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5787             | CODING_REQUIRE_FLUSHING_MASK);
5788     }
5789   else if (EQ (coding_type, Qemacs_mule))
5790     {
5791       coding->detector = detect_coding_emacs_mule;
5792       coding->decoder = decode_coding_emacs_mule;
5793       coding->encoder = encode_coding_emacs_mule;
5794       coding->common_flags
5795         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5796       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5797           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5798         {
5799           Lisp_Object tail, safe_charsets;
5800           int max_charset_id = 0;
5801
5802           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5803                tail = XCDR (tail))
5804             if (max_charset_id < XFASTINT (XCAR (tail)))
5805               max_charset_id = XFASTINT (XCAR (tail));
5806           safe_charsets = make_uninit_string (max_charset_id + 1);
5807           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5808           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5809                tail = XCDR (tail))
5810             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5811           coding->max_charset_id = max_charset_id;
5812           coding->safe_charsets = SDATA (safe_charsets);
5813         }
5814       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5815       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5816     }
5817   else if (EQ (coding_type, Qshift_jis))
5818     {
5819       coding->detector = detect_coding_sjis;
5820       coding->decoder = decode_coding_sjis;
5821       coding->encoder = encode_coding_sjis;
5822       coding->common_flags
5823         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5824     }
5825   else if (EQ (coding_type, Qbig5))
5826     {
5827       coding->detector = detect_coding_big5;
5828       coding->decoder = decode_coding_big5;
5829       coding->encoder = encode_coding_big5;
5830       coding->common_flags
5831         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5832     }
5833   else                          /* EQ (coding_type, Qraw_text) */
5834     {
5835       coding->detector = NULL;
5836       coding->decoder = decode_coding_raw_text;
5837       coding->encoder = encode_coding_raw_text;
5838       if (! EQ (eol_type, Qunix))
5839         {
5840           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5841           if (! VECTORP (eol_type))
5842             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5843         }
5844
5845     }
5846
5847   return;
5848 }
5849
5850 /* Return a list of charsets supported by CODING.  */
5851
5852 Lisp_Object
5853 coding_charset_list (struct coding_system *coding)
5854 {
5855   Lisp_Object attrs, charset_list;
5856
5857   CODING_GET_INFO (coding, attrs, charset_list);
5858   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5859     {
5860       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5861
5862       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5863         charset_list = Viso_2022_charset_list;
5864     }
5865   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5866     {
5867       charset_list = Vemacs_mule_charset_list;
5868     }
5869   return charset_list;
5870 }
5871
5872
5873 /* Return a list of charsets supported by CODING-SYSTEM.  */
5874
5875 Lisp_Object
5876 coding_system_charset_list (Lisp_Object coding_system)
5877 {
5878   ptrdiff_t id;
5879   Lisp_Object attrs, charset_list;
5880
5881   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5882   attrs = CODING_ID_ATTRS (id);
5883
5884   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5885     {
5886       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5887
5888       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5889         charset_list = Viso_2022_charset_list;
5890       else
5891         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5892     }
5893   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5894     {
5895       charset_list = Vemacs_mule_charset_list;
5896     }
5897   else
5898     {
5899       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5900     }
5901   return charset_list;
5902 }
5903
5904
5905 /* Return raw-text or one of its subsidiaries that has the same
5906    eol_type as CODING-SYSTEM.  */
5907
5908 Lisp_Object
5909 raw_text_coding_system (Lisp_Object coding_system)
5910 {
5911   Lisp_Object spec, attrs;
5912   Lisp_Object eol_type, raw_text_eol_type;
5913
5914   if (NILP (coding_system))
5915     return Qraw_text;
5916   spec = CODING_SYSTEM_SPEC (coding_system);
5917   attrs = AREF (spec, 0);
5918
5919   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5920     return coding_system;
5921
5922   eol_type = AREF (spec, 2);
5923   if (VECTORP (eol_type))
5924     return Qraw_text;
5925   spec = CODING_SYSTEM_SPEC (Qraw_text);
5926   raw_text_eol_type = AREF (spec, 2);
5927   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5928           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5929           : AREF (raw_text_eol_type, 2));
5930 }
5931
5932
5933 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5934    the subsidiary that has the same eol-spec as PARENT (if it is not
5935    nil and specifies end-of-line format) or the system's setting
5936    (system_eol_type).  */
5937
5938 Lisp_Object
5939 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5940 {
5941   Lisp_Object spec, eol_type;
5942
5943   if (NILP (coding_system))
5944     coding_system = Qraw_text;
5945   spec = CODING_SYSTEM_SPEC (coding_system);
5946   eol_type = AREF (spec, 2);
5947   if (VECTORP (eol_type))
5948     {
5949       Lisp_Object parent_eol_type;
5950
5951       if (! NILP (parent))
5952         {
5953           Lisp_Object parent_spec;
5954
5955           parent_spec = CODING_SYSTEM_SPEC (parent);
5956           parent_eol_type = AREF (parent_spec, 2);
5957           if (VECTORP (parent_eol_type))
5958             parent_eol_type = system_eol_type;
5959         }
5960       else
5961         parent_eol_type = system_eol_type;
5962       if (EQ (parent_eol_type, Qunix))
5963         coding_system = AREF (eol_type, 0);
5964       else if (EQ (parent_eol_type, Qdos))
5965         coding_system = AREF (eol_type, 1);
5966       else if (EQ (parent_eol_type, Qmac))
5967         coding_system = AREF (eol_type, 2);
5968     }
5969   return coding_system;
5970 }
5971
5972
5973 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5974    decided for writing to a process.  If not, complement them, and
5975    return a new coding system.  */
5976
5977 Lisp_Object
5978 complement_process_encoding_system (Lisp_Object coding_system)
5979 {
5980   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5981   Lisp_Object spec, attrs;
5982   int i;
5983
5984   for (i = 0; i < 3; i++)
5985     {
5986       if (i == 1)
5987         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5988       else if (i == 2)
5989         coding_system = preferred_coding_system ();
5990       spec = CODING_SYSTEM_SPEC (coding_system);
5991       if (NILP (spec))
5992         continue;
5993       attrs = AREF (spec, 0);
5994       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5995         coding_base = CODING_ATTR_BASE_NAME (attrs);
5996       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5997         eol_base = coding_system;
5998       if (! NILP (coding_base) && ! NILP (eol_base))
5999         break;
6000     }
6001
6002   if (i > 0)
6003     /* The original CODING_SYSTEM didn't specify text-conversion or
6004        eol-conversion.  Be sure that we return a fully complemented
6005        coding system.  */
6006     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6007   return coding_system;
6008 }
6009
6010
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, as a first step,
   coding systems are grouped into the following categories:
6016
6017    o coding-category-emacs-mule
6018
6019         The category for a coding system which has the same code range
6020         as Emacs' internal format.  Assigned the coding-system (Lisp
6021         symbol) `emacs-mule' by default.
6022
6023    o coding-category-sjis
6024
6025         The category for a coding system which has the same code range
6026         as SJIS.  Assigned the coding-system (Lisp
6027         symbol) `japanese-shift-jis' by default.
6028
6029    o coding-category-iso-7
6030
6031         The category for a coding system which has the same code range
6032         as ISO2022 of 7-bit environment.  This doesn't use any locking
6033         shift and single shift functions.  This can encode/decode all
6034         charsets.  Assigned the coding-system (Lisp symbol)
6035         `iso-2022-7bit' by default.
6036
6037    o coding-category-iso-7-tight
6038
6039         Same as coding-category-iso-7 except that this can
6040         encode/decode only the specified charsets.
6041
6042    o coding-category-iso-8-1
6043
6044         The category for a coding system which has the same code range
6045         as ISO2022 of 8-bit environment and graphic plane 1 used only
6046         for DIMENSION1 charset.  This doesn't use any locking shift
6047         and single shift functions.  Assigned the coding-system (Lisp
6048         symbol) `iso-latin-1' by default.
6049
6050    o coding-category-iso-8-2
6051
6052         The category for a coding system which has the same code range
6053         as ISO2022 of 8-bit environment and graphic plane 1 used only
6054         for DIMENSION2 charset.  This doesn't use any locking shift
6055         and single shift functions.  Assigned the coding-system (Lisp
6056         symbol) `japanese-iso-8bit' by default.
6057
6058    o coding-category-iso-7-else
6059
6060         The category for a coding system which has the same code range
6061         as ISO2022 of 7-bit environment but uses locking shift or
6062         single shift functions.  Assigned the coding-system (Lisp
6063         symbol) `iso-2022-7bit-lock' by default.
6064
6065    o coding-category-iso-8-else
6066
6067         The category for a coding system which has the same code range
6068         as ISO2022 of 8-bit environment but uses locking shift or
6069         single shift functions.  Assigned the coding-system (Lisp
6070         symbol) `iso-2022-8bit-ss2' by default.
6071
6072    o coding-category-big5
6073
6074         The category for a coding system which has the same code range
6075         as BIG5.  Assigned the coding-system (Lisp symbol)
6076         `cn-big5' by default.
6077
6078    o coding-category-utf-8
6079
6080         The category for a coding system which has the same code range
6081         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6082         symbol) `utf-8' by default.
6083
6084    o coding-category-utf-16-be
6085
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in big-endian
        order at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-be' by default.
6090
6091    o coding-category-utf-16-le
6092
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in little-endian
        order at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-le' by default.
6097
6098    o coding-category-ccl
6099
        The category for a coding system whose encoder/decoder is
        written as CCL programs.  The default value is nil, i.e., no
        coding system is assigned.
6103
6104    o coding-category-binary
6105
6106         The category for a coding system not categorized in any of the
6107         above.  Assigned the coding-system (Lisp symbol)
6108         `no-conversion' by default.
6109
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.
   What Emacs actually does is detect a category of coding system,
   and then use the `coding-system' assigned to that category.  If
   Emacs cannot narrow the text down to a single category, it selects
   the candidate category of the highest priority.  Priorities of
   categories are also specified by a user in the Lisp variable
   `coding-category-list'.
6117
6118 */
6119
6120 #define EOL_SEEN_NONE   0
6121 #define EOL_SEEN_LF     1
6122 #define EOL_SEEN_CR     2
6123 #define EOL_SEEN_CRLF   4
6124
6125 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6126    SOURCE is encoded.  If CATEGORY is one of
6127    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6128    two-byte, else they are encoded by one-byte.
6129
6130    Return one of EOL_SEEN_XXX.  */
6131
6132 #define MAX_EOL_CHECK_COUNT 3
6133
6134 static int
6135 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6136             enum coding_category category)
6137 {
6138   const unsigned char *src = source, *src_end = src + src_bytes;
6139   unsigned char c;
6140   int total  = 0;
6141   int eol_seen = EOL_SEEN_NONE;
6142
6143   if ((1 << category) & CATEGORY_MASK_UTF_16)
6144     {
6145       bool msb = category == (coding_category_utf_16_le
6146                               | coding_category_utf_16_le_nosig);
6147       bool lsb = !msb;
6148
6149       while (src + 1 < src_end)
6150         {
6151           c = src[lsb];
6152           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6153             {
6154               int this_eol;
6155
6156               if (c == '\n')
6157                 this_eol = EOL_SEEN_LF;
6158               else if (src + 3 >= src_end
6159                        || src[msb + 2] != 0
6160                        || src[lsb + 2] != '\n')
6161                 this_eol = EOL_SEEN_CR;
6162               else
6163                 {
6164                   this_eol = EOL_SEEN_CRLF;
6165                   src += 2;
6166                 }
6167
6168               if (eol_seen == EOL_SEEN_NONE)
6169                 /* This is the first end-of-line.  */
6170                 eol_seen = this_eol;
6171               else if (eol_seen != this_eol)
6172                 {
6173                   /* The found type is different from what found before.
6174                      Allow for stray ^M characters in DOS EOL files.  */
6175                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6176                       || (eol_seen == EOL_SEEN_CRLF
6177                           && this_eol == EOL_SEEN_CR))
6178                     eol_seen = EOL_SEEN_CRLF;
6179                   else
6180                     {
6181                       eol_seen = EOL_SEEN_LF;
6182                       break;
6183                     }
6184                 }
6185               if (++total == MAX_EOL_CHECK_COUNT)
6186                 break;
6187             }
6188           src += 2;
6189         }
6190     }
6191   else
6192     while (src < src_end)
6193       {
6194         c = *src++;
6195         if (c == '\n' || c == '\r')
6196           {
6197             int this_eol;
6198
6199             if (c == '\n')
6200               this_eol = EOL_SEEN_LF;
6201             else if (src >= src_end || *src != '\n')
6202               this_eol = EOL_SEEN_CR;
6203             else
6204               this_eol = EOL_SEEN_CRLF, src++;
6205
6206             if (eol_seen == EOL_SEEN_NONE)
6207               /* This is the first end-of-line.  */
6208               eol_seen = this_eol;
6209             else if (eol_seen != this_eol)
6210               {
6211                 /* The found type is different from what found before.
6212                    Allow for stray ^M characters in DOS EOL files.  */
6213                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6214                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6215                   eol_seen = EOL_SEEN_CRLF;
6216                 else
6217                   {
6218                     eol_seen = EOL_SEEN_LF;
6219                     break;
6220                   }
6221               }
6222             if (++total == MAX_EOL_CHECK_COUNT)
6223               break;
6224           }
6225       }
6226   return eol_seen;
6227 }
6228
6229
6230 static Lisp_Object
6231 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6232 {
6233   Lisp_Object eol_type;
6234
6235   eol_type = CODING_ID_EOL_TYPE (coding->id);
6236   if (eol_seen & EOL_SEEN_LF)
6237     {
6238       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6239       eol_type = Qunix;
6240     }
6241   else if (eol_seen & EOL_SEEN_CRLF)
6242     {
6243       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6244       eol_type = Qdos;
6245     }
6246   else if (eol_seen & EOL_SEEN_CR)
6247     {
6248       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6249       eol_type = Qmac;
6250     }
6251   return eol_type;
6252 }
6253
6254 /* Detect how a text specified in CODING is encoded.  If a coding
6255    system is detected, update fields of CODING by the detected coding
6256    system.  */
6257
6258 static void
6259 detect_coding (struct coding_system *coding)
6260 {
6261   const unsigned char *src, *src_end;
6262   unsigned int saved_mode = coding->mode;
6263
6264   coding->consumed = coding->consumed_char = 0;
6265   coding->produced = coding->produced_char = 0;
6266   coding_set_source (coding);
6267
6268   src_end = coding->source + coding->src_bytes;
6269   coding->head_ascii = 0;
6270
6271   /* If we have not yet decided the text encoding type, detect it
6272      now.  */
6273   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6274     {
6275       int c, i;
6276       struct coding_detection_info detect_info;
6277       bool null_byte_found = 0, eight_bit_found = 0;
6278
6279       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6280       for (src = coding->source; src < src_end; src++)
6281         {
6282           c = *src;
6283           if (c & 0x80)
6284             {
6285               eight_bit_found = 1;
6286               if (null_byte_found)
6287                 break;
6288             }
6289           else if (c < 0x20)
6290             {
6291               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6292                   && ! inhibit_iso_escape_detection
6293                   && ! detect_info.checked)
6294                 {
6295                   if (detect_coding_iso_2022 (coding, &detect_info))
6296                     {
6297                       /* We have scanned the whole data.  */
6298                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6299                         {
6300                           /* We didn't find an 8-bit code.  We may
6301                              have found a null-byte, but it's very
6302                              rare that a binary file conforms to
6303                              ISO-2022.  */
6304                           src = src_end;
6305                           coding->head_ascii = src - coding->source;
6306                         }
6307                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6308                       break;
6309                     }
6310                 }
6311               else if (! c && !inhibit_null_byte_detection)
6312                 {
6313                   null_byte_found = 1;
6314                   if (eight_bit_found)
6315                     break;
6316                 }
6317               if (! eight_bit_found)
6318                 coding->head_ascii++;
6319             }
6320           else if (! eight_bit_found)
6321             coding->head_ascii++;
6322         }
6323
6324       if (null_byte_found || eight_bit_found
6325           || coding->head_ascii < coding->src_bytes
6326           || detect_info.found)
6327         {
6328           enum coding_category category;
6329           struct coding_system *this;
6330
6331           if (coding->head_ascii == coding->src_bytes)
6332             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6333             for (i = 0; i < coding_category_raw_text; i++)
6334               {
6335                 category = coding_priorities[i];
6336                 this = coding_categories + category;
6337                 if (detect_info.found & (1 << category))
6338                   break;
6339               }
6340           else
6341             {
6342               if (null_byte_found)
6343                 {
6344                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6345                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6346                 }
6347               for (i = 0; i < coding_category_raw_text; i++)
6348                 {
6349                   category = coding_priorities[i];
6350                   this = coding_categories + category;
6351                   if (this->id < 0)
6352                     {
6353                       /* No coding system of this category is defined.  */
6354                       detect_info.rejected |= (1 << category);
6355                     }
6356                   else if (category >= coding_category_raw_text)
6357                     continue;
6358                   else if (detect_info.checked & (1 << category))
6359                     {
6360                       if (detect_info.found & (1 << category))
6361                         break;
6362                     }
6363                   else if ((*(this->detector)) (coding, &detect_info)
6364                            && detect_info.found & (1 << category))
6365                     {
6366                       if (category == coding_category_utf_16_auto)
6367                         {
6368                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6369                             category = coding_category_utf_16_le;
6370                           else
6371                             category = coding_category_utf_16_be;
6372                         }
6373                       break;
6374                     }
6375                 }
6376             }
6377
6378           if (i < coding_category_raw_text)
6379             setup_coding_system (CODING_ID_NAME (this->id), coding);
6380           else if (null_byte_found)
6381             setup_coding_system (Qno_conversion, coding);
6382           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6383                    == CATEGORY_MASK_ANY)
6384             setup_coding_system (Qraw_text, coding);
6385           else if (detect_info.rejected)
6386             for (i = 0; i < coding_category_raw_text; i++)
6387               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6388                 {
6389                   this = coding_categories + coding_priorities[i];
6390                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6391                   break;
6392                 }
6393         }
6394     }
6395   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6396            == coding_category_utf_8_auto)
6397     {
6398       Lisp_Object coding_systems;
6399       struct coding_detection_info detect_info;
6400
6401       coding_systems
6402         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6403       detect_info.found = detect_info.rejected = 0;
6404       coding->head_ascii = 0;
6405       if (CONSP (coding_systems)
6406           && detect_coding_utf_8 (coding, &detect_info))
6407         {
6408           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6409             setup_coding_system (XCAR (coding_systems), coding);
6410           else
6411             setup_coding_system (XCDR (coding_systems), coding);
6412         }
6413     }
6414   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6415            == coding_category_utf_16_auto)
6416     {
6417       Lisp_Object coding_systems;
6418       struct coding_detection_info detect_info;
6419
6420       coding_systems
6421         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6422       detect_info.found = detect_info.rejected = 0;
6423       coding->head_ascii = 0;
6424       if (CONSP (coding_systems)
6425           && detect_coding_utf_16 (coding, &detect_info))
6426         {
6427           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6428             setup_coding_system (XCAR (coding_systems), coding);
6429           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6430             setup_coding_system (XCDR (coding_systems), coding);
6431         }
6432     }
6433   coding->mode = saved_mode;
6434 }
6435
6436
6437 static void
6438 decode_eol (struct coding_system *coding)
6439 {
6440   Lisp_Object eol_type;
6441   unsigned char *p, *pbeg, *pend;
6442
6443   eol_type = CODING_ID_EOL_TYPE (coding->id);
6444   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6445     return;
6446
6447   if (NILP (coding->dst_object))
6448     pbeg = coding->destination;
6449   else
6450     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6451   pend = pbeg + coding->produced;
6452
6453   if (VECTORP (eol_type))
6454     {
6455       int eol_seen = EOL_SEEN_NONE;
6456
6457       for (p = pbeg; p < pend; p++)
6458         {
6459           if (*p == '\n')
6460             eol_seen |= EOL_SEEN_LF;
6461           else if (*p == '\r')
6462             {
6463               if (p + 1 < pend && *(p + 1) == '\n')
6464                 {
6465                   eol_seen |= EOL_SEEN_CRLF;
6466                   p++;
6467                 }
6468               else
6469                 eol_seen |= EOL_SEEN_CR;
6470             }
6471         }
6472       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6473       if ((eol_seen & EOL_SEEN_CRLF) != 0
6474           && (eol_seen & EOL_SEEN_CR) != 0
6475           && (eol_seen & EOL_SEEN_LF) == 0)
6476         eol_seen = EOL_SEEN_CRLF;
6477       else if (eol_seen != EOL_SEEN_NONE
6478           && eol_seen != EOL_SEEN_LF
6479           && eol_seen != EOL_SEEN_CRLF
6480           && eol_seen != EOL_SEEN_CR)
6481         eol_seen = EOL_SEEN_LF;
6482       if (eol_seen != EOL_SEEN_NONE)
6483         eol_type = adjust_coding_eol_type (coding, eol_seen);
6484     }
6485
6486   if (EQ (eol_type, Qmac))
6487     {
6488       for (p = pbeg; p < pend; p++)
6489         if (*p == '\r')
6490           *p = '\n';
6491     }
6492   else if (EQ (eol_type, Qdos))
6493     {
6494       ptrdiff_t n = 0;
6495
6496       if (NILP (coding->dst_object))
6497         {
6498           /* Start deleting '\r' from the tail to minimize the memory
6499              movement.  */
6500           for (p = pend - 2; p >= pbeg; p--)
6501             if (*p == '\r')
6502               {
6503                 memmove (p, p + 1, pend-- - p - 1);
6504                 n++;
6505               }
6506         }
6507       else
6508         {
6509           ptrdiff_t pos_byte = coding->dst_pos_byte;
6510           ptrdiff_t pos = coding->dst_pos;
6511           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6512
6513           while (pos < pos_end)
6514             {
6515               p = BYTE_POS_ADDR (pos_byte);
6516               if (*p == '\r' && p[1] == '\n')
6517                 {
6518                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6519                   n++;
6520                   pos_end--;
6521                 }
6522               pos++;
6523               if (coding->dst_multibyte)
6524                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6525               else
6526                 pos_byte++;
6527             }
6528         }
6529       coding->produced -= n;
6530       coding->produced_char -= n;
6531     }
6532 }
6533
6534
6535 /* Return a translation table (or list of them) from coding system
6536    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6537    not ENCODEP). */
6538
6539 static Lisp_Object
6540 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6541 {
6542   Lisp_Object standard, translation_table;
6543   Lisp_Object val;
6544
6545   if (NILP (Venable_character_translation))
6546     {
6547       if (max_lookup)
6548         *max_lookup = 0;
6549       return Qnil;
6550     }
6551   if (encodep)
6552     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6553       standard = Vstandard_translation_table_for_encode;
6554   else
6555     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6556       standard = Vstandard_translation_table_for_decode;
6557   if (NILP (translation_table))
6558     translation_table = standard;
6559   else
6560     {
6561       if (SYMBOLP (translation_table))
6562         translation_table = Fget (translation_table, Qtranslation_table);
6563       else if (CONSP (translation_table))
6564         {
6565           translation_table = Fcopy_sequence (translation_table);
6566           for (val = translation_table; CONSP (val); val = XCDR (val))
6567             if (SYMBOLP (XCAR (val)))
6568               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6569         }
6570       if (CHAR_TABLE_P (standard))
6571         {
6572           if (CONSP (translation_table))
6573             translation_table = nconc2 (translation_table,
6574                                         Fcons (standard, Qnil));
6575           else
6576             translation_table = Fcons (translation_table,
6577                                        Fcons (standard, Qnil));
6578         }
6579     }
6580
6581   if (max_lookup)
6582     {
6583       *max_lookup = 1;
6584       if (CHAR_TABLE_P (translation_table)
6585           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6586         {
6587           val = XCHAR_TABLE (translation_table)->extras[1];
6588           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6589             *max_lookup = XFASTINT (val);
6590         }
6591       else if (CONSP (translation_table))
6592         {
6593           Lisp_Object tail;
6594
6595           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6596             if (CHAR_TABLE_P (XCAR (tail))
6597                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6598               {
6599                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6600                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6601                   *max_lookup = XFASTINT (tailval);
6602               }
6603         }
6604     }
6605   return translation_table;
6606 }
6607
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else yields no translation).

   C and TRANS must be lvalues.  When a table maps C to a character, C
   is replaced by that character and TRANS is set to Qnil; with a list
   of tables the lookup then continues in the next table using the new
   C.  When a table maps C to a non-nil, non-character value, TRANS is
   set to that value (and, for a list, the lookup stops there).
   Otherwise TRANS ends up Qnil.

   The arguments are evaluated more than once, so they must not have
   side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail;                                        \
                                                                        \
        for (lookup_tail = (table); CONSP (lookup_tail);                \
             lookup_tail = XCDR (lookup_tail))                          \
          if (CHAR_TABLE_P (XCAR (lookup_tail)))                        \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail), c);         \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6632
6633
6634 /* Return a translation of character(s) at BUF according to TRANS.
6635    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6636    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6637    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6638    translation is found, and Qnil if not found..
6639    If BUF is too short to lookup characters in FROM, return Qt.  */
6640
6641 static Lisp_Object
6642 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6643 {
6644
6645   if (INTEGERP (trans))
6646     return trans;
6647   for (; CONSP (trans); trans = XCDR (trans))
6648     {
6649       Lisp_Object val = XCAR (trans);
6650       Lisp_Object from = XCAR (val);
6651       ptrdiff_t len = ASIZE (from);
6652       ptrdiff_t i;
6653
6654       for (i = 0; i < len; i++)
6655         {
6656           if (buf + i == buf_end)
6657             return Qt;
6658           if (XINT (AREF (from, i)) != buf[i])
6659             break;
6660         }
6661       if (i == len)
6662         return val;
6663     }
6664   return Qnil;
6665 }
6666
6667
6668 static int
6669 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6670                bool last_block)
6671 {
6672   unsigned char *dst = coding->destination + coding->produced;
6673   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6674   ptrdiff_t produced;
6675   ptrdiff_t produced_chars = 0;
6676   int carryover = 0;
6677
6678   if (! coding->chars_at_source)
6679     {
6680       /* Source characters are in coding->charbuf.  */
6681       int *buf = coding->charbuf;
6682       int *buf_end = buf + coding->charbuf_used;
6683
6684       if (EQ (coding->src_object, coding->dst_object))
6685         {
6686           coding_set_source (coding);
6687           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6688         }
6689
6690       while (buf < buf_end)
6691         {
6692           int c = *buf;
6693           ptrdiff_t i;
6694
6695           if (c >= 0)
6696             {
6697               ptrdiff_t from_nchars = 1, to_nchars = 1;
6698               Lisp_Object trans = Qnil;
6699
6700               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6701               if (! NILP (trans))
6702                 {
6703                   trans = get_translation (trans, buf, buf_end);
6704                   if (INTEGERP (trans))
6705                     c = XINT (trans);
6706                   else if (CONSP (trans))
6707                     {
6708                       from_nchars = ASIZE (XCAR (trans));
6709                       trans = XCDR (trans);
6710                       if (INTEGERP (trans))
6711                         c = XINT (trans);
6712                       else
6713                         {
6714                           to_nchars = ASIZE (trans);
6715                           c = XINT (AREF (trans, 0));
6716                         }
6717                     }
6718                   else if (EQ (trans, Qt) && ! last_block)
6719                     break;
6720                 }
6721
6722               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6723                 {
6724                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6725                        / MAX_MULTIBYTE_LENGTH)
6726                       < to_nchars)
6727                     memory_full (SIZE_MAX);
6728                   dst = alloc_destination (coding,
6729                                            buf_end - buf
6730                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6731                                            dst);
6732                   if (EQ (coding->src_object, coding->dst_object))
6733                     {
6734                       coding_set_source (coding);
6735                       dst_end = (((unsigned char *) coding->source)
6736                                  + coding->consumed);
6737                     }
6738                   else
6739                     dst_end = coding->destination + coding->dst_bytes;
6740                 }
6741
6742               for (i = 0; i < to_nchars; i++)
6743                 {
6744                   if (i > 0)
6745                     c = XINT (AREF (trans, i));
6746                   if (coding->dst_multibyte
6747                       || ! CHAR_BYTE8_P (c))
6748                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6749                   else
6750                     *dst++ = CHAR_TO_BYTE8 (c);
6751                 }
6752               produced_chars += to_nchars;
6753               buf += from_nchars;
6754             }
6755           else
6756             /* This is an annotation datum.  (-C) is the length.  */
6757             buf += -c;
6758         }
6759       carryover = buf_end - buf;
6760     }
6761   else
6762     {
6763       /* Source characters are at coding->source.  */
6764       const unsigned char *src = coding->source;
6765       const unsigned char *src_end = src + coding->consumed;
6766
6767       if (EQ (coding->dst_object, coding->src_object))
6768         dst_end = (unsigned char *) src;
6769       if (coding->src_multibyte != coding->dst_multibyte)
6770         {
6771           if (coding->src_multibyte)
6772             {
6773               bool multibytep = 1;
6774               ptrdiff_t consumed_chars = 0;
6775
6776               while (1)
6777                 {
6778                   const unsigned char *src_base = src;
6779                   int c;
6780
6781                   ONE_MORE_BYTE (c);
6782                   if (dst == dst_end)
6783                     {
6784                       if (EQ (coding->src_object, coding->dst_object))
6785                         dst_end = (unsigned char *) src;
6786                       if (dst == dst_end)
6787                         {
6788                           ptrdiff_t offset = src - coding->source;
6789
6790                           dst = alloc_destination (coding, src_end - src + 1,
6791                                                    dst);
6792                           dst_end = coding->destination + coding->dst_bytes;
6793                           coding_set_source (coding);
6794                           src = coding->source + offset;
6795                           src_end = coding->source + coding->consumed;
6796                           if (EQ (coding->src_object, coding->dst_object))
6797                             dst_end = (unsigned char *) src;
6798                         }
6799                     }
6800                   *dst++ = c;
6801                   produced_chars++;
6802                 }
6803             no_more_source:
6804               ;
6805             }
6806           else
6807             while (src < src_end)
6808               {
6809                 bool multibytep = 1;
6810                 int c = *src++;
6811
6812                 if (dst >= dst_end - 1)
6813                   {
6814                     if (EQ (coding->src_object, coding->dst_object))
6815                       dst_end = (unsigned char *) src;
6816                     if (dst >= dst_end - 1)
6817                       {
6818                         ptrdiff_t offset = src - coding->source;
6819                         ptrdiff_t more_bytes;
6820
6821                         if (EQ (coding->src_object, coding->dst_object))
6822                           more_bytes = ((src_end - src) / 2) + 2;
6823                         else
6824                           more_bytes = src_end - src + 2;
6825                         dst = alloc_destination (coding, more_bytes, dst);
6826                         dst_end = coding->destination + coding->dst_bytes;
6827                         coding_set_source (coding);
6828                         src = coding->source + offset;
6829                         src_end = coding->source + coding->consumed;
6830                         if (EQ (coding->src_object, coding->dst_object))
6831                           dst_end = (unsigned char *) src;
6832                       }
6833                   }
6834                 EMIT_ONE_BYTE (c);
6835               }
6836         }
6837       else
6838         {
6839           if (!EQ (coding->src_object, coding->dst_object))
6840             {
6841               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6842
6843               if (require > 0)
6844                 {
6845                   ptrdiff_t offset = src - coding->source;
6846
6847                   dst = alloc_destination (coding, require, dst);
6848                   coding_set_source (coding);
6849                   src = coding->source + offset;
6850                   src_end = coding->source + coding->consumed;
6851                 }
6852             }
6853           produced_chars = coding->consumed_char;
6854           while (src < src_end)
6855             *dst++ = *src++;
6856         }
6857     }
6858
6859   produced = dst - (coding->destination + coding->produced);
6860   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6861     insert_from_gap (produced_chars, produced);
6862   coding->produced += produced;
6863   coding->produced_char += produced_chars;
6864   return carryover;
6865 }
6866
6867 /* Compose text in CODING->object according to the annotation data at
6868    CHARBUF.  CHARBUF is an array:
6869      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6870  */
6871
6872 static inline void
6873 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6874 {
6875   int len;
6876   ptrdiff_t to;
6877   enum composition_method method;
6878   Lisp_Object components;
6879
6880   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6881   to = pos + charbuf[2];
6882   method = (enum composition_method) (charbuf[4]);
6883
6884   if (method == COMPOSITION_RELATIVE)
6885     components = Qnil;
6886   else
6887     {
6888       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6889       int i, j;
6890
6891       if (method == COMPOSITION_WITH_RULE)
6892         len = charbuf[2] * 3 - 2;
6893       charbuf += MAX_ANNOTATION_LENGTH;
6894       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6895       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6896         {
6897           if (charbuf[i] >= 0)
6898             args[j] = make_number (charbuf[i]);
6899           else
6900             {
6901               i++;
6902               args[j] = make_number (charbuf[i] % 0x100);
6903             }
6904         }
6905       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6906     }
6907   compose_text (pos, to, components, Qnil, coding->dst_object);
6908 }
6909
6910
6911 /* Put `charset' property on text in CODING->object according to
6912    the annotation data at CHARBUF.  CHARBUF is an array:
6913      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6914  */
6915
6916 static inline void
6917 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6918 {
6919   ptrdiff_t from = pos - charbuf[2];
6920   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6921
6922   Fput_text_property (make_number (from), make_number (pos),
6923                       Qcharset, CHARSET_NAME (charset),
6924                       coding->dst_object);
6925 }
6926
6927
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size (in elements) in CODING->charbuf_size.  CHARBUF_SIZE elements
   are requested first; if alloca returns NULL the request is halved
   and retried while it is larger than 1024 elements.

   If no area could be allocated, record
   CODING_RESULT_INSUFFICIENT_MEM in CODING and RETURN FROM THE
   CALLING FUNCTION, which must therefore return void.  Because the
   memory comes from alloca, it is only valid until the calling
   function returns.

   NOTE(review): on many platforms alloca does not return NULL on
   failure, in which case the fallback path is never taken -- confirm
   for the supported targets.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int charbuf_units = CHARBUF_SIZE;                                   \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (charbuf_units > 1024)                                        \
      {                                                                 \
        (coding)->charbuf = alloca (sizeof (int) * charbuf_units);      \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        charbuf_units >>= 1;                                            \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return;                                                         \
      }                                                                 \
    (coding)->charbuf_size = charbuf_units;                             \
  } while (0)
6949
6950
6951 static void
6952 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6953 {
6954   int *charbuf = coding->charbuf;
6955   int *charbuf_end = charbuf + coding->charbuf_used;
6956
6957   if (NILP (coding->dst_object))
6958     return;
6959
6960   while (charbuf < charbuf_end)
6961     {
6962       if (*charbuf >= 0)
6963         pos++, charbuf++;
6964       else
6965         {
6966           int len = -*charbuf;
6967
6968           if (len > 2)
6969             switch (charbuf[1])
6970               {
6971               case CODING_ANNOTATE_COMPOSITION_MASK:
6972                 produce_composition (coding, charbuf, pos);
6973                 break;
6974               case CODING_ANNOTATE_CHARSET_MASK:
6975                 produce_charset (coding, charbuf, pos);
6976                 break;
6977               }
6978           charbuf += len;
6979         }
6980     }
6981 }
6982
6983 /* Decode the data at CODING->src_object into CODING->dst_object.
6984    CODING->src_object is a buffer, a string, or nil.
6985    CODING->dst_object is a buffer.
6986
6987    If CODING->src_object is a buffer, it must be the current buffer.
6988    In this case, if CODING->src_pos is positive, it is a position of
6989    the source text in the buffer, otherwise, the source text is in the
6990    gap area of the buffer, and CODING->src_pos specifies the offset of
6991    the text from GPT (which must be the same as PT).  If this is the
6992    same buffer as CODING->dst_object, CODING->src_pos must be
6993    negative.
6994
6995    If CODING->src_object is a string, CODING->src_pos is an index to
6996    that string.
6997
6998    If CODING->src_object is nil, CODING->source must already point to
6999    the non-relocatable memory area.  In this case, CODING->src_pos is
7000    an offset from CODING->source.
7001
7002    The decoded data is inserted at the current point of the buffer
7003    CODING->dst_object.
7004 */
7005
7006 static void
7007 decode_coding (struct coding_system *coding)
7008 {
7009   Lisp_Object attrs;
7010   Lisp_Object undo_list;
7011   Lisp_Object translation_table;
7012   struct ccl_spec cclspec;
7013   int carryover;
7014   int i;
7015
7016   if (BUFFERP (coding->src_object)
7017       && coding->src_pos > 0
7018       && coding->src_pos < GPT
7019       && coding->src_pos + coding->src_chars > GPT)
7020     move_gap_both (coding->src_pos, coding->src_pos_byte);
7021
7022   undo_list = Qt;
7023   if (BUFFERP (coding->dst_object))
7024     {
7025       set_buffer_internal (XBUFFER (coding->dst_object));
7026       if (GPT != PT)
7027         move_gap_both (PT, PT_BYTE);
7028
7029       /* We must disable undo_list in order to record the whole insert
7030          transaction via record_insert at the end.  But doing so also
7031          disables the recording of the first change to the undo_list.
7032          Therefore we check for first change here and record it via
7033          record_first_change if needed.  */
7034       if (MODIFF <= SAVE_MODIFF)
7035         record_first_change ();
7036
7037       undo_list = BVAR (current_buffer, undo_list);
7038       bset_undo_list (current_buffer, Qt);
7039     }
7040
7041   coding->consumed = coding->consumed_char = 0;
7042   coding->produced = coding->produced_char = 0;
7043   coding->chars_at_source = 0;
7044   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7045   coding->errors = 0;
7046
7047   ALLOC_CONVERSION_WORK_AREA (coding);
7048
7049   attrs = CODING_ID_ATTRS (coding->id);
7050   translation_table = get_translation_table (attrs, 0, NULL);
7051
7052   carryover = 0;
7053   if (coding->decoder == decode_coding_ccl)
7054     {
7055       coding->spec.ccl = &cclspec;
7056       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7057     }
7058   do
7059     {
7060       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7061
7062       coding_set_source (coding);
7063       coding->annotated = 0;
7064       coding->charbuf_used = carryover;
7065       (*(coding->decoder)) (coding);
7066       coding_set_destination (coding);
7067       carryover = produce_chars (coding, translation_table, 0);
7068       if (coding->annotated)
7069         produce_annotation (coding, pos);
7070       for (i = 0; i < carryover; i++)
7071         coding->charbuf[i]
7072           = coding->charbuf[coding->charbuf_used - carryover + i];
7073     }
7074   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7075          || (coding->consumed < coding->src_bytes
7076              && (coding->result == CODING_RESULT_SUCCESS
7077                  || coding->result == CODING_RESULT_INVALID_SRC)));
7078
7079   if (carryover > 0)
7080     {
7081       coding_set_destination (coding);
7082       coding->charbuf_used = carryover;
7083       produce_chars (coding, translation_table, 1);
7084     }
7085
7086   coding->carryover_bytes = 0;
7087   if (coding->consumed < coding->src_bytes)
7088     {
7089       int nbytes = coding->src_bytes - coding->consumed;
7090       const unsigned char *src;
7091
7092       coding_set_source (coding);
7093       coding_set_destination (coding);
7094       src = coding->source + coding->consumed;
7095
7096       if (coding->mode & CODING_MODE_LAST_BLOCK)
7097         {
7098           /* Flush out unprocessed data as binary chars.  We are sure
7099              that the number of data is less than the size of
7100              coding->charbuf.  */
7101           coding->charbuf_used = 0;
7102           coding->chars_at_source = 0;
7103
7104           while (nbytes-- > 0)
7105             {
7106               int c = *src++;
7107
7108               if (c & 0x80)
7109                 c = BYTE8_TO_CHAR (c);
7110               coding->charbuf[coding->charbuf_used++] = c;
7111             }
7112           produce_chars (coding, Qnil, 1);
7113         }
7114       else
7115         {
7116           /* Record unprocessed bytes in coding->carryover.  We are
7117              sure that the number of data is less than the size of
7118              coding->carryover.  */
7119           unsigned char *p = coding->carryover;
7120
7121           if (nbytes > sizeof coding->carryover)
7122             nbytes = sizeof coding->carryover;
7123           coding->carryover_bytes = nbytes;
7124           while (nbytes-- > 0)
7125             *p++ = *src++;
7126         }
7127       coding->consumed = coding->src_bytes;
7128     }
7129
7130   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7131       && !inhibit_eol_conversion)
7132     decode_eol (coding);
7133   if (BUFFERP (coding->dst_object))
7134     {
7135       bset_undo_list (current_buffer, undo_list);
7136       record_insert (coding->dst_pos, coding->produced_char);
7137     }
7138 }
7139
7140
7141 /* Extract an annotation datum from a composition starting at POS and
7142    ending before LIMIT of CODING->src_object (buffer or string), store
7143    the data in BUF, set *STOP to a starting position of the next
7144    composition (if any) or to LIMIT, and return the address of the
7145    next element of BUF.
7146
7147    If such an annotation is not found, set *STOP to a starting
7148    position of a composition after POS (if any) or to LIMIT, and
7149    return BUF.  */
7150
7151 static inline int *
7152 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7153                                struct coding_system *coding, int *buf,
7154                                ptrdiff_t *stop)
7155 {
7156   ptrdiff_t start, end;
7157   Lisp_Object prop;
7158
7159   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7160       || end > limit)
7161     *stop = limit;
7162   else if (start > pos)
7163     *stop = start;
7164   else
7165     {
7166       if (start == pos)
7167         {
7168           /* We found a composition.  Store the corresponding
7169              annotation data in BUF.  */
7170           int *head = buf;
7171           enum composition_method method = COMPOSITION_METHOD (prop);
7172           int nchars = COMPOSITION_LENGTH (prop);
7173
7174           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7175           if (method != COMPOSITION_RELATIVE)
7176             {
7177               Lisp_Object components;
7178               ptrdiff_t i, len, i_byte;
7179
7180               components = COMPOSITION_COMPONENTS (prop);
7181               if (VECTORP (components))
7182                 {
7183                   len = ASIZE (components);
7184                   for (i = 0; i < len; i++)
7185                     *buf++ = XINT (AREF (components, i));
7186                 }
7187               else if (STRINGP (components))
7188                 {
7189                   len = SCHARS (components);
7190                   i = i_byte = 0;
7191                   while (i < len)
7192                     {
7193                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7194                       buf++;
7195                     }
7196                 }
7197               else if (INTEGERP (components))
7198                 {
7199                   len = 1;
7200                   *buf++ = XINT (components);
7201                 }
7202               else if (CONSP (components))
7203                 {
7204                   for (len = 0; CONSP (components);
7205                        len++, components = XCDR (components))
7206                     *buf++ = XINT (XCAR (components));
7207                 }
7208               else
7209                 emacs_abort ();
7210               *head -= len;
7211             }
7212         }
7213
7214       if (find_composition (end, limit, &start, &end, &prop,
7215                             coding->src_object)
7216           && end <= limit)
7217         *stop = start;
7218       else
7219         *stop = limit;
7220     }
7221   return buf;
7222 }
7223
7224
7225 /* Extract an annotation datum from a text property `charset' at POS of
7226    CODING->src_object (buffer of string), store the data in BUF, set
7227    *STOP to the position where the value of `charset' property changes
7228    (limiting by LIMIT), and return the address of the next element of
7229    BUF.
7230
7231    If the property value is nil, set *STOP to the position where the
7232    property value is non-nil (limiting by LIMIT), and return BUF.  */
7233
7234 static inline int *
7235 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7236                            struct coding_system *coding, int *buf,
7237                            ptrdiff_t *stop)
7238 {
7239   Lisp_Object val, next;
7240   int id;
7241
7242   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7243   if (! NILP (val) && CHARSETP (val))
7244     id = XINT (CHARSET_SYMBOL_ID (val));
7245   else
7246     id = -1;
7247   ADD_CHARSET_DATA (buf, 0, id);
7248   next = Fnext_single_property_change (make_number (pos), Qcharset,
7249                                        coding->src_object,
7250                                        make_number (limit));
7251   *stop = XINT (next);
7252   return buf;
7253 }
7254
7255
7256 static void
7257 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7258                int max_lookup)
7259 {
7260   int *buf = coding->charbuf;
7261   int *buf_end = coding->charbuf + coding->charbuf_size;
7262   const unsigned char *src = coding->source + coding->consumed;
7263   const unsigned char *src_end = coding->source + coding->src_bytes;
7264   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7265   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7266   bool multibytep = coding->src_multibyte;
7267   Lisp_Object eol_type;
7268   int c;
7269   ptrdiff_t stop, stop_composition, stop_charset;
7270   int *lookup_buf = NULL;
7271
7272   if (! NILP (translation_table))
7273     lookup_buf = alloca (sizeof (int) * max_lookup);
7274
7275   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7276   if (VECTORP (eol_type))
7277     eol_type = Qunix;
7278
7279   /* Note: composition handling is not yet implemented.  */
7280   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7281
7282   if (NILP (coding->src_object))
7283     stop = stop_composition = stop_charset = end_pos;
7284   else
7285     {
7286       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7287         stop = stop_composition = pos;
7288       else
7289         stop = stop_composition = end_pos;
7290       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7291         stop = stop_charset = pos;
7292       else
7293         stop_charset = end_pos;
7294     }
7295
7296   /* Compensate for CRLF and conversion.  */
7297   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7298   while (buf < buf_end)
7299     {
7300       Lisp_Object trans;
7301
7302       if (pos == stop)
7303         {
7304           if (pos == end_pos)
7305             break;
7306           if (pos == stop_composition)
7307             buf = handle_composition_annotation (pos, end_pos, coding,
7308                                                  buf, &stop_composition);
7309           if (pos == stop_charset)
7310             buf = handle_charset_annotation (pos, end_pos, coding,
7311                                              buf, &stop_charset);
7312           stop = (stop_composition < stop_charset
7313                   ? stop_composition : stop_charset);
7314         }
7315
7316       if (! multibytep)
7317         {
7318           int bytes;
7319
7320           if (coding->encoder == encode_coding_raw_text
7321               || coding->encoder == encode_coding_ccl)
7322             c = *src++, pos++;
7323           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7324             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7325           else
7326             c = BYTE8_TO_CHAR (*src), src++, pos++;
7327         }
7328       else
7329         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7330       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7331         c = '\n';
7332       if (! EQ (eol_type, Qunix))
7333         {
7334           if (c == '\n')
7335             {
7336               if (EQ (eol_type, Qdos))
7337                 *buf++ = '\r';
7338               else
7339                 c = '\r';
7340             }
7341         }
7342
7343       trans = Qnil;
7344       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7345       if (NILP (trans))
7346         *buf++ = c;
7347       else
7348         {
7349           ptrdiff_t from_nchars = 1, to_nchars = 1;
7350           int *lookup_buf_end;
7351           const unsigned char *p = src;
7352           int i;
7353
7354           lookup_buf[0] = c;
7355           for (i = 1; i < max_lookup && p < src_end; i++)
7356             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7357           lookup_buf_end = lookup_buf + i;
7358           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7359           if (INTEGERP (trans))
7360             c = XINT (trans);
7361           else if (CONSP (trans))
7362             {
7363               from_nchars = ASIZE (XCAR (trans));
7364               trans = XCDR (trans);
7365               if (INTEGERP (trans))
7366                 c = XINT (trans);
7367               else
7368                 {
7369                   to_nchars = ASIZE (trans);
7370                   if (buf_end - buf < to_nchars)
7371                     break;
7372                   c = XINT (AREF (trans, 0));
7373                 }
7374             }
7375           else
7376             break;
7377           *buf++ = c;
7378           for (i = 1; i < to_nchars; i++)
7379             *buf++ = XINT (AREF (trans, i));
7380           for (i = 1; i < from_nchars; i++, pos++)
7381             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7382         }
7383     }
7384
7385   coding->consumed = src - coding->source;
7386   coding->consumed_char = pos - coding->src_pos;
7387   coding->charbuf_used = buf - coding->charbuf;
7388   coding->chars_at_source = 0;
7389 }
7390
7391
7392 /* Encode the text at CODING->src_object into CODING->dst_object.
7393    CODING->src_object is a buffer or a string.
7394    CODING->dst_object is a buffer or nil.
7395
7396    If CODING->src_object is a buffer, it must be the current buffer.
7397    In this case, if CODING->src_pos is positive, it is a position of
7398    the source text in the buffer, otherwise. the source text is in the
7399    gap area of the buffer, and coding->src_pos specifies the offset of
7400    the text from GPT (which must be the same as PT).  If this is the
7401    same buffer as CODING->dst_object, CODING->src_pos must be
7402    negative and CODING should not have `pre-write-conversion'.
7403
7404    If CODING->src_object is a string, CODING should not have
7405    `pre-write-conversion'.
7406
7407    If CODING->dst_object is a buffer, the encoded data is inserted at
7408    the current point of that buffer.
7409
7410    If CODING->dst_object is nil, the encoded data is placed at the
7411    memory area specified by CODING->destination.  */
7412
7413 static void
7414 encode_coding (struct coding_system *coding)
7415 {
7416   Lisp_Object attrs;
7417   Lisp_Object translation_table;
7418   int max_lookup;
7419   struct ccl_spec cclspec;
7420
7421   attrs = CODING_ID_ATTRS (coding->id);
7422   if (coding->encoder == encode_coding_raw_text)
7423     translation_table = Qnil, max_lookup = 0;
7424   else
7425     translation_table = get_translation_table (attrs, 1, &max_lookup);
7426
7427   if (BUFFERP (coding->dst_object))
7428     {
7429       set_buffer_internal (XBUFFER (coding->dst_object));
7430       coding->dst_multibyte
7431         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7432     }
7433
7434   coding->consumed = coding->consumed_char = 0;
7435   coding->produced = coding->produced_char = 0;
7436   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7437   coding->errors = 0;
7438
7439   ALLOC_CONVERSION_WORK_AREA (coding);
7440
7441   if (coding->encoder == encode_coding_ccl)
7442     {
7443       coding->spec.ccl = &cclspec;
7444       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7445     }
7446   do {
7447     coding_set_source (coding);
7448     consume_chars (coding, translation_table, max_lookup);
7449     coding_set_destination (coding);
7450     (*(coding->encoder)) (coding);
7451   } while (coding->consumed_char < coding->src_chars);
7452
7453   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7454     insert_from_gap (coding->produced_char, coding->produced);
7455 }
7456
7457
/* Name (or base name) of the work buffers used for code conversion.
   make_conversion_work_buffer creates the reusable buffer under this
   very name and derives the names of the additional ones from it with
   Fgenerate_new_buffer_name.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer when it hands that buffer out, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7470
7471
7472 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7473    multibyteness of returning buffer.  */
7474
7475 static Lisp_Object
7476 make_conversion_work_buffer (bool multibyte)
7477 {
7478   Lisp_Object name, workbuf;
7479   struct buffer *current;
7480
7481   if (reused_workbuf_in_use)
7482     {
7483       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7484       workbuf = Fget_buffer_create (name);
7485     }
7486   else
7487     {
7488       reused_workbuf_in_use = 1;
7489       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7490         Vcode_conversion_reused_workbuf
7491           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7492       workbuf = Vcode_conversion_reused_workbuf;
7493     }
7494   current = current_buffer;
7495   set_buffer_internal (XBUFFER (workbuf));
7496   /* We can't allow modification hooks to run in the work buffer.  For
7497      instance, directory_files_internal assumes that file decoding
7498      doesn't compile new regexps.  */
7499   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7500   Ferase_buffer ();
7501   bset_undo_list (current_buffer, Qt);
7502   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7503   set_buffer_internal (current);
7504   return workbuf;
7505 }
7506
7507
7508 static Lisp_Object
7509 code_conversion_restore (Lisp_Object arg)
7510 {
7511   Lisp_Object current, workbuf;
7512   struct gcpro gcpro1;
7513
7514   GCPRO1 (arg);
7515   current = XCAR (arg);
7516   workbuf = XCDR (arg);
7517   if (! NILP (workbuf))
7518     {
7519       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7520         reused_workbuf_in_use = 0;
7521       else
7522         Fkill_buffer (workbuf);
7523     }
7524   set_buffer_internal (XBUFFER (current));
7525   UNGCPRO;
7526   return Qnil;
7527 }
7528
7529 Lisp_Object
7530 code_conversion_save (bool with_work_buf, bool multibyte)
7531 {
7532   Lisp_Object workbuf = Qnil;
7533
7534   if (with_work_buf)
7535     workbuf = make_conversion_work_buffer (multibyte);
7536   record_unwind_protect (code_conversion_restore,
7537                          Fcons (Fcurrent_buffer (), workbuf));
7538   return workbuf;
7539 }
7540
7541 void
7542 decode_coding_gap (struct coding_system *coding,
7543                    ptrdiff_t chars, ptrdiff_t bytes)
7544 {
7545   ptrdiff_t count = SPECPDL_INDEX ();
7546   Lisp_Object attrs;
7547
7548   code_conversion_save (0, 0);
7549
7550   coding->src_object = Fcurrent_buffer ();
7551   coding->src_chars = chars;
7552   coding->src_bytes = bytes;
7553   coding->src_pos = -chars;
7554   coding->src_pos_byte = -bytes;
7555   coding->src_multibyte = chars < bytes;
7556   coding->dst_object = coding->src_object;
7557   coding->dst_pos = PT;
7558   coding->dst_pos_byte = PT_BYTE;
7559   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7560
7561   if (CODING_REQUIRE_DETECTION (coding))
7562     detect_coding (coding);
7563
7564   coding->mode |= CODING_MODE_LAST_BLOCK;
7565   current_buffer->text->inhibit_shrinking = 1;
7566   decode_coding (coding);
7567   current_buffer->text->inhibit_shrinking = 0;
7568
7569   attrs = CODING_ID_ATTRS (coding->id);
7570   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7571     {
7572       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7573       Lisp_Object val;
7574
7575       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7576       val = call1 (CODING_ATTR_POST_READ (attrs),
7577                    make_number (coding->produced_char));
7578       CHECK_NATNUM (val);
7579       coding->produced_char += Z - prev_Z;
7580       coding->produced += Z_BYTE - prev_Z_BYTE;
7581     }
7582
7583   unbind_to (count, Qnil);
7584 }
7585
7586
7587 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7588    SRC_OBJECT into DST_OBJECT by coding context CODING.
7589
7590    SRC_OBJECT is a buffer, a string, or Qnil.
7591
7592    If it is a buffer, the text is at point of the buffer.  FROM and TO
7593    are positions in the buffer.
7594
7595    If it is a string, the text is at the beginning of the string.
7596    FROM and TO are indices to the string.
7597
7598    If it is nil, the text is at coding->source.  FROM and TO are
7599    indices to coding->source.
7600
7601    DST_OBJECT is a buffer, Qt, or Qnil.
7602
7603    If it is a buffer, the decoded text is inserted at point of the
7604    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7605    is deleted.
7606
7607    If it is Qt, a string is made from the decoded text, and
7608    set in CODING->dst_object.
7609
7610    If it is Qnil, the decoded text is stored at CODING->destination.
7611    The caller must allocate CODING->dst_bytes bytes at
7612    CODING->destination by xmalloc.  If the decoded text is longer than
7613    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7614  */
7615
7616 void
7617 decode_coding_object (struct coding_system *coding,
7618                       Lisp_Object src_object,
7619                       ptrdiff_t from, ptrdiff_t from_byte,
7620                       ptrdiff_t to, ptrdiff_t to_byte,
7621                       Lisp_Object dst_object)
7622 {
7623   ptrdiff_t count = SPECPDL_INDEX ();
7624   unsigned char *destination IF_LINT (= NULL);
7625   ptrdiff_t dst_bytes IF_LINT (= 0);
7626   ptrdiff_t chars = to - from;
7627   ptrdiff_t bytes = to_byte - from_byte;
7628   Lisp_Object attrs;
7629   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7630   bool need_marker_adjustment = 0;
7631   Lisp_Object old_deactivate_mark;
7632
7633   old_deactivate_mark = Vdeactivate_mark;
7634
7635   if (NILP (dst_object))
7636     {
7637       destination = coding->destination;
7638       dst_bytes = coding->dst_bytes;
7639     }
7640
7641   coding->src_object = src_object;
7642   coding->src_chars = chars;
7643   coding->src_bytes = bytes;
7644   coding->src_multibyte = chars < bytes;
7645
7646   if (STRINGP (src_object))
7647     {
7648       coding->src_pos = from;
7649       coding->src_pos_byte = from_byte;
7650     }
7651   else if (BUFFERP (src_object))
7652     {
7653       set_buffer_internal (XBUFFER (src_object));
7654       if (from != GPT)
7655         move_gap_both (from, from_byte);
7656       if (EQ (src_object, dst_object))
7657         {
7658           struct Lisp_Marker *tail;
7659
7660           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7661             {
7662               tail->need_adjustment
7663                 = tail->charpos == (tail->insertion_type ? from : to);
7664               need_marker_adjustment |= tail->need_adjustment;
7665             }
7666           saved_pt = PT, saved_pt_byte = PT_BYTE;
7667           TEMP_SET_PT_BOTH (from, from_byte);
7668           current_buffer->text->inhibit_shrinking = 1;
7669           del_range_both (from, from_byte, to, to_byte, 1);
7670           coding->src_pos = -chars;
7671           coding->src_pos_byte = -bytes;
7672         }
7673       else
7674         {
7675           coding->src_pos = from;
7676           coding->src_pos_byte = from_byte;
7677         }
7678     }
7679
7680   if (CODING_REQUIRE_DETECTION (coding))
7681     detect_coding (coding);
7682   attrs = CODING_ID_ATTRS (coding->id);
7683
7684   if (EQ (dst_object, Qt)
7685       || (! NILP (CODING_ATTR_POST_READ (attrs))
7686           && NILP (dst_object)))
7687     {
7688       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7689       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7690       coding->dst_pos = BEG;
7691       coding->dst_pos_byte = BEG_BYTE;
7692     }
7693   else if (BUFFERP (dst_object))
7694     {
7695       code_conversion_save (0, 0);
7696       coding->dst_object = dst_object;
7697       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7698       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7699       coding->dst_multibyte
7700         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7701     }
7702   else
7703     {
7704       code_conversion_save (0, 0);
7705       coding->dst_object = Qnil;
7706       /* Most callers presume this will return a multibyte result, and they
7707          won't use `binary' or `raw-text' anyway, so let's not worry about
7708          CODING_FOR_UNIBYTE.  */
7709       coding->dst_multibyte = 1;
7710     }
7711
7712   decode_coding (coding);
7713
7714   if (BUFFERP (coding->dst_object))
7715     set_buffer_internal (XBUFFER (coding->dst_object));
7716
7717   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7718     {
7719       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7720       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7721       Lisp_Object val;
7722
7723       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7724       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7725               old_deactivate_mark);
7726       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7727                         make_number (coding->produced_char));
7728       UNGCPRO;
7729       CHECK_NATNUM (val);
7730       coding->produced_char += Z - prev_Z;
7731       coding->produced += Z_BYTE - prev_Z_BYTE;
7732     }
7733
7734   if (EQ (dst_object, Qt))
7735     {
7736       coding->dst_object = Fbuffer_string ();
7737     }
7738   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7739     {
7740       set_buffer_internal (XBUFFER (coding->dst_object));
7741       if (dst_bytes < coding->produced)
7742         {
7743           destination = xrealloc (destination, coding->produced);
7744           if (! destination)
7745             {
7746               record_conversion_result (coding,
7747                                         CODING_RESULT_INSUFFICIENT_MEM);
7748               unbind_to (count, Qnil);
7749               return;
7750             }
7751           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7752             move_gap_both (BEGV, BEGV_BYTE);
7753           memcpy (destination, BEGV_ADDR, coding->produced);
7754           coding->destination = destination;
7755         }
7756     }
7757
7758   if (saved_pt >= 0)
7759     {
7760       /* This is the case of:
7761          (BUFFERP (src_object) && EQ (src_object, dst_object))
7762          As we have moved PT while replacing the original buffer
7763          contents, we must recover it now.  */
7764       set_buffer_internal (XBUFFER (src_object));
7765       current_buffer->text->inhibit_shrinking = 0;
7766       if (saved_pt < from)
7767         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7768       else if (saved_pt < from + chars)
7769         TEMP_SET_PT_BOTH (from, from_byte);
7770       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7771         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7772                           saved_pt_byte + (coding->produced - bytes));
7773       else
7774         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7775                           saved_pt_byte + (coding->produced - bytes));
7776
7777       if (need_marker_adjustment)
7778         {
7779           struct Lisp_Marker *tail;
7780
7781           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7782             if (tail->need_adjustment)
7783               {
7784                 tail->need_adjustment = 0;
7785                 if (tail->insertion_type)
7786                   {
7787                     tail->bytepos = from_byte;
7788                     tail->charpos = from;
7789                   }
7790                 else
7791                   {
7792                     tail->bytepos = from_byte + coding->produced;
7793                     tail->charpos
7794                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7795                          ? tail->bytepos : from + coding->produced_char);
7796                   }
7797               }
7798         }
7799     }
7800
7801   Vdeactivate_mark = old_deactivate_mark;
7802   unbind_to (count, coding->dst_object);
7803 }
7804
7805
7806 void
7807 encode_coding_object (struct coding_system *coding,
7808                       Lisp_Object src_object,
7809                       ptrdiff_t from, ptrdiff_t from_byte,
7810                       ptrdiff_t to, ptrdiff_t to_byte,
7811                       Lisp_Object dst_object)
7812 {
7813   ptrdiff_t count = SPECPDL_INDEX ();
7814   ptrdiff_t chars = to - from;
7815   ptrdiff_t bytes = to_byte - from_byte;
7816   Lisp_Object attrs;
7817   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7818   bool need_marker_adjustment = 0;
7819   bool kill_src_buffer = 0;
7820   Lisp_Object old_deactivate_mark;
7821
7822   old_deactivate_mark = Vdeactivate_mark;
7823
7824   coding->src_object = src_object;
7825   coding->src_chars = chars;
7826   coding->src_bytes = bytes;
7827   coding->src_multibyte = chars < bytes;
7828
7829   attrs = CODING_ID_ATTRS (coding->id);
7830
7831   if (EQ (src_object, dst_object))
7832     {
7833       struct Lisp_Marker *tail;
7834
7835       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7836         {
7837           tail->need_adjustment
7838             = tail->charpos == (tail->insertion_type ? from : to);
7839           need_marker_adjustment |= tail->need_adjustment;
7840         }
7841     }
7842
7843   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7844     {
7845       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7846       set_buffer_internal (XBUFFER (coding->src_object));
7847       if (STRINGP (src_object))
7848         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7849       else if (BUFFERP (src_object))
7850         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7851       else
7852         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7853
7854       if (EQ (src_object, dst_object))
7855         {
7856           set_buffer_internal (XBUFFER (src_object));
7857           saved_pt = PT, saved_pt_byte = PT_BYTE;
7858           del_range_both (from, from_byte, to, to_byte, 1);
7859           set_buffer_internal (XBUFFER (coding->src_object));
7860         }
7861
7862       {
7863         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7864
7865         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7866                 old_deactivate_mark);
7867         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7868                     make_number (BEG), make_number (Z));
7869         UNGCPRO;
7870       }
7871       if (XBUFFER (coding->src_object) != current_buffer)
7872         kill_src_buffer = 1;
7873       coding->src_object = Fcurrent_buffer ();
7874       if (BEG != GPT)
7875         move_gap_both (BEG, BEG_BYTE);
7876       coding->src_chars = Z - BEG;
7877       coding->src_bytes = Z_BYTE - BEG_BYTE;
7878       coding->src_pos = BEG;
7879       coding->src_pos_byte = BEG_BYTE;
7880       coding->src_multibyte = Z < Z_BYTE;
7881     }
7882   else if (STRINGP (src_object))
7883     {
7884       code_conversion_save (0, 0);
7885       coding->src_pos = from;
7886       coding->src_pos_byte = from_byte;
7887     }
7888   else if (BUFFERP (src_object))
7889     {
7890       code_conversion_save (0, 0);
7891       set_buffer_internal (XBUFFER (src_object));
7892       if (EQ (src_object, dst_object))
7893         {
7894           saved_pt = PT, saved_pt_byte = PT_BYTE;
7895           coding->src_object = del_range_1 (from, to, 1, 1);
7896           coding->src_pos = 0;
7897           coding->src_pos_byte = 0;
7898         }
7899       else
7900         {
7901           if (from < GPT && to >= GPT)
7902             move_gap_both (from, from_byte);
7903           coding->src_pos = from;
7904           coding->src_pos_byte = from_byte;
7905         }
7906     }
7907   else
7908     code_conversion_save (0, 0);
7909
7910   if (BUFFERP (dst_object))
7911     {
7912       coding->dst_object = dst_object;
7913       if (EQ (src_object, dst_object))
7914         {
7915           coding->dst_pos = from;
7916           coding->dst_pos_byte = from_byte;
7917         }
7918       else
7919         {
7920           struct buffer *current = current_buffer;
7921
7922           set_buffer_temp (XBUFFER (dst_object));
7923           coding->dst_pos = PT;
7924           coding->dst_pos_byte = PT_BYTE;
7925           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7926           set_buffer_temp (current);
7927         }
7928       coding->dst_multibyte
7929         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7930     }
7931   else if (EQ (dst_object, Qt))
7932     {
7933       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7934       coding->dst_object = Qnil;
7935       coding->destination = xmalloc (dst_bytes);
7936       coding->dst_bytes = dst_bytes;
7937       coding->dst_multibyte = 0;
7938     }
7939   else
7940     {
7941       coding->dst_object = Qnil;
7942       coding->dst_multibyte = 0;
7943     }
7944
7945   encode_coding (coding);
7946
7947   if (EQ (dst_object, Qt))
7948     {
7949       if (BUFFERP (coding->dst_object))
7950         coding->dst_object = Fbuffer_string ();
7951       else
7952         {
7953           coding->dst_object
7954             = make_unibyte_string ((char *) coding->destination,
7955                                    coding->produced);
7956           xfree (coding->destination);
7957         }
7958     }
7959
7960   if (saved_pt >= 0)
7961     {
7962       /* This is the case of:
7963          (BUFFERP (src_object) && EQ (src_object, dst_object))
7964          As we have moved PT while replacing the original buffer
7965          contents, we must recover it now.  */
7966       set_buffer_internal (XBUFFER (src_object));
7967       if (saved_pt < from)
7968         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7969       else if (saved_pt < from + chars)
7970         TEMP_SET_PT_BOTH (from, from_byte);
7971       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7972         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7973                           saved_pt_byte + (coding->produced - bytes));
7974       else
7975         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7976                           saved_pt_byte + (coding->produced - bytes));
7977
7978       if (need_marker_adjustment)
7979         {
7980           struct Lisp_Marker *tail;
7981
7982           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7983             if (tail->need_adjustment)
7984               {
7985                 tail->need_adjustment = 0;
7986                 if (tail->insertion_type)
7987                   {
7988                     tail->bytepos = from_byte;
7989                     tail->charpos = from;
7990                   }
7991                 else
7992                   {
7993                     tail->bytepos = from_byte + coding->produced;
7994                     tail->charpos
7995                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7996                          ? tail->bytepos : from + coding->produced_char);
7997                   }
7998               }
7999         }
8000     }
8001
8002   if (kill_src_buffer)
8003     Fkill_buffer (coding->src_object);
8004
8005   Vdeactivate_mark = old_deactivate_mark;
8006   unbind_to (count, Qnil);
8007 }
8008
8009
8010 Lisp_Object
8011 preferred_coding_system (void)
8012 {
8013   int id = coding_categories[coding_priorities[0]].id;
8014
8015   return CODING_ID_NAME (id);
8016 }
8017
8018 \f
8019 #ifdef emacs
8020 /*** 11. Emacs Lisp library functions ***/
8021
8022 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8023        doc: /* Return t if OBJECT is nil or a coding-system.
8024 See the documentation of `define-coding-system' for information
8025 about coding-system objects.  */)
8026   (Lisp_Object object)
8027 {
8028   if (NILP (object)
8029       || CODING_SYSTEM_ID (object) >= 0)
8030     return Qt;
8031   if (! SYMBOLP (object)
8032       || NILP (Fget (object, Qcoding_system_define_form)))
8033     return Qnil;
8034   return Qt;
8035 }
8036
8037 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8038        Sread_non_nil_coding_system, 1, 1, 0,
8039        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8040   (Lisp_Object prompt)
8041 {
8042   Lisp_Object val;
8043   do
8044     {
8045       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8046                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8047     }
8048   while (SCHARS (val) == 0);
8049   return (Fintern (val, Qnil));
8050 }
8051
8052 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8053        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8054 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8055 Ignores case when completing coding systems (all Emacs coding systems
8056 are lower-case).  */)
8057   (Lisp_Object prompt, Lisp_Object default_coding_system)
8058 {
8059   Lisp_Object val;
8060   ptrdiff_t count = SPECPDL_INDEX ();
8061
8062   if (SYMBOLP (default_coding_system))
8063     default_coding_system = SYMBOL_NAME (default_coding_system);
8064   specbind (Qcompletion_ignore_case, Qt);
8065   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8066                           Qt, Qnil, Qcoding_system_history,
8067                           default_coding_system, Qnil);
8068   unbind_to (count, Qnil);
8069   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8070 }
8071
8072 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8073        1, 1, 0,
8074        doc: /* Check validity of CODING-SYSTEM.
8075 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8076 It is valid if it is nil or a symbol defined as a coding system by the
8077 function `define-coding-system'.  */)
8078   (Lisp_Object coding_system)
8079 {
8080   Lisp_Object define_form;
8081
8082   define_form = Fget (coding_system, Qcoding_system_define_form);
8083   if (! NILP (define_form))
8084     {
8085       Fput (coding_system, Qcoding_system_define_form, Qnil);
8086       safe_eval (define_form);
8087     }
8088   if (!NILP (Fcoding_system_p (coding_system)))
8089     return coding_system;
8090   xsignal1 (Qcoding_system_error, coding_system);
8091 }
8092
8093 \f
8094 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8095    HIGHEST, return the coding system of the highest
8096    priority among the detected coding systems.  Otherwise return a
8097    list of detected coding systems sorted by their priorities.  If
8098    MULTIBYTEP, it is assumed that the bytes are in correct
8099    multibyte form but contain only ASCII and eight-bit chars.
8100    Otherwise, the bytes are raw bytes.
8101
8102    CODING-SYSTEM controls the detection as below:
8103
8104    If it is nil, detect both text-format and eol-format.  If the
8105    text-format part of CODING-SYSTEM is already specified
8106    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8107    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8108    detect only text-format.

        SRC_CHARS is recorded as the character count of the source in the
        coding structure that is handed to the category detectors.

        Each element of the result is either the name of a detected
        coding system or, when that coding system's eol-type is a vector
        and an eol-format was determined, the element of that vector
        selected by the eol-format (index 0 for LF, 1 for CRLF, 2 for
        CR).  With HIGHEST, the value is the first such element, or nil
        if the candidate list is empty.  */
8109
8110 Lisp_Object
8111 detect_coding_system (const unsigned char *src,
8112                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8113                       bool highest, bool multibytep,
8114                       Lisp_Object coding_system)
8115 {
8116   const unsigned char *src_end = src + src_bytes;
8117   Lisp_Object attrs, eol_type;
8118   Lisp_Object val = Qnil;
8119   struct coding_system coding;
8120   ptrdiff_t id;
8121   struct coding_detection_info detect_info;
8122   enum coding_category base_category;
8123   bool null_byte_found = 0, eight_bit_found = 0;
8124
       /* A nil CODING-SYSTEM means `undecided'.  Set up CODING from it
          and remember its attributes, eol-type and base name.  */
8125   if (NILP (coding_system))
8126     coding_system = Qundecided;
8127   setup_coding_system (coding_system, &coding);
8128   attrs = CODING_ID_ATTRS (coding.id);
8129   eol_type = CODING_ID_EOL_TYPE (coding.id);
8130   coding_system = CODING_ATTR_BASE_NAME (attrs);
8131
       /* Describe the source text in CODING for the detectors.  The
          whole text is flagged as the last block, and HEAD_ASCII (the
          count of leading bytes seen before any 8-bit byte) starts
          at zero.  */
8132   coding.source = src;
8133   coding.src_chars = src_chars;
8134   coding.src_bytes = src_bytes;
8135   coding.src_multibyte = multibytep;
8136   coding.consumed = 0;
8137   coding.mode |= CODING_MODE_LAST_BLOCK;
8138   coding.head_ascii = 0;
8139
8140   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8141
8142   /* At first, detect text-format if necessary.  */
8143   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8144   if (base_category == coding_category_undecided)
8145     {
8146       enum coding_category category IF_LINT (= 0);
8147       struct coding_system *this IF_LINT (= NULL);
8148       int c, i;
8149
8150       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8151       for (; src < src_end; src++)
8152         {
8153           c = *src;
8154           if (c & 0x80)
8155             {
                   /* An 8-bit byte.  Stop scanning once both an 8-bit
                      byte and a null byte have been seen.  */
8156               eight_bit_found = 1;
8157               if (null_byte_found)
8158                 break;
8159             }
8160           else if (c < 0x20)
8161             {
                   /* A control byte.  On ESC, SI or SO, run the ISO-2022
                      detector, unless that detection is inhibited or
                      some category has already been checked.  */
8162               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8163                   && ! inhibit_iso_escape_detection
8164                   && ! detect_info.checked)
8165                 {
8166                   if (detect_coding_iso_2022 (&coding, &detect_info))
8167                     {
8168                       /* We have scanned the whole data.  */
8169                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8170                         {
8171                           /* We didn't find an 8-bit code.  We may
8172                              have found a null-byte, but it's very
8173                              rare that a binary file conforms to
8174                              ISO-2022.  */
8175                           src = src_end;
8176                           coding.head_ascii = src - coding.source;
8177                         }
                           /* Reject every category that is not in
                              CATEGORY_MASK_ISO_ESCAPE.  */
8178                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8179                       break;
8180                     }
8181                 }
8182               else if (! c && !inhibit_null_byte_detection)
8183                 {
                       /* A null byte.  Stop scanning once both a null
                          byte and an 8-bit byte have been seen.  */
8184                   null_byte_found = 1;
8185                   if (eight_bit_found)
8186                     break;
8187                 }
8188               if (! eight_bit_found)
8189                 coding.head_ascii++;
8190             }
8191           else if (! eight_bit_found)
8192             coding.head_ascii++;
8193         }
8194
           /* Consult the category detectors only if the scan saw
              something other than plain leading ASCII, or a category
              has already been found.  */
8195       if (null_byte_found || eight_bit_found
8196           || coding.head_ascii < coding.src_bytes
8197           || detect_info.found)
8198         {
8199           if (coding.head_ascii == coding.src_bytes)
8200             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8201             for (i = 0; i < coding_category_raw_text; i++)
8202               {
8203                 category = coding_priorities[i];
8204                 this = coding_categories + category;
8205                 if (detect_info.found & (1 << category))
8206                   break;
8207               }
8208           else
8209             {
8210               if (null_byte_found)
8211                 {
                       /* Mark every category outside
                          CATEGORY_MASK_UTF_16 as already checked and
                          rejected, so that the loop below runs no
                          detector for them.  */
8212                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8213                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8214                 }
                   /* Walk the categories in priority order.  CATEGORY
                      and THIS keep the values from the iteration that
                      ended the loop and are read again below when
                      HIGHEST is set.  */
8215               for (i = 0; i < coding_category_raw_text; i++)
8216                 {
8217                   category = coding_priorities[i];
8218                   this = coding_categories + category;
8219
8220                   if (this->id < 0)
8221                     {
8222                       /* No coding system of this category is defined.  */
8223                       detect_info.rejected |= (1 << category);
8224                     }
8225                   else if (category >= coding_category_raw_text)
8226                     continue;
8227                   else if (detect_info.checked & (1 << category))
8228                     {
                           /* Already checked by an earlier detector
                              call; with HIGHEST, stop here if it was
                              found.  */
8229                       if (highest
8230                           && (detect_info.found & (1 << category)))
8231                         break;
8232                     }
                       /* Not checked yet: run this category's detector.
                          With HIGHEST, stop at the first category the
                          detector reports as found.  */
8233                   else if ((*(this->detector)) (&coding, &detect_info)
8234                            && highest
8235                            && (detect_info.found & (1 << category)))
8236                     {
8237                       if (category == coding_category_utf_16_auto)
8238                         {
                               /* Replace the auto category by the
                                  byte order that was found.  */
8239                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8240                             category = coding_category_utf_16_le;
8241                           else
8242                             category = coding_category_utf_16_be;
8243                         }
8244                       break;
8245                     }
8246                 }
8247             }
8248         }
8249
           /* Turn the detection result into a list VAL of coding
              system ids.  */
8250       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8251           || null_byte_found)
8252         {
               /* Every category was rejected, or a null byte was seen:
                  the answer is `no-conversion'.  */
8253           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8254           id = CODING_SYSTEM_ID (Qno_conversion);
8255           val = Fcons (make_number (id), Qnil);
8256         }
8257       else if (! detect_info.rejected && ! detect_info.found)
8258         {
               /* Nothing rejected and nothing found: the answer is the
                  coding system of the undecided category.  */
8259           detect_info.found = CATEGORY_MASK_ANY;
8260           id = coding_categories[coding_category_undecided].id;
8261           val = Fcons (make_number (id), Qnil);
8262         }
8263       else if (highest)
8264         {
8265           if (detect_info.found)
8266             {
                   /* Use the CATEGORY and THIS left behind by the
                      detection loops above.  */
8267               detect_info.found = 1 << category;
8268               val = Fcons (make_number (this->id), Qnil);
8269             }
8270           else
                 /* Nothing was found: take the highest-priority
                    category that was not rejected.  */
8271             for (i = 0; i < coding_category_raw_text; i++)
8272               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8273                 {
8274                   detect_info.found = 1 << coding_priorities[i];
8275                   id = coding_categories[coding_priorities[i]].id;
8276                   val = Fcons (make_number (id), Qnil);
8277                   break;
8278                 }
8279         }
8280       else
8281         {
8282           int mask = detect_info.rejected | detect_info.found;
8283           int found = 0;
8284
               /* Build the full list.  Both loops run from the lowest
                  priority to the highest while consing onto the front,
                  so each group ends up in priority order: first the
                  categories that were neither found nor rejected ...  */
8285           for (i = coding_category_raw_text - 1; i >= 0; i--)
8286             {
8287               category = coding_priorities[i];
8288               if (! (mask & (1 << category)))
8289                 {
8290                   found |= 1 << category;
8291                   id = coding_categories[category].id;
8292                   if (id >= 0)
8293                     val = Fcons (make_number (id), val);
8294                 }
8295             }
               /* ... then the found categories, which are placed in
                  front of them.  */
8296           for (i = coding_category_raw_text - 1; i >= 0; i--)
8297             {
8298               category = coding_priorities[i];
8299               if (detect_info.found & (1 << category))
8300                 {
8301                   id = coding_categories[category].id;
8302                   val = Fcons (make_number (id), val);
8303                 }
8304             }
8305           detect_info.found |= found;
8306         }
8307     }
8308   else if (base_category == coding_category_utf_8_auto)
8309     {
           /* Only decide between the UTF-8 variants with and without
              signature.  */
8310       if (detect_coding_utf_8 (&coding, &detect_info))
8311         {
8312           struct coding_system *this;
8313
8314           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8315             this = coding_categories + coding_category_utf_8_sig;
8316           else
8317             this = coding_categories + coding_category_utf_8_nosig;
8318           val = Fcons (make_number (this->id), Qnil);
8319         }
8320     }
8321   else if (base_category == coding_category_utf_16_auto)
8322     {
           /* Only decide among the UTF-16 variants (LE/BE, with or
              without signature).  */
8323       if (detect_coding_utf_16 (&coding, &detect_info))
8324         {
8325           struct coding_system *this;
8326
8327           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8328             this = coding_categories + coding_category_utf_16_le;
8329           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8330             this = coding_categories + coding_category_utf_16_be;
8331           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8332             this = coding_categories + coding_category_utf_16_be_nosig;
8333           else
8334             this = coding_categories + coding_category_utf_16_le_nosig;
8335           val = Fcons (make_number (this->id), Qnil);
8336         }
8337     }
8338   else
8339     {
           /* The text-format is already fixed by CODING-SYSTEM, so
              report that coding system itself.  */
8340       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8341       val = Fcons (make_number (coding.id), Qnil);
8342     }
8343
8344   /* Then, detect eol-format if necessary.  */
8345   {
         /* -1 means that no eol-format was determined for that group.  */
8346     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8347     Lisp_Object tail;
8348
8349     if (VECTORP (eol_type))
8350       {
             /* The eol-type of CODING-SYSTEM is a vector, i.e. not one
                of the fixed formats handled in the else branch: detect
                the eol-format separately for the non-UTF-16 and for
                each UTF-16 byte order, as far as such categories were
                found.  */
8351         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8352           {
                 /* With a null byte in the text, skip the eol scan and
                    use LF.  */
8353             if (null_byte_found)
8354               normal_eol = EOL_SEEN_LF;
8355             else
8356               normal_eol = detect_eol (coding.source, src_bytes,
8357                                        coding_category_raw_text);
8358           }
8359         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8360                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8361           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8362                                       coding_category_utf_16_be);
8363         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8364                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8365           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8366                                       coding_category_utf_16_le);
8367       }
8368     else
8369       {
             /* The eol-format is fixed by CODING-SYSTEM: `unix' is LF,
                `dos' is CRLF, and anything else is taken as CR.  */
8370         if (EQ (eol_type, Qunix))
8371           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8372         else if (EQ (eol_type, Qdos))
8373           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8374         else
8375           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8376       }
8377
         /* Replace each coding system id in VAL.  If the coding
            system's own eol-type is a vector and an eol-format is known
            for its category, store the vector element for that format;
            otherwise store the coding system's name.  */
8378     for (tail = val; CONSP (tail); tail = XCDR (tail))
8379       {
8380         enum coding_category category;
8381         int this_eol;
8382
8383         id = XINT (XCAR (tail));
8384         attrs = CODING_ID_ATTRS (id);
8385         category = XINT (CODING_ATTR_CATEGORY (attrs));
8386         eol_type = CODING_ID_EOL_TYPE (id);
8387         if (VECTORP (eol_type))
8388           {
8389             if (category == coding_category_utf_16_be
8390                 || category == coding_category_utf_16_be_nosig)
8391               this_eol = utf_16_be_eol;
8392             else if (category == coding_category_utf_16_le
8393                      || category == coding_category_utf_16_le_nosig)
8394               this_eol = utf_16_le_eol;
8395             else
8396               this_eol = normal_eol;
8397
8398             if (this_eol == EOL_SEEN_LF)
8399               XSETCAR (tail, AREF (eol_type, 0));
8400             else if (this_eol == EOL_SEEN_CRLF)
8401               XSETCAR (tail, AREF (eol_type, 1));
8402             else if (this_eol == EOL_SEEN_CR)
8403               XSETCAR (tail, AREF (eol_type, 2));
8404             else
8405               XSETCAR (tail, CODING_ID_NAME (id));
8406           }
8407         else
8408           XSETCAR (tail, CODING_ID_NAME (id));
8409       }
8410   }
8411
       /* With HIGHEST, return only the first candidate (or nil when
          there is none); otherwise return the whole list.  */
8412   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8413 }
8414
8415
8416 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8417        2, 3, 0,
8418        doc: /* Detect coding system of the text in the region between START and END.
8419 Return a list of possible coding systems ordered by priority.
8420 The coding systems to try and their priorities follows what
8421 the function `coding-system-priority-list' (which see) returns.
8422
8423 If only ASCII characters are found (except for such ISO-2022 control
8424 characters as ESC), it returns a list of single element `undecided'
8425 or its subsidiary coding system according to a detected end-of-line
8426 format.
8427
8428 If optional argument HIGHEST is non-nil, return the coding system of
8429 highest priority.  */)
8430   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8431 {
8432   ptrdiff_t from, to;
8433   ptrdiff_t from_byte, to_byte;
8434
8435   CHECK_NUMBER_COERCE_MARKER (start);
8436   CHECK_NUMBER_COERCE_MARKER (end);
8437
8438   validate_region (&start, &end);
8439   from = XINT (start), to = XINT (end);
8440   from_byte = CHAR_TO_BYTE (from);
8441   to_byte = CHAR_TO_BYTE (to);
8442
8443   if (from < GPT && to >= GPT)
8444     move_gap_both (to, to_byte);
8445
8446   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8447                                to - from, to_byte - from_byte,
8448                                !NILP (highest),
8449                                !NILP (BVAR (current_buffer
8450                                       , enable_multibyte_characters)),
8451                                Qnil);
8452 }
8453
8454 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8455        1, 2, 0,
8456        doc: /* Detect coding system of the text in STRING.
8457 Return a list of possible coding systems ordered by priority.
8458 The coding systems to try and their priorities follows what
8459 the function `coding-system-priority-list' (which see) returns.
8460
8461 If only ASCII characters are found (except for such ISO-2022 control
8462 characters as ESC), it returns a list of single element `undecided'
8463 or its subsidiary coding system according to a detected end-of-line
8464 format.
8465
8466 If optional argument HIGHEST is non-nil, return the coding system of
8467 highest priority.  */)
8468   (Lisp_Object string, Lisp_Object highest)
8469 {
8470   CHECK_STRING (string);
8471
8472   return detect_coding_system (SDATA (string),
8473                                SCHARS (string), SBYTES (string),
8474                                !NILP (highest), STRING_MULTIBYTE (string),
8475                                Qnil);
8476 }
8477
8478
8479 static inline bool
8480 char_encodable_p (int c, Lisp_Object attrs)
8481 {
8482   Lisp_Object tail;
8483   struct charset *charset;
8484   Lisp_Object translation_table;
8485
8486   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8487   if (! NILP (translation_table))
8488     c = translate_char (translation_table, c);
8489   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8490        CONSP (tail); tail = XCDR (tail))
8491     {
8492       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8493       if (CHAR_CHARSET_P (c, charset))
8494         break;
8495     }
8496   return (! NILP (tail));
8497 }
8498
8499
8500 /* Return a list of coding systems that safely encode the text between
8501    START and END.  If EXCLUDE is non-nil, it is a list of coding
8502    systems not to check.  The returned list doesn't contain any such
8503    coding systems.  In any case, if the text contains only ASCII or is
8504    unibyte, return t.  */
8505
8506 DEFUN ("find-coding-systems-region-internal",
8507        Ffind_coding_systems_region_internal,
8508        Sfind_coding_systems_region_internal, 2, 3, 0,
8509        doc: /* Internal use only.  */)
8510   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8511 {
8512   Lisp_Object coding_attrs_list, safe_codings;
8513   ptrdiff_t start_byte, end_byte;
8514   const unsigned char *p, *pbeg, *pend;
8515   int c;
8516   Lisp_Object tail, elt, work_table;
8517
8518   if (STRINGP (start))
8519     {
8520       if (!STRING_MULTIBYTE (start)
8521           || SCHARS (start) == SBYTES (start))
8522         return Qt;
8523       start_byte = 0;
8524       end_byte = SBYTES (start);
8525     }
8526   else
8527     {
8528       CHECK_NUMBER_COERCE_MARKER (start);
8529       CHECK_NUMBER_COERCE_MARKER (end);
8530       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8531         args_out_of_range (start, end);
8532       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8533         return Qt;
8534       start_byte = CHAR_TO_BYTE (XINT (start));
8535       end_byte = CHAR_TO_BYTE (XINT (end));
8536       if (XINT (end) - XINT (start) == end_byte - start_byte)
8537         return Qt;
8538
8539       if (XINT (start) < GPT && XINT (end) > GPT)
8540         {
8541           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8542             move_gap_both (XINT (start), start_byte);
8543           else
8544             move_gap_both (XINT (end), end_byte);
8545         }
8546     }
8547
8548   coding_attrs_list = Qnil;
8549   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8550     if (NILP (exclude)
8551         || NILP (Fmemq (XCAR (tail), exclude)))
8552       {
8553         Lisp_Object attrs;
8554
8555         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8556         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8557             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8558           {
8559             ASET (attrs, coding_attr_trans_tbl,
8560                   get_translation_table (attrs, 1, NULL));
8561             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8562           }
8563       }
8564
8565   if (STRINGP (start))
8566     p = pbeg = SDATA (start);
8567   else
8568     p = pbeg = BYTE_POS_ADDR (start_byte);
8569   pend = p + (end_byte - start_byte);
8570
8571   while (p < pend && ASCII_BYTE_P (*p)) p++;
8572   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8573
8574   work_table = Fmake_char_table (Qnil, Qnil);
8575   while (p < pend)
8576     {
8577       if (ASCII_BYTE_P (*p))
8578         p++;
8579       else
8580         {
8581           c = STRING_CHAR_ADVANCE (p);
8582           if (!NILP (char_table_ref (work_table, c)))
8583             /* This character was already checked.  Ignore it.  */
8584             continue;
8585
8586           charset_map_loaded = 0;
8587           for (tail = coding_attrs_list; CONSP (tail);)
8588             {
8589               elt = XCAR (tail);
8590               if (NILP (elt))
8591                 tail = XCDR (tail);
8592               else if (char_encodable_p (c, elt))
8593                 tail = XCDR (tail);
8594               else if (CONSP (XCDR (tail)))
8595                 {
8596                   XSETCAR (tail, XCAR (XCDR (tail)));
8597                   XSETCDR (tail, XCDR (XCDR (tail)));
8598                 }
8599               else
8600                 {
8601                   XSETCAR (tail, Qnil);
8602                   tail = XCDR (tail);
8603                 }
8604             }
8605           if (charset_map_loaded)
8606             {
8607               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8608
8609               if (STRINGP (start))
8610                 pbeg = SDATA (start);
8611               else
8612                 pbeg = BYTE_POS_ADDR (start_byte);
8613               p = pbeg + p_offset;
8614               pend = pbeg + pend_offset;
8615             }
8616           char_table_set (work_table, c, Qt);
8617         }
8618     }
8619
8620   safe_codings = list2 (Qraw_text, Qno_conversion);
8621   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8622     if (! NILP (XCAR (tail)))
8623       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8624
8625   return safe_codings;
8626 }
8627
8628
8629 DEFUN ("unencodable-char-position", Funencodable_char_position,
8630        Sunencodable_char_position, 3, 5, 0,
8631        doc: /*
8632 Return position of first un-encodable character in a region.
8633 START and END specify the region and CODING-SYSTEM specifies the
8634 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8635
8636 If optional 4th argument COUNT is non-nil, it specifies at most how
8637 many un-encodable characters to search.  In this case, the value is a
8638 list of positions.
8639
8640 If optional 5th argument STRING is non-nil, it is a string to search
8641 for un-encodable characters.  In that case, START and END are indexes
8642 to the string.  */)
8643   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8644 {
8645   EMACS_INT n;
8646   struct coding_system coding;
8647   Lisp_Object attrs, charset_list, translation_table;
8648   Lisp_Object positions;
8649   ptrdiff_t from, to;
8650   const unsigned char *p, *stop, *pend;
8651   bool ascii_compatible;
8652
8653   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8654   attrs = CODING_ID_ATTRS (coding.id);
8655   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8656     return Qnil;
8657   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8658   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8659   translation_table = get_translation_table (attrs, 1, NULL);
8660
8661   if (NILP (string))
8662     {
8663       validate_region (&start, &end);
8664       from = XINT (start);
8665       to = XINT (end);
8666       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8667           || (ascii_compatible
8668               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8669         return Qnil;
8670       p = CHAR_POS_ADDR (from);
8671       pend = CHAR_POS_ADDR (to);
8672       if (from < GPT && to >= GPT)
8673         stop = GPT_ADDR;
8674       else
8675         stop = pend;
8676     }
8677   else
8678     {
8679       CHECK_STRING (string);
8680       CHECK_NATNUM (start);
8681       CHECK_NATNUM (end);
8682       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8683         args_out_of_range_3 (string, start, end);
8684       from = XINT (start);
8685       to = XINT (end);
8686       if (! STRING_MULTIBYTE (string))
8687         return Qnil;
8688       p = SDATA (string) + string_char_to_byte (string, from);
8689       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8690       if (ascii_compatible && (to - from) == (pend - p))
8691         return Qnil;
8692     }
8693
8694   if (NILP (count))
8695     n = 1;
8696   else
8697     {
8698       CHECK_NATNUM (count);
8699       n = XINT (count);
8700     }
8701
8702   positions = Qnil;
8703   charset_map_loaded = 0;
8704   while (1)
8705     {
8706       int c;
8707
8708       if (ascii_compatible)
8709         while (p < stop && ASCII_BYTE_P (*p))
8710           p++, from++;
8711       if (p >= stop)
8712         {
8713           if (p >= pend)
8714             break;
8715           stop = pend;
8716           p = GAP_END_ADDR;
8717         }
8718
8719       c = STRING_CHAR_ADVANCE (p);
8720       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8721           && ! char_charset (translate_char (translation_table, c),
8722                              charset_list, NULL))
8723         {
8724           positions = Fcons (make_number (from), positions);
8725           n--;
8726           if (n == 0)
8727             break;
8728         }
8729
8730       from++;
8731       if (charset_map_loaded && NILP (string))
8732         {
8733           p = CHAR_POS_ADDR (from);
8734           pend = CHAR_POS_ADDR (to);
8735           if (from < GPT && to >= GPT)
8736             stop = GPT_ADDR;
8737           else
8738             stop = pend;
8739           charset_map_loaded = 0;
8740         }
8741     }
8742
8743   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8744 }
8745
8746
8747 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8748        Scheck_coding_systems_region, 3, 3, 0,
8749        doc: /* Check if the region is encodable by coding systems.
8750
8751 START and END are buffer positions specifying the region.
8752 CODING-SYSTEM-LIST is a list of coding systems to check.
8753
8754 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8755 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8756 whole region, POS0, POS1, ... are buffer positions where non-encodable
8757 characters are found.
8758
8759 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8760 value is nil.
8761
8762 START may be a string.  In that case, check if the string is
8763 encodable, and the value contains indices to the string instead of
8764 buffer positions.  END is ignored.
8765
8766 If the current buffer (or START if it is a string) is unibyte, the value
8767 is nil.  */)
8768   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8769 {
8770   Lisp_Object list;
8771   ptrdiff_t start_byte, end_byte;
8772   ptrdiff_t pos;
8773   const unsigned char *p, *pbeg, *pend;
8774   int c;
8775   Lisp_Object tail, elt, attrs;
8776
8777   if (STRINGP (start))
8778     {
8779       if (!STRING_MULTIBYTE (start)
8780           || SCHARS (start) == SBYTES (start))
8781         return Qnil;
8782       start_byte = 0;
8783       end_byte = SBYTES (start);
8784       pos = 0;
8785     }
8786   else
8787     {
8788       CHECK_NUMBER_COERCE_MARKER (start);
8789       CHECK_NUMBER_COERCE_MARKER (end);
8790       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8791         args_out_of_range (start, end);
8792       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8793         return Qnil;
8794       start_byte = CHAR_TO_BYTE (XINT (start));
8795       end_byte = CHAR_TO_BYTE (XINT (end));
8796       if (XINT (end) - XINT (start) == end_byte - start_byte)
8797         return Qnil;
8798
8799       if (XINT (start) < GPT && XINT (end) > GPT)
8800         {
8801           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8802             move_gap_both (XINT (start), start_byte);
8803           else
8804             move_gap_both (XINT (end), end_byte);
8805         }
8806       pos = XINT (start);
8807     }
8808
8809   list = Qnil;
8810   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8811     {
8812       elt = XCAR (tail);
8813       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8814       ASET (attrs, coding_attr_trans_tbl,
8815             get_translation_table (attrs, 1, NULL));
8816       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8817     }
8818
8819   if (STRINGP (start))
8820     p = pbeg = SDATA (start);
8821   else
8822     p = pbeg = BYTE_POS_ADDR (start_byte);
8823   pend = p + (end_byte - start_byte);
8824
8825   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8826   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8827
8828   while (p < pend)
8829     {
8830       if (ASCII_BYTE_P (*p))
8831         p++;
8832       else
8833         {
8834           c = STRING_CHAR_ADVANCE (p);
8835
8836           charset_map_loaded = 0;
8837           for (tail = list; CONSP (tail); tail = XCDR (tail))
8838             {
8839               elt = XCDR (XCAR (tail));
8840               if (! char_encodable_p (c, XCAR (elt)))
8841                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8842             }
8843           if (charset_map_loaded)
8844             {
8845               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8846
8847               if (STRINGP (start))
8848                 pbeg = SDATA (start);
8849               else
8850                 pbeg = BYTE_POS_ADDR (start_byte);
8851               p = pbeg + p_offset;
8852               pend = pbeg + pend_offset;
8853             }
8854         }
8855       pos++;
8856     }
8857
8858   tail = list;
8859   list = Qnil;
8860   for (; CONSP (tail); tail = XCDR (tail))
8861     {
8862       elt = XCAR (tail);
8863       if (CONSP (XCDR (XCDR (elt))))
8864         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8865                       list);
8866     }
8867
8868   return list;
8869 }
8870
8871
8872 static Lisp_Object
8873 code_convert_region (Lisp_Object start, Lisp_Object end,
8874                      Lisp_Object coding_system, Lisp_Object dst_object,
8875                      bool encodep, bool norecord)
8876 {
8877   struct coding_system coding;
8878   ptrdiff_t from, from_byte, to, to_byte;
8879   Lisp_Object src_object;
8880
8881   CHECK_NUMBER_COERCE_MARKER (start);
8882   CHECK_NUMBER_COERCE_MARKER (end);
8883   if (NILP (coding_system))
8884     coding_system = Qno_conversion;
8885   else
8886     CHECK_CODING_SYSTEM (coding_system);
8887   src_object = Fcurrent_buffer ();
8888   if (NILP (dst_object))
8889     dst_object = src_object;
8890   else if (! EQ (dst_object, Qt))
8891     CHECK_BUFFER (dst_object);
8892
8893   validate_region (&start, &end);
8894   from = XFASTINT (start);
8895   from_byte = CHAR_TO_BYTE (from);
8896   to = XFASTINT (end);
8897   to_byte = CHAR_TO_BYTE (to);
8898
8899   setup_coding_system (coding_system, &coding);
8900   coding.mode |= CODING_MODE_LAST_BLOCK;
8901
8902   if (encodep)
8903     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8904                           dst_object);
8905   else
8906     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8907                           dst_object);
8908   if (! norecord)
8909     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8910
8911   return (BUFFERP (dst_object)
8912           ? make_number (coding.produced_char)
8913           : coding.dst_object);
8914 }
8915
8916
8917 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8918        3, 4, "r\nzCoding system: ",
8919        doc: /* Decode the current region from the specified coding system.
8920 When called from a program, takes four arguments:
8921         START, END, CODING-SYSTEM, and DESTINATION.
8922 START and END are buffer positions.
8923
8924 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8925 If nil, the region between START and END is replaced by the decoded text.
8926 If buffer, the decoded text is inserted in that buffer after point (point
8927 does not move).
8928 In those cases, the length of the decoded text is returned.
8929 If DESTINATION is t, the decoded text is returned.
8930
8931 This function sets `last-coding-system-used' to the precise coding system
8932 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8933 not fully specified.)  */)
8934   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8935 {
8936   return code_convert_region (start, end, coding_system, destination, 0, 0);
8937 }
8938
8939 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8940        3, 4, "r\nzCoding system: ",
8941        doc: /* Encode the current region by specified coding system.
8942 When called from a program, takes four arguments:
8943         START, END, CODING-SYSTEM and DESTINATION.
8944 START and END are buffer positions.
8945
8946 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8947 If nil, the region between START and END is replace by the encoded text.
8948 If buffer, the encoded text is inserted in that buffer after point (point
8949 does not move).
8950 In those cases, the length of the encoded text is returned.
8951 If DESTINATION is t, the encoded text is returned.
8952
8953 This function sets `last-coding-system-used' to the precise coding system
8954 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8955 not fully specified.)  */)
8956   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8957 {
8958   return code_convert_region (start, end, coding_system, destination, 1, 0);
8959 }
8960
8961 Lisp_Object
8962 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8963                      Lisp_Object dst_object, bool encodep, bool nocopy,
8964                      bool norecord)
8965 {
8966   struct coding_system coding;
8967   ptrdiff_t chars, bytes;
8968
8969   CHECK_STRING (string);
8970   if (NILP (coding_system))
8971     {
8972       if (! norecord)
8973         Vlast_coding_system_used = Qno_conversion;
8974       if (NILP (dst_object))
8975         return (nocopy ? Fcopy_sequence (string) : string);
8976     }
8977
8978   if (NILP (coding_system))
8979     coding_system = Qno_conversion;
8980   else
8981     CHECK_CODING_SYSTEM (coding_system);
8982   if (NILP (dst_object))
8983     dst_object = Qt;
8984   else if (! EQ (dst_object, Qt))
8985     CHECK_BUFFER (dst_object);
8986
8987   setup_coding_system (coding_system, &coding);
8988   coding.mode |= CODING_MODE_LAST_BLOCK;
8989   chars = SCHARS (string);
8990   bytes = SBYTES (string);
8991   if (encodep)
8992     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8993   else
8994     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8995   if (! norecord)
8996     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8997
8998   return (BUFFERP (dst_object)
8999           ? make_number (coding.produced_char)
9000           : coding.dst_object);
9001 }
9002
9003
9004 /* Encode or decode STRING according to CODING_SYSTEM.
9005    Do not set Vlast_coding_system_used.
9006
9007    This function is called only from macros DECODE_FILE and
9008    ENCODE_FILE, thus we ignore character composition.  */
9009
9010 Lisp_Object
9011 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9012                               bool encodep)
9013 {
9014   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9015 }
9016
9017
9018 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9019        2, 4, 0,
9020        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9021
9022 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9023 if the decoding operation is trivial.
9024
9025 Optional fourth arg BUFFER non-nil means that the decoded text is
9026 inserted in that buffer after point (point does not move).  In this
9027 case, the return value is the length of the decoded text.
9028
9029 This function sets `last-coding-system-used' to the precise coding system
9030 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9031 not fully specified.)  */)
9032   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9033 {
9034   return code_convert_string (string, coding_system, buffer,
9035                               0, ! NILP (nocopy), 0);
9036 }
9037
9038 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9039        2, 4, 0,
9040        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9041
9042 Optional third arg NOCOPY non-nil means it is OK to return STRING
9043 itself if the encoding operation is trivial.
9044
9045 Optional fourth arg BUFFER non-nil means that the encoded text is
9046 inserted in that buffer after point (point does not move).  In this
9047 case, the return value is the length of the encoded text.
9048
9049 This function sets `last-coding-system-used' to the precise coding system
9050 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9051 not fully specified.)  */)
9052   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9053 {
9054   return code_convert_string (string, coding_system, buffer,
9055                               1, ! NILP (nocopy), 0);
9056 }
9057
9058 \f
9059 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9060        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9061 Return the corresponding character.  */)
9062   (Lisp_Object code)
9063 {
9064   Lisp_Object spec, attrs, val;
9065   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9066   EMACS_INT ch;
9067   int c;
9068
9069   CHECK_NATNUM (code);
9070   ch = XFASTINT (code);
9071   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9072   attrs = AREF (spec, 0);
9073
9074   if (ASCII_BYTE_P (ch)
9075       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9076     return code;
9077
9078   val = CODING_ATTR_CHARSET_LIST (attrs);
9079   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9080   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9081   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9082
9083   if (ch <= 0x7F)
9084     {
9085       c = ch;
9086       charset = charset_roman;
9087     }
9088   else if (ch >= 0xA0 && ch < 0xDF)
9089     {
9090       c = ch - 0x80;
9091       charset = charset_kana;
9092     }
9093   else
9094     {
9095       EMACS_INT c1 = ch >> 8;
9096       int c2 = ch & 0xFF;
9097
9098       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9099           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9100         error ("Invalid code: %"pI"d", ch);
9101       c = ch;
9102       SJIS_TO_JIS (c);
9103       charset = charset_kanji;
9104     }
9105   c = DECODE_CHAR (charset, c);
9106   if (c < 0)
9107     error ("Invalid code: %"pI"d", ch);
9108   return make_number (c);
9109 }
9110
9111
9112 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9113        doc: /* Encode a Japanese character CH to shift_jis encoding.
9114 Return the corresponding code in SJIS.  */)
9115   (Lisp_Object ch)
9116 {
9117   Lisp_Object spec, attrs, charset_list;
9118   int c;
9119   struct charset *charset;
9120   unsigned code;
9121
9122   CHECK_CHARACTER (ch);
9123   c = XFASTINT (ch);
9124   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9125   attrs = AREF (spec, 0);
9126
9127   if (ASCII_CHAR_P (c)
9128       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9129     return ch;
9130
9131   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9132   charset = char_charset (c, charset_list, &code);
9133   if (code == CHARSET_INVALID_CODE (charset))
9134     error ("Can't encode by shift_jis encoding: %c", c);
9135   JIS_TO_SJIS (code);
9136
9137   return make_number (code);
9138 }
9139
9140 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9141        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9142 Return the corresponding character.  */)
9143   (Lisp_Object code)
9144 {
9145   Lisp_Object spec, attrs, val;
9146   struct charset *charset_roman, *charset_big5, *charset;
9147   EMACS_INT ch;
9148   int c;
9149
9150   CHECK_NATNUM (code);
9151   ch = XFASTINT (code);
9152   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9153   attrs = AREF (spec, 0);
9154
9155   if (ASCII_BYTE_P (ch)
9156       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9157     return code;
9158
9159   val = CODING_ATTR_CHARSET_LIST (attrs);
9160   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9161   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9162
9163   if (ch <= 0x7F)
9164     {
9165       c = ch;
9166       charset = charset_roman;
9167     }
9168   else
9169     {
9170       EMACS_INT b1 = ch >> 8;
9171       int b2 = ch & 0x7F;
9172       if (b1 < 0xA1 || b1 > 0xFE
9173           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9174         error ("Invalid code: %"pI"d", ch);
9175       c = ch;
9176       charset = charset_big5;
9177     }
9178   c = DECODE_CHAR (charset, c);
9179   if (c < 0)
9180     error ("Invalid code: %"pI"d", ch);
9181   return make_number (c);
9182 }
9183
9184 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9185        doc: /* Encode the Big5 character CH to BIG5 coding system.
9186 Return the corresponding character code in Big5.  */)
9187   (Lisp_Object ch)
9188 {
9189   Lisp_Object spec, attrs, charset_list;
9190   struct charset *charset;
9191   int c;
9192   unsigned code;
9193
9194   CHECK_CHARACTER (ch);
9195   c = XFASTINT (ch);
9196   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9197   attrs = AREF (spec, 0);
9198   if (ASCII_CHAR_P (c)
9199       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9200     return ch;
9201
9202   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9203   charset = char_charset (c, charset_list, &code);
9204   if (code == CHARSET_INVALID_CODE (charset))
9205     error ("Can't encode by Big5 encoding: %c", c);
9206
9207   return make_number (code);
9208 }
9209
9210 \f
9211 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9212        Sset_terminal_coding_system_internal, 1, 2, 0,
9213        doc: /* Internal use only.  */)
9214   (Lisp_Object coding_system, Lisp_Object terminal)
9215 {
9216   struct terminal *term = get_terminal (terminal, 1);
9217   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9218   CHECK_SYMBOL (coding_system);
9219   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9220   /* We had better not send unsafe characters to terminal.  */
9221   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9222   /* Character composition should be disabled.  */
9223   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9224   terminal_coding->src_multibyte = 1;
9225   terminal_coding->dst_multibyte = 0;
9226   tset_charset_list
9227     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9228             ? coding_charset_list (terminal_coding)
9229             : Fcons (make_number (charset_ascii), Qnil)));
9230   return Qnil;
9231 }
9232
9233 DEFUN ("set-safe-terminal-coding-system-internal",
9234        Fset_safe_terminal_coding_system_internal,
9235        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9236        doc: /* Internal use only.  */)
9237   (Lisp_Object coding_system)
9238 {
9239   CHECK_SYMBOL (coding_system);
9240   setup_coding_system (Fcheck_coding_system (coding_system),
9241                        &safe_terminal_coding);
9242   /* Character composition should be disabled.  */
9243   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9244   safe_terminal_coding.src_multibyte = 1;
9245   safe_terminal_coding.dst_multibyte = 0;
9246   return Qnil;
9247 }
9248
9249 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9250        Sterminal_coding_system, 0, 1, 0,
9251        doc: /* Return coding system specified for terminal output on the given terminal.
9252 TERMINAL may be a terminal object, a frame, or nil for the selected
9253 frame's terminal device.  */)
9254   (Lisp_Object terminal)
9255 {
9256   struct coding_system *terminal_coding
9257     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9258   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9259
9260   /* For backward compatibility, return nil if it is `undecided'.  */
9261   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9262 }
9263
9264 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9265        Sset_keyboard_coding_system_internal, 1, 2, 0,
9266        doc: /* Internal use only.  */)
9267   (Lisp_Object coding_system, Lisp_Object terminal)
9268 {
9269   struct terminal *t = get_terminal (terminal, 1);
9270   CHECK_SYMBOL (coding_system);
9271   if (NILP (coding_system))
9272     coding_system = Qno_conversion;
9273   else
9274     Fcheck_coding_system (coding_system);
9275   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9276   /* Character composition should be disabled.  */
9277   TERMINAL_KEYBOARD_CODING (t)->common_flags
9278     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9279   return Qnil;
9280 }
9281
9282 DEFUN ("keyboard-coding-system",
9283        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9284        doc: /* Return coding system specified for decoding keyboard input.  */)
9285   (Lisp_Object terminal)
9286 {
9287   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9288                          (get_terminal (terminal, 1))->id);
9289 }
9290
9291 \f
9292 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9293        Sfind_operation_coding_system,  1, MANY, 0,
9294        doc: /* Choose a coding system for an operation based on the target name.
9295 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9296 DECODING-SYSTEM is the coding system to use for decoding
9297 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9298 for encoding (in case OPERATION does encoding).
9299
9300 The first argument OPERATION specifies an I/O primitive:
9301   For file I/O, `insert-file-contents' or `write-region'.
9302   For process I/O, `call-process', `call-process-region', or `start-process'.
9303   For network I/O, `open-network-stream'.
9304
9305 The remaining arguments should be the same arguments that were passed
9306 to the primitive.  Depending on which primitive, one of those arguments
9307 is selected as the TARGET.  For example, if OPERATION does file I/O,
9308 whichever argument specifies the file name is TARGET.
9309
9310 TARGET has a meaning which depends on OPERATION:
9311   For file I/O, TARGET is a file name (except for the special case below).
9312   For process I/O, TARGET is a process name.
9313   For network I/O, TARGET is a service name or a port number.
9314
9315 This function looks up what is specified for TARGET in
9316 `file-coding-system-alist', `process-coding-system-alist',
9317 or `network-coding-system-alist' depending on OPERATION.
9318 They may specify a coding system, a cons of coding systems,
9319 or a function symbol to call.
9320 In the last case, we call the function with one argument,
9321 which is a list of all the arguments given to this function.
9322 If the function can't decide a coding system, it can return
9323 `undecided' so that the normal code-detection is performed.
9324
9325 If OPERATION is `insert-file-contents', the argument corresponding to
9326 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9327 file name to look up, and BUFFER is a buffer that contains the file's
9328 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9329 function to call for FILENAME, that function should examine the
9330 contents of BUFFER instead of reading the file.
9331
9332 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9333   (ptrdiff_t nargs, Lisp_Object *args)
9334 {
9335   Lisp_Object operation, target_idx, target, val;
9336   register Lisp_Object chain;
9337
9338   if (nargs < 2)
9339     error ("Too few arguments");
9340   operation = args[0];
9341   if (!SYMBOLP (operation)
9342       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9343     error ("Invalid first argument");
9344   if (nargs <= 1 + XFASTINT (target_idx))
9345     error ("Too few arguments for operation `%s'",
9346            SDATA (SYMBOL_NAME (operation)));
9347   target = args[XFASTINT (target_idx) + 1];
9348   if (!(STRINGP (target)
9349         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9350             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9351         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9352     error ("Invalid argument %"pI"d of operation `%s'",
9353            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9354   if (CONSP (target))
9355     target = XCAR (target);
9356
9357   chain = ((EQ (operation, Qinsert_file_contents)
9358             || EQ (operation, Qwrite_region))
9359            ? Vfile_coding_system_alist
9360            : (EQ (operation, Qopen_network_stream)
9361               ? Vnetwork_coding_system_alist
9362               : Vprocess_coding_system_alist));
9363   if (NILP (chain))
9364     return Qnil;
9365
9366   for (; CONSP (chain); chain = XCDR (chain))
9367     {
9368       Lisp_Object elt;
9369
9370       elt = XCAR (chain);
9371       if (CONSP (elt)
9372           && ((STRINGP (target)
9373                && STRINGP (XCAR (elt))
9374                && fast_string_match (XCAR (elt), target) >= 0)
9375               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9376         {
9377           val = XCDR (elt);
9378           /* Here, if VAL is both a valid coding system and a valid
9379              function symbol, we return VAL as a coding system.  */
9380           if (CONSP (val))
9381             return val;
9382           if (! SYMBOLP (val))
9383             return Qnil;
9384           if (! NILP (Fcoding_system_p (val)))
9385             return Fcons (val, val);
9386           if (! NILP (Ffboundp (val)))
9387             {
9388               /* We use call1 rather than safe_call1
9389                  so as to get bug reports about functions called here
9390                  which don't handle the current interface.  */
9391               val = call1 (val, Flist (nargs, args));
9392               if (CONSP (val))
9393                 return val;
9394               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9395                 return Fcons (val, val);
9396             }
9397           return Qnil;
9398         }
9399     }
9400   return Qnil;
9401 }
9402
9403 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9404        Sset_coding_system_priority, 0, MANY, 0,
9405        doc: /* Assign higher priority to the coding systems given as arguments.
9406 If multiple coding systems belong to the same category,
9407 all but the first one are ignored.
9408
9409 usage: (set-coding-system-priority &rest coding-systems)  */)
9410   (ptrdiff_t nargs, Lisp_Object *args)
9411 {
9412   ptrdiff_t i, j;
9413   bool changed[coding_category_max];
9414   enum coding_category priorities[coding_category_max];
9415
9416   memset (changed, 0, sizeof changed);
9417
9418   for (i = j = 0; i < nargs; i++)
9419     {
9420       enum coding_category category;
9421       Lisp_Object spec, attrs;
9422
9423       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9424       attrs = AREF (spec, 0);
9425       category = XINT (CODING_ATTR_CATEGORY (attrs));
9426       if (changed[category])
9427         /* Ignore this coding system because a coding system of the
9428            same category already had a higher priority.  */
9429         continue;
9430       changed[category] = 1;
9431       priorities[j++] = category;
9432       if (coding_categories[category].id >= 0
9433           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9434         setup_coding_system (args[i], &coding_categories[category]);
9435       Fset (AREF (Vcoding_category_table, category), args[i]);
9436     }
9437
9438   /* Now we have decided top J priorities.  Reflect the order of the
9439      original priorities to the remaining priorities.  */
9440
9441   for (i = j, j = 0; i < coding_category_max; i++, j++)
9442     {
9443       while (j < coding_category_max
9444              && changed[coding_priorities[j]])
9445         j++;
9446       if (j == coding_category_max)
9447         emacs_abort ();
9448       priorities[i] = coding_priorities[j];
9449     }
9450
9451   memcpy (coding_priorities, priorities, sizeof priorities);
9452
9453   /* Update `coding-category-list'.  */
9454   Vcoding_category_list = Qnil;
9455   for (i = coding_category_max; i-- > 0; )
9456     Vcoding_category_list
9457       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9458                Vcoding_category_list);
9459
9460   return Qnil;
9461 }
9462
9463 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9464        Scoding_system_priority_list, 0, 1, 0,
9465        doc: /* Return a list of coding systems ordered by their priorities.
9466 The list contains a subset of coding systems; i.e. coding systems
9467 assigned to each coding category (see `coding-category-list').
9468
9469 HIGHESTP non-nil means just return the highest priority one.  */)
9470   (Lisp_Object highestp)
9471 {
9472   int i;
9473   Lisp_Object val;
9474
9475   for (i = 0, val = Qnil; i < coding_category_max; i++)
9476     {
9477       enum coding_category category = coding_priorities[i];
9478       int id = coding_categories[category].id;
9479       Lisp_Object attrs;
9480
9481       if (id < 0)
9482         continue;
9483       attrs = CODING_ID_ATTRS (id);
9484       if (! NILP (highestp))
9485         return CODING_ATTR_BASE_NAME (attrs);
9486       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9487     }
9488   return Fnreverse (val);
9489 }
9490
9491 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9492
9493 static Lisp_Object
9494 make_subsidiaries (Lisp_Object base)
9495 {
9496   Lisp_Object subsidiaries;
9497   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9498   char *buf = alloca (base_name_len + 6);
9499   int i;
9500
9501   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9502   subsidiaries = Fmake_vector (make_number (3), Qnil);
9503   for (i = 0; i < 3; i++)
9504     {
9505       strcpy (buf + base_name_len, suffixes[i]);
9506       ASET (subsidiaries, i, intern (buf));
9507     }
9508   return subsidiaries;
9509 }
9510
9511
9512 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9513        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9514        doc: /* For internal use only.
9515 usage: (define-coding-system-internal ...)  */)
9516   (ptrdiff_t nargs, Lisp_Object *args)
9517 {
9518   Lisp_Object name;
9519   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9520   Lisp_Object attrs;            /* Vector of attributes.  */
9521   Lisp_Object eol_type;
9522   Lisp_Object aliases;
9523   Lisp_Object coding_type, charset_list, safe_charsets;
9524   enum coding_category category;
9525   Lisp_Object tail, val;
9526   int max_charset_id = 0;
9527   int i;
9528
9529   if (nargs < coding_arg_max)
9530     goto short_args;
9531
9532   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9533
9534   name = args[coding_arg_name];
9535   CHECK_SYMBOL (name);
9536   ASET (attrs, coding_attr_base_name, name);
9537
9538   val = args[coding_arg_mnemonic];
9539   if (! STRINGP (val))
9540     CHECK_CHARACTER (val);
9541   ASET (attrs, coding_attr_mnemonic, val);
9542
9543   coding_type = args[coding_arg_coding_type];
9544   CHECK_SYMBOL (coding_type);
9545   ASET (attrs, coding_attr_type, coding_type);
9546
9547   charset_list = args[coding_arg_charset_list];
9548   if (SYMBOLP (charset_list))
9549     {
9550       if (EQ (charset_list, Qiso_2022))
9551         {
9552           if (! EQ (coding_type, Qiso_2022))
9553             error ("Invalid charset-list");
9554           charset_list = Viso_2022_charset_list;
9555         }
9556       else if (EQ (charset_list, Qemacs_mule))
9557         {
9558           if (! EQ (coding_type, Qemacs_mule))
9559             error ("Invalid charset-list");
9560           charset_list = Vemacs_mule_charset_list;
9561         }
9562       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9563         {
9564           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9565             error ("Invalid charset-list");
9566           if (max_charset_id < XFASTINT (XCAR (tail)))
9567             max_charset_id = XFASTINT (XCAR (tail));
9568         }
9569     }
9570   else
9571     {
9572       charset_list = Fcopy_sequence (charset_list);
9573       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9574         {
9575           struct charset *charset;
9576
9577           val = XCAR (tail);
9578           CHECK_CHARSET_GET_CHARSET (val, charset);
9579           if (EQ (coding_type, Qiso_2022)
9580               ? CHARSET_ISO_FINAL (charset) < 0
9581               : EQ (coding_type, Qemacs_mule)
9582               ? CHARSET_EMACS_MULE_ID (charset) < 0
9583               : 0)
9584             error ("Can't handle charset `%s'",
9585                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9586
9587           XSETCAR (tail, make_number (charset->id));
9588           if (max_charset_id < charset->id)
9589             max_charset_id = charset->id;
9590         }
9591     }
9592   ASET (attrs, coding_attr_charset_list, charset_list);
9593
9594   safe_charsets = make_uninit_string (max_charset_id + 1);
9595   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9596   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9597     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9598   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9599
9600   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9601
9602   val = args[coding_arg_decode_translation_table];
9603   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9604     CHECK_SYMBOL (val);
9605   ASET (attrs, coding_attr_decode_tbl, val);
9606
9607   val = args[coding_arg_encode_translation_table];
9608   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9609     CHECK_SYMBOL (val);
9610   ASET (attrs, coding_attr_encode_tbl, val);
9611
9612   val = args[coding_arg_post_read_conversion];
9613   CHECK_SYMBOL (val);
9614   ASET (attrs, coding_attr_post_read, val);
9615
9616   val = args[coding_arg_pre_write_conversion];
9617   CHECK_SYMBOL (val);
9618   ASET (attrs, coding_attr_pre_write, val);
9619
9620   val = args[coding_arg_default_char];
9621   if (NILP (val))
9622     ASET (attrs, coding_attr_default_char, make_number (' '));
9623   else
9624     {
9625       CHECK_CHARACTER (val);
9626       ASET (attrs, coding_attr_default_char, val);
9627     }
9628
9629   val = args[coding_arg_for_unibyte];
9630   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9631
9632   val = args[coding_arg_plist];
9633   CHECK_LIST (val);
9634   ASET (attrs, coding_attr_plist, val);
9635
9636   if (EQ (coding_type, Qcharset))
9637     {
9638       /* Generate a lisp vector of 256 elements.  Each element is nil,
9639          integer, or a list of charset IDs.
9640
9641          If Nth element is nil, the byte code N is invalid in this
9642          coding system.
9643
9644          If Nth element is a number NUM, N is the first byte of a
9645          charset whose ID is NUM.
9646
9647          If Nth element is a list of charset IDs, N is the first byte
9648          of one of them.  The list is sorted by dimensions of the
9649          charsets.  A charset of smaller dimension comes first. */
9650       val = Fmake_vector (make_number (256), Qnil);
9651
9652       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9653         {
9654           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9655           int dim = CHARSET_DIMENSION (charset);
9656           int idx = (dim - 1) * 4;
9657
9658           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9659             ASET (attrs, coding_attr_ascii_compat, Qt);
9660
9661           for (i = charset->code_space[idx];
9662                i <= charset->code_space[idx + 1]; i++)
9663             {
9664               Lisp_Object tmp, tmp2;
9665               int dim2;
9666
9667               tmp = AREF (val, i);
9668               if (NILP (tmp))
9669                 tmp = XCAR (tail);
9670               else if (NUMBERP (tmp))
9671                 {
9672                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9673                   if (dim < dim2)
9674                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9675                   else
9676                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9677                 }
9678               else
9679                 {
9680                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9681                     {
9682                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9683                       if (dim < dim2)
9684                         break;
9685                     }
9686                   if (NILP (tmp2))
9687                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9688                   else
9689                     {
9690                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9691                       XSETCAR (tmp2, XCAR (tail));
9692                     }
9693                 }
9694               ASET (val, i, tmp);
9695             }
9696         }
9697       ASET (attrs, coding_attr_charset_valids, val);
9698       category = coding_category_charset;
9699     }
9700   else if (EQ (coding_type, Qccl))
9701     {
9702       Lisp_Object valids;
9703
9704       if (nargs < coding_arg_ccl_max)
9705         goto short_args;
9706
9707       val = args[coding_arg_ccl_decoder];
9708       CHECK_CCL_PROGRAM (val);
9709       if (VECTORP (val))
9710         val = Fcopy_sequence (val);
9711       ASET (attrs, coding_attr_ccl_decoder, val);
9712
9713       val = args[coding_arg_ccl_encoder];
9714       CHECK_CCL_PROGRAM (val);
9715       if (VECTORP (val))
9716         val = Fcopy_sequence (val);
9717       ASET (attrs, coding_attr_ccl_encoder, val);
9718
9719       val = args[coding_arg_ccl_valids];
9720       valids = Fmake_string (make_number (256), make_number (0));
9721       for (tail = val; CONSP (tail); tail = XCDR (tail))
9722         {
9723           int from, to;
9724
9725           val = XCAR (tail);
9726           if (INTEGERP (val))
9727             {
9728               if (! (0 <= XINT (val) && XINT (val) <= 255))
9729                 args_out_of_range_3 (val, make_number (0), make_number (255));
9730               from = to = XINT (val);
9731             }
9732           else
9733             {
9734               CHECK_CONS (val);
9735               CHECK_NATNUM_CAR (val);
9736               CHECK_NUMBER_CDR (val);
9737               if (XINT (XCAR (val)) > 255)
9738                 args_out_of_range_3 (XCAR (val),
9739                                      make_number (0), make_number (255));
9740               from = XINT (XCAR (val));
9741               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9742                 args_out_of_range_3 (XCDR (val),
9743                                      XCAR (val), make_number (255));
9744               to = XINT (XCDR (val));
9745             }
9746           for (i = from; i <= to; i++)
9747             SSET (valids, i, 1);
9748         }
9749       ASET (attrs, coding_attr_ccl_valids, valids);
9750
9751       category = coding_category_ccl;
9752     }
9753   else if (EQ (coding_type, Qutf_16))
9754     {
9755       Lisp_Object bom, endian;
9756
9757       ASET (attrs, coding_attr_ascii_compat, Qnil);
9758
9759       if (nargs < coding_arg_utf16_max)
9760         goto short_args;
9761
9762       bom = args[coding_arg_utf16_bom];
9763       if (! NILP (bom) && ! EQ (bom, Qt))
9764         {
9765           CHECK_CONS (bom);
9766           val = XCAR (bom);
9767           CHECK_CODING_SYSTEM (val);
9768           val = XCDR (bom);
9769           CHECK_CODING_SYSTEM (val);
9770         }
9771       ASET (attrs, coding_attr_utf_bom, bom);
9772
9773       endian = args[coding_arg_utf16_endian];
9774       CHECK_SYMBOL (endian);
9775       if (NILP (endian))
9776         endian = Qbig;
9777       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9778         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9779       ASET (attrs, coding_attr_utf_16_endian, endian);
9780
9781       category = (CONSP (bom)
9782                   ? coding_category_utf_16_auto
9783                   : NILP (bom)
9784                   ? (EQ (endian, Qbig)
9785                      ? coding_category_utf_16_be_nosig
9786                      : coding_category_utf_16_le_nosig)
9787                   : (EQ (endian, Qbig)
9788                      ? coding_category_utf_16_be
9789                      : coding_category_utf_16_le));
9790     }
9791   else if (EQ (coding_type, Qiso_2022))
9792     {
9793       Lisp_Object initial, reg_usage, request, flags;
9794
9795       if (nargs < coding_arg_iso2022_max)
9796         goto short_args;
9797
9798       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9799       CHECK_VECTOR (initial);
9800       for (i = 0; i < 4; i++)
9801         {
9802           val = Faref (initial, make_number (i));
9803           if (! NILP (val))
9804             {
9805               struct charset *charset;
9806
9807               CHECK_CHARSET_GET_CHARSET (val, charset);
9808               ASET (initial, i, make_number (CHARSET_ID (charset)));
9809               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9810                 ASET (attrs, coding_attr_ascii_compat, Qt);
9811             }
9812           else
9813             ASET (initial, i, make_number (-1));
9814         }
9815
9816       reg_usage = args[coding_arg_iso2022_reg_usage];
9817       CHECK_CONS (reg_usage);
9818       CHECK_NUMBER_CAR (reg_usage);
9819       CHECK_NUMBER_CDR (reg_usage);
9820
9821       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9822       for (tail = request; CONSP (tail); tail = XCDR (tail))
9823         {
9824           int id;
9825           Lisp_Object tmp1;
9826
9827           val = XCAR (tail);
9828           CHECK_CONS (val);
9829           tmp1 = XCAR (val);
9830           CHECK_CHARSET_GET_ID (tmp1, id);
9831           CHECK_NATNUM_CDR (val);
9832           if (XINT (XCDR (val)) >= 4)
9833             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9834           XSETCAR (val, make_number (id));
9835         }
9836
9837       flags = args[coding_arg_iso2022_flags];
9838       CHECK_NATNUM (flags);
9839       i = XINT (flags) & INT_MAX;
9840       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9841         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9842       flags = make_number (i);
9843
9844       ASET (attrs, coding_attr_iso_initial, initial);
9845       ASET (attrs, coding_attr_iso_usage, reg_usage);
9846       ASET (attrs, coding_attr_iso_request, request);
9847       ASET (attrs, coding_attr_iso_flags, flags);
9848       setup_iso_safe_charsets (attrs);
9849
9850       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9851         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9852                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9853                     ? coding_category_iso_7_else
9854                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9855                     ? coding_category_iso_7
9856                     : coding_category_iso_7_tight);
9857       else
9858         {
9859           int id = XINT (AREF (initial, 1));
9860
9861           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9862                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9863                        || id < 0)
9864                       ? coding_category_iso_8_else
9865                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9866                       ? coding_category_iso_8_1
9867                       : coding_category_iso_8_2);
9868         }
9869       if (category != coding_category_iso_8_1
9870           && category != coding_category_iso_8_2)
9871         ASET (attrs, coding_attr_ascii_compat, Qnil);
9872     }
9873   else if (EQ (coding_type, Qemacs_mule))
9874     {
9875       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9876         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9877       ASET (attrs, coding_attr_ascii_compat, Qt);
9878       category = coding_category_emacs_mule;
9879     }
9880   else if (EQ (coding_type, Qshift_jis))
9881     {
9882
9883       struct charset *charset;
9884
9885       if (XINT (Flength (charset_list)) != 3
9886           && XINT (Flength (charset_list)) != 4)
9887         error ("There should be three or four charsets");
9888
9889       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9890       if (CHARSET_DIMENSION (charset) != 1)
9891         error ("Dimension of charset %s is not one",
9892                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9893       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9894         ASET (attrs, coding_attr_ascii_compat, Qt);
9895
9896       charset_list = XCDR (charset_list);
9897       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9898       if (CHARSET_DIMENSION (charset) != 1)
9899         error ("Dimension of charset %s is not one",
9900                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9901
9902       charset_list = XCDR (charset_list);
9903       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9904       if (CHARSET_DIMENSION (charset) != 2)
9905         error ("Dimension of charset %s is not two",
9906                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9907
9908       charset_list = XCDR (charset_list);
9909       if (! NILP (charset_list))
9910         {
9911           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9912           if (CHARSET_DIMENSION (charset) != 2)
9913             error ("Dimension of charset %s is not two",
9914                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9915         }
9916
9917       category = coding_category_sjis;
9918       Vsjis_coding_system = name;
9919     }
9920   else if (EQ (coding_type, Qbig5))
9921     {
9922       struct charset *charset;
9923
9924       if (XINT (Flength (charset_list)) != 2)
9925         error ("There should be just two charsets");
9926
9927       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9928       if (CHARSET_DIMENSION (charset) != 1)
9929         error ("Dimension of charset %s is not one",
9930                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9931       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9932         ASET (attrs, coding_attr_ascii_compat, Qt);
9933
9934       charset_list = XCDR (charset_list);
9935       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9936       if (CHARSET_DIMENSION (charset) != 2)
9937         error ("Dimension of charset %s is not two",
9938                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9939
9940       category = coding_category_big5;
9941       Vbig5_coding_system = name;
9942     }
9943   else if (EQ (coding_type, Qraw_text))
9944     {
9945       category = coding_category_raw_text;
9946       ASET (attrs, coding_attr_ascii_compat, Qt);
9947     }
9948   else if (EQ (coding_type, Qutf_8))
9949     {
9950       Lisp_Object bom;
9951
9952       if (nargs < coding_arg_utf8_max)
9953         goto short_args;
9954
9955       bom = args[coding_arg_utf8_bom];
9956       if (! NILP (bom) && ! EQ (bom, Qt))
9957         {
9958           CHECK_CONS (bom);
9959           val = XCAR (bom);
9960           CHECK_CODING_SYSTEM (val);
9961           val = XCDR (bom);
9962           CHECK_CODING_SYSTEM (val);
9963         }
9964       ASET (attrs, coding_attr_utf_bom, bom);
9965       if (NILP (bom))
9966         ASET (attrs, coding_attr_ascii_compat, Qt);
9967
9968       category = (CONSP (bom) ? coding_category_utf_8_auto
9969                   : NILP (bom) ? coding_category_utf_8_nosig
9970                   : coding_category_utf_8_sig);
9971     }
9972   else if (EQ (coding_type, Qundecided))
9973     category = coding_category_undecided;
9974   else
9975     error ("Invalid coding system type: %s",
9976            SDATA (SYMBOL_NAME (coding_type)));
9977
9978   ASET (attrs, coding_attr_category, make_number (category));
9979   ASET (attrs, coding_attr_plist,
9980         Fcons (QCcategory,
9981                Fcons (AREF (Vcoding_category_table, category),
9982                       CODING_ATTR_PLIST (attrs))));
9983   ASET (attrs, coding_attr_plist,
9984         Fcons (QCascii_compatible_p,
9985                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9986                       CODING_ATTR_PLIST (attrs))));
9987
9988   eol_type = args[coding_arg_eol_type];
9989   if (! NILP (eol_type)
9990       && ! EQ (eol_type, Qunix)
9991       && ! EQ (eol_type, Qdos)
9992       && ! EQ (eol_type, Qmac))
9993     error ("Invalid eol-type");
9994
9995   aliases = Fcons (name, Qnil);
9996
9997   if (NILP (eol_type))
9998     {
9999       eol_type = make_subsidiaries (name);
10000       for (i = 0; i < 3; i++)
10001         {
10002           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10003
10004           this_name = AREF (eol_type, i);
10005           this_aliases = Fcons (this_name, Qnil);
10006           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10007           this_spec = Fmake_vector (make_number (3), attrs);
10008           ASET (this_spec, 1, this_aliases);
10009           ASET (this_spec, 2, this_eol_type);
10010           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10011           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10012           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10013           if (NILP (val))
10014             Vcoding_system_alist
10015               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10016                        Vcoding_system_alist);
10017         }
10018     }
10019
10020   spec_vec = Fmake_vector (make_number (3), attrs);
10021   ASET (spec_vec, 1, aliases);
10022   ASET (spec_vec, 2, eol_type);
10023
10024   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10025   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10026   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10027   if (NILP (val))
10028     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10029                                   Vcoding_system_alist);
10030
10031   {
10032     int id = coding_categories[category].id;
10033
10034     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10035       setup_coding_system (name, &coding_categories[category]);
10036   }
10037
10038   return Qnil;
10039
10040  short_args:
10041   return Fsignal (Qwrong_number_of_arguments,
10042                   Fcons (intern ("define-coding-system-internal"),
10043                          make_number (nargs)));
10044 }
10045
10046
10047 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10048        3, 3, 0,
10049        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10050   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10051 {
10052   Lisp_Object spec_vec, attr_vec;
10053
10054   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec_vec);
10055   attr_vec = AREF (spec_vec, 0);  /* Slot 0 is the attribute vector.  */
10056   /* Properties with a dedicated attribute slot are validated first.  */
10057   if (EQ (prop, QCmnemonic))
10058     {
10059       /* A string is taken as is; anything else must be a character.  */
10060       if (! STRINGP (val))
10061         CHECK_CHARACTER (val);
10062       ASET (attr_vec, coding_attr_mnemonic, val);
10063     }
10064   else if (EQ (prop, QCdefault_char))
10065     {
10066       /* nil is replaced by the space character.  */
10067       if (! NILP (val))
10068         CHECK_CHARACTER (val);
10069       else
10070         val = make_number (' ');
10071       ASET (attr_vec, coding_attr_default_char, val);
10072     }
10073   else if (EQ (prop, QCdecode_translation_table)
10074            || EQ (prop, QCencode_translation_table))
10075     {
10076       /* Char-tables and conses pass; anything else must be a symbol.  */
10077       if (! (CHAR_TABLE_P (val) || CONSP (val)))
10078         CHECK_SYMBOL (val);
10079       if (EQ (prop, QCdecode_translation_table))
10080         ASET (attr_vec, coding_attr_decode_tbl, val);
10081       else
10082         ASET (attr_vec, coding_attr_encode_tbl, val);
10083     }
10084   else if (EQ (prop, QCpost_read_conversion)
10085            || EQ (prop, QCpre_write_conversion))
10086     {
10087       CHECK_SYMBOL (val);
10088       if (EQ (prop, QCpost_read_conversion))
10089         ASET (attr_vec, coding_attr_post_read, val);
10090       else
10091         ASET (attr_vec, coding_attr_pre_write, val);
10092     }
10093   else if (EQ (prop, QCascii_compatible_p))
10094     ASET (attr_vec, coding_attr_ascii_compat, val);
10095
10096   /* In every case the (possibly defaulted) VAL also goes into the plist.  */
10097   ASET (attr_vec, coding_attr_plist,
10098         Fplist_put (CODING_ATTR_PLIST (attr_vec), prop, val));
10099   return val;
10100 }
10101
10102
10103 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10104        Sdefine_coding_system_alias, 2, 2, 0,
10105        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10106   (Lisp_Object alias, Lisp_Object coding_system)
10107 {
10108   Lisp_Object spec, last_cell, eol_type, alias_name;
10109
10110   CHECK_SYMBOL (alias);
10111   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10112
10113   /* Slot 1 of SPEC is a non-empty list headed by the base coding
10114      system.  Find its last cons and attach ALIAS there, extending
10115      the shared list in place.  */
10116   for (last_cell = AREF (spec, 1); ! NILP (XCDR (last_cell)); )
10117     last_cell = XCDR (last_cell);
10118   XSETCDR (last_cell, Fcons (alias, Qnil));
10119
10120   /* When slot 2 is a vector it holds three subsidiaries; alias each
10121      one under the matching name produced by make_subsidiaries.  */
10122   eol_type = AREF (spec, 2);
10123   if (VECTORP (eol_type))
10124     {
10125       Lisp_Object sub_names = make_subsidiaries (alias);
10126       int k;
10127
10128       for (k = 0; k < 3; k++)
10129         Fdefine_coding_system_alias (AREF (sub_names, k),
10130                                      AREF (eol_type, k));
10131     }
10132
10133   Fputhash (alias, spec, Vcoding_system_hash_table);
10134   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10135   alias_name = Fsymbol_name (alias);
10136   if (NILP (Fassoc (alias_name, Vcoding_system_alist)))
10137     Vcoding_system_alist
10138       = Fcons (Fcons (alias_name, Qnil), Vcoding_system_alist);
10139   return Qnil;
10140 }
10141
10142 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10143        1, 1, 0,
10144        doc: /* Return the base of CODING-SYSTEM.
10145 Any alias or subsidiary coding system is not a base coding system.  */)
10146   (Lisp_Object coding_system)
10147 {
10148   Lisp_Object spec;
10149
10150   /* nil is answered with `no-conversion' without any lookup.  */
10151   if (NILP (coding_system))
10152     return Qno_conversion;
10153   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10154   return CODING_ATTR_BASE_NAME (AREF (spec, 0));
10155 }
10156
10157 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10158        1, 1, 0,
10159        doc: /* Return the property list of CODING-SYSTEM.  */)
10160   (Lisp_Object coding_system)
10161 {
10162   Lisp_Object spec, attrs;
10163
10164   if (NILP (coding_system))
10165     coding_system = Qno_conversion;  /* nil stands for `no-conversion'.  */
10166   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10167   attrs = AREF (spec, 0);  /* Slot 0 of the spec is the attribute vector.  */
10168   return CODING_ATTR_PLIST (attrs);
10169 }
10170
10171
10172 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10173        1, 1, 0,
10174        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10175   (Lisp_Object coding_system)
10176 {
10177   /* nil is treated as a request for `no-conversion'.  */
10178   Lisp_Object target = NILP (coding_system) ? Qno_conversion : coding_system;
10179   Lisp_Object spec;
10180
10181   CHECK_CODING_SYSTEM_GET_SPEC (target, spec);
10182   return AREF (spec, 1);  /* Slot 1 of the spec is the alias list.  */
10183 }
10184
10185 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10186        Scoding_system_eol_type, 1, 1, 0,
10187        doc: /* Return eol-type of CODING-SYSTEM.
10188 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10189
10190 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10191 and CR respectively.
10192
10193 A vector value indicates that a format of end-of-line should be
10194 detected automatically.  Nth element of the vector is the subsidiary
10195 coding system whose eol-type is N.  */)
10196   (Lisp_Object coding_system)
10197 {
10198   Lisp_Object target = NILP (coding_system) ? Qno_conversion : coding_system;
10199   Lisp_Object spec, eol_type;
10200
10201   /* Something that is not a coding system yields nil.  */
10202   if (! CODING_SYSTEM_P (target))
10203     return Qnil;
10204   spec = CODING_SYSTEM_SPEC (target);
10205   eol_type = AREF (spec, 2);
10206   /* A vector of subsidiaries is returned as a copy.  */
10207   if (VECTORP (eol_type))
10208     return Fcopy_sequence (eol_type);
10209   /* Otherwise: unix => 0, dos => 1, anything else => 2.  */
10210   return make_number (EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2);
10211 }
10212
10213 #endif /* emacs */
10214
10215 \f
10216 /*** 12. Post-amble ***/
10217
10218 void
10219 init_coding_once (void)
10220 {
10221   int c;
10222
10223   /* Every category starts with id -1 and the priority order starts
10224      out as the plain category order.  */
10225   for (c = 0; c < coding_category_max; c++)
10226     {
10227       coding_priorities[c] = c;
10228       coding_categories[c].id = -1;
10229     }
10230
10231   /* One pass over all byte values: assign the ISO2022 code class by
10232      range, and set every emacs_mule_bytes entry to 1.  */
10233   for (c = 0; c < 256; c++)
10234     {
10235       iso_code_class[c] = (c < 0x20 ? ISO_control_0
10236                            : c == 0x20 || c == 0x7F ? ISO_0x20_or_0x7F
10237                            : c < 0x7F ? ISO_graphic_plane_0
10238                            : c < 0xA0 ? ISO_control_1
10239                            : c == 0xA0 || c == 0xFF ? ISO_0xA0_or_0xFF
10240                            : ISO_graphic_plane_1);
10241       emacs_mule_bytes[c] = 1;
10242     }
10243   /* Individual control codes override the range-based class.  */
10244   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10245   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10246   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10247   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10248   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10249   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10250   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10251   /* Only the four private leading codes get a value other than 1.  */
10252   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10253   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10254   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10255   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10256 }
10257
10258 #ifdef emacs
10259
10260 void
10261 syms_of_coding (void)
10262 {
10263   staticpro (&Vcoding_system_hash_table);
10264   {
10265     Lisp_Object args[2];
10266     args[0] = QCtest;
10267     args[1] = Qeq;
10268     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10269   }
10270
10271   staticpro (&Vsjis_coding_system);
10272   Vsjis_coding_system = Qnil;
10273
10274   staticpro (&Vbig5_coding_system);
10275   Vbig5_coding_system = Qnil;
10276
10277   staticpro (&Vcode_conversion_reused_workbuf);
10278   Vcode_conversion_reused_workbuf = Qnil;
10279
10280   staticpro (&Vcode_conversion_workbuf_name);
10281   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10282
10283   reused_workbuf_in_use = 0;
10284
10285   DEFSYM (Qcharset, "charset");
10286   DEFSYM (Qtarget_idx, "target-idx");
10287   DEFSYM (Qcoding_system_history, "coding-system-history");
10288   Fset (Qcoding_system_history, Qnil);
10289
10290   /* Target FILENAME is the first argument.  */
10291   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10292   /* Target FILENAME is the third argument.  */
10293   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10294
10295   DEFSYM (Qcall_process, "call-process");
10296   /* Target PROGRAM is the first argument.  */
10297   Fput (Qcall_process, Qtarget_idx, make_number (0));
10298
10299   DEFSYM (Qcall_process_region, "call-process-region");
10300   /* Target PROGRAM is the third argument.  */
10301   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10302
10303   DEFSYM (Qstart_process, "start-process");
10304   /* Target PROGRAM is the third argument.  */
10305   Fput (Qstart_process, Qtarget_idx, make_number (2));
10306
10307   DEFSYM (Qopen_network_stream, "open-network-stream");
10308   /* Target SERVICE is the fourth argument.  */
10309   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10310
10311   DEFSYM (Qcoding_system, "coding-system");
10312   DEFSYM (Qcoding_aliases, "coding-aliases");
10313
10314   DEFSYM (Qeol_type, "eol-type");
10315   DEFSYM (Qunix, "unix");
10316   DEFSYM (Qdos, "dos");
10317
10318   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10319   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10320   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10321   DEFSYM (Qdefault_char, "default-char");
10322   DEFSYM (Qundecided, "undecided");
10323   DEFSYM (Qno_conversion, "no-conversion");
10324   DEFSYM (Qraw_text, "raw-text");
10325
10326   DEFSYM (Qiso_2022, "iso-2022");
10327
10328   DEFSYM (Qutf_8, "utf-8");
10329   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10330
10331   DEFSYM (Qutf_16, "utf-16");
10332   DEFSYM (Qbig, "big");
10333   DEFSYM (Qlittle, "little");
10334
10335   DEFSYM (Qshift_jis, "shift-jis");
10336   DEFSYM (Qbig5, "big5");
10337
10338   DEFSYM (Qcoding_system_p, "coding-system-p");
10339
10340   DEFSYM (Qcoding_system_error, "coding-system-error");
10341   Fput (Qcoding_system_error, Qerror_conditions,
10342         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10343   Fput (Qcoding_system_error, Qerror_message,
10344         build_pure_c_string ("Invalid coding system"));
10345
10346   /* Intern this now in case it isn't already done.
10347      Setting this variable twice is harmless.
10348      But don't staticpro it here--that is done in alloc.c.  */
10349   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10350
10351   DEFSYM (Qtranslation_table, "translation-table");
10352   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10353   DEFSYM (Qtranslation_table_id, "translation-table-id");
10354   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10355   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10356
10357   DEFSYM (Qvalid_codes, "valid-codes");
10358
10359   DEFSYM (Qemacs_mule, "emacs-mule");
10360
10361   DEFSYM (QCcategory, ":category");
10362   DEFSYM (QCmnemonic, ":mnemonic");
10363   DEFSYM (QCdefault_char, ":default-char");
10364   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10365   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10366   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10367   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10368   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10369
10370   Vcoding_category_table
10371     = Fmake_vector (make_number (coding_category_max), Qnil);
10372   staticpro (&Vcoding_category_table);
10373   /* Followings are target of code detection.  */
10374   ASET (Vcoding_category_table, coding_category_iso_7,
10375         intern_c_string ("coding-category-iso-7"));
10376   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10377         intern_c_string ("coding-category-iso-7-tight"));
10378   ASET (Vcoding_category_table, coding_category_iso_8_1,
10379         intern_c_string ("coding-category-iso-8-1"));
10380   ASET (Vcoding_category_table, coding_category_iso_8_2,
10381         intern_c_string ("coding-category-iso-8-2"));
10382   ASET (Vcoding_category_table, coding_category_iso_7_else,
10383         intern_c_string ("coding-category-iso-7-else"));
10384   ASET (Vcoding_category_table, coding_category_iso_8_else,
10385         intern_c_string ("coding-category-iso-8-else"));
10386   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10387         intern_c_string ("coding-category-utf-8-auto"));
10388   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10389         intern_c_string ("coding-category-utf-8"));
10390   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10391         intern_c_string ("coding-category-utf-8-sig"));
10392   ASET (Vcoding_category_table, coding_category_utf_16_be,
10393         intern_c_string ("coding-category-utf-16-be"));
10394   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10395         intern_c_string ("coding-category-utf-16-auto"));
10396   ASET (Vcoding_category_table, coding_category_utf_16_le,
10397         intern_c_string ("coding-category-utf-16-le"));
10398   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10399         intern_c_string ("coding-category-utf-16-be-nosig"));
10400   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10401         intern_c_string ("coding-category-utf-16-le-nosig"));
10402   ASET (Vcoding_category_table, coding_category_charset,
10403         intern_c_string ("coding-category-charset"));
10404   ASET (Vcoding_category_table, coding_category_sjis,
10405         intern_c_string ("coding-category-sjis"));
10406   ASET (Vcoding_category_table, coding_category_big5,
10407         intern_c_string ("coding-category-big5"));
10408   ASET (Vcoding_category_table, coding_category_ccl,
10409         intern_c_string ("coding-category-ccl"));
10410   ASET (Vcoding_category_table, coding_category_emacs_mule,
10411         intern_c_string ("coding-category-emacs-mule"));
10412   /* Followings are NOT target of code detection.  */
10413   ASET (Vcoding_category_table, coding_category_raw_text,
10414         intern_c_string ("coding-category-raw-text"));
10415   ASET (Vcoding_category_table, coding_category_undecided,
10416         intern_c_string ("coding-category-undecided"));
10417
10418   DEFSYM (Qinsufficient_source, "insufficient-source");
10419   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10420   DEFSYM (Qinvalid_source, "invalid-source");
10421   DEFSYM (Qinterrupted, "interrupted");
10422   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10423   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10424
10425   defsubr (&Scoding_system_p);
10426   defsubr (&Sread_coding_system);
10427   defsubr (&Sread_non_nil_coding_system);
10428   defsubr (&Scheck_coding_system);
10429   defsubr (&Sdetect_coding_region);
10430   defsubr (&Sdetect_coding_string);
10431   defsubr (&Sfind_coding_systems_region_internal);
10432   defsubr (&Sunencodable_char_position);
10433   defsubr (&Scheck_coding_systems_region);
10434   defsubr (&Sdecode_coding_region);
10435   defsubr (&Sencode_coding_region);
10436   defsubr (&Sdecode_coding_string);
10437   defsubr (&Sencode_coding_string);
10438   defsubr (&Sdecode_sjis_char);
10439   defsubr (&Sencode_sjis_char);
10440   defsubr (&Sdecode_big5_char);
10441   defsubr (&Sencode_big5_char);
10442   defsubr (&Sset_terminal_coding_system_internal);
10443   defsubr (&Sset_safe_terminal_coding_system_internal);
10444   defsubr (&Sterminal_coding_system);
10445   defsubr (&Sset_keyboard_coding_system_internal);
10446   defsubr (&Skeyboard_coding_system);
10447   defsubr (&Sfind_operation_coding_system);
10448   defsubr (&Sset_coding_system_priority);
10449   defsubr (&Sdefine_coding_system_internal);
10450   defsubr (&Sdefine_coding_system_alias);
10451   defsubr (&Scoding_system_put);
10452   defsubr (&Scoding_system_base);
10453   defsubr (&Scoding_system_plist);
10454   defsubr (&Scoding_system_aliases);
10455   defsubr (&Scoding_system_eol_type);
10456   defsubr (&Scoding_system_priority_list);
10457
10458   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10459                doc: /* List of coding systems.
10460
10461 Do not alter the value of this variable manually.  This variable should be
10462 updated by the functions `define-coding-system' and
10463 `define-coding-system-alias'.  */);
10464   Vcoding_system_list = Qnil;
10465
10466   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10467                doc: /* Alist of coding system names.
10468 Each element is a one-element list containing a coding system name.
10469 This variable is given to `completing-read' as COLLECTION argument.
10470
10471 Do not alter the value of this variable manually.  This variable should be
10472 updated by the functions `define-coding-system' and
10473 `define-coding-system-alias'.  */);
10474   Vcoding_system_alist = Qnil;
10475
10476   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10477                doc: /* List of coding-categories (symbols) ordered by priority.
10478
10479 On detecting a coding system, Emacs tries code detection algorithms
10480 associated with each coding-category one by one in this order.  When
10481 one algorithm agrees with a byte sequence of source text, the coding
10482 system bound to the corresponding coding-category is selected.
10483
10484 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10485   {
10486     int i;
10487
10488     Vcoding_category_list = Qnil;
10489     for (i = coding_category_max - 1; i >= 0; i--)
10490       Vcoding_category_list
10491         = Fcons (AREF (Vcoding_category_table, i),
10492                  Vcoding_category_list);
10493   }
10494
10495   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10496                doc: /* Specify the coding system for read operations.
10497 It is useful to bind this variable with `let', but do not set it globally.
10498 If the value is a coding system, it is used for decoding on read operation.
10499 If not, an appropriate element is used from one of the coding system alists.
10500 There are three such tables: `file-coding-system-alist',
10501 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10502   Vcoding_system_for_read = Qnil;
10503
10504   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10505                doc: /* Specify the coding system for write operations.
10506 Programs bind this variable with `let', but you should not set it globally.
10507 If the value is a coding system, it is used for encoding of output,
10508 when writing it to a file and when sending it to a file or subprocess.
10509
10510 If this does not specify a coding system, an appropriate element
10511 is used from one of the coding system alists.
10512 There are three such tables: `file-coding-system-alist',
10513 `process-coding-system-alist', and `network-coding-system-alist'.
10514 For output to files, if the above procedure does not specify a coding system,
10515 the value of `buffer-file-coding-system' is used.  */);
10516   Vcoding_system_for_write = Qnil;
10517
10518   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10519                doc: /*
10520 Coding system used in the latest file or process I/O.  */);
10521   Vlast_coding_system_used = Qnil;
10522
10523   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10524                doc: /*
10525 Error status of the last code conversion.
10526
10527 When an error was detected in the last code conversion, this variable
10528 is set to one of the following symbols.
10529   `insufficient-source'
10530   `inconsistent-eol'
10531   `invalid-source'
10532   `interrupted'
10533   `insufficient-memory'
10534 When no error was detected, the value doesn't change.  So, to check
10535 the error status of a code conversion by this variable, you must
10536 explicitly set this variable to nil before performing code
10537 conversion.  */);
10538   Vlast_code_conversion_error = Qnil;
10539
10540   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10541                doc: /*
10542 *Non-nil means always inhibit code conversion of end-of-line format.
10543 See info node `Coding Systems' and info node `Text and Binary' concerning
10544 such conversion.  */);
10545   inhibit_eol_conversion = 0;
10546
10547   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10548                doc: /*
10549 Non-nil means process buffer inherits coding system of process output.
10550 Bind it to t if the process output is to be treated as if it were a file
10551 read from some filesystem.  */);
10552   inherit_process_coding_system = 0;
10553
10554   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10555                doc: /*
10556 Alist to decide a coding system to use for a file I/O operation.
10557 The format is ((PATTERN . VAL) ...),
10558 where PATTERN is a regular expression matching a file name,
10559 VAL is a coding system, a cons of coding systems, or a function symbol.
10560 If VAL is a coding system, it is used for both decoding and encoding
10561 the file contents.
10562 If VAL is a cons of coding systems, the car part is used for decoding,
10563 and the cdr part is used for encoding.
10564 If VAL is a function symbol, the function must return a coding system
10565 or a cons of coding systems which are used as above.  The function is
10566 called with an argument that is a list of the arguments with which
10567 `find-operation-coding-system' was called.  If the function can't decide
10568 a coding system, it can return `undecided' so that the normal
10569 code-detection is performed.
10570
10571 See also the function `find-operation-coding-system'
10572 and the variable `auto-coding-alist'.  */);
10573   Vfile_coding_system_alist = Qnil;
10574
10575   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10576                doc: /*
10577 Alist to decide a coding system to use for a process I/O operation.
10578 The format is ((PATTERN . VAL) ...),
10579 where PATTERN is a regular expression matching a program name,
10580 VAL is a coding system, a cons of coding systems, or a function symbol.
10581 If VAL is a coding system, it is used for both decoding what is received
10582 from the program and encoding what is sent to the program.
10583 If VAL is a cons of coding systems, the car part is used for decoding,
10584 and the cdr part is used for encoding.
10585 If VAL is a function symbol, the function must return a coding system
10586 or a cons of coding systems which are used as above.
10587
10588 See also the function `find-operation-coding-system'.  */);
10589   Vprocess_coding_system_alist = Qnil;
10590
10591   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10592                doc: /*
10593 Alist to decide a coding system to use for a network I/O operation.
10594 The format is ((PATTERN . VAL) ...),
10595 where PATTERN is a regular expression matching a network service name
10596 or is a port number to connect to,
10597 VAL is a coding system, a cons of coding systems, or a function symbol.
10598 If VAL is a coding system, it is used for both decoding what is received
10599 from the network stream and encoding what is sent to the network stream.
10600 If VAL is a cons of coding systems, the car part is used for decoding,
10601 and the cdr part is used for encoding.
10602 If VAL is a function symbol, the function must return a coding system
10603 or a cons of coding systems which are used as above.
10604
10605 See also the function `find-operation-coding-system'.  */);
10606   Vnetwork_coding_system_alist = Qnil;
10607
10608   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10609                doc: /* Coding system to use with system messages.
10610 Also used for decoding keyboard input on X Window system.  */);
10611   Vlocale_coding_system = Qnil;
10612
10613   /* The eol mnemonics are reset in startup.el system-dependently.  */
10614   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10615                doc: /*
10616 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10617   eol_mnemonic_unix = build_pure_c_string (":");
10618
10619   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10620                doc: /*
10621 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10622   eol_mnemonic_dos = build_pure_c_string ("\\");
10623
10624   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10625                doc: /*
10626 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10627   eol_mnemonic_mac = build_pure_c_string ("/");
10628
10629   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10630                doc: /*
10631 *String displayed in mode line when end-of-line format is not yet determined.  */);
10632   eol_mnemonic_undecided = build_pure_c_string (":");
10633
10634   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10635                doc: /*
10636 *Non-nil enables character translation while encoding and decoding.  */);
10637   Venable_character_translation = Qt;
10638
10639   DEFVAR_LISP ("standard-translation-table-for-decode",
10640                Vstandard_translation_table_for_decode,
10641                doc: /* Table for translating characters while decoding.  */);
10642   Vstandard_translation_table_for_decode = Qnil;
10643
10644   DEFVAR_LISP ("standard-translation-table-for-encode",
10645                Vstandard_translation_table_for_encode,
10646                doc: /* Table for translating characters while encoding.  */);
10647   Vstandard_translation_table_for_encode = Qnil;
10648
10649   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10650                doc: /* Alist of charsets vs revision numbers.
10651 While encoding, if a charset (car part of an element) is found,
10652 designate it with the escape sequence identifying revision (cdr part
10653 of the element).  */);
10654   Vcharset_revision_table = Qnil;
10655
10656   DEFVAR_LISP ("default-process-coding-system",
10657                Vdefault_process_coding_system,
10658                doc: /* Cons of coding systems used for process I/O by default.
10659 The car part is used for decoding a process output,
10660 the cdr part is used for encoding a text to be sent to a process.  */);
10661   Vdefault_process_coding_system = Qnil;
10662
10663   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10664                doc: /*
10665 Table of extra Latin codes in the range 128..159 (inclusive).
10666 This is a vector of length 256.
10667 If the Nth element is non-nil, the existence of code N in a file
10668 \(or output of a subprocess) doesn't prevent it from being detected as
10669 a coding system of ISO 2022 variant which has a flag
10670 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10671 or reading output of a subprocess.
10672 Only 128th through 159th elements have a meaning.  */);
10673   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10674
10675   DEFVAR_LISP ("select-safe-coding-system-function",
10676                Vselect_safe_coding_system_function,
10677                doc: /*
10678 Function to call to select safe coding system for encoding a text.
10679
10680 If set, this function is called to force a user to select a proper
10681 coding system which can encode the text in the case that a default
10682 coding system used in each operation can't encode the text.  The
10683 function should take care that the buffer is not modified while
10684 the coding system is being selected.
10685
10686 The default value is `select-safe-coding-system' (which see).  */);
10687   Vselect_safe_coding_system_function = Qnil;
10688
10689   DEFVAR_BOOL ("coding-system-require-warning",
10690                coding_system_require_warning,
10691                doc: /* Internal use only.
10692 If non-nil, on writing a file, `select-safe-coding-system-function' is
10693 called even if `coding-system-for-write' is non-nil.  The command
10694 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10695   coding_system_require_warning = 0;
10696
10697
10698   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10699                inhibit_iso_escape_detection,
10700                doc: /*
10701 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10702
10703 When Emacs reads text, it tries to detect how the text is encoded.
10704 This code detection is sensitive to escape sequences.  If Emacs sees
10705 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10706 of the ISO2022 encodings, and decodes text by the corresponding coding
10707 system (e.g. `iso-2022-7bit').
10708
10709 However, there may be a case that you want to read escape sequences in
10710 a file as is.  In such a case, you can set this variable to non-nil.
10711 Then the code detection will ignore any escape sequences, and no text is
10712 detected as encoded in some ISO-2022 encoding.  The result is that all
10713 escape sequences become visible in a buffer.
10714
10715 The default value is nil, and it is strongly recommended not to change
10716 it.  That is because many Emacs Lisp source files that contain
10717 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10718 in Emacs's distribution, and they won't be decoded correctly on
10719 reading if you suppress escape sequence detection.
10720
10721 The other way to read escape sequences in a file without decoding is
10722 to explicitly specify some coding system that doesn't use ISO-2022
10723 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10724   inhibit_iso_escape_detection = 0;
10725
10726   DEFVAR_BOOL ("inhibit-null-byte-detection",
10727                inhibit_null_byte_detection,
10728                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10729 By default, Emacs treats text containing null bytes as binary data,
10730 and does not attempt to decode it.  The effect is as if you specified
10731 `no-conversion' for reading that text.
10732
10733 Set this to non-nil when a regular text happens to include null bytes.
10734 Examples are Index nodes of Info files and null-byte delimited output
10735 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10736 decode text as usual.  */);
10737   inhibit_null_byte_detection = 0;
10738
10739   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10740                doc: /* Char table for translating self-inserting characters.
10741 This is applied to the result of input methods, not their input.
10742 See also `keyboard-translate-table'.
10743
10744 Use of this variable for character code unification was rendered
10745 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10746 internal character representation.  */);
10747     Vtranslation_table_for_input = Qnil;
10748
10749   {
10750     Lisp_Object args[coding_arg_max];
10751     Lisp_Object plist[16];
10752     int i;
10753
10754     for (i = 0; i < coding_arg_max; i++)
10755       args[i] = Qnil;
10756
10757     plist[0] = intern_c_string (":name");
10758     plist[1] = args[coding_arg_name] = Qno_conversion;
10759     plist[2] = intern_c_string (":mnemonic");
10760     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10761     plist[4] = intern_c_string (":coding-type");
10762     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10763     plist[6] = intern_c_string (":ascii-compatible-p");
10764     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10765     plist[8] = intern_c_string (":default-char");
10766     plist[9] = args[coding_arg_default_char] = make_number (0);
10767     plist[10] = intern_c_string (":for-unibyte");
10768     plist[11] = args[coding_arg_for_unibyte] = Qt;
10769     plist[12] = intern_c_string (":docstring");
10770     plist[13] = build_pure_c_string ("Do no conversion.\n\
10771 \n\
10772 When you visit a file with this coding, the file is read into a\n\
10773 unibyte buffer as is, thus each byte of a file is treated as a\n\
10774 character.");
10775     plist[14] = intern_c_string (":eol-type");
10776     plist[15] = args[coding_arg_eol_type] = Qunix;
10777     args[coding_arg_plist] = Flist (16, plist);
10778     Fdefine_coding_system_internal (coding_arg_max, args);
10779
10780     plist[1] = args[coding_arg_name] = Qundecided;
10781     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10782     plist[5] = args[coding_arg_coding_type] = Qundecided;
10783     /* This is already set.
10784        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10785     plist[8] = intern_c_string (":charset-list");
10786     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10787     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10788     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10789     plist[15] = args[coding_arg_eol_type] = Qnil;
10790     args[coding_arg_plist] = Flist (16, plist);
10791     Fdefine_coding_system_internal (coding_arg_max, args);
10792   }
10793
10794   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10795
10796   {
10797     int i;
10798
10799     for (i = 0; i < coding_category_max; i++)
10800       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10801   }
10802 #if defined (DOS_NT)
10803   system_eol_type = Qdos;
10804 #else
10805   system_eol_type = Qunix;
10806 #endif
10807   staticpro (&system_eol_type);
10808 }
10809
10810 char *
10811 emacs_strerror (int error_number)
10812 {
10813   char *str;
10814
10815   synchronize_system_messages_locale ();
10816   str = strerror (error_number);
10817
10818   if (! NILP (Vlocale_coding_system))
10819     {
10820       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10821                                                       Vlocale_coding_system,
10822                                                       0);
10823       str = SSDATA (dec);
10824     }
10825
10826   return str;
10827 }
10828
10829 #endif /* emacs */