src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* A non-conforming byte rejects XXX; a telltale byte marks it found.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* Stop early enough that one iteration can never overrun DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 /* Coding-systems are handed between Emacs Lisp programs and C internal
 347    routines by the following three variables.  */
 348 /* Coding system to be used to encode text for terminal display when
 349    terminal coding system is nil.  */
 350 struct coding_system safe_terminal_coding;
 351
 352 #endif /* emacs */
 353
 354 Lisp_Object Qtranslation_table;
 355 Lisp_Object Qtranslation_table_id;
 356 static Lisp_Object Qtranslation_table_for_decode;
 357 static Lisp_Object Qtranslation_table_for_encode;
 358
 359 /* Two special coding systems.  */
 360 static Lisp_Object Vsjis_coding_system;
 361 static Lisp_Object Vbig5_coding_system;
 362
 363 /* ISO2022 section */
 364 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 365 #define CODING_ISO_INITIAL(coding, reg)                 \
 366   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 367                      coding_attr_iso_initial),          \
 368                reg)))
 369 /* CODING's safe_charsets entry for CHARSET_ID, or -1 if CHARSET_ID
 370    exceeds CODING's max_charset_id or the entry is 255.  */
 371 #define CODING_ISO_REQUEST(coding, charset_id)          \
 372   (((charset_id) <= (coding)->max_charset_id            \
 373     ? ((coding)->safe_charsets[charset_id] != 255       \
 374        ? (coding)->safe_charsets[charset_id]            \
 375        : -1)                                            \
 376     : -1))
 377
 378
 379 #define CODING_ISO_FLAGS(coding)        \
 380   ((coding)->spec.iso_2022.flags)
 381 #define CODING_ISO_DESIGNATION(coding, reg)     \
 382   ((coding)->spec.iso_2022.current_designation[reg])
 383 #define CODING_ISO_INVOCATION(coding, plane)    \
 384   ((coding)->spec.iso_2022.current_invocation[plane])
 385 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 386   ((coding)->spec.iso_2022.single_shifting)
 387 #define CODING_ISO_BOL(coding)  \
 388   ((coding)->spec.iso_2022.bol)
 389 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 390   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 391 #define CODING_ISO_CMP_STATUS(coding)   \
 392   (&(coding)->spec.iso_2022.cmp_status)
 393 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 394   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 395 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 396   ((coding)->spec.iso_2022.embedded_utf_8)
 397
 398 /* Control characters of ISO2022.  */
 399                         /* code */      /* function */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
 408 /* Every 1-byte code of ISO2022 is classified into one of the
 409    following classes.  */
 410 enum iso_code_class_type
 411   {
 412     ISO_control_0,              /* Control codes in the range
 413                                    0x00..0x1F and 0x7F, except for the
 414                                    following 4 codes.  */
 415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 418     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 419     ISO_control_1,              /* Control codes in the range
 420                                    0x80..0x9F, except for the
 421                                    following 3 codes.  */
 422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 429   };
 430
 431 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overwrite
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481
 482 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories' */
 518
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Definitions of flag bits used in detect_coding_XXXX.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
 568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority.  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category.  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
 636
 637 #ifndef min
 638 #define min(a, b) ((a) < (b) ? (a) : (b))
 639 #endif
 640 #ifndef max
 641 #define max(a, b) ((a) > (b) ? (a) : (b))
 642 #endif
 643
 644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 645   do {                                                  \
 646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 648   } while (0)
 649
 650
 651 /* Safely get one byte from the source text pointed by SRC which ends
 652    at SRC_END, set C to it, and increment CONSUMED_CHARS.  If the source
 653    is exhausted, jump to 'no_more_source', first recording
 654    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.  If MULTIBYTEP, a
 655    0xC0/0xC1-led pair is folded into one byte; for any other multibyte
 656    character, C becomes the negative value of the character code and
 657    CODING_RESULT_INVALID_SRC is recorded.  Caller must declare and set:
 658         src, src_end, src_base, multibytep, consumed_chars, coding */
 659 #define ONE_MORE_BYTE(c)                                \
 660   do {                                                  \
 661     if (src == src_end)                                 \
 662       {                                                 \
 663         if (src_base < src)                             \
 664           record_conversion_result                      \
 665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 666         goto no_more_source;                            \
 667       }                                                 \
 668     c = *src++;                                         \
 669     if (multibytep && (c & 0x80))                       \
 670       {                                                 \
 671         if ((c & 0xFE) == 0xC0)                         \
 672           c = ((c & 1) << 6) | *src++;                  \
 673         else                                            \
 674           {                                             \
 675             src--;                                      \
 676             c = - string_char (src, &src, NULL);        \
 677             record_conversion_result                    \
 678               (coding, CODING_RESULT_INVALID_SRC);      \
 679           }                                             \
 680       }                                                 \
 681     consumed_chars++;                                   \
 682   } while (0)
 683
 684 /* Safely get two bytes from the source text pointed by SRC which ends
 685    at SRC_END, and set C1 and C2 to those bytes while skipping the
 686    heading multibyte characters.  If there are not enough bytes in the
 687    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 688    a multibyte character is found for C2, set C2 to the negative value
 689    of the character code.  The caller should declare and set these
 690    variables appropriately in advance:
 691         src, src_end, multibytep
 692    It is intended that this macro is used in detect_coding_utf_16.  */
 693
 694 #define TWO_MORE_BYTES(c1, c2)                          \
 695   do {                                                  \
 696     do {                                                \
 697       if (src == src_end)                               \
 698         goto no_more_source;                            \
 699       c1 = *src++;                                      \
 700       if (multibytep && (c1 & 0x80))                    \
 701         {                                               \
 702           if ((c1 & 0xFE) == 0xC0)                      \
 703             c1 = ((c1 & 1) << 6) | *src++;              \
 704           else                                          \
 705             {                                           \
 706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 707               c1 = -1;                                  \
 708             }                                           \
 709         }                                               \
 710     } while (c1 < 0);                                   \
 711     if (src == src_end)                                 \
 712       goto no_more_source;                              \
 713     c2 = *src++;                                        \
 714     if (multibytep && (c2 & 0x80))                      \
 715       {                                                 \
 716         if ((c2 & 0xFE) == 0xC0)                        \
 717           c2 = ((c2 & 1) << 6) | *src++;                \
 718         else                                            \
 719           c2 = -1;                                      \
 720       }                                                 \
 721   } while (0)
 722
 723
 724 /* Store a byte C in the place pointed by DST and increment DST to the
 725    next free point, and increment PRODUCED_CHARS.  The caller should
 726    assure that C is 0..127, and declare and set the variable `dst'
 727    appropriately in advance.
 728 */
 729
 730
 731 #define EMIT_ONE_ASCII_BYTE(c)  \
 732   do {                          \
 733     produced_chars++;           \
 734     *dst++ = (c);               \
 735   } while (0)
 736
 737
 738 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 739
 740 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 741   do {                                  \
 742     produced_chars += 2;                \
 743     *dst++ = (c1), *dst++ = (c2);       \
 744   } while (0)
 745
 746
 747 /* Store a byte C in the place pointed by DST and increment DST to the
 748    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 749    store in an appropriate multibyte form.  The caller should
 750    declare and set the variables `dst' and `multibytep' appropriately
 751    in advance.  */
 752
 753 #define EMIT_ONE_BYTE(c)                \
 754   do {                                  \
 755     produced_chars++;                   \
 756     if (multibytep)                     \
 757       {                                 \
 758         unsigned ch = (c);              \
 759         if (ch >= 0x80)                 \
 760           ch = BYTE8_TO_CHAR (ch);      \
 761         CHAR_STRING_ADVANCE (ch, dst);  \
 762       }                                 \
 763     else                                \
 764       *dst++ = (c);                     \
 765   } while (0)
 766
 767
 768 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 769
 770 #define EMIT_TWO_BYTES(c1, c2)          \
 771   do {                                  \
 772     produced_chars += 2;                \
 773     if (multibytep)                     \
 774       {                                 \
 775         unsigned ch;                    \
 776                                         \
 777         ch = (c1);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781         ch = (c2);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785       }                                 \
 786     else                                \
 787       {                                 \
 788         *dst++ = (c1);                  \
 789         *dst++ = (c2);                  \
 790       }                                 \
 791   } while (0)
 792
 793
 794 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 795   do {                                  \
 796     EMIT_ONE_BYTE (c1);                 \
 797     EMIT_TWO_BYTES (c2, c3);            \
 798   } while (0)
 799
 800
 801 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 802   do {                                          \
 803     EMIT_TWO_BYTES (c1, c2);                    \
 804     EMIT_TWO_BYTES (c3, c4);                    \
 805   } while (0)
 806
 807
 808 static void
 809 record_conversion_result (struct coding_system *coding,
 810                           enum coding_result_code result)
 811 {
 812   coding->result = result;
 813   switch (result)
 814     {
 815     case CODING_RESULT_INSUFFICIENT_SRC:
 816       Vlast_code_conversion_error = Qinsufficient_source;
 817       break;
 818     case CODING_RESULT_INCONSISTENT_EOL:
 819       Vlast_code_conversion_error = Qinconsistent_eol;
 820       break;
 821     case CODING_RESULT_INVALID_SRC:
 822       Vlast_code_conversion_error = Qinvalid_source;
 823       break;
 824     case CODING_RESULT_INTERRUPT:
 825       Vlast_code_conversion_error = Qinterrupted;
 826       break;
 827     case CODING_RESULT_INSUFFICIENT_MEM:
 828       Vlast_code_conversion_error = Qinsufficient_memory;
 829       break;
 830     case CODING_RESULT_INSUFFICIENT_DST:
 831       /* Don't record this error in Vlast_code_conversion_error
 832          because it happens just temporarily and is resolved when the
 833          whole conversion is finished.  */
 834       break;
 835     case CODING_RESULT_SUCCESS:
 836       break;
 837     default:
 838       Vlast_code_conversion_error = intern ("Unknown error");
 839     }
 840 }
 841
 842 /* These wrapper macros are used to preserve validity of pointers into
 843    buffer text across calls to decode_char, encode_char, etc, which
 844    could cause relocation of buffers if it loads a charset map,
 845    because loading a charset map allocates large structures.  */
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     ptrdiff_t offset;                                                        \
 850                                                                              \
 851     charset_map_loaded = 0;                                                  \
 852     c = DECODE_CHAR (charset, code);                                         \
 853     if (charset_map_loaded                                                   \
 854         && (offset = coding_change_source (coding)))                         \
 855       {                                                                      \
 856         src += offset;                                                       \
 857         src_base += offset;                                                  \
 858         src_end += offset;                                                   \
 859       }                                                                      \
 860   } while (0)
 861
 862 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 863   do {                                                                  \
 864     ptrdiff_t offset;                                                   \
 865                                                                         \
 866     charset_map_loaded = 0;                                             \
 867     code = ENCODE_CHAR (charset, c);                                    \
 868     if (charset_map_loaded                                              \
 869         && (offset = coding_change_destination (coding)))               \
 870       {                                                                 \
 871         dst += offset;                                                  \
 872         dst_end += offset;                                              \
 873       }                                                                 \
 874   } while (0)
 875
 876 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 877   do {                                                                  \
 878     ptrdiff_t offset;                                                   \
 879                                                                         \
 880     charset_map_loaded = 0;                                             \
 881     charset = char_charset (c, charset_list, code_return);              \
 882     if (charset_map_loaded                                              \
 883         && (offset = coding_change_destination (coding)))               \
 884       {                                                                 \
 885         dst += offset;                                                  \
 886         dst_end += offset;                                              \
 887       }                                                                 \
 888   } while (0)
 889
 890 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 891   do {                                                                  \
 892     ptrdiff_t offset;                                                   \
 893                                                                         \
 894     charset_map_loaded = 0;                                             \
 895     result = CHAR_CHARSET_P (c, charset);                               \
 896     if (charset_map_loaded                                              \
 897         && (offset = coding_change_destination (coding)))               \
 898       {                                                                 \
 899         dst += offset;                                                  \
 900         dst_end += offset;                                              \
 901       }                                                                 \
 902   } while (0)
 903
 904
 905 /* If there are at least BYTES length of room at dst, allocate memory
 906    for coding->destination and update dst and dst_end.  We don't have
 907    to take care of coding->source which will be relocated.  It is
 908    handled by calling coding_set_source in encode_coding.  */
 909
 910 #define ASSURE_DESTINATION(bytes)                               \
 911   do {                                                          \
 912     if (dst + (bytes) >= dst_end)                               \
 913       {                                                         \
 914         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 915                                                                 \
 916         dst = alloc_destination (coding, more_bytes, dst);      \
 917         dst_end = coding->destination + coding->dst_bytes;      \
 918       }                                                         \
 919   } while (0)
 920
 921
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is now a plain alias of
   CHAR_STRING_ADVANCE.  It used to be a variant that never called
   MAYBE_UNIFY_CHAR, but nowadays CHAR_STRING_ADVANCE doesn't call
   MAYBE_UNIFY_CHAR either.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 928
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of that multibyte form.  This is now
   a plain alias of STRING_CHAR_ADVANCE.  It used to be a variant that
   never called MAYBE_UNIFY_CHAR, but nowadays STRING_CHAR_ADVANCE
   doesn't call MAYBE_UNIFY_CHAR either.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 935
 936 /* Set coding->source from coding->src_object.  */
 937
 938 static void
 939 coding_set_source (struct coding_system *coding)
 940 {
 941   if (BUFFERP (coding->src_object))
 942     {
 943       struct buffer *buf = XBUFFER (coding->src_object);
 944
 945       if (coding->src_pos < 0)
 946         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 947       else
 948         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 949     }
 950   else if (STRINGP (coding->src_object))
 951     {
 952       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 953     }
 954   else
 955     {
 956       /* Otherwise, the source is C string and is never relocated
 957          automatically.  Thus we don't have to update anything.  */
 958     }
 959 }
 960
 961
 962 /* Set coding->source from coding->src_object, and return how many
 963    bytes coding->source was changed.  */
 964
 965 static ptrdiff_t
 966 coding_change_source (struct coding_system *coding)
 967 {
 968   const unsigned char *orig = coding->source;
 969   coding_set_source (coding);
 970   return coding->source - orig;
 971 }
 972
 973
 974 /* Set coding->destination from coding->dst_object.  */
 975
 976 static void
 977 coding_set_destination (struct coding_system *coding)
 978 {
 979   if (BUFFERP (coding->dst_object))
 980     {
 981       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 982         {
 983           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 984           coding->dst_bytes = (GAP_END_ADDR
 985                                - (coding->src_bytes - coding->consumed)
 986                                - coding->destination);
 987         }
 988       else
 989         {
 990           /* We are sure that coding->dst_pos_byte is before the gap
 991              of the buffer. */
 992           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 993                                  + coding->dst_pos_byte - BEG_BYTE);
 994           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 995                                - coding->destination);
 996         }
 997     }
 998   else
 999     {
1000       /* Otherwise, the destination is C string and is never relocated
1001          automatically.  Thus we don't have to update anything.  */
1002     }
1003 }
1004
1005
1006 /* Set coding->destination from coding->dst_object, and return how
1007    many bytes coding->destination was changed.  */
1008
1009 static ptrdiff_t
1010 coding_change_destination (struct coding_system *coding)
1011 {
1012   const unsigned char *orig = coding->destination;
1013   coding_set_destination (coding);
1014   return coding->destination - orig;
1015 }
1016
1017
1018 static void
1019 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1020 {
1021   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1022     string_overflow ();
1023   coding->destination = xrealloc (coding->destination,
1024                                   coding->dst_bytes + bytes);
1025   coding->dst_bytes += bytes;
1026 }
1027
/* Make room for BYTES more bytes in the destination buffer of CODING
   by calling make_gap.  GAP_HEAD_USED is consulted only when the
   source and the destination are the same object; the caller in this
   file passes the distance from the gap start to the current output
   position, i.e. the part of the gap already holding produced
   data.  */

static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Move the gap start past the produced data, and pretend that
         the whole old gap is buffer text.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the adjustments above in reverse order, so that the old
         gap becomes part of the (now larger) gap again.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    {
      Lisp_Object this_buffer;

      /* make_gap is called with the destination buffer made current;
         switch back to the previously current buffer afterwards.  */
      this_buffer = Fcurrent_buffer ();
      set_buffer_internal (XBUFFER (coding->dst_object));
      make_gap (bytes);
      set_buffer_internal (XBUFFER (this_buffer));
    }
}
1056
1057
1058 static unsigned char *
1059 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1060                    unsigned char *dst)
1061 {
1062   ptrdiff_t offset = dst - coding->destination;
1063
1064   if (BUFFERP (coding->dst_object))
1065     {
1066       struct buffer *buf = XBUFFER (coding->dst_object);
1067
1068       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1069     }
1070   else
1071     coding_alloc_by_realloc (coding, nbytes);
1072   coding_set_destination (coding);
1073   dst = coding->destination + offset;
1074   return dst;
1075 }
1076
1077 /** Macros for annotations.  */
1078
1079 /* An annotation data is stored in the array coding->charbuf in this
1080    format:
1081      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1082    LENGTH is the number of elements in the annotation.
1083    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1084    NCHARS is the number of characters in the text annotated.
1085
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1090      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1091
1092    NBYTES is the number of bytes specified in the header part of
1093    old-style emacs-mule encoding, or 0 for the other kind of
1094    composition.
1095
1096    METHOD is one of enum composition_method.
1097
1098    Optional COMPOSITION-COMPONENTS are characters and composition
1099    rules.
1100
1101    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102    follows.
1103
1104    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1105    recover from an invalid annotation, and should be skipped by
1106    produce_annotation.  */
1107
/* Maximum length, in charbuf elements, of the header of annotation
   data.  */
#define MAX_ANNOTATION_LENGTH 5
1110
/* Store at BUF the three elements common to every annotation (-LEN,
   MASK and NCHARS, in that order), advance BUF past them, and set
   coding->annotated.  The caller must have the variable `coding' in
   scope.

   The definition deliberately does not end with a semicolon: the
   user supplies it, so that the macro behaves as one statement
   everywhere, including as the body of an unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1118
/* Store at BUF the header of a composition annotation covering
   NCHARS characters: the common annotation elements with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   BUF is advanced past the five elements.  The macro parameters are
   parenthesized in the same way as in ADD_ANNOTATION_DATA, so that
   every store goes through the same lvalue.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1125
1126
/* Store at BUF a charset annotation covering NCHARS characters: the
   common annotation elements with length 4 and
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  BUF is
   advanced past the four elements.  The macro parameters are
   parenthesized in the same way as in ADD_ANNOTATION_DATA, so that
   every store goes through the same lvalue.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1132
1133 \f
1134 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1135
1136
1137
1138 \f
1139 /*** 3. UTF-8 ***/
1140
1141 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1142    Return true if a text is encoded in UTF-8.  */
1143
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx, i.e. is a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx, the leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx, the leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx, the leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx, the leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM), i.e. the UTF-8
   encoding of U+FEFF.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1154
/* Detector for the UTF-8 categories.  Scan the source of CODING and
   record the verdict in DETECT_INFO.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a sequence that is not well-formed UTF-8 is seen, or if the
   last block ends in the middle of a sequence.  Otherwise return 1
   once the source is exhausted; in that case DETECT_INFO->found and
   DETECT_INFO->rejected tell whether a signature (BOM) and/or any
   multi-octet sequence was seen.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  /* src, src_end, multibytep and consumed_chars are also used
     implicitly by ONE_MORE_BYTE.  src_base marks the start of the
     sequence being examined.  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Each iteration examines one sequence.  `continue' accepts the
     sequence, `break' rejects the whole text.  The loop is otherwise
     left only through ONE_MORE_BYTE, which is defined outside this
     excerpt and evidently jumps to `no_more_source' when the input
     is exhausted.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value (NOTE(review): presumably a non-byte
         character of a multibyte source -- confirm against
         ONE_MORE_BYTE) or a one-octet sequence needs no further
         check.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a signature at the very beginning of the source
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* The leading octet matches none of the forms above.  */
      break;
    }
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* src_base < src means that the input ended in the middle of a
     sequence.  That is an error only if no more input will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean
         a BOM, so report both the with-signature and the
         without-signature category as found.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      /* Without any multi-octet sequence, nothing UTF-8 specific
         was seen, so nothing is reported as found.  */
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1237
1238
/* Decode the not yet consumed part of the source of CODING as UTF-8,
   appending the decoded characters to CODING->charbuf.  Decoding
   stops when the source is exhausted or charbuf is full.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  An octet that does not start a valid sequence is
   emitted on its own (converted with BYTE8_TO_CHAR unless it is
   ASCII) and counted in CODING->errors.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  /* src, src_end, multibytep and consumed_chars are also used
     implicitly by ONE_MORE_BYTE, which is defined outside this
     excerpt and evidently jumps to `no_more_source' when the input
     is exhausted.  */
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence being decoded; everything before it is
     reported as consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The octet read ahead after a CR when eol_dos is set, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip a signature (EF BB BF) at the current position.  In
         every other case rewind SRC so that the octets are decoded
         as ordinary text.
         NOTE(review): consumed_chars is not restored when SRC is
         rewound here, unlike at `invalid_code' below -- confirm
         against ONE_MORE_BYTE whether that matters.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* From now on no signature is looked for any more.  This also
     covers the assignment made above when a signature was found.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The octet read ahead after a CR has not been decoded
             yet; report it as not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* NOTE(review): presumably the negated code of a non-byte
             character found in a multibyte source -- confirm
             against ONE_MORE_BYTE.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line conversion, read the octet after a
             CR right away and keep it for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Reject codes above MAX_CHAR as well as
                             overlong forms.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and emit just
         its first octet; decoding resumes at the octet after it.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report only what precedes SRC_BASE as consumed, so that an
     incomplete trailing sequence is left for a later call.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1388
1389
1390 static bool
1391 encode_coding_utf_8 (struct coding_system *coding)
1392 {
1393   bool multibytep = coding->dst_multibyte;
1394   int *charbuf = coding->charbuf;
1395   int *charbuf_end = charbuf + coding->charbuf_used;
1396   unsigned char *dst = coding->destination + coding->produced;
1397   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1398   ptrdiff_t produced_chars = 0;
1399   int c;
1400
1401   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1402     {
1403       ASSURE_DESTINATION (3);
1404       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1405       CODING_UTF_8_BOM (coding) = utf_without_bom;
1406     }
1407
1408   if (multibytep)
1409     {
1410       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1411
1412       while (charbuf < charbuf_end)
1413         {
1414           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1415
1416           ASSURE_DESTINATION (safe_room);
1417           c = *charbuf++;
1418           if (CHAR_BYTE8_P (c))
1419             {
1420               c = CHAR_TO_BYTE8 (c);
1421               EMIT_ONE_BYTE (c);
1422             }
1423           else
1424             {
1425               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1426               for (p = str; p < pend; p++)
1427                 EMIT_ONE_BYTE (*p);
1428             }
1429         }
1430     }
1431   else
1432     {
1433       int safe_room = MAX_MULTIBYTE_LENGTH;
1434
1435       while (charbuf < charbuf_end)
1436         {
1437           ASSURE_DESTINATION (safe_room);
1438           c = *charbuf++;
1439           if (CHAR_BYTE8_P (c))
1440             *dst++ = CHAR_TO_BYTE8 (c);
1441           else
1442             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1443           produced_chars++;
1444         }
1445     }
1446   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1447   coding->produced_char += produced_chars;
1448   coding->produced = dst - coding->destination;
1449   return 0;
1450 }
1451
1452
1453 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1454    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1455
/* True if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1461
1462
/* Detector for the UTF-16 categories.  Look at the source of CODING
   two bytes at a time and record the verdict in DETECT_INFO.

   Return 1 if the source starts with a BOM, or if the source runs out
   while a pair of bytes is being read.  Return 0 if the last block
   has an odd coding->src_chars, if a negative value is read, or when
   scanning of BOM-less text stops for another reason than running out
   of source.  DETECT_INFO->found and DETECT_INFO->rejected are updated
   along the way.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  /* src, src_end and multibytep are also used implicitly by
     TWO_MORE_BYTES, which jumps to `no_more_source' when the input is
     exhausted.  */
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* The last block cannot be UTF-16 if its length is odd.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian BOM.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* No BOM.  We check the dispersion of the bytes at even
         positions (E) and of those at odd positions (O): e[] and o[]
         flag the byte values seen so far, e_num and o_num count the
         distinct ones.  If both counts get high, we assume binary
         data.
         NOTE(review): only c2 is tested for a negative value before
         c1 is used as an array index -- confirm that TWO_MORE_BYTES
         cannot leave c1 negative while c2 is not.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* Without a BOM, the categories that require one are out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Go on until every UTF-16 category has been rejected, a
         negative value is read, or the source runs out (in which
         case TWO_MORE_BYTES jumps to `no_more_source').  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* Too many distinct values at even positions.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* Too many distinct values at odd positions.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1545
1546 static void
1547 decode_coding_utf_16 (struct coding_system *coding)
1548 {
1549   const unsigned char *src = coding->source + coding->consumed;
1550   const unsigned char *src_end = coding->source + coding->src_bytes;
1551   const unsigned char *src_base;
1552   int *charbuf = coding->charbuf + coding->charbuf_used;
1553   /* We may produces at most 3 chars in one loop.  */
1554   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1555   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1556   bool multibytep = coding->src_multibyte;
1557   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1558   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1559   int surrogate = CODING_UTF_16_SURROGATE (coding);
1560   bool eol_dos
1561     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1562   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1563
1564   if (bom == utf_with_bom)
1565     {
1566       int c, c1, c2;
1567
1568       src_base = src;
1569       ONE_MORE_BYTE (c1);
1570       ONE_MORE_BYTE (c2);
1571       c = (c1 << 8) | c2;
1572
1573       if (endian == utf_16_big_endian
1574           ? c != 0xFEFF : c != 0xFFFE)
1575         {
1576           /* The first two bytes are not BOM.  Treat them as bytes
1577              for a normal character.  */
1578           src = src_base;
1579           coding->errors++;
1580         }
1581       CODING_UTF_16_BOM (coding) = utf_without_bom;
1582     }
1583   else if (bom == utf_detect_bom)
1584     {
1585       /* We have already tried to detect BOM and failed in
1586          detect_coding.  */
1587       CODING_UTF_16_BOM (coding) = utf_without_bom;
1588     }
1589
1590   while (1)
1591     {
1592       int c, c1, c2;
1593
1594       src_base = src;
1595       consumed_chars_base = consumed_chars;
1596
1597       if (charbuf >= charbuf_end)
1598         {
1599           if (byte_after_cr1 >= 0)
1600             src_base -= 2;
1601           break;
1602         }
1603
1604       if (byte_after_cr1 >= 0)
1605         c1 = byte_after_cr1, byte_after_cr1 = -1;
1606       else
1607         ONE_MORE_BYTE (c1);
1608       if (c1 < 0)
1609         {
1610           *charbuf++ = -c1;
1611           continue;
1612         }
1613       if (byte_after_cr2 >= 0)
1614         c2 = byte_after_cr2, byte_after_cr2 = -1;
1615       else
1616         ONE_MORE_BYTE (c2);
1617       if (c2 < 0)
1618         {
1619           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1620           *charbuf++ = -c2;
1621           continue;
1622         }
1623       c = (endian == utf_16_big_endian
1624            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1625
1626       if (surrogate)
1627         {
1628           if (! UTF_16_LOW_SURROGATE_P (c))
1629             {
1630               if (endian == utf_16_big_endian)
1631                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1632               else
1633                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1634               *charbuf++ = c1;
1635               *charbuf++ = c2;
1636               coding->errors++;
1637               if (UTF_16_HIGH_SURROGATE_P (c))
1638                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1639               else
1640                 *charbuf++ = c;
1641             }
1642           else
1643             {
1644               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1645               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1646               *charbuf++ = 0x10000 + c;
1647             }
1648         }
1649       else
1650         {
1651           if (UTF_16_HIGH_SURROGATE_P (c))
1652             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1653           else
1654             {
1655               if (eol_dos && c == '\r')
1656                 {
1657                   ONE_MORE_BYTE (byte_after_cr1);
1658                   ONE_MORE_BYTE (byte_after_cr2);
1659                 }
1660               *charbuf++ = c;
1661             }
1662         }
1663     }
1664
1665  no_more_source:
1666   coding->consumed_char += consumed_chars_base;
1667   coding->consumed = src_base - coding->source;
1668   coding->charbuf_used = charbuf - coding->charbuf;
1669 }
1670
1671 static bool
1672 encode_coding_utf_16 (struct coding_system *coding)
1673 {
1674   bool multibytep = coding->dst_multibyte;
1675   int *charbuf = coding->charbuf;
1676   int *charbuf_end = charbuf + coding->charbuf_used;
1677   unsigned char *dst = coding->destination + coding->produced;
1678   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1679   int safe_room = 8;
1680   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1681   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1682   ptrdiff_t produced_chars = 0;
1683   int c;
1684
1685   if (bom != utf_without_bom)
1686     {
1687       ASSURE_DESTINATION (safe_room);
1688       if (big_endian)
1689         EMIT_TWO_BYTES (0xFE, 0xFF);
1690       else
1691         EMIT_TWO_BYTES (0xFF, 0xFE);
1692       CODING_UTF_16_BOM (coding) = utf_without_bom;
1693     }
1694
1695   while (charbuf < charbuf_end)
1696     {
1697       ASSURE_DESTINATION (safe_room);
1698       c = *charbuf++;
1699       if (c > MAX_UNICODE_CHAR)
1700         c = coding->default_char;
1701
1702       if (c < 0x10000)
1703         {
1704           if (big_endian)
1705             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1706           else
1707             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1708         }
1709       else
1710         {
1711           int c1, c2;
1712
1713           c -= 0x10000;
1714           c1 = (c >> 10) + 0xD800;
1715           c2 = (c & 0x3FF) + 0xDC00;
1716           if (big_endian)
1717             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1718           else
1719             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1720         }
1721     }
1722   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1723   coding->produced = dst - coding->destination;
1724   coding->produced_char += produced_chars;
1725   return 0;
1726 }
1727
1728 \f
1729 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1730
1731 /* Emacs' internal format for representation of multiple character
1732    sets is a kind of multi-byte encoding, i.e. characters are
1733    represented by variable-length sequences of one-byte codes.
1734
1735    ASCII characters and control characters (e.g. `tab', `newline') are
1736    represented by one-byte sequences which are their ASCII codes, in
1737    the range 0x00 through 0x7F.
1738
1739    8-bit characters of the range 0x80..0x9F are represented by
1740    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1741    code + 0x20).
1742
1743    8-bit characters of the range 0xA0..0xFF are represented by
1744    one-byte sequences which are their 8-bit code.
1745
1746    The other characters are represented by a sequence of `base
1747    leading-code', optional `extended leading-code', and one or two
1748    `position-code's.  The length of the sequence is determined by the
1749    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1750    whereas extended leading-code and position-code take the range 0xA0
1751    through 0xFF.  See `charset.h' for more details about leading-code
1752    and position-code.
1753
1754    --- CODE RANGE of Emacs' internal format ---
1755    character set        range
1756    -------------        -----
1757    ascii                0x00..0x7F
1758    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1759    eight-bit-graphic    0xA0..0xFF
1760    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1761    ---------------------------------------------
1762
1763    As this is the internal character representation, the format is
1764    usually not used externally (i.e. in a file or in a data sent to a
1765    process).  But, it is possible to have a text externally in this
1766    format (i.e. by encoding by the coding system `emacs-mule').
1767
1768    In that case, a sequence of one-byte codes has a slightly different
1769    form.
1770
1771    At first, all characters in eight-bit-control are represented by
1772    one-byte sequences which are their 8-bit code.
1773
1774    Next, character composition data are represented by the byte
1775    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1776    where,
1777         METHOD is 0xF2 plus one of the composition methods (enum
1778         composition_method),
1779
1780         BYTES is 0xA0 plus a byte length of this composition data,
1781
1782         CHARS is 0xA0 plus a number of characters composed by this
1783         data,
1784
1785         COMPONENTs are characters of multibyte form or composition
1786         rules encoded by two bytes of ASCII codes.
1787
1788    In addition, for backward compatibility, the following formats are
1789    also recognized as composition data on decoding.
1790
1791    0x80 MSEQ ...
1792    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1793
1794    Here,
1795         MSEQ is a multibyte form but in these special formats:
1796           ASCII: 0xA0 ASCII_CODE+0x80,
1797           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1798         RULE is a one byte code of the range 0xA0..0xF0 that
1799         represents a composition rule.
1800   */
1801
/* Table indexed by a byte value.  The code in this file reads an entry
   as the total number of bytes (1 to 4) of the emacs-mule sequence
   that starts with that byte: detect_coding_emacs_mule expects
   emacs_mule_bytes[C] - 1 further bytes after a leading byte C, and
   emacs_mule_char switches on emacs_mule_bytes[C] with cases 1
   through 4 and aborts on any other value.
   NOTE(review): the table is only declared here; presumably it is
   filled in elsewhere -- confirm.  */
1802 char emacs_mule_bytes[256];
1803
1804
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806    Return true if a text is encoded in 'emacs-mule'.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Scanning starts CODING->head_ascii bytes into CODING->source.

        The function returns 0 and adds CATEGORY_MASK_EMACS_MULE to
        DETECT_INFO->rejected when
          - ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO is found,
          - a run that follows a 0x80 starter ends after 4 or fewer
            bytes (the terminating byte included),
          - a leading byte is not followed by as many bytes >= 0xA0 as
            emacs_mule_bytes asks for, or
          - the source ends in the middle of a sequence and CODING->mode
            has CODING_MODE_LAST_BLOCK set.

        Otherwise, when the source is exhausted, it returns 1 and adds
        CATEGORY_MASK_EMACS_MULE to DETECT_INFO->found only if at least
        one composition run or non-ASCII sequence was accepted.  */
1807
1808 static bool
1809 detect_coding_emacs_mule (struct coding_system *coding,
1810                           struct coding_detection_info *detect_info)
1811 {
1812   const unsigned char *src = coding->source, *src_base;
1813   const unsigned char *src_end = coding->source + coding->src_bytes;
1814   bool multibytep = coding->src_multibyte;
1815   ptrdiff_t consumed_chars = 0;
1816   int c;
       /* Either 0 or CATEGORY_MASK_EMACS_MULE; merged into
          DETECT_INFO->found at the end.  */
1817   int found = 0;
1818
1819   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1820   /* A coding system of this category is always ASCII compatible.  */
1821   src += coding->head_ascii;
1822
1823   while (1)
1824     {
           /* Remember where this sequence starts; the end-of-source
              handler compares SRC_BASE with SRC to tell whether the
              input stopped in the middle of a sequence.  */
1825       src_base = src;
1826       ONE_MORE_BYTE (c);
           /* NOTE(review): a negative C presumably stands for a
              multibyte character delivered by ONE_MORE_BYTE -- confirm
              against the macro.  Such input is simply skipped.  */
1827       if (c < 0)
1828         continue;
1829       if (c == 0x80)
1830         {
1831           /* Perhaps the start of composite character.  We simply skip
1832              it because analyzing it is too heavy for detecting.  But,
1833              at least, we check that the composite character
1834              constitutes of more than 4 bytes.  */
1835           const unsigned char *src_start;
1836
1837         repeat:
1838           src_start = src;
1839           do
1840             {
1841               ONE_MORE_BYTE (c);
1842             }
1843           while (c >= 0xA0);
1844
               /* Too short for a composition: leave the loop and
                  reject.  */
1845           if (src - src_start <= 4)
1846             break;
1847           found = CATEGORY_MASK_EMACS_MULE;
               /* The byte that ended the run starts another
                  composition.  */
1848           if (c == 0x80)
1849             goto repeat;
1850         }
1851
           /* Here C is the byte that follows a composition run, or the
              byte read at the top of the loop.  */
1852       if (c < 0x80)
1853         {
1854           if (c < 0x20
1855               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1856             break;
1857         }
1858       else
1859         {
               /* Number of bytes that must follow the leading byte C.  */
1860           int more_bytes = emacs_mule_bytes[c] - 1;
1861
1862           while (more_bytes > 0)
1863             {
1864               ONE_MORE_BYTE (c);
1865               if (c < 0xA0)
1866                 {
1867                   src--;        /* Unread the last byte.  */
1868                   break;
1869                 }
1870               more_bytes--;
1871             }
               /* The sequence was cut short by a byte below 0xA0:
                  reject.  */
1872           if (more_bytes != 0)
1873             break;
1874           found = CATEGORY_MASK_EMACS_MULE;
1875         }
1876     }
       /* Reached only through a `break' above, i.e. on invalid input.  */
1877   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1878   return 0;
1879
1880  no_more_source:
1881   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1882     {
1883       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1884       return 0;
1885     }
1886   detect_info->found |= found;
1887   return 1;
1888 }
1889
1890
1891 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1892    character.  If CMP_STATUS indicates that we must expect MSEQ or
1893    RULE described above, decode it and return the negative value of
1894    the decoded character or rule.  If an invalid byte is found, return
1895    -1.  If SRC is too short, return -2.

        On success *NBYTES is set to the number of source bytes read and
        *NCHARS to the value of the local counter CONSUMED_CHARS.  If ID
        is non-null, *ID is set to the charset ID of the decoded
        character.  When a byte >= 0xA0 is returned negated as a RULE
        (old-form composition whose state is not COMPOSING_CHAR), *NBYTES
        and *NCHARS are set but *ID is left untouched.  When -1 or -2 is
        returned none of the output arguments is written.

        The end of the source is taken from CODING (CODING->source +
        CODING->src_bytes), not from SRC.  */
1896
1897 static int
1898 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1899                  int *nbytes, int *nchars, int *id,
1900                  struct composition_status *cmp_status)
1901 {
1902   const unsigned char *src_end = coding->source + coding->src_bytes;
1903   const unsigned char *src_base = src;
1904   bool multibytep = coding->src_multibyte;
1905   int charset_ID;
1906   unsigned code;
1907   int c;
1908   int consumed_chars = 0;
       /* True when the sequence was an old-form MSEQ, in which case
          the decoded character is returned negated.  */
1909   bool mseq_found = 0;
1910
1911   ONE_MORE_BYTE (c);
1912   if (c < 0)
1913     {
           /* A negative value from ONE_MORE_BYTE is negated and
              returned as the character, attributed to
              emacs_mule_charset[0].  */
1914       c = -c;
1915       charset_ID = emacs_mule_charset[0];
1916     }
1917   else
1918     {
           /* A first byte of 0xA0 or above is accepted only inside an
              old-form composition; anywhere else it is invalid.  */
1919       if (c >= 0xA0)
1920         {
1921           if (cmp_status->state != COMPOSING_NO
1922               && cmp_status->old_form)
1923             {
1924               if (cmp_status->state == COMPOSING_CHAR)
1925                 {
                       /* MSEQ: 0xA0 is followed by ASCII_CODE+0x80;
                          any other byte is LEADING_CODE+0x20.  */
1926                   if (c == 0xA0)
1927                     {
1928                       ONE_MORE_BYTE (c);
1929                       c -= 0x80;
1930                       if (c < 0)
1931                         goto invalid_code;
1932                     }
1933                   else
1934                     c -= 0x20;
1935                   mseq_found = 1;
1936                 }
1937               else
1938                 {
                       /* Not expecting a character: hand the raw byte
                          back, negated, for the caller to interpret.  */
1939                   *nbytes = src - src_base;
1940                   *nchars = consumed_chars;
1941                   return -c;
1942                 }
1943             }
1944           else
1945             goto invalid_code;
1946         }
1947
           /* Dispatch on the total sequence length recorded for the
              leading byte C.  */
1948       switch (emacs_mule_bytes[c])
1949         {
1950         case 2:
               /* Leading byte, then one position byte.  */
1951           if ((charset_ID = emacs_mule_charset[c]) < 0)
1952             goto invalid_code;
1953           ONE_MORE_BYTE (c);
1954           if (c < 0xA0)
1955             goto invalid_code;
1956           code = c & 0x7F;
1957           break;
1958
1959         case 3:
1960           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1961               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1962             {
                   /* The byte after the private leading code selects
                      the charset; one position byte follows.  */
1963               ONE_MORE_BYTE (c);
1964               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1965                 goto invalid_code;
1966               ONE_MORE_BYTE (c);
1967               if (c < 0xA0)
1968                 goto invalid_code;
1969               code = c & 0x7F;
1970             }
1971           else
1972             {
                   /* The leading byte selects the charset; two
                      position bytes follow.  */
1973               if ((charset_ID = emacs_mule_charset[c]) < 0)
1974                 goto invalid_code;
1975               ONE_MORE_BYTE (c);
1976               if (c < 0xA0)
1977                 goto invalid_code;
1978               code = (c & 0x7F) << 8;
1979               ONE_MORE_BYTE (c);
1980               if (c < 0xA0)
1981                 goto invalid_code;
1982               code |= c & 0x7F;
1983             }
1984           break;
1985
1986         case 4:
               /* The second byte selects the charset; two position
                  bytes follow.  */
1987           ONE_MORE_BYTE (c);
1988           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1989             goto invalid_code;
1990           ONE_MORE_BYTE (c);
1991           if (c < 0xA0)
1992             goto invalid_code;
1993           code = (c & 0x7F) << 8;
1994           ONE_MORE_BYTE (c);
1995           if (c < 0xA0)
1996             goto invalid_code;
1997           code |= c & 0x7F;
1998           break;
1999
2000         case 1:
               /* A single byte stands for itself.  */
2001           code = c;
2002           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2003           break;
2004
2005         default:
               /* emacs_mule_bytes holds a length other than 1..4.  */
2006           emacs_abort ();
2007         }
2008       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2009                           CHARSET_FROM_ID (charset_ID), code, c);
           /* A negative result from CODING_DECODE_CHAR is reported as
              an invalid code.  */
2010       if (c < 0)
2011         goto invalid_code;
2012     }
2013   *nbytes = src - src_base;
2014   *nchars = consumed_chars;
2015   if (id)
2016     *id = charset_ID;
2017   return (mseq_found ? -c : c);
2018
2019  no_more_source:
2020   return -2;
2021
2022  invalid_code:
2023   return -1;
2024 }
2025
2026
2027 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2028
2029 /* Handle these composition sequences ('|': the end of header elements,
2030    BYTES and CHARS >= 0xA0):
2031
2032    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2033    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2034    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2035
2036    and these old forms:
2037
2038    (4) relative composition: 0x80 | MSEQ ... MSEQ
2039    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2040
2041    When the starter 0x80 and the following header elements are found,
2042    this annotation header is produced.
2043
2044         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2045
2046    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2047    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2048
2049    Then, upon reading the following elements, these codes are produced
2050    until the composition end is found:
2051
2052    (1) CHAR ... CHAR
2053    (2) ALT ... ALT CHAR ... CHAR
2054    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2055    (4) CHAR ... CHAR
2056    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2057
2058    When the composition end is found, LENGTH and NCHARS in the
2059    annotation header are updated as below:
2060
2061    (1) LENGTH: unchanged, NCHARS: unchanged
2062    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2063    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2064    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2065    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2066
2067    If an error is found while composing, the annotation header is
2068    changed to the original composition header (plus filler -1s) as
2069    below:
2070
2071    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2072    (5)          [ 0x80 0xFF -1 -1- -1 ]
2073
2074    and the sequence [ -2 DECODED-RULE ] is changed to the original
2075    byte sequence as below:
2076         o the original byte sequence is B: [ B -1 ]
2077         o the original byte sequence is B1 B2: [ B1 B2 ]
2078
2079    Most of the routines are implemented by macros because many
2080    variables and labels in the caller decode_coding_emacs_mule must be
2081    accessible, and they are usually called just once (thus they don't
2082    increase the size of compiled object).  */
2083
2084 /* Decode a composition rule represented by C as a component of
2085    composition sequence of Emacs 20 style.  Set RULE to the decoded
2086    rule. */
/* C is modified in place: 0xA0 is subtracted from it.  Unless the
   result is in the range 0..80 the macro jumps to the label
   `invalid_code', which the enclosing function must provide.  The two
   reference points are C / 9 and C % 9; a value of 4 for either is
   replaced by 10 before both are passed to COMPOSITION_ENCODE_RULE.  */
2087
2088 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2089   do {                                                  \
2090     int gref, nref;                                     \
2091                                                         \
2092     c -= 0xA0;                                          \
2093     if (c < 0 || c >= 81)                               \
2094       goto invalid_code;                                \
2095     gref = c / 9, nref = c % 9;                         \
2096     if (gref == 4) gref = 10;                           \
2097     if (nref == 4) nref = 10;                           \
2098     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2099   } while (0)
2100
2101
2102 /* Decode a composition rule represented by C and the following byte
2103    at SRC as a component of composition sequence of Emacs 21 style.
2104    Set RULE to the decoded rule.  */
/* The first reference point is C - 0x20.  The second one is read into
   C with ONE_MORE_BYTE (so C is overwritten) and is likewise reduced
   by 0x20.  Unless both are in the range 0..80 the macro jumps to the
   label `invalid_code', which the enclosing function must provide.
   Everything ONE_MORE_BYTE needs from the caller is needed here too.  */
2105
2106 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2107   do {                                                  \
2108     int gref, nref;                                     \
2109                                                         \
2110     gref = c - 0x20;                                    \
2111     if (gref < 0 || gref >= 81)                         \
2112       goto invalid_code;                                \
2113     ONE_MORE_BYTE (c);                                  \
2114     nref = c - 0x20;                                    \
2115     if (nref < 0 || nref >= 81)                         \
2116       goto invalid_code;                                \
2117     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2118   } while (0)
2119
2120
2121 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2122    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2123    byte length of this composition information, CHARS is the number of
2124    characters composed by this composition.  */
/* As written, the macro expects the METHOD byte to be in the caller's
   variable C already (the method is C - 0xF2) and reads only the
   BYTES and CHARS bytes itself with ONE_MORE_BYTE.

   It jumps to the caller's label `invalid_code' when the BYTES byte is
   negative, when BYTES - 0xA0 is less than 3, when the method is
   COMPOSITION_RELATIVE and BYTES - 0xA0 is not 4, or when CHARS - 0xA0
   is not in the range 1..MAX_COMPOSITION_COMPONENTS - 1.

   Otherwise it initializes the caller's CMP_STATUS (old_form 0, the
   method, state COMPOSING_CHAR for a relative composition and
   COMPOSING_COMPONENT_CHAR for the other methods, length
   MAX_ANNOTATION_LENGTH, nchars, and ncomps = BYTES - 0xA0 - 4) and
   calls ADD_COMPOSITION_DATA on the caller's CHARBUF.  */
2125
2126 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2127   do {                                                                  \
2128     enum composition_method method = c - 0xF2;                          \
2129     int nbytes, nchars;                                                 \
2130                                                                         \
2131     ONE_MORE_BYTE (c);                                                  \
2132     if (c < 0)                                                          \
2133       goto invalid_code;                                                \
2134     nbytes = c - 0xA0;                                                  \
2135     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2136       goto invalid_code;                                                \
2137     ONE_MORE_BYTE (c);                                                  \
2138     nchars = c - 0xA0;                                                  \
2139     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2140       goto invalid_code;                                                \
2141     cmp_status->old_form = 0;                                           \
2142     cmp_status->method = method;                                        \
2143     if (method == COMPOSITION_RELATIVE)                                 \
2144       cmp_status->state = COMPOSING_CHAR;                               \
2145     else                                                                \
2146       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2147     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2148     cmp_status->nchars = nchars;                                        \
2149     cmp_status->ncomps = nbytes - 4;                                    \
2150     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2151   } while (0)
2152
2153
2154 /* Start of Emacs 20 style format for relative composition.  */
/* Marks the caller's CMP_STATUS as an old-form COMPOSITION_RELATIVE
   composition in state COMPOSING_CHAR, with length
   MAX_ANNOTATION_LENGTH and zero nchars and ncomps, and calls
   ADD_COMPOSITION_DATA on the caller's CHARBUF with 0 for both NCHARS
   and NBYTES.  No source byte is read here.  */
2155
2156 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2157   do {                                                          \
2158     cmp_status->old_form = 1;                                   \
2159     cmp_status->method = COMPOSITION_RELATIVE;                  \
2160     cmp_status->state = COMPOSING_CHAR;                         \
2161     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2162     cmp_status->nchars = cmp_status->ncomps = 0;                \
2163     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2164   } while (0)
2165
2166
2167 /* Start of Emacs 20 style format for rule-base composition.  */
/* Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
   method recorded in the caller's CMP_STATUS is
   COMPOSITION_WITH_RULE.  The state still starts as COMPOSING_CHAR.
   No source byte is read here.  */
2168
2169 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2170   do {                                                          \
2171     cmp_status->old_form = 1;                                   \
2172     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2173     cmp_status->state = COMPOSING_CHAR;                         \
2174     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2175     cmp_status->nchars = cmp_status->ncomps = 0;                \
2176     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2177   } while (0)
2178
2179
/* Parse what follows a composition starter.  One byte is read into the
   caller's C with ONE_MORE_BYTE, then:

     - if C - 0xF2 lies between COMPOSITION_RELATIVE and
       COMPOSITION_WITH_RULE_ALTCHARS, DECODE_EMACS_MULE_21_COMPOSITION
       handles the rest of the header;
     - if 0xA0 <= C < 0xC0, an old-form relative composition is started
       and SRC is put back to its value on entry, so that the byte is
       read again as the first component;
     - if C is 0xFF, an old-form rule-base composition is started;
     - any other value, a negative one included, jumps to the label
       `invalid_code'.

   The enclosing function must provide C, SRC, that label, and whatever
   the macros used here require.  */
2180 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2181   do {                                                  \
2182     const unsigned char *current_src = src;             \
2183                                                         \
2184     ONE_MORE_BYTE (c);                                  \
2185     if (c < 0)                                          \
2186       goto invalid_code;                                \
2187     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2188         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2189       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2190     else if (c < 0xA0)                                  \
2191       goto invalid_code;                                \
2192     else if (c < 0xC0)                                  \
2193       {                                                 \
2194         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2195         /* Re-read C as a composition component.  */    \
2196         src = current_src;                              \
2197       }                                                 \
2198     else if (c == 0xFF)                                 \
2199       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2200     else                                                \
2201       goto invalid_code;                                \
2202   } while (0)
2203
/* Close a composition that ended normally.  The annotation header is
   located CMP_STATUS->length elements before the caller's CHARBUF.
   For an old-form composition its third element (NCHARS) is set to
   CMP_STATUS->nchars.  Otherwise, for a method above
   COMPOSITION_RELATIVE, its first element (LENGTH) is set to the
   header's NCHARS minus CMP_STATUS->length.  In every case the state
   becomes COMPOSING_NO.  */
2204 #define EMACS_MULE_COMPOSITION_END()                            \
2205   do {                                                          \
2206     int idx = - cmp_status->length;                             \
2207                                                                 \
2208     if (cmp_status->old_form)                                   \
2209       charbuf[idx + 2] = cmp_status->nchars;                    \
2210     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2211       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2212     cmp_status->state = COMPOSING_NO;                           \
2213   } while (0)
2214
2215
2216 static int
2217 emacs_mule_finish_composition (int *charbuf,
2218                                struct composition_status *cmp_status)
2219 {
2220   int idx = - cmp_status->length;
2221   int new_chars;
2222
2223   if (cmp_status->old_form && cmp_status->nchars > 0)
2224     {
2225       charbuf[idx + 2] = cmp_status->nchars;
2226       new_chars = 0;
2227       if (cmp_status->method == COMPOSITION_WITH_RULE
2228           && cmp_status->state == COMPOSING_CHAR)
2229         {
2230           /* The last rule was invalid.  */
2231           int rule = charbuf[-1] + 0xA0;
2232
2233           charbuf[-2] = BYTE8_TO_CHAR (rule);
2234           charbuf[-1] = -1;
2235           new_chars = 1;
2236         }
2237     }
2238   else
2239     {
2240       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2241
2242       if (cmp_status->method == COMPOSITION_WITH_RULE)
2243         {
2244           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2245           charbuf[idx++] = -3;
2246           charbuf[idx++] = 0;
2247           new_chars = 1;
2248         }
2249       else
2250         {
2251           int nchars = charbuf[idx + 1] + 0xA0;
2252           int nbytes = charbuf[idx + 2] + 0xA0;
2253
2254           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2255           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2256           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2257           charbuf[idx++] = -1;
2258           new_chars = 4;
2259         }
2260     }
2261   cmp_status->state = COMPOSING_NO;
2262   return new_chars;
2263 }
2264
/* If a composition is in progress (CMP_STATUS->state is not
   COMPOSING_NO), finish it with emacs_mule_finish_composition and add
   the value that function returns to the caller's CHAR_OFFSET.  The
   enclosing function must provide CMP_STATUS, CHARBUF and
   CHAR_OFFSET.  */
2265 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2266   do {                                                                    \
2267     if (cmp_status->state != COMPOSING_NO)                                \
2268       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2269   } while (0)
2270
2271
2272 static void
2273 decode_coding_emacs_mule (struct coding_system *coding)
2274 {
2275   const unsigned char *src = coding->source + coding->consumed;
2276   const unsigned char *src_end = coding->source + coding->src_bytes;
2277   const unsigned char *src_base;
2278   int *charbuf = coding->charbuf + coding->charbuf_used;
2279   /* We may produce two annotations (charset and composition) in one
2280      loop and one more charset annotation at the end.  */
2281   int *charbuf_end
2282     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2283       /* We can produce up to 2 characters in a loop.  */
2284       - 1;
2285   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2286   bool multibytep = coding->src_multibyte;
2287   ptrdiff_t char_offset = coding->produced_char;
2288   ptrdiff_t last_offset = char_offset;
2289   int last_id = charset_ascii;
2290   bool eol_dos
2291     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2292   int byte_after_cr = -1;
2293   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2294
2295   if (cmp_status->state != COMPOSING_NO)
2296     {
2297       int i;
2298
2299       if (charbuf_end - charbuf < cmp_status->length)
2300         emacs_abort ();
2301       for (i = 0; i < cmp_status->length; i++)
2302         *charbuf++ = cmp_status->carryover[i];
2303       coding->annotated = 1;
2304     }
2305
2306   while (1)
2307     {
2308       int c, id IF_LINT (= 0);
2309
2310       src_base = src;
2311       consumed_chars_base = consumed_chars;
2312
2313       if (charbuf >= charbuf_end)
2314         {
2315           if (byte_after_cr >= 0)
2316             src_base--;
2317           break;
2318         }
2319
2320       if (byte_after_cr >= 0)
2321         c = byte_after_cr, byte_after_cr = -1;
2322       else
2323         ONE_MORE_BYTE (c);
2324
2325       if (c < 0 || c == 0x80)
2326         {
2327           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2328           if (c < 0)
2329             {
2330               *charbuf++ = -c;
2331               char_offset++;
2332             }
2333           else
2334             DECODE_EMACS_MULE_COMPOSITION_START ();
2335           continue;
2336         }
2337
2338       if (c < 0x80)
2339         {
2340           if (eol_dos && c == '\r')
2341             ONE_MORE_BYTE (byte_after_cr);
2342           id = charset_ascii;
2343           if (cmp_status->state != COMPOSING_NO)
2344             {
2345               if (cmp_status->old_form)
2346                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2347               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2348                 cmp_status->ncomps--;
2349             }
2350         }
2351       else
2352         {
2353           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2354           /* emacs_mule_char can load a charset map from a file, which
2355              allocates a large structure and might cause buffer text
2356              to be relocated as result.  Thus, we need to remember the
2357              original pointer to buffer text, and fix up all related
2358              pointers after the call.  */
2359           const unsigned char *orig = coding->source;
2360           ptrdiff_t offset;
2361
2362           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2363                                cmp_status);
2364           offset = coding->source - orig;
2365           if (offset)
2366             {
2367               src += offset;
2368               src_base += offset;
2369               src_end += offset;
2370             }
2371           if (c < 0)
2372             {
2373               if (c == -1)
2374                 goto invalid_code;
2375               if (c == -2)
2376                 break;
2377             }
2378           src = src_base + nbytes;
2379           consumed_chars = consumed_chars_base + nchars;
2380           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2381             cmp_status->ncomps -= nchars;
2382         }
2383
2384       /* Now if C >= 0, we found a normally encoded character, if C <
2385          0, we found an old-style composition component character or
2386          rule.  */
2387
2388       if (cmp_status->state == COMPOSING_NO)
2389         {
2390           if (last_id != id)
2391             {
2392               if (last_id != charset_ascii)
2393                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2394                                   last_id);
2395               last_id = id;
2396               last_offset = char_offset;
2397             }
2398           *charbuf++ = c;
2399           char_offset++;
2400         }
2401       else if (cmp_status->state == COMPOSING_CHAR)
2402         {
2403           if (cmp_status->old_form)
2404             {
2405               if (c >= 0)
2406                 {
2407                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408                   *charbuf++ = c;
2409                   char_offset++;
2410                 }
2411               else
2412                 {
2413                   *charbuf++ = -c;
2414                   cmp_status->nchars++;
2415                   cmp_status->length++;
2416                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2417                     EMACS_MULE_COMPOSITION_END ();
2418                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2419                     cmp_status->state = COMPOSING_RULE;
2420                 }
2421             }
2422           else
2423             {
2424               *charbuf++ = c;
2425               cmp_status->length++;
2426               cmp_status->nchars--;
2427               if (cmp_status->nchars == 0)
2428                 EMACS_MULE_COMPOSITION_END ();
2429             }
2430         }
2431       else if (cmp_status->state == COMPOSING_RULE)
2432         {
2433           int rule;
2434
2435           if (c >= 0)
2436             {
2437               EMACS_MULE_COMPOSITION_END ();
2438               *charbuf++ = c;
2439               char_offset++;
2440             }
2441           else
2442             {
2443               c = -c;
2444               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2445               if (rule < 0)
2446                 goto invalid_code;
2447               *charbuf++ = -2;
2448               *charbuf++ = rule;
2449               cmp_status->length += 2;
2450               cmp_status->state = COMPOSING_CHAR;
2451             }
2452         }
2453       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2454         {
2455           *charbuf++ = c;
2456           cmp_status->length++;
2457           if (cmp_status->ncomps == 0)
2458             cmp_status->state = COMPOSING_CHAR;
2459           else if (cmp_status->ncomps > 0)
2460             {
2461               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2462                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2463             }
2464           else
2465             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2466         }
2467       else                      /* COMPOSING_COMPONENT_RULE */
2468         {
2469           int rule;
2470
2471           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2472           if (rule < 0)
2473             goto invalid_code;
2474           *charbuf++ = -2;
2475           *charbuf++ = rule;
2476           cmp_status->length += 2;
2477           cmp_status->ncomps--;
2478           if (cmp_status->ncomps > 0)
2479             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2480           else
2481             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482         }
2483       continue;
2484
2485     invalid_code:
2486       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2487       src = src_base;
2488       consumed_chars = consumed_chars_base;
2489       ONE_MORE_BYTE (c);
2490       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2491       char_offset++;
2492       coding->errors++;
2493     }
2494
2495  no_more_source:
2496   if (cmp_status->state != COMPOSING_NO)
2497     {
2498       if (coding->mode & CODING_MODE_LAST_BLOCK)
2499         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2500       else
2501         {
2502           int i;
2503
2504           charbuf -= cmp_status->length;
2505           for (i = 0; i < cmp_status->length; i++)
2506             cmp_status->carryover[i] = charbuf[i];
2507         }
2508     }
2509   if (last_id != charset_ascii)
2510     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2511   coding->consumed_char += consumed_chars_base;
2512   coding->consumed = src_base - coding->source;
2513   coding->charbuf_used = charbuf - coding->charbuf;
2514 }
2515
2516
/* Store in CODES[0] and CODES[1] the emacs-mule leading code(s) for
   the charset whose emacs-mule id is ID.

   If ID is below 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0.  Otherwise CODES[0] is a prefix byte chosen
   by the range ID falls in (0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF,
   0x9C for 0xF0..0xF4, 0x9D for 0xF5 and above) and CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion is a single statement without a
   trailing semicolon; the caller supplies it, which keeps the macro
   usable as the body of an unbraced `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2530
2531
     /* Encode the elements of CODING->charbuf (CODING->charbuf_used of
        them) in emacs-mule format, writing the bytes into
        CODING->destination starting at offset CODING->produced.

        Each non-negative element is a character:
          - an ASCII character is emitted as one byte;
          - a character satisfying CHAR_BYTE8_P is emitted as the raw byte
            given by CHAR_TO_BYTE8;
          - any other character is emitted as the leading code(s) of its
            charset followed by one code byte (dimension 1) or two code
            bytes (otherwise), each with the 0x80 bit set.
        A negative element starts an annotation.  A charset annotation
        selects the preferred charset for the characters that follow;
        a composition annotation is skipped (not yet implemented).

        A character for which no charset is found is replaced by
        CODING->default_char.

        On completion the result CODING_RESULT_SUCCESS is recorded,
        CODING->produced_char and CODING->produced are updated, and 0 is
        returned.  */
2532 static bool
2533 encode_coding_emacs_mule (struct coding_system *coding)
2534 {
       /* NOTE(review): `multibytep' is never named directly below;
          presumably the EMIT_* macros (defined elsewhere) refer to it and
          to `dst' and `produced_chars' by name -- confirm before renaming
          any of these locals.  */
2535   bool multibytep = coding->dst_multibyte;
2536   int *charbuf = coding->charbuf;
2537   int *charbuf_end = charbuf + coding->charbuf_used;
2538   unsigned char *dst = coding->destination + coding->produced;
2539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each element is handled;
          presumably the room (in bytes) one element may need -- TODO
          confirm against the macro's definition.  */
2540   int safe_room = 8;
2541   ptrdiff_t produced_chars = 0;
2542   Lisp_Object attrs, charset_list;
2543   int c;
       /* Charset id taken from the most recent charset annotation, or -1
          if there is none or the annotated charset is not in
          CHARSET_LIST.  */
2544   int preferred_charset_id = -1;
2545
2546   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list, updating the
          coding system's attribute vector if it held a different list.  */
2547   if (! EQ (charset_list, Vemacs_mule_charset_list))
2548     {
2549       charset_list = Vemacs_mule_charset_list;
2550       ASET (attrs, coding_attr_charset_list, charset_list);
2551     }
2552
2553   while (charbuf < charbuf_end)
2554     {
2555       ASSURE_DESTINATION (safe_room);
2556       c = *charbuf++;
2557
2558       if (c < 0)
2559         {
2560           /* Handle an annotation.  */
2561           switch (*charbuf)
2562             {
2563             case CODING_ANNOTATE_COMPOSITION_MASK:
2564               /* Not yet implemented.  */
2565               break;
2566             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points one past the
                      (negative) length word, so charbuf[3] is the fifth
                      element counted from that word.  Verify that this is
                      the slot where the charset id is stored
                      (ADD_CHARSET_DATA is defined elsewhere).  */
2567               preferred_charset_id = charbuf[3];
2568               if (preferred_charset_id >= 0
2569                   && NILP (Fmemq (make_number (preferred_charset_id),
2570                                   charset_list)))
2571                 preferred_charset_id = -1;
2572               break;
2573             default:
2574               emacs_abort ();
2575             }
               /* -C elements make up the annotation and one of them (C
                  itself) has been consumed already; skip the rest.  */
2576           charbuf += -c - 1;
2577           continue;
2578         }
2579
2580       if (ASCII_CHAR_P (c))
2581         EMIT_ONE_ASCII_BYTE (c);
2582       else if (CHAR_BYTE8_P (c))
2583         {
2584           c = CHAR_TO_BYTE8 (c);
2585           EMIT_ONE_BYTE (c);
2586         }
2587       else
2588         {
2589           struct charset *charset;
2590           unsigned code;
2591           int dimension;
2592           int emacs_mule_id;
2593           unsigned char leading_codes[2];
2594
               /* Try the preferred charset first; fall back to searching
                  CHARSET_LIST when it cannot encode C.  */
2595           if (preferred_charset_id >= 0)
2596             {
2597               bool result;
2598
2599               charset = CHARSET_FROM_ID (preferred_charset_id);
2600               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2601               if (result)
2602                 code = ENCODE_CHAR (charset, c);
2603               else
2604                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2605                                      &code, charset);
2606             }
2607           else
2608             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609                                  &code, charset);
2610           if (! charset)
2611             {
                   /* No charset can encode C: substitute the coding system's
                      default character.  NOTE(review): CHARSET is not
                      re-checked after the second lookup below.  */
2612               c = coding->default_char;
2613               if (ASCII_CHAR_P (c))
2614                 {
2615                   EMIT_ONE_ASCII_BYTE (c);
2616                   continue;
2617                 }
2618               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2619                                    &code, charset);
2620             }
2621           dimension = CHARSET_DIMENSION (charset);
2622           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2623           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
               /* The second leading code is 0 when the charset needs only
                  one.  */
2624           EMIT_ONE_BYTE (leading_codes[0]);
2625           if (leading_codes[1])
2626             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code point with the 0x80 bit set in each byte.  */
2627           if (dimension == 1)
2628             EMIT_ONE_BYTE (code | 0x80);
2629           else
2630             {
2631               code |= 0x8080;
2632               EMIT_ONE_BYTE (code >> 8);
2633               EMIT_ONE_BYTE (code & 0xFF);
2634             }
2635         }
2636     }
2637   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2638   coding->produced_char += produced_chars;
2639   coding->produced = dst - coding->destination;
2640   return 0;
2641 }
2642
2643 \f
2644 /*** 7. ISO2022 handlers ***/
2645
2646 /* The following note describes the coding system ISO2022 briefly.
2647    Since the intention of this note is to help understand the
2648    functions in this file, some parts are NOT ACCURATE or are OVERLY
2649    SIMPLIFIED.  For thorough understanding, please refer to the
2650    original document of ISO2022.  This is equivalent to the standard
2651    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2652
2653    ISO2022 provides many mechanisms to encode several character sets
2654    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2655    is encoded using bytes less than 128.  This may make the encoded
2656    text a little bit longer, but the text passes more easily through
2657    several types of gateway, some of which strip off the MSB (Most
2658    Significant Bit).
2659
2660    There are two kinds of character sets: control character sets and
2661    graphic character sets.  The former contain control characters such
2662    as `newline' and `escape' to provide control functions (control
2663    functions are also provided by escape sequences).  The latter
2664    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2665    two control character sets and many graphic character sets.
2666
2667    Graphic character sets are classified into one of the following
2668    four classes, according to the number of bytes (DIMENSION) and
2669    number of characters in one dimension (CHARS) of the set:
2670    - DIMENSION1_CHARS94
2671    - DIMENSION1_CHARS96
2672    - DIMENSION2_CHARS94
2673    - DIMENSION2_CHARS96
2674
2675    In addition, each character set is assigned an identification tag,
2676    unique for each set, called the "final character" (denoted as <F>
2677    hereafter).  The <F> of each character set is decided by ECMA(*)
2678    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2679    (0x30..0x3F are for private use only).
2680
2681    Note (*): ECMA = European Computer Manufacturers Association
2682
2683    Here are examples of graphic character sets [NAME(<F>)]:
2684         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2685         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2686         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2687         o DIMENSION2_CHARS96 -- none for the moment
2688
2689    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2690         C0 [0x00..0x1F] -- control character plane 0
2691         GL [0x20..0x7F] -- graphic character plane 0
2692         C1 [0x80..0x9F] -- control character plane 1
2693         GR [0xA0..0xFF] -- graphic character plane 1
2694
2695    A control character set is directly designated and invoked to C0 or
2696    C1 by an escape sequence.  The most common case is that:
2697    - ISO646's  control character set is designated/invoked to C0, and
2698    - ISO6429's control character set is designated/invoked to C1,
2699    and usually these designations/invocations are omitted in encoded
2700    text.  In a 7-bit environment, only C0 can be used, and a control
2701    character for C1 is encoded by an appropriate escape sequence to
2702    fit into the environment.  All control characters for C1 are
2703    defined to have corresponding escape sequences.
2704
2705    A graphic character set is at first designated to one of four
2706    graphic registers (G0 through G3), then these graphic registers are
2707    invoked to GL or GR.  These designations and invocations can be
2708    done independently.  The most common case is that G0 is invoked to
2709    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2710    these invocations and designations are omitted in encoded text.
2711    In a 7-bit environment, only GL can be used.
2712
2713    When a graphic character set of CHARS94 is invoked to GL, codes
2714    0x20 and 0x7F of the GL area work as control characters SPACE and
2715    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2716    be used.
2717
2718    There are two ways of invocation: locking-shift and single-shift.
2719    With locking-shift, the invocation lasts until the next different
2720    invocation, whereas with single-shift, the invocation affects the
2721    following character only and doesn't affect the locking-shift
2722    state.  Invocations are done by the following control characters or
2723    escape sequences:
2724
2725    ----------------------------------------------------------------------
2726    abbrev  function                  cntrl escape seq   description
2727    ----------------------------------------------------------------------
2728    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2729    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2730    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2731    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2732    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2733    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2734    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2735    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2736    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2737    ----------------------------------------------------------------------
2738    (*) These are not used by any known coding system.
2739
2740    Control characters for these functions are defined by macros
2741    ISO_CODE_XXX in `coding.h'.
2742
2743    Designations are done by the following escape sequences:
2744    ----------------------------------------------------------------------
2745    escape sequence      description
2746    ----------------------------------------------------------------------
2747    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2748    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2749    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2750    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2751    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2752    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2753    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2754    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2755    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2756    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2757    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2758    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2759    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2760    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2761    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2762    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2763    ----------------------------------------------------------------------
2764
2765    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2766    of dimension 1, chars 94, and final character <F>, etc...
2767
2768    Note (*): Although these designations are not allowed in ISO2022,
2769    Emacs accepts them on decoding, and produces them on encoding
2770    CHARS96 character sets in a coding system which is characterized as
2771    7-bit environment, non-locking-shift, and non-single-shift.
2772
2773    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2774    '(' must be omitted.  We refer to this as "short-form" hereafter.
2775
2776    Now you may notice that there are a lot of ways of encoding the
2777    same multilingual text in ISO2022.  Actually, there exist many
2778    coding systems such as Compound Text (used in X11's inter client
2779    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2780    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2781    localized platforms), and all of these are variants of ISO2022.
2782
2783    In addition to the above, Emacs handles two more kinds of escape
2784    sequences: ISO6429's direction specification and Emacs' private
2785    sequence for specifying character composition.
2786
2787    ISO6429's direction specification takes the following form:
2788         o CSI ']'      -- end of the current direction
2789         o CSI '0' ']'  -- end of the current direction
2790         o CSI '1' ']'  -- start of left-to-right text
2791         o CSI '2' ']'  -- start of right-to-left text
2792    The control character CSI (0x9B: control sequence introducer) is
2793    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2794
2795    Character composition specification takes the following form:
2796         o ESC '0' -- start relative composition
2797         o ESC '1' -- end composition
2798         o ESC '2' -- start rule-base composition (*)
2799         o ESC '3' -- start relative composition with alternate chars  (**)
2800         o ESC '4' -- start rule-base composition with alternate chars  (**)
2801   Since these are not standard escape sequences of any ISO standard,
2802   the use of them with these meanings is restricted to Emacs only.
2803
2804   (*) This form is used only in Emacs 20.7 and older versions,
2805   but newer versions can safely decode it.
2806   (**) This form is used only in Emacs 21.1 and newer versions,
2807   and older versions can't decode it.
2808
2809   Here's a list of example usages of these composition escape
2810   sequences (categorized by `enum composition_method').
2811
2812   COMPOSITION_RELATIVE:
2813         ESC 0 CHAR [ CHAR ] ESC 1
2814   COMPOSITION_WITH_RULE:
2815         ESC 2 CHAR [ RULE CHAR ] ESC 1
2816   COMPOSITION_WITH_ALTCHARS:
2817         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2818   COMPOSITION_WITH_RULE_ALTCHARS:
2819         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2820
     /* Table of 256 ISO code classes.  NOTE(review): presumably indexed by
        byte value; it is filled in and consulted outside this part of the
        file -- confirm there.  */
2821 static enum iso_code_class_type iso_code_class[256];
2822
     /* True if the charset whose id is ID is marked usable in CODING: ID
        must not exceed CODING->max_charset_id and its byte in
        CODING->safe_charsets must not be 255.  ID is not checked against
        0, so callers must reject negative ids themselves.  ID is
        evaluated twice.  */
2823 #define SAFE_CHARSET_P(coding, id)      \
2824   ((id) <= (coding)->max_charset_id     \
2825    && (coding)->safe_charsets[id] != 255)
2826
     /* Build the safe-charsets string for the coding system whose
        attribute vector is ATTRS and store it in the
        coding_attr_safe_charsets slot of ATTRS.

        The string has one byte per charset id, from 0 up to the largest
        id found in the charset list.  A byte is 255 if the charset may
        not be used; otherwise it is the number of the graphic register
        the charset is to be designated to: the register requested for
        that charset in coding_attr_iso_request if there is one, else the
        default register for 96-char or 94-char sets taken from
        coding_attr_iso_usage, provided that default is less than 4.

        If the coding system has CODING_ISO_FLAG_FULL_SUPPORT and its
        charset list is not Viso_2022_charset_list, the list is first
        replaced by Viso_2022_charset_list and any previous safe-charsets
        value is discarded.  If ATTRS already holds a safe-charsets string
        after that step, nothing else is done.  */
2827 static void
2828 setup_iso_safe_charsets (Lisp_Object attrs)
2829 {
2830   Lisp_Object charset_list, safe_charsets;
2831   Lisp_Object request;
2832   Lisp_Object reg_usage;
2833   Lisp_Object tail;
2834   EMACS_INT reg94, reg96;
2835   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2836   int max_charset_id;
2837
2838   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2839   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2840       && ! EQ (charset_list, Viso_2022_charset_list))
2841     {
2842       charset_list = Viso_2022_charset_list;
2843       ASET (attrs, coding_attr_charset_list, charset_list);
           /* Force the table to be rebuilt for the new list.  */
2844       ASET (attrs, coding_attr_safe_charsets, Qnil);
2845     }
2846
       /* A string here means the table has been computed already.  */
2847   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2848     return;
2849
       /* The table must be long enough to be indexed by the largest
          charset id in the list.  */
2850   max_charset_id = 0;
2851   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2852     {
2853       int id = XINT (XCAR (tail));
2854       if (max_charset_id < id)
2855         max_charset_id = id;
2856     }
2857
       /* Start with every charset marked unusable (255).  */
2858   safe_charsets = make_uninit_string (max_charset_id + 1);
2859   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2860   request = AREF (attrs, coding_attr_iso_request);
2861   reg_usage = AREF (attrs, coding_attr_iso_usage);
2862   reg94 = XINT (XCAR (reg_usage));
2863   reg96 = XINT (XCDR (reg_usage));
2864
2865   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2866     {
2867       Lisp_Object id;
2868       Lisp_Object reg;
2869       struct charset *charset;
2870
2871       id = XCAR (tail);
2872       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit request for this charset takes precedence over
              the per-class default register.  */
2873       reg = Fcdr (Fassq (id, request));
2874       if (! NILP (reg))
2875         SSET (safe_charsets, XINT (id), XINT (reg));
2876       else if (charset->iso_chars_96)
2877         {
2878           if (reg96 < 4)
2879             SSET (safe_charsets, XINT (id), reg96);
2880         }
2881       else
2882         {
2883           if (reg94 < 4)
2884             SSET (safe_charsets, XINT (id), reg94);
2885         }
2886     }
2887   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2888 }
2889
2890
2891 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2892    Return true if a text is encoded in one of ISO-2022 based coding
2893    systems.

        The bytes of CODING->source are scanned starting at offset
        CODING->head_ascii.  While scanning, the local masks FOUND and
        REJECTED collect the ISO categories that the text supports or
        rules out.

        If every ISO category ends up rejected, scanning stops,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise control reaches the label `no_more_source'
        (presumably via ONE_MORE_BYTE when the source is exhausted -- the
        macro is defined elsewhere), REJECTED is merged into
        DETECT_INFO->rejected, the categories in FOUND that were not
        rejected are merged into DETECT_INFO->found, and 1 is returned.  */
2894
2895 static bool
2896 detect_coding_iso_2022 (struct coding_system *coding,
2897                         struct coding_detection_info *detect_info)
2898 {
2899   const unsigned char *src = coding->source, *src_base = src;
2900   const unsigned char *src_end = coding->source + coding->src_bytes;
2901   bool multibytep = coding->src_multibyte;
2902   bool single_shifting = 0;
2903   int id;
2904   int c, c1;
2905   ptrdiff_t consumed_chars = 0;
2906   int i;
2907   int rejected = 0;
2908   int found = 0;
       /* -1 while outside a composition; otherwise the number of
          characters counted since the composition start sequence.  */
2909   int composition_count = -1;
2910
2911   detect_info->checked |= CATEGORY_MASK_ISO;
2912
       /* Refresh the safe-charset table pointers of every defined ISO
          category coding system; SAFE_CHARSET_P below relies on them.  */
2913   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2914     {
2915       struct coding_system *this = &(coding_categories[i]);
2916       Lisp_Object attrs, val;
2917
2918       if (this->id < 0)
2919         continue;
2920       attrs = CODING_ID_ATTRS (this->id);
2921       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2922           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2923         setup_iso_safe_charsets (attrs);
2924       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2925       this->max_charset_id = SCHARS (val) - 1;
2926       this->safe_charsets = SDATA (val);
2927     }
2928
2929   /* A coding system of this category is always ASCII compatible.  */
2930   src += coding->head_ascii;
2931
2932   while (rejected != CATEGORY_MASK_ISO)
2933     {
2934       src_base = src;
2935       ONE_MORE_BYTE (c);
2936       switch (c)
2937         {
2938         case ISO_CODE_ESC:
2939           if (inhibit_iso_escape_detection)
2940             break;
2941           single_shifting = 0;
2942           ONE_MORE_BYTE (c);
2943           if (c == 'N' || c == 'O')
2944             {
2945               /* ESC <Fe> for SS2 or SS3.  */
2946               single_shifting = 1;
2947               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2948             }
2949           else if (c == '1')
2950             {
2951               /* End of composition.  */
2952               if (composition_count < 0
2953                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2954                 /* Invalid */
2955                 break;
2956               composition_count = -1;
2957               found |= CATEGORY_MASK_ISO;
2958             }
2959           else if (c >= '0' && c <= '4')
2960             {
2961               /* ESC <Fp> for start of composition ('1', the end
                      sequence, is handled above).  */
2962               composition_count = 0;
2963             }
2964           else
2965             {
2966               if (c >= '(' && c <= '/')
2967                 {
2968                   /* Designation sequence for a charset of dimension 1.  */
2969                   ONE_MORE_BYTE (c1);
2970                   if (c1 < ' ' || c1 >= 0x80
2971                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2972                     /* Invalid designation sequence.  Just ignore.  */
2973                     break;
2974                 }
2975               else if (c == '$')
2976                 {
2977                   /* Designation sequence for a charset of dimension 2.  */
2978                   ONE_MORE_BYTE (c);
2979                   if (c >= '@' && c <= 'B')
2980                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2981                     id = iso_charset_table[1][0][c];
2982                   else if (c >= '(' && c <= '/')
2983                     {
2984                       ONE_MORE_BYTE (c1);
2985                       if (c1 < ' ' || c1 >= 0x80
2986                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2987                         /* Invalid designation sequence.  Just ignore.  */
2988                         break;
2989                     }
2990                   else
2991                     /* Invalid designation sequence.  Just ignore it.  */
2992                     break;
2993                 }
2994               else
2995                 {
2996                   /* Invalid escape sequence.  Just ignore it.  */
2997                   break;
2998                 }
2999
3000               /* We found a valid designation sequence for CHARSET.  */
                   /* For each 7-bit category and for iso-8-else, the
                      designated charset either supports the category (it is
                      safe there) or rules it out.  */
3001               rejected |= CATEGORY_MASK_ISO_8BIT;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_7_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3017               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3018                                   id))
3019                 found |= CATEGORY_MASK_ISO_8_ELSE;
3020               else
3021                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3022             }
3023           break;
3024
3025         case ISO_CODE_SO:
3026         case ISO_CODE_SI:
3027           /* Locking shift out/in.  */
3028           if (inhibit_iso_escape_detection)
3029             break;
3030           single_shifting = 0;
3031           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3032           break;
3033
3034         case ISO_CODE_CSI:
3035           /* Control sequence introducer.  */
3036           single_shifting = 0;
3037           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3038           found |= CATEGORY_MASK_ISO_8_ELSE;
3039           goto check_extra_latin;
3040
3041         case ISO_CODE_SS2:
3042         case ISO_CODE_SS3:
3043           /* Single shift.   */
3044           if (inhibit_iso_escape_detection)
3045             break;
3046           single_shifting = 0;
3047           rejected |= CATEGORY_MASK_ISO_7BIT;
3048           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3049               & CODING_ISO_FLAG_SINGLE_SHIFT)
3050             {
3051               found |= CATEGORY_MASK_ISO_8_1;
3052               single_shifting = 1;
3053             }
3054           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3055               & CODING_ISO_FLAG_SINGLE_SHIFT)
3056             {
3057               found |= CATEGORY_MASK_ISO_8_2;
3058               single_shifting = 1;
3059             }
3060           if (single_shifting)
3061             break;
               /* Neither 8-bit category uses single shift, so the byte is
                  only acceptable as a Latin extra code; this label is also
                  reached from the CSI case above.  */
3062         check_extra_latin:
3063           if (! VECTORP (Vlatin_extra_code_table)
3064               || NILP (AREF (Vlatin_extra_code_table, c)))
3065             {
3066               rejected = CATEGORY_MASK_ISO;
3067               break;
3068             }
3069           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3070               & CODING_ISO_FLAG_LATIN_EXTRA)
3071             found |= CATEGORY_MASK_ISO_8_1;
3072           else
3073             rejected |= CATEGORY_MASK_ISO_8_1;
3074           rejected |= CATEGORY_MASK_ISO_8_2;
3075           break;
3076
3077         default:
               /* NOTE(review): a negative C is presumably how ONE_MORE_BYTE
                  reports a non-byte element of multibyte source -- confirm
                  against the macro's definition.  */
3078           if (c < 0)
3079             continue;
3080           if (c < 0x80)
3081             {
3082               if (composition_count >= 0)
3083                 composition_count++;
3084               single_shifting = 0;
3085               break;
3086             }
3087           if (c >= 0xA0)
3088             {
3089               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3090               found |= CATEGORY_MASK_ISO_8_1;
3091               /* Check the length of succeeding codes of the range
3092                  0xA0..0xFF.  If the byte length is even, we include
3093                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3094                  only when we are not single shifting.  */
3095               if (! single_shifting
3096                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3097                 {
3098                   int len = 1;
3099                   while (src < src_end)
3100                     {
3101                       src_base = src;
3102                       ONE_MORE_BYTE (c);
3103                       if (c < 0xA0)
3104                         {
                               /* Un-read the byte that ended the run so the
                                  outer loop examines it.  */
3105                           src = src_base;
3106                           break;
3107                         }
3108                       len++;
3109                     }
3110
                       /* An odd-length run that stopped before the end of
                          the source rules out the two-byte category; a run
                          cut off by the end of the source is not held
                          against it.  */
3111                   if (len & 1 && src < src_end)
3112                     {
3113                       rejected |= CATEGORY_MASK_ISO_8_2;
3114                       if (composition_count >= 0)
3115                         composition_count += len;
3116                     }
3117                   else
3118                     {
3119                       found |= CATEGORY_MASK_ISO_8_2;
3120                       if (composition_count >= 0)
3121                         composition_count += len / 2;
3122                     }
3123                 }
3124               break;
3125             }
3126         }
3127     }
       /* Reached only when the loop ended because every ISO category was
          rejected.  */
3128   detect_info->rejected |= CATEGORY_MASK_ISO;
3129   return 0;
3130
3131  no_more_source:
3132   detect_info->rejected |= rejected;
3133   detect_info->found |= (found & ~rejected);
3134   return 1;
3135 }
3136
3137
3138 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3139    escape sequence should be kept.

        REG is the graphic register being designated; DIM, CHARS_96 and
        FINAL are used to look up the charset id through
        ISO_CHARSET_TABLE.  The macro refers to a variable named `coding'
        that must be in scope where it is expanded, and it assigns to
        CHARS_96, which therefore must be an lvalue.

        If FINAL is outside '0'..127, if the table has no charset for it,
        or if the charset is not safe for `coding' (SAFE_CHARSET_P), the
        designation of REG is set to -2, CHARS_96 is set to -1, and the
        rest of the macro is skipped (the `break' leaves only the
        enclosing `do ... while (0)').

        Otherwise charset_jisx0201_roman is replaced by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set, before
        the id is stored as the designation of REG.  */
3140 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3141   do {                                                                  \
3142     int id, prev;                                                       \
3143                                                                         \
3144     if (final < '0' || final >= 128                                     \
3145         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3146         || !SAFE_CHARSET_P (coding, id))                                \
3147       {                                                                 \
3148         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3149         chars_96 = -1;                                                  \
3150         break;                                                          \
3151       }                                                                 \
3152     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3153     if (id == charset_jisx0201_roman)                                   \
3154       {                                                                 \
3155         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3156           id = charset_ascii;                                           \
3157       }                                                                 \
3158     else if (id == charset_jisx0208_1978)                               \
3159       {                                                                 \
3160         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3161           id = charset_jisx0208;                                        \
3162       }                                                                 \
3163     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3164     /* If there was an invalid designation to REG previously, and this  \
3165        designation is ASCII to REG, we should keep this designation     \
3166        sequence.  */                                                    \
3167     if (prev == -2 && id == charset_ascii)                              \
3168       chars_96 = -1;                                                    \
3169   } while (0)
3170
3171
3172 /* Handle these composition sequences (ALT: alternate char):
3173
3174    (1) relative composition: ESC 0 CHAR ... ESC 1
3175    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3176    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3177    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3178
3179    When the start sequence (ESC 0/2/3/4) is found, this annotation
3180    header is produced.
3181
3182         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3183
3184    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3185    produced until the end sequence (ESC 1) is found:
3186
3187    (1) CHAR ... CHAR
3188    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3189    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3190    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3191
3192    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3193    annotation header are updated as below:
3194
3195    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3196    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3197    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3198    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3199
3200    If an error is found while composing, the annotation header is
3201    changed to:
3202
3203         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3204
3205    and the sequence [ -2 DECODED-RULE ] is changed to the original
3206    byte sequence as below:
3207         o the original byte sequence is B: [ B -1 ]
3208         o the original byte sequence is B1 B2: [ B1 B2 ]
3209    and the sequence [ -1 -1 ] is changed to the original byte
3210    sequence:
3211         [ ESC '0' ]
3212 */
3213
3214 /* Decode a composition rule C1 and maybe one more byte from the
3215    source, and set RULE to the encoded composition rule.  If the rule
3216    is invalid, goto invalid_code.

        C1 is not a parameter: the expansion reads a variable `c1' from
        the enclosing scope and jumps to a label `invalid_code' there.
        RULE is assigned to and compared against integers, so it must be
        an integer lvalue.

        C1 - 32 below 0 is invalid.  A value below 81 is an old-format
        rule that packs GREF * 9 + NREF; a GREF or NREF of 4 is mapped
        to 10 before COMPOSITION_ENCODE_RULE combines them.  Any other
        value is the first byte of a new-format rule: one more byte B is
        fetched with ONE_MORE_BYTE, the pair (C1 - 32 - 81, B - 32) is
        checked with COMPOSITION_ENCODE_RULE_VALID and encoded, and
        0x100 is added to mark the result as new format.

        NOTE(review): ONE_MORE_BYTE is defined outside this section;
        whether it can leave the expansion when the source runs out
        should be confirmed there.  */
3217 
3218 #define DECODE_COMPOSITION_RULE(rule)                                   \
3219   do {                                                                  \
3220     rule = c1 - 32;                                                     \
3221     if (rule < 0)                                                       \
3222       goto invalid_code;                                                \
3223     if (rule < 81)              /* old format (before ver.21) */        \
3224       {                                                                 \
3225         int gref = (rule) / 9;                                          \
3226         int nref = (rule) % 9;                                          \
3227         if (gref == 4) gref = 10;                                       \
3228         if (nref == 4) nref = 10;                                       \
3229         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3230       }                                                                 \
3231     else                        /* new format (after ver.21) */         \
3232       {                                                                 \
3233         int b;                                                          \
3234                                                                         \
3235         ONE_MORE_BYTE (b);                                              \
3236         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3237           goto invalid_code;                                            \
3238         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3239         rule += 0x100;   /* Distinguish it from the old format.  */     \
3240       }                                                                 \
3241   } while (0)
3242
     /* Turn the decoded composition rule RULE back into its source bytes,
        writing them into the two slots charbuf[idx] and charbuf[idx + 1].

        GREF is (RULE % 0x100) / 12 and NREF is (RULE % 0x100) % 12.
        When RULE is below 0x100 (old format), a GREF or NREF of 10 is
        mapped back to 4, the single byte 32 + GREF * 9 + NREF is stored
        followed by -1, and new_chars grows by 1.  Otherwise (new
        format) the bytes 32 + 81 + GREF and 32 + NREF are stored and
        new_chars grows by 2.

        Uses charbuf, idx and new_chars from the enclosing scope.  RULE
        is expanded unparenthesized and more than once; every read of it
        happens before either slot is written, so passing
        charbuf[idx + 1] itself (as finish_composition does) is safe.

        NOTE(review): the divisor 12 presumably mirrors how
        COMPOSITION_ENCODE_RULE packs GREF and NREF; that macro is
        defined outside this section -- confirm.  */
3243 #define ENCODE_COMPOSITION_RULE(rule)                           \
3244   do {                                                          \
3245     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3246                                                                 \
3247     if (rule < 0x100)           /* old format */                \
3248       {                                                         \
3249         if (gref == 10) gref = 4;                               \
3250         if (nref == 10) nref = 4;                               \
3251         charbuf[idx] = 32 + gref * 9 + nref;                    \
3252         charbuf[idx + 1] = -1;                                  \
3253         new_chars++;                                            \
3254       }                                                         \
3255     else                                /* new format */        \
3256       {                                                         \
3257         charbuf[idx] = 32 + 81 + gref;                          \
3258         charbuf[idx + 1] = 32 + nref;                           \
3259         new_chars += 2;                                         \
3260       }                                                         \
3261   } while (0)
3262
3263 /* Finish the current composition as invalid.

        CHARBUF points just past the last element stored for the
        composition; the annotation header starts CMP_STATUS->length
        elements before it.  The five header slots are overwritten with
        ESC, the start character for CMP_STATUS->method ('0', '2', '3'
        or '4'), -2, 0 and -1.  For methods not below
        COMPOSITION_WITH_RULE, the remaining elements are then scanned:
        each [ -2 RULE ] pair is replaced in place by the rule's source
        bytes (see ENCODE_COMPOSITION_RULE) and each [ -1 -1 ] pair by
        [ ESC '0' ].

        CMP_STATUS->state is set to COMPOSING_NO.

        Value is CMP_STATUS->nchars, plus 1 for each old-format rule,
        2 for each new-format rule and 2 for each [ -1 -1 ] pair that
        was restored.  */
3264 
3265 static int
3266 finish_composition (int *charbuf, struct composition_status *cmp_status)
3267 {
            /* Negative index of the annotation header relative to CHARBUF.  */
3268   int idx = - cmp_status->length;
3269   int new_chars;
3270 
3271   /* Recover the original ESC sequence in the first two header slots.  */
3272   charbuf[idx++] = ISO_CODE_ESC;
3273   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3274                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3275                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3276                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3277                     : '4');
            /* Fill the three remaining header slots.  */
3278   charbuf[idx++] = -2;
3279   charbuf[idx++] = 0;
3280   charbuf[idx++] = -1;
3281   new_chars = cmp_status->nchars;
            /* NOTE(review): presumably only COMPOSITION_RELATIVE orders below
               COMPOSITION_WITH_RULE, i.e. the scan is skipped when no rule
               or separator markers can be present -- the enum is defined
               outside this section, confirm.  */
3282   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3283     for (; idx < 0; idx++)
3284       {
3285         int elt = charbuf[idx];
3286 
3287         if (elt == -2)
3288           {
                    /* [ -2 RULE ]: restore the rule's source bytes in both
                       slots, then step over the second slot.  */
3289             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3290             idx++;
3291           }
3292         else if (elt == -1)
3293           {
                    /* [ -1 -1 ]: restore the ESC '0' it stands for.  */
3294             charbuf[idx++] = ISO_CODE_ESC;
3295             charbuf[idx] = '0';
3296             new_chars += 2;
3297           }
3298       }
3299   cmp_status->state = COMPOSING_NO;
3300   return new_chars;
3301 }
3302
3303 /* If characters are under composition, finish the composition.

        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        Uses cmp_status, charbuf and char_offset from the enclosing
        scope.  */
3304 #define MAYBE_FINISH_COMPOSITION()                              \
3305   do {                                                          \
3306     if (cmp_status->state != COMPOSING_NO)                      \
3307       char_offset += finish_composition (charbuf, cmp_status);  \
3308   } while (0)
3309
3310 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3311 
3312    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3313    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3314    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3315    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3316 
3317    Produce this annotation sequence now:
3318 
3319    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): five slots are shown here to agree with the header
        layout given with the composition sequences earlier in this file
        and with the five slots finish_composition rewrites; the exact
        layout comes from ADD_COMPOSITION_DATA and MAX_ANNOTATION_LENGTH,
        which are defined outside this section -- confirm.

        C1 is the byte that followed ESC.  When C1 is '0' and the
        current state is COMPOSING_COMPONENT_CHAR of a
        COMPOSITION_WITH_ALTCHARS composition, or COMPOSING_COMPONENT_RULE
        of a COMPOSITION_WITH_RULE_ALTCHARS composition, no new
        composition is started: the separator [ -1 -1 ] is stored, the
        state becomes COMPOSING_CHAR and cmp_status->length grows by 2.

        Otherwise any composition in progress is first finished as
        invalid (MAYBE_FINISH_COMPOSITION) and a new one begins: the
        method is chosen from C1, the state is COMPOSING_CHAR for
        C1 <= '2' and COMPOSING_COMPONENT_CHAR otherwise, the header is
        added with ADD_COMPOSITION_DATA, cmp_status->length is set to
        MAX_ANNOTATION_LENGTH, nchars and ncomps are zeroed, and
        coding->annotated is set to 1.

        Uses cmp_status, charbuf, char_offset and coding from the
        enclosing scope.
3320 */
3321 
3322 #define DECODE_COMPOSITION_START(c1)                                       \
3323   do {                                                                     \
3324     if (c1 == '0'                                                          \
3325         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3326              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3327             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3328                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3329       {                                                                    \
3330         *charbuf++ = -1;                                                   \
3331         *charbuf++= -1;                                                    \
3332         cmp_status->state = COMPOSING_CHAR;                                \
3333         cmp_status->length += 2;                                           \
3334       }                                                                    \
3335     else                                                                   \
3336       {                                                                    \
3337         MAYBE_FINISH_COMPOSITION ();                                       \
3338         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3339                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3340                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3341                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3342         cmp_status->state                                                  \
3343           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3344         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3345         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3346         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3347         coding->annotated = 1;                                             \
3348       }                                                                    \
3349   } while (0)
3350
3351
3352 /* Handle composition end sequence ESC 1.

        The composition is rejected -- finished as invalid through
        MAYBE_FINISH_COMPOSITION, followed by goto invalid_code -- when
        no CHAR has been stored (nchars == 0), when a
        COMPOSITION_WITH_RULE composition is in state COMPOSING_CHAR, or
        when a composition of any other method is not in state
        COMPOSING_CHAR.

        Otherwise the annotation header, which starts cmp_status->length
        elements before charbuf, is completed: its first slot (-LENGTH)
        is decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by
        ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, its third slot is
        set to nchars, char_offset advances by nchars, and the state
        returns to COMPOSING_NO.

        Uses cmp_status, charbuf and char_offset from the enclosing
        scope, and the label invalid_code.  */
3353 
3354 #define DECODE_COMPOSITION_END()                                        \
3355   do {                                                                  \
3356     if (cmp_status->nchars == 0                                         \
3357         || ((cmp_status->state == COMPOSING_CHAR)                       \
3358             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3359       {                                                                 \
3360         MAYBE_FINISH_COMPOSITION ();                                    \
3361         goto invalid_code;                                              \
3362       }                                                                 \
3363     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3364       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3365     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3366       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3367     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3368     char_offset += cmp_status->nchars;                                  \
3369     cmp_status->state = COMPOSING_NO;                                   \
3370   } while (0)
3371
3372 /* Store a composition rule RULE in charbuf, and update cmp_status.

        Appends the pair [ -2 RULE ] at charbuf, adds 2 to
        cmp_status->length and decrements cmp_status->state.  Uses
        charbuf and cmp_status from the enclosing scope.

        NOTE(review): the decrement presumably steps from a rule state
        back to the matching char state; the state enum is defined
        outside this section -- confirm its order.  */
3373 
3374 #define STORE_COMPOSITION_RULE(rule)    \
3375   do {                                  \
3376     *charbuf++ = -2;                    \
3377     *charbuf++ = rule;                  \
3378     cmp_status->length += 2;            \
3379     cmp_status->state--;                \
3380   } while (0)
3381
3382 /* Store a composed char or a component char C in charbuf, and update
3383    cmp_status.

        C is appended at charbuf and cmp_status->length grows by 1.  In
        state COMPOSING_CHAR the char is counted in nchars, in any other
        state in ncomps.  Afterwards the state is incremented when the
        method is COMPOSITION_WITH_RULE, or when the method is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  Uses charbuf and cmp_status from the
        enclosing scope.

        NOTE(review): the increment presumably moves to the rule state
        that follows the current char state, so that the next unit is
        read as a rule; the state enum is defined outside this section
        -- confirm its order.  */
3384 
3385 #define STORE_COMPOSITION_CHAR(c)                                       \
3386   do {                                                                  \
3387     *charbuf++ = (c);                                                   \
3388     cmp_status->length++;                                               \
3389     if (cmp_status->state == COMPOSING_CHAR)                            \
3390       cmp_status->nchars++;                                             \
3391     else                                                                \
3392       cmp_status->ncomps++;                                             \
3393     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3394         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3395             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3396       cmp_status->state++;                                              \
3397   } while (0)
3398
3399
3400 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3401
3402 static void
3403 decode_coding_iso_2022 (struct coding_system *coding)
3404 {
3405   const unsigned char *src = coding->source + coding->consumed;
3406   const unsigned char *src_end = coding->source + coding->src_bytes;
3407   const unsigned char *src_base;
3408   int *charbuf = coding->charbuf + coding->charbuf_used;
3409   /* We may produce two annotations (charset and composition) in one
3410      loop and one more charset annotation at the end.  */
3411   int *charbuf_end
3412     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3414   bool multibytep = coding->src_multibyte;
3415   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3416   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3417   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3418   int charset_id_2, charset_id_3;
3419   struct charset *charset;
3420   int c;
3421   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3422   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3423   ptrdiff_t char_offset = coding->produced_char;
3424   ptrdiff_t last_offset = char_offset;
3425   int last_id = charset_ascii;
3426   bool eol_dos
3427     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3428   int byte_after_cr = -1;
3429   int i;
3430
3431   setup_iso_safe_charsets (attrs);
3432   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3433
3434   if (cmp_status->state != COMPOSING_NO)
3435     {
3436       if (charbuf_end - charbuf < cmp_status->length)
3437         emacs_abort ();
3438       for (i = 0; i < cmp_status->length; i++)
3439         *charbuf++ = cmp_status->carryover[i];
3440       coding->annotated = 1;
3441     }
3442
3443   while (1)
3444     {
3445       int c1, c2, c3;
3446
3447       src_base = src;
3448       consumed_chars_base = consumed_chars;
3449
3450       if (charbuf >= charbuf_end)
3451         {
3452           if (byte_after_cr >= 0)
3453             src_base--;
3454           break;
3455         }
3456
3457       if (byte_after_cr >= 0)
3458         c1 = byte_after_cr, byte_after_cr = -1;
3459       else
3460         ONE_MORE_BYTE (c1);
3461       if (c1 < 0)
3462         goto invalid_code;
3463
3464       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3465         {
3466           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3467           char_offset++;
3468           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3469           continue;
3470         }
3471
3472       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3473         {
3474           if (c1 == ISO_CODE_ESC)
3475             {
3476               if (src + 1 >= src_end)
3477                 goto no_more_source;
3478               *charbuf++ = ISO_CODE_ESC;
3479               char_offset++;
3480               if (src[0] == '%' && src[1] == '@')
3481                 {
3482                   src += 2;
3483                   consumed_chars += 2;
3484                   char_offset += 2;
3485                   /* We are sure charbuf can contain two more chars. */
3486                   *charbuf++ = '%';
3487                   *charbuf++ = '@';
3488                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3489                 }
3490             }
3491           else
3492             {
3493               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3494               char_offset++;
3495             }
3496           continue;
3497         }
3498
3499       if ((cmp_status->state == COMPOSING_RULE
3500            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3501           && c1 != ISO_CODE_ESC)
3502         {
3503           int rule;
3504
3505           DECODE_COMPOSITION_RULE (rule);
3506           STORE_COMPOSITION_RULE (rule);
3507           continue;
3508         }
3509
3510       /* We produce at most one character.  */
3511       switch (iso_code_class [c1])
3512         {
3513         case ISO_0x20_or_0x7F:
3514           if (charset_id_0 < 0
3515               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3516             /* This is SPACE or DEL.  */
3517             charset = CHARSET_FROM_ID (charset_ascii);
3518           else
3519             charset = CHARSET_FROM_ID (charset_id_0);
3520           break;
3521
3522         case ISO_graphic_plane_0:
3523           if (charset_id_0 < 0)
3524             charset = CHARSET_FROM_ID (charset_ascii);
3525           else
3526             charset = CHARSET_FROM_ID (charset_id_0);
3527           break;
3528
3529         case ISO_0xA0_or_0xFF:
3530           if (charset_id_1 < 0
3531               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3532               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3533             goto invalid_code;
3534           /* This is a graphic character, we fall down ... */
3535
3536         case ISO_graphic_plane_1:
3537           if (charset_id_1 < 0)
3538             goto invalid_code;
3539           charset = CHARSET_FROM_ID (charset_id_1);
3540           break;
3541
3542         case ISO_control_0:
3543           if (eol_dos && c1 == '\r')
3544             ONE_MORE_BYTE (byte_after_cr);
3545           MAYBE_FINISH_COMPOSITION ();
3546           charset = CHARSET_FROM_ID (charset_ascii);
3547           break;
3548
3549         case ISO_control_1:
3550           goto invalid_code;
3551
3552         case ISO_shift_out:
3553           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3554               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3555             goto invalid_code;
3556           CODING_ISO_INVOCATION (coding, 0) = 1;
3557           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3558           continue;
3559
3560         case ISO_shift_in:
3561           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3562             goto invalid_code;
3563           CODING_ISO_INVOCATION (coding, 0) = 0;
3564           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3565           continue;
3566
3567         case ISO_single_shift_2_7:
3568           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3569             goto invalid_code;
3570         case ISO_single_shift_2:
3571           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3572             goto invalid_code;
3573           /* SS2 is handled as an escape sequence of ESC 'N' */
3574           c1 = 'N';
3575           goto label_escape_sequence;
3576
3577         case ISO_single_shift_3:
3578           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3579             goto invalid_code;
3580           /* SS2 is handled as an escape sequence of ESC 'O' */
3581           c1 = 'O';
3582           goto label_escape_sequence;
3583
3584         case ISO_control_sequence_introducer:
3585           /* CSI is handled as an escape sequence of ESC '[' ...  */
3586           c1 = '[';
3587           goto label_escape_sequence;
3588
3589         case ISO_escape:
3590           ONE_MORE_BYTE (c1);
3591         label_escape_sequence:
3592           /* Escape sequences handled here are invocation,
3593              designation, direction specification, and character
3594              composition specification.  */
3595           switch (c1)
3596             {
3597             case '&':           /* revision of following character set */
3598               ONE_MORE_BYTE (c1);
3599               if (!(c1 >= '@' && c1 <= '~'))
3600                 goto invalid_code;
3601               ONE_MORE_BYTE (c1);
3602               if (c1 != ISO_CODE_ESC)
3603                 goto invalid_code;
3604               ONE_MORE_BYTE (c1);
3605               goto label_escape_sequence;
3606
3607             case '$':           /* designation of 2-byte character set */
3608               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3609                 goto invalid_code;
3610               {
3611                 int reg, chars96;
3612
3613                 ONE_MORE_BYTE (c1);
3614                 if (c1 >= '@' && c1 <= 'B')
3615                   {     /* designation of JISX0208.1978, GB2312.1980,
3616                            or JISX0208.1980 */
3617                     reg = 0, chars96 = 0;
3618                   }
3619                 else if (c1 >= 0x28 && c1 <= 0x2B)
3620                   { /* designation of DIMENSION2_CHARS94 character set */
3621                     reg = c1 - 0x28, chars96 = 0;
3622                     ONE_MORE_BYTE (c1);
3623                   }
3624                 else if (c1 >= 0x2C && c1 <= 0x2F)
3625                   { /* designation of DIMENSION2_CHARS96 character set */
3626                     reg = c1 - 0x2C, chars96 = 1;
3627                     ONE_MORE_BYTE (c1);
3628                   }
3629                 else
3630                   goto invalid_code;
3631                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3632                 /* We must update these variables now.  */
3633                 if (reg == 0)
3634                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3635                 else if (reg == 1)
3636                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3637                 if (chars96 < 0)
3638                   goto invalid_code;
3639               }
3640               continue;
3641
3642             case 'n':           /* invocation of locking-shift-2 */
3643               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3644                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3645                 goto invalid_code;
3646               CODING_ISO_INVOCATION (coding, 0) = 2;
3647               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3648               continue;
3649
3650             case 'o':           /* invocation of locking-shift-3 */
3651               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3652                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3653                 goto invalid_code;
3654               CODING_ISO_INVOCATION (coding, 0) = 3;
3655               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3656               continue;
3657
3658             case 'N':           /* invocation of single-shift-2 */
3659               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3660                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3661                 goto invalid_code;
3662               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3663               if (charset_id_2 < 0)
3664                 charset = CHARSET_FROM_ID (charset_ascii);
3665               else
3666                 charset = CHARSET_FROM_ID (charset_id_2);
3667               ONE_MORE_BYTE (c1);
3668               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3669                 goto invalid_code;
3670               break;
3671
3672             case 'O':           /* invocation of single-shift-3 */
3673               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3674                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3675                 goto invalid_code;
3676               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3677               if (charset_id_3 < 0)
3678                 charset = CHARSET_FROM_ID (charset_ascii);
3679               else
3680                 charset = CHARSET_FROM_ID (charset_id_3);
3681               ONE_MORE_BYTE (c1);
3682               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3683                 goto invalid_code;
3684               break;
3685
3686             case '0': case '2': case '3': case '4': /* start composition */
3687               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3688                 goto invalid_code;
3689               if (last_id != charset_ascii)
3690                 {
3691                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3692                   last_id = charset_ascii;
3693                   last_offset = char_offset;
3694                 }
3695               DECODE_COMPOSITION_START (c1);
3696               continue;
3697
3698             case '1':           /* end composition */
3699               if (cmp_status->state == COMPOSING_NO)
3700                 goto invalid_code;
3701               DECODE_COMPOSITION_END ();
3702               continue;
3703
3704             case '[':           /* specification of direction */
3705               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3706                 goto invalid_code;
3707               /* For the moment, nested direction is not supported.
3708                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3709                  left-to-right, and nonzero means right-to-left.  */
3710               ONE_MORE_BYTE (c1);
3711               switch (c1)
3712                 {
3713                 case ']':       /* end of the current direction */
3714                   coding->mode &= ~CODING_MODE_DIRECTION;
3715
3716                 case '0':       /* end of the current direction */
3717                 case '1':       /* start of left-to-right direction */
3718                   ONE_MORE_BYTE (c1);
3719                   if (c1 == ']')
3720                     coding->mode &= ~CODING_MODE_DIRECTION;
3721                   else
3722                     goto invalid_code;
3723                   break;
3724
3725                 case '2':       /* start of right-to-left direction */
3726                   ONE_MORE_BYTE (c1);
3727                   if (c1 == ']')
3728                     coding->mode |= CODING_MODE_DIRECTION;
3729                   else
3730                     goto invalid_code;
3731                   break;
3732
3733                 default:
3734                   goto invalid_code;
3735                 }
3736               continue;
3737
3738             case '%':
3739               ONE_MORE_BYTE (c1);
3740               if (c1 == '/')
3741                 {
3742                   /* CTEXT extended segment:
3743                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3744                      We keep these bytes as is for the moment.
3745                      They may be decoded by post-read-conversion.  */
3746                   int dim, M, L;
3747                   int size;
3748
3749                   ONE_MORE_BYTE (dim);
3750                   if (dim < '0' || dim > '4')
3751                     goto invalid_code;
3752                   ONE_MORE_BYTE (M);
3753                   if (M < 128)
3754                     goto invalid_code;
3755                   ONE_MORE_BYTE (L);
3756                   if (L < 128)
3757                     goto invalid_code;
3758                   size = ((M - 128) * 128) + (L - 128);
3759                   if (charbuf + 6 > charbuf_end)
3760                     goto break_loop;
3761                   *charbuf++ = ISO_CODE_ESC;
3762                   *charbuf++ = '%';
3763                   *charbuf++ = '/';
3764                   *charbuf++ = dim;
3765                   *charbuf++ = BYTE8_TO_CHAR (M);
3766                   *charbuf++ = BYTE8_TO_CHAR (L);
3767                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3768                 }
3769               else if (c1 == 'G')
3770                 {
3771                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3772                      ESC % G --UTF-8-BYTES-- ESC % @
3773                      We keep these bytes as is for the moment.
3774                      They may be decoded by post-read-conversion.  */
3775                   if (charbuf + 3 > charbuf_end)
3776                     goto break_loop;
3777                   *charbuf++ = ISO_CODE_ESC;
3778                   *charbuf++ = '%';
3779                   *charbuf++ = 'G';
3780                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3781                 }
3782               else
3783                 goto invalid_code;
3784               continue;
3785               break;
3786
3787             default:
3788               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3789                 goto invalid_code;
3790               {
3791                 int reg, chars96;
3792
3793                 if (c1 >= 0x28 && c1 <= 0x2B)
3794                   { /* designation of DIMENSION1_CHARS94 character set */
3795                     reg = c1 - 0x28, chars96 = 0;
3796                     ONE_MORE_BYTE (c1);
3797                   }
3798                 else if (c1 >= 0x2C && c1 <= 0x2F)
3799                   { /* designation of DIMENSION1_CHARS96 character set */
3800                     reg = c1 - 0x2C, chars96 = 1;
3801                     ONE_MORE_BYTE (c1);
3802                   }
3803                 else
3804                   goto invalid_code;
3805                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3806                 /* We must update these variables now.  */
3807                 if (reg == 0)
3808                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3809                 else if (reg == 1)
3810                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3811                 if (chars96 < 0)
3812                   goto invalid_code;
3813               }
3814               continue;
3815             }
3816           break;
3817
3818         default:
3819           emacs_abort ();
3820         }
3821
3822       if (cmp_status->state == COMPOSING_NO
3823           && charset->id != charset_ascii
3824           && last_id != charset->id)
3825         {
3826           if (last_id != charset_ascii)
3827             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3828           last_id = charset->id;
3829           last_offset = char_offset;
3830         }
3831
3832       /* Now we know CHARSET and 1st position code C1 of a character.
3833          Produce a decoded character while getting 2nd and 3rd
3834          position codes C2, C3 if necessary.  */
3835       if (CHARSET_DIMENSION (charset) > 1)
3836         {
3837           ONE_MORE_BYTE (c2);
3838           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3839               || ((c1 & 0x80) != (c2 & 0x80)))
3840             /* C2 is not in a valid range.  */
3841             goto invalid_code;
3842           if (CHARSET_DIMENSION (charset) == 2)
3843             c1 = (c1 << 8) | c2;
3844           else
3845             {
3846               ONE_MORE_BYTE (c3);
3847               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3848                   || ((c1 & 0x80) != (c3 & 0x80)))
3849                 /* C3 is not in a valid range.  */
3850                 goto invalid_code;
3851               c1 = (c1 << 16) | (c2 << 8) | c2;
3852             }
3853         }
3854       c1 &= 0x7F7F7F;
3855       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3856       if (c < 0)
3857         {
3858           MAYBE_FINISH_COMPOSITION ();
3859           for (; src_base < src; src_base++, char_offset++)
3860             {
3861               if (ASCII_BYTE_P (*src_base))
3862                 *charbuf++ = *src_base;
3863               else
3864                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3865             }
3866         }
3867       else if (cmp_status->state == COMPOSING_NO)
3868         {
3869           *charbuf++ = c;
3870           char_offset++;
3871         }
3872       else if ((cmp_status->state == COMPOSING_CHAR
3873                 ? cmp_status->nchars
3874                 : cmp_status->ncomps)
3875                >= MAX_COMPOSITION_COMPONENTS)
3876         {
3877           /* Too long composition.  */
3878           MAYBE_FINISH_COMPOSITION ();
3879           *charbuf++ = c;
3880           char_offset++;
3881         }
3882       else
3883         STORE_COMPOSITION_CHAR (c);
3884       continue;
3885
3886     invalid_code:
3887       MAYBE_FINISH_COMPOSITION ();
3888       src = src_base;
3889       consumed_chars = consumed_chars_base;
3890       ONE_MORE_BYTE (c);
3891       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3892       char_offset++;
3893       coding->errors++;
3894       continue;
3895
3896     break_loop:
3897       break;
3898     }
3899
3900  no_more_source:
3901   if (cmp_status->state != COMPOSING_NO)
3902     {
3903       if (coding->mode & CODING_MODE_LAST_BLOCK)
3904         MAYBE_FINISH_COMPOSITION ();
3905       else
3906         {
3907           charbuf -= cmp_status->length;
3908           for (i = 0; i < cmp_status->length; i++)
3909             cmp_status->carryover[i] = charbuf[i];
3910         }
3911     }
3912   else if (last_id != charset_ascii)
3913     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3914   coding->consumed_char += consumed_chars_base;
3915   coding->consumed = src_base - coding->source;
3916   coding->charbuf_used = charbuf - coding->charbuf;
3917 }
3918
3919
3920 /* ISO2022 encoding stuff.  */
3921
3922 /*
3923    Saying just "ISO2022" is not enough when encoding; we have to
3924    specify more details.  In Emacs, each coding system of the ISO2022
3925    family has the following specifications:
3926         1. Initial designation to G0 thru G3.
3927         2. Allows short-form designation?
3928         3. ASCII should be designated to G0 before control characters?
3929         4. ASCII should be designated to G0 at end of line?
3930         5. 7-bit environment or 8-bit environment?
3931         6. Use locking-shift?
3932         7. Use Single-shift?
3933    And the following two are only for Japanese:
3934         8. Use ASCII in place of JIS0201-1976-Roman?
3935         9. Use JISX0208-1983 in place of JISX0208-1978?
3936    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3937    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3938    details.
3939 */
3940
3941 /* Emit at DST the escape sequence that designates CHARSET to graphic
3942    register REG, advancing DST, and record the new designation in
3943    CODING.  For a multibyte 94-set designated to G0 whose <final-char>
3944    is '@', 'A', or 'B', the short form is used unless CODING forbids.  */
3945
3946 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3947   do {                                                                  \
3948     /* Intermediate bytes for 94- and 96-char sets, indexed by REG.  */ \
3949     const char *intermediate_char_94 = "()*+";                          \
3950     const char *intermediate_char_96 = ",-./";                          \
3951     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3952     /* REVISION stays negative unless the coding system uses it.  */    \
3953     int revision                                                        \
3954       = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
3955          ? CHARSET_ISO_REVISION (charset) : -1);                        \
3956                                                                         \
3957     if (revision >= 0)                                                  \
3958       {                                                                 \
3959         /* Announce the revision first: ESC & <'@' + revision>.  */     \
3960         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3961         EMIT_ONE_BYTE ('@' + revision);                                 \
3962       }                                                                 \
3963     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3964     if (CHARSET_DIMENSION (charset) != 1)                               \
3965       {                                                                 \
3966         /* Multibyte set: '$', then the intermediate byte unless the    \
3967            short form applies.  */                                      \
3968         EMIT_ONE_ASCII_BYTE ('$');                                      \
3969         if (CHARSET_ISO_CHARS_96 (charset))                             \
3970           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3971         else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
3972                  || reg != 0                                            \
3973                  || final_char < '@' || final_char > 'B')               \
3974           EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
3975       }                                                                 \
3976     else                                                                \
3977       {                                                                 \
3978         /* Single-byte set: only the intermediate byte is needed.  */   \
3979         int b = (CHARSET_ISO_CHARS_96 (charset)                         \
3980                  ? intermediate_char_96[reg]                            \
3981                  : intermediate_char_94[reg]);                          \
3982         EMIT_ONE_ASCII_BYTE (b);                                        \
3983       }                                                                 \
3984     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3985     /* Remember what REG now holds.  */                                 \
3986     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3987   } while (0)
3988
3989
3990 /* The two macros below emit the ISO2022 single-shift functions
3991    (single-shift-2 and single-shift-3): an escape sequence when CODING
3992    has the SEVEN_BITS flag, a single control byte otherwise.  */
3993
3994 #define ENCODE_SINGLE_SHIFT_2                                           \
3995   do {                                                                  \
3996     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3997     if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
3998       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3999     else                                                                \
4000       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4001   } while (0)
4002
4003
4004 #define ENCODE_SINGLE_SHIFT_3                                           \
4005   do { /* SS3: ESC 'O' with 7-bit codes, else ISO_CODE_SS3 byte.  */    \
4006     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4007     if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
4008       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4009     else                                                                \
4010       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4011   } while (0)
4012
4013
4014 /* The four macros below emit the ISO2022 locking-shift functions
4015    (shift-in, shift-out, locking-shift-2, and locking-shift-3) and
4016    record the newly invoked graphic register in CODING.  */
4017
4018 #define ENCODE_SHIFT_IN                                 \
4019   do {                                                  \
4020     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4021     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4022   } while (0)
4023
4024
4025 #define ENCODE_SHIFT_OUT                                \
4026   do {  /* SO invokes graphic register 1.  */           \
4027     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4028     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4029   } while (0)
4030
4031
4032 #define ENCODE_LOCKING_SHIFT_2                          \
4033   do { /* LS2 (ESC 'n') invokes graphic register 2.  */ \
4034     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4035     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4036   } while (0)
4037
4038
4039 #define ENCODE_LOCKING_SHIFT_3                          \
4040   do { /* LS3 is ESC 'o'; ESC 'n' would be LS2.  */     \
4041     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4042     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4043   } while (0)
4044
4045
4046 /* Emit the DIMENSION1 character of CHARSET whose position-code is C1,
4047    first producing any designation or invocation codes still needed.
4048    CHARSET must be assignable: ASCII may be replaced by JISX0201-Roman.  */
4049
4050 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4051   do {                                                                  \
4052     int id = CHARSET_ID (charset);                                      \
4053                                                                         \
4054     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4055         && id == charset_ascii)                                         \
4056       {                                                                 \
4057         id = charset_jisx0201_roman;                                    \
4058         charset = CHARSET_FROM_ID (id);                                 \
4059       }                                                                 \
4060                                                                         \
4061     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4062       { /* A single shift is pending: this character uses it up.  */    \
4063         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4064           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4065         else                                                            \
4066           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4067         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4068         break;                                                          \
4069       }                                                                 \
4070     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4071       {                                                                 \
4072         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4073         break;                                                          \
4074       }                                                                 \
4075     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4076       {                                                                 \
4077         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4078         break;                                                          \
4079       }                                                                 \
4080     else                                                                \
4081       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4082          must invoke it, or, at first, designate it to some graphic     \
4083          register.  Then repeat the loop to actually produce the        \
4084          character.  */                                                 \
4085       dst = encode_invocation_designation (charset, coding, dst,        \
4086                                            &produced_chars);            \
4087   } while (1)
4088
4089
4090 /* Emit the DIMENSION2 character of CHARSET whose position-codes are
4091    C1 and C2, first producing any designation or invocation codes
4092    still needed.  CHARSET must be assignable (see USE_OLDJIS below).  */
4093
4094 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4095   do {                                                                  \
4096     int id = CHARSET_ID (charset);                                      \
4097                                                                         \
4098     /* Write JISX0208-1978 in place of JISX0208 if CODING says so.  */  \
4099     if (id == charset_jisx0208                                          \
4100         && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
4101       {                                                                 \
4102         id = charset_jisx0208_1978;                                     \
4103         charset = CHARSET_FROM_ID (id);                                 \
4104       }                                                                 \
4105                                                                         \
4106     /* CHARSET is usable once a single shift is pending or it is        \
4107        invoked to graphic plane 0 or 1.  Until then, designate it to    \
4108        some graphic register and/or invoke it.  */                      \
4109     while (! CODING_ISO_SINGLE_SHIFTING (coding)                        \
4110            && id != CODING_ISO_INVOKED_CHARSET (coding, 0)              \
4111            && id != CODING_ISO_INVOKED_CHARSET (coding, 1))             \
4112       dst = encode_invocation_designation (charset, coding, dst,        \
4113                                            &produced_chars);            \
4114                                                                         \
4115     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4116       {                                                                 \
4117         /* A single shift is pending; it applies to this character      \
4118            only, so clear it afterwards.  */                            \
4119         if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
4120           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4121         else                                                            \
4122           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4123         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4124       }                                                                 \
4125     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4126       /* Invoked to graphic plane 0: emit with the high bits clear.  */ \
4127       EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
4128     else                                                                \
4129       /* Invoked to graphic plane 1: emit with the high bits set.  */   \
4130       EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
4131   } while (0)
4132
4133
4134 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4135   do {                                                                     \
4136     unsigned code;  /* Set by CODING_ENCODE_CHAR below.  */                \
4137     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4138     /* CODE is C1 alone, or C1 << 8 | C2 for other dimensions.  */         \
4139     if (CHARSET_DIMENSION (charset) != 1)                                  \
4140       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4141     else                                                                   \
4142       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4143   } while (0)
4144
4145
4146 /* Make CHARSET usable: emit at DST whatever designation and
4147    invocation codes are still missing, updating `spec.iso_2022' of
4148    *CODING and *P_NCHARS accordingly.  Return the advanced DST.  */
4149
4150 static unsigned char *
4151 encode_invocation_designation (struct charset *charset,
4152                                struct coding_system *coding,
4153                                unsigned char *dst, ptrdiff_t *p_nchars)
4154 {
4155   /* MULTIBYTEP and PRODUCED_CHARS are not named again before the
4156      write-back; presumably the EMIT_* macros reached through the
4157      ENCODE_* macros read and update them -- TODO confirm.  */
4158   bool multibytep = coding->dst_multibyte;
4159   ptrdiff_t produced_chars = *p_nchars;
4160   int charset_id = CHARSET_ID (charset);
4161   int reg = 0;                  /* graphic register number */
4162   bool invoked;
4163
4164   /* Look for a graphic register CHARSET is already designated to.  */
4165   while (reg < 4 && charset_id != CODING_ISO_DESIGNATION (coding, reg))
4166     reg++;
4167
4168   if (reg == 4)
4169     {
4170       /* CHARSET is not designated anywhere yet.  Designate it to the
4171          register it requests, or to graphic register 0 when it
4172          requests none.  */
4173       reg = CODING_ISO_REQUEST (coding, charset_id);
4174       if (reg < 0)
4175         reg = 0;
4176       ENCODE_DESIGNATION (charset, reg, coding);
4177     }
4178
4179   invoked = (CODING_ISO_INVOCATION (coding, 0) == reg
4180              || CODING_ISO_INVOCATION (coding, 1) == reg);
4181   if (! invoked)
4182     {
4183       /* Graphic register REG is not invoked to any graphic plane:
4184          invoke it to graphic plane 0.  Registers 2 and 3 use a
4185          single shift instead when the coding system asks for it.  */
4186       bool single_shift
4187         = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) != 0;
4188
4189       if (reg == 0)
4190         ENCODE_SHIFT_IN;
4191       else if (reg == 1)
4192         ENCODE_SHIFT_OUT;
4193       else if (reg == 2)
4194         {
4195           if (single_shift)
4196             ENCODE_SINGLE_SHIFT_2;
4197           else
4198             ENCODE_LOCKING_SHIFT_2;
4199         }
4200       else if (reg == 3)
4201         {
4202           if (single_shift)
4203             ENCODE_SINGLE_SHIFT_3;
4204           else
4205             ENCODE_LOCKING_SHIFT_3;
4206         }
4207     }
4208
4209   *p_nchars = produced_chars;
4210   return dst;
4211 }
4212
4213
4214 /* Emit whatever shift-in and designation codes are needed to bring
4215    the graphic planes and registers back to their initial state.  */
4216 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4217   do {                                                                  \
4218     struct charset *charset;                                            \
4219     int reg;                                                            \
4220     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4221       ENCODE_SHIFT_IN;                                                  \
4222     for (reg = 0; reg < 4; reg++)                                       \
4223       { /* Skip registers with no initial charset or unchanged.  */     \
4224         if (CODING_ISO_INITIAL (coding, reg) < 0                        \
4225             || (CODING_ISO_DESIGNATION (coding, reg)                    \
4226                 == CODING_ISO_INITIAL (coding, reg)))                   \
4227           continue;                                                     \
4228         charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
4229         ENCODE_DESIGNATION (charset, reg, coding);                      \
4230       }                                                                 \
4231   } while (0)
4232
4233
4234 /* Produce at DST the designation sequences for the charsets used in
4235    the line starting at CHARBUF, and return the number of bytes
4236    produced.  DST should not point directly into a buffer text area,
4237    which a char_charset call may relocate.
4238
4239    If the current block ends before any end-of-line, we may fail to
4240    find all the necessary designations.  */
4241
4242 static ptrdiff_t
4243 encode_designation_at_bol (struct coding_system *coding,
4244                            int *charbuf, int *charbuf_end,
4245                            unsigned char *dst)
4246 {
4247   unsigned char *orig = dst;
4248   /* Charset ID to designate to each graphic register; -1 if none.  */
4249   int r[4] = { -1, -1, -1, -1 };
4250   int found = 0, reg;
4251   /* PRODUCED_CHARS and MULTIBYTEP are not named again below;
4252      presumably the macros inside ENCODE_DESIGNATION use them (TODO
4253      confirm).  */
4254   ptrdiff_t produced_chars = 0;
4255   bool multibytep = coding->dst_multibyte;
4256   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
4257   Lisp_Object charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4258
4259   if (EQ (charset_list, Qiso_2022))
4260     charset_list = Viso_2022_charset_list;
4261
4262   /* Scan to the end of the line, or until all four registers have a
4263      candidate.  The first charset requesting a register wins.  */
4264   for (; charbuf < charbuf_end && found < 4; charbuf++)
4265     {
4266       struct charset *charset;
4267       int c = *charbuf;
4268       int id;
4269
4270       if (c == '\n')
4271         break;
4272       charset = char_charset (c, charset_list, NULL);
4273       id = CHARSET_ID (charset);
4274       reg = CODING_ISO_REQUEST (coding, id);
4275       if (reg < 0 || r[reg] >= 0)
4276         continue;
4277       r[reg] = id;
4278       found++;
4279     }
4280
4281   if (! found)
4282     return 0;
4283
4284   /* Designate each candidate not already in its register.  */
4285   for (reg = 0; reg < 4; reg++)
4286     if (r[reg] >= 0
4287         && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4288       ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4289
4290   return dst - orig;
4291 }
4292
4293 /* Encode CODING->charbuf as ISO2022 bytes at CODING->destination.  See
4294    the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4295 static bool
4296 encode_coding_iso_2022 (struct coding_system *coding)
4297 {
4298   bool multibytep = coding->dst_multibyte;
4299   int *charbuf = coding->charbuf;
4300   int *charbuf_end = charbuf + coding->charbuf_used;
4301   unsigned char *dst = coding->destination + coding->produced;
4302   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4303   int safe_room = 16;           /* argument of ASSURE_DESTINATION */
4304   bool bol_designation
4305     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4306        && CODING_ISO_BOL (coding));
4307   ptrdiff_t produced_chars = 0;
4308   Lisp_Object attrs, eol_type, charset_list;
4309   bool ascii_compatible;
4310   int c;
4311   int preferred_charset_id = -1;        /* from the last charset annotation */
4312
4313   CODING_GET_INFO (coding, attrs, charset_list);
4314   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4315   if (VECTORP (eol_type))
4316     eol_type = Qunix;
4317
4318   setup_iso_safe_charsets (attrs);
4319   /* Charset list may have been changed.  */
4320   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4321   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4322   /* When true, ASCII characters are emitted as is (see below).  */
4323   ascii_compatible
4324     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4325        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4326                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4327
4328   while (charbuf < charbuf_end)
4329     {
4330       ASSURE_DESTINATION (safe_room);
4331
4332       if (bol_designation)
4333         {
4334           /* We have to produce designation sequences if any now.  */
4335           unsigned char desig_buf[16];
4336           int nbytes;
4337           ptrdiff_t offset;
4338           /* NOTE(review): can 4 designations exceed 16 bytes? confirm.  */
4339           charset_map_loaded = 0;
4340           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4341                                               desig_buf);
4342           if (charset_map_loaded
4343               && (offset = coding_change_destination (coding)))
4344             {
4345               dst += offset;
4346               dst_end += offset;
4347             }
4348           memcpy (dst, desig_buf, nbytes);
4349           dst += nbytes;
4350           /* We are sure that designation sequences are all ASCII bytes.  */
4351           produced_chars += nbytes;
4352           bol_designation = 0;
4353           ASSURE_DESTINATION (safe_room);
4354         }
4355
4356       c = *charbuf++;
4357
4358       if (c < 0)
4359         {
4360           /* C is negative: an annotation spanning -C elements.  */
4361           switch (*charbuf)
4362             {
4363             case CODING_ANNOTATE_COMPOSITION_MASK:
4364               /* Not yet implemented.  */
4365               break;
4366             case CODING_ANNOTATE_CHARSET_MASK:
4367               preferred_charset_id = charbuf[2];
4368               if (preferred_charset_id >= 0
4369                   && NILP (Fmemq (make_number (preferred_charset_id),
4370                                   charset_list)))
4371                 preferred_charset_id = -1;
4372               break;
4373             default:
4374               emacs_abort ();
4375             }
4376           charbuf += -c - 1;    /* skip the rest of the annotation */
4377           continue;
4378         }
4379
4380       /* Now encode the character C.  */
4381       if (c < 0x20 || c == 0x7F)        /* control character */
4382         {
4383           if (c == '\n'
4384               || (c == '\r' && EQ (eol_type, Qmac)))
4385             {
4386               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4387                 ENCODE_RESET_PLANE_AND_REGISTER ();
4388               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4389                 {
4390                   int i;
4391
4392                   for (i = 0; i < 4; i++)
4393                     CODING_ISO_DESIGNATION (coding, i)
4394                       = CODING_ISO_INITIAL (coding, i);
4395                 }
4396               bol_designation = ((CODING_ISO_FLAGS (coding)
4397                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4398                                  != 0);
4399             }
4400           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4401             ENCODE_RESET_PLANE_AND_REGISTER ();
4402           EMIT_ONE_ASCII_BYTE (c);
4403         }
4404       else if (ASCII_CHAR_P (c))
4405         {
4406           if (ascii_compatible)
4407             EMIT_ONE_ASCII_BYTE (c);
4408           else
4409             {
4410               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4411               ENCODE_ISO_CHARACTER (charset, c);
4412             }
4413         }
4414       else if (CHAR_BYTE8_P (c))
4415         {
4416           c = CHAR_TO_BYTE8 (c);
4417           EMIT_ONE_BYTE (c);
4418         }
4419       else
4420         {
4421           struct charset *charset;
4422           /* Prefer the charset named by the latest annotation, if any.  */
4423           if (preferred_charset_id >= 0)
4424             {
4425               bool result;
4426
4427               charset = CHARSET_FROM_ID (preferred_charset_id);
4428               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4429               if (! result)
4430                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431                                      NULL, charset);
4432             }
4433           else
4434             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435                                  NULL, charset);
4436           if (!charset)         /* presumably no listed charset has C */
4437             {
4438               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4439                 {
4440                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4441                   charset = CHARSET_FROM_ID (charset_ascii);
4442                 }
4443               else
4444                 {
4445                   c = coding->default_char;
4446                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4447                                        charset_list, NULL, charset);
4448                 }
4449             }
4450           ENCODE_ISO_CHARACTER (charset, c);
4451         }
4452     }
4453   /* After the last block, reset planes and registers if so flagged.  */
4454   if (coding->mode & CODING_MODE_LAST_BLOCK
4455       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4456     {
4457       ASSURE_DESTINATION (safe_room);
4458       ENCODE_RESET_PLANE_AND_REGISTER ();
4459     }
4460   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4461   CODING_ISO_BOL (coding) = bol_designation;
4462   coding->produced_char += produced_chars;
4463   coding->produced = dst - coding->destination;
4464   return 0;
4465 }
4466
4467 \f
4468 /*** 8,9. SJIS and BIG5 handlers ***/
4469
4470 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4471    quite widely.  So, for the moment, Emacs supports them in the bare
4472    C code.  But, in the future, they may be supported only by CCL.  */
4473
4474 /* SJIS is a coding system encoding three character sets: ASCII, the
4475    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4476    encoded as is.  A character of charset katakana-jisx0201 is encoded
4477    as "position-code + 0x80".  A character of charset japanese-jisx0208
4478    is encoded in two bytes, with its two position-codes divided and
4479    shifted so that they fit in the ranges below.
4480
4481    --- CODE RANGE of SJIS ---
4482    (character set)      (range)
4483    ASCII                0x00 .. 0x7F
4484    KATAKANA-JISX0201    0xA0 .. 0xDF
4485    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4486             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4487    -------------------------------
4488
4489 */
4490
4491 /* BIG5 is a coding system encoding two character sets: ASCII and
4492    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4493    character set; each of its characters is encoded in two bytes.
4494
4495    --- CODE RANGE of BIG5 ---
4496    (character set)      (range)
4497    ASCII                0x00 .. 0x7F
4498    Big5 (1st byte)      0xA1 .. 0xFE
4499         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4500    --------------------------
4501
4502   */
4503
4504 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4505    Return true if no byte sequence invalid in SJIS is found.  */
4506
4507 static bool
4508 detect_coding_sjis (struct coding_system *coding,
4509                     struct coding_detection_info *detect_info)
4510 {
4511   const unsigned char *src = coding->source, *src_base;
4512   const unsigned char *src_end = coding->source + coding->src_bytes;
4513   bool multibytep = coding->src_multibyte;
4514   ptrdiff_t consumed_chars = 0;
4515   Lisp_Object attrs, charset_list;
4516   int max_first_byte_of_2_byte_code = 0xEF;
4517   int found = 0;
4518   int c;
4519
4520   CODING_GET_INFO (coding, attrs, charset_list);
4521   /* Lead bytes up to 0xFC are valid when more than 3 charsets are listed.  */
4522   if (XINT (Flength (charset_list)) > 3)
4523     max_first_byte_of_2_byte_code = 0xFC;
4524   detect_info->checked |= CATEGORY_MASK_SJIS;
4525   /* A coding system of this category is always ASCII compatible.  */
4526   src += coding->head_ascii;
4527
4528   for (;;)
4529     {
4530       src_base = src;
4531       ONE_MORE_BYTE (c);
4532       if (c < 0x80)
4533         continue;
4534       if (c < 0xA0 || c >= 0xE0)
4535         {
4536           /* Not a one-byte code (0xA0..0xDF), so C must be the lead
4537              byte of a two-byte code followed by a valid trail byte.  */
4538           if (c < 0x81 || c > max_first_byte_of_2_byte_code)
4539             break;
4540           ONE_MORE_BYTE (c);
4541           if (c < 0x40 || c == 0x7F || c > 0xFC)
4542             break;
4543         }
4544       found = CATEGORY_MASK_SJIS;
4545     }
4546   detect_info->rejected |= CATEGORY_MASK_SJIS;
4547   return 0;
4548
4549  no_more_source:
4550   /* Out of source; a code cut short in the last block is invalid.  */
4551   if (! (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK))
4552     {
4553       detect_info->found |= found;
4554       return 1;
4555     }
4556   detect_info->rejected |= CATEGORY_MASK_SJIS;
4557   return 0;
4558 }
4559
4560 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4561    Return true if a text is encoded in BIG5.  */
4562
4563 static bool
4564 detect_coding_big5 (struct coding_system *coding,
4565                     struct coding_detection_info *detect_info)
4566 {
4567   const unsigned char *src = coding->source, *src_base;
4568   const unsigned char *src_end = coding->source + coding->src_bytes;
4569   bool multibytep = coding->src_multibyte;
4570   ptrdiff_t consumed_chars = 0;
4571   int found = 0;
4572   int c;
4573
4574   detect_info->checked |= CATEGORY_MASK_BIG5;
4575   /* A coding system of this category is always ASCII compatible.  */
4576   src += coding->head_ascii;
4577
4578   while (1)
4579     {
4580       src_base = src;
4581       ONE_MORE_BYTE (c);
4582       if (c < 0x80)
4583         continue;
4584       if (c >= 0xA1)
4585         {
4586           ONE_MORE_BYTE (c);
4587           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4588             return 0;
4589           found = CATEGORY_MASK_BIG5;
4590         }
4591       else
4592         break;
4593     }
4594   detect_info->rejected |= CATEGORY_MASK_BIG5;
4595   return 0;
4596
4597  no_more_source:
4598   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4599     {
4600       detect_info->rejected |= CATEGORY_MASK_BIG5;
4601       return 0;
4602     }
4603   detect_info->found |= found;
4604   return 1;
4605 }
4606
4607 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4608
4609 static void
4610 decode_coding_sjis (struct coding_system *coding)
4611 {
4612   const unsigned char *src = coding->source + coding->consumed;
4613   const unsigned char *src_end = coding->source + coding->src_bytes;
4614   const unsigned char *src_base;
4615   int *charbuf = coding->charbuf + coding->charbuf_used;
4616   /* We may produce one charset annotation in one loop and one more at
4617      the end.  */
4618   int *charbuf_end
4619     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4620   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4621   bool multibytep = coding->src_multibyte;
4622   struct charset *charset_roman, *charset_kanji, *charset_kana;
4623   struct charset *charset_kanji2;
4624   Lisp_Object attrs, charset_list, val;
4625   ptrdiff_t char_offset = coding->produced_char;
4626   ptrdiff_t last_offset = char_offset;
4627   int last_id = charset_ascii;
4628   bool eol_dos
4629     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4630   int byte_after_cr = -1;
4631
4632   CODING_GET_INFO (coding, attrs, charset_list);
4633
4634   val = charset_list;
4635   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4637   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4638   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4639
4640   while (1)
4641     {
4642       int c, c1;
4643       struct charset *charset;
4644
4645       src_base = src;
4646       consumed_chars_base = consumed_chars;
4647
4648       if (charbuf >= charbuf_end)
4649         {
4650           if (byte_after_cr >= 0)
4651             src_base--;
4652           break;
4653         }
4654
4655       if (byte_after_cr >= 0)
4656         c = byte_after_cr, byte_after_cr = -1;
4657       else
4658         ONE_MORE_BYTE (c);
4659       if (c < 0)
4660         goto invalid_code;
4661       if (c < 0x80)
4662         {
4663           if (eol_dos && c == '\r')
4664             ONE_MORE_BYTE (byte_after_cr);
4665           charset = charset_roman;
4666         }
4667       else if (c == 0x80 || c == 0xA0)
4668         goto invalid_code;
4669       else if (c >= 0xA1 && c <= 0xDF)
4670         {
4671           /* SJIS -> JISX0201-Kana */
4672           c &= 0x7F;
4673           charset = charset_kana;
4674         }
4675       else if (c <= 0xEF)
4676         {
4677           /* SJIS -> JISX0208 */
4678           ONE_MORE_BYTE (c1);
4679           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4680             goto invalid_code;
4681           c = (c << 8) | c1;
4682           SJIS_TO_JIS (c);
4683           charset = charset_kanji;
4684         }
4685       else if (c <= 0xFC && charset_kanji2)
4686         {
4687           /* SJIS -> JISX0213-2 */
4688           ONE_MORE_BYTE (c1);
4689           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4690             goto invalid_code;
4691           c = (c << 8) | c1;
4692           SJIS_TO_JIS2 (c);
4693           charset = charset_kanji2;
4694         }
4695       else
4696         goto invalid_code;
4697       if (charset->id != charset_ascii
4698           && last_id != charset->id)
4699         {
4700           if (last_id != charset_ascii)
4701             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4702           last_id = charset->id;
4703           last_offset = char_offset;
4704         }
4705       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4706       *charbuf++ = c;
4707       char_offset++;
4708       continue;
4709
4710     invalid_code:
4711       src = src_base;
4712       consumed_chars = consumed_chars_base;
4713       ONE_MORE_BYTE (c);
4714       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4715       char_offset++;
4716       coding->errors++;
4717     }
4718
4719  no_more_source:
4720   if (last_id != charset_ascii)
4721     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4722   coding->consumed_char += consumed_chars_base;
4723   coding->consumed = src_base - coding->source;
4724   coding->charbuf_used = charbuf - coding->charbuf;
4725 }
4726
4727 static void
4728 decode_coding_big5 (struct coding_system *coding)
4729 {
4730   const unsigned char *src = coding->source + coding->consumed;
4731   const unsigned char *src_end = coding->source + coding->src_bytes;
4732   const unsigned char *src_base;
4733   int *charbuf = coding->charbuf + coding->charbuf_used;
4734   /* We may produce one charset annotation in one loop and one more at
4735      the end.  */
4736   int *charbuf_end
4737     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4738   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4739   bool multibytep = coding->src_multibyte;
4740   struct charset *charset_roman, *charset_big5;
4741   Lisp_Object attrs, charset_list, val;
4742   ptrdiff_t char_offset = coding->produced_char;
4743   ptrdiff_t last_offset = char_offset;
4744   int last_id = charset_ascii;
4745   bool eol_dos
4746     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4747   int byte_after_cr = -1;
4748
4749   CODING_GET_INFO (coding, attrs, charset_list);
4750   val = charset_list;
4751   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4752   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4753
4754   while (1)
4755     {
4756       int c, c1;
4757       struct charset *charset;
4758
4759       src_base = src;
4760       consumed_chars_base = consumed_chars;
4761
4762       if (charbuf >= charbuf_end)
4763         {
4764           if (byte_after_cr >= 0)
4765             src_base--;
4766           break;
4767         }
4768
4769       if (byte_after_cr >= 0)
4770         c = byte_after_cr, byte_after_cr = -1;
4771       else
4772         ONE_MORE_BYTE (c);
4773
4774       if (c < 0)
4775         goto invalid_code;
4776       if (c < 0x80)
4777         {
4778           if (eol_dos && c == '\r')
4779             ONE_MORE_BYTE (byte_after_cr);
4780           charset = charset_roman;
4781         }
4782       else
4783         {
4784           /* BIG5 -> Big5 */
4785           if (c < 0xA1 || c > 0xFE)
4786             goto invalid_code;
4787           ONE_MORE_BYTE (c1);
4788           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4789             goto invalid_code;
4790           c = c << 8 | c1;
4791           charset = charset_big5;
4792         }
4793       if (charset->id != charset_ascii
4794           && last_id != charset->id)
4795         {
4796           if (last_id != charset_ascii)
4797             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798           last_id = charset->id;
4799           last_offset = char_offset;
4800         }
4801       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4802       *charbuf++ = c;
4803       char_offset++;
4804       continue;
4805
4806     invalid_code:
4807       src = src_base;
4808       consumed_chars = consumed_chars_base;
4809       ONE_MORE_BYTE (c);
4810       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4811       char_offset++;
4812       coding->errors++;
4813     }
4814
4815  no_more_source:
4816   if (last_id != charset_ascii)
4817     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4818   coding->consumed_char += consumed_chars_base;
4819   coding->consumed = src_base - coding->source;
4820   coding->charbuf_used = charbuf - coding->charbuf;
4821 }
4822
4823 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4824    This function can encode charsets `ascii', `katakana-jisx0201',
4825    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4826    are sure that all these charsets are registered as official charset
4827    (i.e. do not have extended leading-codes).  Characters of other
4828    charsets are produced without any encoding.  */
4829
4830 static bool
4831 encode_coding_sjis (struct coding_system *coding)
4832 {
4833   bool multibytep = coding->dst_multibyte;
4834   int *charbuf = coding->charbuf;
4835   int *charbuf_end = charbuf + coding->charbuf_used;
4836   unsigned char *dst = coding->destination + coding->produced;
4837   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4838   int safe_room = 4;
4839   ptrdiff_t produced_chars = 0;
4840   Lisp_Object attrs, charset_list, val;
4841   bool ascii_compatible;
4842   struct charset *charset_kanji, *charset_kana;
4843   struct charset *charset_kanji2;
4844   int c;
4845
4846   CODING_GET_INFO (coding, attrs, charset_list);
4847   val = XCDR (charset_list);
4848   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4849   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4850   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4851
4852   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4853
4854   while (charbuf < charbuf_end)
4855     {
4856       ASSURE_DESTINATION (safe_room);
4857       c = *charbuf++;
4858       /* Now encode the character C.  */
4859       if (ASCII_CHAR_P (c) && ascii_compatible)
4860         EMIT_ONE_ASCII_BYTE (c);
4861       else if (CHAR_BYTE8_P (c))
4862         {
4863           c = CHAR_TO_BYTE8 (c);
4864           EMIT_ONE_BYTE (c);
4865         }
4866       else
4867         {
4868           unsigned code;
4869           struct charset *charset;
4870           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4871                                &code, charset);
4872
4873           if (!charset)
4874             {
4875               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4876                 {
4877                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4878                   charset = CHARSET_FROM_ID (charset_ascii);
4879                 }
4880               else
4881                 {
4882                   c = coding->default_char;
4883                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4884                                        charset_list, &code, charset);
4885                 }
4886             }
4887           if (code == CHARSET_INVALID_CODE (charset))
4888             emacs_abort ();
4889           if (charset == charset_kanji)
4890             {
4891               int c1, c2;
4892               JIS_TO_SJIS (code);
4893               c1 = code >> 8, c2 = code & 0xFF;
4894               EMIT_TWO_BYTES (c1, c2);
4895             }
4896           else if (charset == charset_kana)
4897             EMIT_ONE_BYTE (code | 0x80);
4898           else if (charset_kanji2 && charset == charset_kanji2)
4899             {
4900               int c1, c2;
4901
4902               c1 = code >> 8;
4903               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4904                   || c1 == 0x28
4905                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4906                 {
4907                   JIS_TO_SJIS2 (code);
4908                   c1 = code >> 8, c2 = code & 0xFF;
4909                   EMIT_TWO_BYTES (c1, c2);
4910                 }
4911               else
4912                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4913             }
4914           else
4915             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4916         }
4917     }
4918   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4919   coding->produced_char += produced_chars;
4920   coding->produced = dst - coding->destination;
4921   return 0;
4922 }
4923
4924 static bool
4925 encode_coding_big5 (struct coding_system *coding)
4926 {
4927   bool multibytep = coding->dst_multibyte;
4928   int *charbuf = coding->charbuf;
4929   int *charbuf_end = charbuf + coding->charbuf_used;
4930   unsigned char *dst = coding->destination + coding->produced;
4931   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4932   int safe_room = 4;
4933   ptrdiff_t produced_chars = 0;
4934   Lisp_Object attrs, charset_list, val;
4935   bool ascii_compatible;
4936   struct charset *charset_big5;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4942   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4943
4944   while (charbuf < charbuf_end)
4945     {
4946       ASSURE_DESTINATION (safe_room);
4947       c = *charbuf++;
4948       /* Now encode the character C.  */
4949       if (ASCII_CHAR_P (c) && ascii_compatible)
4950         EMIT_ONE_ASCII_BYTE (c);
4951       else if (CHAR_BYTE8_P (c))
4952         {
4953           c = CHAR_TO_BYTE8 (c);
4954           EMIT_ONE_BYTE (c);
4955         }
4956       else
4957         {
4958           unsigned code;
4959           struct charset *charset;
4960           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4961                                &code, charset);
4962
4963           if (! charset)
4964             {
4965               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4966                 {
4967                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4968                   charset = CHARSET_FROM_ID (charset_ascii);
4969                 }
4970               else
4971                 {
4972                   c = coding->default_char;
4973                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4974                                        charset_list, &code, charset);
4975                 }
4976             }
4977           if (code == CHARSET_INVALID_CODE (charset))
4978             emacs_abort ();
4979           if (charset == charset_big5)
4980             {
4981               int c1, c2;
4982
4983               c1 = code >> 8, c2 = code & 0xFF;
4984               EMIT_TWO_BYTES (c1, c2);
4985             }
4986           else
4987             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4988         }
4989     }
4990   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4991   coding->produced_char += produced_chars;
4992   coding->produced = dst - coding->destination;
4993   return 0;
4994 }
4995
4996 \f
4997 /*** 10. CCL handlers ***/
4998
4999 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5000    Return true if a text is encoded in a coding system of which
5001    encoder/decoder are written in CCL program.  */
5002
5003 static bool
5004 detect_coding_ccl (struct coding_system *coding,
5005                    struct coding_detection_info *detect_info)
5006 {
5007   const unsigned char *src = coding->source, *src_base;
5008   const unsigned char *src_end = coding->source + coding->src_bytes;
5009   bool multibytep = coding->src_multibyte;
5010   ptrdiff_t consumed_chars = 0;
5011   int found = 0;
5012   unsigned char *valids;
5013   ptrdiff_t head_ascii = coding->head_ascii;
5014   Lisp_Object attrs;
5015
5016   detect_info->checked |= CATEGORY_MASK_CCL;
5017
5018   coding = &coding_categories[coding_category_ccl];
5019   valids = CODING_CCL_VALIDS (coding);
5020   attrs = CODING_ID_ATTRS (coding->id);
5021   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5022     src += head_ascii;
5023
5024   while (1)
5025     {
5026       int c;
5027
5028       src_base = src;
5029       ONE_MORE_BYTE (c);
5030       if (c < 0 || ! valids[c])
5031         break;
5032       if ((valids[c] > 1))
5033         found = CATEGORY_MASK_CCL;
5034     }
5035   detect_info->rejected |= CATEGORY_MASK_CCL;
5036   return 0;
5037
5038  no_more_source:
5039   detect_info->found |= found;
5040   return 1;
5041 }
5042
5043 static void
5044 decode_coding_ccl (struct coding_system *coding)
5045 {
5046   const unsigned char *src = coding->source + coding->consumed;
5047   const unsigned char *src_end = coding->source + coding->src_bytes;
5048   int *charbuf = coding->charbuf + coding->charbuf_used;
5049   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5050   ptrdiff_t consumed_chars = 0;
5051   bool multibytep = coding->src_multibyte;
5052   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5053   int source_charbuf[1024];
5054   int source_byteidx[1025];
5055   Lisp_Object attrs, charset_list;
5056
5057   CODING_GET_INFO (coding, attrs, charset_list);
5058
5059   while (1)
5060     {
5061       const unsigned char *p = src;
5062       ptrdiff_t offset;
5063       int i = 0;
5064
5065       if (multibytep)
5066         {
5067           while (i < 1024 && p < src_end)
5068             {
5069               source_byteidx[i] = p - src;
5070               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5071             }
5072           source_byteidx[i] = p - src;
5073         }
5074       else
5075         while (i < 1024 && p < src_end)
5076           source_charbuf[i++] = *p++;
5077
5078       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5079         ccl->last_block = 1;
5080       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5081       charset_map_loaded = 0;
5082       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5083                   charset_list);
5084       if (charset_map_loaded
5085           && (offset = coding_change_source (coding)))
5086         {
5087           p += offset;
5088           src += offset;
5089           src_end += offset;
5090         }
5091       charbuf += ccl->produced;
5092       if (multibytep)
5093         src += source_byteidx[ccl->consumed];
5094       else
5095         src += ccl->consumed;
5096       consumed_chars += ccl->consumed;
5097       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5098         break;
5099     }
5100
5101   switch (ccl->status)
5102     {
5103     case CCL_STAT_SUSPEND_BY_SRC:
5104       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5105       break;
5106     case CCL_STAT_SUSPEND_BY_DST:
5107       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5108       break;
5109     case CCL_STAT_QUIT:
5110     case CCL_STAT_INVALID_CMD:
5111       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5112       break;
5113     default:
5114       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115       break;
5116     }
5117   coding->consumed_char += consumed_chars;
5118   coding->consumed = src - coding->source;
5119   coding->charbuf_used = charbuf - coding->charbuf;
5120 }
5121
5122 static bool
5123 encode_coding_ccl (struct coding_system *coding)
5124 {
5125   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5126   bool multibytep = coding->dst_multibyte;
5127   int *charbuf = coding->charbuf;
5128   int *charbuf_end = charbuf + coding->charbuf_used;
5129   unsigned char *dst = coding->destination + coding->produced;
5130   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5131   int destination_charbuf[1024];
5132   ptrdiff_t produced_chars = 0;
5133   int i;
5134   Lisp_Object attrs, charset_list;
5135
5136   CODING_GET_INFO (coding, attrs, charset_list);
5137   if (coding->consumed_char == coding->src_chars
5138       && coding->mode & CODING_MODE_LAST_BLOCK)
5139     ccl->last_block = 1;
5140
5141   do
5142     {
5143       ptrdiff_t offset;
5144
5145       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5146       charset_map_loaded = 0;
5147       ccl_driver (ccl, charbuf, destination_charbuf,
5148                   charbuf_end - charbuf, 1024, charset_list);
5149       if (charset_map_loaded
5150           && (offset = coding_change_destination (coding)))
5151         dst += offset;
5152       if (multibytep)
5153         {
5154           ASSURE_DESTINATION (ccl->produced * 2);
5155           for (i = 0; i < ccl->produced; i++)
5156             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5157         }
5158       else
5159         {
5160           ASSURE_DESTINATION (ccl->produced);
5161           for (i = 0; i < ccl->produced; i++)
5162             *dst++ = destination_charbuf[i] & 0xFF;
5163           produced_chars += ccl->produced;
5164         }
5165       charbuf += ccl->consumed;
5166       if (ccl->status == CCL_STAT_QUIT
5167           || ccl->status == CCL_STAT_INVALID_CMD)
5168         break;
5169     }
5170   while (charbuf < charbuf_end);
5171
5172   switch (ccl->status)
5173     {
5174     case CCL_STAT_SUSPEND_BY_SRC:
5175       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5176       break;
5177     case CCL_STAT_SUSPEND_BY_DST:
5178       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5179       break;
5180     case CCL_STAT_QUIT:
5181     case CCL_STAT_INVALID_CMD:
5182       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5183       break;
5184     default:
5185       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5186       break;
5187     }
5188
5189   coding->produced_char += produced_chars;
5190   coding->produced = dst - coding->destination;
5191   return 0;
5192 }
5193
5194 \f
5195 /*** 10, 11. no-conversion handlers ***/
5196
5197 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5198
5199 static void
5200 decode_coding_raw_text (struct coding_system *coding)
5201 {
5202   bool eol_dos
5203     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5204
5205   coding->chars_at_source = 1;
5206   coding->consumed_char = coding->src_chars;
5207   coding->consumed = coding->src_bytes;
5208   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5209     {
5210       coding->consumed_char--;
5211       coding->consumed--;
5212       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5213     }
5214   else
5215     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5216 }
5217
5218 static bool
5219 encode_coding_raw_text (struct coding_system *coding)
5220 {
5221   bool multibytep = coding->dst_multibyte;
5222   int *charbuf = coding->charbuf;
5223   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5224   unsigned char *dst = coding->destination + coding->produced;
5225   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5226   ptrdiff_t produced_chars = 0;
5227   int c;
5228
5229   if (multibytep)
5230     {
5231       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5232
5233       if (coding->src_multibyte)
5234         while (charbuf < charbuf_end)
5235           {
5236             ASSURE_DESTINATION (safe_room);
5237             c = *charbuf++;
5238             if (ASCII_CHAR_P (c))
5239               EMIT_ONE_ASCII_BYTE (c);
5240             else if (CHAR_BYTE8_P (c))
5241               {
5242                 c = CHAR_TO_BYTE8 (c);
5243                 EMIT_ONE_BYTE (c);
5244               }
5245             else
5246               {
5247                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5248
5249                 CHAR_STRING_ADVANCE (c, p1);
5250                 do
5251                   {
5252                     EMIT_ONE_BYTE (*p0);
5253                     p0++;
5254                   }
5255                 while (p0 < p1);
5256               }
5257           }
5258       else
5259         while (charbuf < charbuf_end)
5260           {
5261             ASSURE_DESTINATION (safe_room);
5262             c = *charbuf++;
5263             EMIT_ONE_BYTE (c);
5264           }
5265     }
5266   else
5267     {
5268       if (coding->src_multibyte)
5269         {
5270           int safe_room = MAX_MULTIBYTE_LENGTH;
5271
5272           while (charbuf < charbuf_end)
5273             {
5274               ASSURE_DESTINATION (safe_room);
5275               c = *charbuf++;
5276               if (ASCII_CHAR_P (c))
5277                 *dst++ = c;
5278               else if (CHAR_BYTE8_P (c))
5279                 *dst++ = CHAR_TO_BYTE8 (c);
5280               else
5281                 CHAR_STRING_ADVANCE (c, dst);
5282             }
5283         }
5284       else
5285         {
5286           ASSURE_DESTINATION (charbuf_end - charbuf);
5287           while (charbuf < charbuf_end && dst < dst_end)
5288             *dst++ = *charbuf++;
5289         }
5290       produced_chars = dst - (coding->destination + coding->produced);
5291     }
5292   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5293   coding->produced_char += produced_chars;
5294   coding->produced = dst - coding->destination;
5295   return 0;
5296 }
5297
5298 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5299    Return true if a text is encoded in a charset-based coding system.  */
5300
5301 static bool
5302 detect_coding_charset (struct coding_system *coding,
5303                        struct coding_detection_info *detect_info)
5304 {
5305   const unsigned char *src = coding->source, *src_base;
5306   const unsigned char *src_end = coding->source + coding->src_bytes;
5307   bool multibytep = coding->src_multibyte;
5308   ptrdiff_t consumed_chars = 0;
5309   Lisp_Object attrs, valids, name;
5310   int found = 0;
5311   ptrdiff_t head_ascii = coding->head_ascii;
5312   bool check_latin_extra = 0;
5313
5314   detect_info->checked |= CATEGORY_MASK_CHARSET;
5315
5316   coding = &coding_categories[coding_category_charset];
5317   attrs = CODING_ID_ATTRS (coding->id);
5318   valids = AREF (attrs, coding_attr_charset_valids);
5319   name = CODING_ID_NAME (coding->id);
5320   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5321                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5322       || strncmp (SSDATA (SYMBOL_NAME (name)),
5323                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5324     check_latin_extra = 1;
5325
5326   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5327     src += head_ascii;
5328
5329   while (1)
5330     {
5331       int c;
5332       Lisp_Object val;
5333       struct charset *charset;
5334       int dim, idx;
5335
5336       src_base = src;
5337       ONE_MORE_BYTE (c);
5338       if (c < 0)
5339         continue;
5340       val = AREF (valids, c);
5341       if (NILP (val))
5342         break;
5343       if (c >= 0x80)
5344         {
5345           if (c < 0xA0
5346               && check_latin_extra
5347               && (!VECTORP (Vlatin_extra_code_table)
5348                   || NILP (AREF (Vlatin_extra_code_table, c))))
5349             break;
5350           found = CATEGORY_MASK_CHARSET;
5351         }
5352       if (INTEGERP (val))
5353         {
5354           charset = CHARSET_FROM_ID (XFASTINT (val));
5355           dim = CHARSET_DIMENSION (charset);
5356           for (idx = 1; idx < dim; idx++)
5357             {
5358               if (src == src_end)
5359                 goto too_short;
5360               ONE_MORE_BYTE (c);
5361               if (c < charset->code_space[(dim - 1 - idx) * 4]
5362                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5363                 break;
5364             }
5365           if (idx < dim)
5366             break;
5367         }
5368       else
5369         {
5370           idx = 1;
5371           for (; CONSP (val); val = XCDR (val))
5372             {
5373               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5374               dim = CHARSET_DIMENSION (charset);
5375               while (idx < dim)
5376                 {
5377                   if (src == src_end)
5378                     goto too_short;
5379                   ONE_MORE_BYTE (c);
5380                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5381                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5382                     break;
5383                   idx++;
5384                 }
5385               if (idx == dim)
5386                 {
5387                   val = Qnil;
5388                   break;
5389                 }
5390             }
5391           if (CONSP (val))
5392             break;
5393         }
5394     }
5395  too_short:
5396   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5397   return 0;
5398
5399  no_more_source:
5400   detect_info->found |= found;
5401   return 1;
5402 }
5403
5404 static void
5405 decode_coding_charset (struct coding_system *coding)
5406 {
5407   const unsigned char *src = coding->source + coding->consumed;
5408   const unsigned char *src_end = coding->source + coding->src_bytes;
5409   const unsigned char *src_base;
5410   int *charbuf = coding->charbuf + coding->charbuf_used;
5411   /* We may produce one charset annotation in one loop and one more at
5412      the end.  */
5413   int *charbuf_end
5414     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5415   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5416   bool multibytep = coding->src_multibyte;
5417   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5418   Lisp_Object valids;
5419   ptrdiff_t char_offset = coding->produced_char;
5420   ptrdiff_t last_offset = char_offset;
5421   int last_id = charset_ascii;
5422   bool eol_dos
5423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5424   int byte_after_cr = -1;
5425
5426   valids = AREF (attrs, coding_attr_charset_valids);
5427
5428   while (1)
5429     {
5430       int c;
5431       Lisp_Object val;
5432       struct charset *charset;
5433       int dim;
5434       int len = 1;
5435       unsigned code;
5436
5437       src_base = src;
5438       consumed_chars_base = consumed_chars;
5439
5440       if (charbuf >= charbuf_end)
5441         {
5442           if (byte_after_cr >= 0)
5443             src_base--;
5444           break;
5445         }
5446
5447       if (byte_after_cr >= 0)
5448         {
5449           c = byte_after_cr;
5450           byte_after_cr = -1;
5451         }
5452       else
5453         {
5454           ONE_MORE_BYTE (c);
5455           if (eol_dos && c == '\r')
5456             ONE_MORE_BYTE (byte_after_cr);
5457         }
5458       if (c < 0)
5459         goto invalid_code;
5460       code = c;
5461
5462       val = AREF (valids, c);
5463       if (! INTEGERP (val) && ! CONSP (val))
5464         goto invalid_code;
5465       if (INTEGERP (val))
5466         {
5467           charset = CHARSET_FROM_ID (XFASTINT (val));
5468           dim = CHARSET_DIMENSION (charset);
5469           while (len < dim)
5470             {
5471               ONE_MORE_BYTE (c);
5472               code = (code << 8) | c;
5473               len++;
5474             }
5475           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5476                               charset, code, c);
5477         }
5478       else
5479         {
5480           /* VAL is a list of charset IDs.  It is assured that the
5481              list is sorted by charset dimensions (smaller one
5482              comes first).  */
5483           while (CONSP (val))
5484             {
5485               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5486               dim = CHARSET_DIMENSION (charset);
5487               while (len < dim)
5488                 {
5489                   ONE_MORE_BYTE (c);
5490                   code = (code << 8) | c;
5491                   len++;
5492                 }
5493               CODING_DECODE_CHAR (coding, src, src_base,
5494                                   src_end, charset, code, c);
5495               if (c >= 0)
5496                 break;
5497               val = XCDR (val);
5498             }
5499         }
5500       if (c < 0)
5501         goto invalid_code;
5502       if (charset->id != charset_ascii
5503           && last_id != charset->id)
5504         {
5505           if (last_id != charset_ascii)
5506             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5507           last_id = charset->id;
5508           last_offset = char_offset;
5509         }
5510
5511       *charbuf++ = c;
5512       char_offset++;
5513       continue;
5514
5515     invalid_code:
5516       src = src_base;
5517       consumed_chars = consumed_chars_base;
5518       ONE_MORE_BYTE (c);
5519       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5520       char_offset++;
5521       coding->errors++;
5522     }
5523
5524  no_more_source:
5525   if (last_id != charset_ascii)
5526     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5527   coding->consumed_char += consumed_chars_base;
5528   coding->consumed = src_base - coding->source;
5529   coding->charbuf_used = charbuf - coding->charbuf;
5530 }
5531
5532 static bool
5533 encode_coding_charset (struct coding_system *coding)
5534 {
5535   bool multibytep = coding->dst_multibyte;
5536   int *charbuf = coding->charbuf;
5537   int *charbuf_end = charbuf + coding->charbuf_used;
5538   unsigned char *dst = coding->destination + coding->produced;
5539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5540   int safe_room = MAX_MULTIBYTE_LENGTH;
5541   ptrdiff_t produced_chars = 0;
5542   Lisp_Object attrs, charset_list;
5543   bool ascii_compatible;
5544   int c;
5545
5546   CODING_GET_INFO (coding, attrs, charset_list);
5547   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5548
5549   while (charbuf < charbuf_end)
5550     {
5551       struct charset *charset;
5552       unsigned code;
5553
5554       ASSURE_DESTINATION (safe_room);
5555       c = *charbuf++;
5556       if (ascii_compatible && ASCII_CHAR_P (c))
5557         EMIT_ONE_ASCII_BYTE (c);
5558       else if (CHAR_BYTE8_P (c))
5559         {
5560           c = CHAR_TO_BYTE8 (c);
5561           EMIT_ONE_BYTE (c);
5562         }
5563       else
5564         {
5565           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5566                                &code, charset);
5567
5568           if (charset)
5569             {
5570               if (CHARSET_DIMENSION (charset) == 1)
5571                 EMIT_ONE_BYTE (code);
5572               else if (CHARSET_DIMENSION (charset) == 2)
5573                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5574               else if (CHARSET_DIMENSION (charset) == 3)
5575                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5576               else
5577                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5578                                  (code >> 8) & 0xFF, code & 0xFF);
5579             }
5580           else
5581             {
5582               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5583                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5584               else
5585                 c = coding->default_char;
5586               EMIT_ONE_BYTE (c);
5587             }
5588         }
5589     }
5590
5591   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5592   coding->produced_char += produced_chars;
5593   coding->produced = dst - coding->destination;
5594   return 0;
5595 }
5596
5597 \f
5598 /*** 10. C library functions ***/
5599
5600 /* Setup coding context CODING from information about CODING_SYSTEM.
5601    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5602    CODING_SYSTEM is invalid, signal an error.  */
5603
5604 void
5605 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5606 {
5607   Lisp_Object attrs;
5608   Lisp_Object eol_type;
5609   Lisp_Object coding_type;
5610   Lisp_Object val;
5611
5612   if (NILP (coding_system))
5613     coding_system = Qundecided;
5614
5615   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5616
5617   attrs = CODING_ID_ATTRS (coding->id);
5618   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5619
5620   coding->mode = 0;
5621   coding->head_ascii = -1;
5622   if (VECTORP (eol_type))
5623     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5624                             | CODING_REQUIRE_DETECTION_MASK);
5625   else if (! EQ (eol_type, Qunix))
5626     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5627                             | CODING_REQUIRE_ENCODING_MASK);
5628   else
5629     coding->common_flags = 0;
5630   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5631     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5632   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5633     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5634   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5635     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5636
5637   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5638   coding->max_charset_id = SCHARS (val) - 1;
5639   coding->safe_charsets = SDATA (val);
5640   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5641   coding->carryover_bytes = 0;
5642
5643   coding_type = CODING_ATTR_TYPE (attrs);
5644   if (EQ (coding_type, Qundecided))
5645     {
5646       coding->detector = NULL;
5647       coding->decoder = decode_coding_raw_text;
5648       coding->encoder = encode_coding_raw_text;
5649       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5650     }
5651   else if (EQ (coding_type, Qiso_2022))
5652     {
5653       int i;
5654       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5655
5656       /* Invoke graphic register 0 to plane 0.  */
5657       CODING_ISO_INVOCATION (coding, 0) = 0;
5658       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5659       CODING_ISO_INVOCATION (coding, 1)
5660         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5661       /* Setup the initial status of designation.  */
5662       for (i = 0; i < 4; i++)
5663         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5664       /* Not single shifting initially.  */
5665       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5666       /* Beginning of buffer should also be regarded as bol. */
5667       CODING_ISO_BOL (coding) = 1;
5668       coding->detector = detect_coding_iso_2022;
5669       coding->decoder = decode_coding_iso_2022;
5670       coding->encoder = encode_coding_iso_2022;
5671       if (flags & CODING_ISO_FLAG_SAFE)
5672         coding->mode |= CODING_MODE_SAFE_ENCODING;
5673       coding->common_flags
5674         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5675             | CODING_REQUIRE_FLUSHING_MASK);
5676       if (flags & CODING_ISO_FLAG_COMPOSITION)
5677         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5678       if (flags & CODING_ISO_FLAG_DESIGNATION)
5679         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5680       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5681         {
5682           setup_iso_safe_charsets (attrs);
5683           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5684           coding->max_charset_id = SCHARS (val) - 1;
5685           coding->safe_charsets = SDATA (val);
5686         }
5687       CODING_ISO_FLAGS (coding) = flags;
5688       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5689       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5690       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5691       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5692     }
5693   else if (EQ (coding_type, Qcharset))
5694     {
5695       coding->detector = detect_coding_charset;
5696       coding->decoder = decode_coding_charset;
5697       coding->encoder = encode_coding_charset;
5698       coding->common_flags
5699         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5700     }
5701   else if (EQ (coding_type, Qutf_8))
5702     {
5703       val = AREF (attrs, coding_attr_utf_bom);
5704       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5705                                    : EQ (val, Qt) ? utf_with_bom
5706                                    : utf_without_bom);
5707       coding->detector = detect_coding_utf_8;
5708       coding->decoder = decode_coding_utf_8;
5709       coding->encoder = encode_coding_utf_8;
5710       coding->common_flags
5711         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5712       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5713         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5714     }
5715   else if (EQ (coding_type, Qutf_16))
5716     {
5717       val = AREF (attrs, coding_attr_utf_bom);
5718       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5719                                     : EQ (val, Qt) ? utf_with_bom
5720                                     : utf_without_bom);
5721       val = AREF (attrs, coding_attr_utf_16_endian);
5722       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5723                                        : utf_16_little_endian);
5724       CODING_UTF_16_SURROGATE (coding) = 0;
5725       coding->detector = detect_coding_utf_16;
5726       coding->decoder = decode_coding_utf_16;
5727       coding->encoder = encode_coding_utf_16;
5728       coding->common_flags
5729         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5730       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5731         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5732     }
5733   else if (EQ (coding_type, Qccl))
5734     {
5735       coding->detector = detect_coding_ccl;
5736       coding->decoder = decode_coding_ccl;
5737       coding->encoder = encode_coding_ccl;
5738       coding->common_flags
5739         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5740             | CODING_REQUIRE_FLUSHING_MASK);
5741     }
5742   else if (EQ (coding_type, Qemacs_mule))
5743     {
5744       coding->detector = detect_coding_emacs_mule;
5745       coding->decoder = decode_coding_emacs_mule;
5746       coding->encoder = encode_coding_emacs_mule;
5747       coding->common_flags
5748         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5749       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5750           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5751         {
5752           Lisp_Object tail, safe_charsets;
5753           int max_charset_id = 0;
5754
5755           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5756                tail = XCDR (tail))
5757             if (max_charset_id < XFASTINT (XCAR (tail)))
5758               max_charset_id = XFASTINT (XCAR (tail));
5759           safe_charsets = make_uninit_string (max_charset_id + 1);
5760           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5761           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5762                tail = XCDR (tail))
5763             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5764           coding->max_charset_id = max_charset_id;
5765           coding->safe_charsets = SDATA (safe_charsets);
5766         }
5767       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5768       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5769     }
5770   else if (EQ (coding_type, Qshift_jis))
5771     {
5772       coding->detector = detect_coding_sjis;
5773       coding->decoder = decode_coding_sjis;
5774       coding->encoder = encode_coding_sjis;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5777     }
5778   else if (EQ (coding_type, Qbig5))
5779     {
5780       coding->detector = detect_coding_big5;
5781       coding->decoder = decode_coding_big5;
5782       coding->encoder = encode_coding_big5;
5783       coding->common_flags
5784         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5785     }
5786   else                          /* EQ (coding_type, Qraw_text) */
5787     {
5788       coding->detector = NULL;
5789       coding->decoder = decode_coding_raw_text;
5790       coding->encoder = encode_coding_raw_text;
5791       if (! EQ (eol_type, Qunix))
5792         {
5793           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5794           if (! VECTORP (eol_type))
5795             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5796         }
5797
5798     }
5799
5800   return;
5801 }
5802
5803 /* Return a list of charsets supported by CODING.  */
5804
5805 Lisp_Object
5806 coding_charset_list (struct coding_system *coding)
5807 {
5808   Lisp_Object attrs, charset_list;
5809
5810   CODING_GET_INFO (coding, attrs, charset_list);
5811   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5812     {
5813       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5814
5815       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5816         charset_list = Viso_2022_charset_list;
5817     }
5818   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5819     {
5820       charset_list = Vemacs_mule_charset_list;
5821     }
5822   return charset_list;
5823 }
5824
5825
5826 /* Return a list of charsets supported by CODING-SYSTEM.  */
5827
5828 Lisp_Object
5829 coding_system_charset_list (Lisp_Object coding_system)
5830 {
5831   ptrdiff_t id;
5832   Lisp_Object attrs, charset_list;
5833
5834   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5835   attrs = CODING_ID_ATTRS (id);
5836
5837   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5838     {
5839       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5840
5841       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5842         charset_list = Viso_2022_charset_list;
5843       else
5844         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5845     }
5846   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5847     {
5848       charset_list = Vemacs_mule_charset_list;
5849     }
5850   else
5851     {
5852       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5853     }
5854   return charset_list;
5855 }
5856
5857
5858 /* Return raw-text or one of its subsidiaries that has the same
5859    eol_type as CODING-SYSTEM.  */
5860
5861 Lisp_Object
5862 raw_text_coding_system (Lisp_Object coding_system)
5863 {
5864   Lisp_Object spec, attrs;
5865   Lisp_Object eol_type, raw_text_eol_type;
5866
5867   if (NILP (coding_system))
5868     return Qraw_text;
5869   spec = CODING_SYSTEM_SPEC (coding_system);
5870   attrs = AREF (spec, 0);
5871
5872   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5873     return coding_system;
5874
5875   eol_type = AREF (spec, 2);
5876   if (VECTORP (eol_type))
5877     return Qraw_text;
5878   spec = CODING_SYSTEM_SPEC (Qraw_text);
5879   raw_text_eol_type = AREF (spec, 2);
5880   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5881           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5882           : AREF (raw_text_eol_type, 2));
5883 }
5884
5885
5886 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5887    the subsidiary that has the same eol-spec as PARENT (if it is not
5888    nil and specifies end-of-line format) or the system's setting
5889    (system_eol_type).  */
5890
5891 Lisp_Object
5892 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5893 {
5894   Lisp_Object spec, eol_type;
5895
5896   if (NILP (coding_system))
5897     coding_system = Qraw_text;
5898   spec = CODING_SYSTEM_SPEC (coding_system);
5899   eol_type = AREF (spec, 2);
5900   if (VECTORP (eol_type))
5901     {
5902       Lisp_Object parent_eol_type;
5903
5904       if (! NILP (parent))
5905         {
5906           Lisp_Object parent_spec;
5907
5908           parent_spec = CODING_SYSTEM_SPEC (parent);
5909           parent_eol_type = AREF (parent_spec, 2);
5910           if (VECTORP (parent_eol_type))
5911             parent_eol_type = system_eol_type;
5912         }
5913       else
5914         parent_eol_type = system_eol_type;
5915       if (EQ (parent_eol_type, Qunix))
5916         coding_system = AREF (eol_type, 0);
5917       else if (EQ (parent_eol_type, Qdos))
5918         coding_system = AREF (eol_type, 1);
5919       else if (EQ (parent_eol_type, Qmac))
5920         coding_system = AREF (eol_type, 2);
5921     }
5922   return coding_system;
5923 }
5924
5925
5926 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5927    decided for writing to a process.  If not, complement them, and
5928    return a new coding system.  */
5929
5930 Lisp_Object
5931 complement_process_encoding_system (Lisp_Object coding_system)
5932 {
5933   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5934   Lisp_Object spec, attrs;
5935   int i;
5936
5937   for (i = 0; i < 3; i++)
5938     {
5939       if (i == 1)
5940         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5941       else if (i == 2)
5942         coding_system = preferred_coding_system ();
5943       spec = CODING_SYSTEM_SPEC (coding_system);
5944       if (NILP (spec))
5945         continue;
5946       attrs = AREF (spec, 0);
5947       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5948         coding_base = CODING_ATTR_BASE_NAME (attrs);
5949       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5950         eol_base = coding_system;
5951       if (! NILP (coding_base) && ! NILP (eol_base))
5952         break;
5953     }
5954
5955   if (i > 0)
5956     /* The original CODING_SYSTEM didn't specify text-conversion or
5957        eol-conversion.  Be sure that we return a fully complemented
5958        coding system.  */
5959     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5960   return coding_system;
5961 }
5962
5963
5964 /* Emacs has a mechanism to automatically detect a coding system if it
5965    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5966    it's impossible to distinguish some coding systems accurately
5967    because they use the same range of codes.  So, at first, coding
5968    systems are classified into the following categories:
5969
5970    o coding-category-emacs-mule
5971
5972         The category for a coding system which has the same code range
5973         as Emacs' internal format.  Assigned the coding-system (Lisp
5974         symbol) `emacs-mule' by default.
5975
5976    o coding-category-sjis
5977
5978         The category for a coding system which has the same code range
5979         as SJIS.  Assigned the coding-system (Lisp
5980         symbol) `japanese-shift-jis' by default.
5981
5982    o coding-category-iso-7
5983
5984         The category for a coding system which has the same code range
5985         as ISO2022 of 7-bit environment.  This doesn't use any locking
5986         shift and single shift functions.  This can encode/decode all
5987         charsets.  Assigned the coding-system (Lisp symbol)
5988         `iso-2022-7bit' by default.
5989
5990    o coding-category-iso-7-tight
5991
5992         Same as coding-category-iso-7 except that this can
5993         encode/decode only the specified charsets.
5994
5995    o coding-category-iso-8-1
5996
5997         The category for a coding system which has the same code range
5998         as ISO2022 of 8-bit environment and graphic plane 1 used only
5999         for DIMENSION1 charset.  This doesn't use any locking shift
6000         and single shift functions.  Assigned the coding-system (Lisp
6001         symbol) `iso-latin-1' by default.
6002
6003    o coding-category-iso-8-2
6004
6005         The category for a coding system which has the same code range
6006         as ISO2022 of 8-bit environment and graphic plane 1 used only
6007         for DIMENSION2 charset.  This doesn't use any locking shift
6008         and single shift functions.  Assigned the coding-system (Lisp
6009         symbol) `japanese-iso-8bit' by default.
6010
6011    o coding-category-iso-7-else
6012
6013         The category for a coding system which has the same code range
6014         as ISO2022 of 7-bit environment but uses locking shift or
6015         single shift functions.  Assigned the coding-system (Lisp
6016         symbol) `iso-2022-7bit-lock' by default.
6017
6018    o coding-category-iso-8-else
6019
6020         The category for a coding system which has the same code range
6021         as ISO2022 of 8-bit environment but uses locking shift or
6022         single shift functions.  Assigned the coding-system (Lisp
6023         symbol) `iso-2022-8bit-ss2' by default.
6024
6025    o coding-category-big5
6026
6027         The category for a coding system which has the same code range
6028         as BIG5.  Assigned the coding-system (Lisp symbol)
6029         `cn-big5' by default.
6030
6031    o coding-category-utf-8
6032
6033         The category for a coding system which has the same code range
6034         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6035         symbol) `utf-8' by default.
6036
6037    o coding-category-utf-16-be
6038
6039         The category for a coding system in which a text has an
6040         Unicode signature (cf. Unicode Standard) in the order of BIG
6041         endian at the head.  Assigned the coding-system (Lisp symbol)
6042         `utf-16-be' by default.
6043
6044    o coding-category-utf-16-le
6045
6046         The category for a coding system in which a text has an
6047         Unicode signature (cf. Unicode Standard) in the order of
6048         LITTLE endian at the head.  Assigned the coding-system (Lisp
6049         symbol) `utf-16-le' by default.
6050
6051    o coding-category-ccl
6052
6053         The category for a coding system of which encoder/decoder is
6054         written in CCL programs.  The default value is nil, i.e., no
6055         coding system is assigned.
6056
6057    o coding-category-binary
6058
6059         The category for a coding system not categorized in any of the
6060         above.  Assigned the coding-system (Lisp symbol)
6061         `no-conversion' by default.
6062
6063    Each of them is a Lisp symbol and the value is an actual
6064    `coding-system's (this is also a Lisp symbol) assigned by a user.
6065    What Emacs does actually is to detect a category of coding system.
6066    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6067    decide only one possible category, it selects a category of the
6068    highest priority.  Priorities of categories are also specified by a
6069    user in a Lisp variable `coding-category-list'.
6070
6071 */
6072
6073 #define EOL_SEEN_NONE   0
6074 #define EOL_SEEN_LF     1
6075 #define EOL_SEEN_CR     2
6076 #define EOL_SEEN_CRLF   4
6077
6078 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6079    SOURCE is encoded.  If CATEGORY is one of
6080    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6081    two-byte, else they are encoded by one-byte.
6082
6083    Return one of EOL_SEEN_XXX.  */
6084
6085 #define MAX_EOL_CHECK_COUNT 3
6086
6087 static int
6088 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6089             enum coding_category category)
6090 {
6091   const unsigned char *src = source, *src_end = src + src_bytes;
6092   unsigned char c;
6093   int total  = 0;
6094   int eol_seen = EOL_SEEN_NONE;
6095
6096   if ((1 << category) & CATEGORY_MASK_UTF_16)
6097     {
6098       bool msb = category == (coding_category_utf_16_le
6099                               | coding_category_utf_16_le_nosig);
6100       bool lsb = !msb;
6101
6102       while (src + 1 < src_end)
6103         {
6104           c = src[lsb];
6105           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6106             {
6107               int this_eol;
6108
6109               if (c == '\n')
6110                 this_eol = EOL_SEEN_LF;
6111               else if (src + 3 >= src_end
6112                        || src[msb + 2] != 0
6113                        || src[lsb + 2] != '\n')
6114                 this_eol = EOL_SEEN_CR;
6115               else
6116                 {
6117                   this_eol = EOL_SEEN_CRLF;
6118                   src += 2;
6119                 }
6120
6121               if (eol_seen == EOL_SEEN_NONE)
6122                 /* This is the first end-of-line.  */
6123                 eol_seen = this_eol;
6124               else if (eol_seen != this_eol)
6125                 {
6126                   /* The found type is different from what found before.
6127                      Allow for stray ^M characters in DOS EOL files.  */
6128                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6129                       || (eol_seen == EOL_SEEN_CRLF
6130                           && this_eol == EOL_SEEN_CR))
6131                     eol_seen = EOL_SEEN_CRLF;
6132                   else
6133                     {
6134                       eol_seen = EOL_SEEN_LF;
6135                       break;
6136                     }
6137                 }
6138               if (++total == MAX_EOL_CHECK_COUNT)
6139                 break;
6140             }
6141           src += 2;
6142         }
6143     }
6144   else
6145     while (src < src_end)
6146       {
6147         c = *src++;
6148         if (c == '\n' || c == '\r')
6149           {
6150             int this_eol;
6151
6152             if (c == '\n')
6153               this_eol = EOL_SEEN_LF;
6154             else if (src >= src_end || *src != '\n')
6155               this_eol = EOL_SEEN_CR;
6156             else
6157               this_eol = EOL_SEEN_CRLF, src++;
6158
6159             if (eol_seen == EOL_SEEN_NONE)
6160               /* This is the first end-of-line.  */
6161               eol_seen = this_eol;
6162             else if (eol_seen != this_eol)
6163               {
6164                 /* The found type is different from what found before.
6165                    Allow for stray ^M characters in DOS EOL files.  */
6166                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6167                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6168                   eol_seen = EOL_SEEN_CRLF;
6169                 else
6170                   {
6171                     eol_seen = EOL_SEEN_LF;
6172                     break;
6173                   }
6174               }
6175             if (++total == MAX_EOL_CHECK_COUNT)
6176               break;
6177           }
6178       }
6179   return eol_seen;
6180 }
6181
6182
6183 static Lisp_Object
6184 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6185 {
6186   Lisp_Object eol_type;
6187
6188   eol_type = CODING_ID_EOL_TYPE (coding->id);
6189   if (eol_seen & EOL_SEEN_LF)
6190     {
6191       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6192       eol_type = Qunix;
6193     }
6194   else if (eol_seen & EOL_SEEN_CRLF)
6195     {
6196       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6197       eol_type = Qdos;
6198     }
6199   else if (eol_seen & EOL_SEEN_CR)
6200     {
6201       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6202       eol_type = Qmac;
6203     }
6204   return eol_type;
6205 }
6206
6207 /* Detect how a text specified in CODING is encoded.  If a coding
6208    system is detected, update fields of CODING by the detected coding
6209    system.  */
6210
6211 static void
6212 detect_coding (struct coding_system *coding)
6213 {
6214   const unsigned char *src, *src_end;
6215   unsigned int saved_mode = coding->mode;
6216
6217   coding->consumed = coding->consumed_char = 0;
6218   coding->produced = coding->produced_char = 0;
6219   coding_set_source (coding);
6220
6221   src_end = coding->source + coding->src_bytes;
6222   coding->head_ascii = 0;
6223
6224   /* If we have not yet decided the text encoding type, detect it
6225      now.  */
6226   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6227     {
6228       int c, i;
6229       struct coding_detection_info detect_info;
6230       bool null_byte_found = 0, eight_bit_found = 0;
6231
6232       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6233       for (src = coding->source; src < src_end; src++)
6234         {
6235           c = *src;
6236           if (c & 0x80)
6237             {
6238               eight_bit_found = 1;
6239               if (null_byte_found)
6240                 break;
6241             }
6242           else if (c < 0x20)
6243             {
6244               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6245                   && ! inhibit_iso_escape_detection
6246                   && ! detect_info.checked)
6247                 {
6248                   if (detect_coding_iso_2022 (coding, &detect_info))
6249                     {
6250                       /* We have scanned the whole data.  */
6251                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6252                         {
6253                           /* We didn't find an 8-bit code.  We may
6254                              have found a null-byte, but it's very
6255                              rare that a binary file conforms to
6256                              ISO-2022.  */
6257                           src = src_end;
6258                           coding->head_ascii = src - coding->source;
6259                         }
6260                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6261                       break;
6262                     }
6263                 }
6264               else if (! c && !inhibit_null_byte_detection)
6265                 {
6266                   null_byte_found = 1;
6267                   if (eight_bit_found)
6268                     break;
6269                 }
6270               if (! eight_bit_found)
6271                 coding->head_ascii++;
6272             }
6273           else if (! eight_bit_found)
6274             coding->head_ascii++;
6275         }
6276
6277       if (null_byte_found || eight_bit_found
6278           || coding->head_ascii < coding->src_bytes
6279           || detect_info.found)
6280         {
6281           enum coding_category category;
6282           struct coding_system *this;
6283
6284           if (coding->head_ascii == coding->src_bytes)
6285             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6286             for (i = 0; i < coding_category_raw_text; i++)
6287               {
6288                 category = coding_priorities[i];
6289                 this = coding_categories + category;
6290                 if (detect_info.found & (1 << category))
6291                   break;
6292               }
6293           else
6294             {
6295               if (null_byte_found)
6296                 {
6297                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6298                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6299                 }
6300               for (i = 0; i < coding_category_raw_text; i++)
6301                 {
6302                   category = coding_priorities[i];
6303                   this = coding_categories + category;
6304                   /* Some of this->detector (e.g. detect_coding_sjis)
6305                      require this information.  */
6306                   coding->id = this->id;
6307                   if (this->id < 0)
6308                     {
6309                       /* No coding system of this category is defined.  */
6310                       detect_info.rejected |= (1 << category);
6311                     }
6312                   else if (category >= coding_category_raw_text)
6313                     continue;
6314                   else if (detect_info.checked & (1 << category))
6315                     {
6316                       if (detect_info.found & (1 << category))
6317                         break;
6318                     }
6319                   else if ((*(this->detector)) (coding, &detect_info)
6320                            && detect_info.found & (1 << category))
6321                     {
6322                       if (category == coding_category_utf_16_auto)
6323                         {
6324                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6325                             category = coding_category_utf_16_le;
6326                           else
6327                             category = coding_category_utf_16_be;
6328                         }
6329                       break;
6330                     }
6331                 }
6332             }
6333
6334           if (i < coding_category_raw_text)
6335             setup_coding_system (CODING_ID_NAME (this->id), coding);
6336           else if (null_byte_found)
6337             setup_coding_system (Qno_conversion, coding);
6338           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6339                    == CATEGORY_MASK_ANY)
6340             setup_coding_system (Qraw_text, coding);
6341           else if (detect_info.rejected)
6342             for (i = 0; i < coding_category_raw_text; i++)
6343               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6344                 {
6345                   this = coding_categories + coding_priorities[i];
6346                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6347                   break;
6348                 }
6349         }
6350     }
6351   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6352            == coding_category_utf_8_auto)
6353     {
6354       Lisp_Object coding_systems;
6355       struct coding_detection_info detect_info;
6356
6357       coding_systems
6358         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6359       detect_info.found = detect_info.rejected = 0;
6360       coding->head_ascii = 0;
6361       if (CONSP (coding_systems)
6362           && detect_coding_utf_8 (coding, &detect_info))
6363         {
6364           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6365             setup_coding_system (XCAR (coding_systems), coding);
6366           else
6367             setup_coding_system (XCDR (coding_systems), coding);
6368         }
6369     }
6370   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6371            == coding_category_utf_16_auto)
6372     {
6373       Lisp_Object coding_systems;
6374       struct coding_detection_info detect_info;
6375
6376       coding_systems
6377         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6378       detect_info.found = detect_info.rejected = 0;
6379       coding->head_ascii = 0;
6380       if (CONSP (coding_systems)
6381           && detect_coding_utf_16 (coding, &detect_info))
6382         {
6383           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6384             setup_coding_system (XCAR (coding_systems), coding);
6385           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6386             setup_coding_system (XCDR (coding_systems), coding);
6387         }
6388     }
6389   coding->mode = saved_mode;
6390 }
6391
6392
6393 static void
6394 decode_eol (struct coding_system *coding)
6395 {
6396   Lisp_Object eol_type;
6397   unsigned char *p, *pbeg, *pend;
6398
6399   eol_type = CODING_ID_EOL_TYPE (coding->id);
6400   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6401     return;
6402
6403   if (NILP (coding->dst_object))
6404     pbeg = coding->destination;
6405   else
6406     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6407   pend = pbeg + coding->produced;
6408
6409   if (VECTORP (eol_type))
6410     {
6411       int eol_seen = EOL_SEEN_NONE;
6412
6413       for (p = pbeg; p < pend; p++)
6414         {
6415           if (*p == '\n')
6416             eol_seen |= EOL_SEEN_LF;
6417           else if (*p == '\r')
6418             {
6419               if (p + 1 < pend && *(p + 1) == '\n')
6420                 {
6421                   eol_seen |= EOL_SEEN_CRLF;
6422                   p++;
6423                 }
6424               else
6425                 eol_seen |= EOL_SEEN_CR;
6426             }
6427         }
6428       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6429       if ((eol_seen & EOL_SEEN_CRLF) != 0
6430           && (eol_seen & EOL_SEEN_CR) != 0
6431           && (eol_seen & EOL_SEEN_LF) == 0)
6432         eol_seen = EOL_SEEN_CRLF;
6433       else if (eol_seen != EOL_SEEN_NONE
6434           && eol_seen != EOL_SEEN_LF
6435           && eol_seen != EOL_SEEN_CRLF
6436           && eol_seen != EOL_SEEN_CR)
6437         eol_seen = EOL_SEEN_LF;
6438       if (eol_seen != EOL_SEEN_NONE)
6439         eol_type = adjust_coding_eol_type (coding, eol_seen);
6440     }
6441
6442   if (EQ (eol_type, Qmac))
6443     {
6444       for (p = pbeg; p < pend; p++)
6445         if (*p == '\r')
6446           *p = '\n';
6447     }
6448   else if (EQ (eol_type, Qdos))
6449     {
6450       ptrdiff_t n = 0;
6451
6452       if (NILP (coding->dst_object))
6453         {
6454           /* Start deleting '\r' from the tail to minimize the memory
6455              movement.  */
6456           for (p = pend - 2; p >= pbeg; p--)
6457             if (*p == '\r')
6458               {
6459                 memmove (p, p + 1, pend-- - p - 1);
6460                 n++;
6461               }
6462         }
6463       else
6464         {
6465           ptrdiff_t pos_byte = coding->dst_pos_byte;
6466           ptrdiff_t pos = coding->dst_pos;
6467           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6468
6469           while (pos < pos_end)
6470             {
6471               p = BYTE_POS_ADDR (pos_byte);
6472               if (*p == '\r' && p[1] == '\n')
6473                 {
6474                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6475                   n++;
6476                   pos_end--;
6477                 }
6478               pos++;
6479               if (coding->dst_multibyte)
6480                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6481               else
6482                 pos_byte++;
6483             }
6484         }
6485       coding->produced -= n;
6486       coding->produced_char -= n;
6487     }
6488 }
6489
6490
6491 /* Return a translation table (or list of them) from coding system
6492    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6493    not ENCODEP). */
6494
6495 static Lisp_Object
6496 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6497 {
6498   Lisp_Object standard, translation_table;
6499   Lisp_Object val;
6500
6501   if (NILP (Venable_character_translation))
6502     {
6503       if (max_lookup)
6504         *max_lookup = 0;
6505       return Qnil;
6506     }
6507   if (encodep)
6508     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6509       standard = Vstandard_translation_table_for_encode;
6510   else
6511     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6512       standard = Vstandard_translation_table_for_decode;
6513   if (NILP (translation_table))
6514     translation_table = standard;
6515   else
6516     {
6517       if (SYMBOLP (translation_table))
6518         translation_table = Fget (translation_table, Qtranslation_table);
6519       else if (CONSP (translation_table))
6520         {
6521           translation_table = Fcopy_sequence (translation_table);
6522           for (val = translation_table; CONSP (val); val = XCDR (val))
6523             if (SYMBOLP (XCAR (val)))
6524               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6525         }
6526       if (CHAR_TABLE_P (standard))
6527         {
6528           if (CONSP (translation_table))
6529             translation_table = nconc2 (translation_table,
6530                                         Fcons (standard, Qnil));
6531           else
6532             translation_table = Fcons (translation_table,
6533                                        Fcons (standard, Qnil));
6534         }
6535     }
6536
6537   if (max_lookup)
6538     {
6539       *max_lookup = 1;
6540       if (CHAR_TABLE_P (translation_table)
6541           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6542         {
6543           val = XCHAR_TABLE (translation_table)->extras[1];
6544           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6545             *max_lookup = XFASTINT (val);
6546         }
6547       else if (CONSP (translation_table))
6548         {
6549           Lisp_Object tail;
6550
6551           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6552             if (CHAR_TABLE_P (XCAR (tail))
6553                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6554               {
6555                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6556                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6557                   *max_lookup = XFASTINT (tailval);
6558               }
6559         }
6560     }
6561   return translation_table;
6562 }
6563
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.

   TRANS is first set to Qnil.  If the entry found for C is a
   character, C is replaced by that character and TRANS stays Qnil;
   otherwise TRANS is the entry itself.  For a list, the tables are
   consulted in order (elements that are not char-tables are skipped),
   each one with the current value of C, and the search stops at the
   first entry that is neither nil nor a character.

   C and TRANS must be lvalues.  The arguments are evaluated more than
   once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tbl_tail = table;                           \
                                                                \
        while (CONSP (tbl_tail))                                \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tbl_tail)))                 \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tbl_tail), c);    \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tbl_tail = XCDR (tbl_tail);                         \
          }                                                     \
      }                                                         \
  } while (0)
6588
6589
6590 /* Return a translation of character(s) at BUF according to TRANS.
6591    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6592    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6593    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6594    translation is found, and Qnil if not found..
6595    If BUF is too short to lookup characters in FROM, return Qt.  */
6596
6597 static Lisp_Object
6598 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6599 {
6600
6601   if (INTEGERP (trans))
6602     return trans;
6603   for (; CONSP (trans); trans = XCDR (trans))
6604     {
6605       Lisp_Object val = XCAR (trans);
6606       Lisp_Object from = XCAR (val);
6607       ptrdiff_t len = ASIZE (from);
6608       ptrdiff_t i;
6609
6610       for (i = 0; i < len; i++)
6611         {
6612           if (buf + i == buf_end)
6613             return Qt;
6614           if (XINT (AREF (from, i)) != buf[i])
6615             break;
6616         }
6617       if (i == len)
6618         return val;
6619     }
6620   return Qnil;
6621 }
6622
6623
6624 static int
6625 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6626                bool last_block)
6627 {
6628   unsigned char *dst = coding->destination + coding->produced;
6629   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6630   ptrdiff_t produced;
6631   ptrdiff_t produced_chars = 0;
6632   int carryover = 0;
6633
6634   if (! coding->chars_at_source)
6635     {
6636       /* Source characters are in coding->charbuf.  */
6637       int *buf = coding->charbuf;
6638       int *buf_end = buf + coding->charbuf_used;
6639
6640       if (EQ (coding->src_object, coding->dst_object))
6641         {
6642           coding_set_source (coding);
6643           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6644         }
6645
6646       while (buf < buf_end)
6647         {
6648           int c = *buf;
6649           ptrdiff_t i;
6650
6651           if (c >= 0)
6652             {
6653               ptrdiff_t from_nchars = 1, to_nchars = 1;
6654               Lisp_Object trans = Qnil;
6655
6656               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6657               if (! NILP (trans))
6658                 {
6659                   trans = get_translation (trans, buf, buf_end);
6660                   if (INTEGERP (trans))
6661                     c = XINT (trans);
6662                   else if (CONSP (trans))
6663                     {
6664                       from_nchars = ASIZE (XCAR (trans));
6665                       trans = XCDR (trans);
6666                       if (INTEGERP (trans))
6667                         c = XINT (trans);
6668                       else
6669                         {
6670                           to_nchars = ASIZE (trans);
6671                           c = XINT (AREF (trans, 0));
6672                         }
6673                     }
6674                   else if (EQ (trans, Qt) && ! last_block)
6675                     break;
6676                 }
6677
6678               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6679                 {
6680                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6681                        / MAX_MULTIBYTE_LENGTH)
6682                       < to_nchars)
6683                     memory_full (SIZE_MAX);
6684                   dst = alloc_destination (coding,
6685                                            buf_end - buf
6686                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6687                                            dst);
6688                   if (EQ (coding->src_object, coding->dst_object))
6689                     {
6690                       coding_set_source (coding);
6691                       dst_end = (((unsigned char *) coding->source)
6692                                  + coding->consumed);
6693                     }
6694                   else
6695                     dst_end = coding->destination + coding->dst_bytes;
6696                 }
6697
6698               for (i = 0; i < to_nchars; i++)
6699                 {
6700                   if (i > 0)
6701                     c = XINT (AREF (trans, i));
6702                   if (coding->dst_multibyte
6703                       || ! CHAR_BYTE8_P (c))
6704                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6705                   else
6706                     *dst++ = CHAR_TO_BYTE8 (c);
6707                 }
6708               produced_chars += to_nchars;
6709               buf += from_nchars;
6710             }
6711           else
6712             /* This is an annotation datum.  (-C) is the length.  */
6713             buf += -c;
6714         }
6715       carryover = buf_end - buf;
6716     }
6717   else
6718     {
6719       /* Source characters are at coding->source.  */
6720       const unsigned char *src = coding->source;
6721       const unsigned char *src_end = src + coding->consumed;
6722
6723       if (EQ (coding->dst_object, coding->src_object))
6724         dst_end = (unsigned char *) src;
6725       if (coding->src_multibyte != coding->dst_multibyte)
6726         {
6727           if (coding->src_multibyte)
6728             {
6729               bool multibytep = 1;
6730               ptrdiff_t consumed_chars = 0;
6731
6732               while (1)
6733                 {
6734                   const unsigned char *src_base = src;
6735                   int c;
6736
6737                   ONE_MORE_BYTE (c);
6738                   if (dst == dst_end)
6739                     {
6740                       if (EQ (coding->src_object, coding->dst_object))
6741                         dst_end = (unsigned char *) src;
6742                       if (dst == dst_end)
6743                         {
6744                           ptrdiff_t offset = src - coding->source;
6745
6746                           dst = alloc_destination (coding, src_end - src + 1,
6747                                                    dst);
6748                           dst_end = coding->destination + coding->dst_bytes;
6749                           coding_set_source (coding);
6750                           src = coding->source + offset;
6751                           src_end = coding->source + coding->consumed;
6752                           if (EQ (coding->src_object, coding->dst_object))
6753                             dst_end = (unsigned char *) src;
6754                         }
6755                     }
6756                   *dst++ = c;
6757                   produced_chars++;
6758                 }
6759             no_more_source:
6760               ;
6761             }
6762           else
6763             while (src < src_end)
6764               {
6765                 bool multibytep = 1;
6766                 int c = *src++;
6767
6768                 if (dst >= dst_end - 1)
6769                   {
6770                     if (EQ (coding->src_object, coding->dst_object))
6771                       dst_end = (unsigned char *) src;
6772                     if (dst >= dst_end - 1)
6773                       {
6774                         ptrdiff_t offset = src - coding->source;
6775                         ptrdiff_t more_bytes;
6776
6777                         if (EQ (coding->src_object, coding->dst_object))
6778                           more_bytes = ((src_end - src) / 2) + 2;
6779                         else
6780                           more_bytes = src_end - src + 2;
6781                         dst = alloc_destination (coding, more_bytes, dst);
6782                         dst_end = coding->destination + coding->dst_bytes;
6783                         coding_set_source (coding);
6784                         src = coding->source + offset;
6785                         src_end = coding->source + coding->consumed;
6786                         if (EQ (coding->src_object, coding->dst_object))
6787                           dst_end = (unsigned char *) src;
6788                       }
6789                   }
6790                 EMIT_ONE_BYTE (c);
6791               }
6792         }
6793       else
6794         {
6795           if (!EQ (coding->src_object, coding->dst_object))
6796             {
6797               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6798
6799               if (require > 0)
6800                 {
6801                   ptrdiff_t offset = src - coding->source;
6802
6803                   dst = alloc_destination (coding, require, dst);
6804                   coding_set_source (coding);
6805                   src = coding->source + offset;
6806                   src_end = coding->source + coding->consumed;
6807                 }
6808             }
6809           produced_chars = coding->consumed_char;
6810           while (src < src_end)
6811             *dst++ = *src++;
6812         }
6813     }
6814
6815   produced = dst - (coding->destination + coding->produced);
6816   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6817     insert_from_gap (produced_chars, produced);
6818   coding->produced += produced;
6819   coding->produced_char += produced_chars;
6820   return carryover;
6821 }
6822
6823 /* Compose text in CODING->object according to the annotation data at
6824    CHARBUF.  CHARBUF is an array:
6825      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6826  */
6827
6828 static void
6829 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6830 {
6831   int len;
6832   ptrdiff_t to;
6833   enum composition_method method;
6834   Lisp_Object components;
6835
6836   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6837   to = pos + charbuf[2];
6838   method = (enum composition_method) (charbuf[4]);
6839
6840   if (method == COMPOSITION_RELATIVE)
6841     components = Qnil;
6842   else
6843     {
6844       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6845       int i, j;
6846
6847       if (method == COMPOSITION_WITH_RULE)
6848         len = charbuf[2] * 3 - 2;
6849       charbuf += MAX_ANNOTATION_LENGTH;
6850       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6851       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6852         {
6853           if (charbuf[i] >= 0)
6854             args[j] = make_number (charbuf[i]);
6855           else
6856             {
6857               i++;
6858               args[j] = make_number (charbuf[i] % 0x100);
6859             }
6860         }
6861       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6862     }
6863   compose_text (pos, to, components, Qnil, coding->dst_object);
6864 }
6865
6866
6867 /* Put `charset' property on text in CODING->object according to
6868    the annotation data at CHARBUF.  CHARBUF is an array:
6869      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6870  */
6871
6872 static void
6873 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6874 {
6875   ptrdiff_t from = pos - charbuf[2];
6876   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6877
6878   Fput_text_property (make_number (from), make_number (pos),
6879                       Qcharset, CHARSET_NAME (charset),
6880                       coding->dst_object);
6881 }
6882
6883
/* Number of elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca, so the area lives only as
   long as the function that uses this macro, and record its size (in
   elements) in CODING->charbuf_size.

   Start with CHARBUF_SIZE elements and halve the size after each
   allocation that yields a null pointer, as long as the size is above
   1024.  If no area was obtained, record
   CODING_RESULT_INSUFFICIENT_MEM in CODING and return from the
   calling function, which therefore must return void.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_size;                                                      \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (work_size = CHARBUF_SIZE; work_size > 1024; work_size >>= 1)   \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * work_size);            \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return;                                                         \
      }                                                                 \
    coding->charbuf_size = work_size;                                   \
  } while (0)
6905
6906
6907 static void
6908 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6909 {
6910   int *charbuf = coding->charbuf;
6911   int *charbuf_end = charbuf + coding->charbuf_used;
6912
6913   if (NILP (coding->dst_object))
6914     return;
6915
6916   while (charbuf < charbuf_end)
6917     {
6918       if (*charbuf >= 0)
6919         pos++, charbuf++;
6920       else
6921         {
6922           int len = -*charbuf;
6923
6924           if (len > 2)
6925             switch (charbuf[1])
6926               {
6927               case CODING_ANNOTATE_COMPOSITION_MASK:
6928                 produce_composition (coding, charbuf, pos);
6929                 break;
6930               case CODING_ANNOTATE_CHARSET_MASK:
6931                 produce_charset (coding, charbuf, pos);
6932                 break;
6933               }
6934           charbuf += len;
6935         }
6936     }
6937 }
6938
6939 /* Decode the data at CODING->src_object into CODING->dst_object.
6940    CODING->src_object is a buffer, a string, or nil.
6941    CODING->dst_object is a buffer.
6942
6943    If CODING->src_object is a buffer, it must be the current buffer.
6944    In this case, if CODING->src_pos is positive, it is a position of
6945    the source text in the buffer, otherwise, the source text is in the
6946    gap area of the buffer, and CODING->src_pos specifies the offset of
6947    the text from GPT (which must be the same as PT).  If this is the
6948    same buffer as CODING->dst_object, CODING->src_pos must be
6949    negative.
6950
6951    If CODING->src_object is a string, CODING->src_pos is an index to
6952    that string.
6953
6954    If CODING->src_object is nil, CODING->source must already point to
6955    the non-relocatable memory area.  In this case, CODING->src_pos is
6956    an offset from CODING->source.
6957
6958    The decoded data is inserted at the current point of the buffer
6959    CODING->dst_object.
6960 */
6961
6962 static void
6963 decode_coding (struct coding_system *coding)
6964 {
6965   Lisp_Object attrs;
6966   Lisp_Object undo_list;
6967   Lisp_Object translation_table;
6968   struct ccl_spec cclspec;
6969   int carryover;
6970   int i;
6971
6972   if (BUFFERP (coding->src_object)
6973       && coding->src_pos > 0
6974       && coding->src_pos < GPT
6975       && coding->src_pos + coding->src_chars > GPT)
6976     move_gap_both (coding->src_pos, coding->src_pos_byte);
6977
6978   undo_list = Qt;
6979   if (BUFFERP (coding->dst_object))
6980     {
6981       set_buffer_internal (XBUFFER (coding->dst_object));
6982       if (GPT != PT)
6983         move_gap_both (PT, PT_BYTE);
6984
6985       /* We must disable undo_list in order to record the whole insert
6986          transaction via record_insert at the end.  But doing so also
6987          disables the recording of the first change to the undo_list.
6988          Therefore we check for first change here and record it via
6989          record_first_change if needed.  */
6990       if (MODIFF <= SAVE_MODIFF)
6991         record_first_change ();
6992
6993       undo_list = BVAR (current_buffer, undo_list);
6994       bset_undo_list (current_buffer, Qt);
6995     }
6996
6997   coding->consumed = coding->consumed_char = 0;
6998   coding->produced = coding->produced_char = 0;
6999   coding->chars_at_source = 0;
7000   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7001   coding->errors = 0;
7002
7003   ALLOC_CONVERSION_WORK_AREA (coding);
7004
7005   attrs = CODING_ID_ATTRS (coding->id);
7006   translation_table = get_translation_table (attrs, 0, NULL);
7007
7008   carryover = 0;
7009   if (coding->decoder == decode_coding_ccl)
7010     {
7011       coding->spec.ccl = &cclspec;
7012       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7013     }
7014   do
7015     {
7016       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7017
7018       coding_set_source (coding);
7019       coding->annotated = 0;
7020       coding->charbuf_used = carryover;
7021       (*(coding->decoder)) (coding);
7022       coding_set_destination (coding);
7023       carryover = produce_chars (coding, translation_table, 0);
7024       if (coding->annotated)
7025         produce_annotation (coding, pos);
7026       for (i = 0; i < carryover; i++)
7027         coding->charbuf[i]
7028           = coding->charbuf[coding->charbuf_used - carryover + i];
7029     }
7030   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7031          || (coding->consumed < coding->src_bytes
7032              && (coding->result == CODING_RESULT_SUCCESS
7033                  || coding->result == CODING_RESULT_INVALID_SRC)));
7034
7035   if (carryover > 0)
7036     {
7037       coding_set_destination (coding);
7038       coding->charbuf_used = carryover;
7039       produce_chars (coding, translation_table, 1);
7040     }
7041
7042   coding->carryover_bytes = 0;
7043   if (coding->consumed < coding->src_bytes)
7044     {
7045       int nbytes = coding->src_bytes - coding->consumed;
7046       const unsigned char *src;
7047
7048       coding_set_source (coding);
7049       coding_set_destination (coding);
7050       src = coding->source + coding->consumed;
7051
7052       if (coding->mode & CODING_MODE_LAST_BLOCK)
7053         {
7054           /* Flush out unprocessed data as binary chars.  We are sure
7055              that the number of data is less than the size of
7056              coding->charbuf.  */
7057           coding->charbuf_used = 0;
7058           coding->chars_at_source = 0;
7059
7060           while (nbytes-- > 0)
7061             {
7062               int c = *src++;
7063
7064               if (c & 0x80)
7065                 c = BYTE8_TO_CHAR (c);
7066               coding->charbuf[coding->charbuf_used++] = c;
7067             }
7068           produce_chars (coding, Qnil, 1);
7069         }
7070       else
7071         {
7072           /* Record unprocessed bytes in coding->carryover.  We are
7073              sure that the number of data is less than the size of
7074              coding->carryover.  */
7075           unsigned char *p = coding->carryover;
7076
7077           if (nbytes > sizeof coding->carryover)
7078             nbytes = sizeof coding->carryover;
7079           coding->carryover_bytes = nbytes;
7080           while (nbytes-- > 0)
7081             *p++ = *src++;
7082         }
7083       coding->consumed = coding->src_bytes;
7084     }
7085
7086   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7087       && !inhibit_eol_conversion)
7088     decode_eol (coding);
7089   if (BUFFERP (coding->dst_object))
7090     {
7091       bset_undo_list (current_buffer, undo_list);
7092       record_insert (coding->dst_pos, coding->produced_char);
7093     }
7094 }
7095
7096
7097 /* Extract an annotation datum from a composition starting at POS and
7098    ending before LIMIT of CODING->src_object (buffer or string), store
7099    the data in BUF, set *STOP to a starting position of the next
7100    composition (if any) or to LIMIT, and return the address of the
7101    next element of BUF.
7102
7103    If such an annotation is not found, set *STOP to a starting
7104    position of a composition after POS (if any) or to LIMIT, and
7105    return BUF.  */
7106
7107 static int *
7108 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7109                                struct coding_system *coding, int *buf,
7110                                ptrdiff_t *stop)
7111 {
7112   ptrdiff_t start, end;
7113   Lisp_Object prop;
7114
7115   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7116       || end > limit)
7117     *stop = limit;
7118   else if (start > pos)
7119     *stop = start;
7120   else
7121     {
7122       if (start == pos)
7123         {
7124           /* We found a composition.  Store the corresponding
7125              annotation data in BUF.  */
7126           int *head = buf;
7127           enum composition_method method = COMPOSITION_METHOD (prop);
7128           int nchars = COMPOSITION_LENGTH (prop);
7129
7130           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7131           if (method != COMPOSITION_RELATIVE)
7132             {
7133               Lisp_Object components;
7134               ptrdiff_t i, len, i_byte;
7135
7136               components = COMPOSITION_COMPONENTS (prop);
7137               if (VECTORP (components))
7138                 {
7139                   len = ASIZE (components);
7140                   for (i = 0; i < len; i++)
7141                     *buf++ = XINT (AREF (components, i));
7142                 }
7143               else if (STRINGP (components))
7144                 {
7145                   len = SCHARS (components);
7146                   i = i_byte = 0;
7147                   while (i < len)
7148                     {
7149                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7150                       buf++;
7151                     }
7152                 }
7153               else if (INTEGERP (components))
7154                 {
7155                   len = 1;
7156                   *buf++ = XINT (components);
7157                 }
7158               else if (CONSP (components))
7159                 {
7160                   for (len = 0; CONSP (components);
7161                        len++, components = XCDR (components))
7162                     *buf++ = XINT (XCAR (components));
7163                 }
7164               else
7165                 emacs_abort ();
7166               *head -= len;
7167             }
7168         }
7169
7170       if (find_composition (end, limit, &start, &end, &prop,
7171                             coding->src_object)
7172           && end <= limit)
7173         *stop = start;
7174       else
7175         *stop = limit;
7176     }
7177   return buf;
7178 }
7179
7180
7181 /* Extract an annotation datum from a text property `charset' at POS of
7182    CODING->src_object (buffer of string), store the data in BUF, set
7183    *STOP to the position where the value of `charset' property changes
7184    (limiting by LIMIT), and return the address of the next element of
7185    BUF.
7186
7187    If the property value is nil, set *STOP to the position where the
7188    property value is non-nil (limiting by LIMIT), and return BUF.  */
7189
7190 static int *
7191 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7192                            struct coding_system *coding, int *buf,
7193                            ptrdiff_t *stop)
7194 {
7195   Lisp_Object val, next;
7196   int id;
7197
7198   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7199   if (! NILP (val) && CHARSETP (val))
7200     id = XINT (CHARSET_SYMBOL_ID (val));
7201   else
7202     id = -1;
7203   ADD_CHARSET_DATA (buf, 0, id);
7204   next = Fnext_single_property_change (make_number (pos), Qcharset,
7205                                        coding->src_object,
7206                                        make_number (limit));
7207   *stop = XINT (next);
7208   return buf;
7209 }
7210
7211
/* Read characters from CODING->source, starting CODING->consumed bytes
   in, and store them in CODING->charbuf, applying end-of-line
   conversion and TRANSLATION_TABLE on the way.  Reading stops when the
   end of the source text is reached or when CODING->charbuf is nearly
   full.  MAX_LOOKUP is the number of ints allocated for the
   translation look-ahead buffer; it is only used when
   TRANSLATION_TABLE is non-nil.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read and stored, and
   CODING->chars_at_source is reset to 0.  */
7212 static void
7213 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7214                int max_lookup)
7215 {
7216   int *buf = coding->charbuf;
7217   int *buf_end = coding->charbuf + coding->charbuf_size;
7218   const unsigned char *src = coding->source + coding->consumed;
7219   const unsigned char *src_end = coding->source + coding->src_bytes;
7220   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7221   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7222   bool multibytep = coding->src_multibyte;
7223   Lisp_Object eol_type;
7224   int c;
7225   ptrdiff_t stop, stop_composition, stop_charset;
7226   int *lookup_buf = NULL;
7227
       /* The look-ahead buffer is needed only when a translation table
          is in effect.  */
7228   if (! NILP (translation_table))
7229     lookup_buf = alloca (sizeof (int) * max_lookup);
7230
       /* A vector EOL type is handled like Qunix, i.e. no EOL conversion
          is done below.  NOTE(review): presumably a vector means the EOL
          type is still undecided -- confirm.  */
7231   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7232   if (VECTORP (eol_type))
7233     eol_type = Qunix;
7234
7235   /* Note: composition handling is not yet implemented.  */
7236   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7237
       /* Choose the first position at which annotations must be looked
          up.  Because the composition bit was cleared just above,
          STOP_COMPOSITION always ends up as END_POS here.  */
7238   if (NILP (coding->src_object))
7239     stop = stop_composition = stop_charset = end_pos;
7240   else
7241     {
7242       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7243         stop = stop_composition = pos;
7244       else
7245         stop = stop_composition = end_pos;
7246       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7247         stop = stop_charset = pos;
7248       else
7249         stop_charset = end_pos;
7250     }
7251
7252   /* Compensate for CRLF and conversion.  */
7253   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7254   while (buf < buf_end)
7255     {
7256       Lisp_Object trans;
7257
           /* At a stop position either the text is exhausted, or
              annotations beginning here are stored and the next stop is
              computed as the nearer of the two annotation boundaries.  */
7258       if (pos == stop)
7259         {
7260           if (pos == end_pos)
7261             break;
7262           if (pos == stop_composition)
7263             buf = handle_composition_annotation (pos, end_pos, coding,
7264                                                  buf, &stop_composition);
7265           if (pos == stop_charset)
7266             buf = handle_charset_annotation (pos, end_pos, coding,
7267                                              buf, &stop_charset);
7268           stop = (stop_composition < stop_charset
7269                   ? stop_composition : stop_charset);
7270         }
7271
           /* Fetch the next character C.  For a unibyte source the
              raw-text and CCL encoders take each byte as is; otherwise a
              sequence for which MULTIBYTE_LENGTH reports a positive
              length is read as one character (POS advancing by that
              length), and any other byte becomes a BYTE8 character.  */
7272       if (! multibytep)
7273         {
7274           int bytes;
7275
7276           if (coding->encoder == encode_coding_raw_text
7277               || coding->encoder == encode_coding_ccl)
7278             c = *src++, pos++;
7279           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7280             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7281           else
7282             c = BYTE8_TO_CHAR (*src), src++, pos++;
7283         }
7284       else
7285         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* Under selective display a CR in the source is treated as a
              newline.  */
7286       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7287         c = '\n';
           /* EOL conversion: for Qdos a CR is stored in front of the
              newline; for any other non-unix type the newline itself is
              replaced by CR.  */
7288       if (! EQ (eol_type, Qunix))
7289         {
7290           if (c == '\n')
7291             {
7292               if (EQ (eol_type, Qdos))
7293                 *buf++ = '\r';
7294               else
7295                 c = '\r';
7296             }
7297         }
7298
7299       trans = Qnil;
7300       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7301       if (NILP (trans))
7302         *buf++ = c;
7303       else
7304         {
               /* C has a translation entry.  Collect up to MAX_LOOKUP
                  characters (C plus look-ahead read through P, which
                  does not advance SRC) and let get_translation pick the
                  result.  FROM_NCHARS source characters are replaced by
                  TO_NCHARS stored characters.  */
7305           ptrdiff_t from_nchars = 1, to_nchars = 1;
7306           int *lookup_buf_end;
7307           const unsigned char *p = src;
7308           int i;
7309
7310           lookup_buf[0] = c;
7311           for (i = 1; i < max_lookup && p < src_end; i++)
7312             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7313           lookup_buf_end = lookup_buf + i;
7314           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7315           if (INTEGERP (trans))
7316             c = XINT (trans);
7317           else if (CONSP (trans))
7318             {
7319               from_nchars = ASIZE (XCAR (trans));
7320               trans = XCDR (trans);
7321               if (INTEGERP (trans))
7322                 c = XINT (trans);
7323               else
7324                 {
7325                   to_nchars = ASIZE (trans);
                       /* NOTE(review): SRC and POS have already moved
                          past C at this point, so leaving the loop here
                          appears to count C as consumed although it was
                          not stored -- confirm.  */
7326                   if (buf_end - buf < to_nchars)
7327                     break;
7328                   c = XINT (AREF (trans, 0));
7329                 }
7330             }
7331           else
               /* NOTE(review): same concern as above -- C has been read
                  but is not stored when get_translation returns neither
                  an integer nor a cons.  */
7332             break;
7333           *buf++ = c;
7334           for (i = 1; i < to_nchars; i++)
7335             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters covered by the
                  translation.  */
7336           for (i = 1; i < from_nchars; i++, pos++)
7337             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7338         }
7339     }
7340
       /* Publish how far we got.  */
7341   coding->consumed = src - coding->source;
7342   coding->consumed_char = pos - coding->src_pos;
7343   coding->charbuf_used = buf - coding->charbuf;
7344   coding->chars_at_source = 0;
7345 }
7346
7347
7348 /* Encode the text at CODING->src_object into CODING->dst_object.
7349    CODING->src_object is a buffer or a string.
7350    CODING->dst_object is a buffer or nil.
7351
7352    If CODING->src_object is a buffer, it must be the current buffer.
7353    In this case, if CODING->src_pos is positive, it is a position of
7354    the source text in the buffer, otherwise. the source text is in the
7355    gap area of the buffer, and coding->src_pos specifies the offset of
7356    the text from GPT (which must be the same as PT).  If this is the
7357    same buffer as CODING->dst_object, CODING->src_pos must be
7358    negative and CODING should not have `pre-write-conversion'.
7359
7360    If CODING->src_object is a string, CODING should not have
7361    `pre-write-conversion'.
7362
7363    If CODING->dst_object is a buffer, the encoded data is inserted at
7364    the current point of that buffer.
7365
7366    If CODING->dst_object is nil, the encoded data is placed at the
7367    memory area specified by CODING->destination.  */
7368
7369 static void
7370 encode_coding (struct coding_system *coding)
7371 {
7372   Lisp_Object attrs;
7373   Lisp_Object translation_table;
7374   int max_lookup;
7375   struct ccl_spec cclspec;
7376
7377   attrs = CODING_ID_ATTRS (coding->id);
7378   if (coding->encoder == encode_coding_raw_text)
7379     translation_table = Qnil, max_lookup = 0;
7380   else
7381     translation_table = get_translation_table (attrs, 1, &max_lookup);
7382
7383   if (BUFFERP (coding->dst_object))
7384     {
7385       set_buffer_internal (XBUFFER (coding->dst_object));
7386       coding->dst_multibyte
7387         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7388     }
7389
7390   coding->consumed = coding->consumed_char = 0;
7391   coding->produced = coding->produced_char = 0;
7392   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7393   coding->errors = 0;
7394
7395   ALLOC_CONVERSION_WORK_AREA (coding);
7396
7397   if (coding->encoder == encode_coding_ccl)
7398     {
7399       coding->spec.ccl = &cclspec;
7400       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7401     }
7402   do {
7403     coding_set_source (coding);
7404     consume_chars (coding, translation_table, max_lookup);
7405     coding_set_destination (coding);
7406     (*(coding->encoder)) (coding);
7407   } while (coding->consumed_char < coding->src_chars);
7408
7409   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7410     insert_from_gap (coding->produced_char, coding->produced);
7411 }
7412
7413
7414 /* Name (or base name) of work buffer for code conversion.  */
7415 static Lisp_Object Vcode_conversion_workbuf_name;
7416
7417 /* A working buffer used by the top level conversion.  Once it is
7418    created, it is never destroyed.  It has the name
7419    Vcode_conversion_workbuf_name.  The other working buffers are
7420    destroyed after the use is finished, and their names are modified
7421    versions of Vcode_conversion_workbuf_name.  */
7422 static Lisp_Object Vcode_conversion_reused_workbuf;
7423
7424 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
     /* It is set by make_conversion_work_buffer when that function hands
        out the reused buffer, and cleared by code_conversion_restore when
        the unwind entry for that buffer runs.  */
7425 static bool reused_workbuf_in_use;
7426
7427
7428 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7429    multibyteness of returning buffer.  */
7430
7431 static Lisp_Object
7432 make_conversion_work_buffer (bool multibyte)
7433 {
7434   Lisp_Object name, workbuf;
7435   struct buffer *current;
7436
7437   if (reused_workbuf_in_use)
7438     {
7439       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7440       workbuf = Fget_buffer_create (name);
7441     }
7442   else
7443     {
7444       reused_workbuf_in_use = 1;
7445       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7446         Vcode_conversion_reused_workbuf
7447           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7448       workbuf = Vcode_conversion_reused_workbuf;
7449     }
7450   current = current_buffer;
7451   set_buffer_internal (XBUFFER (workbuf));
7452   /* We can't allow modification hooks to run in the work buffer.  For
7453      instance, directory_files_internal assumes that file decoding
7454      doesn't compile new regexps.  */
7455   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7456   Ferase_buffer ();
7457   bset_undo_list (current_buffer, Qt);
7458   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7459   set_buffer_internal (current);
7460   return workbuf;
7461 }
7462
7463
7464 static Lisp_Object
7465 code_conversion_restore (Lisp_Object arg)
7466 {
7467   Lisp_Object current, workbuf;
7468   struct gcpro gcpro1;
7469
7470   GCPRO1 (arg);
7471   current = XCAR (arg);
7472   workbuf = XCDR (arg);
7473   if (! NILP (workbuf))
7474     {
7475       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7476         reused_workbuf_in_use = 0;
7477       else
7478         Fkill_buffer (workbuf);
7479     }
7480   set_buffer_internal (XBUFFER (current));
7481   UNGCPRO;
7482   return Qnil;
7483 }
7484
7485 Lisp_Object
7486 code_conversion_save (bool with_work_buf, bool multibyte)
7487 {
7488   Lisp_Object workbuf = Qnil;
7489
7490   if (with_work_buf)
7491     workbuf = make_conversion_work_buffer (multibyte);
7492   record_unwind_protect (code_conversion_restore,
7493                          Fcons (Fcurrent_buffer (), workbuf));
7494   return workbuf;
7495 }
7496
7497 void
7498 decode_coding_gap (struct coding_system *coding,
7499                    ptrdiff_t chars, ptrdiff_t bytes)
7500 {
7501   ptrdiff_t count = SPECPDL_INDEX ();
7502   Lisp_Object attrs;
7503
7504   code_conversion_save (0, 0);
7505
7506   coding->src_object = Fcurrent_buffer ();
7507   coding->src_chars = chars;
7508   coding->src_bytes = bytes;
7509   coding->src_pos = -chars;
7510   coding->src_pos_byte = -bytes;
7511   coding->src_multibyte = chars < bytes;
7512   coding->dst_object = coding->src_object;
7513   coding->dst_pos = PT;
7514   coding->dst_pos_byte = PT_BYTE;
7515   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7516
7517   if (CODING_REQUIRE_DETECTION (coding))
7518     detect_coding (coding);
7519
7520   coding->mode |= CODING_MODE_LAST_BLOCK;
7521   current_buffer->text->inhibit_shrinking = 1;
7522   decode_coding (coding);
7523   current_buffer->text->inhibit_shrinking = 0;
7524
7525   attrs = CODING_ID_ATTRS (coding->id);
7526   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7527     {
7528       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7529       Lisp_Object val;
7530
7531       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7532       val = call1 (CODING_ATTR_POST_READ (attrs),
7533                    make_number (coding->produced_char));
7534       CHECK_NATNUM (val);
7535       coding->produced_char += Z - prev_Z;
7536       coding->produced += Z_BYTE - prev_Z_BYTE;
7537     }
7538
7539   unbind_to (count, Qnil);
7540 }
7541
7542
7543 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7544    SRC_OBJECT into DST_OBJECT by coding context CODING.
7545
7546    SRC_OBJECT is a buffer, a string, or Qnil.
7547
7548    If it is a buffer, the text is at point of the buffer.  FROM and TO
7549    are positions in the buffer.
7550
7551    If it is a string, the text is at the beginning of the string.
7552    FROM and TO are indices to the string.
7553
7554    If it is nil, the text is at coding->source.  FROM and TO are
7555    indices to coding->source.
7556
7557    DST_OBJECT is a buffer, Qt, or Qnil.
7558
7559    If it is a buffer, the decoded text is inserted at point of the
7560    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7561    is deleted.
7562
7563    If it is Qt, a string is made from the decoded text, and
7564    set in CODING->dst_object.
7565
7566    If it is Qnil, the decoded text is stored at CODING->destination.
7567    The caller must allocate CODING->dst_bytes bytes at
7568    CODING->destination by xmalloc.  If the decoded text is longer than
7569    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7570  */
7571
7572 void
7573 decode_coding_object (struct coding_system *coding,
7574                       Lisp_Object src_object,
7575                       ptrdiff_t from, ptrdiff_t from_byte,
7576                       ptrdiff_t to, ptrdiff_t to_byte,
7577                       Lisp_Object dst_object)
7578 {
7579   ptrdiff_t count = SPECPDL_INDEX ();
7580   unsigned char *destination IF_LINT (= NULL);
7581   ptrdiff_t dst_bytes IF_LINT (= 0);
7582   ptrdiff_t chars = to - from;
7583   ptrdiff_t bytes = to_byte - from_byte;
7584   Lisp_Object attrs;
       /* SAVED_PT stays negative unless source and destination are the
          same buffer; it doubles as the flag for the point and marker
          fix-up at the end.  */
7585   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7586   bool need_marker_adjustment = 0;
7587   Lisp_Object old_deactivate_mark;
7588
       /* Vdeactivate_mark is restored to this value before returning.  */
7589   old_deactivate_mark = Vdeactivate_mark;
7590
       /* Remember the caller-supplied memory area; it is only used when
          the result has to be copied out of a work buffer below.  */
7591   if (NILP (dst_object))
7592     {
7593       destination = coding->destination;
7594       dst_bytes = coding->dst_bytes;
7595     }
7596
7597   coding->src_object = src_object;
7598   coding->src_chars = chars;
7599   coding->src_bytes = bytes;
7600   coding->src_multibyte = chars < bytes;
7601
7602   if (STRINGP (src_object))
7603     {
7604       coding->src_pos = from;
7605       coding->src_pos_byte = from_byte;
7606     }
7607   else if (BUFFERP (src_object))
7608     {
7609       set_buffer_internal (XBUFFER (src_object));
7610       if (from != GPT)
7611         move_gap_both (from, from_byte);
7612       if (EQ (src_object, dst_object))
7613         {
7614           struct Lisp_Marker *tail;
7615
               /* Flag the markers sitting exactly on the edge of the
                  region (FROM for insertion-type markers, TO for the
                  others) so that they can be repositioned after the
                  replacement.  */
7616           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7617             {
7618               tail->need_adjustment
7619                 = tail->charpos == (tail->insertion_type ? from : to);
7620               need_marker_adjustment |= tail->need_adjustment;
7621             }
               /* Decoding in place: delete the source text while the
                  buffer text is prevented from shrinking, and describe
                  the source with negative positions.  */
7622           saved_pt = PT, saved_pt_byte = PT_BYTE;
7623           TEMP_SET_PT_BOTH (from, from_byte);
7624           current_buffer->text->inhibit_shrinking = 1;
7625           del_range_both (from, from_byte, to, to_byte, 1);
7626           coding->src_pos = -chars;
7627           coding->src_pos_byte = -bytes;
7628         }
7629       else
7630         {
7631           coding->src_pos = from;
7632           coding->src_pos_byte = from_byte;
7633         }
7634     }
7635
7636   if (CODING_REQUIRE_DETECTION (coding))
7637     detect_coding (coding);
7638   attrs = CODING_ID_ATTRS (coding->id);
7639
       /* Pick the destination.  A work buffer is used when a string is
          requested, and also when the result goes to memory but a
          post-read function has to run on it first.  */
7640   if (EQ (dst_object, Qt)
7641       || (! NILP (CODING_ATTR_POST_READ (attrs))
7642           && NILP (dst_object)))
7643     {
7644       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7645       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7646       coding->dst_pos = BEG;
7647       coding->dst_pos_byte = BEG_BYTE;
7648     }
7649   else if (BUFFERP (dst_object))
7650     {
7651       code_conversion_save (0, 0);
7652       coding->dst_object = dst_object;
7653       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7654       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7655       coding->dst_multibyte
7656         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7657     }
7658   else
7659     {
7660       code_conversion_save (0, 0);
7661       coding->dst_object = Qnil;
7662       /* Most callers presume this will return a multibyte result, and they
7663          won't use `binary' or `raw-text' anyway, so let's not worry about
7664          CODING_FOR_UNIBYTE.  */
7665       coding->dst_multibyte = 1;
7666     }
7667
7668   decode_coding (coding);
7669
7670   if (BUFFERP (coding->dst_object))
7671     set_buffer_internal (XBUFFER (coding->dst_object));
7672
       /* Run the post-read function on the decoded text and account for
          any change it made to the size of the buffer.  */
7673   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7674     {
7675       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7676       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7677       Lisp_Object val;
7678
7679       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7680       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7681               old_deactivate_mark);
7682       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7683                         make_number (coding->produced_char));
7684       UNGCPRO;
7685       CHECK_NATNUM (val);
7686       coding->produced_char += Z - prev_Z;
7687       coding->produced += Z_BYTE - prev_Z_BYTE;
7688     }
7689
7690   if (EQ (dst_object, Qt))
7691     {
7692       coding->dst_object = Fbuffer_string ();
7693     }
7694   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7695     {
           /* The text was decoded into a work buffer because of the
              post-read function; hand it back through
              CODING->destination.
              NOTE(review): the copy below is made only when the
              caller's area is too small (DST_BYTES < produced); when it
              is large enough nothing is copied -- confirm that this is
              intended.  */
7696       set_buffer_internal (XBUFFER (coding->dst_object));
7697       if (dst_bytes < coding->produced)
7698         {
7699           destination = xrealloc (destination, coding->produced);
               /* NOTE(review): presumably xrealloc does not return NULL
                  on failure, which would make this branch dead --
                  confirm.  */
7700           if (! destination)
7701             {
7702               record_conversion_result (coding,
7703                                         CODING_RESULT_INSUFFICIENT_MEM);
7704               unbind_to (count, Qnil);
7705               return;
7706             }
               /* Move the gap out of the produced text so that it is
                  contiguous for memcpy.  */
7707           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7708             move_gap_both (BEGV, BEGV_BYTE);
7709           memcpy (destination, BEGV_ADDR, coding->produced);
7710           coding->destination = destination;
7711         }
7712     }
7713
7714   if (saved_pt >= 0)
7715     {
7716       /* This is the case of:
7717          (BUFFERP (src_object) && EQ (src_object, dst_object))
7718          As we have moved PT while replacing the original buffer
7719          contents, we must recover it now.  */
7720       set_buffer_internal (XBUFFER (src_object));
7721       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is unchanged, point inside the
              region goes to FROM, and point after it is shifted by the
              change in size (counted in bytes for a unibyte buffer).  */
7722       if (saved_pt < from)
7723         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7724       else if (saved_pt < from + chars)
7725         TEMP_SET_PT_BOTH (from, from_byte);
7726       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7727         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7728                           saved_pt_byte + (coding->produced - bytes));
7729       else
7730         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7731                           saved_pt_byte + (coding->produced - bytes));
7732
7733       if (need_marker_adjustment)
7734         {
7735           struct Lisp_Marker *tail;
7736
               /* Put the flagged markers back on the edges of the
                  decoded text: insertion-type markers at its start, the
                  others at its end.  */
7737           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7738             if (tail->need_adjustment)
7739               {
7740                 tail->need_adjustment = 0;
7741                 if (tail->insertion_type)
7742                   {
7743                     tail->bytepos = from_byte;
7744                     tail->charpos = from;
7745                   }
7746                 else
7747                   {
7748                     tail->bytepos = from_byte + coding->produced;
7749                     tail->charpos
7750                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7751                          ? tail->bytepos : from + coding->produced_char);
7752                   }
7753               }
7754         }
7755     }
7756
7757   Vdeactivate_mark = old_deactivate_mark;
7758   unbind_to (count, coding->dst_object);
7759 }
7760
7761
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a string, a buffer, or something else; in the last
   case the pre-write path below takes the text from CODING->source.

   If DST_OBJECT is a buffer, CODING->dst_pos is set to FROM when it is
   the same buffer as SRC_OBJECT (whose source text is deleted first),
   and to that buffer's point otherwise.  If DST_OBJECT is Qt, the
   result is returned as a unibyte string in CODING->dst_object.
   Otherwise CODING->dst_object is set to Qnil.

   If the coding system has a pre-write function, the source text is
   first copied into a work buffer, the function is called there, and
   the contents of whatever buffer is current afterwards are encoded
   instead.  */
7762 void
7763 encode_coding_object (struct coding_system *coding,
7764                       Lisp_Object src_object,
7765                       ptrdiff_t from, ptrdiff_t from_byte,
7766                       ptrdiff_t to, ptrdiff_t to_byte,
7767                       Lisp_Object dst_object)
7768 {
7769   ptrdiff_t count = SPECPDL_INDEX ();
7770   ptrdiff_t chars = to - from;
7771   ptrdiff_t bytes = to_byte - from_byte;
7772   Lisp_Object attrs;
       /* SAVED_PT stays negative unless the source text is deleted from
          the destination buffer; it doubles as the flag for the point
          and marker fix-up at the end.  */
7773   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7774   bool need_marker_adjustment = 0;
7775   bool kill_src_buffer = 0;
7776   Lisp_Object old_deactivate_mark;
7777
       /* Vdeactivate_mark is restored to this value before returning.  */
7778   old_deactivate_mark = Vdeactivate_mark;
7779
7780   coding->src_object = src_object;
7781   coding->src_chars = chars;
7782   coding->src_bytes = bytes;
7783   coding->src_multibyte = chars < bytes;
7784
7785   attrs = CODING_ID_ATTRS (coding->id);
7786
7787   if (EQ (src_object, dst_object))
7788     {
7789       struct Lisp_Marker *tail;
7790
           /* Flag the markers sitting exactly on the edge of the region
              (FROM for insertion-type markers, TO for the others) so
              that they can be repositioned after the replacement.
              NOTE(review): this walks the markers of current_buffer
              before any set_buffer_internal call, so it relies on
              SRC_OBJECT already being the current buffer -- confirm.  */
7791       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7792         {
7793           tail->need_adjustment
7794             = tail->charpos == (tail->insertion_type ? from : to);
7795           need_marker_adjustment |= tail->need_adjustment;
7796         }
7797     }
7798
7799   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7800     {
           /* Copy the source text into a work buffer so that the
              pre-write function can modify it freely.  */
7801       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7802       set_buffer_internal (XBUFFER (coding->src_object));
7803       if (STRINGP (src_object))
7804         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7805       else if (BUFFERP (src_object))
7806         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7807       else
7808         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7809
7810       if (EQ (src_object, dst_object))
7811         {
               /* Encoding in place: the original text is deleted now
                  that a copy exists in the work buffer.  */
7812           set_buffer_internal (XBUFFER (src_object));
7813           saved_pt = PT, saved_pt_byte = PT_BYTE;
7814           del_range_both (from, from_byte, to, to_byte, 1);
7815           set_buffer_internal (XBUFFER (coding->src_object));
7816         }
7817
7818       {
7819         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7820
7821         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7822                 old_deactivate_mark);
7823         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7824                     make_number (BEG), make_number (Z));
7825         UNGCPRO;
7826       }
           /* If the pre-write function left another buffer current, that
              buffer becomes the source and is killed at the end.  */
7827       if (XBUFFER (coding->src_object) != current_buffer)
7828         kill_src_buffer = 1;
7829       coding->src_object = Fcurrent_buffer ();
7830       if (BEG != GPT)
7831         move_gap_both (BEG, BEG_BYTE);
           /* The whole of the now-current buffer is the source text.  */
7832       coding->src_chars = Z - BEG;
7833       coding->src_bytes = Z_BYTE - BEG_BYTE;
7834       coding->src_pos = BEG;
7835       coding->src_pos_byte = BEG_BYTE;
7836       coding->src_multibyte = Z < Z_BYTE;
7837     }
7838   else if (STRINGP (src_object))
7839     {
7840       code_conversion_save (0, 0);
7841       coding->src_pos = from;
7842       coding->src_pos_byte = from_byte;
7843     }
7844   else if (BUFFERP (src_object))
7845     {
7846       code_conversion_save (0, 0);
7847       set_buffer_internal (XBUFFER (src_object));
7848       if (EQ (src_object, dst_object))
7849         {
               /* Encoding in place without a pre-write function: the
                  value returned by del_range_1 for the deleted text
                  becomes the source, read from position 0.  */
7850           saved_pt = PT, saved_pt_byte = PT_BYTE;
7851           coding->src_object = del_range_1 (from, to, 1, 1);
7852           coding->src_pos = 0;
7853           coding->src_pos_byte = 0;
7854         }
7855       else
7856         {
               /* Make sure the gap does not lie inside the source
                  text.  */
7857           if (from < GPT && to >= GPT)
7858             move_gap_both (from, from_byte);
7859           coding->src_pos = from;
7860           coding->src_pos_byte = from_byte;
7861         }
7862     }
7863   else
7864     code_conversion_save (0, 0);
7865
7866   if (BUFFERP (dst_object))
7867     {
7868       coding->dst_object = dst_object;
7869       if (EQ (src_object, dst_object))
7870         {
7871           coding->dst_pos = from;
7872           coding->dst_pos_byte = from_byte;
7873         }
7874       else
7875         {
7876           struct buffer *current = current_buffer;
7877
               /* Output goes to point of the destination buffer; move
                  its gap there.  */
7878           set_buffer_temp (XBUFFER (dst_object));
7879           coding->dst_pos = PT;
7880           coding->dst_pos_byte = PT_BYTE;
7881           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7882           set_buffer_temp (current);
7883         }
7884       coding->dst_multibyte
7885         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7886     }
7887   else if (EQ (dst_object, Qt))
7888     {
           /* A string is wanted: encode into freshly allocated memory
              with an initial size of one byte per source character (at
              least one byte).  The memory is freed below once the
              string has been made.  */
7889       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7890       coding->dst_object = Qnil;
7891       coding->destination = xmalloc (dst_bytes);
7892       coding->dst_bytes = dst_bytes;
7893       coding->dst_multibyte = 0;
7894     }
7895   else
7896     {
7897       coding->dst_object = Qnil;
7898       coding->dst_multibyte = 0;
7899     }
7900
7901   encode_coding (coding);
7902
7903   if (EQ (dst_object, Qt))
7904     {
7905       if (BUFFERP (coding->dst_object))
7906         coding->dst_object = Fbuffer_string ();
7907       else
7908         {
7909           coding->dst_object
7910             = make_unibyte_string ((char *) coding->destination,
7911                                    coding->produced);
7912           xfree (coding->destination);
7913         }
7914     }
7915
7916   if (saved_pt >= 0)
7917     {
7918       /* This is the case of:
7919          (BUFFERP (src_object) && EQ (src_object, dst_object))
7920          As we have moved PT while replacing the original buffer
7921          contents, we must recover it now.  */
7922       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unchanged, point inside the
              region goes to FROM, and point after it is shifted by the
              change in size (counted in bytes for a unibyte buffer).  */
7923       if (saved_pt < from)
7924         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7925       else if (saved_pt < from + chars)
7926         TEMP_SET_PT_BOTH (from, from_byte);
7927       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7928         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7929                           saved_pt_byte + (coding->produced - bytes));
7930       else
7931         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7932                           saved_pt_byte + (coding->produced - bytes));
7933
7934       if (need_marker_adjustment)
7935         {
7936           struct Lisp_Marker *tail;
7937
               /* Put the flagged markers back on the edges of the
                  encoded text: insertion-type markers at its start, the
                  others at its end.  */
7938           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7939             if (tail->need_adjustment)
7940               {
7941                 tail->need_adjustment = 0;
7942                 if (tail->insertion_type)
7943                   {
7944                     tail->bytepos = from_byte;
7945                     tail->charpos = from;
7946                   }
7947                 else
7948                   {
7949                     tail->bytepos = from_byte + coding->produced;
7950                     tail->charpos
7951                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7952                          ? tail->bytepos : from + coding->produced_char);
7953                   }
7954               }
7955         }
7956     }
7957
       /* Dispose of the buffer the pre-write function switched to.  */
7958   if (kill_src_buffer)
7959     Fkill_buffer (coding->src_object);
7960
7961   Vdeactivate_mark = old_deactivate_mark;
7962   unbind_to (count, Qnil);
7963 }
7964
7965
7966 Lisp_Object
7967 preferred_coding_system (void)
7968 {
7969   int id = coding_categories[coding_priorities[0]].id;
7970
7971   return CODING_ID_NAME (id);
7972 }
7973
7974 \f
7975 #ifdef emacs
7976 /*** 11. Emacs Lisp library functions ***/
7977
7978 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7979        doc: /* Return t if OBJECT is nil or a coding-system.
7980 See the documentation of `define-coding-system' for information
7981 about coding-system objects.  */)
7982   (Lisp_Object object)
7983 {
7984   if (NILP (object)
7985       || CODING_SYSTEM_ID (object) >= 0)
7986     return Qt;
7987   if (! SYMBOLP (object)
7988       || NILP (Fget (object, Qcoding_system_define_form)))
7989     return Qnil;
7990   return Qt;
7991 }
7992
7993 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7994        Sread_non_nil_coding_system, 1, 1, 0,
7995        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7996   (Lisp_Object prompt)
7997 {
7998   Lisp_Object val;
7999   do
8000     {
8001       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8002                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8003     }
8004   while (SCHARS (val) == 0);
8005   return (Fintern (val, Qnil));
8006 }
8007
8008 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8009        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8010 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8011 Ignores case when completing coding systems (all Emacs coding systems
8012 are lower-case).  */)
8013   (Lisp_Object prompt, Lisp_Object default_coding_system)
8014 {
8015   Lisp_Object val;
8016   ptrdiff_t count = SPECPDL_INDEX ();
8017
8018   if (SYMBOLP (default_coding_system))
8019     default_coding_system = SYMBOL_NAME (default_coding_system);
8020   specbind (Qcompletion_ignore_case, Qt);
8021   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8022                           Qt, Qnil, Qcoding_system_history,
8023                           default_coding_system, Qnil);
8024   unbind_to (count, Qnil);
8025   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8026 }
8027
8028 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8029        1, 1, 0,
8030        doc: /* Check validity of CODING-SYSTEM.
8031 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8032 It is valid if it is nil or a symbol defined as a coding system by the
8033 function `define-coding-system'.  */)
8034   (Lisp_Object coding_system)
8035 {
8036   Lisp_Object define_form;
8037
8038   define_form = Fget (coding_system, Qcoding_system_define_form);
8039   if (! NILP (define_form))
8040     {
8041       Fput (coding_system, Qcoding_system_define_form, Qnil);
8042       safe_eval (define_form);
8043     }
8044   if (!NILP (Fcoding_system_p (coding_system)))
8045     return coding_system;
8046   xsignal1 (Qcoding_system_error, coding_system);
8047 }
8048
8049 \f
8050 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8051    HIGHEST, return the coding system of the highest
8052    priority among the detected coding systems.  Otherwise return a
8053    list of detected coding systems sorted by their priorities.  If
8054    MULTIBYTEP, it is assumed that the bytes are in correct
8055    multibyte form but contains only ASCII and eight-bit chars.
8056    Otherwise, the bytes are raw bytes.
8057
8058    CODING-SYSTEM controls the detection as below:
8059
8060    If it is nil, detect both text-format and eol-format.  If the
8061    text-format part of CODING-SYSTEM is already specified
8062    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8063    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8064    detect only text-format.  */
8065
8066 Lisp_Object
8067 detect_coding_system (const unsigned char *src,
8068                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8069                       bool highest, bool multibytep,
8070                       Lisp_Object coding_system)
8071 {
8072   const unsigned char *src_end = src + src_bytes;
8073   Lisp_Object attrs, eol_type;
8074   Lisp_Object val = Qnil;
8075   struct coding_system coding;
8076   ptrdiff_t id;
8077   struct coding_detection_info detect_info;
8078   enum coding_category base_category;
8079   bool null_byte_found = 0, eight_bit_found = 0;
8080
8081   if (NILP (coding_system))
8082     coding_system = Qundecided;
8083   setup_coding_system (coding_system, &coding);
8084   attrs = CODING_ID_ATTRS (coding.id);
8085   eol_type = CODING_ID_EOL_TYPE (coding.id);
8086   coding_system = CODING_ATTR_BASE_NAME (attrs);
8087
8088   coding.source = src;
8089   coding.src_chars = src_chars;
8090   coding.src_bytes = src_bytes;
8091   coding.src_multibyte = multibytep;
8092   coding.consumed = 0;
8093   coding.mode |= CODING_MODE_LAST_BLOCK;
8094   coding.head_ascii = 0;
8095
8096   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8097
8098   /* At first, detect text-format if necessary.  */
8099   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8100   if (base_category == coding_category_undecided)
8101     {
8102       enum coding_category category IF_LINT (= 0);
8103       struct coding_system *this IF_LINT (= NULL);
8104       int c, i;
8105
8106       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8107       for (; src < src_end; src++)
8108         {
8109           c = *src;
8110           if (c & 0x80)
8111             {
8112               eight_bit_found = 1;
8113               if (null_byte_found)
8114                 break;
8115             }
8116           else if (c < 0x20)
8117             {
8118               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8119                   && ! inhibit_iso_escape_detection
8120                   && ! detect_info.checked)
8121                 {
8122                   if (detect_coding_iso_2022 (&coding, &detect_info))
8123                     {
8124                       /* We have scanned the whole data.  */
8125                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8126                         {
8127                           /* We didn't find an 8-bit code.  We may
8128                              have found a null-byte, but it's very
8129                              rare that a binary file confirm to
8130                              ISO-2022.  */
8131                           src = src_end;
8132                           coding.head_ascii = src - coding.source;
8133                         }
8134                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8135                       break;
8136                     }
8137                 }
8138               else if (! c && !inhibit_null_byte_detection)
8139                 {
8140                   null_byte_found = 1;
8141                   if (eight_bit_found)
8142                     break;
8143                 }
8144               if (! eight_bit_found)
8145                 coding.head_ascii++;
8146             }
8147           else if (! eight_bit_found)
8148             coding.head_ascii++;
8149         }
8150
8151       if (null_byte_found || eight_bit_found
8152           || coding.head_ascii < coding.src_bytes
8153           || detect_info.found)
8154         {
8155           if (coding.head_ascii == coding.src_bytes)
8156             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8157             for (i = 0; i < coding_category_raw_text; i++)
8158               {
8159                 category = coding_priorities[i];
8160                 this = coding_categories + category;
8161                 if (detect_info.found & (1 << category))
8162                   break;
8163               }
8164           else
8165             {
8166               if (null_byte_found)
8167                 {
8168                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8169                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8170                 }
8171               for (i = 0; i < coding_category_raw_text; i++)
8172                 {
8173                   category = coding_priorities[i];
8174                   this = coding_categories + category;
8175
8176                   if (this->id < 0)
8177                     {
8178                       /* No coding system of this category is defined.  */
8179                       detect_info.rejected |= (1 << category);
8180                     }
8181                   else if (category >= coding_category_raw_text)
8182                     continue;
8183                   else if (detect_info.checked & (1 << category))
8184                     {
8185                       if (highest
8186                           && (detect_info.found & (1 << category)))
8187                         break;
8188                     }
8189                   else if ((*(this->detector)) (&coding, &detect_info)
8190                            && highest
8191                            && (detect_info.found & (1 << category)))
8192                     {
8193                       if (category == coding_category_utf_16_auto)
8194                         {
8195                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8196                             category = coding_category_utf_16_le;
8197                           else
8198                             category = coding_category_utf_16_be;
8199                         }
8200                       break;
8201                     }
8202                 }
8203             }
8204         }
8205
8206       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8207           || null_byte_found)
8208         {
8209           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8210           id = CODING_SYSTEM_ID (Qno_conversion);
8211           val = Fcons (make_number (id), Qnil);
8212         }
8213       else if (! detect_info.rejected && ! detect_info.found)
8214         {
8215           detect_info.found = CATEGORY_MASK_ANY;
8216           id = coding_categories[coding_category_undecided].id;
8217           val = Fcons (make_number (id), Qnil);
8218         }
8219       else if (highest)
8220         {
8221           if (detect_info.found)
8222             {
8223               detect_info.found = 1 << category;
8224               val = Fcons (make_number (this->id), Qnil);
8225             }
8226           else
8227             for (i = 0; i < coding_category_raw_text; i++)
8228               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8229                 {
8230                   detect_info.found = 1 << coding_priorities[i];
8231                   id = coding_categories[coding_priorities[i]].id;
8232                   val = Fcons (make_number (id), Qnil);
8233                   break;
8234                 }
8235         }
8236       else
8237         {
8238           int mask = detect_info.rejected | detect_info.found;
8239           int found = 0;
8240
8241           for (i = coding_category_raw_text - 1; i >= 0; i--)
8242             {
8243               category = coding_priorities[i];
8244               if (! (mask & (1 << category)))
8245                 {
8246                   found |= 1 << category;
8247                   id = coding_categories[category].id;
8248                   if (id >= 0)
8249                     val = Fcons (make_number (id), val);
8250                 }
8251             }
8252           for (i = coding_category_raw_text - 1; i >= 0; i--)
8253             {
8254               category = coding_priorities[i];
8255               if (detect_info.found & (1 << category))
8256                 {
8257                   id = coding_categories[category].id;
8258                   val = Fcons (make_number (id), val);
8259                 }
8260             }
8261           detect_info.found |= found;
8262         }
8263     }
8264   else if (base_category == coding_category_utf_8_auto)
8265     {
8266       if (detect_coding_utf_8 (&coding, &detect_info))
8267         {
8268           struct coding_system *this;
8269
8270           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8271             this = coding_categories + coding_category_utf_8_sig;
8272           else
8273             this = coding_categories + coding_category_utf_8_nosig;
8274           val = Fcons (make_number (this->id), Qnil);
8275         }
8276     }
8277   else if (base_category == coding_category_utf_16_auto)
8278     {
8279       if (detect_coding_utf_16 (&coding, &detect_info))
8280         {
8281           struct coding_system *this;
8282
8283           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8284             this = coding_categories + coding_category_utf_16_le;
8285           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8286             this = coding_categories + coding_category_utf_16_be;
8287           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8288             this = coding_categories + coding_category_utf_16_be_nosig;
8289           else
8290             this = coding_categories + coding_category_utf_16_le_nosig;
8291           val = Fcons (make_number (this->id), Qnil);
8292         }
8293     }
8294   else
8295     {
8296       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8297       val = Fcons (make_number (coding.id), Qnil);
8298     }
8299
8300   /* Then, detect eol-format if necessary.  */
8301   {
8302     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8303     Lisp_Object tail;
8304
8305     if (VECTORP (eol_type))
8306       {
8307         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8308           {
8309             if (null_byte_found)
8310               normal_eol = EOL_SEEN_LF;
8311             else
8312               normal_eol = detect_eol (coding.source, src_bytes,
8313                                        coding_category_raw_text);
8314           }
8315         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8316                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8317           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8318                                       coding_category_utf_16_be);
8319         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8320                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8321           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8322                                       coding_category_utf_16_le);
8323       }
8324     else
8325       {
8326         if (EQ (eol_type, Qunix))
8327           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8328         else if (EQ (eol_type, Qdos))
8329           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8330         else
8331           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8332       }
8333
8334     for (tail = val; CONSP (tail); tail = XCDR (tail))
8335       {
8336         enum coding_category category;
8337         int this_eol;
8338
8339         id = XINT (XCAR (tail));
8340         attrs = CODING_ID_ATTRS (id);
8341         category = XINT (CODING_ATTR_CATEGORY (attrs));
8342         eol_type = CODING_ID_EOL_TYPE (id);
8343         if (VECTORP (eol_type))
8344           {
8345             if (category == coding_category_utf_16_be
8346                 || category == coding_category_utf_16_be_nosig)
8347               this_eol = utf_16_be_eol;
8348             else if (category == coding_category_utf_16_le
8349                      || category == coding_category_utf_16_le_nosig)
8350               this_eol = utf_16_le_eol;
8351             else
8352               this_eol = normal_eol;
8353
8354             if (this_eol == EOL_SEEN_LF)
8355               XSETCAR (tail, AREF (eol_type, 0));
8356             else if (this_eol == EOL_SEEN_CRLF)
8357               XSETCAR (tail, AREF (eol_type, 1));
8358             else if (this_eol == EOL_SEEN_CR)
8359               XSETCAR (tail, AREF (eol_type, 2));
8360             else
8361               XSETCAR (tail, CODING_ID_NAME (id));
8362           }
8363         else
8364           XSETCAR (tail, CODING_ID_NAME (id));
8365       }
8366   }
8367
8368   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8369 }
8370
8371
8372 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8373        2, 3, 0,
8374        doc: /* Detect coding system of the text in the region between START and END.
8375 Return a list of possible coding systems ordered by priority.
8376 The coding systems to try and their priorities follows what
8377 the function `coding-system-priority-list' (which see) returns.
8378
8379 If only ASCII characters are found (except for such ISO-2022 control
8380 characters as ESC), it returns a list of single element `undecided'
8381 or its subsidiary coding system according to a detected end-of-line
8382 format.
8383
8384 If optional argument HIGHEST is non-nil, return the coding system of
8385 highest priority.  */)
8386   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8387 {
8388   ptrdiff_t from, to;
8389   ptrdiff_t from_byte, to_byte;
8390
8391   CHECK_NUMBER_COERCE_MARKER (start);
8392   CHECK_NUMBER_COERCE_MARKER (end);
8393
8394   validate_region (&start, &end);
8395   from = XINT (start), to = XINT (end);
8396   from_byte = CHAR_TO_BYTE (from);
8397   to_byte = CHAR_TO_BYTE (to);
8398
8399   if (from < GPT && to >= GPT)
8400     move_gap_both (to, to_byte);
8401
8402   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8403                                to - from, to_byte - from_byte,
8404                                !NILP (highest),
8405                                !NILP (BVAR (current_buffer
8406                                       , enable_multibyte_characters)),
8407                                Qnil);
8408 }
8409
8410 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8411        1, 2, 0,
8412        doc: /* Detect coding system of the text in STRING.
8413 Return a list of possible coding systems ordered by priority.
8414 The coding systems to try and their priorities follows what
8415 the function `coding-system-priority-list' (which see) returns.
8416
8417 If only ASCII characters are found (except for such ISO-2022 control
8418 characters as ESC), it returns a list of single element `undecided'
8419 or its subsidiary coding system according to a detected end-of-line
8420 format.
8421
8422 If optional argument HIGHEST is non-nil, return the coding system of
8423 highest priority.  */)
8424   (Lisp_Object string, Lisp_Object highest)
8425 {
8426   CHECK_STRING (string);
8427
8428   return detect_coding_system (SDATA (string),
8429                                SCHARS (string), SBYTES (string),
8430                                !NILP (highest), STRING_MULTIBYTE (string),
8431                                Qnil);
8432 }
8433
8434
8435 static bool
8436 char_encodable_p (int c, Lisp_Object attrs)
8437 {
8438   Lisp_Object tail;
8439   struct charset *charset;
8440   Lisp_Object translation_table;
8441
8442   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8443   if (! NILP (translation_table))
8444     c = translate_char (translation_table, c);
8445   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8446        CONSP (tail); tail = XCDR (tail))
8447     {
8448       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8449       if (CHAR_CHARSET_P (c, charset))
8450         break;
8451     }
8452   return (! NILP (tail));
8453 }
8454
8455
8456 /* Return a list of coding systems that safely encode the text between
8457    START and END.  If EXCLUDE is non-nil, it is a list of coding
8458    systems not to check.  The returned list doesn't contain any such
8459    coding systems.  In any case, if the text contains only ASCII or is
8460    unibyte, return t.  */
8461
8462 DEFUN ("find-coding-systems-region-internal",
8463        Ffind_coding_systems_region_internal,
8464        Sfind_coding_systems_region_internal, 2, 3, 0,
8465        doc: /* Internal use only.  */)
8466   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8467 {
8468   Lisp_Object coding_attrs_list, safe_codings;
8469   ptrdiff_t start_byte, end_byte;
8470   const unsigned char *p, *pbeg, *pend;
8471   int c;
8472   Lisp_Object tail, elt, work_table;
8473
8474   if (STRINGP (start))
8475     {
8476       if (!STRING_MULTIBYTE (start)
8477           || SCHARS (start) == SBYTES (start))
8478         return Qt;
8479       start_byte = 0;
8480       end_byte = SBYTES (start);
8481     }
8482   else
8483     {
8484       CHECK_NUMBER_COERCE_MARKER (start);
8485       CHECK_NUMBER_COERCE_MARKER (end);
8486       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8487         args_out_of_range (start, end);
8488       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8489         return Qt;
8490       start_byte = CHAR_TO_BYTE (XINT (start));
8491       end_byte = CHAR_TO_BYTE (XINT (end));
8492       if (XINT (end) - XINT (start) == end_byte - start_byte)
8493         return Qt;
8494
8495       if (XINT (start) < GPT && XINT (end) > GPT)
8496         {
8497           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8498             move_gap_both (XINT (start), start_byte);
8499           else
8500             move_gap_both (XINT (end), end_byte);
8501         }
8502     }
8503
8504   coding_attrs_list = Qnil;
8505   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8506     if (NILP (exclude)
8507         || NILP (Fmemq (XCAR (tail), exclude)))
8508       {
8509         Lisp_Object attrs;
8510
8511         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8512         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8513             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8514           {
8515             ASET (attrs, coding_attr_trans_tbl,
8516                   get_translation_table (attrs, 1, NULL));
8517             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8518           }
8519       }
8520
8521   if (STRINGP (start))
8522     p = pbeg = SDATA (start);
8523   else
8524     p = pbeg = BYTE_POS_ADDR (start_byte);
8525   pend = p + (end_byte - start_byte);
8526
8527   while (p < pend && ASCII_BYTE_P (*p)) p++;
8528   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8529
8530   work_table = Fmake_char_table (Qnil, Qnil);
8531   while (p < pend)
8532     {
8533       if (ASCII_BYTE_P (*p))
8534         p++;
8535       else
8536         {
8537           c = STRING_CHAR_ADVANCE (p);
8538           if (!NILP (char_table_ref (work_table, c)))
8539             /* This character was already checked.  Ignore it.  */
8540             continue;
8541
8542           charset_map_loaded = 0;
8543           for (tail = coding_attrs_list; CONSP (tail);)
8544             {
8545               elt = XCAR (tail);
8546               if (NILP (elt))
8547                 tail = XCDR (tail);
8548               else if (char_encodable_p (c, elt))
8549                 tail = XCDR (tail);
8550               else if (CONSP (XCDR (tail)))
8551                 {
8552                   XSETCAR (tail, XCAR (XCDR (tail)));
8553                   XSETCDR (tail, XCDR (XCDR (tail)));
8554                 }
8555               else
8556                 {
8557                   XSETCAR (tail, Qnil);
8558                   tail = XCDR (tail);
8559                 }
8560             }
8561           if (charset_map_loaded)
8562             {
8563               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8564
8565               if (STRINGP (start))
8566                 pbeg = SDATA (start);
8567               else
8568                 pbeg = BYTE_POS_ADDR (start_byte);
8569               p = pbeg + p_offset;
8570               pend = pbeg + pend_offset;
8571             }
8572           char_table_set (work_table, c, Qt);
8573         }
8574     }
8575
8576   safe_codings = list2 (Qraw_text, Qno_conversion);
8577   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8578     if (! NILP (XCAR (tail)))
8579       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8580
8581   return safe_codings;
8582 }
8583
8584
8585 DEFUN ("unencodable-char-position", Funencodable_char_position,
8586        Sunencodable_char_position, 3, 5, 0,
8587        doc: /*
8588 Return position of first un-encodable character in a region.
8589 START and END specify the region and CODING-SYSTEM specifies the
8590 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8591
8592 If optional 4th argument COUNT is non-nil, it specifies at most how
8593 many un-encodable characters to search.  In this case, the value is a
8594 list of positions.
8595
8596 If optional 5th argument STRING is non-nil, it is a string to search
8597 for un-encodable characters.  In that case, START and END are indexes
8598 to the string.  */)
8599   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8600 {
8601   EMACS_INT n;
8602   struct coding_system coding;
8603   Lisp_Object attrs, charset_list, translation_table;
8604   Lisp_Object positions;
8605   ptrdiff_t from, to;
8606   const unsigned char *p, *stop, *pend;
8607   bool ascii_compatible;
8608
8609   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8610   attrs = CODING_ID_ATTRS (coding.id);
8611   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8612     return Qnil;
8613   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8614   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8615   translation_table = get_translation_table (attrs, 1, NULL);
8616
8617   if (NILP (string))
8618     {
8619       validate_region (&start, &end);
8620       from = XINT (start);
8621       to = XINT (end);
8622       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8623           || (ascii_compatible
8624               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8625         return Qnil;
8626       p = CHAR_POS_ADDR (from);
8627       pend = CHAR_POS_ADDR (to);
8628       if (from < GPT && to >= GPT)
8629         stop = GPT_ADDR;
8630       else
8631         stop = pend;
8632     }
8633   else
8634     {
8635       CHECK_STRING (string);
8636       CHECK_NATNUM (start);
8637       CHECK_NATNUM (end);
8638       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8639         args_out_of_range_3 (string, start, end);
8640       from = XINT (start);
8641       to = XINT (end);
8642       if (! STRING_MULTIBYTE (string))
8643         return Qnil;
8644       p = SDATA (string) + string_char_to_byte (string, from);
8645       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8646       if (ascii_compatible && (to - from) == (pend - p))
8647         return Qnil;
8648     }
8649
8650   if (NILP (count))
8651     n = 1;
8652   else
8653     {
8654       CHECK_NATNUM (count);
8655       n = XINT (count);
8656     }
8657
8658   positions = Qnil;
8659   charset_map_loaded = 0;
8660   while (1)
8661     {
8662       int c;
8663
8664       if (ascii_compatible)
8665         while (p < stop && ASCII_BYTE_P (*p))
8666           p++, from++;
8667       if (p >= stop)
8668         {
8669           if (p >= pend)
8670             break;
8671           stop = pend;
8672           p = GAP_END_ADDR;
8673         }
8674
8675       c = STRING_CHAR_ADVANCE (p);
8676       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8677           && ! char_charset (translate_char (translation_table, c),
8678                              charset_list, NULL))
8679         {
8680           positions = Fcons (make_number (from), positions);
8681           n--;
8682           if (n == 0)
8683             break;
8684         }
8685
8686       from++;
8687       if (charset_map_loaded && NILP (string))
8688         {
8689           p = CHAR_POS_ADDR (from);
8690           pend = CHAR_POS_ADDR (to);
8691           if (from < GPT && to >= GPT)
8692             stop = GPT_ADDR;
8693           else
8694             stop = pend;
8695           charset_map_loaded = 0;
8696         }
8697     }
8698
8699   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8700 }
8701
8702
8703 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8704        Scheck_coding_systems_region, 3, 3, 0,
8705        doc: /* Check if the region is encodable by coding systems.
8706
8707 START and END are buffer positions specifying the region.
8708 CODING-SYSTEM-LIST is a list of coding systems to check.
8709
8710 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8711 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8712 whole region, POS0, POS1, ... are buffer positions where non-encodable
8713 characters are found.
8714
8715 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8716 value is nil.
8717
8718 START may be a string.  In that case, check if the string is
8719 encodable, and the value contains indices to the string instead of
8720 buffer positions.  END is ignored.
8721
8722 If the current buffer (or START if it is a string) is unibyte, the value
8723 is nil.  */)
8724   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8725 {
8726   Lisp_Object list;
8727   ptrdiff_t start_byte, end_byte;
8728   ptrdiff_t pos;
8729   const unsigned char *p, *pbeg, *pend;
8730   int c;
8731   Lisp_Object tail, elt, attrs;
8732
8733   if (STRINGP (start))
8734     {
8735       if (!STRING_MULTIBYTE (start)
8736           || SCHARS (start) == SBYTES (start))
8737         return Qnil;
8738       start_byte = 0;
8739       end_byte = SBYTES (start);
8740       pos = 0;
8741     }
8742   else
8743     {
8744       CHECK_NUMBER_COERCE_MARKER (start);
8745       CHECK_NUMBER_COERCE_MARKER (end);
8746       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8747         args_out_of_range (start, end);
8748       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8749         return Qnil;
8750       start_byte = CHAR_TO_BYTE (XINT (start));
8751       end_byte = CHAR_TO_BYTE (XINT (end));
8752       if (XINT (end) - XINT (start) == end_byte - start_byte)
8753         return Qnil;
8754
8755       if (XINT (start) < GPT && XINT (end) > GPT)
8756         {
8757           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8758             move_gap_both (XINT (start), start_byte);
8759           else
8760             move_gap_both (XINT (end), end_byte);
8761         }
8762       pos = XINT (start);
8763     }
8764
8765   list = Qnil;
8766   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8767     {
8768       elt = XCAR (tail);
8769       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8770       ASET (attrs, coding_attr_trans_tbl,
8771             get_translation_table (attrs, 1, NULL));
8772       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8773     }
8774
8775   if (STRINGP (start))
8776     p = pbeg = SDATA (start);
8777   else
8778     p = pbeg = BYTE_POS_ADDR (start_byte);
8779   pend = p + (end_byte - start_byte);
8780
8781   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8782   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8783
8784   while (p < pend)
8785     {
8786       if (ASCII_BYTE_P (*p))
8787         p++;
8788       else
8789         {
8790           c = STRING_CHAR_ADVANCE (p);
8791
8792           charset_map_loaded = 0;
8793           for (tail = list; CONSP (tail); tail = XCDR (tail))
8794             {
8795               elt = XCDR (XCAR (tail));
8796               if (! char_encodable_p (c, XCAR (elt)))
8797                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8798             }
8799           if (charset_map_loaded)
8800             {
8801               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8802
8803               if (STRINGP (start))
8804                 pbeg = SDATA (start);
8805               else
8806                 pbeg = BYTE_POS_ADDR (start_byte);
8807               p = pbeg + p_offset;
8808               pend = pbeg + pend_offset;
8809             }
8810         }
8811       pos++;
8812     }
8813
8814   tail = list;
8815   list = Qnil;
8816   for (; CONSP (tail); tail = XCDR (tail))
8817     {
8818       elt = XCAR (tail);
8819       if (CONSP (XCDR (XCDR (elt))))
8820         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8821                       list);
8822     }
8823
8824   return list;
8825 }
8826
8827
8828 static Lisp_Object
8829 code_convert_region (Lisp_Object start, Lisp_Object end,
8830                      Lisp_Object coding_system, Lisp_Object dst_object,
8831                      bool encodep, bool norecord)
8832 {
8833   struct coding_system coding;
8834   ptrdiff_t from, from_byte, to, to_byte;
8835   Lisp_Object src_object;
8836
8837   CHECK_NUMBER_COERCE_MARKER (start);
8838   CHECK_NUMBER_COERCE_MARKER (end);
8839   if (NILP (coding_system))
8840     coding_system = Qno_conversion;
8841   else
8842     CHECK_CODING_SYSTEM (coding_system);
8843   src_object = Fcurrent_buffer ();
8844   if (NILP (dst_object))
8845     dst_object = src_object;
8846   else if (! EQ (dst_object, Qt))
8847     CHECK_BUFFER (dst_object);
8848
8849   validate_region (&start, &end);
8850   from = XFASTINT (start);
8851   from_byte = CHAR_TO_BYTE (from);
8852   to = XFASTINT (end);
8853   to_byte = CHAR_TO_BYTE (to);
8854
8855   setup_coding_system (coding_system, &coding);
8856   coding.mode |= CODING_MODE_LAST_BLOCK;
8857
8858   if (encodep)
8859     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8860                           dst_object);
8861   else
8862     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8863                           dst_object);
8864   if (! norecord)
8865     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8866
8867   return (BUFFERP (dst_object)
8868           ? make_number (coding.produced_char)
8869           : coding.dst_object);
8870 }
8871
8872
8873 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8874        3, 4, "r\nzCoding system: ",
8875        doc: /* Decode the current region from the specified coding system.
8876 When called from a program, takes four arguments:
8877         START, END, CODING-SYSTEM, and DESTINATION.
8878 START and END are buffer positions.
8879
8880 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8881 If nil, the region between START and END is replaced by the decoded text.
8882 If buffer, the decoded text is inserted in that buffer after point (point
8883 does not move).
8884 In those cases, the length of the decoded text is returned.
8885 If DESTINATION is t, the decoded text is returned.
8886
8887 This function sets `last-coding-system-used' to the precise coding system
8888 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8889 not fully specified.)  */)
8890   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8891 {
8892   return code_convert_region (start, end, coding_system, destination, 0, 0);
8893 }
8894
8895 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8896        3, 4, "r\nzCoding system: ",
8897        doc: /* Encode the current region by specified coding system.
8898 When called from a program, takes four arguments:
8899         START, END, CODING-SYSTEM and DESTINATION.
8900 START and END are buffer positions.
8901
8902 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8903 If nil, the region between START and END is replace by the encoded text.
8904 If buffer, the encoded text is inserted in that buffer after point (point
8905 does not move).
8906 In those cases, the length of the encoded text is returned.
8907 If DESTINATION is t, the encoded text is returned.
8908
8909 This function sets `last-coding-system-used' to the precise coding system
8910 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8911 not fully specified.)  */)
8912   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8913 {
8914   return code_convert_region (start, end, coding_system, destination, 1, 0);
8915 }
8916
8917 Lisp_Object
8918 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8919                      Lisp_Object dst_object, bool encodep, bool nocopy,
8920                      bool norecord)
8921 {
8922   struct coding_system coding;
8923   ptrdiff_t chars, bytes;
8924
8925   CHECK_STRING (string);
8926   if (NILP (coding_system))
8927     {
8928       if (! norecord)
8929         Vlast_coding_system_used = Qno_conversion;
8930       if (NILP (dst_object))
8931         return (nocopy ? Fcopy_sequence (string) : string);
8932     }
8933
8934   if (NILP (coding_system))
8935     coding_system = Qno_conversion;
8936   else
8937     CHECK_CODING_SYSTEM (coding_system);
8938   if (NILP (dst_object))
8939     dst_object = Qt;
8940   else if (! EQ (dst_object, Qt))
8941     CHECK_BUFFER (dst_object);
8942
8943   setup_coding_system (coding_system, &coding);
8944   coding.mode |= CODING_MODE_LAST_BLOCK;
8945   chars = SCHARS (string);
8946   bytes = SBYTES (string);
8947   if (encodep)
8948     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8949   else
8950     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8951   if (! norecord)
8952     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8953
8954   return (BUFFERP (dst_object)
8955           ? make_number (coding.produced_char)
8956           : coding.dst_object);
8957 }
8958
8959
8960 /* Encode or decode STRING according to CODING_SYSTEM.
8961    Do not set Vlast_coding_system_used.
8962
8963    This function is called only from macros DECODE_FILE and
8964    ENCODE_FILE, thus we ignore character composition.  */
8965
8966 Lisp_Object
8967 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8968                               bool encodep)
8969 {
8970   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8971 }
8972
8973
8974 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8975        2, 4, 0,
8976        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8977
8978 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8979 if the decoding operation is trivial.
8980
8981 Optional fourth arg BUFFER non-nil means that the decoded text is
8982 inserted in that buffer after point (point does not move).  In this
8983 case, the return value is the length of the decoded text.
8984
8985 This function sets `last-coding-system-used' to the precise coding system
8986 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8987 not fully specified.)  */)
8988   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8989 {
8990   return code_convert_string (string, coding_system, buffer,
8991                               0, ! NILP (nocopy), 0);
8992 }
8993
8994 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8995        2, 4, 0,
8996        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8997
8998 Optional third arg NOCOPY non-nil means it is OK to return STRING
8999 itself if the encoding operation is trivial.
9000
9001 Optional fourth arg BUFFER non-nil means that the encoded text is
9002 inserted in that buffer after point (point does not move).  In this
9003 case, the return value is the length of the encoded text.
9004
9005 This function sets `last-coding-system-used' to the precise coding system
9006 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9007 not fully specified.)  */)
9008   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9009 {
9010   return code_convert_string (string, coding_system, buffer,
9011                               1, ! NILP (nocopy), 0);
9012 }
9013
9014 \f
9015 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9016        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9017 Return the corresponding character.  */)
9018   (Lisp_Object code)
9019 {
9020   Lisp_Object spec, attrs, val;
9021   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9022   EMACS_INT ch;
9023   int c;
9024
9025   CHECK_NATNUM (code);
9026   ch = XFASTINT (code);
9027   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9028   attrs = AREF (spec, 0);
9029
9030   if (ASCII_BYTE_P (ch)
9031       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9032     return code;
9033
9034   val = CODING_ATTR_CHARSET_LIST (attrs);
9035   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9036   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9037   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9038
9039   if (ch <= 0x7F)
9040     {
9041       c = ch;
9042       charset = charset_roman;
9043     }
9044   else if (ch >= 0xA0 && ch < 0xDF)
9045     {
9046       c = ch - 0x80;
9047       charset = charset_kana;
9048     }
9049   else
9050     {
9051       EMACS_INT c1 = ch >> 8;
9052       int c2 = ch & 0xFF;
9053
9054       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9055           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9056         error ("Invalid code: %"pI"d", ch);
9057       c = ch;
9058       SJIS_TO_JIS (c);
9059       charset = charset_kanji;
9060     }
9061   c = DECODE_CHAR (charset, c);
9062   if (c < 0)
9063     error ("Invalid code: %"pI"d", ch);
9064   return make_number (c);
9065 }
9066
9067
9068 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9069        doc: /* Encode a Japanese character CH to shift_jis encoding.
9070 Return the corresponding code in SJIS.  */)
9071   (Lisp_Object ch)
9072 {
9073   Lisp_Object spec, attrs, charset_list;
9074   int c;
9075   struct charset *charset;
9076   unsigned code;
9077
9078   CHECK_CHARACTER (ch);
9079   c = XFASTINT (ch);
9080   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9081   attrs = AREF (spec, 0);
9082
9083   if (ASCII_CHAR_P (c)
9084       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9085     return ch;
9086
9087   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9088   charset = char_charset (c, charset_list, &code);
9089   if (code == CHARSET_INVALID_CODE (charset))
9090     error ("Can't encode by shift_jis encoding: %c", c);
9091   JIS_TO_SJIS (code);
9092
9093   return make_number (code);
9094 }
9095
9096 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9097        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9098 Return the corresponding character.  */)
9099   (Lisp_Object code)
9100 {
9101   Lisp_Object spec, attrs, val;
9102   struct charset *charset_roman, *charset_big5, *charset;
9103   EMACS_INT ch;
9104   int c;
9105
9106   CHECK_NATNUM (code);
9107   ch = XFASTINT (code);
9108   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9109   attrs = AREF (spec, 0);
9110
9111   if (ASCII_BYTE_P (ch)
9112       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9113     return code;
9114
9115   val = CODING_ATTR_CHARSET_LIST (attrs);
9116   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9117   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9118
9119   if (ch <= 0x7F)
9120     {
9121       c = ch;
9122       charset = charset_roman;
9123     }
9124   else
9125     {
9126       EMACS_INT b1 = ch >> 8;
9127       int b2 = ch & 0x7F;
9128       if (b1 < 0xA1 || b1 > 0xFE
9129           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9130         error ("Invalid code: %"pI"d", ch);
9131       c = ch;
9132       charset = charset_big5;
9133     }
9134   c = DECODE_CHAR (charset, c);
9135   if (c < 0)
9136     error ("Invalid code: %"pI"d", ch);
9137   return make_number (c);
9138 }
9139
9140 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9141        doc: /* Encode the Big5 character CH to BIG5 coding system.
9142 Return the corresponding character code in Big5.  */)
9143   (Lisp_Object ch)
9144 {
9145   Lisp_Object spec, attrs, charset_list;
9146   struct charset *charset;
9147   int c;
9148   unsigned code;
9149
9150   CHECK_CHARACTER (ch);
9151   c = XFASTINT (ch);
9152   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9153   attrs = AREF (spec, 0);
9154   if (ASCII_CHAR_P (c)
9155       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9156     return ch;
9157
9158   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9159   charset = char_charset (c, charset_list, &code);
9160   if (code == CHARSET_INVALID_CODE (charset))
9161     error ("Can't encode by Big5 encoding: %c", c);
9162
9163   return make_number (code);
9164 }
9165
9166 \f
9167 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9168        Sset_terminal_coding_system_internal, 1, 2, 0,
9169        doc: /* Internal use only.  */)
9170   (Lisp_Object coding_system, Lisp_Object terminal)
9171 {
9172   struct terminal *term = get_terminal (terminal, 1);
9173   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9174   CHECK_SYMBOL (coding_system);
9175   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9176   /* We had better not send unsafe characters to terminal.  */
9177   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9178   /* Character composition should be disabled.  */
9179   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9180   terminal_coding->src_multibyte = 1;
9181   terminal_coding->dst_multibyte = 0;
9182   tset_charset_list
9183     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9184             ? coding_charset_list (terminal_coding)
9185             : Fcons (make_number (charset_ascii), Qnil)));
9186   return Qnil;
9187 }
9188
9189 DEFUN ("set-safe-terminal-coding-system-internal",
9190        Fset_safe_terminal_coding_system_internal,
9191        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9192        doc: /* Internal use only.  */)
9193   (Lisp_Object coding_system)
9194 {
9195   CHECK_SYMBOL (coding_system);
9196   setup_coding_system (Fcheck_coding_system (coding_system),
9197                        &safe_terminal_coding);
9198   /* Character composition should be disabled.  */
9199   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9200   safe_terminal_coding.src_multibyte = 1;
9201   safe_terminal_coding.dst_multibyte = 0;
9202   return Qnil;
9203 }
9204
9205 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9206        Sterminal_coding_system, 0, 1, 0,
9207        doc: /* Return coding system specified for terminal output on the given terminal.
9208 TERMINAL may be a terminal object, a frame, or nil for the selected
9209 frame's terminal device.  */)
9210   (Lisp_Object terminal)
9211 {
9212   struct coding_system *terminal_coding
9213     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9214   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9215
9216   /* For backward compatibility, return nil if it is `undecided'.  */
9217   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9218 }
9219
9220 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9221        Sset_keyboard_coding_system_internal, 1, 2, 0,
9222        doc: /* Internal use only.  */)
9223   (Lisp_Object coding_system, Lisp_Object terminal)
9224 {
9225   struct terminal *t = get_terminal (terminal, 1);
9226   CHECK_SYMBOL (coding_system);
9227   if (NILP (coding_system))
9228     coding_system = Qno_conversion;
9229   else
9230     Fcheck_coding_system (coding_system);
9231   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9232   /* Character composition should be disabled.  */
9233   TERMINAL_KEYBOARD_CODING (t)->common_flags
9234     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9235   return Qnil;
9236 }
9237
9238 DEFUN ("keyboard-coding-system",
9239        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9240        doc: /* Return coding system specified for decoding keyboard input.  */)
9241   (Lisp_Object terminal)
9242 {
9243   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9244                          (get_terminal (terminal, 1))->id);
9245 }
9246
9247 \f
9248 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9249        Sfind_operation_coding_system,  1, MANY, 0,
9250        doc: /* Choose a coding system for an operation based on the target name.
9251 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9252 DECODING-SYSTEM is the coding system to use for decoding
9253 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9254 for encoding (in case OPERATION does encoding).
9255
9256 The first argument OPERATION specifies an I/O primitive:
9257   For file I/O, `insert-file-contents' or `write-region'.
9258   For process I/O, `call-process', `call-process-region', or `start-process'.
9259   For network I/O, `open-network-stream'.
9260
9261 The remaining arguments should be the same arguments that were passed
9262 to the primitive.  Depending on which primitive, one of those arguments
9263 is selected as the TARGET.  For example, if OPERATION does file I/O,
9264 whichever argument specifies the file name is TARGET.
9265
9266 TARGET has a meaning which depends on OPERATION:
9267   For file I/O, TARGET is a file name (except for the special case below).
9268   For process I/O, TARGET is a process name.
9269   For network I/O, TARGET is a service name or a port number.
9270
9271 This function looks up what is specified for TARGET in
9272 `file-coding-system-alist', `process-coding-system-alist',
9273 or `network-coding-system-alist' depending on OPERATION.
9274 They may specify a coding system, a cons of coding systems,
9275 or a function symbol to call.
9276 In the last case, we call the function with one argument,
9277 which is a list of all the arguments given to this function.
9278 If the function can't decide a coding system, it can return
9279 `undecided' so that the normal code-detection is performed.
9280
9281 If OPERATION is `insert-file-contents', the argument corresponding to
9282 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9283 file name to look up, and BUFFER is a buffer that contains the file's
9284 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9285 function to call for FILENAME, that function should examine the
9286 contents of BUFFER instead of reading the file.
9287
9288 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9289   (ptrdiff_t nargs, Lisp_Object *args)
9290 {
9291   Lisp_Object operation, target_idx, target, val;
9292   register Lisp_Object chain;
9293
9294   if (nargs < 2)
9295     error ("Too few arguments");
9296   operation = args[0];
9297   if (!SYMBOLP (operation)
9298       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9299     error ("Invalid first argument");
9300   if (nargs <= 1 + XFASTINT (target_idx))
9301     error ("Too few arguments for operation `%s'",
9302            SDATA (SYMBOL_NAME (operation)));
9303   target = args[XFASTINT (target_idx) + 1];
9304   if (!(STRINGP (target)
9305         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9306             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9307         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9308     error ("Invalid argument %"pI"d of operation `%s'",
9309            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9310   if (CONSP (target))
9311     target = XCAR (target);
9312
9313   chain = ((EQ (operation, Qinsert_file_contents)
9314             || EQ (operation, Qwrite_region))
9315            ? Vfile_coding_system_alist
9316            : (EQ (operation, Qopen_network_stream)
9317               ? Vnetwork_coding_system_alist
9318               : Vprocess_coding_system_alist));
9319   if (NILP (chain))
9320     return Qnil;
9321
9322   for (; CONSP (chain); chain = XCDR (chain))
9323     {
9324       Lisp_Object elt;
9325
9326       elt = XCAR (chain);
9327       if (CONSP (elt)
9328           && ((STRINGP (target)
9329                && STRINGP (XCAR (elt))
9330                && fast_string_match (XCAR (elt), target) >= 0)
9331               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9332         {
9333           val = XCDR (elt);
9334           /* Here, if VAL is both a valid coding system and a valid
9335              function symbol, we return VAL as a coding system.  */
9336           if (CONSP (val))
9337             return val;
9338           if (! SYMBOLP (val))
9339             return Qnil;
9340           if (! NILP (Fcoding_system_p (val)))
9341             return Fcons (val, val);
9342           if (! NILP (Ffboundp (val)))
9343             {
9344               /* We use call1 rather than safe_call1
9345                  so as to get bug reports about functions called here
9346                  which don't handle the current interface.  */
9347               val = call1 (val, Flist (nargs, args));
9348               if (CONSP (val))
9349                 return val;
9350               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9351                 return Fcons (val, val);
9352             }
9353           return Qnil;
9354         }
9355     }
9356   return Qnil;
9357 }
9358
9359 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9360        Sset_coding_system_priority, 0, MANY, 0,
9361        doc: /* Assign higher priority to the coding systems given as arguments.
9362 If multiple coding systems belong to the same category,
9363 all but the first one are ignored.
9364
9365 usage: (set-coding-system-priority &rest coding-systems)  */)
9366   (ptrdiff_t nargs, Lisp_Object *args)
9367 {
9368   ptrdiff_t i, j;
9369   bool changed[coding_category_max];
9370   enum coding_category priorities[coding_category_max];
9371
9372   memset (changed, 0, sizeof changed);
9373
9374   for (i = j = 0; i < nargs; i++)
9375     {
9376       enum coding_category category;
9377       Lisp_Object spec, attrs;
9378
9379       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9380       attrs = AREF (spec, 0);
9381       category = XINT (CODING_ATTR_CATEGORY (attrs));
9382       if (changed[category])
9383         /* Ignore this coding system because a coding system of the
9384            same category already had a higher priority.  */
9385         continue;
9386       changed[category] = 1;
9387       priorities[j++] = category;
9388       if (coding_categories[category].id >= 0
9389           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9390         setup_coding_system (args[i], &coding_categories[category]);
9391       Fset (AREF (Vcoding_category_table, category), args[i]);
9392     }
9393
9394   /* Now we have decided top J priorities.  Reflect the order of the
9395      original priorities to the remaining priorities.  */
9396
9397   for (i = j, j = 0; i < coding_category_max; i++, j++)
9398     {
9399       while (j < coding_category_max
9400              && changed[coding_priorities[j]])
9401         j++;
9402       if (j == coding_category_max)
9403         emacs_abort ();
9404       priorities[i] = coding_priorities[j];
9405     }
9406
9407   memcpy (coding_priorities, priorities, sizeof priorities);
9408
9409   /* Update `coding-category-list'.  */
9410   Vcoding_category_list = Qnil;
9411   for (i = coding_category_max; i-- > 0; )
9412     Vcoding_category_list
9413       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9414                Vcoding_category_list);
9415
9416   return Qnil;
9417 }
9418
9419 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9420        Scoding_system_priority_list, 0, 1, 0,
9421        doc: /* Return a list of coding systems ordered by their priorities.
9422 The list contains a subset of coding systems; i.e. coding systems
9423 assigned to each coding category (see `coding-category-list').
9424
9425 HIGHESTP non-nil means just return the highest priority one.  */)
9426   (Lisp_Object highestp)
9427 {
9428   int i;
9429   Lisp_Object val;
9430
9431   for (i = 0, val = Qnil; i < coding_category_max; i++)
9432     {
9433       enum coding_category category = coding_priorities[i];
9434       int id = coding_categories[category].id;
9435       Lisp_Object attrs;
9436
9437       if (id < 0)
9438         continue;
9439       attrs = CODING_ID_ATTRS (id);
9440       if (! NILP (highestp))
9441         return CODING_ATTR_BASE_NAME (attrs);
9442       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9443     }
9444   return Fnreverse (val);
9445 }
9446
9447 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9448
9449 static Lisp_Object
9450 make_subsidiaries (Lisp_Object base)
9451 {
9452   Lisp_Object subsidiaries;
9453   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9454   char *buf = alloca (base_name_len + 6);
9455   int i;
9456
9457   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9458   subsidiaries = Fmake_vector (make_number (3), Qnil);
9459   for (i = 0; i < 3; i++)
9460     {
9461       strcpy (buf + base_name_len, suffixes[i]);
9462       ASET (subsidiaries, i, intern (buf));
9463     }
9464   return subsidiaries;
9465 }
9466
9467
9468 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9469        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9470        doc: /* For internal use only.
9471 usage: (define-coding-system-internal ...)  */)
9472   (ptrdiff_t nargs, Lisp_Object *args)
9473 {
9474   Lisp_Object name;
9475   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9476   Lisp_Object attrs;            /* Vector of attributes.  */
9477   Lisp_Object eol_type;
9478   Lisp_Object aliases;
9479   Lisp_Object coding_type, charset_list, safe_charsets;
9480   enum coding_category category;
9481   Lisp_Object tail, val;
9482   int max_charset_id = 0;
9483   int i;
9484
9485   if (nargs < coding_arg_max)
9486     goto short_args;
9487
9488   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9489
9490   name = args[coding_arg_name];
9491   CHECK_SYMBOL (name);
9492   ASET (attrs, coding_attr_base_name, name);
9493
9494   val = args[coding_arg_mnemonic];
9495   if (! STRINGP (val))
9496     CHECK_CHARACTER (val);
9497   ASET (attrs, coding_attr_mnemonic, val);
9498
9499   coding_type = args[coding_arg_coding_type];
9500   CHECK_SYMBOL (coding_type);
9501   ASET (attrs, coding_attr_type, coding_type);
9502
9503   charset_list = args[coding_arg_charset_list];
9504   if (SYMBOLP (charset_list))
9505     {
9506       if (EQ (charset_list, Qiso_2022))
9507         {
9508           if (! EQ (coding_type, Qiso_2022))
9509             error ("Invalid charset-list");
9510           charset_list = Viso_2022_charset_list;
9511         }
9512       else if (EQ (charset_list, Qemacs_mule))
9513         {
9514           if (! EQ (coding_type, Qemacs_mule))
9515             error ("Invalid charset-list");
9516           charset_list = Vemacs_mule_charset_list;
9517         }
9518       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9519         {
9520           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9521             error ("Invalid charset-list");
9522           if (max_charset_id < XFASTINT (XCAR (tail)))
9523             max_charset_id = XFASTINT (XCAR (tail));
9524         }
9525     }
9526   else
9527     {
9528       charset_list = Fcopy_sequence (charset_list);
9529       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9530         {
9531           struct charset *charset;
9532
9533           val = XCAR (tail);
9534           CHECK_CHARSET_GET_CHARSET (val, charset);
9535           if (EQ (coding_type, Qiso_2022)
9536               ? CHARSET_ISO_FINAL (charset) < 0
9537               : EQ (coding_type, Qemacs_mule)
9538               ? CHARSET_EMACS_MULE_ID (charset) < 0
9539               : 0)
9540             error ("Can't handle charset `%s'",
9541                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9542
9543           XSETCAR (tail, make_number (charset->id));
9544           if (max_charset_id < charset->id)
9545             max_charset_id = charset->id;
9546         }
9547     }
9548   ASET (attrs, coding_attr_charset_list, charset_list);
9549
9550   safe_charsets = make_uninit_string (max_charset_id + 1);
9551   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9552   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9553     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9554   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9555
9556   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9557
9558   val = args[coding_arg_decode_translation_table];
9559   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9560     CHECK_SYMBOL (val);
9561   ASET (attrs, coding_attr_decode_tbl, val);
9562
9563   val = args[coding_arg_encode_translation_table];
9564   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9565     CHECK_SYMBOL (val);
9566   ASET (attrs, coding_attr_encode_tbl, val);
9567
9568   val = args[coding_arg_post_read_conversion];
9569   CHECK_SYMBOL (val);
9570   ASET (attrs, coding_attr_post_read, val);
9571
9572   val = args[coding_arg_pre_write_conversion];
9573   CHECK_SYMBOL (val);
9574   ASET (attrs, coding_attr_pre_write, val);
9575
9576   val = args[coding_arg_default_char];
9577   if (NILP (val))
9578     ASET (attrs, coding_attr_default_char, make_number (' '));
9579   else
9580     {
9581       CHECK_CHARACTER (val);
9582       ASET (attrs, coding_attr_default_char, val);
9583     }
9584
9585   val = args[coding_arg_for_unibyte];
9586   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9587
9588   val = args[coding_arg_plist];
9589   CHECK_LIST (val);
9590   ASET (attrs, coding_attr_plist, val);
9591
9592   if (EQ (coding_type, Qcharset))
9593     {
9594       /* Generate a lisp vector of 256 elements.  Each element is nil,
9595          integer, or a list of charset IDs.
9596
9597          If Nth element is nil, the byte code N is invalid in this
9598          coding system.
9599
9600          If Nth element is a number NUM, N is the first byte of a
9601          charset whose ID is NUM.
9602
9603          If Nth element is a list of charset IDs, N is the first byte
9604          of one of them.  The list is sorted by dimensions of the
9605          charsets.  A charset of smaller dimension comes first. */
9606       val = Fmake_vector (make_number (256), Qnil);
9607
9608       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9609         {
9610           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9611           int dim = CHARSET_DIMENSION (charset);
9612           int idx = (dim - 1) * 4;
9613
9614           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9615             ASET (attrs, coding_attr_ascii_compat, Qt);
9616
9617           for (i = charset->code_space[idx];
9618                i <= charset->code_space[idx + 1]; i++)
9619             {
9620               Lisp_Object tmp, tmp2;
9621               int dim2;
9622
9623               tmp = AREF (val, i);
9624               if (NILP (tmp))
9625                 tmp = XCAR (tail);
9626               else if (NUMBERP (tmp))
9627                 {
9628                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9629                   if (dim < dim2)
9630                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9631                   else
9632                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9633                 }
9634               else
9635                 {
9636                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9637                     {
9638                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9639                       if (dim < dim2)
9640                         break;
9641                     }
9642                   if (NILP (tmp2))
9643                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9644                   else
9645                     {
9646                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9647                       XSETCAR (tmp2, XCAR (tail));
9648                     }
9649                 }
9650               ASET (val, i, tmp);
9651             }
9652         }
9653       ASET (attrs, coding_attr_charset_valids, val);
9654       category = coding_category_charset;
9655     }
9656   else if (EQ (coding_type, Qccl))
9657     {
9658       Lisp_Object valids;
9659
9660       if (nargs < coding_arg_ccl_max)
9661         goto short_args;
9662
9663       val = args[coding_arg_ccl_decoder];
9664       CHECK_CCL_PROGRAM (val);
9665       if (VECTORP (val))
9666         val = Fcopy_sequence (val);
9667       ASET (attrs, coding_attr_ccl_decoder, val);
9668
9669       val = args[coding_arg_ccl_encoder];
9670       CHECK_CCL_PROGRAM (val);
9671       if (VECTORP (val))
9672         val = Fcopy_sequence (val);
9673       ASET (attrs, coding_attr_ccl_encoder, val);
9674
9675       val = args[coding_arg_ccl_valids];
9676       valids = Fmake_string (make_number (256), make_number (0));
9677       for (tail = val; CONSP (tail); tail = XCDR (tail))
9678         {
9679           int from, to;
9680
9681           val = XCAR (tail);
9682           if (INTEGERP (val))
9683             {
9684               if (! (0 <= XINT (val) && XINT (val) <= 255))
9685                 args_out_of_range_3 (val, make_number (0), make_number (255));
9686               from = to = XINT (val);
9687             }
9688           else
9689             {
9690               CHECK_CONS (val);
9691               CHECK_NATNUM_CAR (val);
9692               CHECK_NUMBER_CDR (val);
9693               if (XINT (XCAR (val)) > 255)
9694                 args_out_of_range_3 (XCAR (val),
9695                                      make_number (0), make_number (255));
9696               from = XINT (XCAR (val));
9697               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9698                 args_out_of_range_3 (XCDR (val),
9699                                      XCAR (val), make_number (255));
9700               to = XINT (XCDR (val));
9701             }
9702           for (i = from; i <= to; i++)
9703             SSET (valids, i, 1);
9704         }
9705       ASET (attrs, coding_attr_ccl_valids, valids);
9706
9707       category = coding_category_ccl;
9708     }
9709   else if (EQ (coding_type, Qutf_16))
9710     {
9711       Lisp_Object bom, endian;
9712
9713       ASET (attrs, coding_attr_ascii_compat, Qnil);
9714
9715       if (nargs < coding_arg_utf16_max)
9716         goto short_args;
9717
9718       bom = args[coding_arg_utf16_bom];
9719       if (! NILP (bom) && ! EQ (bom, Qt))
9720         {
9721           CHECK_CONS (bom);
9722           val = XCAR (bom);
9723           CHECK_CODING_SYSTEM (val);
9724           val = XCDR (bom);
9725           CHECK_CODING_SYSTEM (val);
9726         }
9727       ASET (attrs, coding_attr_utf_bom, bom);
9728
9729       endian = args[coding_arg_utf16_endian];
9730       CHECK_SYMBOL (endian);
9731       if (NILP (endian))
9732         endian = Qbig;
9733       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9734         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9735       ASET (attrs, coding_attr_utf_16_endian, endian);
9736
9737       category = (CONSP (bom)
9738                   ? coding_category_utf_16_auto
9739                   : NILP (bom)
9740                   ? (EQ (endian, Qbig)
9741                      ? coding_category_utf_16_be_nosig
9742                      : coding_category_utf_16_le_nosig)
9743                   : (EQ (endian, Qbig)
9744                      ? coding_category_utf_16_be
9745                      : coding_category_utf_16_le));
9746     }
9747   else if (EQ (coding_type, Qiso_2022))
9748     {
9749       Lisp_Object initial, reg_usage, request, flags;
9750
9751       if (nargs < coding_arg_iso2022_max)
9752         goto short_args;
9753
9754       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9755       CHECK_VECTOR (initial);
9756       for (i = 0; i < 4; i++)
9757         {
9758           val = Faref (initial, make_number (i));
9759           if (! NILP (val))
9760             {
9761               struct charset *charset;
9762
9763               CHECK_CHARSET_GET_CHARSET (val, charset);
9764               ASET (initial, i, make_number (CHARSET_ID (charset)));
9765               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9766                 ASET (attrs, coding_attr_ascii_compat, Qt);
9767             }
9768           else
9769             ASET (initial, i, make_number (-1));
9770         }
9771
9772       reg_usage = args[coding_arg_iso2022_reg_usage];
9773       CHECK_CONS (reg_usage);
9774       CHECK_NUMBER_CAR (reg_usage);
9775       CHECK_NUMBER_CDR (reg_usage);
9776
9777       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9778       for (tail = request; CONSP (tail); tail = XCDR (tail))
9779         {
9780           int id;
9781           Lisp_Object tmp1;
9782
9783           val = XCAR (tail);
9784           CHECK_CONS (val);
9785           tmp1 = XCAR (val);
9786           CHECK_CHARSET_GET_ID (tmp1, id);
9787           CHECK_NATNUM_CDR (val);
9788           if (XINT (XCDR (val)) >= 4)
9789             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9790           XSETCAR (val, make_number (id));
9791         }
9792
9793       flags = args[coding_arg_iso2022_flags];
9794       CHECK_NATNUM (flags);
9795       i = XINT (flags) & INT_MAX;
9796       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9797         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9798       flags = make_number (i);
9799
9800       ASET (attrs, coding_attr_iso_initial, initial);
9801       ASET (attrs, coding_attr_iso_usage, reg_usage);
9802       ASET (attrs, coding_attr_iso_request, request);
9803       ASET (attrs, coding_attr_iso_flags, flags);
9804       setup_iso_safe_charsets (attrs);
9805
9806       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9807         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9808                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9809                     ? coding_category_iso_7_else
9810                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9811                     ? coding_category_iso_7
9812                     : coding_category_iso_7_tight);
9813       else
9814         {
9815           int id = XINT (AREF (initial, 1));
9816
9817           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9818                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9819                        || id < 0)
9820                       ? coding_category_iso_8_else
9821                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9822                       ? coding_category_iso_8_1
9823                       : coding_category_iso_8_2);
9824         }
9825       if (category != coding_category_iso_8_1
9826           && category != coding_category_iso_8_2)
9827         ASET (attrs, coding_attr_ascii_compat, Qnil);
9828     }
9829   else if (EQ (coding_type, Qemacs_mule))
9830     {
9831       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9832         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9833       ASET (attrs, coding_attr_ascii_compat, Qt);
9834       category = coding_category_emacs_mule;
9835     }
9836   else if (EQ (coding_type, Qshift_jis))
9837     {
9838
9839       struct charset *charset;
9840
9841       if (XINT (Flength (charset_list)) != 3
9842           && XINT (Flength (charset_list)) != 4)
9843         error ("There should be three or four charsets");
9844
9845       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9846       if (CHARSET_DIMENSION (charset) != 1)
9847         error ("Dimension of charset %s is not one",
9848                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9849       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9850         ASET (attrs, coding_attr_ascii_compat, Qt);
9851
9852       charset_list = XCDR (charset_list);
9853       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9854       if (CHARSET_DIMENSION (charset) != 1)
9855         error ("Dimension of charset %s is not one",
9856                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9857
9858       charset_list = XCDR (charset_list);
9859       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9860       if (CHARSET_DIMENSION (charset) != 2)
9861         error ("Dimension of charset %s is not two",
9862                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9863
9864       charset_list = XCDR (charset_list);
9865       if (! NILP (charset_list))
9866         {
9867           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9868           if (CHARSET_DIMENSION (charset) != 2)
9869             error ("Dimension of charset %s is not two",
9870                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9871         }
9872
9873       category = coding_category_sjis;
9874       Vsjis_coding_system = name;
9875     }
9876   else if (EQ (coding_type, Qbig5))
9877     {
9878       struct charset *charset;
9879
9880       if (XINT (Flength (charset_list)) != 2)
9881         error ("There should be just two charsets");
9882
9883       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9884       if (CHARSET_DIMENSION (charset) != 1)
9885         error ("Dimension of charset %s is not one",
9886                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9887       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9888         ASET (attrs, coding_attr_ascii_compat, Qt);
9889
9890       charset_list = XCDR (charset_list);
9891       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9892       if (CHARSET_DIMENSION (charset) != 2)
9893         error ("Dimension of charset %s is not two",
9894                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9895
9896       category = coding_category_big5;
9897       Vbig5_coding_system = name;
9898     }
9899   else if (EQ (coding_type, Qraw_text))
9900     {
9901       category = coding_category_raw_text;
9902       ASET (attrs, coding_attr_ascii_compat, Qt);
9903     }
9904   else if (EQ (coding_type, Qutf_8))
9905     {
9906       Lisp_Object bom;
9907
9908       if (nargs < coding_arg_utf8_max)
9909         goto short_args;
9910
9911       bom = args[coding_arg_utf8_bom];
9912       if (! NILP (bom) && ! EQ (bom, Qt))
9913         {
9914           CHECK_CONS (bom);
9915           val = XCAR (bom);
9916           CHECK_CODING_SYSTEM (val);
9917           val = XCDR (bom);
9918           CHECK_CODING_SYSTEM (val);
9919         }
9920       ASET (attrs, coding_attr_utf_bom, bom);
9921       if (NILP (bom))
9922         ASET (attrs, coding_attr_ascii_compat, Qt);
9923
9924       category = (CONSP (bom) ? coding_category_utf_8_auto
9925                   : NILP (bom) ? coding_category_utf_8_nosig
9926                   : coding_category_utf_8_sig);
9927     }
9928   else if (EQ (coding_type, Qundecided))
9929     category = coding_category_undecided;
9930   else
9931     error ("Invalid coding system type: %s",
9932            SDATA (SYMBOL_NAME (coding_type)));
9933
9934   ASET (attrs, coding_attr_category, make_number (category));
9935   ASET (attrs, coding_attr_plist,
9936         Fcons (QCcategory,
9937                Fcons (AREF (Vcoding_category_table, category),
9938                       CODING_ATTR_PLIST (attrs))));
9939   ASET (attrs, coding_attr_plist,
9940         Fcons (QCascii_compatible_p,
9941                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9942                       CODING_ATTR_PLIST (attrs))));
9943
9944   eol_type = args[coding_arg_eol_type];
9945   if (! NILP (eol_type)
9946       && ! EQ (eol_type, Qunix)
9947       && ! EQ (eol_type, Qdos)
9948       && ! EQ (eol_type, Qmac))
9949     error ("Invalid eol-type");
9950
9951   aliases = Fcons (name, Qnil);
9952
9953   if (NILP (eol_type))
9954     {
9955       eol_type = make_subsidiaries (name);
9956       for (i = 0; i < 3; i++)
9957         {
9958           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9959
9960           this_name = AREF (eol_type, i);
9961           this_aliases = Fcons (this_name, Qnil);
9962           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9963           this_spec = Fmake_vector (make_number (3), attrs);
9964           ASET (this_spec, 1, this_aliases);
9965           ASET (this_spec, 2, this_eol_type);
9966           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9967           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9968           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9969           if (NILP (val))
9970             Vcoding_system_alist
9971               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9972                        Vcoding_system_alist);
9973         }
9974     }
9975
9976   spec_vec = Fmake_vector (make_number (3), attrs);
9977   ASET (spec_vec, 1, aliases);
9978   ASET (spec_vec, 2, eol_type);
9979
9980   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9981   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9982   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9983   if (NILP (val))
9984     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9985                                   Vcoding_system_alist);
9986
9987   {
9988     int id = coding_categories[category].id;
9989
9990     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9991       setup_coding_system (name, &coding_categories[category]);
9992   }
9993
9994   return Qnil;
9995
9996  short_args:
9997   return Fsignal (Qwrong_number_of_arguments,
9998                   Fcons (intern ("define-coding-system-internal"),
9999                          make_number (nargs)));
10000 }
10001
10002
10003 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10004        3, 3, 0,
10005        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10006   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10007 {
10008   Lisp_Object spec, attrs;
10009
10010   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10011   attrs = AREF (spec, 0);
10012   if (EQ (prop, QCmnemonic))
10013     {
10014       if (! STRINGP (val))
10015         CHECK_CHARACTER (val);
10016       ASET (attrs, coding_attr_mnemonic, val);
10017     }
10018   else if (EQ (prop, QCdefault_char))
10019     {
10020       if (NILP (val))
10021         val = make_number (' ');
10022       else
10023         CHECK_CHARACTER (val);
10024       ASET (attrs, coding_attr_default_char, val);
10025     }
10026   else if (EQ (prop, QCdecode_translation_table))
10027     {
10028       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10029         CHECK_SYMBOL (val);
10030       ASET (attrs, coding_attr_decode_tbl, val);
10031     }
10032   else if (EQ (prop, QCencode_translation_table))
10033     {
10034       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10035         CHECK_SYMBOL (val);
10036       ASET (attrs, coding_attr_encode_tbl, val);
10037     }
10038   else if (EQ (prop, QCpost_read_conversion))
10039     {
10040       CHECK_SYMBOL (val);
10041       ASET (attrs, coding_attr_post_read, val);
10042     }
10043   else if (EQ (prop, QCpre_write_conversion))
10044     {
10045       CHECK_SYMBOL (val);
10046       ASET (attrs, coding_attr_pre_write, val);
10047     }
10048   else if (EQ (prop, QCascii_compatible_p))
10049     {
10050       ASET (attrs, coding_attr_ascii_compat, val);
10051     }
10052
10053   ASET (attrs, coding_attr_plist,
10054         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10055   return val;
10056 }
10057
10058
10059 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10060        Sdefine_coding_system_alias, 2, 2, 0,
10061        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10062   (Lisp_Object alias, Lisp_Object coding_system)
10063 {
10064   Lisp_Object spec, aliases, eol_type, val;
10065
10066   CHECK_SYMBOL (alias);
10067   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10068   aliases = AREF (spec, 1);
10069   /* ALIASES should be a list of length more than zero, and the first
10070      element is a base coding system.  Append ALIAS at the tail of the
10071      list.  */
10072   while (!NILP (XCDR (aliases)))
10073     aliases = XCDR (aliases);
10074   XSETCDR (aliases, Fcons (alias, Qnil));
10075
10076   eol_type = AREF (spec, 2);
10077   if (VECTORP (eol_type))
10078     {
10079       Lisp_Object subsidiaries;
10080       int i;
10081
10082       subsidiaries = make_subsidiaries (alias);
10083       for (i = 0; i < 3; i++)
10084         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10085                                      AREF (eol_type, i));
10086     }
10087
10088   Fputhash (alias, spec, Vcoding_system_hash_table);
10089   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10090   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10091   if (NILP (val))
10092     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10093                                   Vcoding_system_alist);
10094
10095   return Qnil;
10096 }
10097
10098 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10099        1, 1, 0,
10100        doc: /* Return the base of CODING-SYSTEM.
10101 Any alias or subsidiary coding system is not a base coding system.  */)
10102   (Lisp_Object coding_system)
10103 {
10104   Lisp_Object spec, attrs;
10105
10106   if (NILP (coding_system))
10107     return (Qno_conversion);
10108   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10109   attrs = AREF (spec, 0);
10110   return CODING_ATTR_BASE_NAME (attrs);
10111 }
10112
10113 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10114        1, 1, 0,
10115        doc: "Return the property list of CODING-SYSTEM.")
10116   (Lisp_Object coding_system)
10117 {
10118   Lisp_Object spec, attrs;
10119
10120   if (NILP (coding_system))
10121     coding_system = Qno_conversion;
10122   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10123   attrs = AREF (spec, 0);
10124   return CODING_ATTR_PLIST (attrs);
10125 }
10126
10127
10128 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10129        1, 1, 0,
10130        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10131   (Lisp_Object coding_system)
10132 {
10133   Lisp_Object spec;
10134
10135   if (NILP (coding_system))
10136     coding_system = Qno_conversion;
10137   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10138   return AREF (spec, 1);
10139 }
10140
10141 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10142        Scoding_system_eol_type, 1, 1, 0,
10143        doc: /* Return eol-type of CODING-SYSTEM.
10144 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10145
10146 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10147 and CR respectively.
10148
10149 A vector value indicates that a format of end-of-line should be
10150 detected automatically.  Nth element of the vector is the subsidiary
10151 coding system whose eol-type is N.  */)
10152   (Lisp_Object coding_system)
10153 {
10154   Lisp_Object spec, eol_type;
10155   int n;
10156
10157   if (NILP (coding_system))
10158     coding_system = Qno_conversion;
10159   if (! CODING_SYSTEM_P (coding_system))
10160     return Qnil;
10161   spec = CODING_SYSTEM_SPEC (coding_system);
10162   eol_type = AREF (spec, 2);
10163   if (VECTORP (eol_type))
10164     return Fcopy_sequence (eol_type);
10165   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10166   return make_number (n);
10167 }
10168
10169 #endif /* emacs */
10170
10171 \f
/*** 12. Postamble ***/
10173
10174 void
10175 init_coding_once (void)
10176 {
10177   int i;
10178
10179   for (i = 0; i < coding_category_max; i++)
10180     {
10181       coding_categories[i].id = -1;
10182       coding_priorities[i] = i;
10183     }
10184
10185   /* ISO2022 specific initialize routine.  */
10186   for (i = 0; i < 0x20; i++)
10187     iso_code_class[i] = ISO_control_0;
10188   for (i = 0x21; i < 0x7F; i++)
10189     iso_code_class[i] = ISO_graphic_plane_0;
10190   for (i = 0x80; i < 0xA0; i++)
10191     iso_code_class[i] = ISO_control_1;
10192   for (i = 0xA1; i < 0xFF; i++)
10193     iso_code_class[i] = ISO_graphic_plane_1;
10194   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10195   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10196   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10197   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10198   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10199   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10200   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10201   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10202   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10203
10204   for (i = 0; i < 256; i++)
10205     {
10206       emacs_mule_bytes[i] = 1;
10207     }
10208   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10209   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10210   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10211   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10212 }
10213
10214 #ifdef emacs
10215
10216 void
10217 syms_of_coding (void)
10218 {
10219   staticpro (&Vcoding_system_hash_table);
10220   {
10221     Lisp_Object args[2];
10222     args[0] = QCtest;
10223     args[1] = Qeq;
10224     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10225   }
10226
10227   staticpro (&Vsjis_coding_system);
10228   Vsjis_coding_system = Qnil;
10229
10230   staticpro (&Vbig5_coding_system);
10231   Vbig5_coding_system = Qnil;
10232
10233   staticpro (&Vcode_conversion_reused_workbuf);
10234   Vcode_conversion_reused_workbuf = Qnil;
10235
10236   staticpro (&Vcode_conversion_workbuf_name);
10237   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10238
10239   reused_workbuf_in_use = 0;
10240
10241   DEFSYM (Qcharset, "charset");
10242   DEFSYM (Qtarget_idx, "target-idx");
10243   DEFSYM (Qcoding_system_history, "coding-system-history");
10244   Fset (Qcoding_system_history, Qnil);
10245
10246   /* Target FILENAME is the first argument.  */
10247   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10248   /* Target FILENAME is the third argument.  */
10249   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10250
10251   DEFSYM (Qcall_process, "call-process");
10252   /* Target PROGRAM is the first argument.  */
10253   Fput (Qcall_process, Qtarget_idx, make_number (0));
10254
10255   DEFSYM (Qcall_process_region, "call-process-region");
10256   /* Target PROGRAM is the third argument.  */
10257   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10258
10259   DEFSYM (Qstart_process, "start-process");
10260   /* Target PROGRAM is the third argument.  */
10261   Fput (Qstart_process, Qtarget_idx, make_number (2));
10262
10263   DEFSYM (Qopen_network_stream, "open-network-stream");
10264   /* Target SERVICE is the fourth argument.  */
10265   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10266
10267   DEFSYM (Qcoding_system, "coding-system");
10268   DEFSYM (Qcoding_aliases, "coding-aliases");
10269
10270   DEFSYM (Qeol_type, "eol-type");
10271   DEFSYM (Qunix, "unix");
10272   DEFSYM (Qdos, "dos");
10273
10274   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10275   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10276   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10277   DEFSYM (Qdefault_char, "default-char");
10278   DEFSYM (Qundecided, "undecided");
10279   DEFSYM (Qno_conversion, "no-conversion");
10280   DEFSYM (Qraw_text, "raw-text");
10281
10282   DEFSYM (Qiso_2022, "iso-2022");
10283
10284   DEFSYM (Qutf_8, "utf-8");
10285   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10286
10287   DEFSYM (Qutf_16, "utf-16");
10288   DEFSYM (Qbig, "big");
10289   DEFSYM (Qlittle, "little");
10290
10291   DEFSYM (Qshift_jis, "shift-jis");
10292   DEFSYM (Qbig5, "big5");
10293
10294   DEFSYM (Qcoding_system_p, "coding-system-p");
10295
10296   DEFSYM (Qcoding_system_error, "coding-system-error");
10297   Fput (Qcoding_system_error, Qerror_conditions,
10298         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10299   Fput (Qcoding_system_error, Qerror_message,
10300         build_pure_c_string ("Invalid coding system"));
10301
10302   /* Intern this now in case it isn't already done.
10303      Setting this variable twice is harmless.
10304      But don't staticpro it here--that is done in alloc.c.  */
10305   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10306
10307   DEFSYM (Qtranslation_table, "translation-table");
10308   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10309   DEFSYM (Qtranslation_table_id, "translation-table-id");
10310   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10311   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10312
10313   DEFSYM (Qvalid_codes, "valid-codes");
10314
10315   DEFSYM (Qemacs_mule, "emacs-mule");
10316
10317   DEFSYM (QCcategory, ":category");
10318   DEFSYM (QCmnemonic, ":mnemonic");
10319   DEFSYM (QCdefault_char, ":default-char");
10320   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10321   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10322   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10323   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10324   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10325
10326   Vcoding_category_table
10327     = Fmake_vector (make_number (coding_category_max), Qnil);
10328   staticpro (&Vcoding_category_table);
10329   /* Followings are target of code detection.  */
10330   ASET (Vcoding_category_table, coding_category_iso_7,
10331         intern_c_string ("coding-category-iso-7"));
10332   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10333         intern_c_string ("coding-category-iso-7-tight"));
10334   ASET (Vcoding_category_table, coding_category_iso_8_1,
10335         intern_c_string ("coding-category-iso-8-1"));
10336   ASET (Vcoding_category_table, coding_category_iso_8_2,
10337         intern_c_string ("coding-category-iso-8-2"));
10338   ASET (Vcoding_category_table, coding_category_iso_7_else,
10339         intern_c_string ("coding-category-iso-7-else"));
10340   ASET (Vcoding_category_table, coding_category_iso_8_else,
10341         intern_c_string ("coding-category-iso-8-else"));
10342   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10343         intern_c_string ("coding-category-utf-8-auto"));
10344   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10345         intern_c_string ("coding-category-utf-8"));
10346   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10347         intern_c_string ("coding-category-utf-8-sig"));
10348   ASET (Vcoding_category_table, coding_category_utf_16_be,
10349         intern_c_string ("coding-category-utf-16-be"));
10350   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10351         intern_c_string ("coding-category-utf-16-auto"));
10352   ASET (Vcoding_category_table, coding_category_utf_16_le,
10353         intern_c_string ("coding-category-utf-16-le"));
10354   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10355         intern_c_string ("coding-category-utf-16-be-nosig"));
10356   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10357         intern_c_string ("coding-category-utf-16-le-nosig"));
10358   ASET (Vcoding_category_table, coding_category_charset,
10359         intern_c_string ("coding-category-charset"));
10360   ASET (Vcoding_category_table, coding_category_sjis,
10361         intern_c_string ("coding-category-sjis"));
10362   ASET (Vcoding_category_table, coding_category_big5,
10363         intern_c_string ("coding-category-big5"));
10364   ASET (Vcoding_category_table, coding_category_ccl,
10365         intern_c_string ("coding-category-ccl"));
10366   ASET (Vcoding_category_table, coding_category_emacs_mule,
10367         intern_c_string ("coding-category-emacs-mule"));
10368   /* Followings are NOT target of code detection.  */
10369   ASET (Vcoding_category_table, coding_category_raw_text,
10370         intern_c_string ("coding-category-raw-text"));
10371   ASET (Vcoding_category_table, coding_category_undecided,
10372         intern_c_string ("coding-category-undecided"));
10373
10374   DEFSYM (Qinsufficient_source, "insufficient-source");
10375   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10376   DEFSYM (Qinvalid_source, "invalid-source");
10377   DEFSYM (Qinterrupted, "interrupted");
10378   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10379   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10380
10381   defsubr (&Scoding_system_p);
10382   defsubr (&Sread_coding_system);
10383   defsubr (&Sread_non_nil_coding_system);
10384   defsubr (&Scheck_coding_system);
10385   defsubr (&Sdetect_coding_region);
10386   defsubr (&Sdetect_coding_string);
10387   defsubr (&Sfind_coding_systems_region_internal);
10388   defsubr (&Sunencodable_char_position);
10389   defsubr (&Scheck_coding_systems_region);
10390   defsubr (&Sdecode_coding_region);
10391   defsubr (&Sencode_coding_region);
10392   defsubr (&Sdecode_coding_string);
10393   defsubr (&Sencode_coding_string);
10394   defsubr (&Sdecode_sjis_char);
10395   defsubr (&Sencode_sjis_char);
10396   defsubr (&Sdecode_big5_char);
10397   defsubr (&Sencode_big5_char);
10398   defsubr (&Sset_terminal_coding_system_internal);
10399   defsubr (&Sset_safe_terminal_coding_system_internal);
10400   defsubr (&Sterminal_coding_system);
10401   defsubr (&Sset_keyboard_coding_system_internal);
10402   defsubr (&Skeyboard_coding_system);
10403   defsubr (&Sfind_operation_coding_system);
10404   defsubr (&Sset_coding_system_priority);
10405   defsubr (&Sdefine_coding_system_internal);
10406   defsubr (&Sdefine_coding_system_alias);
10407   defsubr (&Scoding_system_put);
10408   defsubr (&Scoding_system_base);
10409   defsubr (&Scoding_system_plist);
10410   defsubr (&Scoding_system_aliases);
10411   defsubr (&Scoding_system_eol_type);
10412   defsubr (&Scoding_system_priority_list);
10413
10414   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10415                doc: /* List of coding systems.
10416
10417 Do not alter the value of this variable manually.  This variable should be
10418 updated by the functions `define-coding-system' and
10419 `define-coding-system-alias'.  */);
10420   Vcoding_system_list = Qnil;
10421
10422   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10423                doc: /* Alist of coding system names.
10424 Each element is one element list of coding system name.
10425 This variable is given to `completing-read' as COLLECTION argument.
10426
10427 Do not alter the value of this variable manually.  This variable should be
10428 updated by the functions `make-coding-system' and
10429 `define-coding-system-alias'.  */);
10430   Vcoding_system_alist = Qnil;
10431
10432   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10433                doc: /* List of coding-categories (symbols) ordered by priority.
10434
10435 On detecting a coding system, Emacs tries code detection algorithms
10436 associated with each coding-category one by one in this order.  When
10437 one algorithm agrees with a byte sequence of source text, the coding
10438 system bound to the corresponding coding-category is selected.
10439
10440 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10441   {
10442     int i;
10443
10444     Vcoding_category_list = Qnil;
10445     for (i = coding_category_max - 1; i >= 0; i--)
10446       Vcoding_category_list
10447         = Fcons (AREF (Vcoding_category_table, i),
10448                  Vcoding_category_list);
10449   }
10450
10451   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10452                doc: /* Specify the coding system for read operations.
10453 It is useful to bind this variable with `let', but do not set it globally.
10454 If the value is a coding system, it is used for decoding on read operation.
10455 If not, an appropriate element is used from one of the coding system alists.
10456 There are three such tables: `file-coding-system-alist',
10457 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10458   Vcoding_system_for_read = Qnil;
10459
10460   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10461                doc: /* Specify the coding system for write operations.
10462 Programs bind this variable with `let', but you should not set it globally.
10463 If the value is a coding system, it is used for encoding of output,
10464 when writing it to a file and when sending it to a file or subprocess.
10465
10466 If this does not specify a coding system, an appropriate element
10467 is used from one of the coding system alists.
10468 There are three such tables: `file-coding-system-alist',
10469 `process-coding-system-alist', and `network-coding-system-alist'.
10470 For output to files, if the above procedure does not specify a coding system,
10471 the value of `buffer-file-coding-system' is used.  */);
10472   Vcoding_system_for_write = Qnil;
10473
10474   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10475                doc: /*
10476 Coding system used in the latest file or process I/O.  */);
10477   Vlast_coding_system_used = Qnil;
10478
10479   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10480                doc: /*
10481 Error status of the last code conversion.
10482
10483 When an error was detected in the last code conversion, this variable
10484 is set to one of the following symbols.
10485   `insufficient-source'
10486   `inconsistent-eol'
10487   `invalid-source'
10488   `interrupted'
10489   `insufficient-memory'
10490 When no error was detected, the value doesn't change.  So, to check
10491 the error status of a code conversion by this variable, you must
10492 explicitly set this variable to nil before performing code
10493 conversion.  */);
10494   Vlast_code_conversion_error = Qnil;
10495
10496   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10497                doc: /*
10498 *Non-nil means always inhibit code conversion of end-of-line format.
10499 See info node `Coding Systems' and info node `Text and Binary' concerning
10500 such conversion.  */);
10501   inhibit_eol_conversion = 0;
10502
10503   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10504                doc: /*
10505 Non-nil means process buffer inherits coding system of process output.
10506 Bind it to t if the process output is to be treated as if it were a file
10507 read from some filesystem.  */);
10508   inherit_process_coding_system = 0;
10509
10510   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10511                doc: /*
10512 Alist to decide a coding system to use for a file I/O operation.
10513 The format is ((PATTERN . VAL) ...),
10514 where PATTERN is a regular expression matching a file name,
10515 VAL is a coding system, a cons of coding systems, or a function symbol.
10516 If VAL is a coding system, it is used for both decoding and encoding
10517 the file contents.
10518 If VAL is a cons of coding systems, the car part is used for decoding,
10519 and the cdr part is used for encoding.
10520 If VAL is a function symbol, the function must return a coding system
10521 or a cons of coding systems which are used as above.  The function is
10522 called with an argument that is a list of the arguments with which
10523 `find-operation-coding-system' was called.  If the function can't decide
10524 a coding system, it can return `undecided' so that the normal
10525 code-detection is performed.
10526
10527 See also the function `find-operation-coding-system'
10528 and the variable `auto-coding-alist'.  */);
10529   Vfile_coding_system_alist = Qnil;
10530
10531   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10532                doc: /*
10533 Alist to decide a coding system to use for a process I/O operation.
10534 The format is ((PATTERN . VAL) ...),
10535 where PATTERN is a regular expression matching a program name,
10536 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the program and encoding what is sent to the program.
10539 If VAL is a cons of coding systems, the car part is used for decoding,
10540 and the cdr part is used for encoding.
10541 If VAL is a function symbol, the function must return a coding system
10542 or a cons of coding systems which are used as above.
10543
10544 See also the function `find-operation-coding-system'.  */);
10545   Vprocess_coding_system_alist = Qnil;
10546
10547   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10548                doc: /*
10549 Alist to decide a coding system to use for a network I/O operation.
10550 The format is ((PATTERN . VAL) ...),
10551 where PATTERN is a regular expression matching a network service name
10552 or is a port number to connect to,
10553 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the network stream and encoding what is sent to the network stream.
10556 If VAL is a cons of coding systems, the car part is used for decoding,
10557 and the cdr part is used for encoding.
10558 If VAL is a function symbol, the function must return a coding system
10559 or a cons of coding systems which are used as above.
10560
10561 See also the function `find-operation-coding-system'.  */);
10562   Vnetwork_coding_system_alist = Qnil;
10563
10564   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10565                doc: /* Coding system to use with system messages.
10566 Also used for decoding keyboard input on X Window system.  */);
10567   Vlocale_coding_system = Qnil;
10568
10569   /* The eol mnemonics are reset in startup.el system-dependently.  */
10570   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10571                doc: /*
10572 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10573   eol_mnemonic_unix = build_pure_c_string (":");
10574
10575   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10576                doc: /*
10577 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10578   eol_mnemonic_dos = build_pure_c_string ("\\");
10579
10580   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10581                doc: /*
10582 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10583   eol_mnemonic_mac = build_pure_c_string ("/");
10584
10585   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10586                doc: /*
10587 *String displayed in mode line when end-of-line format is not yet determined.  */);
10588   eol_mnemonic_undecided = build_pure_c_string (":");
10589
10590   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10591                doc: /*
10592 *Non-nil enables character translation while encoding and decoding.  */);
10593   Venable_character_translation = Qt;
10594
10595   DEFVAR_LISP ("standard-translation-table-for-decode",
10596                Vstandard_translation_table_for_decode,
10597                doc: /* Table for translating characters while decoding.  */);
10598   Vstandard_translation_table_for_decode = Qnil;
10599
10600   DEFVAR_LISP ("standard-translation-table-for-encode",
10601                Vstandard_translation_table_for_encode,
10602                doc: /* Table for translating characters while encoding.  */);
10603   Vstandard_translation_table_for_encode = Qnil;
10604
10605   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10606                doc: /* Alist of charsets vs revision numbers.
10607 While encoding, if a charset (car part of an element) is found,
10608 designate it with the escape sequence identifying revision (cdr part
10609 of the element).  */);
10610   Vcharset_revision_table = Qnil;
10611
10612   DEFVAR_LISP ("default-process-coding-system",
10613                Vdefault_process_coding_system,
10614                doc: /* Cons of coding systems used for process I/O by default.
10615 The car part is used for decoding a process output,
10616 the cdr part is used for encoding a text to be sent to a process.  */);
10617   Vdefault_process_coding_system = Qnil;
10618
10619   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10620                doc: /*
10621 Table of extra Latin codes in the range 128..159 (inclusive).
10622 This is a vector of length 256.
10623 If Nth element is non-nil, the existence of code N in a file
\(or output of subprocess) doesn't prevent it from being detected as
10625 a coding system of ISO 2022 variant which has a flag
10626 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10627 or reading output of a subprocess.
10628 Only 128th through 159th elements have a meaning.  */);
10629   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10630
10631   DEFVAR_LISP ("select-safe-coding-system-function",
10632                Vselect_safe_coding_system_function,
10633                doc: /*
10634 Function to call to select safe coding system for encoding a text.
10635
10636 If set, this function is called to force a user to select a proper
10637 coding system which can encode the text in the case that a default
10638 coding system used in each operation can't encode the text.  The
10639 function should take care that the buffer is not modified while
10640 the coding system is being selected.
10641
10642 The default value is `select-safe-coding-system' (which see).  */);
10643   Vselect_safe_coding_system_function = Qnil;
10644
10645   DEFVAR_BOOL ("coding-system-require-warning",
10646                coding_system_require_warning,
10647                doc: /* Internal use only.
10648 If non-nil, on writing a file, `select-safe-coding-system-function' is
10649 called even if `coding-system-for-write' is non-nil.  The command
10650 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10651   coding_system_require_warning = 0;
10652
10653
10654   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10655                inhibit_iso_escape_detection,
10656                doc: /*
10657 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10658
10659 When Emacs reads text, it tries to detect how the text is encoded.
10660 This code detection is sensitive to escape sequences.  If Emacs sees
10661 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10662 of the ISO2022 encodings, and decodes text by the corresponding coding
10663 system (e.g. `iso-2022-7bit').
10664
10665 However, there may be a case that you want to read escape sequences in
10666 a file as is.  In such a case, you can set this variable to non-nil.
10667 Then the code detection will ignore any escape sequences, and no text is
10668 detected as encoded in some ISO-2022 encoding.  The result is that all
10669 escape sequences become visible in a buffer.
10670
10671 The default value is nil, and it is strongly recommended not to change
10672 it.  That is because many Emacs Lisp source files that contain
10673 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10674 in Emacs's distribution, and they won't be decoded correctly on
10675 reading if you suppress escape sequence detection.
10676
10677 The other way to read escape sequences in a file without decoding is
10678 to explicitly specify some coding system that doesn't use ISO-2022
escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10680   inhibit_iso_escape_detection = 0;
10681
10682   DEFVAR_BOOL ("inhibit-null-byte-detection",
10683                inhibit_null_byte_detection,
10684                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10685 By default, Emacs treats it as binary data, and does not attempt to
10686 decode it.  The effect is as if you specified `no-conversion' for
10687 reading that text.
10688
10689 Set this to non-nil when a regular text happens to include null bytes.
10690 Examples are Index nodes of Info files and null-byte delimited output
10691 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10692 decode text as usual.  */);
10693   inhibit_null_byte_detection = 0;
10694
10695   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10696                doc: /* Char table for translating self-inserting characters.
10697 This is applied to the result of input methods, not their input.
10698 See also `keyboard-translate-table'.
10699
10700 Use of this variable for character code unification was rendered
10701 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10702 internal character representation.  */);
10703     Vtranslation_table_for_input = Qnil;
10704
10705   {
10706     Lisp_Object args[coding_arg_max];
10707     Lisp_Object plist[16];
10708     int i;
10709
10710     for (i = 0; i < coding_arg_max; i++)
10711       args[i] = Qnil;
10712
10713     plist[0] = intern_c_string (":name");
10714     plist[1] = args[coding_arg_name] = Qno_conversion;
10715     plist[2] = intern_c_string (":mnemonic");
10716     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10717     plist[4] = intern_c_string (":coding-type");
10718     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10719     plist[6] = intern_c_string (":ascii-compatible-p");
10720     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10721     plist[8] = intern_c_string (":default-char");
10722     plist[9] = args[coding_arg_default_char] = make_number (0);
10723     plist[10] = intern_c_string (":for-unibyte");
10724     plist[11] = args[coding_arg_for_unibyte] = Qt;
10725     plist[12] = intern_c_string (":docstring");
10726     plist[13] = build_pure_c_string ("Do no conversion.\n\
10727 \n\
10728 When you visit a file with this coding, the file is read into a\n\
10729 unibyte buffer as is, thus each byte of a file is treated as a\n\
10730 character.");
10731     plist[14] = intern_c_string (":eol-type");
10732     plist[15] = args[coding_arg_eol_type] = Qunix;
10733     args[coding_arg_plist] = Flist (16, plist);
10734     Fdefine_coding_system_internal (coding_arg_max, args);
10735
10736     plist[1] = args[coding_arg_name] = Qundecided;
10737     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10738     plist[5] = args[coding_arg_coding_type] = Qundecided;
10739     /* This is already set.
10740        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10741     plist[8] = intern_c_string (":charset-list");
10742     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10743     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10744     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10745     plist[15] = args[coding_arg_eol_type] = Qnil;
10746     args[coding_arg_plist] = Flist (16, plist);
10747     Fdefine_coding_system_internal (coding_arg_max, args);
10748   }
10749
10750   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10751
10752   {
10753     int i;
10754
10755     for (i = 0; i < coding_category_max; i++)
10756       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10757   }
10758 #if defined (DOS_NT)
10759   system_eol_type = Qdos;
10760 #else
10761   system_eol_type = Qunix;
10762 #endif
10763   staticpro (&system_eol_type);
10764 }
10765
10766 char *
10767 emacs_strerror (int error_number)
10768 {
10769   char *str;
10770
10771   synchronize_system_messages_locale ();
10772   str = strerror (error_number);
10773
10774   if (! NILP (Vlocale_coding_system))
10775     {
10776       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10777                                                       Vlocale_coding_system,
10778                                                       0);
10779       str = SSDATA (dec);
10780     }
10781
10782   return str;
10783 }
10784
10785 #endif /* emacs */