src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 static Lisp_Object Qmac;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 #if defined (WINDOWSNT) || defined (CYGWIN)
 348 static Lisp_Object Qutf_16le;
 349 #endif
 350
 351 /* Coding-systems are handed between Emacs Lisp programs and C internal
 352    routines by the following three variables.  */
 353 /* Coding system to be used to encode text for terminal display when
 354    terminal coding system is nil.  */
 355 struct coding_system safe_terminal_coding;
 356
 357 #endif /* emacs */
 358
 359 Lisp_Object Qtranslation_table;
 360 Lisp_Object Qtranslation_table_id;
 361 static Lisp_Object Qtranslation_table_for_decode;
 362 static Lisp_Object Qtranslation_table_for_encode;
 363
 364 /* Two special coding systems.  */
 365 static Lisp_Object Vsjis_coding_system;
 366 static Lisp_Object Vbig5_coding_system;
 367
 368 /* ISO2022 section */
 369
 370 #define CODING_ISO_INITIAL(coding, reg)                 \
 371   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 372                      coding_attr_iso_initial),          \
 373                reg)))
 374
 375
 376 #define CODING_ISO_REQUEST(coding, charset_id)          \
 377   (((charset_id) <= (coding)->max_charset_id            \
 378     ? ((coding)->safe_charsets[charset_id] != 255       \
 379        ? (coding)->safe_charsets[charset_id]            \
 380        : -1)                                            \
 381     : -1))
 382
 383
 384 #define CODING_ISO_FLAGS(coding)        \
 385   ((coding)->spec.iso_2022.flags)
 386 #define CODING_ISO_DESIGNATION(coding, reg)     \
 387   ((coding)->spec.iso_2022.current_designation[reg])
 388 #define CODING_ISO_INVOCATION(coding, plane)    \
 389   ((coding)->spec.iso_2022.current_invocation[plane])
 390 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 391   ((coding)->spec.iso_2022.single_shifting)
 392 #define CODING_ISO_BOL(coding)  \
 393   ((coding)->spec.iso_2022.bol)
 394 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 395   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 396 #define CODING_ISO_CMP_STATUS(coding)   \
 397   (&(coding)->spec.iso_2022.cmp_status)
 398 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 399   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 400 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 401   ((coding)->spec.iso_2022.embedded_utf_8)
 402
 403 /* Control characters of ISO2022.  */
 404                         /* code */      /* function */
 405 #define ISO_CODE_SO     0x0E            /* shift-out */
 406 #define ISO_CODE_SI     0x0F            /* shift-in */
 407 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 408 #define ISO_CODE_ESC    0x1B            /* escape */
 409 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 410 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 411 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 412
 413 /* All code (1-byte) of ISO2022 is classified into one of the
 414    followings.  */
 415 enum iso_code_class_type
 416   {
 417     ISO_control_0,              /* Control codes in the range
 418                                    0x00..0x1F and 0x7F, except for the
 419                                    following 5 codes.  */
 420     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 421     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 422     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 423     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 424     ISO_control_1,              /* Control codes in the range
 425                                    0x80..0x9F, except for the
 426                                    following 3 codes.  */
 427     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 428     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 429     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 430     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 431     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 432     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 433     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 434   };
 435
 436 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 437     `iso-flags' attribute of an iso2022 coding system.  */
 438
 439 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 440    instead of the correct short-form sequence (e.g. ESC $ A).  */
 441 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 442
 443 /* If set, reset graphic planes and registers at end-of-line to the
 444    initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 446
 447 /* If set, reset graphic planes and registers before any control
 448    characters to the initial state.  */
 449 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 450
 451 /* If set, encode by 7-bit environment.  */
 452 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 453
 454 /* If set, use locking-shift function.  */
 455 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 456
 457 /* If set, use single-shift function.  Overwrite
 458    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 459 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 460
 461 /* If set, use designation escape sequence.  */
 462 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 463
 464 /* If set, produce revision number sequence.  */
 465 #define CODING_ISO_FLAG_REVISION        0x0080
 466
 467 /* If set, produce ISO6429's direction specifying sequence.  */
 468 #define CODING_ISO_FLAG_DIRECTION       0x0100
 469
 470 /* If set, assume designation states are reset at beginning of line on
 471    output.  */
 472 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 473
 474 /* If set, designation sequence should be placed at beginning of line
 475    on output.  */
 476 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 477
 478 /* If set, do not encode unsafe characters on output.  */
 479 #define CODING_ISO_FLAG_SAFE            0x0800
 480
 481 /* If set, extra latin codes (128..159) are accepted as a valid code
 482    on input.  */
 483 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 484
 485 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 486
 487 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 488
 489 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 490
 491 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 492
 493 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 494
 495 /* A character to be produced on output if encoding of the original
 496    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 497 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 498
 499 /* UTF-8 section */
 500 #define CODING_UTF_8_BOM(coding)        \
 501   ((coding)->spec.utf_8_bom)
 502
 503 /* UTF-16 section */
 504 #define CODING_UTF_16_BOM(coding)       \
 505   ((coding)->spec.utf_16.bom)
 506
 507 #define CODING_UTF_16_ENDIAN(coding)    \
 508   ((coding)->spec.utf_16.endian)
 509
 510 #define CODING_UTF_16_SURROGATE(coding) \
 511   ((coding)->spec.utf_16.surrogate)
 512
 513
 514 /* CCL section */
 515 #define CODING_CCL_DECODER(coding)      \
 516   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 517 #define CODING_CCL_ENCODER(coding)      \
 518   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 519 #define CODING_CCL_VALIDS(coding)                                          \
 520   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 521
 522 /* Index for each coding category in `coding_categories' */
 523
 524 enum coding_category
 525   {
 526     coding_category_iso_7,
 527     coding_category_iso_7_tight,
 528     coding_category_iso_8_1,
 529     coding_category_iso_8_2,
 530     coding_category_iso_7_else,
 531     coding_category_iso_8_else,
 532     coding_category_utf_8_auto,
 533     coding_category_utf_8_nosig,
 534     coding_category_utf_8_sig,
 535     coding_category_utf_16_auto,
 536     coding_category_utf_16_be,
 537     coding_category_utf_16_le,
 538     coding_category_utf_16_be_nosig,
 539     coding_category_utf_16_le_nosig,
 540     coding_category_charset,
 541     coding_category_sjis,
 542     coding_category_big5,
 543     coding_category_ccl,
 544     coding_category_emacs_mule,
 545     /* All above are targets of code detection.  */
 546     coding_category_raw_text,
 547     coding_category_undecided,
 548     coding_category_max
 549   };
 550
 551 /* Definitions of flag bits used in detect_coding_XXXX.  */
 552 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 553 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 554 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 555 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 556 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 557 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 558 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 559 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 560 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 561 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 562 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 563 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 564 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 565 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 566 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 567 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 568 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 569 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 570 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 571 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 572
 573 /* This value is returned if detect_coding_mask () find nothing other
 574    than ASCII characters.  */
 575 #define CATEGORY_MASK_ANY               \
 576   (CATEGORY_MASK_ISO_7                  \
 577    | CATEGORY_MASK_ISO_7_TIGHT          \
 578    | CATEGORY_MASK_ISO_8_1              \
 579    | CATEGORY_MASK_ISO_8_2              \
 580    | CATEGORY_MASK_ISO_7_ELSE           \
 581    | CATEGORY_MASK_ISO_8_ELSE           \
 582    | CATEGORY_MASK_UTF_8_AUTO           \
 583    | CATEGORY_MASK_UTF_8_NOSIG          \
 584    | CATEGORY_MASK_UTF_8_SIG            \
 585    | CATEGORY_MASK_UTF_16_AUTO          \
 586    | CATEGORY_MASK_UTF_16_BE            \
 587    | CATEGORY_MASK_UTF_16_LE            \
 588    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 589    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 590    | CATEGORY_MASK_CHARSET              \
 591    | CATEGORY_MASK_SJIS                 \
 592    | CATEGORY_MASK_BIG5                 \
 593    | CATEGORY_MASK_CCL                  \
 594    | CATEGORY_MASK_EMACS_MULE)
 595
 596
 597 #define CATEGORY_MASK_ISO_7BIT \
 598   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 599
 600 #define CATEGORY_MASK_ISO_8BIT \
 601   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 602
 603 #define CATEGORY_MASK_ISO_ELSE \
 604   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 605
 606 #define CATEGORY_MASK_ISO_ESCAPE        \
 607   (CATEGORY_MASK_ISO_7                  \
 608    | CATEGORY_MASK_ISO_7_TIGHT          \
 609    | CATEGORY_MASK_ISO_7_ELSE           \
 610    | CATEGORY_MASK_ISO_8_ELSE)
 611
 612 #define CATEGORY_MASK_ISO       \
 613   (  CATEGORY_MASK_ISO_7BIT     \
 614      | CATEGORY_MASK_ISO_8BIT   \
 615      | CATEGORY_MASK_ISO_ELSE)
 616
 617 #define CATEGORY_MASK_UTF_16            \
 618   (CATEGORY_MASK_UTF_16_AUTO            \
 619    | CATEGORY_MASK_UTF_16_BE            \
 620    | CATEGORY_MASK_UTF_16_LE            \
 621    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 622    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 623
 624 #define CATEGORY_MASK_UTF_8     \
 625   (CATEGORY_MASK_UTF_8_AUTO     \
 626    | CATEGORY_MASK_UTF_8_NOSIG  \
 627    | CATEGORY_MASK_UTF_8_SIG)
 628
 629 /* Table of coding categories (Lisp symbols).  This variable is for
 630    internal use only.  */
 631 static Lisp_Object Vcoding_category_table;
 632
 633 /* Table of coding-categories ordered by priority.  */
 634 static enum coding_category coding_priorities[coding_category_max];
 635
 636 /* Nth element is a coding context for the coding system bound to the
 637    Nth coding category.  */
 638 static struct coding_system coding_categories[coding_category_max];
 639
 640 /*** Commonly used macros and functions ***/
 641
 642 #ifndef min
 643 #define min(a, b) ((a) < (b) ? (a) : (b))
 644 #endif
 645 #ifndef max
 646 #define max(a, b) ((a) > (b) ? (a) : (b))
 647 #endif
 648
 649 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 650   do {                                                  \
 651     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 652     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 653   } while (0)
 654
 655
 656 /* Safely get one byte from the source text pointed by SRC which ends
 657    at SRC_END, and set C to that byte.  If there are not enough bytes
 658    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 659    and a multibyte character is found at SRC, set C to the
 660    negative value of the character code.  The caller should declare
 661    and set these variables appropriately in advance:
 662         src, src_end, multibytep */
 663
 664 #define ONE_MORE_BYTE(c)                                \
 665   do {                                                  \
 666     if (src == src_end)                                 \
 667       {                                                 \
 668         if (src_base < src)                             \
 669           record_conversion_result                      \
 670             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 671         goto no_more_source;                            \
 672       }                                                 \
 673     c = *src++;                                         \
 674     if (multibytep && (c & 0x80))                       \
 675       {                                                 \
 676         if ((c & 0xFE) == 0xC0)                         \
 677           c = ((c & 1) << 6) | *src++;                  \
 678         else                                            \
 679           {                                             \
 680             src--;                                      \
 681             c = - string_char (src, &src, NULL);        \
 682             record_conversion_result                    \
 683               (coding, CODING_RESULT_INVALID_SRC);      \
 684           }                                             \
 685       }                                                 \
 686     consumed_chars++;                                   \
 687   } while (0)
 688
 689 /* Safely get two bytes from the source text pointed by SRC which ends
 690    at SRC_END, and set C1 and C2 to those bytes while skipping the
 691    heading multibyte characters.  If there are not enough bytes in the
 692    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 693    a multibyte character is found for C2, set C2 to the negative value
 694    of the character code.  The caller should declare and set these
 695    variables appropriately in advance:
 696         src, src_end, multibytep
 697    It is intended that this macro is used in detect_coding_utf_16.  */
 698
 699 #define TWO_MORE_BYTES(c1, c2)                          \
 700   do {                                                  \
 701     do {                                                \
 702       if (src == src_end)                               \
 703         goto no_more_source;                            \
 704       c1 = *src++;                                      \
 705       if (multibytep && (c1 & 0x80))                    \
 706         {                                               \
 707           if ((c1 & 0xFE) == 0xC0)                      \
 708             c1 = ((c1 & 1) << 6) | *src++;              \
 709           else                                          \
 710             {                                           \
 711               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 712               c1 = -1;                                  \
 713             }                                           \
 714         }                                               \
 715     } while (c1 < 0);                                   \
 716     if (src == src_end)                                 \
 717       goto no_more_source;                              \
 718     c2 = *src++;                                        \
 719     if (multibytep && (c2 & 0x80))                      \
 720       {                                                 \
 721         if ((c2 & 0xFE) == 0xC0)                        \
 722           c2 = ((c2 & 1) << 6) | *src++;                \
 723         else                                            \
 724           c2 = -1;                                      \
 725       }                                                 \
 726   } while (0)
 727
 728
 729 /* Store a byte C in the place pointed by DST and increment DST to the
 730    next free point, and increment PRODUCED_CHARS.  The caller should
 731    assure that C is 0..127, and declare and set the variable `dst'
 732    appropriately in advance.
 733 */
 734
 735
 736 #define EMIT_ONE_ASCII_BYTE(c)  \
 737   do {                          \
 738     produced_chars++;           \
 739     *dst++ = (c);               \
 740   } while (0)
 741
 742
 743 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 744
 745 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 746   do {                                  \
 747     produced_chars += 2;                \
 748     *dst++ = (c1), *dst++ = (c2);       \
 749   } while (0)
 750
 751
 752 /* Store a byte C in the place pointed by DST and increment DST to the
 753    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 754    store in an appropriate multibyte form.  The caller should
 755    declare and set the variables `dst' and `multibytep' appropriately
 756    in advance.  */
 757
 758 #define EMIT_ONE_BYTE(c)                \
 759   do {                                  \
 760     produced_chars++;                   \
 761     if (multibytep)                     \
 762       {                                 \
 763         unsigned ch = (c);              \
 764         if (ch >= 0x80)                 \
 765           ch = BYTE8_TO_CHAR (ch);      \
 766         CHAR_STRING_ADVANCE (ch, dst);  \
 767       }                                 \
 768     else                                \
 769       *dst++ = (c);                     \
 770   } while (0)
 771
 772
 773 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 774
 775 #define EMIT_TWO_BYTES(c1, c2)          \
 776   do {                                  \
 777     produced_chars += 2;                \
 778     if (multibytep)                     \
 779       {                                 \
 780         unsigned ch;                    \
 781                                         \
 782         ch = (c1);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786         ch = (c2);                      \
 787         if (ch >= 0x80)                 \
 788           ch = BYTE8_TO_CHAR (ch);      \
 789         CHAR_STRING_ADVANCE (ch, dst);  \
 790       }                                 \
 791     else                                \
 792       {                                 \
 793         *dst++ = (c1);                  \
 794         *dst++ = (c2);                  \
 795       }                                 \
 796   } while (0)
 797
 798
 799 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 800   do {                                  \
 801     EMIT_ONE_BYTE (c1);                 \
 802     EMIT_TWO_BYTES (c2, c3);            \
 803   } while (0)
 804
 805
 806 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 807   do {                                          \
 808     EMIT_TWO_BYTES (c1, c2);                    \
 809     EMIT_TWO_BYTES (c3, c4);                    \
 810   } while (0)
 811
 812
 813 static void
 814 record_conversion_result (struct coding_system *coding,
 815                           enum coding_result_code result)
 816 {
 817   coding->result = result;
 818   switch (result)
 819     {
 820     case CODING_RESULT_INSUFFICIENT_SRC:
 821       Vlast_code_conversion_error = Qinsufficient_source;
 822       break;
 823     case CODING_RESULT_INCONSISTENT_EOL:
 824       Vlast_code_conversion_error = Qinconsistent_eol;
 825       break;
 826     case CODING_RESULT_INVALID_SRC:
 827       Vlast_code_conversion_error = Qinvalid_source;
 828       break;
 829     case CODING_RESULT_INTERRUPT:
 830       Vlast_code_conversion_error = Qinterrupted;
 831       break;
 832     case CODING_RESULT_INSUFFICIENT_MEM:
 833       Vlast_code_conversion_error = Qinsufficient_memory;
 834       break;
 835     case CODING_RESULT_INSUFFICIENT_DST:
 836       /* Don't record this error in Vlast_code_conversion_error
 837          because it happens just temporarily and is resolved when the
 838          whole conversion is finished.  */
 839       break;
 840     case CODING_RESULT_SUCCESS:
 841       break;
 842     default:
 843       Vlast_code_conversion_error = intern ("Unknown error");
 844     }
 845 }
 846
 847 /* These wrapper macros are used to preserve validity of pointers into
 848    buffer text across calls to decode_char, encode_char, etc, which
 849    could cause relocation of buffers if it loads a charset map,
 850    because loading a charset map allocates large structures.  */
 851
 852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 853   do {                                                                       \
 854     ptrdiff_t offset;                                                        \
 855                                                                              \
 856     charset_map_loaded = 0;                                                  \
 857     c = DECODE_CHAR (charset, code);                                         \
 858     if (charset_map_loaded                                                   \
 859         && (offset = coding_change_source (coding)))                         \
 860       {                                                                      \
 861         src += offset;                                                       \
 862         src_base += offset;                                                  \
 863         src_end += offset;                                                   \
 864       }                                                                      \
 865   } while (0)
 866
 867 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 868   do {                                                                  \
 869     ptrdiff_t offset;                                                   \
 870                                                                         \
 871     charset_map_loaded = 0;                                             \
 872     code = ENCODE_CHAR (charset, c);                                    \
 873     if (charset_map_loaded                                              \
 874         && (offset = coding_change_destination (coding)))               \
 875       {                                                                 \
 876         dst += offset;                                                  \
 877         dst_end += offset;                                              \
 878       }                                                                 \
 879   } while (0)
 880
 881 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 882   do {                                                                  \
 883     ptrdiff_t offset;                                                   \
 884                                                                         \
 885     charset_map_loaded = 0;                                             \
 886     charset = char_charset (c, charset_list, code_return);              \
 887     if (charset_map_loaded                                              \
 888         && (offset = coding_change_destination (coding)))               \
 889       {                                                                 \
 890         dst += offset;                                                  \
 891         dst_end += offset;                                              \
 892       }                                                                 \
 893   } while (0)
 894
 895 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 896   do {                                                                  \
 897     ptrdiff_t offset;                                                   \
 898                                                                         \
 899     charset_map_loaded = 0;                                             \
 900     result = CHAR_CHARSET_P (c, charset);                               \
 901     if (charset_map_loaded                                              \
 902         && (offset = coding_change_destination (coding)))               \
 903       {                                                                 \
 904         dst += offset;                                                  \
 905         dst_end += offset;                                              \
 906       }                                                                 \
 907   } while (0)
 908
 909
 910 /* If there are at least BYTES length of room at dst, allocate memory
 911    for coding->destination and update dst and dst_end.  We don't have
 912    to take care of coding->source which will be relocated.  It is
 913    handled by calling coding_set_source in encode_coding.  */
 914
 915 #define ASSURE_DESTINATION(bytes)                               \
 916   do {                                                          \
 917     if (dst + (bytes) >= dst_end)                               \
 918       {                                                         \
 919         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 920                                                                 \
 921         dst = alloc_destination (coding, more_bytes, dst);      \
 922         dst_end = coding->destination + coding->dst_bytes;      \
 923       }                                                         \
 924   } while (0)
 925
 926
 927 /* Store multibyte form of the character C in P, and advance P to the
 928    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 929    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 930    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 931
 932 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 933
 934 /* Return the character code of character whose multibyte form is at
 935    P, and advance P to the end of the multibyte form.  This used to be
 936    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 937    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 938
 939 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 940
 941 /* Set coding->source from coding->src_object.  */
 942
 943 static void
 944 coding_set_source (struct coding_system *coding)
 945 {
 946   if (BUFFERP (coding->src_object))
 947     {
 948       struct buffer *buf = XBUFFER (coding->src_object);
 949
 950       if (coding->src_pos < 0)
 951         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 952       else
 953         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 954     }
 955   else if (STRINGP (coding->src_object))
 956     {
 957       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 958     }
 959   else
 960     {
 961       /* Otherwise, the source is C string and is never relocated
 962          automatically.  Thus we don't have to update anything.  */
 963     }
 964 }
 965
 966
 967 /* Set coding->source from coding->src_object, and return how many
 968    bytes coding->source was changed.  */
 969
 970 static ptrdiff_t
 971 coding_change_source (struct coding_system *coding)
 972 {
 973   const unsigned char *orig = coding->source;
 974   coding_set_source (coding);
 975   return coding->source - orig;
 976 }
 977
 978
 979 /* Set coding->destination from coding->dst_object.  */
 980
 981 static void
 982 coding_set_destination (struct coding_system *coding)
 983 {
 984   if (BUFFERP (coding->dst_object))
 985     {
 986       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 987         {
 988           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 989           coding->dst_bytes = (GAP_END_ADDR
 990                                - (coding->src_bytes - coding->consumed)
 991                                - coding->destination);
 992         }
 993       else
 994         {
 995           /* We are sure that coding->dst_pos_byte is before the gap
 996              of the buffer. */
 997           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 998                                  + coding->dst_pos_byte - BEG_BYTE);
 999           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1000                                - coding->destination);
1001         }
1002     }
1003   else
1004     {
1005       /* Otherwise, the destination is C string and is never relocated
1006          automatically.  Thus we don't have to update anything.  */
1007     }
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object, and return how
1012    many bytes coding->destination was changed.  */
1013
1014 static ptrdiff_t
1015 coding_change_destination (struct coding_system *coding)
1016 {
1017   const unsigned char *orig = coding->destination;
1018   coding_set_destination (coding);
1019   return coding->destination - orig;
1020 }
1021
1022
1023 static void
1024 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1025 {
1026   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1027     string_overflow ();
1028   coding->destination = xrealloc (coding->destination,
1029                                   coding->dst_bytes + bytes);
1030   coding->dst_bytes += bytes;
1031 }
1032
1033 static void
1034 coding_alloc_by_making_gap (struct coding_system *coding,
1035                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1036 {
1037   if (EQ (coding->src_object, coding->dst_object))
1038     {
1039       /* The gap may contain the produced data at the head and not-yet
1040          consumed data at the tail.  To preserve those data, we at
1041          first make the gap size to zero, then increase the gap
1042          size.  */
1043       ptrdiff_t add = GAP_SIZE;
1044
1045       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1046       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1047       make_gap (bytes);
1048       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1049       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1050     }
1051   else
1052     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1053 }
1054
1055
1056 static unsigned char *
1057 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1058                    unsigned char *dst)
1059 {
1060   ptrdiff_t offset = dst - coding->destination;
1061
1062   if (BUFFERP (coding->dst_object))
1063     {
1064       struct buffer *buf = XBUFFER (coding->dst_object);
1065
1066       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1067     }
1068   else
1069     coding_alloc_by_realloc (coding, nbytes);
1070   coding_set_destination (coding);
1071   dst = coding->destination + offset;
1072   return dst;
1073 }
1074
1075 /** Macros for annotations.  */
1076
1077 /* An annotation data is stored in the array coding->charbuf in this
1078    format:
1079      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1080    LENGTH is the number of elements in the annotation.
1081    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1082    NCHARS is the number of characters in the text annotated.
1083
1084    The format of the following elements depend on ANNOTATION_MASK.
1085
1086    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1087    follows:
1088      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1089
1090    NBYTES is the number of bytes specified in the header part of
1091    old-style emacs-mule encoding, or 0 for the other kind of
1092    composition.
1093
1094    METHOD is one of enum composition_method.
1095
1096    Optional COMPOSITION-COMPONENTS are characters and composition
1097    rules.
1098
1099    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1100    follows.
1101
1102    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1103    recover from an invalid annotation, and should be skipped by
1104    produce_annotation.  */
1105
1106 /* Maximum length of the header of annotation data.  */
1107 #define MAX_ANNOTATION_LENGTH 5
1108
1109 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1110   do {                                                  \
1111     *(buf)++ = -(len);                                  \
1112     *(buf)++ = (mask);                                  \
1113     *(buf)++ = (nchars);                                \
1114     coding->annotated = 1;                              \
1115   } while (0);
1116
1117 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1118   do {                                                                      \
1119     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1120     *buf++ = nbytes;                                                        \
1121     *buf++ = method;                                                        \
1122   } while (0)
1123
1124
1125 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1126   do {                                                                  \
1127     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1128     *buf++ = id;                                                        \
1129   } while (0)
1130
1131 \f
1132 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1133
1134
1135
1136 \f
1137 /*** 3. UTF-8 ***/
1138
1139 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1140    Return true if a text is encoded in UTF-8.  */
1141
1142 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1143 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1144 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1145 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1146 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1147 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1148
1149 #define UTF_8_BOM_1 0xEF
1150 #define UTF_8_BOM_2 0xBB
1151 #define UTF_8_BOM_3 0xBF
1152
1153 static bool
1154 detect_coding_utf_8 (struct coding_system *coding,
1155                      struct coding_detection_info *detect_info)
1156 {
1157   const unsigned char *src = coding->source, *src_base;
1158   const unsigned char *src_end = coding->source + coding->src_bytes;
1159   bool multibytep = coding->src_multibyte;
1160   ptrdiff_t consumed_chars = 0;
1161   bool bom_found = 0;
1162   bool found = 0;
1163
1164   detect_info->checked |= CATEGORY_MASK_UTF_8;
1165   /* A coding system of this category is always ASCII compatible.  */
1166   src += coding->head_ascii;
1167
1168   while (1)
1169     {
1170       int c, c1, c2, c3, c4;
1171
1172       src_base = src;
1173       ONE_MORE_BYTE (c);
1174       if (c < 0 || UTF_8_1_OCTET_P (c))
1175         continue;
1176       ONE_MORE_BYTE (c1);
1177       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1178         break;
1179       if (UTF_8_2_OCTET_LEADING_P (c))
1180         {
1181           found = 1;
1182           continue;
1183         }
1184       ONE_MORE_BYTE (c2);
1185       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1186         break;
1187       if (UTF_8_3_OCTET_LEADING_P (c))
1188         {
1189           found = 1;
1190           if (src_base == coding->source
1191               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1192             bom_found = 1;
1193           continue;
1194         }
1195       ONE_MORE_BYTE (c3);
1196       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1197         break;
1198       if (UTF_8_4_OCTET_LEADING_P (c))
1199         {
1200           found = 1;
1201           continue;
1202         }
1203       ONE_MORE_BYTE (c4);
1204       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1205         break;
1206       if (UTF_8_5_OCTET_LEADING_P (c))
1207         {
1208           found = 1;
1209           continue;
1210         }
1211       break;
1212     }
1213   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1214   return 0;
1215
1216  no_more_source:
1217   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1218     {
1219       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220       return 0;
1221     }
1222   if (bom_found)
1223     {
1224       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1225       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1226     }
1227   else
1228     {
1229       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1230       if (found)
1231         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1232     }
1233   return 1;
1234 }
1235
1236
1237 static void
1238 decode_coding_utf_8 (struct coding_system *coding)
1239 {
1240   const unsigned char *src = coding->source + coding->consumed;
1241   const unsigned char *src_end = coding->source + coding->src_bytes;
1242   const unsigned char *src_base;
1243   int *charbuf = coding->charbuf + coding->charbuf_used;
1244   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1245   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1246   bool multibytep = coding->src_multibyte;
1247   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1248   bool eol_dos
1249     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1250   int byte_after_cr = -1;
1251
1252   if (bom != utf_without_bom)
1253     {
1254       int c1, c2, c3;
1255
1256       src_base = src;
1257       ONE_MORE_BYTE (c1);
1258       if (! UTF_8_3_OCTET_LEADING_P (c1))
1259         src = src_base;
1260       else
1261         {
1262           ONE_MORE_BYTE (c2);
1263           if (! UTF_8_EXTRA_OCTET_P (c2))
1264             src = src_base;
1265           else
1266             {
1267               ONE_MORE_BYTE (c3);
1268               if (! UTF_8_EXTRA_OCTET_P (c3))
1269                 src = src_base;
1270               else
1271                 {
1272                   if ((c1 != UTF_8_BOM_1)
1273                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1274                     src = src_base;
1275                   else
1276                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1277                 }
1278             }
1279         }
1280     }
1281   CODING_UTF_8_BOM (coding) = utf_without_bom;
1282
1283   while (1)
1284     {
1285       int c, c1, c2, c3, c4, c5;
1286
1287       src_base = src;
1288       consumed_chars_base = consumed_chars;
1289
1290       if (charbuf >= charbuf_end)
1291         {
1292           if (byte_after_cr >= 0)
1293             src_base--;
1294           break;
1295         }
1296
1297       if (byte_after_cr >= 0)
1298         c1 = byte_after_cr, byte_after_cr = -1;
1299       else
1300         ONE_MORE_BYTE (c1);
1301       if (c1 < 0)
1302         {
1303           c = - c1;
1304         }
1305       else if (UTF_8_1_OCTET_P (c1))
1306         {
1307           if (eol_dos && c1 == '\r')
1308             ONE_MORE_BYTE (byte_after_cr);
1309           c = c1;
1310         }
1311       else
1312         {
1313           ONE_MORE_BYTE (c2);
1314           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1315             goto invalid_code;
1316           if (UTF_8_2_OCTET_LEADING_P (c1))
1317             {
1318               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1319               /* Reject overlong sequences here and below.  Encoders
1320                  producing them are incorrect, they can be misleading,
1321                  and they mess up read/write invariance.  */
1322               if (c < 128)
1323                 goto invalid_code;
1324             }
1325           else
1326             {
1327               ONE_MORE_BYTE (c3);
1328               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1329                 goto invalid_code;
1330               if (UTF_8_3_OCTET_LEADING_P (c1))
1331                 {
1332                   c = (((c1 & 0xF) << 12)
1333                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1334                   if (c < 0x800
1335                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1336                     goto invalid_code;
1337                 }
1338               else
1339                 {
1340                   ONE_MORE_BYTE (c4);
1341                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1342                     goto invalid_code;
1343                   if (UTF_8_4_OCTET_LEADING_P (c1))
1344                     {
1345                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1346                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1347                     if (c < 0x10000)
1348                       goto invalid_code;
1349                     }
1350                   else
1351                     {
1352                       ONE_MORE_BYTE (c5);
1353                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1354                         goto invalid_code;
1355                       if (UTF_8_5_OCTET_LEADING_P (c1))
1356                         {
1357                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1358                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1359                                | (c5 & 0x3F));
1360                           if ((c > MAX_CHAR) || (c < 0x200000))
1361                             goto invalid_code;
1362                         }
1363                       else
1364                         goto invalid_code;
1365                     }
1366                 }
1367             }
1368         }
1369
1370       *charbuf++ = c;
1371       continue;
1372
1373     invalid_code:
1374       src = src_base;
1375       consumed_chars = consumed_chars_base;
1376       ONE_MORE_BYTE (c);
1377       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1378       coding->errors++;
1379     }
1380
1381  no_more_source:
1382   coding->consumed_char += consumed_chars_base;
1383   coding->consumed = src_base - coding->source;
1384   coding->charbuf_used = charbuf - coding->charbuf;
1385 }
1386
1387
1388 static bool
1389 encode_coding_utf_8 (struct coding_system *coding)
1390 {
1391   bool multibytep = coding->dst_multibyte;
1392   int *charbuf = coding->charbuf;
1393   int *charbuf_end = charbuf + coding->charbuf_used;
1394   unsigned char *dst = coding->destination + coding->produced;
1395   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1396   ptrdiff_t produced_chars = 0;
1397   int c;
1398
1399   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1400     {
1401       ASSURE_DESTINATION (3);
1402       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1403       CODING_UTF_8_BOM (coding) = utf_without_bom;
1404     }
1405
1406   if (multibytep)
1407     {
1408       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1409
1410       while (charbuf < charbuf_end)
1411         {
1412           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1413
1414           ASSURE_DESTINATION (safe_room);
1415           c = *charbuf++;
1416           if (CHAR_BYTE8_P (c))
1417             {
1418               c = CHAR_TO_BYTE8 (c);
1419               EMIT_ONE_BYTE (c);
1420             }
1421           else
1422             {
1423               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1424               for (p = str; p < pend; p++)
1425                 EMIT_ONE_BYTE (*p);
1426             }
1427         }
1428     }
1429   else
1430     {
1431       int safe_room = MAX_MULTIBYTE_LENGTH;
1432
1433       while (charbuf < charbuf_end)
1434         {
1435           ASSURE_DESTINATION (safe_room);
1436           c = *charbuf++;
1437           if (CHAR_BYTE8_P (c))
1438             *dst++ = CHAR_TO_BYTE8 (c);
1439           else
1440             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1441           produced_chars++;
1442         }
1443     }
1444   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1445   coding->produced_char += produced_chars;
1446   coding->produced = dst - coding->destination;
1447   return 0;
1448 }
1449
1450
1451 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1452    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1453
1454 #define UTF_16_HIGH_SURROGATE_P(val) \
1455   (((val) & 0xFC00) == 0xD800)
1456
1457 #define UTF_16_LOW_SURROGATE_P(val) \
1458   (((val) & 0xFC00) == 0xDC00)
1459
1460
1461 static bool
1462 detect_coding_utf_16 (struct coding_system *coding,
1463                       struct coding_detection_info *detect_info)
1464 {
1465   const unsigned char *src = coding->source;
1466   const unsigned char *src_end = coding->source + coding->src_bytes;
1467   bool multibytep = coding->src_multibyte;
1468   int c1, c2;
1469
1470   detect_info->checked |= CATEGORY_MASK_UTF_16;
1471   if (coding->mode & CODING_MODE_LAST_BLOCK
1472       && (coding->src_chars & 1))
1473     {
1474       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1475       return 0;
1476     }
1477
1478   TWO_MORE_BYTES (c1, c2);
1479   if ((c1 == 0xFF) && (c2 == 0xFE))
1480     {
1481       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1482                              | CATEGORY_MASK_UTF_16_AUTO);
1483       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1484                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1485                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1486     }
1487   else if ((c1 == 0xFE) && (c2 == 0xFF))
1488     {
1489       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1490                              | CATEGORY_MASK_UTF_16_AUTO);
1491       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1492                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1493                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1494     }
1495   else if (c2 < 0)
1496     {
1497       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1498       return 0;
1499     }
1500   else
1501     {
1502       /* We check the dispersion of Eth and Oth bytes where E is even and
1503          O is odd.  If both are high, we assume binary data.*/
1504       unsigned char e[256], o[256];
1505       unsigned e_num = 1, o_num = 1;
1506
1507       memset (e, 0, 256);
1508       memset (o, 0, 256);
1509       e[c1] = 1;
1510       o[c2] = 1;
1511
1512       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1513                                 |CATEGORY_MASK_UTF_16_BE
1514                                 | CATEGORY_MASK_UTF_16_LE);
1515
1516       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1517              != CATEGORY_MASK_UTF_16)
1518         {
1519           TWO_MORE_BYTES (c1, c2);
1520           if (c2 < 0)
1521             break;
1522           if (! e[c1])
1523             {
1524               e[c1] = 1;
1525               e_num++;
1526               if (e_num >= 128)
1527                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1528             }
1529           if (! o[c2])
1530             {
1531               o[c2] = 1;
1532               o_num++;
1533               if (o_num >= 128)
1534                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1535             }
1536         }
1537       return 0;
1538     }
1539
1540  no_more_source:
1541   return 1;
1542 }
1543
1544 static void
1545 decode_coding_utf_16 (struct coding_system *coding)
1546 {
1547   const unsigned char *src = coding->source + coding->consumed;
1548   const unsigned char *src_end = coding->source + coding->src_bytes;
1549   const unsigned char *src_base;
1550   int *charbuf = coding->charbuf + coding->charbuf_used;
1551   /* We may produces at most 3 chars in one loop.  */
1552   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1553   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1554   bool multibytep = coding->src_multibyte;
1555   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1556   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1557   int surrogate = CODING_UTF_16_SURROGATE (coding);
1558   bool eol_dos
1559     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1560   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1561
1562   if (bom == utf_with_bom)
1563     {
1564       int c, c1, c2;
1565
1566       src_base = src;
1567       ONE_MORE_BYTE (c1);
1568       ONE_MORE_BYTE (c2);
1569       c = (c1 << 8) | c2;
1570
1571       if (endian == utf_16_big_endian
1572           ? c != 0xFEFF : c != 0xFFFE)
1573         {
1574           /* The first two bytes are not BOM.  Treat them as bytes
1575              for a normal character.  */
1576           src = src_base;
1577           coding->errors++;
1578         }
1579       CODING_UTF_16_BOM (coding) = utf_without_bom;
1580     }
1581   else if (bom == utf_detect_bom)
1582     {
1583       /* We have already tried to detect BOM and failed in
1584          detect_coding.  */
1585       CODING_UTF_16_BOM (coding) = utf_without_bom;
1586     }
1587
1588   while (1)
1589     {
1590       int c, c1, c2;
1591
1592       src_base = src;
1593       consumed_chars_base = consumed_chars;
1594
1595       if (charbuf >= charbuf_end)
1596         {
1597           if (byte_after_cr1 >= 0)
1598             src_base -= 2;
1599           break;
1600         }
1601
1602       if (byte_after_cr1 >= 0)
1603         c1 = byte_after_cr1, byte_after_cr1 = -1;
1604       else
1605         ONE_MORE_BYTE (c1);
1606       if (c1 < 0)
1607         {
1608           *charbuf++ = -c1;
1609           continue;
1610         }
1611       if (byte_after_cr2 >= 0)
1612         c2 = byte_after_cr2, byte_after_cr2 = -1;
1613       else
1614         ONE_MORE_BYTE (c2);
1615       if (c2 < 0)
1616         {
1617           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1618           *charbuf++ = -c2;
1619           continue;
1620         }
1621       c = (endian == utf_16_big_endian
1622            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1623
1624       if (surrogate)
1625         {
1626           if (! UTF_16_LOW_SURROGATE_P (c))
1627             {
1628               if (endian == utf_16_big_endian)
1629                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1630               else
1631                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1632               *charbuf++ = c1;
1633               *charbuf++ = c2;
1634               coding->errors++;
1635               if (UTF_16_HIGH_SURROGATE_P (c))
1636                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1637               else
1638                 *charbuf++ = c;
1639             }
1640           else
1641             {
1642               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1643               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1644               *charbuf++ = 0x10000 + c;
1645             }
1646         }
1647       else
1648         {
1649           if (UTF_16_HIGH_SURROGATE_P (c))
1650             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1651           else
1652             {
1653               if (eol_dos && c == '\r')
1654                 {
1655                   ONE_MORE_BYTE (byte_after_cr1);
1656                   ONE_MORE_BYTE (byte_after_cr2);
1657                 }
1658               *charbuf++ = c;
1659             }
1660         }
1661     }
1662
1663  no_more_source:
1664   coding->consumed_char += consumed_chars_base;
1665   coding->consumed = src_base - coding->source;
1666   coding->charbuf_used = charbuf - coding->charbuf;
1667 }
1668
1669 static bool
1670 encode_coding_utf_16 (struct coding_system *coding)
1671 {
1672   bool multibytep = coding->dst_multibyte;
1673   int *charbuf = coding->charbuf;
1674   int *charbuf_end = charbuf + coding->charbuf_used;
1675   unsigned char *dst = coding->destination + coding->produced;
1676   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1677   int safe_room = 8;
1678   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1679   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1680   ptrdiff_t produced_chars = 0;
1681   int c;
1682
1683   if (bom != utf_without_bom)
1684     {
1685       ASSURE_DESTINATION (safe_room);
1686       if (big_endian)
1687         EMIT_TWO_BYTES (0xFE, 0xFF);
1688       else
1689         EMIT_TWO_BYTES (0xFF, 0xFE);
1690       CODING_UTF_16_BOM (coding) = utf_without_bom;
1691     }
1692
1693   while (charbuf < charbuf_end)
1694     {
1695       ASSURE_DESTINATION (safe_room);
1696       c = *charbuf++;
1697       if (c > MAX_UNICODE_CHAR)
1698         c = coding->default_char;
1699
1700       if (c < 0x10000)
1701         {
1702           if (big_endian)
1703             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1704           else
1705             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1706         }
1707       else
1708         {
1709           int c1, c2;
1710
1711           c -= 0x10000;
1712           c1 = (c >> 10) + 0xD800;
1713           c2 = (c & 0x3FF) + 0xDC00;
1714           if (big_endian)
1715             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1716           else
1717             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1718         }
1719     }
1720   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1721   coding->produced = dst - coding->destination;
1722   coding->produced_char += produced_chars;
1723   return 0;
1724 }
1725
1726 \f
1727 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1728
1729 /* Emacs' internal format for representation of multiple character
1730    sets is a kind of multi-byte encoding, i.e. characters are
1731    represented by variable-length sequences of one-byte codes.
1732
1733    ASCII characters and control characters (e.g. `tab', `newline') are
1734    represented by one-byte sequences which are their ASCII codes, in
1735    the range 0x00 through 0x7F.
1736
1737    8-bit characters of the range 0x80..0x9F are represented by
1738    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1739    code + 0x20).
1740
1741    8-bit characters of the range 0xA0..0xFF are represented by
1742    one-byte sequences which are their 8-bit code.
1743
1744    The other characters are represented by a sequence of `base
1745    leading-code', optional `extended leading-code', and one or two
1746    `position-code's.  The length of the sequence is determined by the
1747    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1748    whereas extended leading-code and position-code take the range 0xA0
1749    through 0xFF.  See `charset.h' for more details about leading-code
1750    and position-code.
1751
1752    --- CODE RANGE of Emacs' internal format ---
1753    character set        range
1754    -------------        -----
1755    ascii                0x00..0x7F
1756    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1757    eight-bit-graphic    0xA0..0xBF
1758    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1759    ---------------------------------------------
1760
1761    As this is the internal character representation, the format is
1762    usually not used externally (i.e. in a file or in a data sent to a
1763    process).  But, it is possible to have a text externally in this
1764    format (i.e. by encoding by the coding system `emacs-mule').
1765
1766    In that case, a sequence of one-byte codes has a slightly different
1767    form.
1768
1769    At first, all characters in eight-bit-control are represented by
1770    one-byte sequences which are their 8-bit code.
1771
1772    Next, character composition data are represented by the byte
1773    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1774    where,
1775         METHOD is 0xF2 plus one of composition method (enum
1776         composition_method),
1777
1778         BYTES is 0xA0 plus a byte length of this composition data,
1779
1780         CHARS is 0xA0 plus a number of characters composed by this
1781         data,
1782
1783         COMPONENTs are characters of multibyte form or composition
1784         rules encoded by two-byte of ASCII codes.
1785
1786    In addition, for backward compatibility, the following formats are
1787    also recognized as composition data on decoding.
1788
1789    0x80 MSEQ ...
1790    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1791
1792    Here,
1793         MSEQ is a multibyte form but in these special format:
1794           ASCII: 0xA0 ASCII_CODE+0x80,
1795           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1796         RULE is a one byte code of the range 0xA0..0xF0 that
1797         represents a composition rule.
1798   */
1799
1800 char emacs_mule_bytes[256];
1801
1802
1803 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1804    Return true if a text is encoded in 'emacs-mule'.  */
1805
1806 static bool
1807 detect_coding_emacs_mule (struct coding_system *coding,
1808                           struct coding_detection_info *detect_info)
1809 {
1810   const unsigned char *src = coding->source, *src_base;
1811   const unsigned char *src_end = coding->source + coding->src_bytes;
1812   bool multibytep = coding->src_multibyte;
1813   ptrdiff_t consumed_chars = 0;
1814   int c;
1815   int found = 0;
1816
1817   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1818   /* A coding system of this category is always ASCII compatible.  */
1819   src += coding->head_ascii;
1820
1821   while (1)
1822     {
1823       src_base = src;
1824       ONE_MORE_BYTE (c);
1825       if (c < 0)
1826         continue;
1827       if (c == 0x80)
1828         {
1829           /* Perhaps the start of composite character.  We simply skip
1830              it because analyzing it is too heavy for detecting.  But,
1831              at least, we check that the composite character
1832              constitutes of more than 4 bytes.  */
1833           const unsigned char *src_start;
1834
1835         repeat:
1836           src_start = src;
1837           do
1838             {
1839               ONE_MORE_BYTE (c);
1840             }
1841           while (c >= 0xA0);
1842
1843           if (src - src_start <= 4)
1844             break;
1845           found = CATEGORY_MASK_EMACS_MULE;
1846           if (c == 0x80)
1847             goto repeat;
1848         }
1849
1850       if (c < 0x80)
1851         {
1852           if (c < 0x20
1853               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1854             break;
1855         }
1856       else
1857         {
1858           int more_bytes = emacs_mule_bytes[c] - 1;
1859
1860           while (more_bytes > 0)
1861             {
1862               ONE_MORE_BYTE (c);
1863               if (c < 0xA0)
1864                 {
1865                   src--;        /* Unread the last byte.  */
1866                   break;
1867                 }
1868               more_bytes--;
1869             }
1870           if (more_bytes != 0)
1871             break;
1872           found = CATEGORY_MASK_EMACS_MULE;
1873         }
1874     }
1875   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1876   return 0;
1877
1878  no_more_source:
1879   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1880     {
1881       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882       return 0;
1883     }
1884   detect_info->found |= found;
1885   return 1;
1886 }
1887
1888
1889 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1890    character.  If CMP_STATUS indicates that we must expect MSEQ or
1891    RULE described above, decode it and return the negative value of
1892    the decoded character or rule.  If an invalid byte is found, return
1893    -1.  If SRC is too short, return -2.  */
1894
1895 static int
1896 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1897                  int *nbytes, int *nchars, int *id,
1898                  struct composition_status *cmp_status)
1899 {
1900   const unsigned char *src_end = coding->source + coding->src_bytes;
1901   const unsigned char *src_base = src;
1902   bool multibytep = coding->src_multibyte;
1903   int charset_ID;
1904   unsigned code;
1905   int c;
1906   int consumed_chars = 0;
1907   bool mseq_found = 0;
1908
1909   ONE_MORE_BYTE (c);
1910   if (c < 0)
1911     {
1912       c = -c;
1913       charset_ID = emacs_mule_charset[0];
1914     }
1915   else
1916     {
1917       if (c >= 0xA0)
1918         {
1919           if (cmp_status->state != COMPOSING_NO
1920               && cmp_status->old_form)
1921             {
1922               if (cmp_status->state == COMPOSING_CHAR)
1923                 {
1924                   if (c == 0xA0)
1925                     {
1926                       ONE_MORE_BYTE (c);
1927                       c -= 0x80;
1928                       if (c < 0)
1929                         goto invalid_code;
1930                     }
1931                   else
1932                     c -= 0x20;
1933                   mseq_found = 1;
1934                 }
1935               else
1936                 {
1937                   *nbytes = src - src_base;
1938                   *nchars = consumed_chars;
1939                   return -c;
1940                 }
1941             }
1942           else
1943             goto invalid_code;
1944         }
1945
1946       switch (emacs_mule_bytes[c])
1947         {
1948         case 2:
1949           if ((charset_ID = emacs_mule_charset[c]) < 0)
1950             goto invalid_code;
1951           ONE_MORE_BYTE (c);
1952           if (c < 0xA0)
1953             goto invalid_code;
1954           code = c & 0x7F;
1955           break;
1956
1957         case 3:
1958           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1959               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1960             {
1961               ONE_MORE_BYTE (c);
1962               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1963                 goto invalid_code;
1964               ONE_MORE_BYTE (c);
1965               if (c < 0xA0)
1966                 goto invalid_code;
1967               code = c & 0x7F;
1968             }
1969           else
1970             {
1971               if ((charset_ID = emacs_mule_charset[c]) < 0)
1972                 goto invalid_code;
1973               ONE_MORE_BYTE (c);
1974               if (c < 0xA0)
1975                 goto invalid_code;
1976               code = (c & 0x7F) << 8;
1977               ONE_MORE_BYTE (c);
1978               if (c < 0xA0)
1979                 goto invalid_code;
1980               code |= c & 0x7F;
1981             }
1982           break;
1983
1984         case 4:
1985           ONE_MORE_BYTE (c);
1986           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1987             goto invalid_code;
1988           ONE_MORE_BYTE (c);
1989           if (c < 0xA0)
1990             goto invalid_code;
1991           code = (c & 0x7F) << 8;
1992           ONE_MORE_BYTE (c);
1993           if (c < 0xA0)
1994             goto invalid_code;
1995           code |= c & 0x7F;
1996           break;
1997
1998         case 1:
1999           code = c;
2000           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2001           break;
2002
2003         default:
2004           emacs_abort ();
2005         }
2006       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2007                           CHARSET_FROM_ID (charset_ID), code, c);
2008       if (c < 0)
2009         goto invalid_code;
2010     }
2011   *nbytes = src - src_base;
2012   *nchars = consumed_chars;
2013   if (id)
2014     *id = charset_ID;
2015   return (mseq_found ? -c : c);
2016
2017  no_more_source:
2018   return -2;
2019
2020  invalid_code:
2021   return -1;
2022 }
2023
2024
2025 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2026
2027 /* Handle these composition sequence ('|': the end of header elements,
2028    BYTES and CHARS >= 0xA0):
2029
2030    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2031    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2032    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2033
2034    and these old form:
2035
2036    (4) relative composition: 0x80 | MSEQ ... MSEQ
2037    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2038
2039    When the starter 0x80 and the following header elements are found,
2040    this annotation header is produced.
2041
2042         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2043
2044    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2045    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2046
2047    Then, upon reading the following elements, these codes are produced
2048    until the composition end is found:
2049
2050    (1) CHAR ... CHAR
2051    (2) ALT ... ALT CHAR ... CHAR
2052    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2053    (4) CHAR ... CHAR
2054    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2055
2056    When the composition end is found, LENGTH and NCHARS in the
2057    annotation header is updated as below:
2058
2059    (1) LENGTH: unchanged, NCHARS: unchanged
2060    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2061    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2062    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2063    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2064
2065    If an error is found while composing, the annotation header is
2066    changed to the original composition header (plus filler -1s) as
2067    below:
2068
2069    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2070    (5)          [ 0x80 0xFF -1 -1- -1 ]
2071
2072    and the sequence [ -2 DECODED-RULE ] is changed to the original
2073    byte sequence as below:
2074         o the original byte sequence is B: [ B -1 ]
2075         o the original byte sequence is B1 B2: [ B1 B2 ]
2076
2077    Most of the routines are implemented by macros because many
2078    variables and labels in the caller decode_coding_emacs_mule must be
2079    accessible, and they are usually called just once (thus doesn't
2080    increase the size of compiled object).  */
2081
2082 /* Decode a composition rule represented by C as a component of
2083    composition sequence of Emacs 20 style.  Set RULE to the decoded
2084    rule. */
2085
2086 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2087   do {                                                  \
2088     int gref, nref;                                     \
2089                                                         \
2090     c -= 0xA0;                                          \
2091     if (c < 0 || c >= 81)                               \
2092       goto invalid_code;                                \
2093     gref = c / 9, nref = c % 9;                         \
2094     if (gref == 4) gref = 10;                           \
2095     if (nref == 4) nref = 10;                           \
2096     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2097   } while (0)
2098
2099
2100 /* Decode a composition rule represented by C and the following byte
2101    at SRC as a component of composition sequence of Emacs 21 style.
2102    Set RULE to the decoded rule.  */
2103
2104 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2105   do {                                                  \
2106     int gref, nref;                                     \
2107                                                         \
2108     gref = c - 0x20;                                    \
2109     if (gref < 0 || gref >= 81)                         \
2110       goto invalid_code;                                \
2111     ONE_MORE_BYTE (c);                                  \
2112     nref = c - 0x20;                                    \
2113     if (nref < 0 || nref >= 81)                         \
2114       goto invalid_code;                                \
2115     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2116   } while (0)
2117
2118
2119 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2120    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2121    byte length of this composition information, CHARS is the number of
2122    characters composed by this composition.  */
2123
2124 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2125   do {                                                                  \
2126     enum composition_method method = c - 0xF2;                          \
2127     int nbytes, nchars;                                                 \
2128                                                                         \
2129     ONE_MORE_BYTE (c);                                                  \
2130     if (c < 0)                                                          \
2131       goto invalid_code;                                                \
2132     nbytes = c - 0xA0;                                                  \
2133     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2134       goto invalid_code;                                                \
2135     ONE_MORE_BYTE (c);                                                  \
2136     nchars = c - 0xA0;                                                  \
2137     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2138       goto invalid_code;                                                \
2139     cmp_status->old_form = 0;                                           \
2140     cmp_status->method = method;                                        \
2141     if (method == COMPOSITION_RELATIVE)                                 \
2142       cmp_status->state = COMPOSING_CHAR;                               \
2143     else                                                                \
2144       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2145     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2146     cmp_status->nchars = nchars;                                        \
2147     cmp_status->ncomps = nbytes - 4;                                    \
2148     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2149   } while (0)
2150
2151
2152 /* Start of Emacs 20 style format for relative composition.  */
2153
2154 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2155   do {                                                          \
2156     cmp_status->old_form = 1;                                   \
2157     cmp_status->method = COMPOSITION_RELATIVE;                  \
2158     cmp_status->state = COMPOSING_CHAR;                         \
2159     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2160     cmp_status->nchars = cmp_status->ncomps = 0;                \
2161     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2162   } while (0)
2163
2164
2165 /* Start of Emacs 20 style format for rule-base composition.  */
2166
2167 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2168   do {                                                          \
2169     cmp_status->old_form = 1;                                   \
2170     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2171     cmp_status->state = COMPOSING_CHAR;                         \
2172     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2173     cmp_status->nchars = cmp_status->ncomps = 0;                \
2174     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2175   } while (0)
2176
2177
2178 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2179   do {                                                  \
2180     const unsigned char *current_src = src;             \
2181                                                         \
2182     ONE_MORE_BYTE (c);                                  \
2183     if (c < 0)                                          \
2184       goto invalid_code;                                \
2185     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2186         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2187       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2188     else if (c < 0xA0)                                  \
2189       goto invalid_code;                                \
2190     else if (c < 0xC0)                                  \
2191       {                                                 \
2192         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2193         /* Re-read C as a composition component.  */    \
2194         src = current_src;                              \
2195       }                                                 \
2196     else if (c == 0xFF)                                 \
2197       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2198     else                                                \
2199       goto invalid_code;                                \
2200   } while (0)
2201
2202 #define EMACS_MULE_COMPOSITION_END()                            \
2203   do {                                                          \
2204     int idx = - cmp_status->length;                             \
2205                                                                 \
2206     if (cmp_status->old_form)                                   \
2207       charbuf[idx + 2] = cmp_status->nchars;                    \
2208     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2209       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2210     cmp_status->state = COMPOSING_NO;                           \
2211   } while (0)
2212
2213
2214 static int
2215 emacs_mule_finish_composition (int *charbuf,
2216                                struct composition_status *cmp_status)
2217 {
2218   int idx = - cmp_status->length;
2219   int new_chars;
2220
2221   if (cmp_status->old_form && cmp_status->nchars > 0)
2222     {
2223       charbuf[idx + 2] = cmp_status->nchars;
2224       new_chars = 0;
2225       if (cmp_status->method == COMPOSITION_WITH_RULE
2226           && cmp_status->state == COMPOSING_CHAR)
2227         {
2228           /* The last rule was invalid.  */
2229           int rule = charbuf[-1] + 0xA0;
2230
2231           charbuf[-2] = BYTE8_TO_CHAR (rule);
2232           charbuf[-1] = -1;
2233           new_chars = 1;
2234         }
2235     }
2236   else
2237     {
2238       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2239
2240       if (cmp_status->method == COMPOSITION_WITH_RULE)
2241         {
2242           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2243           charbuf[idx++] = -3;
2244           charbuf[idx++] = 0;
2245           new_chars = 1;
2246         }
2247       else
2248         {
2249           int nchars = charbuf[idx + 1] + 0xA0;
2250           int nbytes = charbuf[idx + 2] + 0xA0;
2251
2252           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2253           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2254           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2255           charbuf[idx++] = -1;
2256           new_chars = 4;
2257         }
2258     }
2259   cmp_status->state = COMPOSING_NO;
2260   return new_chars;
2261 }
2262
2263 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2264   do {                                                                    \
2265     if (cmp_status->state != COMPOSING_NO)                                \
2266       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2267   } while (0)
2268
2269
2270 static void
2271 decode_coding_emacs_mule (struct coding_system *coding)
2272 {
2273   const unsigned char *src = coding->source + coding->consumed;
2274   const unsigned char *src_end = coding->source + coding->src_bytes;
2275   const unsigned char *src_base;
2276   int *charbuf = coding->charbuf + coding->charbuf_used;
2277   /* We may produce two annotations (charset and composition) in one
2278      loop and one more charset annotation at the end.  */
2279   int *charbuf_end
2280     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2281       /* We can produce up to 2 characters in a loop.  */
2282       - 1;
2283   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2284   bool multibytep = coding->src_multibyte;
2285   ptrdiff_t char_offset = coding->produced_char;
2286   ptrdiff_t last_offset = char_offset;
2287   int last_id = charset_ascii;
2288   bool eol_dos
2289     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2290   int byte_after_cr = -1;
2291   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2292
2293   if (cmp_status->state != COMPOSING_NO)
2294     {
2295       int i;
2296
2297       if (charbuf_end - charbuf < cmp_status->length)
2298         emacs_abort ();
2299       for (i = 0; i < cmp_status->length; i++)
2300         *charbuf++ = cmp_status->carryover[i];
2301       coding->annotated = 1;
2302     }
2303
2304   while (1)
2305     {
2306       int c, id IF_LINT (= 0);
2307
2308       src_base = src;
2309       consumed_chars_base = consumed_chars;
2310
2311       if (charbuf >= charbuf_end)
2312         {
2313           if (byte_after_cr >= 0)
2314             src_base--;
2315           break;
2316         }
2317
2318       if (byte_after_cr >= 0)
2319         c = byte_after_cr, byte_after_cr = -1;
2320       else
2321         ONE_MORE_BYTE (c);
2322
2323       if (c < 0 || c == 0x80)
2324         {
2325           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2326           if (c < 0)
2327             {
2328               *charbuf++ = -c;
2329               char_offset++;
2330             }
2331           else
2332             DECODE_EMACS_MULE_COMPOSITION_START ();
2333           continue;
2334         }
2335
2336       if (c < 0x80)
2337         {
2338           if (eol_dos && c == '\r')
2339             ONE_MORE_BYTE (byte_after_cr);
2340           id = charset_ascii;
2341           if (cmp_status->state != COMPOSING_NO)
2342             {
2343               if (cmp_status->old_form)
2344                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2345               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2346                 cmp_status->ncomps--;
2347             }
2348         }
2349       else
2350         {
2351           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2352           /* emacs_mule_char can load a charset map from a file, which
2353              allocates a large structure and might cause buffer text
2354              to be relocated as result.  Thus, we need to remember the
2355              original pointer to buffer text, and fix up all related
2356              pointers after the call.  */
2357           const unsigned char *orig = coding->source;
2358           ptrdiff_t offset;
2359
2360           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2361                                cmp_status);
2362           offset = coding->source - orig;
2363           if (offset)
2364             {
2365               src += offset;
2366               src_base += offset;
2367               src_end += offset;
2368             }
2369           if (c < 0)
2370             {
2371               if (c == -1)
2372                 goto invalid_code;
2373               if (c == -2)
2374                 break;
2375             }
2376           src = src_base + nbytes;
2377           consumed_chars = consumed_chars_base + nchars;
2378           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2379             cmp_status->ncomps -= nchars;
2380         }
2381
2382       /* Now if C >= 0, we found a normally encoded character, if C <
2383          0, we found an old-style composition component character or
2384          rule.  */
2385
2386       if (cmp_status->state == COMPOSING_NO)
2387         {
2388           if (last_id != id)
2389             {
2390               if (last_id != charset_ascii)
2391                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2392                                   last_id);
2393               last_id = id;
2394               last_offset = char_offset;
2395             }
2396           *charbuf++ = c;
2397           char_offset++;
2398         }
2399       else if (cmp_status->state == COMPOSING_CHAR)
2400         {
2401           if (cmp_status->old_form)
2402             {
2403               if (c >= 0)
2404                 {
2405                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2406                   *charbuf++ = c;
2407                   char_offset++;
2408                 }
2409               else
2410                 {
2411                   *charbuf++ = -c;
2412                   cmp_status->nchars++;
2413                   cmp_status->length++;
2414                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2415                     EMACS_MULE_COMPOSITION_END ();
2416                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2417                     cmp_status->state = COMPOSING_RULE;
2418                 }
2419             }
2420           else
2421             {
2422               *charbuf++ = c;
2423               cmp_status->length++;
2424               cmp_status->nchars--;
2425               if (cmp_status->nchars == 0)
2426                 EMACS_MULE_COMPOSITION_END ();
2427             }
2428         }
2429       else if (cmp_status->state == COMPOSING_RULE)
2430         {
2431           int rule;
2432
2433           if (c >= 0)
2434             {
2435               EMACS_MULE_COMPOSITION_END ();
2436               *charbuf++ = c;
2437               char_offset++;
2438             }
2439           else
2440             {
2441               c = -c;
2442               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2443               if (rule < 0)
2444                 goto invalid_code;
2445               *charbuf++ = -2;
2446               *charbuf++ = rule;
2447               cmp_status->length += 2;
2448               cmp_status->state = COMPOSING_CHAR;
2449             }
2450         }
2451       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2452         {
2453           *charbuf++ = c;
2454           cmp_status->length++;
2455           if (cmp_status->ncomps == 0)
2456             cmp_status->state = COMPOSING_CHAR;
2457           else if (cmp_status->ncomps > 0)
2458             {
2459               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2460                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2461             }
2462           else
2463             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464         }
2465       else                      /* COMPOSING_COMPONENT_RULE */
2466         {
2467           int rule;
2468
2469           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2470           if (rule < 0)
2471             goto invalid_code;
2472           *charbuf++ = -2;
2473           *charbuf++ = rule;
2474           cmp_status->length += 2;
2475           cmp_status->ncomps--;
2476           if (cmp_status->ncomps > 0)
2477             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2478           else
2479             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2480         }
2481       continue;
2482
2483     invalid_code:
2484       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2485       src = src_base;
2486       consumed_chars = consumed_chars_base;
2487       ONE_MORE_BYTE (c);
2488       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2489       char_offset++;
2490       coding->errors++;
2491     }
2492
2493  no_more_source:
2494   if (cmp_status->state != COMPOSING_NO)
2495     {
2496       if (coding->mode & CODING_MODE_LAST_BLOCK)
2497         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2498       else
2499         {
2500           int i;
2501
2502           charbuf -= cmp_status->length;
2503           for (i = 0; i < cmp_status->length; i++)
2504             cmp_status->carryover[i] = charbuf[i];
2505         }
2506     }
2507   if (last_id != charset_ascii)
2508     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2509   coding->consumed_char += consumed_chars_base;
2510   coding->consumed = src_base - coding->source;
2511   coding->charbuf_used = charbuf - coding->charbuf;
2512 }
2513
2514
2515 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2516   do {                                          \
2517     if (id < 0xA0)                              \
2518       codes[0] = id, codes[1] = 0;              \
2519     else if (id < 0xE0)                         \
2520       codes[0] = 0x9A, codes[1] = id;           \
2521     else if (id < 0xF0)                         \
2522       codes[0] = 0x9B, codes[1] = id;           \
2523     else if (id < 0xF5)                         \
2524       codes[0] = 0x9C, codes[1] = id;           \
2525     else                                        \
2526       codes[0] = 0x9D, codes[1] = id;           \
2527   } while (0);
2528
2529
2530 static bool
2531 encode_coding_emacs_mule (struct coding_system *coding)
2532 {
2533   bool multibytep = coding->dst_multibyte;
2534   int *charbuf = coding->charbuf;
2535   int *charbuf_end = charbuf + coding->charbuf_used;
2536   unsigned char *dst = coding->destination + coding->produced;
2537   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2538   int safe_room = 8;
2539   ptrdiff_t produced_chars = 0;
2540   Lisp_Object attrs, charset_list;
2541   int c;
2542   int preferred_charset_id = -1;
2543
2544   CODING_GET_INFO (coding, attrs, charset_list);
2545   if (! EQ (charset_list, Vemacs_mule_charset_list))
2546     {
2547       charset_list = Vemacs_mule_charset_list;
2548       ASET (attrs, coding_attr_charset_list, charset_list);
2549     }
2550
2551   while (charbuf < charbuf_end)
2552     {
2553       ASSURE_DESTINATION (safe_room);
2554       c = *charbuf++;
2555
2556       if (c < 0)
2557         {
2558           /* Handle an annotation.  */
2559           switch (*charbuf)
2560             {
2561             case CODING_ANNOTATE_COMPOSITION_MASK:
2562               /* Not yet implemented.  */
2563               break;
2564             case CODING_ANNOTATE_CHARSET_MASK:
2565               preferred_charset_id = charbuf[3];
2566               if (preferred_charset_id >= 0
2567                   && NILP (Fmemq (make_number (preferred_charset_id),
2568                                   charset_list)))
2569                 preferred_charset_id = -1;
2570               break;
2571             default:
2572               emacs_abort ();
2573             }
2574           charbuf += -c - 1;
2575           continue;
2576         }
2577
2578       if (ASCII_CHAR_P (c))
2579         EMIT_ONE_ASCII_BYTE (c);
2580       else if (CHAR_BYTE8_P (c))
2581         {
2582           c = CHAR_TO_BYTE8 (c);
2583           EMIT_ONE_BYTE (c);
2584         }
2585       else
2586         {
2587           struct charset *charset;
2588           unsigned code;
2589           int dimension;
2590           int emacs_mule_id;
2591           unsigned char leading_codes[2];
2592
2593           if (preferred_charset_id >= 0)
2594             {
2595               bool result;
2596
2597               charset = CHARSET_FROM_ID (preferred_charset_id);
2598               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2599               if (result)
2600                 code = ENCODE_CHAR (charset, c);
2601               else
2602                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2603                                      &code, charset);
2604             }
2605           else
2606             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2607                                  &code, charset);
2608           if (! charset)
2609             {
2610               c = coding->default_char;
2611               if (ASCII_CHAR_P (c))
2612                 {
2613                   EMIT_ONE_ASCII_BYTE (c);
2614                   continue;
2615                 }
2616               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2617                                    &code, charset);
2618             }
2619           dimension = CHARSET_DIMENSION (charset);
2620           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2621           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2622           EMIT_ONE_BYTE (leading_codes[0]);
2623           if (leading_codes[1])
2624             EMIT_ONE_BYTE (leading_codes[1]);
2625           if (dimension == 1)
2626             EMIT_ONE_BYTE (code | 0x80);
2627           else
2628             {
2629               code |= 0x8080;
2630               EMIT_ONE_BYTE (code >> 8);
2631               EMIT_ONE_BYTE (code & 0xFF);
2632             }
2633         }
2634     }
2635   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2636   coding->produced_char += produced_chars;
2637   coding->produced = dst - coding->destination;
2638   return 0;
2639 }
2640
2641 \f
2642 /*** 7. ISO2022 handlers ***/
2643
2644 /* The following note describes the coding system ISO2022 briefly.
2645    Since the intention of this note is to help understand the
2646    functions in this file, some parts are NOT ACCURATE or are OVERLY
2647    SIMPLIFIED.  For thorough understanding, please refer to the
2648    original document of ISO2022.  This is equivalent to the standard
2649    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2650
2651    ISO2022 provides many mechanisms to encode several character sets
2652    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2653    is encoded using bytes less than 128.  This may make the encoded
2654    text a little bit longer, but the text passes more easily through
2655    several types of gateway, some of which strip off the MSB (Most
2656    Significant Bit).
2657
2658    There are two kinds of character sets: control character sets and
2659    graphic character sets.  The former contain control characters such
2660    as `newline' and `escape' to provide control functions (control
2661    functions are also provided by escape sequences).  The latter
2662    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2663    two control character sets and many graphic character sets.
2664
2665    Graphic character sets are classified into one of the following
2666    four classes, according to the number of bytes (DIMENSION) and
2667    number of characters in one dimension (CHARS) of the set:
2668    - DIMENSION1_CHARS94
2669    - DIMENSION1_CHARS96
2670    - DIMENSION2_CHARS94
2671    - DIMENSION2_CHARS96
2672
2673    In addition, each character set is assigned an identification tag,
2674    unique for each set, called the "final character" (denoted as <F>
2675    hereafter).  The <F> of each character set is decided by ECMA(*)
2676    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2677    (0x30..0x3F are for private use only).
2678
2679    Note (*): ECMA = European Computer Manufacturers Association
2680
2681    Here are examples of graphic character sets [NAME(<F>)]:
2682         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2683         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2684         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2685         o DIMENSION2_CHARS96 -- none for the moment
2686
2687    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2688         C0 [0x00..0x1F] -- control character plane 0
2689         GL [0x20..0x7F] -- graphic character plane 0
2690         C1 [0x80..0x9F] -- control character plane 1
2691         GR [0xA0..0xFF] -- graphic character plane 1
2692
2693    A control character set is directly designated and invoked to C0 or
2694    C1 by an escape sequence.  The most common case is that:
2695    - ISO646's  control character set is designated/invoked to C0, and
2696    - ISO6429's control character set is designated/invoked to C1,
2697    and usually these designations/invocations are omitted in encoded
2698    text.  In a 7-bit environment, only C0 can be used, and a control
2699    character for C1 is encoded by an appropriate escape sequence to
2700    fit into the environment.  All control characters for C1 are
2701    defined to have corresponding escape sequences.
2702
2703    A graphic character set is at first designated to one of four
2704    graphic registers (G0 through G3), then these graphic registers are
2705    invoked to GL or GR.  These designations and invocations can be
2706    done independently.  The most common case is that G0 is invoked to
2707    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2708    these invocations and designations are omitted in encoded text.
2709    In a 7-bit environment, only GL can be used.
2710
2711    When a graphic character set of CHARS94 is invoked to GL, codes
2712    0x20 and 0x7F of the GL area work as control characters SPACE and
2713    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2714    be used.
2715
2716    There are two ways of invocation: locking-shift and single-shift.
2717    With locking-shift, the invocation lasts until the next different
2718    invocation, whereas with single-shift, the invocation affects the
2719    following character only and doesn't affect the locking-shift
2720    state.  Invocations are done by the following control characters or
2721    escape sequences:
2722
2723    ----------------------------------------------------------------------
2724    abbrev  function                  cntrl escape seq   description
2725    ----------------------------------------------------------------------
2726    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2727    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2728    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2729    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2730    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2731    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2732    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2733    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2734    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2735    ----------------------------------------------------------------------
2736    (*) These are not used by any known coding system.
2737
2738    Control characters for these functions are defined by macros
2739    ISO_CODE_XXX in `coding.h'.
2740
2741    Designations are done by the following escape sequences:
2742    ----------------------------------------------------------------------
2743    escape sequence      description
2744    ----------------------------------------------------------------------
2745    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2746    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2747    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2748    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2749    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2750    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2751    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2752    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2753    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2754    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2755    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2756    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2757    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2758    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2759    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2760    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2761    ----------------------------------------------------------------------
2762
2763    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2764    of dimension 1, chars 94, and final character <F>, etc...
2765
2766    Note (*): Although these designations are not allowed in ISO2022,
2767    Emacs accepts them on decoding, and produces them on encoding
2768    CHARS96 character sets in a coding system which is characterized as
2769    7-bit environment, non-locking-shift, and non-single-shift.
2770
2771    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2772    '(' must be omitted.  We refer to this as "short-form" hereafter.
2773
2774    Now you may notice that there are a lot of ways of encoding the
2775    same multilingual text in ISO2022.  Actually, there exist many
2776    coding systems such as Compound Text (used in X11's inter client
2777    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2778    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2779    localized platforms), and all of these are variants of ISO2022.
2780
2781    In addition to the above, Emacs handles two more kinds of escape
2782    sequences: ISO6429's direction specification and Emacs' private
2783    sequence for specifying character composition.
2784
2785    ISO6429's direction specification takes the following form:
2786         o CSI ']'      -- end of the current direction
2787         o CSI '0' ']'  -- end of the current direction
2788         o CSI '1' ']'  -- start of left-to-right text
2789         o CSI '2' ']'  -- start of right-to-left text
2790    The control character CSI (0x9B: control sequence introducer) is
2791    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2792
2793    Character composition specification takes the following form:
2794         o ESC '0' -- start relative composition
2795         o ESC '1' -- end composition
2796         o ESC '2' -- start rule-base composition (*)
2797         o ESC '3' -- start relative composition with alternate chars  (**)
2798         o ESC '4' -- start rule-base composition with alternate chars  (**)
2799   Since these are not standard escape sequences of any ISO standard,
2800   the use of them with these meanings is restricted to Emacs only.
2801
2802   (*) This form is used only in Emacs 20.7 and older versions,
2803   but newer versions can safely decode it.
2804   (**) This form is used only in Emacs 21.1 and newer versions,
2805   and older versions can't decode it.
2806
2807   Here's a list of example usages of these composition escape
2808   sequences (categorized by `enum composition_method').
2809
2810   COMPOSITION_RELATIVE:
2811         ESC 0 CHAR [ CHAR ] ESC 1
2812   COMPOSITION_WITH_RULE:
2813         ESC 2 CHAR [ RULE CHAR ] ESC 1
2814   COMPOSITION_WITH_ALTCHARS:
2815         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2816   COMPOSITION_WITH_RULE_ALTCHARS:
2817         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2818
2819 static enum iso_code_class_type iso_code_class[256];
2820
2821 #define SAFE_CHARSET_P(coding, id)      \
2822   ((id) <= (coding)->max_charset_id     \
2823    && (coding)->safe_charsets[id] != 255)
2824
2825 static void
2826 setup_iso_safe_charsets (Lisp_Object attrs)
2827 {
2828   Lisp_Object charset_list, safe_charsets;
2829   Lisp_Object request;
2830   Lisp_Object reg_usage;
2831   Lisp_Object tail;
2832   EMACS_INT reg94, reg96;
2833   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2834   int max_charset_id;
2835
2836   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2837   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2838       && ! EQ (charset_list, Viso_2022_charset_list))
2839     {
2840       charset_list = Viso_2022_charset_list;
2841       ASET (attrs, coding_attr_charset_list, charset_list);
2842       ASET (attrs, coding_attr_safe_charsets, Qnil);
2843     }
2844
2845   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2846     return;
2847
2848   max_charset_id = 0;
2849   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2850     {
2851       int id = XINT (XCAR (tail));
2852       if (max_charset_id < id)
2853         max_charset_id = id;
2854     }
2855
2856   safe_charsets = make_uninit_string (max_charset_id + 1);
2857   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2858   request = AREF (attrs, coding_attr_iso_request);
2859   reg_usage = AREF (attrs, coding_attr_iso_usage);
2860   reg94 = XINT (XCAR (reg_usage));
2861   reg96 = XINT (XCDR (reg_usage));
2862
2863   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2864     {
2865       Lisp_Object id;
2866       Lisp_Object reg;
2867       struct charset *charset;
2868
2869       id = XCAR (tail);
2870       charset = CHARSET_FROM_ID (XINT (id));
2871       reg = Fcdr (Fassq (id, request));
2872       if (! NILP (reg))
2873         SSET (safe_charsets, XINT (id), XINT (reg));
2874       else if (charset->iso_chars_96)
2875         {
2876           if (reg96 < 4)
2877             SSET (safe_charsets, XINT (id), reg96);
2878         }
2879       else
2880         {
2881           if (reg94 < 4)
2882             SSET (safe_charsets, XINT (id), reg94);
2883         }
2884     }
2885   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2886 }
2887
2888
2889 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2890    Return true if a text is encoded in one of ISO-2022 based coding
2891    systems.  */
2892
2893 static bool
2894 detect_coding_iso_2022 (struct coding_system *coding,
2895                         struct coding_detection_info *detect_info)
2896 {
2897   const unsigned char *src = coding->source, *src_base = src;
2898   const unsigned char *src_end = coding->source + coding->src_bytes;
2899   bool multibytep = coding->src_multibyte;
2900   bool single_shifting = 0;
2901   int id;
2902   int c, c1;
2903   ptrdiff_t consumed_chars = 0;
2904   int i;
2905   int rejected = 0;
2906   int found = 0;
2907   int composition_count = -1;
2908
2909   detect_info->checked |= CATEGORY_MASK_ISO;
2910
2911   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2912     {
2913       struct coding_system *this = &(coding_categories[i]);
2914       Lisp_Object attrs, val;
2915
2916       if (this->id < 0)
2917         continue;
2918       attrs = CODING_ID_ATTRS (this->id);
2919       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2920           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2921         setup_iso_safe_charsets (attrs);
2922       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2923       this->max_charset_id = SCHARS (val) - 1;
2924       this->safe_charsets = SDATA (val);
2925     }
2926
2927   /* A coding system of this category is always ASCII compatible.  */
2928   src += coding->head_ascii;
2929
2930   while (rejected != CATEGORY_MASK_ISO)
2931     {
2932       src_base = src;
2933       ONE_MORE_BYTE (c);
2934       switch (c)
2935         {
2936         case ISO_CODE_ESC:
2937           if (inhibit_iso_escape_detection)
2938             break;
2939           single_shifting = 0;
2940           ONE_MORE_BYTE (c);
2941           if (c == 'N' || c == 'O')
2942             {
2943               /* ESC <Fe> for SS2 or SS3.  */
2944               single_shifting = 1;
2945               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2946             }
2947           else if (c == '1')
2948             {
2949               /* End of composition.  */
2950               if (composition_count < 0
2951                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2952                 /* Invalid */
2953                 break;
2954               composition_count = -1;
2955               found |= CATEGORY_MASK_ISO;
2956             }
2957           else if (c >= '0' && c <= '4')
2958             {
2959               /* ESC <Fp> for start/end composition.  */
2960               composition_count = 0;
2961             }
2962           else
2963             {
2964               if (c >= '(' && c <= '/')
2965                 {
2966                   /* Designation sequence for a charset of dimension 1.  */
2967                   ONE_MORE_BYTE (c1);
2968                   if (c1 < ' ' || c1 >= 0x80
2969                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2970                     /* Invalid designation sequence.  Just ignore.  */
2971                     break;
2972                 }
2973               else if (c == '$')
2974                 {
2975                   /* Designation sequence for a charset of dimension 2.  */
2976                   ONE_MORE_BYTE (c);
2977                   if (c >= '@' && c <= 'B')
2978                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2979                     id = iso_charset_table[1][0][c];
2980                   else if (c >= '(' && c <= '/')
2981                     {
2982                       ONE_MORE_BYTE (c1);
2983                       if (c1 < ' ' || c1 >= 0x80
2984                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2985                         /* Invalid designation sequence.  Just ignore.  */
2986                         break;
2987                     }
2988                   else
2989                     /* Invalid designation sequence.  Just ignore it.  */
2990                     break;
2991                 }
2992               else
2993                 {
2994                   /* Invalid escape sequence.  Just ignore it.  */
2995                   break;
2996                 }
2997
2998               /* We found a valid designation sequence for CHARSET.  */
2999               rejected |= CATEGORY_MASK_ISO_8BIT;
3000               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3001                                   id))
3002                 found |= CATEGORY_MASK_ISO_7;
3003               else
3004                 rejected |= CATEGORY_MASK_ISO_7;
3005               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3006                                   id))
3007                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3008               else
3009                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3010               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3011                                   id))
3012                 found |= CATEGORY_MASK_ISO_7_ELSE;
3013               else
3014                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3015               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3016                                   id))
3017                 found |= CATEGORY_MASK_ISO_8_ELSE;
3018               else
3019                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3020             }
3021           break;
3022
3023         case ISO_CODE_SO:
3024         case ISO_CODE_SI:
3025           /* Locking shift out/in.  */
3026           if (inhibit_iso_escape_detection)
3027             break;
3028           single_shifting = 0;
3029           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3030           break;
3031
3032         case ISO_CODE_CSI:
3033           /* Control sequence introducer.  */
3034           single_shifting = 0;
3035           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3036           found |= CATEGORY_MASK_ISO_8_ELSE;
3037           goto check_extra_latin;
3038
3039         case ISO_CODE_SS2:
3040         case ISO_CODE_SS3:
3041           /* Single shift.   */
3042           if (inhibit_iso_escape_detection)
3043             break;
3044           single_shifting = 0;
3045           rejected |= CATEGORY_MASK_ISO_7BIT;
3046           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3047               & CODING_ISO_FLAG_SINGLE_SHIFT)
3048             {
3049               found |= CATEGORY_MASK_ISO_8_1;
3050               single_shifting = 1;
3051             }
3052           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3053               & CODING_ISO_FLAG_SINGLE_SHIFT)
3054             {
3055               found |= CATEGORY_MASK_ISO_8_2;
3056               single_shifting = 1;
3057             }
3058           if (single_shifting)
3059             break;
3060         check_extra_latin:
3061           if (! VECTORP (Vlatin_extra_code_table)
3062               || NILP (AREF (Vlatin_extra_code_table, c)))
3063             {
3064               rejected = CATEGORY_MASK_ISO;
3065               break;
3066             }
3067           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3068               & CODING_ISO_FLAG_LATIN_EXTRA)
3069             found |= CATEGORY_MASK_ISO_8_1;
3070           else
3071             rejected |= CATEGORY_MASK_ISO_8_1;
3072           rejected |= CATEGORY_MASK_ISO_8_2;
3073           break;
3074
3075         default:
3076           if (c < 0)
3077             continue;
3078           if (c < 0x80)
3079             {
3080               if (composition_count >= 0)
3081                 composition_count++;
3082               single_shifting = 0;
3083               break;
3084             }
3085           if (c >= 0xA0)
3086             {
3087               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088               found |= CATEGORY_MASK_ISO_8_1;
3089               /* Check the length of succeeding codes of the range
3090                  0xA0..0FF.  If the byte length is even, we include
3091                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3092                  only when we are not single shifting.  */
3093               if (! single_shifting
3094                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3095                 {
3096                   int len = 1;
3097                   while (src < src_end)
3098                     {
3099                       src_base = src;
3100                       ONE_MORE_BYTE (c);
3101                       if (c < 0xA0)
3102                         {
3103                           src = src_base;
3104                           break;
3105                         }
3106                       len++;
3107                     }
3108
3109                   if (len & 1 && src < src_end)
3110                     {
3111                       rejected |= CATEGORY_MASK_ISO_8_2;
3112                       if (composition_count >= 0)
3113                         composition_count += len;
3114                     }
3115                   else
3116                     {
3117                       found |= CATEGORY_MASK_ISO_8_2;
3118                       if (composition_count >= 0)
3119                         composition_count += len / 2;
3120                     }
3121                 }
3122               break;
3123             }
3124         }
3125     }
3126   detect_info->rejected |= CATEGORY_MASK_ISO;
3127   return 0;
3128
3129  no_more_source:
3130   detect_info->rejected |= rejected;
3131   detect_info->found |= (found & ~rejected);
3132   return 1;
3133 }
3134
3135
3136 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3137    escape sequence should be kept.  */
3138 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3139   do {                                                                  \
3140     int id, prev;                                                       \
3141                                                                         \
3142     if (final < '0' || final >= 128                                     \
3143         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3144         || !SAFE_CHARSET_P (coding, id))                                \
3145       {                                                                 \
3146         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3147         chars_96 = -1;                                                  \
3148         break;                                                          \
3149       }                                                                 \
3150     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3151     if (id == charset_jisx0201_roman)                                   \
3152       {                                                                 \
3153         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3154           id = charset_ascii;                                           \
3155       }                                                                 \
3156     else if (id == charset_jisx0208_1978)                               \
3157       {                                                                 \
3158         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3159           id = charset_jisx0208;                                        \
3160       }                                                                 \
3161     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3162     /* If there was an invalid designation to REG previously, and this  \
3163        designation is ASCII to REG, we should keep this designation     \
3164        sequence.  */                                                    \
3165     if (prev == -2 && id == charset_ascii)                              \
3166       chars_96 = -1;                                                    \
3167   } while (0)
3168
3169
3170 /* Handle these composition sequence (ALT: alternate char):
3171
3172    (1) relative composition: ESC 0 CHAR ... ESC 1
3173    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3174    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3175    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3176
3177    When the start sequence (ESC 0/2/3/4) is found, this annotation
3178    header is produced.
3179
3180         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3181
3182    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3183    produced until the end sequence (ESC 1) is found:
3184
3185    (1) CHAR ... CHAR
3186    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3187    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3188    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3189
3190    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3191    annotation header is updated as below:
3192
3193    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3194    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3195    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3196    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3197
3198    If an error is found while composing, the annotation header is
3199    changed to:
3200
3201         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3202
3203    and the sequence [ -2 DECODED-RULE ] is changed to the original
3204    byte sequence as below:
3205         o the original byte sequence is B: [ B -1 ]
3206         o the original byte sequence is B1 B2: [ B1 B2 ]
3207    and the sequence [ -1 -1 ] is changed to the original byte
3208    sequence:
3209         [ ESC '0' ]
3210 */
3211
3212 /* Decode a composition rule C1 and maybe one more byte from the
3213    source, and set RULE to the encoded composition rule.  If the rule
3214    is invalid, goto invalid_code.  */
3215
3216 #define DECODE_COMPOSITION_RULE(rule)                                   \
3217   do {                                                                  \
3218     rule = c1 - 32;                                                     \
3219     if (rule < 0)                                                       \
3220       goto invalid_code;                                                \
3221     if (rule < 81)              /* old format (before ver.21) */        \
3222       {                                                                 \
3223         int gref = (rule) / 9;                                          \
3224         int nref = (rule) % 9;                                          \
3225         if (gref == 4) gref = 10;                                       \
3226         if (nref == 4) nref = 10;                                       \
3227         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3228       }                                                                 \
3229     else                        /* new format (after ver.21) */         \
3230       {                                                                 \
3231         int b;                                                          \
3232                                                                         \
3233         ONE_MORE_BYTE (b);                                              \
3234         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3235           goto invalid_code;                                            \
3236         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3237         rule += 0x100;   /* Distinguish it from the old format.  */     \
3238       }                                                                 \
3239   } while (0)
3240
3241 #define ENCODE_COMPOSITION_RULE(rule)                           \
3242   do {                                                          \
3243     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3244                                                                 \
3245     if (rule < 0x100)           /* old format */                \
3246       {                                                         \
3247         if (gref == 10) gref = 4;                               \
3248         if (nref == 10) nref = 4;                               \
3249         charbuf[idx] = 32 + gref * 9 + nref;                    \
3250         charbuf[idx + 1] = -1;                                  \
3251         new_chars++;                                            \
3252       }                                                         \
3253     else                                /* new format */        \
3254       {                                                         \
3255         charbuf[idx] = 32 + 81 + gref;                          \
3256         charbuf[idx + 1] = 32 + nref;                           \
3257         new_chars += 2;                                         \
3258       }                                                         \
3259   } while (0)
3260
3261 /* Finish the current composition as invalid.  */
3262
3263 static int
3264 finish_composition (int *charbuf, struct composition_status *cmp_status)
3265 {
3266   int idx = - cmp_status->length;
3267   int new_chars;
3268
3269   /* Recover the original ESC sequence */
3270   charbuf[idx++] = ISO_CODE_ESC;
3271   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3272                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3273                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3274                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3275                     : '4');
3276   charbuf[idx++] = -2;
3277   charbuf[idx++] = 0;
3278   charbuf[idx++] = -1;
3279   new_chars = cmp_status->nchars;
3280   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3281     for (; idx < 0; idx++)
3282       {
3283         int elt = charbuf[idx];
3284
3285         if (elt == -2)
3286           {
3287             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3288             idx++;
3289           }
3290         else if (elt == -1)
3291           {
3292             charbuf[idx++] = ISO_CODE_ESC;
3293             charbuf[idx] = '0';
3294             new_chars += 2;
3295           }
3296       }
3297   cmp_status->state = COMPOSING_NO;
3298   return new_chars;
3299 }
3300
3301 /* If characters are under composition, finish the composition.  */
3302 #define MAYBE_FINISH_COMPOSITION()                              \
3303   do {                                                          \
3304     if (cmp_status->state != COMPOSING_NO)                      \
3305       char_offset += finish_composition (charbuf, cmp_status);  \
3306   } while (0)
3307
3308 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3309
3310    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3311    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3312    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3313    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3314
3315    Produce this annotation sequence now:
3316
3317    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3318 */
3319
3320 #define DECODE_COMPOSITION_START(c1)                                       \
3321   do {                                                                     \
3322     if (c1 == '0'                                                          \
3323         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3324              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3325             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3326                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3327       {                                                                    \
3328         *charbuf++ = -1;                                                   \
3329         *charbuf++= -1;                                                    \
3330         cmp_status->state = COMPOSING_CHAR;                                \
3331         cmp_status->length += 2;                                           \
3332       }                                                                    \
3333     else                                                                   \
3334       {                                                                    \
3335         MAYBE_FINISH_COMPOSITION ();                                       \
3336         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3337                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3338                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3339                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3340         cmp_status->state                                                  \
3341           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3342         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3343         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3344         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3345         coding->annotated = 1;                                             \
3346       }                                                                    \
3347   } while (0)
3348
3349
3350 /* Handle composition end sequence ESC 1.  */
3351
3352 #define DECODE_COMPOSITION_END()                                        \
3353   do {                                                                  \
3354     if (cmp_status->nchars == 0                                         \
3355         || ((cmp_status->state == COMPOSING_CHAR)                       \
3356             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3357       {                                                                 \
3358         MAYBE_FINISH_COMPOSITION ();                                    \
3359         goto invalid_code;                                              \
3360       }                                                                 \
3361     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3362       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3363     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3364       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3365     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3366     char_offset += cmp_status->nchars;                                  \
3367     cmp_status->state = COMPOSING_NO;                                   \
3368   } while (0)
3369
3370 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3371
3372 #define STORE_COMPOSITION_RULE(rule)    \
3373   do {                                  \
3374     *charbuf++ = -2;                    \
3375     *charbuf++ = rule;                  \
3376     cmp_status->length += 2;            \
3377     cmp_status->state--;                \
3378   } while (0)
3379
3380 /* Store a composed char or a component char C in charbuf, and update
3381    cmp_status.  */
3382
3383 #define STORE_COMPOSITION_CHAR(c)                                       \
3384   do {                                                                  \
3385     *charbuf++ = (c);                                                   \
3386     cmp_status->length++;                                               \
3387     if (cmp_status->state == COMPOSING_CHAR)                            \
3388       cmp_status->nchars++;                                             \
3389     else                                                                \
3390       cmp_status->ncomps++;                                             \
3391     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3392         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3393             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3394       cmp_status->state++;                                              \
3395   } while (0)
3396
3397
3398 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3399
3400 static void
3401 decode_coding_iso_2022 (struct coding_system *coding)
3402 {
3403   const unsigned char *src = coding->source + coding->consumed;
3404   const unsigned char *src_end = coding->source + coding->src_bytes;
3405   const unsigned char *src_base;
3406   int *charbuf = coding->charbuf + coding->charbuf_used;
3407   /* We may produce two annotations (charset and composition) in one
3408      loop and one more charset annotation at the end.  */
3409   int *charbuf_end
3410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3411   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3412   bool multibytep = coding->src_multibyte;
3413   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3414   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3415   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3416   int charset_id_2, charset_id_3;
3417   struct charset *charset;
3418   int c;
3419   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3420   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3421   ptrdiff_t char_offset = coding->produced_char;
3422   ptrdiff_t last_offset = char_offset;
3423   int last_id = charset_ascii;
3424   bool eol_dos
3425     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3426   int byte_after_cr = -1;
3427   int i;
3428
3429   setup_iso_safe_charsets (attrs);
3430   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3431
3432   if (cmp_status->state != COMPOSING_NO)
3433     {
3434       if (charbuf_end - charbuf < cmp_status->length)
3435         emacs_abort ();
3436       for (i = 0; i < cmp_status->length; i++)
3437         *charbuf++ = cmp_status->carryover[i];
3438       coding->annotated = 1;
3439     }
3440
3441   while (1)
3442     {
3443       int c1, c2, c3;
3444
3445       src_base = src;
3446       consumed_chars_base = consumed_chars;
3447
3448       if (charbuf >= charbuf_end)
3449         {
3450           if (byte_after_cr >= 0)
3451             src_base--;
3452           break;
3453         }
3454
3455       if (byte_after_cr >= 0)
3456         c1 = byte_after_cr, byte_after_cr = -1;
3457       else
3458         ONE_MORE_BYTE (c1);
3459       if (c1 < 0)
3460         goto invalid_code;
3461
3462       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3463         {
3464           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3465           char_offset++;
3466           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3467           continue;
3468         }
3469
3470       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3471         {
3472           if (c1 == ISO_CODE_ESC)
3473             {
3474               if (src + 1 >= src_end)
3475                 goto no_more_source;
3476               *charbuf++ = ISO_CODE_ESC;
3477               char_offset++;
3478               if (src[0] == '%' && src[1] == '@')
3479                 {
3480                   src += 2;
3481                   consumed_chars += 2;
3482                   char_offset += 2;
3483                   /* We are sure charbuf can contain two more chars. */
3484                   *charbuf++ = '%';
3485                   *charbuf++ = '@';
3486                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3487                 }
3488             }
3489           else
3490             {
3491               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3492               char_offset++;
3493             }
3494           continue;
3495         }
3496
3497       if ((cmp_status->state == COMPOSING_RULE
3498            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3499           && c1 != ISO_CODE_ESC)
3500         {
3501           int rule;
3502
3503           DECODE_COMPOSITION_RULE (rule);
3504           STORE_COMPOSITION_RULE (rule);
3505           continue;
3506         }
3507
3508       /* We produce at most one character.  */
3509       switch (iso_code_class [c1])
3510         {
3511         case ISO_0x20_or_0x7F:
3512           if (charset_id_0 < 0
3513               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3514             /* This is SPACE or DEL.  */
3515             charset = CHARSET_FROM_ID (charset_ascii);
3516           else
3517             charset = CHARSET_FROM_ID (charset_id_0);
3518           break;
3519
3520         case ISO_graphic_plane_0:
3521           if (charset_id_0 < 0)
3522             charset = CHARSET_FROM_ID (charset_ascii);
3523           else
3524             charset = CHARSET_FROM_ID (charset_id_0);
3525           break;
3526
3527         case ISO_0xA0_or_0xFF:
3528           if (charset_id_1 < 0
3529               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3530               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3531             goto invalid_code;
3532           /* This is a graphic character, we fall down ... */
3533
3534         case ISO_graphic_plane_1:
3535           if (charset_id_1 < 0)
3536             goto invalid_code;
3537           charset = CHARSET_FROM_ID (charset_id_1);
3538           break;
3539
3540         case ISO_control_0:
3541           if (eol_dos && c1 == '\r')
3542             ONE_MORE_BYTE (byte_after_cr);
3543           MAYBE_FINISH_COMPOSITION ();
3544           charset = CHARSET_FROM_ID (charset_ascii);
3545           break;
3546
3547         case ISO_control_1:
3548           goto invalid_code;
3549
3550         case ISO_shift_out:
3551           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3552               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3553             goto invalid_code;
3554           CODING_ISO_INVOCATION (coding, 0) = 1;
3555           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3556           continue;
3557
3558         case ISO_shift_in:
3559           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3560             goto invalid_code;
3561           CODING_ISO_INVOCATION (coding, 0) = 0;
3562           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3563           continue;
3564
3565         case ISO_single_shift_2_7:
3566           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3567             goto invalid_code;
3568         case ISO_single_shift_2:
3569           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3570             goto invalid_code;
3571           /* SS2 is handled as an escape sequence of ESC 'N' */
3572           c1 = 'N';
3573           goto label_escape_sequence;
3574
3575         case ISO_single_shift_3:
3576           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577             goto invalid_code;
3578           /* SS2 is handled as an escape sequence of ESC 'O' */
3579           c1 = 'O';
3580           goto label_escape_sequence;
3581
3582         case ISO_control_sequence_introducer:
3583           /* CSI is handled as an escape sequence of ESC '[' ...  */
3584           c1 = '[';
3585           goto label_escape_sequence;
3586
3587         case ISO_escape:
3588           ONE_MORE_BYTE (c1);
3589         label_escape_sequence:
3590           /* Escape sequences handled here are invocation,
3591              designation, direction specification, and character
3592              composition specification.  */
3593           switch (c1)
3594             {
3595             case '&':           /* revision of following character set */
3596               ONE_MORE_BYTE (c1);
3597               if (!(c1 >= '@' && c1 <= '~'))
3598                 goto invalid_code;
3599               ONE_MORE_BYTE (c1);
3600               if (c1 != ISO_CODE_ESC)
3601                 goto invalid_code;
3602               ONE_MORE_BYTE (c1);
3603               goto label_escape_sequence;
3604
3605             case '$':           /* designation of 2-byte character set */
3606               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3607                 goto invalid_code;
3608               {
3609                 int reg, chars96;
3610
3611                 ONE_MORE_BYTE (c1);
3612                 if (c1 >= '@' && c1 <= 'B')
3613                   {     /* designation of JISX0208.1978, GB2312.1980,
3614                            or JISX0208.1980 */
3615                     reg = 0, chars96 = 0;
3616                   }
3617                 else if (c1 >= 0x28 && c1 <= 0x2B)
3618                   { /* designation of DIMENSION2_CHARS94 character set */
3619                     reg = c1 - 0x28, chars96 = 0;
3620                     ONE_MORE_BYTE (c1);
3621                   }
3622                 else if (c1 >= 0x2C && c1 <= 0x2F)
3623                   { /* designation of DIMENSION2_CHARS96 character set */
3624                     reg = c1 - 0x2C, chars96 = 1;
3625                     ONE_MORE_BYTE (c1);
3626                   }
3627                 else
3628                   goto invalid_code;
3629                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3630                 /* We must update these variables now.  */
3631                 if (reg == 0)
3632                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633                 else if (reg == 1)
3634                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3635                 if (chars96 < 0)
3636                   goto invalid_code;
3637               }
3638               continue;
3639
3640             case 'n':           /* invocation of locking-shift-2 */
3641               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3642                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3643                 goto invalid_code;
3644               CODING_ISO_INVOCATION (coding, 0) = 2;
3645               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3646               continue;
3647
3648             case 'o':           /* invocation of locking-shift-3 */
3649               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3650                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3651                 goto invalid_code;
3652               CODING_ISO_INVOCATION (coding, 0) = 3;
3653               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3654               continue;
3655
3656             case 'N':           /* invocation of single-shift-2 */
3657               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3658                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3659                 goto invalid_code;
3660               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3661               if (charset_id_2 < 0)
3662                 charset = CHARSET_FROM_ID (charset_ascii);
3663               else
3664                 charset = CHARSET_FROM_ID (charset_id_2);
3665               ONE_MORE_BYTE (c1);
3666               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3667                 goto invalid_code;
3668               break;
3669
3670             case 'O':           /* invocation of single-shift-3 */
3671               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3672                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3673                 goto invalid_code;
3674               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3675               if (charset_id_3 < 0)
3676                 charset = CHARSET_FROM_ID (charset_ascii);
3677               else
3678                 charset = CHARSET_FROM_ID (charset_id_3);
3679               ONE_MORE_BYTE (c1);
3680               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3681                 goto invalid_code;
3682               break;
3683
3684             case '0': case '2': case '3': case '4': /* start composition */
3685               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3686                 goto invalid_code;
3687               if (last_id != charset_ascii)
3688                 {
3689                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3690                   last_id = charset_ascii;
3691                   last_offset = char_offset;
3692                 }
3693               DECODE_COMPOSITION_START (c1);
3694               continue;
3695
3696             case '1':           /* end composition */
3697               if (cmp_status->state == COMPOSING_NO)
3698                 goto invalid_code;
3699               DECODE_COMPOSITION_END ();
3700               continue;
3701
3702             case '[':           /* specification of direction */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3704                 goto invalid_code;
3705               /* For the moment, nested direction is not supported.
3706                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3707                  left-to-right, and nonzero means right-to-left.  */
3708               ONE_MORE_BYTE (c1);
3709               switch (c1)
3710                 {
3711                 case ']':       /* end of the current direction */
3712                   coding->mode &= ~CODING_MODE_DIRECTION;
3713
3714                 case '0':       /* end of the current direction */
3715                 case '1':       /* start of left-to-right direction */
3716                   ONE_MORE_BYTE (c1);
3717                   if (c1 == ']')
3718                     coding->mode &= ~CODING_MODE_DIRECTION;
3719                   else
3720                     goto invalid_code;
3721                   break;
3722
3723                 case '2':       /* start of right-to-left direction */
3724                   ONE_MORE_BYTE (c1);
3725                   if (c1 == ']')
3726                     coding->mode |= CODING_MODE_DIRECTION;
3727                   else
3728                     goto invalid_code;
3729                   break;
3730
3731                 default:
3732                   goto invalid_code;
3733                 }
3734               continue;
3735
3736             case '%':
3737               ONE_MORE_BYTE (c1);
3738               if (c1 == '/')
3739                 {
3740                   /* CTEXT extended segment:
3741                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3742                      We keep these bytes as is for the moment.
3743                      They may be decoded by post-read-conversion.  */
3744                   int dim, M, L;
3745                   int size;
3746
3747                   ONE_MORE_BYTE (dim);
3748                   if (dim < '0' || dim > '4')
3749                     goto invalid_code;
3750                   ONE_MORE_BYTE (M);
3751                   if (M < 128)
3752                     goto invalid_code;
3753                   ONE_MORE_BYTE (L);
3754                   if (L < 128)
3755                     goto invalid_code;
3756                   size = ((M - 128) * 128) + (L - 128);
3757                   if (charbuf + 6 > charbuf_end)
3758                     goto break_loop;
3759                   *charbuf++ = ISO_CODE_ESC;
3760                   *charbuf++ = '%';
3761                   *charbuf++ = '/';
3762                   *charbuf++ = dim;
3763                   *charbuf++ = BYTE8_TO_CHAR (M);
3764                   *charbuf++ = BYTE8_TO_CHAR (L);
3765                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3766                 }
3767               else if (c1 == 'G')
3768                 {
3769                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3770                      ESC % G --UTF-8-BYTES-- ESC % @
3771                      We keep these bytes as is for the moment.
3772                      They may be decoded by post-read-conversion.  */
3773                   if (charbuf + 3 > charbuf_end)
3774                     goto break_loop;
3775                   *charbuf++ = ISO_CODE_ESC;
3776                   *charbuf++ = '%';
3777                   *charbuf++ = 'G';
3778                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3779                 }
3780               else
3781                 goto invalid_code;
3782               continue;
3783               break;
3784
3785             default:
3786               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3787                 goto invalid_code;
3788               {
3789                 int reg, chars96;
3790
3791                 if (c1 >= 0x28 && c1 <= 0x2B)
3792                   { /* designation of DIMENSION1_CHARS94 character set */
3793                     reg = c1 - 0x28, chars96 = 0;
3794                     ONE_MORE_BYTE (c1);
3795                   }
3796                 else if (c1 >= 0x2C && c1 <= 0x2F)
3797                   { /* designation of DIMENSION1_CHARS96 character set */
3798                     reg = c1 - 0x2C, chars96 = 1;
3799                     ONE_MORE_BYTE (c1);
3800                   }
3801                 else
3802                   goto invalid_code;
3803                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3804                 /* We must update these variables now.  */
3805                 if (reg == 0)
3806                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3807                 else if (reg == 1)
3808                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3809                 if (chars96 < 0)
3810                   goto invalid_code;
3811               }
3812               continue;
3813             }
3814           break;
3815
3816         default:
3817           emacs_abort ();
3818         }
3819
3820       if (cmp_status->state == COMPOSING_NO
3821           && charset->id != charset_ascii
3822           && last_id != charset->id)
3823         {
3824           if (last_id != charset_ascii)
3825             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3826           last_id = charset->id;
3827           last_offset = char_offset;
3828         }
3829
3830       /* Now we know CHARSET and 1st position code C1 of a character.
3831          Produce a decoded character while getting 2nd and 3rd
3832          position codes C2, C3 if necessary.  */
3833       if (CHARSET_DIMENSION (charset) > 1)
3834         {
3835           ONE_MORE_BYTE (c2);
3836           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3837               || ((c1 & 0x80) != (c2 & 0x80)))
3838             /* C2 is not in a valid range.  */
3839             goto invalid_code;
3840           if (CHARSET_DIMENSION (charset) == 2)
3841             c1 = (c1 << 8) | c2;
3842           else
3843             {
3844               ONE_MORE_BYTE (c3);
3845               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3846                   || ((c1 & 0x80) != (c3 & 0x80)))
3847                 /* C3 is not in a valid range.  */
3848                 goto invalid_code;
3849               c1 = (c1 << 16) | (c2 << 8) | c2;
3850             }
3851         }
3852       c1 &= 0x7F7F7F;
3853       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3854       if (c < 0)
3855         {
3856           MAYBE_FINISH_COMPOSITION ();
3857           for (; src_base < src; src_base++, char_offset++)
3858             {
3859               if (ASCII_BYTE_P (*src_base))
3860                 *charbuf++ = *src_base;
3861               else
3862                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3863             }
3864         }
3865       else if (cmp_status->state == COMPOSING_NO)
3866         {
3867           *charbuf++ = c;
3868           char_offset++;
3869         }
3870       else if ((cmp_status->state == COMPOSING_CHAR
3871                 ? cmp_status->nchars
3872                 : cmp_status->ncomps)
3873                >= MAX_COMPOSITION_COMPONENTS)
3874         {
3875           /* Too long composition.  */
3876           MAYBE_FINISH_COMPOSITION ();
3877           *charbuf++ = c;
3878           char_offset++;
3879         }
3880       else
3881         STORE_COMPOSITION_CHAR (c);
3882       continue;
3883
3884     invalid_code:
3885       MAYBE_FINISH_COMPOSITION ();
3886       src = src_base;
3887       consumed_chars = consumed_chars_base;
3888       ONE_MORE_BYTE (c);
3889       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3890       char_offset++;
3891       coding->errors++;
3892       continue;
3893
3894     break_loop:
3895       break;
3896     }
3897
3898  no_more_source:
3899   if (cmp_status->state != COMPOSING_NO)
3900     {
3901       if (coding->mode & CODING_MODE_LAST_BLOCK)
3902         MAYBE_FINISH_COMPOSITION ();
3903       else
3904         {
3905           charbuf -= cmp_status->length;
3906           for (i = 0; i < cmp_status->length; i++)
3907             cmp_status->carryover[i] = charbuf[i];
3908         }
3909     }
3910   else if (last_id != charset_ascii)
3911     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912   coding->consumed_char += consumed_chars_base;
3913   coding->consumed = src_base - coding->source;
3914   coding->charbuf_used = charbuf - coding->charbuf;
3915 }
3916
3917
3918 /* ISO2022 encoding stuff.  */
3919
3920 /*
3921    It is not enough to say just "ISO2022" on encoding, we have to
3922    specify more details.  In Emacs, each coding system of ISO2022
3923    variant has the following specifications:
3924         1. Initial designation to G0 thru G3.
3925         2. Allows short-form designation?
3926         3. ASCII should be designated to G0 before control characters?
3927         4. ASCII should be designated to G0 at end of line?
3928         5. 7-bit environment or 8-bit environment?
3929         6. Use locking-shift?
3930         7. Use Single-shift?
3931    And the following two are only for Japanese:
3932         8. Use ASCII in place of JIS0201-1976-Roman?
3933         9. Use JISX0208-1983 in place of JISX0208-1978?
3934    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3935    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3936    details.
3937 */
3938
3939 /* Produce codes (escape sequence) for designating CHARSET to graphic
3940    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3941    '@', 'A', or 'B' and the coding system CODING allows, produce
3942    designation sequence of short-form.  */
3943
3944 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3945   do {                                                                  \
3946     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3947     const char *intermediate_char_94 = "()*+";                          \
3948     const char *intermediate_char_96 = ",-./";                          \
3949     int revision = -1;                                                  \
3950                                                                         \
3951     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3952       revision = CHARSET_ISO_REVISION (charset);                        \
3953                                                                         \
3954     if (revision >= 0)                                                  \
3955       {                                                                 \
3956         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3957         EMIT_ONE_BYTE ('@' + revision);                                 \
3958       }                                                                 \
3959     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3960     if (CHARSET_DIMENSION (charset) == 1)                               \
3961       {                                                                 \
3962         int b;                                                          \
3963         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3964           b = intermediate_char_94[reg];                                \
3965         else                                                            \
3966           b = intermediate_char_96[reg];                                \
3967         EMIT_ONE_ASCII_BYTE (b);                                        \
3968       }                                                                 \
3969     else                                                                \
3970       {                                                                 \
3971         EMIT_ONE_ASCII_BYTE ('$');                                      \
3972         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3973           {                                                             \
3974             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3975                 || reg != 0                                             \
3976                 || final_char < '@' || final_char > 'B')                \
3977               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3978           }                                                             \
3979         else                                                            \
3980           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3981       }                                                                 \
3982     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3983                                                                         \
3984     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3985   } while (0)
3986
3987
3988 /* The following two macros produce codes (control character or escape
3989    sequence) for ISO2022 single-shift functions (single-shift-2 and
3990    single-shift-3).  */
3991
3992 #define ENCODE_SINGLE_SHIFT_2                                           \
3993   do {                                                                  \
3994     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3995       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3996     else                                                                \
3997       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3998     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3999   } while (0)
4000
4001
4002 #define ENCODE_SINGLE_SHIFT_3                                           \
4003   do {                                                                  \
4004     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4005       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4006     else                                                                \
4007       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4008     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4009   } while (0)
4010
4011
4012 /* The following four macros produce codes (control character or
4013    escape sequence) for ISO2022 locking-shift functions (shift-in,
4014    shift-out, locking-shift-2, and locking-shift-3).  */
4015
4016 #define ENCODE_SHIFT_IN                                 \
4017   do {                                                  \
4018     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4019     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4020   } while (0)
4021
4022
4023 #define ENCODE_SHIFT_OUT                                \
4024   do {                                                  \
4025     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4026     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4027   } while (0)
4028
4029
4030 #define ENCODE_LOCKING_SHIFT_2                          \
4031   do {                                                  \
4032     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4033     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4034   } while (0)
4035
4036
4037 #define ENCODE_LOCKING_SHIFT_3                          \
4038   do {                                                  \
4039     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4040     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4041   } while (0)
4042
4043
4044 /* Produce codes for a DIMENSION1 character whose character set is
4045    CHARSET and whose position-code is C1.  Designation and invocation
4046    sequences are also produced in advance if necessary.  */
4047
4048 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4049   do {                                                                  \
4050     int id = CHARSET_ID (charset);                                      \
4051                                                                         \
4052     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4053         && id == charset_ascii)                                         \
4054       {                                                                 \
4055         id = charset_jisx0201_roman;                                    \
4056         charset = CHARSET_FROM_ID (id);                                 \
4057       }                                                                 \
4058                                                                         \
4059     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4060       {                                                                 \
4061         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4062           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4063         else                                                            \
4064           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4065         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4066         break;                                                          \
4067       }                                                                 \
4068     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4069       {                                                                 \
4070         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4071         break;                                                          \
4072       }                                                                 \
4073     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4074       {                                                                 \
4075         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4076         break;                                                          \
4077       }                                                                 \
4078     else                                                                \
4079       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4080          must invoke it, or, at first, designate it to some graphic     \
4081          register.  Then repeat the loop to actually produce the        \
4082          character.  */                                                 \
4083       dst = encode_invocation_designation (charset, coding, dst,        \
4084                                            &produced_chars);            \
4085   } while (1)
4086
4087
4088 /* Produce codes for a DIMENSION2 character whose character set is
4089    CHARSET and whose position-codes are C1 and C2.  Designation and
4090    invocation codes are also produced in advance if necessary.  */
4091
4092 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4093   do {                                                                  \
4094     int id = CHARSET_ID (charset);                                      \
4095                                                                         \
4096     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4097         && id == charset_jisx0208)                                      \
4098       {                                                                 \
4099         id = charset_jisx0208_1978;                                     \
4100         charset = CHARSET_FROM_ID (id);                                 \
4101       }                                                                 \
4102                                                                         \
4103     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4104       {                                                                 \
4105         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4106           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4107         else                                                            \
4108           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4109         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4110         break;                                                          \
4111       }                                                                 \
4112     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4113       {                                                                 \
4114         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4115         break;                                                          \
4116       }                                                                 \
4117     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4118       {                                                                 \
4119         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4120         break;                                                          \
4121       }                                                                 \
4122     else                                                                \
4123       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4124          must invoke it, or, at first, designate it to some graphic     \
4125          register.  Then repeat the loop to actually produce the        \
4126          character.  */                                                 \
4127       dst = encode_invocation_designation (charset, coding, dst,        \
4128                                            &produced_chars);            \
4129   } while (1)
4130
4131
4132 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4133   do {                                                                     \
4134     unsigned code;                                                         \
4135     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4136                                                                            \
4137     if (CHARSET_DIMENSION (charset) == 1)                                  \
4138       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4139     else                                                                   \
4140       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4141   } while (0)
4142
4143
4144 /* Produce designation and invocation codes at a place pointed by DST
4145    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4146    Return new DST.  */
4147
4148 static unsigned char *
4149 encode_invocation_designation (struct charset *charset,
4150                                struct coding_system *coding,
4151                                unsigned char *dst, ptrdiff_t *p_nchars)
4152 {
4153   bool multibytep = coding->dst_multibyte;
4154   ptrdiff_t produced_chars = *p_nchars;
4155   int reg;                      /* graphic register number */
4156   int id = CHARSET_ID (charset);
4157
4158   /* At first, check designations.  */
4159   for (reg = 0; reg < 4; reg++)
4160     if (id == CODING_ISO_DESIGNATION (coding, reg))
4161       break;
4162
4163   if (reg >= 4)
4164     {
4165       /* CHARSET is not yet designated to any graphic registers.  */
4166       /* At first check the requested designation.  */
4167       reg = CODING_ISO_REQUEST (coding, id);
4168       if (reg < 0)
4169         /* Since CHARSET requests no special designation, designate it
4170            to graphic register 0.  */
4171         reg = 0;
4172
4173       ENCODE_DESIGNATION (charset, reg, coding);
4174     }
4175
4176   if (CODING_ISO_INVOCATION (coding, 0) != reg
4177       && CODING_ISO_INVOCATION (coding, 1) != reg)
4178     {
4179       /* Since the graphic register REG is not invoked to any graphic
4180          planes, invoke it to graphic plane 0.  */
4181       switch (reg)
4182         {
4183         case 0:                 /* graphic register 0 */
4184           ENCODE_SHIFT_IN;
4185           break;
4186
4187         case 1:                 /* graphic register 1 */
4188           ENCODE_SHIFT_OUT;
4189           break;
4190
4191         case 2:                 /* graphic register 2 */
4192           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4193             ENCODE_SINGLE_SHIFT_2;
4194           else
4195             ENCODE_LOCKING_SHIFT_2;
4196           break;
4197
4198         case 3:                 /* graphic register 3 */
4199           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4200             ENCODE_SINGLE_SHIFT_3;
4201           else
4202             ENCODE_LOCKING_SHIFT_3;
4203           break;
4204         }
4205     }
4206
4207   *p_nchars = produced_chars;
4208   return dst;
4209 }
4210
4211
4212 /* Produce codes for designation and invocation to reset the graphic
4213    planes and registers to initial state.  */
4214 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4215   do {                                                                  \
4216     int reg;                                                            \
4217     struct charset *charset;                                            \
4218                                                                         \
4219     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4220       ENCODE_SHIFT_IN;                                                  \
4221     for (reg = 0; reg < 4; reg++)                                       \
4222       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4223           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4224               != CODING_ISO_INITIAL (coding, reg)))                     \
4225         {                                                               \
4226           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4227           ENCODE_DESIGNATION (charset, reg, coding);                    \
4228         }                                                               \
4229   } while (0)
4230
4231
4232 /* Produce designation sequences of charsets in the line started from
4233    CHARBUF to a place pointed by DST, and return the number of
4234    produced bytes.  DST should not directly point a buffer text area
4235    which may be relocated by char_charset call.
4236
4237    If the current block ends before any end-of-line, we may fail to
4238    find all the necessary designations.  */
4239
4240 static ptrdiff_t
4241 encode_designation_at_bol (struct coding_system *coding,
4242                            int *charbuf, int *charbuf_end,
4243                            unsigned char *dst)
4244 {
4245   unsigned char *orig = dst;
4246   struct charset *charset;
4247   /* Table of charsets to be designated to each graphic register.  */
4248   int r[4];
4249   int c, found = 0, reg;
4250   ptrdiff_t produced_chars = 0;
4251   bool multibytep = coding->dst_multibyte;
4252   Lisp_Object attrs;
4253   Lisp_Object charset_list;
4254
4255   attrs = CODING_ID_ATTRS (coding->id);
4256   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4257   if (EQ (charset_list, Qiso_2022))
4258     charset_list = Viso_2022_charset_list;
4259
4260   for (reg = 0; reg < 4; reg++)
4261     r[reg] = -1;
4262
4263   while (charbuf < charbuf_end && found < 4)
4264     {
4265       int id;
4266
4267       c = *charbuf++;
4268       if (c == '\n')
4269         break;
4270       charset = char_charset (c, charset_list, NULL);
4271       id = CHARSET_ID (charset);
4272       reg = CODING_ISO_REQUEST (coding, id);
4273       if (reg >= 0 && r[reg] < 0)
4274         {
4275           found++;
4276           r[reg] = id;
4277         }
4278     }
4279
4280   if (found)
4281     {
4282       for (reg = 0; reg < 4; reg++)
4283         if (r[reg] >= 0
4284             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4285           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4286     }
4287
4288   return dst - orig;
4289 }
4290
4291 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4292
4293 static bool
4294 encode_coding_iso_2022 (struct coding_system *coding)
4295 {
4296   bool multibytep = coding->dst_multibyte;
4297   int *charbuf = coding->charbuf;
4298   int *charbuf_end = charbuf + coding->charbuf_used;
4299   unsigned char *dst = coding->destination + coding->produced;
4300   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4301   int safe_room = 16;
4302   bool bol_designation
4303     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4304        && CODING_ISO_BOL (coding));
4305   ptrdiff_t produced_chars = 0;
4306   Lisp_Object attrs, eol_type, charset_list;
4307   bool ascii_compatible;
4308   int c;
4309   int preferred_charset_id = -1;
4310
4311   CODING_GET_INFO (coding, attrs, charset_list);
4312   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4313   if (VECTORP (eol_type))
4314     eol_type = Qunix;
4315
4316   setup_iso_safe_charsets (attrs);
4317   /* Charset list may have been changed.  */
4318   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4319   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4320
4321   ascii_compatible
4322     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4323        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4324                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4325
4326   while (charbuf < charbuf_end)
4327     {
4328       ASSURE_DESTINATION (safe_room);
4329
4330       if (bol_designation)
4331         {
4332           /* We have to produce designation sequences if any now.  */
4333           unsigned char desig_buf[16];
4334           int nbytes;
4335           ptrdiff_t offset;
4336
4337           charset_map_loaded = 0;
4338           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4339                                               desig_buf);
4340           if (charset_map_loaded
4341               && (offset = coding_change_destination (coding)))
4342             {
4343               dst += offset;
4344               dst_end += offset;
4345             }
4346           memcpy (dst, desig_buf, nbytes);
4347           dst += nbytes;
4348           /* We are sure that designation sequences are all ASCII bytes.  */
4349           produced_chars += nbytes;
4350           bol_designation = 0;
4351           ASSURE_DESTINATION (safe_room);
4352         }
4353
4354       c = *charbuf++;
4355
4356       if (c < 0)
4357         {
4358           /* Handle an annotation.  */
4359           switch (*charbuf)
4360             {
4361             case CODING_ANNOTATE_COMPOSITION_MASK:
4362               /* Not yet implemented.  */
4363               break;
4364             case CODING_ANNOTATE_CHARSET_MASK:
4365               preferred_charset_id = charbuf[2];
4366               if (preferred_charset_id >= 0
4367                   && NILP (Fmemq (make_number (preferred_charset_id),
4368                                   charset_list)))
4369                 preferred_charset_id = -1;
4370               break;
4371             default:
4372               emacs_abort ();
4373             }
4374           charbuf += -c - 1;
4375           continue;
4376         }
4377
4378       /* Now encode the character C.  */
4379       if (c < 0x20 || c == 0x7F)
4380         {
4381           if (c == '\n'
4382               || (c == '\r' && EQ (eol_type, Qmac)))
4383             {
4384               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4385                 ENCODE_RESET_PLANE_AND_REGISTER ();
4386               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4387                 {
4388                   int i;
4389
4390                   for (i = 0; i < 4; i++)
4391                     CODING_ISO_DESIGNATION (coding, i)
4392                       = CODING_ISO_INITIAL (coding, i);
4393                 }
4394               bol_designation = ((CODING_ISO_FLAGS (coding)
4395                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4396                                  != 0);
4397             }
4398           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4399             ENCODE_RESET_PLANE_AND_REGISTER ();
4400           EMIT_ONE_ASCII_BYTE (c);
4401         }
4402       else if (ASCII_CHAR_P (c))
4403         {
4404           if (ascii_compatible)
4405             EMIT_ONE_ASCII_BYTE (c);
4406           else
4407             {
4408               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4409               ENCODE_ISO_CHARACTER (charset, c);
4410             }
4411         }
4412       else if (CHAR_BYTE8_P (c))
4413         {
4414           c = CHAR_TO_BYTE8 (c);
4415           EMIT_ONE_BYTE (c);
4416         }
4417       else
4418         {
4419           struct charset *charset;
4420
4421           if (preferred_charset_id >= 0)
4422             {
4423               bool result;
4424
4425               charset = CHARSET_FROM_ID (preferred_charset_id);
4426               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4427               if (! result)
4428                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4429                                      NULL, charset);
4430             }
4431           else
4432             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4433                                  NULL, charset);
4434           if (!charset)
4435             {
4436               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4437                 {
4438                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4439                   charset = CHARSET_FROM_ID (charset_ascii);
4440                 }
4441               else
4442                 {
4443                   c = coding->default_char;
4444                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4445                                        charset_list, NULL, charset);
4446                 }
4447             }
4448           ENCODE_ISO_CHARACTER (charset, c);
4449         }
4450     }
4451
4452   if (coding->mode & CODING_MODE_LAST_BLOCK
4453       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4454     {
4455       ASSURE_DESTINATION (safe_room);
4456       ENCODE_RESET_PLANE_AND_REGISTER ();
4457     }
4458   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4459   CODING_ISO_BOL (coding) = bol_designation;
4460   coding->produced_char += produced_chars;
4461   coding->produced = dst - coding->destination;
4462   return 0;
4463 }
4464
4465 \f
4466 /*** 8,9. SJIS and BIG5 handlers ***/
4467
4468 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4469    quite widely.  So, for the moment, Emacs supports them in the bare
4470    C code.  But, in the future, they may be supported only by CCL.  */
4471
4472 /* SJIS is a coding system encoding three character sets: ASCII, right
4473    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4474    as is.  A character of charset katakana-jisx0201 is encoded by
4475    "position-code + 0x80".  A character of charset japanese-jisx0208
4476    is encoded in 2-byte but two position-codes are divided and shifted
4477    so that it fit in the range below.
4478
4479    --- CODE RANGE of SJIS ---
4480    (character set)      (range)
4481    ASCII                0x00 .. 0x7F
4482    KATAKANA-JISX0201    0xA0 .. 0xDF
4483    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4484             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4485    -------------------------------
4486
4487 */
4488
4489 /* BIG5 is a coding system encoding two character sets: ASCII and
4490    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4491    character set and is encoded in two-byte.
4492
4493    --- CODE RANGE of BIG5 ---
4494    (character set)      (range)
4495    ASCII                0x00 .. 0x7F
4496    Big5 (1st byte)      0xA1 .. 0xFE
4497         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4498    --------------------------
4499
4500   */
4501
4502 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4503    Return true if a text is encoded in SJIS.  */
4504
4505 static bool
4506 detect_coding_sjis (struct coding_system *coding,
4507                     struct coding_detection_info *detect_info)
4508 {
4509   const unsigned char *src = coding->source, *src_base;
4510   const unsigned char *src_end = coding->source + coding->src_bytes;
4511   bool multibytep = coding->src_multibyte;
4512   ptrdiff_t consumed_chars = 0;
4513   int found = 0;
4514   int c;
4515   Lisp_Object attrs, charset_list;
4516   int max_first_byte_of_2_byte_code;
4517
4518   CODING_GET_INFO (coding, attrs, charset_list);
4519   max_first_byte_of_2_byte_code
4520     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4521
4522   detect_info->checked |= CATEGORY_MASK_SJIS;
4523   /* A coding system of this category is always ASCII compatible.  */
4524   src += coding->head_ascii;
4525
4526   while (1)
4527     {
4528       src_base = src;
4529       ONE_MORE_BYTE (c);
4530       if (c < 0x80)
4531         continue;
4532       if ((c >= 0x81 && c <= 0x9F)
4533           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4534         {
4535           ONE_MORE_BYTE (c);
4536           if (c < 0x40 || c == 0x7F || c > 0xFC)
4537             break;
4538           found = CATEGORY_MASK_SJIS;
4539         }
4540       else if (c >= 0xA0 && c < 0xE0)
4541         found = CATEGORY_MASK_SJIS;
4542       else
4543         break;
4544     }
4545   detect_info->rejected |= CATEGORY_MASK_SJIS;
4546   return 0;
4547
4548  no_more_source:
4549   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4550     {
4551       detect_info->rejected |= CATEGORY_MASK_SJIS;
4552       return 0;
4553     }
4554   detect_info->found |= found;
4555   return 1;
4556 }
4557
4558 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4559    Return true if a text is encoded in BIG5.  */
4560
4561 static bool
4562 detect_coding_big5 (struct coding_system *coding,
4563                     struct coding_detection_info *detect_info)
4564 {
4565   const unsigned char *src = coding->source, *src_base;
4566   const unsigned char *src_end = coding->source + coding->src_bytes;
4567   bool multibytep = coding->src_multibyte;
4568   ptrdiff_t consumed_chars = 0;
4569   int found = 0;
4570   int c;
4571
4572   detect_info->checked |= CATEGORY_MASK_BIG5;
4573   /* A coding system of this category is always ASCII compatible.  */
4574   src += coding->head_ascii;
4575
4576   while (1)
4577     {
4578       src_base = src;
4579       ONE_MORE_BYTE (c);
4580       if (c < 0x80)
4581         continue;
4582       if (c >= 0xA1)
4583         {
4584           ONE_MORE_BYTE (c);
4585           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4586             return 0;
4587           found = CATEGORY_MASK_BIG5;
4588         }
4589       else
4590         break;
4591     }
4592   detect_info->rejected |= CATEGORY_MASK_BIG5;
4593   return 0;
4594
4595  no_more_source:
4596   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4597     {
4598       detect_info->rejected |= CATEGORY_MASK_BIG5;
4599       return 0;
4600     }
4601   detect_info->found |= found;
4602   return 1;
4603 }
4604
4605 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4606
4607 static void
4608 decode_coding_sjis (struct coding_system *coding)
4609 {
4610   const unsigned char *src = coding->source + coding->consumed;
4611   const unsigned char *src_end = coding->source + coding->src_bytes;
4612   const unsigned char *src_base;
4613   int *charbuf = coding->charbuf + coding->charbuf_used;
4614   /* We may produce one charset annotation in one loop and one more at
4615      the end.  */
4616   int *charbuf_end
4617     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4618   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4619   bool multibytep = coding->src_multibyte;
4620   struct charset *charset_roman, *charset_kanji, *charset_kana;
4621   struct charset *charset_kanji2;
4622   Lisp_Object attrs, charset_list, val;
4623   ptrdiff_t char_offset = coding->produced_char;
4624   ptrdiff_t last_offset = char_offset;
4625   int last_id = charset_ascii;
4626   bool eol_dos
4627     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4628   int byte_after_cr = -1;
4629
4630   CODING_GET_INFO (coding, attrs, charset_list);
4631
4632   val = charset_list;
4633   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4635   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4637
4638   while (1)
4639     {
4640       int c, c1;
4641       struct charset *charset;
4642
4643       src_base = src;
4644       consumed_chars_base = consumed_chars;
4645
4646       if (charbuf >= charbuf_end)
4647         {
4648           if (byte_after_cr >= 0)
4649             src_base--;
4650           break;
4651         }
4652
4653       if (byte_after_cr >= 0)
4654         c = byte_after_cr, byte_after_cr = -1;
4655       else
4656         ONE_MORE_BYTE (c);
4657       if (c < 0)
4658         goto invalid_code;
4659       if (c < 0x80)
4660         {
4661           if (eol_dos && c == '\r')
4662             ONE_MORE_BYTE (byte_after_cr);
4663           charset = charset_roman;
4664         }
4665       else if (c == 0x80 || c == 0xA0)
4666         goto invalid_code;
4667       else if (c >= 0xA1 && c <= 0xDF)
4668         {
4669           /* SJIS -> JISX0201-Kana */
4670           c &= 0x7F;
4671           charset = charset_kana;
4672         }
4673       else if (c <= 0xEF)
4674         {
4675           /* SJIS -> JISX0208 */
4676           ONE_MORE_BYTE (c1);
4677           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4678             goto invalid_code;
4679           c = (c << 8) | c1;
4680           SJIS_TO_JIS (c);
4681           charset = charset_kanji;
4682         }
4683       else if (c <= 0xFC && charset_kanji2)
4684         {
4685           /* SJIS -> JISX0213-2 */
4686           ONE_MORE_BYTE (c1);
4687           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4688             goto invalid_code;
4689           c = (c << 8) | c1;
4690           SJIS_TO_JIS2 (c);
4691           charset = charset_kanji2;
4692         }
4693       else
4694         goto invalid_code;
4695       if (charset->id != charset_ascii
4696           && last_id != charset->id)
4697         {
4698           if (last_id != charset_ascii)
4699             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4700           last_id = charset->id;
4701           last_offset = char_offset;
4702         }
4703       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4704       *charbuf++ = c;
4705       char_offset++;
4706       continue;
4707
4708     invalid_code:
4709       src = src_base;
4710       consumed_chars = consumed_chars_base;
4711       ONE_MORE_BYTE (c);
4712       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4713       char_offset++;
4714       coding->errors++;
4715     }
4716
4717  no_more_source:
4718   if (last_id != charset_ascii)
4719     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4720   coding->consumed_char += consumed_chars_base;
4721   coding->consumed = src_base - coding->source;
4722   coding->charbuf_used = charbuf - coding->charbuf;
4723 }
4724
4725 static void
4726 decode_coding_big5 (struct coding_system *coding)
4727 {
4728   const unsigned char *src = coding->source + coding->consumed;
4729   const unsigned char *src_end = coding->source + coding->src_bytes;
4730   const unsigned char *src_base;
4731   int *charbuf = coding->charbuf + coding->charbuf_used;
4732   /* We may produce one charset annotation in one loop and one more at
4733      the end.  */
4734   int *charbuf_end
4735     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4736   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4737   bool multibytep = coding->src_multibyte;
4738   struct charset *charset_roman, *charset_big5;
4739   Lisp_Object attrs, charset_list, val;
4740   ptrdiff_t char_offset = coding->produced_char;
4741   ptrdiff_t last_offset = char_offset;
4742   int last_id = charset_ascii;
4743   bool eol_dos
4744     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4745   int byte_after_cr = -1;
4746
4747   CODING_GET_INFO (coding, attrs, charset_list);
4748   val = charset_list;
4749   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4750   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4751
4752   while (1)
4753     {
4754       int c, c1;
4755       struct charset *charset;
4756
4757       src_base = src;
4758       consumed_chars_base = consumed_chars;
4759
4760       if (charbuf >= charbuf_end)
4761         {
4762           if (byte_after_cr >= 0)
4763             src_base--;
4764           break;
4765         }
4766
4767       if (byte_after_cr >= 0)
4768         c = byte_after_cr, byte_after_cr = -1;
4769       else
4770         ONE_MORE_BYTE (c);
4771
4772       if (c < 0)
4773         goto invalid_code;
4774       if (c < 0x80)
4775         {
4776           if (eol_dos && c == '\r')
4777             ONE_MORE_BYTE (byte_after_cr);
4778           charset = charset_roman;
4779         }
4780       else
4781         {
4782           /* BIG5 -> Big5 */
4783           if (c < 0xA1 || c > 0xFE)
4784             goto invalid_code;
4785           ONE_MORE_BYTE (c1);
4786           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4787             goto invalid_code;
4788           c = c << 8 | c1;
4789           charset = charset_big5;
4790         }
4791       if (charset->id != charset_ascii
4792           && last_id != charset->id)
4793         {
4794           if (last_id != charset_ascii)
4795             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4796           last_id = charset->id;
4797           last_offset = char_offset;
4798         }
4799       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4800       *charbuf++ = c;
4801       char_offset++;
4802       continue;
4803
4804     invalid_code:
4805       src = src_base;
4806       consumed_chars = consumed_chars_base;
4807       ONE_MORE_BYTE (c);
4808       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4809       char_offset++;
4810       coding->errors++;
4811     }
4812
4813  no_more_source:
4814   if (last_id != charset_ascii)
4815     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4816   coding->consumed_char += consumed_chars_base;
4817   coding->consumed = src_base - coding->source;
4818   coding->charbuf_used = charbuf - coding->charbuf;
4819 }
4820
4821 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4822    This function can encode charsets `ascii', `katakana-jisx0201',
4823    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4824    are sure that all these charsets are registered as official charset
4825    (i.e. do not have extended leading-codes).  Characters of other
4826    charsets are produced without any encoding.  */
4827
4828 static bool
4829 encode_coding_sjis (struct coding_system *coding)
4830 {
4831   bool multibytep = coding->dst_multibyte;
4832   int *charbuf = coding->charbuf;
4833   int *charbuf_end = charbuf + coding->charbuf_used;
4834   unsigned char *dst = coding->destination + coding->produced;
4835   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4836   int safe_room = 4;
4837   ptrdiff_t produced_chars = 0;
4838   Lisp_Object attrs, charset_list, val;
4839   bool ascii_compatible;
4840   struct charset *charset_kanji, *charset_kana;
4841   struct charset *charset_kanji2;
4842   int c;
4843
4844   CODING_GET_INFO (coding, attrs, charset_list);
4845   val = XCDR (charset_list);
4846   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4847   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4849
4850   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4851
4852   while (charbuf < charbuf_end)
4853     {
4854       ASSURE_DESTINATION (safe_room);
4855       c = *charbuf++;
4856       /* Now encode the character C.  */
4857       if (ASCII_CHAR_P (c) && ascii_compatible)
4858         EMIT_ONE_ASCII_BYTE (c);
4859       else if (CHAR_BYTE8_P (c))
4860         {
4861           c = CHAR_TO_BYTE8 (c);
4862           EMIT_ONE_BYTE (c);
4863         }
4864       else
4865         {
4866           unsigned code;
4867           struct charset *charset;
4868           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4869                                &code, charset);
4870
4871           if (!charset)
4872             {
4873               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4874                 {
4875                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4876                   charset = CHARSET_FROM_ID (charset_ascii);
4877                 }
4878               else
4879                 {
4880                   c = coding->default_char;
4881                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4882                                        charset_list, &code, charset);
4883                 }
4884             }
4885           if (code == CHARSET_INVALID_CODE (charset))
4886             emacs_abort ();
4887           if (charset == charset_kanji)
4888             {
4889               int c1, c2;
4890               JIS_TO_SJIS (code);
4891               c1 = code >> 8, c2 = code & 0xFF;
4892               EMIT_TWO_BYTES (c1, c2);
4893             }
4894           else if (charset == charset_kana)
4895             EMIT_ONE_BYTE (code | 0x80);
4896           else if (charset_kanji2 && charset == charset_kanji2)
4897             {
4898               int c1, c2;
4899
4900               c1 = code >> 8;
4901               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4902                   || c1 == 0x28
4903                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4904                 {
4905                   JIS_TO_SJIS2 (code);
4906                   c1 = code >> 8, c2 = code & 0xFF;
4907                   EMIT_TWO_BYTES (c1, c2);
4908                 }
4909               else
4910                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4911             }
4912           else
4913             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4914         }
4915     }
4916   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4917   coding->produced_char += produced_chars;
4918   coding->produced = dst - coding->destination;
4919   return 0;
4920 }
4921
4922 static bool
4923 encode_coding_big5 (struct coding_system *coding)
4924 {
4925   bool multibytep = coding->dst_multibyte;
4926   int *charbuf = coding->charbuf;
4927   int *charbuf_end = charbuf + coding->charbuf_used;
4928   unsigned char *dst = coding->destination + coding->produced;
4929   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4930   int safe_room = 4;
4931   ptrdiff_t produced_chars = 0;
4932   Lisp_Object attrs, charset_list, val;
4933   bool ascii_compatible;
4934   struct charset *charset_big5;
4935   int c;
4936
4937   CODING_GET_INFO (coding, attrs, charset_list);
4938   val = XCDR (charset_list);
4939   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4940   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4941
4942   while (charbuf < charbuf_end)
4943     {
4944       ASSURE_DESTINATION (safe_room);
4945       c = *charbuf++;
4946       /* Now encode the character C.  */
4947       if (ASCII_CHAR_P (c) && ascii_compatible)
4948         EMIT_ONE_ASCII_BYTE (c);
4949       else if (CHAR_BYTE8_P (c))
4950         {
4951           c = CHAR_TO_BYTE8 (c);
4952           EMIT_ONE_BYTE (c);
4953         }
4954       else
4955         {
4956           unsigned code;
4957           struct charset *charset;
4958           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4959                                &code, charset);
4960
4961           if (! charset)
4962             {
4963               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4964                 {
4965                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4966                   charset = CHARSET_FROM_ID (charset_ascii);
4967                 }
4968               else
4969                 {
4970                   c = coding->default_char;
4971                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4972                                        charset_list, &code, charset);
4973                 }
4974             }
4975           if (code == CHARSET_INVALID_CODE (charset))
4976             emacs_abort ();
4977           if (charset == charset_big5)
4978             {
4979               int c1, c2;
4980
4981               c1 = code >> 8, c2 = code & 0xFF;
4982               EMIT_TWO_BYTES (c1, c2);
4983             }
4984           else
4985             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4986         }
4987     }
4988   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4989   coding->produced_char += produced_chars;
4990   coding->produced = dst - coding->destination;
4991   return 0;
4992 }
4993
4994 \f
4995 /*** 10. CCL handlers ***/
4996
4997 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4998    Return true if a text is encoded in a coding system of which
4999    encoder/decoder are written in CCL program.  */
5000
5001 static bool
5002 detect_coding_ccl (struct coding_system *coding,
5003                    struct coding_detection_info *detect_info)
5004 {
5005   const unsigned char *src = coding->source, *src_base;
5006   const unsigned char *src_end = coding->source + coding->src_bytes;
5007   bool multibytep = coding->src_multibyte;
5008   ptrdiff_t consumed_chars = 0;
5009   int found = 0;
5010   unsigned char *valids;
5011   ptrdiff_t head_ascii = coding->head_ascii;
5012   Lisp_Object attrs;
5013
5014   detect_info->checked |= CATEGORY_MASK_CCL;
5015
5016   coding = &coding_categories[coding_category_ccl];
5017   valids = CODING_CCL_VALIDS (coding);
5018   attrs = CODING_ID_ATTRS (coding->id);
5019   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5020     src += head_ascii;
5021
5022   while (1)
5023     {
5024       int c;
5025
5026       src_base = src;
5027       ONE_MORE_BYTE (c);
5028       if (c < 0 || ! valids[c])
5029         break;
5030       if ((valids[c] > 1))
5031         found = CATEGORY_MASK_CCL;
5032     }
5033   detect_info->rejected |= CATEGORY_MASK_CCL;
5034   return 0;
5035
5036  no_more_source:
5037   detect_info->found |= found;
5038   return 1;
5039 }
5040
5041 static void
5042 decode_coding_ccl (struct coding_system *coding)
5043 {
5044   const unsigned char *src = coding->source + coding->consumed;
5045   const unsigned char *src_end = coding->source + coding->src_bytes;
5046   int *charbuf = coding->charbuf + coding->charbuf_used;
5047   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5048   ptrdiff_t consumed_chars = 0;
5049   bool multibytep = coding->src_multibyte;
5050   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5051   int source_charbuf[1024];
5052   int source_byteidx[1025];
5053   Lisp_Object attrs, charset_list;
5054
5055   CODING_GET_INFO (coding, attrs, charset_list);
5056
5057   while (1)
5058     {
5059       const unsigned char *p = src;
5060       ptrdiff_t offset;
5061       int i = 0;
5062
5063       if (multibytep)
5064         {
5065           while (i < 1024 && p < src_end)
5066             {
5067               source_byteidx[i] = p - src;
5068               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5069             }
5070           source_byteidx[i] = p - src;
5071         }
5072       else
5073         while (i < 1024 && p < src_end)
5074           source_charbuf[i++] = *p++;
5075
5076       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5077         ccl->last_block = 1;
5078       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5079       charset_map_loaded = 0;
5080       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5081                   charset_list);
5082       if (charset_map_loaded
5083           && (offset = coding_change_source (coding)))
5084         {
5085           p += offset;
5086           src += offset;
5087           src_end += offset;
5088         }
5089       charbuf += ccl->produced;
5090       if (multibytep)
5091         src += source_byteidx[ccl->consumed];
5092       else
5093         src += ccl->consumed;
5094       consumed_chars += ccl->consumed;
5095       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5096         break;
5097     }
5098
5099   switch (ccl->status)
5100     {
5101     case CCL_STAT_SUSPEND_BY_SRC:
5102       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5103       break;
5104     case CCL_STAT_SUSPEND_BY_DST:
5105       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5106       break;
5107     case CCL_STAT_QUIT:
5108     case CCL_STAT_INVALID_CMD:
5109       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5110       break;
5111     default:
5112       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5113       break;
5114     }
5115   coding->consumed_char += consumed_chars;
5116   coding->consumed = src - coding->source;
5117   coding->charbuf_used = charbuf - coding->charbuf;
5118 }
5119
5120 static bool
5121 encode_coding_ccl (struct coding_system *coding)
5122 {
5123   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5124   bool multibytep = coding->dst_multibyte;
5125   int *charbuf = coding->charbuf;
5126   int *charbuf_end = charbuf + coding->charbuf_used;
5127   unsigned char *dst = coding->destination + coding->produced;
5128   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5129   int destination_charbuf[1024];
5130   ptrdiff_t produced_chars = 0;
5131   int i;
5132   Lisp_Object attrs, charset_list;
5133
5134   CODING_GET_INFO (coding, attrs, charset_list);
5135   if (coding->consumed_char == coding->src_chars
5136       && coding->mode & CODING_MODE_LAST_BLOCK)
5137     ccl->last_block = 1;
5138
5139   do
5140     {
5141       ptrdiff_t offset;
5142
5143       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5144       charset_map_loaded = 0;
5145       ccl_driver (ccl, charbuf, destination_charbuf,
5146                   charbuf_end - charbuf, 1024, charset_list);
5147       if (charset_map_loaded
5148           && (offset = coding_change_destination (coding)))
5149         dst += offset;
5150       if (multibytep)
5151         {
5152           ASSURE_DESTINATION (ccl->produced * 2);
5153           for (i = 0; i < ccl->produced; i++)
5154             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5155         }
5156       else
5157         {
5158           ASSURE_DESTINATION (ccl->produced);
5159           for (i = 0; i < ccl->produced; i++)
5160             *dst++ = destination_charbuf[i] & 0xFF;
5161           produced_chars += ccl->produced;
5162         }
5163       charbuf += ccl->consumed;
5164       if (ccl->status == CCL_STAT_QUIT
5165           || ccl->status == CCL_STAT_INVALID_CMD)
5166         break;
5167     }
5168   while (charbuf < charbuf_end);
5169
5170   switch (ccl->status)
5171     {
5172     case CCL_STAT_SUSPEND_BY_SRC:
5173       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5174       break;
5175     case CCL_STAT_SUSPEND_BY_DST:
5176       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5177       break;
5178     case CCL_STAT_QUIT:
5179     case CCL_STAT_INVALID_CMD:
5180       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5181       break;
5182     default:
5183       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5184       break;
5185     }
5186
5187   coding->produced_char += produced_chars;
5188   coding->produced = dst - coding->destination;
5189   return 0;
5190 }
5191
5192 \f
5193 /*** 10, 11. no-conversion handlers ***/
5194
5195 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5196
5197 static void
5198 decode_coding_raw_text (struct coding_system *coding)
5199 {
5200   bool eol_dos
5201     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5202
5203   coding->chars_at_source = 1;
5204   coding->consumed_char = coding->src_chars;
5205   coding->consumed = coding->src_bytes;
5206   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5207     {
5208       coding->consumed_char--;
5209       coding->consumed--;
5210       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5211     }
5212   else
5213     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5214 }
5215
5216 static bool
5217 encode_coding_raw_text (struct coding_system *coding)
5218 {
5219   bool multibytep = coding->dst_multibyte;
5220   int *charbuf = coding->charbuf;
5221   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5222   unsigned char *dst = coding->destination + coding->produced;
5223   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5224   ptrdiff_t produced_chars = 0;
5225   int c;
5226
5227   if (multibytep)
5228     {
5229       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5230
5231       if (coding->src_multibyte)
5232         while (charbuf < charbuf_end)
5233           {
5234             ASSURE_DESTINATION (safe_room);
5235             c = *charbuf++;
5236             if (ASCII_CHAR_P (c))
5237               EMIT_ONE_ASCII_BYTE (c);
5238             else if (CHAR_BYTE8_P (c))
5239               {
5240                 c = CHAR_TO_BYTE8 (c);
5241                 EMIT_ONE_BYTE (c);
5242               }
5243             else
5244               {
5245                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5246
5247                 CHAR_STRING_ADVANCE (c, p1);
5248                 do
5249                   {
5250                     EMIT_ONE_BYTE (*p0);
5251                     p0++;
5252                   }
5253                 while (p0 < p1);
5254               }
5255           }
5256       else
5257         while (charbuf < charbuf_end)
5258           {
5259             ASSURE_DESTINATION (safe_room);
5260             c = *charbuf++;
5261             EMIT_ONE_BYTE (c);
5262           }
5263     }
5264   else
5265     {
5266       if (coding->src_multibyte)
5267         {
5268           int safe_room = MAX_MULTIBYTE_LENGTH;
5269
5270           while (charbuf < charbuf_end)
5271             {
5272               ASSURE_DESTINATION (safe_room);
5273               c = *charbuf++;
5274               if (ASCII_CHAR_P (c))
5275                 *dst++ = c;
5276               else if (CHAR_BYTE8_P (c))
5277                 *dst++ = CHAR_TO_BYTE8 (c);
5278               else
5279                 CHAR_STRING_ADVANCE (c, dst);
5280             }
5281         }
5282       else
5283         {
5284           ASSURE_DESTINATION (charbuf_end - charbuf);
5285           while (charbuf < charbuf_end && dst < dst_end)
5286             *dst++ = *charbuf++;
5287         }
5288       produced_chars = dst - (coding->destination + coding->produced);
5289     }
5290   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5291   coding->produced_char += produced_chars;
5292   coding->produced = dst - coding->destination;
5293   return 0;
5294 }
5295
5296 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5297    Return true if a text is encoded in a charset-based coding system.  */
5298
5299 static bool
5300 detect_coding_charset (struct coding_system *coding,
5301                        struct coding_detection_info *detect_info)
5302 {
5303   const unsigned char *src = coding->source, *src_base;
5304   const unsigned char *src_end = coding->source + coding->src_bytes;
5305   bool multibytep = coding->src_multibyte;
5306   ptrdiff_t consumed_chars = 0;
5307   Lisp_Object attrs, valids, name;
5308   int found = 0;
5309   ptrdiff_t head_ascii = coding->head_ascii;
5310   bool check_latin_extra = 0;
5311
5312   detect_info->checked |= CATEGORY_MASK_CHARSET;
5313
5314   coding = &coding_categories[coding_category_charset];
5315   attrs = CODING_ID_ATTRS (coding->id);
5316   valids = AREF (attrs, coding_attr_charset_valids);
5317   name = CODING_ID_NAME (coding->id);
5318   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5319                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5320       || strncmp (SSDATA (SYMBOL_NAME (name)),
5321                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5322     check_latin_extra = 1;
5323
5324   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5325     src += head_ascii;
5326
5327   while (1)
5328     {
5329       int c;
5330       Lisp_Object val;
5331       struct charset *charset;
5332       int dim, idx;
5333
5334       src_base = src;
5335       ONE_MORE_BYTE (c);
5336       if (c < 0)
5337         continue;
5338       val = AREF (valids, c);
5339       if (NILP (val))
5340         break;
5341       if (c >= 0x80)
5342         {
5343           if (c < 0xA0
5344               && check_latin_extra
5345               && (!VECTORP (Vlatin_extra_code_table)
5346                   || NILP (AREF (Vlatin_extra_code_table, c))))
5347             break;
5348           found = CATEGORY_MASK_CHARSET;
5349         }
5350       if (INTEGERP (val))
5351         {
5352           charset = CHARSET_FROM_ID (XFASTINT (val));
5353           dim = CHARSET_DIMENSION (charset);
5354           for (idx = 1; idx < dim; idx++)
5355             {
5356               if (src == src_end)
5357                 goto too_short;
5358               ONE_MORE_BYTE (c);
5359               if (c < charset->code_space[(dim - 1 - idx) * 4]
5360                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5361                 break;
5362             }
5363           if (idx < dim)
5364             break;
5365         }
5366       else
5367         {
5368           idx = 1;
5369           for (; CONSP (val); val = XCDR (val))
5370             {
5371               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5372               dim = CHARSET_DIMENSION (charset);
5373               while (idx < dim)
5374                 {
5375                   if (src == src_end)
5376                     goto too_short;
5377                   ONE_MORE_BYTE (c);
5378                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5379                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5380                     break;
5381                   idx++;
5382                 }
5383               if (idx == dim)
5384                 {
5385                   val = Qnil;
5386                   break;
5387                 }
5388             }
5389           if (CONSP (val))
5390             break;
5391         }
5392     }
5393  too_short:
5394   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5395   return 0;
5396
5397  no_more_source:
5398   detect_info->found |= found;
5399   return 1;
5400 }
5401
5402 static void
5403 decode_coding_charset (struct coding_system *coding)
5404 {
5405   const unsigned char *src = coding->source + coding->consumed;
5406   const unsigned char *src_end = coding->source + coding->src_bytes;
5407   const unsigned char *src_base;
5408   int *charbuf = coding->charbuf + coding->charbuf_used;
5409   /* We may produce one charset annotation in one loop and one more at
5410      the end.  */
5411   int *charbuf_end
5412     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5414   bool multibytep = coding->src_multibyte;
5415   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5416   Lisp_Object valids;
5417   ptrdiff_t char_offset = coding->produced_char;
5418   ptrdiff_t last_offset = char_offset;
5419   int last_id = charset_ascii;
5420   bool eol_dos
5421     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5422   int byte_after_cr = -1;
5423
5424   valids = AREF (attrs, coding_attr_charset_valids);
5425
5426   while (1)
5427     {
5428       int c;
5429       Lisp_Object val;
5430       struct charset *charset;
5431       int dim;
5432       int len = 1;
5433       unsigned code;
5434
5435       src_base = src;
5436       consumed_chars_base = consumed_chars;
5437
5438       if (charbuf >= charbuf_end)
5439         {
5440           if (byte_after_cr >= 0)
5441             src_base--;
5442           break;
5443         }
5444
5445       if (byte_after_cr >= 0)
5446         {
5447           c = byte_after_cr;
5448           byte_after_cr = -1;
5449         }
5450       else
5451         {
5452           ONE_MORE_BYTE (c);
5453           if (eol_dos && c == '\r')
5454             ONE_MORE_BYTE (byte_after_cr);
5455         }
5456       if (c < 0)
5457         goto invalid_code;
5458       code = c;
5459
5460       val = AREF (valids, c);
5461       if (! INTEGERP (val) && ! CONSP (val))
5462         goto invalid_code;
5463       if (INTEGERP (val))
5464         {
5465           charset = CHARSET_FROM_ID (XFASTINT (val));
5466           dim = CHARSET_DIMENSION (charset);
5467           while (len < dim)
5468             {
5469               ONE_MORE_BYTE (c);
5470               code = (code << 8) | c;
5471               len++;
5472             }
5473           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5474                               charset, code, c);
5475         }
5476       else
5477         {
5478           /* VAL is a list of charset IDs.  It is assured that the
5479              list is sorted by charset dimensions (smaller one
5480              comes first).  */
5481           while (CONSP (val))
5482             {
5483               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5484               dim = CHARSET_DIMENSION (charset);
5485               while (len < dim)
5486                 {
5487                   ONE_MORE_BYTE (c);
5488                   code = (code << 8) | c;
5489                   len++;
5490                 }
5491               CODING_DECODE_CHAR (coding, src, src_base,
5492                                   src_end, charset, code, c);
5493               if (c >= 0)
5494                 break;
5495               val = XCDR (val);
5496             }
5497         }
5498       if (c < 0)
5499         goto invalid_code;
5500       if (charset->id != charset_ascii
5501           && last_id != charset->id)
5502         {
5503           if (last_id != charset_ascii)
5504             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5505           last_id = charset->id;
5506           last_offset = char_offset;
5507         }
5508
5509       *charbuf++ = c;
5510       char_offset++;
5511       continue;
5512
5513     invalid_code:
5514       src = src_base;
5515       consumed_chars = consumed_chars_base;
5516       ONE_MORE_BYTE (c);
5517       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5518       char_offset++;
5519       coding->errors++;
5520     }
5521
5522  no_more_source:
5523   if (last_id != charset_ascii)
5524     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5525   coding->consumed_char += consumed_chars_base;
5526   coding->consumed = src_base - coding->source;
5527   coding->charbuf_used = charbuf - coding->charbuf;
5528 }
5529
5530 static bool
5531 encode_coding_charset (struct coding_system *coding)
5532 {
5533   bool multibytep = coding->dst_multibyte;
5534   int *charbuf = coding->charbuf;
5535   int *charbuf_end = charbuf + coding->charbuf_used;
5536   unsigned char *dst = coding->destination + coding->produced;
5537   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5538   int safe_room = MAX_MULTIBYTE_LENGTH;
5539   ptrdiff_t produced_chars = 0;
5540   Lisp_Object attrs, charset_list;
5541   bool ascii_compatible;
5542   int c;
5543
5544   CODING_GET_INFO (coding, attrs, charset_list);
5545   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5546
5547   while (charbuf < charbuf_end)
5548     {
5549       struct charset *charset;
5550       unsigned code;
5551
5552       ASSURE_DESTINATION (safe_room);
5553       c = *charbuf++;
5554       if (ascii_compatible && ASCII_CHAR_P (c))
5555         EMIT_ONE_ASCII_BYTE (c);
5556       else if (CHAR_BYTE8_P (c))
5557         {
5558           c = CHAR_TO_BYTE8 (c);
5559           EMIT_ONE_BYTE (c);
5560         }
5561       else
5562         {
5563           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5564                                &code, charset);
5565
5566           if (charset)
5567             {
5568               if (CHARSET_DIMENSION (charset) == 1)
5569                 EMIT_ONE_BYTE (code);
5570               else if (CHARSET_DIMENSION (charset) == 2)
5571                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5572               else if (CHARSET_DIMENSION (charset) == 3)
5573                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5574               else
5575                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5576                                  (code >> 8) & 0xFF, code & 0xFF);
5577             }
5578           else
5579             {
5580               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5581                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5582               else
5583                 c = coding->default_char;
5584               EMIT_ONE_BYTE (c);
5585             }
5586         }
5587     }
5588
5589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5590   coding->produced_char += produced_chars;
5591   coding->produced = dst - coding->destination;
5592   return 0;
5593 }
5594
5595 \f
5596 /*** 7. C library functions ***/
5597
5598 /* Setup coding context CODING from information about CODING_SYSTEM.
5599    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5600    CODING_SYSTEM is invalid, signal an error.  */
5601
5602 void
5603 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5604 {
5605   Lisp_Object attrs;
5606   Lisp_Object eol_type;
5607   Lisp_Object coding_type;
5608   Lisp_Object val;
5609
5610   if (NILP (coding_system))
5611     coding_system = Qundecided;
5612
5613   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5614
5615   attrs = CODING_ID_ATTRS (coding->id);
5616   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5617
5618   coding->mode = 0;
5619   coding->head_ascii = -1;
5620   if (VECTORP (eol_type))
5621     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5622                             | CODING_REQUIRE_DETECTION_MASK);
5623   else if (! EQ (eol_type, Qunix))
5624     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5625                             | CODING_REQUIRE_ENCODING_MASK);
5626   else
5627     coding->common_flags = 0;
5628   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5629     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5630   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5631     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5632   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5633     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5634
5635   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5636   coding->max_charset_id = SCHARS (val) - 1;
5637   coding->safe_charsets = SDATA (val);
5638   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5639   coding->carryover_bytes = 0;
5640
5641   coding_type = CODING_ATTR_TYPE (attrs);
5642   if (EQ (coding_type, Qundecided))
5643     {
5644       coding->detector = NULL;
5645       coding->decoder = decode_coding_raw_text;
5646       coding->encoder = encode_coding_raw_text;
5647       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5648     }
5649   else if (EQ (coding_type, Qiso_2022))
5650     {
5651       int i;
5652       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5653
5654       /* Invoke graphic register 0 to plane 0.  */
5655       CODING_ISO_INVOCATION (coding, 0) = 0;
5656       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5657       CODING_ISO_INVOCATION (coding, 1)
5658         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5659       /* Setup the initial status of designation.  */
5660       for (i = 0; i < 4; i++)
5661         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5662       /* Not single shifting initially.  */
5663       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5664       /* Beginning of buffer should also be regarded as bol. */
5665       CODING_ISO_BOL (coding) = 1;
5666       coding->detector = detect_coding_iso_2022;
5667       coding->decoder = decode_coding_iso_2022;
5668       coding->encoder = encode_coding_iso_2022;
5669       if (flags & CODING_ISO_FLAG_SAFE)
5670         coding->mode |= CODING_MODE_SAFE_ENCODING;
5671       coding->common_flags
5672         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5673             | CODING_REQUIRE_FLUSHING_MASK);
5674       if (flags & CODING_ISO_FLAG_COMPOSITION)
5675         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5676       if (flags & CODING_ISO_FLAG_DESIGNATION)
5677         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5678       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5679         {
5680           setup_iso_safe_charsets (attrs);
5681           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5682           coding->max_charset_id = SCHARS (val) - 1;
5683           coding->safe_charsets = SDATA (val);
5684         }
5685       CODING_ISO_FLAGS (coding) = flags;
5686       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5687       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5688       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5689       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5690     }
5691   else if (EQ (coding_type, Qcharset))
5692     {
5693       coding->detector = detect_coding_charset;
5694       coding->decoder = decode_coding_charset;
5695       coding->encoder = encode_coding_charset;
5696       coding->common_flags
5697         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5698     }
5699   else if (EQ (coding_type, Qutf_8))
5700     {
5701       val = AREF (attrs, coding_attr_utf_bom);
5702       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5703                                    : EQ (val, Qt) ? utf_with_bom
5704                                    : utf_without_bom);
5705       coding->detector = detect_coding_utf_8;
5706       coding->decoder = decode_coding_utf_8;
5707       coding->encoder = encode_coding_utf_8;
5708       coding->common_flags
5709         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5710       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5711         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5712     }
5713   else if (EQ (coding_type, Qutf_16))
5714     {
5715       val = AREF (attrs, coding_attr_utf_bom);
5716       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5717                                     : EQ (val, Qt) ? utf_with_bom
5718                                     : utf_without_bom);
5719       val = AREF (attrs, coding_attr_utf_16_endian);
5720       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5721                                        : utf_16_little_endian);
5722       CODING_UTF_16_SURROGATE (coding) = 0;
5723       coding->detector = detect_coding_utf_16;
5724       coding->decoder = decode_coding_utf_16;
5725       coding->encoder = encode_coding_utf_16;
5726       coding->common_flags
5727         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5728       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5729         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5730     }
5731   else if (EQ (coding_type, Qccl))
5732     {
5733       coding->detector = detect_coding_ccl;
5734       coding->decoder = decode_coding_ccl;
5735       coding->encoder = encode_coding_ccl;
5736       coding->common_flags
5737         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5738             | CODING_REQUIRE_FLUSHING_MASK);
5739     }
5740   else if (EQ (coding_type, Qemacs_mule))
5741     {
5742       coding->detector = detect_coding_emacs_mule;
5743       coding->decoder = decode_coding_emacs_mule;
5744       coding->encoder = encode_coding_emacs_mule;
5745       coding->common_flags
5746         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5747       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5748           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5749         {
5750           Lisp_Object tail, safe_charsets;
5751           int max_charset_id = 0;
5752
5753           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5754                tail = XCDR (tail))
5755             if (max_charset_id < XFASTINT (XCAR (tail)))
5756               max_charset_id = XFASTINT (XCAR (tail));
5757           safe_charsets = make_uninit_string (max_charset_id + 1);
5758           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5759           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760                tail = XCDR (tail))
5761             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5762           coding->max_charset_id = max_charset_id;
5763           coding->safe_charsets = SDATA (safe_charsets);
5764         }
5765       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5766       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5767     }
5768   else if (EQ (coding_type, Qshift_jis))
5769     {
5770       coding->detector = detect_coding_sjis;
5771       coding->decoder = decode_coding_sjis;
5772       coding->encoder = encode_coding_sjis;
5773       coding->common_flags
5774         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5775     }
5776   else if (EQ (coding_type, Qbig5))
5777     {
5778       coding->detector = detect_coding_big5;
5779       coding->decoder = decode_coding_big5;
5780       coding->encoder = encode_coding_big5;
5781       coding->common_flags
5782         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783     }
5784   else                          /* EQ (coding_type, Qraw_text) */
5785     {
5786       coding->detector = NULL;
5787       coding->decoder = decode_coding_raw_text;
5788       coding->encoder = encode_coding_raw_text;
5789       if (! EQ (eol_type, Qunix))
5790         {
5791           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5792           if (! VECTORP (eol_type))
5793             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5794         }
5795
5796     }
5797
5798   return;
5799 }
5800
5801 /* Return a list of charsets supported by CODING.  */
5802
5803 Lisp_Object
5804 coding_charset_list (struct coding_system *coding)
5805 {
5806   Lisp_Object attrs, charset_list;
5807
5808   CODING_GET_INFO (coding, attrs, charset_list);
5809   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5810     {
5811       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5812
5813       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5814         charset_list = Viso_2022_charset_list;
5815     }
5816   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5817     {
5818       charset_list = Vemacs_mule_charset_list;
5819     }
5820   return charset_list;
5821 }
5822
5823
5824 /* Return a list of charsets supported by CODING-SYSTEM.  */
5825
5826 Lisp_Object
5827 coding_system_charset_list (Lisp_Object coding_system)
5828 {
5829   ptrdiff_t id;
5830   Lisp_Object attrs, charset_list;
5831
5832   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5833   attrs = CODING_ID_ATTRS (id);
5834
5835   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5836     {
5837       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5838
5839       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5840         charset_list = Viso_2022_charset_list;
5841       else
5842         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5843     }
5844   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5845     {
5846       charset_list = Vemacs_mule_charset_list;
5847     }
5848   else
5849     {
5850       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5851     }
5852   return charset_list;
5853 }
5854
5855
5856 /* Return raw-text or one of its subsidiaries that has the same
5857    eol_type as CODING-SYSTEM.  */
5858
5859 Lisp_Object
5860 raw_text_coding_system (Lisp_Object coding_system)
5861 {
5862   Lisp_Object spec, attrs;
5863   Lisp_Object eol_type, raw_text_eol_type;
5864
5865   if (NILP (coding_system))
5866     return Qraw_text;
5867   spec = CODING_SYSTEM_SPEC (coding_system);
5868   attrs = AREF (spec, 0);
5869
5870   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5871     return coding_system;
5872
5873   eol_type = AREF (spec, 2);
5874   if (VECTORP (eol_type))
5875     return Qraw_text;
5876   spec = CODING_SYSTEM_SPEC (Qraw_text);
5877   raw_text_eol_type = AREF (spec, 2);
5878   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5879           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5880           : AREF (raw_text_eol_type, 2));
5881 }
5882
5883
5884 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5885    the subsidiary that has the same eol-spec as PARENT (if it is not
5886    nil and specifies end-of-line format) or the system's setting
5887    (system_eol_type).  */
5888
5889 Lisp_Object
5890 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5891 {
5892   Lisp_Object spec, eol_type;
5893
5894   if (NILP (coding_system))
5895     coding_system = Qraw_text;
5896   spec = CODING_SYSTEM_SPEC (coding_system);
5897   eol_type = AREF (spec, 2);
5898   if (VECTORP (eol_type))
5899     {
5900       Lisp_Object parent_eol_type;
5901
5902       if (! NILP (parent))
5903         {
5904           Lisp_Object parent_spec;
5905
5906           parent_spec = CODING_SYSTEM_SPEC (parent);
5907           parent_eol_type = AREF (parent_spec, 2);
5908           if (VECTORP (parent_eol_type))
5909             parent_eol_type = system_eol_type;
5910         }
5911       else
5912         parent_eol_type = system_eol_type;
5913       if (EQ (parent_eol_type, Qunix))
5914         coding_system = AREF (eol_type, 0);
5915       else if (EQ (parent_eol_type, Qdos))
5916         coding_system = AREF (eol_type, 1);
5917       else if (EQ (parent_eol_type, Qmac))
5918         coding_system = AREF (eol_type, 2);
5919     }
5920   return coding_system;
5921 }
5922
5923
5924 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5925    decided for writing to a process.  If not, complement them, and
5926    return a new coding system.  */
5927
5928 Lisp_Object
5929 complement_process_encoding_system (Lisp_Object coding_system)
5930 {
5931   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5932   Lisp_Object spec, attrs;
5933   int i;
5934
5935   for (i = 0; i < 3; i++)
5936     {
5937       if (i == 1)
5938         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5939       else if (i == 2)
5940         coding_system = preferred_coding_system ();
5941       spec = CODING_SYSTEM_SPEC (coding_system);
5942       if (NILP (spec))
5943         continue;
5944       attrs = AREF (spec, 0);
5945       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5946         coding_base = CODING_ATTR_BASE_NAME (attrs);
5947       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5948         eol_base = coding_system;
5949       if (! NILP (coding_base) && ! NILP (eol_base))
5950         break;
5951     }
5952
5953   if (i > 0)
5954     /* The original CODING_SYSTEM didn't specify text-conversion or
5955        eol-conversion.  Be sure that we return a fully complemented
5956        coding system.  */
5957     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5958   return coding_system;
5959 }
5960
5961
5962 /* Emacs has a mechanism to automatically detect a coding system if it
5963    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5964    it's impossible to distinguish some coding systems accurately
5965    because they use the same range of codes.  So, at first, coding
5966    systems are categorized into 7, those are:
5967
5968    o coding-category-emacs-mule
5969
5970         The category for a coding system which has the same code range
5971         as Emacs' internal format.  Assigned the coding-system (Lisp
5972         symbol) `emacs-mule' by default.
5973
5974    o coding-category-sjis
5975
5976         The category for a coding system which has the same code range
5977         as SJIS.  Assigned the coding-system (Lisp
5978         symbol) `japanese-shift-jis' by default.
5979
5980    o coding-category-iso-7
5981
5982         The category for a coding system which has the same code range
5983         as ISO2022 of 7-bit environment.  This doesn't use any locking
5984         shift and single shift functions.  This can encode/decode all
5985         charsets.  Assigned the coding-system (Lisp symbol)
5986         `iso-2022-7bit' by default.
5987
5988    o coding-category-iso-7-tight
5989
5990         Same as coding-category-iso-7 except that this can
5991         encode/decode only the specified charsets.
5992
5993    o coding-category-iso-8-1
5994
5995         The category for a coding system which has the same code range
5996         as ISO2022 of 8-bit environment and graphic plane 1 used only
5997         for DIMENSION1 charset.  This doesn't use any locking shift
5998         and single shift functions.  Assigned the coding-system (Lisp
5999         symbol) `iso-latin-1' by default.
6000
6001    o coding-category-iso-8-2
6002
6003         The category for a coding system which has the same code range
6004         as ISO2022 of 8-bit environment and graphic plane 1 used only
6005         for DIMENSION2 charset.  This doesn't use any locking shift
6006         and single shift functions.  Assigned the coding-system (Lisp
6007         symbol) `japanese-iso-8bit' by default.
6008
6009    o coding-category-iso-7-else
6010
6011         The category for a coding system which has the same code range
6012         as ISO2022 of 7-bit environment but uses locking shift or
6013         single shift functions.  Assigned the coding-system (Lisp
6014         symbol) `iso-2022-7bit-lock' by default.
6015
6016    o coding-category-iso-8-else
6017
6018         The category for a coding system which has the same code range
6019         as ISO2022 of 8-bit environment but uses locking shift or
6020         single shift functions.  Assigned the coding-system (Lisp
6021         symbol) `iso-2022-8bit-ss2' by default.
6022
6023    o coding-category-big5
6024
6025         The category for a coding system which has the same code range
6026         as BIG5.  Assigned the coding-system (Lisp symbol)
6027         `cn-big5' by default.
6028
6029    o coding-category-utf-8
6030
6031         The category for a coding system which has the same code range
6032         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6033         symbol) `utf-8' by default.
6034
6035    o coding-category-utf-16-be
6036
6037         The category for a coding system in which a text has an
6038         Unicode signature (cf. Unicode Standard) in the order of BIG
6039         endian at the head.  Assigned the coding-system (Lisp symbol)
6040         `utf-16-be' by default.
6041
6042    o coding-category-utf-16-le
6043
6044         The category for a coding system in which a text has an
6045         Unicode signature (cf. Unicode Standard) in the order of
6046         LITTLE endian at the head.  Assigned the coding-system (Lisp
6047         symbol) `utf-16-le' by default.
6048
6049    o coding-category-ccl
6050
6051         The category for a coding system of which encoder/decoder is
6052         written in CCL programs.  The default value is nil, i.e., no
6053         coding system is assigned.
6054
6055    o coding-category-binary
6056
6057         The category for a coding system not categorized in any of the
6058         above.  Assigned the coding-system (Lisp symbol)
6059         `no-conversion' by default.
6060
6061    Each of them is a Lisp symbol and the value is an actual
6062    `coding-system's (this is also a Lisp symbol) assigned by a user.
6063    What Emacs does actually is to detect a category of coding system.
6064    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6065    decide only one possible category, it selects a category of the
6066    highest priority.  Priorities of categories are also specified by a
6067    user in a Lisp variable `coding-category-list'.
6068
6069 */
6070
6071 #define EOL_SEEN_NONE   0
6072 #define EOL_SEEN_LF     1
6073 #define EOL_SEEN_CR     2
6074 #define EOL_SEEN_CRLF   4
6075
6076 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6077    SOURCE is encoded.  If CATEGORY is one of
6078    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6079    two-byte, else they are encoded by one-byte.
6080
6081    Return one of EOL_SEEN_XXX.  */
6082
6083 #define MAX_EOL_CHECK_COUNT 3
6084
6085 static int
6086 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6087             enum coding_category category)
6088 {
6089   const unsigned char *src = source, *src_end = src + src_bytes;
6090   unsigned char c;
6091   int total  = 0;
6092   int eol_seen = EOL_SEEN_NONE;
6093
6094   if ((1 << category) & CATEGORY_MASK_UTF_16)
6095     {
6096       bool msb = category == (coding_category_utf_16_le
6097                               | coding_category_utf_16_le_nosig);
6098       bool lsb = !msb;
6099
6100       while (src + 1 < src_end)
6101         {
6102           c = src[lsb];
6103           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6104             {
6105               int this_eol;
6106
6107               if (c == '\n')
6108                 this_eol = EOL_SEEN_LF;
6109               else if (src + 3 >= src_end
6110                        || src[msb + 2] != 0
6111                        || src[lsb + 2] != '\n')
6112                 this_eol = EOL_SEEN_CR;
6113               else
6114                 {
6115                   this_eol = EOL_SEEN_CRLF;
6116                   src += 2;
6117                 }
6118
6119               if (eol_seen == EOL_SEEN_NONE)
6120                 /* This is the first end-of-line.  */
6121                 eol_seen = this_eol;
6122               else if (eol_seen != this_eol)
6123                 {
6124                   /* The found type is different from what found before.
6125                      Allow for stray ^M characters in DOS EOL files.  */
6126                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6127                       || (eol_seen == EOL_SEEN_CRLF
6128                           && this_eol == EOL_SEEN_CR))
6129                     eol_seen = EOL_SEEN_CRLF;
6130                   else
6131                     {
6132                       eol_seen = EOL_SEEN_LF;
6133                       break;
6134                     }
6135                 }
6136               if (++total == MAX_EOL_CHECK_COUNT)
6137                 break;
6138             }
6139           src += 2;
6140         }
6141     }
6142   else
6143     while (src < src_end)
6144       {
6145         c = *src++;
6146         if (c == '\n' || c == '\r')
6147           {
6148             int this_eol;
6149
6150             if (c == '\n')
6151               this_eol = EOL_SEEN_LF;
6152             else if (src >= src_end || *src != '\n')
6153               this_eol = EOL_SEEN_CR;
6154             else
6155               this_eol = EOL_SEEN_CRLF, src++;
6156
6157             if (eol_seen == EOL_SEEN_NONE)
6158               /* This is the first end-of-line.  */
6159               eol_seen = this_eol;
6160             else if (eol_seen != this_eol)
6161               {
6162                 /* The found type is different from what found before.
6163                    Allow for stray ^M characters in DOS EOL files.  */
6164                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6165                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6166                   eol_seen = EOL_SEEN_CRLF;
6167                 else
6168                   {
6169                     eol_seen = EOL_SEEN_LF;
6170                     break;
6171                   }
6172               }
6173             if (++total == MAX_EOL_CHECK_COUNT)
6174               break;
6175           }
6176       }
6177   return eol_seen;
6178 }
6179
6180
6181 static Lisp_Object
6182 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6183 {
6184   Lisp_Object eol_type;
6185
6186   eol_type = CODING_ID_EOL_TYPE (coding->id);
6187   if (eol_seen & EOL_SEEN_LF)
6188     {
6189       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6190       eol_type = Qunix;
6191     }
6192   else if (eol_seen & EOL_SEEN_CRLF)
6193     {
6194       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6195       eol_type = Qdos;
6196     }
6197   else if (eol_seen & EOL_SEEN_CR)
6198     {
6199       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6200       eol_type = Qmac;
6201     }
6202   return eol_type;
6203 }
6204
6205 /* Detect how a text specified in CODING is encoded.  If a coding
6206    system is detected, update fields of CODING by the detected coding
6207    system.  */
6208
6209 static void
6210 detect_coding (struct coding_system *coding)
6211 {
6212   const unsigned char *src, *src_end;
6213   unsigned int saved_mode = coding->mode;
6214
6215   coding->consumed = coding->consumed_char = 0;
6216   coding->produced = coding->produced_char = 0;
6217   coding_set_source (coding);
6218
6219   src_end = coding->source + coding->src_bytes;
6220   coding->head_ascii = 0;
6221
6222   /* If we have not yet decided the text encoding type, detect it
6223      now.  */
6224   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6225     {
6226       int c, i;
6227       struct coding_detection_info detect_info;
6228       bool null_byte_found = 0, eight_bit_found = 0;
6229
6230       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6231       for (src = coding->source; src < src_end; src++)
6232         {
6233           c = *src;
6234           if (c & 0x80)
6235             {
6236               eight_bit_found = 1;
6237               if (null_byte_found)
6238                 break;
6239             }
6240           else if (c < 0x20)
6241             {
6242               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6243                   && ! inhibit_iso_escape_detection
6244                   && ! detect_info.checked)
6245                 {
6246                   if (detect_coding_iso_2022 (coding, &detect_info))
6247                     {
6248                       /* We have scanned the whole data.  */
6249                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6250                         {
6251                           /* We didn't find an 8-bit code.  We may
6252                              have found a null-byte, but it's very
6253                              rare that a binary file conforms to
6254                              ISO-2022.  */
6255                           src = src_end;
6256                           coding->head_ascii = src - coding->source;
6257                         }
6258                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6259                       break;
6260                     }
6261                 }
6262               else if (! c && !inhibit_null_byte_detection)
6263                 {
6264                   null_byte_found = 1;
6265                   if (eight_bit_found)
6266                     break;
6267                 }
6268               if (! eight_bit_found)
6269                 coding->head_ascii++;
6270             }
6271           else if (! eight_bit_found)
6272             coding->head_ascii++;
6273         }
6274
6275       if (null_byte_found || eight_bit_found
6276           || coding->head_ascii < coding->src_bytes
6277           || detect_info.found)
6278         {
6279           enum coding_category category;
6280           struct coding_system *this;
6281
6282           if (coding->head_ascii == coding->src_bytes)
6283             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6284             for (i = 0; i < coding_category_raw_text; i++)
6285               {
6286                 category = coding_priorities[i];
6287                 this = coding_categories + category;
6288                 if (detect_info.found & (1 << category))
6289                   break;
6290               }
6291           else
6292             {
6293               if (null_byte_found)
6294                 {
6295                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6296                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6297                 }
6298               for (i = 0; i < coding_category_raw_text; i++)
6299                 {
6300                   category = coding_priorities[i];
6301                   this = coding_categories + category;
6302                   /* Some of this->detector (e.g. detect_coding_sjis)
6303                      require this information.  */
6304                   coding->id = this->id;
6305                   if (this->id < 0)
6306                     {
6307                       /* No coding system of this category is defined.  */
6308                       detect_info.rejected |= (1 << category);
6309                     }
6310                   else if (category >= coding_category_raw_text)
6311                     continue;
6312                   else if (detect_info.checked & (1 << category))
6313                     {
6314                       if (detect_info.found & (1 << category))
6315                         break;
6316                     }
6317                   else if ((*(this->detector)) (coding, &detect_info)
6318                            && detect_info.found & (1 << category))
6319                     {
6320                       if (category == coding_category_utf_16_auto)
6321                         {
6322                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6323                             category = coding_category_utf_16_le;
6324                           else
6325                             category = coding_category_utf_16_be;
6326                         }
6327                       break;
6328                     }
6329                 }
6330             }
6331
6332           if (i < coding_category_raw_text)
6333             setup_coding_system (CODING_ID_NAME (this->id), coding);
6334           else if (null_byte_found)
6335             setup_coding_system (Qno_conversion, coding);
6336           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6337                    == CATEGORY_MASK_ANY)
6338             setup_coding_system (Qraw_text, coding);
6339           else if (detect_info.rejected)
6340             for (i = 0; i < coding_category_raw_text; i++)
6341               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6342                 {
6343                   this = coding_categories + coding_priorities[i];
6344                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6345                   break;
6346                 }
6347         }
6348     }
6349   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6350            == coding_category_utf_8_auto)
6351     {
6352       Lisp_Object coding_systems;
6353       struct coding_detection_info detect_info;
6354
6355       coding_systems
6356         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6357       detect_info.found = detect_info.rejected = 0;
6358       coding->head_ascii = 0;
6359       if (CONSP (coding_systems)
6360           && detect_coding_utf_8 (coding, &detect_info))
6361         {
6362           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6363             setup_coding_system (XCAR (coding_systems), coding);
6364           else
6365             setup_coding_system (XCDR (coding_systems), coding);
6366         }
6367     }
6368   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6369            == coding_category_utf_16_auto)
6370     {
6371       Lisp_Object coding_systems;
6372       struct coding_detection_info detect_info;
6373
6374       coding_systems
6375         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6376       detect_info.found = detect_info.rejected = 0;
6377       coding->head_ascii = 0;
6378       if (CONSP (coding_systems)
6379           && detect_coding_utf_16 (coding, &detect_info))
6380         {
6381           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6382             setup_coding_system (XCAR (coding_systems), coding);
6383           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6384             setup_coding_system (XCDR (coding_systems), coding);
6385         }
6386     }
6387   coding->mode = saved_mode;
6388 }
6389
6390
6391 static void
6392 decode_eol (struct coding_system *coding)
6393 {
6394   Lisp_Object eol_type;
6395   unsigned char *p, *pbeg, *pend;
6396
6397   eol_type = CODING_ID_EOL_TYPE (coding->id);
6398   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6399     return;
6400
6401   if (NILP (coding->dst_object))
6402     pbeg = coding->destination;
6403   else
6404     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6405   pend = pbeg + coding->produced;
6406
6407   if (VECTORP (eol_type))
6408     {
6409       int eol_seen = EOL_SEEN_NONE;
6410
6411       for (p = pbeg; p < pend; p++)
6412         {
6413           if (*p == '\n')
6414             eol_seen |= EOL_SEEN_LF;
6415           else if (*p == '\r')
6416             {
6417               if (p + 1 < pend && *(p + 1) == '\n')
6418                 {
6419                   eol_seen |= EOL_SEEN_CRLF;
6420                   p++;
6421                 }
6422               else
6423                 eol_seen |= EOL_SEEN_CR;
6424             }
6425         }
6426       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6427       if ((eol_seen & EOL_SEEN_CRLF) != 0
6428           && (eol_seen & EOL_SEEN_CR) != 0
6429           && (eol_seen & EOL_SEEN_LF) == 0)
6430         eol_seen = EOL_SEEN_CRLF;
6431       else if (eol_seen != EOL_SEEN_NONE
6432           && eol_seen != EOL_SEEN_LF
6433           && eol_seen != EOL_SEEN_CRLF
6434           && eol_seen != EOL_SEEN_CR)
6435         eol_seen = EOL_SEEN_LF;
6436       if (eol_seen != EOL_SEEN_NONE)
6437         eol_type = adjust_coding_eol_type (coding, eol_seen);
6438     }
6439
6440   if (EQ (eol_type, Qmac))
6441     {
6442       for (p = pbeg; p < pend; p++)
6443         if (*p == '\r')
6444           *p = '\n';
6445     }
6446   else if (EQ (eol_type, Qdos))
6447     {
6448       ptrdiff_t n = 0;
6449
6450       if (NILP (coding->dst_object))
6451         {
6452           /* Start deleting '\r' from the tail to minimize the memory
6453              movement.  */
6454           for (p = pend - 2; p >= pbeg; p--)
6455             if (*p == '\r')
6456               {
6457                 memmove (p, p + 1, pend-- - p - 1);
6458                 n++;
6459               }
6460         }
6461       else
6462         {
6463           ptrdiff_t pos_byte = coding->dst_pos_byte;
6464           ptrdiff_t pos = coding->dst_pos;
6465           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6466
6467           while (pos < pos_end)
6468             {
6469               p = BYTE_POS_ADDR (pos_byte);
6470               if (*p == '\r' && p[1] == '\n')
6471                 {
6472                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6473                   n++;
6474                   pos_end--;
6475                 }
6476               pos++;
6477               if (coding->dst_multibyte)
6478                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6479               else
6480                 pos_byte++;
6481             }
6482         }
6483       coding->produced -= n;
6484       coding->produced_char -= n;
6485     }
6486 }
6487
6488
6489 /* Return a translation table (or list of them) from coding system
6490    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6491    not ENCODEP). */
6492
6493 static Lisp_Object
6494 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6495 {
6496   Lisp_Object standard, translation_table;
6497   Lisp_Object val;
6498
6499   if (NILP (Venable_character_translation))
6500     {
6501       if (max_lookup)
6502         *max_lookup = 0;
6503       return Qnil;
6504     }
6505   if (encodep)
6506     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6507       standard = Vstandard_translation_table_for_encode;
6508   else
6509     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6510       standard = Vstandard_translation_table_for_decode;
6511   if (NILP (translation_table))
6512     translation_table = standard;
6513   else
6514     {
6515       if (SYMBOLP (translation_table))
6516         translation_table = Fget (translation_table, Qtranslation_table);
6517       else if (CONSP (translation_table))
6518         {
6519           translation_table = Fcopy_sequence (translation_table);
6520           for (val = translation_table; CONSP (val); val = XCDR (val))
6521             if (SYMBOLP (XCAR (val)))
6522               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6523         }
6524       if (CHAR_TABLE_P (standard))
6525         {
6526           if (CONSP (translation_table))
6527             translation_table = nconc2 (translation_table,
6528                                         Fcons (standard, Qnil));
6529           else
6530             translation_table = Fcons (translation_table,
6531                                        Fcons (standard, Qnil));
6532         }
6533     }
6534
6535   if (max_lookup)
6536     {
6537       *max_lookup = 1;
6538       if (CHAR_TABLE_P (translation_table)
6539           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6540         {
6541           val = XCHAR_TABLE (translation_table)->extras[1];
6542           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6543             *max_lookup = XFASTINT (val);
6544         }
6545       else if (CONSP (translation_table))
6546         {
6547           Lisp_Object tail;
6548
6549           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6550             if (CHAR_TABLE_P (XCAR (tail))
6551                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6552               {
6553                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6554                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6555                   *max_lookup = XFASTINT (tailval);
6556               }
6557         }
6558     }
6559   return translation_table;
6560 }
6561
6562 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6563   do {                                                          \
6564     trans = Qnil;                                               \
6565     if (CHAR_TABLE_P (table))                                   \
6566       {                                                         \
6567         trans = CHAR_TABLE_REF (table, c);                      \
6568         if (CHARACTERP (trans))                                 \
6569           c = XFASTINT (trans), trans = Qnil;                   \
6570       }                                                         \
6571     else if (CONSP (table))                                     \
6572       {                                                         \
6573         Lisp_Object tail;                                       \
6574                                                                 \
6575         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6576           if (CHAR_TABLE_P (XCAR (tail)))                       \
6577             {                                                   \
6578               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6579               if (CHARACTERP (trans))                           \
6580                 c = XFASTINT (trans), trans = Qnil;             \
6581               else if (! NILP (trans))                          \
6582                 break;                                          \
6583             }                                                   \
6584       }                                                         \
6585   } while (0)
6586
6587
6588 /* Return a translation of character(s) at BUF according to TRANS.
6589    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6590    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6591    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6592    translation is found, and Qnil if not found..
6593    If BUF is too short to lookup characters in FROM, return Qt.  */
6594
6595 static Lisp_Object
6596 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6597 {
6598
6599   if (INTEGERP (trans))
6600     return trans;
6601   for (; CONSP (trans); trans = XCDR (trans))
6602     {
6603       Lisp_Object val = XCAR (trans);
6604       Lisp_Object from = XCAR (val);
6605       ptrdiff_t len = ASIZE (from);
6606       ptrdiff_t i;
6607
6608       for (i = 0; i < len; i++)
6609         {
6610           if (buf + i == buf_end)
6611             return Qt;
6612           if (XINT (AREF (from, i)) != buf[i])
6613             break;
6614         }
6615       if (i == len)
6616         return val;
6617     }
6618   return Qnil;
6619 }
6620
6621
6622 static int
6623 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6624                bool last_block)
6625 {
6626   unsigned char *dst = coding->destination + coding->produced;
6627   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6628   ptrdiff_t produced;
6629   ptrdiff_t produced_chars = 0;
6630   int carryover = 0;
6631
6632   if (! coding->chars_at_source)
6633     {
6634       /* Source characters are in coding->charbuf.  */
6635       int *buf = coding->charbuf;
6636       int *buf_end = buf + coding->charbuf_used;
6637
6638       if (EQ (coding->src_object, coding->dst_object))
6639         {
6640           coding_set_source (coding);
6641           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6642         }
6643
6644       while (buf < buf_end)
6645         {
6646           int c = *buf;
6647           ptrdiff_t i;
6648
6649           if (c >= 0)
6650             {
6651               ptrdiff_t from_nchars = 1, to_nchars = 1;
6652               Lisp_Object trans = Qnil;
6653
6654               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6655               if (! NILP (trans))
6656                 {
6657                   trans = get_translation (trans, buf, buf_end);
6658                   if (INTEGERP (trans))
6659                     c = XINT (trans);
6660                   else if (CONSP (trans))
6661                     {
6662                       from_nchars = ASIZE (XCAR (trans));
6663                       trans = XCDR (trans);
6664                       if (INTEGERP (trans))
6665                         c = XINT (trans);
6666                       else
6667                         {
6668                           to_nchars = ASIZE (trans);
6669                           c = XINT (AREF (trans, 0));
6670                         }
6671                     }
6672                   else if (EQ (trans, Qt) && ! last_block)
6673                     break;
6674                 }
6675
6676               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6677                 {
6678                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6679                        / MAX_MULTIBYTE_LENGTH)
6680                       < to_nchars)
6681                     memory_full (SIZE_MAX);
6682                   dst = alloc_destination (coding,
6683                                            buf_end - buf
6684                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6685                                            dst);
6686                   if (EQ (coding->src_object, coding->dst_object))
6687                     {
6688                       coding_set_source (coding);
6689                       dst_end = (((unsigned char *) coding->source)
6690                                  + coding->consumed);
6691                     }
6692                   else
6693                     dst_end = coding->destination + coding->dst_bytes;
6694                 }
6695
6696               for (i = 0; i < to_nchars; i++)
6697                 {
6698                   if (i > 0)
6699                     c = XINT (AREF (trans, i));
6700                   if (coding->dst_multibyte
6701                       || ! CHAR_BYTE8_P (c))
6702                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6703                   else
6704                     *dst++ = CHAR_TO_BYTE8 (c);
6705                 }
6706               produced_chars += to_nchars;
6707               buf += from_nchars;
6708             }
6709           else
6710             /* This is an annotation datum.  (-C) is the length.  */
6711             buf += -c;
6712         }
6713       carryover = buf_end - buf;
6714     }
6715   else
6716     {
6717       /* Source characters are at coding->source.  */
6718       const unsigned char *src = coding->source;
6719       const unsigned char *src_end = src + coding->consumed;
6720
6721       if (EQ (coding->dst_object, coding->src_object))
6722         dst_end = (unsigned char *) src;
6723       if (coding->src_multibyte != coding->dst_multibyte)
6724         {
6725           if (coding->src_multibyte)
6726             {
6727               bool multibytep = 1;
6728               ptrdiff_t consumed_chars = 0;
6729
6730               while (1)
6731                 {
6732                   const unsigned char *src_base = src;
6733                   int c;
6734
6735                   ONE_MORE_BYTE (c);
6736                   if (dst == dst_end)
6737                     {
6738                       if (EQ (coding->src_object, coding->dst_object))
6739                         dst_end = (unsigned char *) src;
6740                       if (dst == dst_end)
6741                         {
6742                           ptrdiff_t offset = src - coding->source;
6743
6744                           dst = alloc_destination (coding, src_end - src + 1,
6745                                                    dst);
6746                           dst_end = coding->destination + coding->dst_bytes;
6747                           coding_set_source (coding);
6748                           src = coding->source + offset;
6749                           src_end = coding->source + coding->consumed;
6750                           if (EQ (coding->src_object, coding->dst_object))
6751                             dst_end = (unsigned char *) src;
6752                         }
6753                     }
6754                   *dst++ = c;
6755                   produced_chars++;
6756                 }
6757             no_more_source:
6758               ;
6759             }
6760           else
6761             while (src < src_end)
6762               {
6763                 bool multibytep = 1;
6764                 int c = *src++;
6765
6766                 if (dst >= dst_end - 1)
6767                   {
6768                     if (EQ (coding->src_object, coding->dst_object))
6769                       dst_end = (unsigned char *) src;
6770                     if (dst >= dst_end - 1)
6771                       {
6772                         ptrdiff_t offset = src - coding->source;
6773                         ptrdiff_t more_bytes;
6774
6775                         if (EQ (coding->src_object, coding->dst_object))
6776                           more_bytes = ((src_end - src) / 2) + 2;
6777                         else
6778                           more_bytes = src_end - src + 2;
6779                         dst = alloc_destination (coding, more_bytes, dst);
6780                         dst_end = coding->destination + coding->dst_bytes;
6781                         coding_set_source (coding);
6782                         src = coding->source + offset;
6783                         src_end = coding->source + coding->consumed;
6784                         if (EQ (coding->src_object, coding->dst_object))
6785                           dst_end = (unsigned char *) src;
6786                       }
6787                   }
6788                 EMIT_ONE_BYTE (c);
6789               }
6790         }
6791       else
6792         {
6793           if (!EQ (coding->src_object, coding->dst_object))
6794             {
6795               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6796
6797               if (require > 0)
6798                 {
6799                   ptrdiff_t offset = src - coding->source;
6800
6801                   dst = alloc_destination (coding, require, dst);
6802                   coding_set_source (coding);
6803                   src = coding->source + offset;
6804                   src_end = coding->source + coding->consumed;
6805                 }
6806             }
6807           produced_chars = coding->consumed_char;
6808           while (src < src_end)
6809             *dst++ = *src++;
6810         }
6811     }
6812
6813   produced = dst - (coding->destination + coding->produced);
6814   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6815     insert_from_gap (produced_chars, produced);
6816   coding->produced += produced;
6817   coding->produced_char += produced_chars;
6818   return carryover;
6819 }
6820
6821 /* Compose text in CODING->object according to the annotation data at
6822    CHARBUF.  CHARBUF is an array:
6823      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6824  */
6825
6826 static void
6827 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6828 {
6829   int len;
6830   ptrdiff_t to;
6831   enum composition_method method;
6832   Lisp_Object components;
6833
6834   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6835   to = pos + charbuf[2];
6836   method = (enum composition_method) (charbuf[4]);
6837
6838   if (method == COMPOSITION_RELATIVE)
6839     components = Qnil;
6840   else
6841     {
6842       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6843       int i, j;
6844
6845       if (method == COMPOSITION_WITH_RULE)
6846         len = charbuf[2] * 3 - 2;
6847       charbuf += MAX_ANNOTATION_LENGTH;
6848       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6849       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6850         {
6851           if (charbuf[i] >= 0)
6852             args[j] = make_number (charbuf[i]);
6853           else
6854             {
6855               i++;
6856               args[j] = make_number (charbuf[i] % 0x100);
6857             }
6858         }
6859       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6860     }
6861   compose_text (pos, to, components, Qnil, coding->dst_object);
6862 }
6863
6864
6865 /* Put `charset' property on text in CODING->object according to
6866    the annotation data at CHARBUF.  CHARBUF is an array:
6867      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6868  */
6869
6870 static void
6871 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6872 {
6873   ptrdiff_t from = pos - charbuf[2];
6874   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6875
6876   Fput_text_property (make_number (from), make_number (pos),
6877                       Qcharset, CHARSET_NAME (charset),
6878                       coding->dst_object);
6879 }
6880
6881
6882 #define CHARBUF_SIZE 0x4000
6883
6884 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6885   do {                                                                  \
6886     int size = CHARBUF_SIZE;                                            \
6887                                                                         \
6888     coding->charbuf = NULL;                                             \
6889     while (size > 1024)                                                 \
6890       {                                                                 \
6891         coding->charbuf = alloca (sizeof (int) * size);                 \
6892         if (coding->charbuf)                                            \
6893           break;                                                        \
6894         size >>= 1;                                                     \
6895       }                                                                 \
6896     if (! coding->charbuf)                                              \
6897       {                                                                 \
6898         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6899         return;                                                         \
6900       }                                                                 \
6901     coding->charbuf_size = size;                                        \
6902   } while (0)
6903
6904
6905 static void
6906 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6907 {
6908   int *charbuf = coding->charbuf;
6909   int *charbuf_end = charbuf + coding->charbuf_used;
6910
6911   if (NILP (coding->dst_object))
6912     return;
6913
6914   while (charbuf < charbuf_end)
6915     {
6916       if (*charbuf >= 0)
6917         pos++, charbuf++;
6918       else
6919         {
6920           int len = -*charbuf;
6921
6922           if (len > 2)
6923             switch (charbuf[1])
6924               {
6925               case CODING_ANNOTATE_COMPOSITION_MASK:
6926                 produce_composition (coding, charbuf, pos);
6927                 break;
6928               case CODING_ANNOTATE_CHARSET_MASK:
6929                 produce_charset (coding, charbuf, pos);
6930                 break;
6931               }
6932           charbuf += len;
6933         }
6934     }
6935 }
6936
6937 /* Decode the data at CODING->src_object into CODING->dst_object.
6938    CODING->src_object is a buffer, a string, or nil.
6939    CODING->dst_object is a buffer.
6940
6941    If CODING->src_object is a buffer, it must be the current buffer.
6942    In this case, if CODING->src_pos is positive, it is a position of
6943    the source text in the buffer, otherwise, the source text is in the
6944    gap area of the buffer, and CODING->src_pos specifies the offset of
6945    the text from GPT (which must be the same as PT).  If this is the
6946    same buffer as CODING->dst_object, CODING->src_pos must be
6947    negative.
6948
6949    If CODING->src_object is a string, CODING->src_pos is an index to
6950    that string.
6951
6952    If CODING->src_object is nil, CODING->source must already point to
6953    the non-relocatable memory area.  In this case, CODING->src_pos is
6954    an offset from CODING->source.
6955
6956    The decoded data is inserted at the current point of the buffer
6957    CODING->dst_object.
6958 */
6959
6960 static void
6961 decode_coding (struct coding_system *coding)
6962 {
6963   Lisp_Object attrs;
6964   Lisp_Object undo_list;
6965   Lisp_Object translation_table;
6966   struct ccl_spec cclspec;
6967   int carryover;
6968   int i;
6969
6970   if (BUFFERP (coding->src_object)
6971       && coding->src_pos > 0
6972       && coding->src_pos < GPT
6973       && coding->src_pos + coding->src_chars > GPT)
6974     move_gap_both (coding->src_pos, coding->src_pos_byte);
6975
6976   undo_list = Qt;
6977   if (BUFFERP (coding->dst_object))
6978     {
6979       set_buffer_internal (XBUFFER (coding->dst_object));
6980       if (GPT != PT)
6981         move_gap_both (PT, PT_BYTE);
6982
6983       /* We must disable undo_list in order to record the whole insert
6984          transaction via record_insert at the end.  But doing so also
6985          disables the recording of the first change to the undo_list.
6986          Therefore we check for first change here and record it via
6987          record_first_change if needed.  */
6988       if (MODIFF <= SAVE_MODIFF)
6989         record_first_change ();
6990
6991       undo_list = BVAR (current_buffer, undo_list);
6992       bset_undo_list (current_buffer, Qt);
6993     }
6994
6995   coding->consumed = coding->consumed_char = 0;
6996   coding->produced = coding->produced_char = 0;
6997   coding->chars_at_source = 0;
6998   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6999   coding->errors = 0;
7000
7001   ALLOC_CONVERSION_WORK_AREA (coding);
7002
7003   attrs = CODING_ID_ATTRS (coding->id);
7004   translation_table = get_translation_table (attrs, 0, NULL);
7005
7006   carryover = 0;
7007   if (coding->decoder == decode_coding_ccl)
7008     {
7009       coding->spec.ccl = &cclspec;
7010       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7011     }
7012   do
7013     {
7014       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7015
7016       coding_set_source (coding);
7017       coding->annotated = 0;
7018       coding->charbuf_used = carryover;
7019       (*(coding->decoder)) (coding);
7020       coding_set_destination (coding);
7021       carryover = produce_chars (coding, translation_table, 0);
7022       if (coding->annotated)
7023         produce_annotation (coding, pos);
7024       for (i = 0; i < carryover; i++)
7025         coding->charbuf[i]
7026           = coding->charbuf[coding->charbuf_used - carryover + i];
7027     }
7028   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7029          || (coding->consumed < coding->src_bytes
7030              && (coding->result == CODING_RESULT_SUCCESS
7031                  || coding->result == CODING_RESULT_INVALID_SRC)));
7032
7033   if (carryover > 0)
7034     {
7035       coding_set_destination (coding);
7036       coding->charbuf_used = carryover;
7037       produce_chars (coding, translation_table, 1);
7038     }
7039
7040   coding->carryover_bytes = 0;
7041   if (coding->consumed < coding->src_bytes)
7042     {
7043       int nbytes = coding->src_bytes - coding->consumed;
7044       const unsigned char *src;
7045
7046       coding_set_source (coding);
7047       coding_set_destination (coding);
7048       src = coding->source + coding->consumed;
7049
7050       if (coding->mode & CODING_MODE_LAST_BLOCK)
7051         {
7052           /* Flush out unprocessed data as binary chars.  We are sure
7053              that the number of data is less than the size of
7054              coding->charbuf.  */
7055           coding->charbuf_used = 0;
7056           coding->chars_at_source = 0;
7057
7058           while (nbytes-- > 0)
7059             {
7060               int c = *src++;
7061
7062               if (c & 0x80)
7063                 c = BYTE8_TO_CHAR (c);
7064               coding->charbuf[coding->charbuf_used++] = c;
7065             }
7066           produce_chars (coding, Qnil, 1);
7067         }
7068       else
7069         {
7070           /* Record unprocessed bytes in coding->carryover.  We are
7071              sure that the number of data is less than the size of
7072              coding->carryover.  */
7073           unsigned char *p = coding->carryover;
7074
7075           if (nbytes > sizeof coding->carryover)
7076             nbytes = sizeof coding->carryover;
7077           coding->carryover_bytes = nbytes;
7078           while (nbytes-- > 0)
7079             *p++ = *src++;
7080         }
7081       coding->consumed = coding->src_bytes;
7082     }
7083
7084   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7085       && !inhibit_eol_conversion)
7086     decode_eol (coding);
7087   if (BUFFERP (coding->dst_object))
7088     {
7089       bset_undo_list (current_buffer, undo_list);
7090       record_insert (coding->dst_pos, coding->produced_char);
7091     }
7092 }
7093
7094
7095 /* Extract an annotation datum from a composition starting at POS and
7096    ending before LIMIT of CODING->src_object (buffer or string), store
7097    the data in BUF, set *STOP to a starting position of the next
7098    composition (if any) or to LIMIT, and return the address of the
7099    next element of BUF.
7100
7101    If such an annotation is not found, set *STOP to a starting
7102    position of a composition after POS (if any) or to LIMIT, and
7103    return BUF.  */
7104
7105 static int *
7106 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7107                                struct coding_system *coding, int *buf,
7108                                ptrdiff_t *stop)
7109 {
7110   ptrdiff_t start, end;
7111   Lisp_Object prop;
7112
7113   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7114       || end > limit)
7115     *stop = limit;
7116   else if (start > pos)
7117     *stop = start;
7118   else
7119     {
7120       if (start == pos)
7121         {
7122           /* We found a composition.  Store the corresponding
7123              annotation data in BUF.  */
7124           int *head = buf;
7125           enum composition_method method = COMPOSITION_METHOD (prop);
7126           int nchars = COMPOSITION_LENGTH (prop);
7127
7128           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7129           if (method != COMPOSITION_RELATIVE)
7130             {
7131               Lisp_Object components;
7132               ptrdiff_t i, len, i_byte;
7133
7134               components = COMPOSITION_COMPONENTS (prop);
7135               if (VECTORP (components))
7136                 {
7137                   len = ASIZE (components);
7138                   for (i = 0; i < len; i++)
7139                     *buf++ = XINT (AREF (components, i));
7140                 }
7141               else if (STRINGP (components))
7142                 {
7143                   len = SCHARS (components);
7144                   i = i_byte = 0;
7145                   while (i < len)
7146                     {
7147                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7148                       buf++;
7149                     }
7150                 }
7151               else if (INTEGERP (components))
7152                 {
7153                   len = 1;
7154                   *buf++ = XINT (components);
7155                 }
7156               else if (CONSP (components))
7157                 {
7158                   for (len = 0; CONSP (components);
7159                        len++, components = XCDR (components))
7160                     *buf++ = XINT (XCAR (components));
7161                 }
7162               else
7163                 emacs_abort ();
7164               *head -= len;
7165             }
7166         }
7167
7168       if (find_composition (end, limit, &start, &end, &prop,
7169                             coding->src_object)
7170           && end <= limit)
7171         *stop = start;
7172       else
7173         *stop = limit;
7174     }
7175   return buf;
7176 }
7177
7178
7179 /* Extract an annotation datum from a text property `charset' at POS of
7180    CODING->src_object (buffer of string), store the data in BUF, set
7181    *STOP to the position where the value of `charset' property changes
7182    (limiting by LIMIT), and return the address of the next element of
7183    BUF.
7184
7185    If the property value is nil, set *STOP to the position where the
7186    property value is non-nil (limiting by LIMIT), and return BUF.  */
7187
7188 static int *
7189 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7190                            struct coding_system *coding, int *buf,
7191                            ptrdiff_t *stop)
7192 {
7193   Lisp_Object val, next;
7194   int id;
7195
7196   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7197   if (! NILP (val) && CHARSETP (val))
7198     id = XINT (CHARSET_SYMBOL_ID (val));
7199   else
7200     id = -1;
7201   ADD_CHARSET_DATA (buf, 0, id);
7202   next = Fnext_single_property_change (make_number (pos), Qcharset,
7203                                        coding->src_object,
7204                                        make_number (limit));
7205   *stop = XINT (next);
7206   return buf;
7207 }
7208
7209
7210 static void
7211 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7212                int max_lookup)
7213 {
7214   int *buf = coding->charbuf;
7215   int *buf_end = coding->charbuf + coding->charbuf_size;
7216   const unsigned char *src = coding->source + coding->consumed;
7217   const unsigned char *src_end = coding->source + coding->src_bytes;
7218   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7219   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7220   bool multibytep = coding->src_multibyte;
7221   Lisp_Object eol_type;
7222   int c;
7223   ptrdiff_t stop, stop_composition, stop_charset;
7224   int *lookup_buf = NULL;
7225
7226   if (! NILP (translation_table))
7227     lookup_buf = alloca (sizeof (int) * max_lookup);
7228
7229   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7230   if (VECTORP (eol_type))
7231     eol_type = Qunix;
7232
7233   /* Note: composition handling is not yet implemented.  */
7234   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7235
7236   if (NILP (coding->src_object))
7237     stop = stop_composition = stop_charset = end_pos;
7238   else
7239     {
7240       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7241         stop = stop_composition = pos;
7242       else
7243         stop = stop_composition = end_pos;
7244       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7245         stop = stop_charset = pos;
7246       else
7247         stop_charset = end_pos;
7248     }
7249
7250   /* Compensate for CRLF and conversion.  */
7251   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7252   while (buf < buf_end)
7253     {
7254       Lisp_Object trans;
7255
7256       if (pos == stop)
7257         {
7258           if (pos == end_pos)
7259             break;
7260           if (pos == stop_composition)
7261             buf = handle_composition_annotation (pos, end_pos, coding,
7262                                                  buf, &stop_composition);
7263           if (pos == stop_charset)
7264             buf = handle_charset_annotation (pos, end_pos, coding,
7265                                              buf, &stop_charset);
7266           stop = (stop_composition < stop_charset
7267                   ? stop_composition : stop_charset);
7268         }
7269
7270       if (! multibytep)
7271         {
7272           int bytes;
7273
7274           if (coding->encoder == encode_coding_raw_text
7275               || coding->encoder == encode_coding_ccl)
7276             c = *src++, pos++;
7277           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7278             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7279           else
7280             c = BYTE8_TO_CHAR (*src), src++, pos++;
7281         }
7282       else
7283         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7284       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7285         c = '\n';
7286       if (! EQ (eol_type, Qunix))
7287         {
7288           if (c == '\n')
7289             {
7290               if (EQ (eol_type, Qdos))
7291                 *buf++ = '\r';
7292               else
7293                 c = '\r';
7294             }
7295         }
7296
7297       trans = Qnil;
7298       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7299       if (NILP (trans))
7300         *buf++ = c;
7301       else
7302         {
7303           ptrdiff_t from_nchars = 1, to_nchars = 1;
7304           int *lookup_buf_end;
7305           const unsigned char *p = src;
7306           int i;
7307
7308           lookup_buf[0] = c;
7309           for (i = 1; i < max_lookup && p < src_end; i++)
7310             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7311           lookup_buf_end = lookup_buf + i;
7312           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7313           if (INTEGERP (trans))
7314             c = XINT (trans);
7315           else if (CONSP (trans))
7316             {
7317               from_nchars = ASIZE (XCAR (trans));
7318               trans = XCDR (trans);
7319               if (INTEGERP (trans))
7320                 c = XINT (trans);
7321               else
7322                 {
7323                   to_nchars = ASIZE (trans);
7324                   if (buf_end - buf < to_nchars)
7325                     break;
7326                   c = XINT (AREF (trans, 0));
7327                 }
7328             }
7329           else
7330             break;
7331           *buf++ = c;
7332           for (i = 1; i < to_nchars; i++)
7333             *buf++ = XINT (AREF (trans, i));
7334           for (i = 1; i < from_nchars; i++, pos++)
7335             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7336         }
7337     }
7338
7339   coding->consumed = src - coding->source;
7340   coding->consumed_char = pos - coding->src_pos;
7341   coding->charbuf_used = buf - coding->charbuf;
7342   coding->chars_at_source = 0;
7343 }
7344
7345
7346 /* Encode the text at CODING->src_object into CODING->dst_object.
7347    CODING->src_object is a buffer or a string.
7348    CODING->dst_object is a buffer or nil.
7349
7350    If CODING->src_object is a buffer, it must be the current buffer.
7351    In this case, if CODING->src_pos is positive, it is a position of
7352    the source text in the buffer, otherwise. the source text is in the
7353    gap area of the buffer, and coding->src_pos specifies the offset of
7354    the text from GPT (which must be the same as PT).  If this is the
7355    same buffer as CODING->dst_object, CODING->src_pos must be
7356    negative and CODING should not have `pre-write-conversion'.
7357
7358    If CODING->src_object is a string, CODING should not have
7359    `pre-write-conversion'.
7360
7361    If CODING->dst_object is a buffer, the encoded data is inserted at
7362    the current point of that buffer.
7363
7364    If CODING->dst_object is nil, the encoded data is placed at the
7365    memory area specified by CODING->destination.  */
7366
7367 static void
7368 encode_coding (struct coding_system *coding)
7369 {
7370   Lisp_Object attrs;
7371   Lisp_Object translation_table;
7372   int max_lookup;
7373   struct ccl_spec cclspec;
7374
7375   attrs = CODING_ID_ATTRS (coding->id);
7376   if (coding->encoder == encode_coding_raw_text)
7377     translation_table = Qnil, max_lookup = 0;
7378   else
7379     translation_table = get_translation_table (attrs, 1, &max_lookup);
7380
7381   if (BUFFERP (coding->dst_object))
7382     {
7383       set_buffer_internal (XBUFFER (coding->dst_object));
7384       coding->dst_multibyte
7385         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7386     }
7387
7388   coding->consumed = coding->consumed_char = 0;
7389   coding->produced = coding->produced_char = 0;
7390   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7391   coding->errors = 0;
7392
7393   ALLOC_CONVERSION_WORK_AREA (coding);
7394
7395   if (coding->encoder == encode_coding_ccl)
7396     {
7397       coding->spec.ccl = &cclspec;
7398       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7399     }
7400   do {
7401     coding_set_source (coding);
7402     consume_chars (coding, translation_table, max_lookup);
7403     coding_set_destination (coding);
7404     (*(coding->encoder)) (coding);
7405   } while (coding->consumed_char < coding->src_chars);
7406
7407   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7408     insert_from_gap (coding->produced_char, coding->produced);
7409 }
7410
7411
7412 /* Name (or base name) of work buffer for code conversion.  */
7413 static Lisp_Object Vcode_conversion_workbuf_name;
7414
7415 /* A working buffer used by the top level conversion.  Once it is
7416    created, it is never destroyed.  It has the name
7417    Vcode_conversion_workbuf_name.  The other working buffers are
7418    destroyed after the use is finished, and their names are modified
7419    versions of Vcode_conversion_workbuf_name.  */
7420 static Lisp_Object Vcode_conversion_reused_workbuf;
7421
7422 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7423 static bool reused_workbuf_in_use;
7424
7425
7426 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7427    multibyteness of returning buffer.  */
7428
7429 static Lisp_Object
7430 make_conversion_work_buffer (bool multibyte)
7431 {
7432   Lisp_Object name, workbuf;
7433   struct buffer *current;
7434
7435   if (reused_workbuf_in_use)
7436     {
7437       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7438       workbuf = Fget_buffer_create (name);
7439     }
7440   else
7441     {
7442       reused_workbuf_in_use = 1;
7443       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7444         Vcode_conversion_reused_workbuf
7445           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7446       workbuf = Vcode_conversion_reused_workbuf;
7447     }
7448   current = current_buffer;
7449   set_buffer_internal (XBUFFER (workbuf));
7450   /* We can't allow modification hooks to run in the work buffer.  For
7451      instance, directory_files_internal assumes that file decoding
7452      doesn't compile new regexps.  */
7453   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7454   Ferase_buffer ();
7455   bset_undo_list (current_buffer, Qt);
7456   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7457   set_buffer_internal (current);
7458   return workbuf;
7459 }
7460
7461
7462 static Lisp_Object
7463 code_conversion_restore (Lisp_Object arg)
7464 {
7465   Lisp_Object current, workbuf;
7466   struct gcpro gcpro1;
7467
7468   GCPRO1 (arg);
7469   current = XCAR (arg);
7470   workbuf = XCDR (arg);
7471   if (! NILP (workbuf))
7472     {
7473       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7474         reused_workbuf_in_use = 0;
7475       else
7476         Fkill_buffer (workbuf);
7477     }
7478   set_buffer_internal (XBUFFER (current));
7479   UNGCPRO;
7480   return Qnil;
7481 }
7482
7483 Lisp_Object
7484 code_conversion_save (bool with_work_buf, bool multibyte)
7485 {
7486   Lisp_Object workbuf = Qnil;
7487
7488   if (with_work_buf)
7489     workbuf = make_conversion_work_buffer (multibyte);
7490   record_unwind_protect (code_conversion_restore,
7491                          Fcons (Fcurrent_buffer (), workbuf));
7492   return workbuf;
7493 }
7494
7495 void
7496 decode_coding_gap (struct coding_system *coding,
7497                    ptrdiff_t chars, ptrdiff_t bytes)
7498 {
7499   ptrdiff_t count = SPECPDL_INDEX ();
7500   Lisp_Object attrs;
7501
7502   code_conversion_save (0, 0);
7503
7504   coding->src_object = Fcurrent_buffer ();
7505   coding->src_chars = chars;
7506   coding->src_bytes = bytes;
7507   coding->src_pos = -chars;
7508   coding->src_pos_byte = -bytes;
7509   coding->src_multibyte = chars < bytes;
7510   coding->dst_object = coding->src_object;
7511   coding->dst_pos = PT;
7512   coding->dst_pos_byte = PT_BYTE;
7513   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7514
7515   if (CODING_REQUIRE_DETECTION (coding))
7516     detect_coding (coding);
7517
7518   coding->mode |= CODING_MODE_LAST_BLOCK;
7519   current_buffer->text->inhibit_shrinking = 1;
7520   decode_coding (coding);
7521   current_buffer->text->inhibit_shrinking = 0;
7522
7523   attrs = CODING_ID_ATTRS (coding->id);
7524   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7525     {
7526       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7527       Lisp_Object val;
7528
7529       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7530       val = call1 (CODING_ATTR_POST_READ (attrs),
7531                    make_number (coding->produced_char));
7532       CHECK_NATNUM (val);
7533       coding->produced_char += Z - prev_Z;
7534       coding->produced += Z_BYTE - prev_Z_BYTE;
7535     }
7536
7537   unbind_to (count, Qnil);
7538 }
7539
7540
7541 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7542    SRC_OBJECT into DST_OBJECT by coding context CODING.
7543
7544    SRC_OBJECT is a buffer, a string, or Qnil.
7545
7546    If it is a buffer, the text is at point of the buffer.  FROM and TO
7547    are positions in the buffer.
7548
7549    If it is a string, the text is at the beginning of the string.
7550    FROM and TO are indices to the string.
7551
7552    If it is nil, the text is at coding->source.  FROM and TO are
7553    indices to coding->source.
7554
7555    DST_OBJECT is a buffer, Qt, or Qnil.
7556
7557    If it is a buffer, the decoded text is inserted at point of the
7558    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7559    is deleted.
7560
7561    If it is Qt, a string is made from the decoded text, and
7562    set in CODING->dst_object.
7563
7564    If it is Qnil, the decoded text is stored at CODING->destination.
7565    The caller must allocate CODING->dst_bytes bytes at
7566    CODING->destination by xmalloc.  If the decoded text is longer than
7567    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7568  */
7569
7570 void
7571 decode_coding_object (struct coding_system *coding,
7572                       Lisp_Object src_object,
7573                       ptrdiff_t from, ptrdiff_t from_byte,
7574                       ptrdiff_t to, ptrdiff_t to_byte,
7575                       Lisp_Object dst_object)
7576 {
7577   ptrdiff_t count = SPECPDL_INDEX ();
7578   unsigned char *destination IF_LINT (= NULL);
7579   ptrdiff_t dst_bytes IF_LINT (= 0);
7580   ptrdiff_t chars = to - from;
7581   ptrdiff_t bytes = to_byte - from_byte;
7582   Lisp_Object attrs;
7583   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7584   bool need_marker_adjustment = 0;
7585   Lisp_Object old_deactivate_mark;
7586
7587   old_deactivate_mark = Vdeactivate_mark;
7588
7589   if (NILP (dst_object))
7590     {
7591       destination = coding->destination;
7592       dst_bytes = coding->dst_bytes;
7593     }
7594
7595   coding->src_object = src_object;
7596   coding->src_chars = chars;
7597   coding->src_bytes = bytes;
7598   coding->src_multibyte = chars < bytes;
7599
7600   if (STRINGP (src_object))
7601     {
7602       coding->src_pos = from;
7603       coding->src_pos_byte = from_byte;
7604     }
7605   else if (BUFFERP (src_object))
7606     {
7607       set_buffer_internal (XBUFFER (src_object));
7608       if (from != GPT)
7609         move_gap_both (from, from_byte);
7610       if (EQ (src_object, dst_object))
7611         {
7612           struct Lisp_Marker *tail;
7613
7614           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7615             {
7616               tail->need_adjustment
7617                 = tail->charpos == (tail->insertion_type ? from : to);
7618               need_marker_adjustment |= tail->need_adjustment;
7619             }
7620           saved_pt = PT, saved_pt_byte = PT_BYTE;
7621           TEMP_SET_PT_BOTH (from, from_byte);
7622           current_buffer->text->inhibit_shrinking = 1;
7623           del_range_both (from, from_byte, to, to_byte, 1);
7624           coding->src_pos = -chars;
7625           coding->src_pos_byte = -bytes;
7626         }
7627       else
7628         {
7629           coding->src_pos = from;
7630           coding->src_pos_byte = from_byte;
7631         }
7632     }
7633
7634   if (CODING_REQUIRE_DETECTION (coding))
7635     detect_coding (coding);
7636   attrs = CODING_ID_ATTRS (coding->id);
7637
7638   if (EQ (dst_object, Qt)
7639       || (! NILP (CODING_ATTR_POST_READ (attrs))
7640           && NILP (dst_object)))
7641     {
7642       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7643       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7644       coding->dst_pos = BEG;
7645       coding->dst_pos_byte = BEG_BYTE;
7646     }
7647   else if (BUFFERP (dst_object))
7648     {
7649       code_conversion_save (0, 0);
7650       coding->dst_object = dst_object;
7651       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7652       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7653       coding->dst_multibyte
7654         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7655     }
7656   else
7657     {
7658       code_conversion_save (0, 0);
7659       coding->dst_object = Qnil;
7660       /* Most callers presume this will return a multibyte result, and they
7661          won't use `binary' or `raw-text' anyway, so let's not worry about
7662          CODING_FOR_UNIBYTE.  */
7663       coding->dst_multibyte = 1;
7664     }
7665
7666   decode_coding (coding);
7667
7668   if (BUFFERP (coding->dst_object))
7669     set_buffer_internal (XBUFFER (coding->dst_object));
7670
7671   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7672     {
7673       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7674       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7675       Lisp_Object val;
7676
7677       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7678       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7679               old_deactivate_mark);
7680       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7681                         make_number (coding->produced_char));
7682       UNGCPRO;
7683       CHECK_NATNUM (val);
7684       coding->produced_char += Z - prev_Z;
7685       coding->produced += Z_BYTE - prev_Z_BYTE;
7686     }
7687
7688   if (EQ (dst_object, Qt))
7689     {
7690       coding->dst_object = Fbuffer_string ();
7691     }
7692   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7693     {
7694       set_buffer_internal (XBUFFER (coding->dst_object));
7695       if (dst_bytes < coding->produced)
7696         {
7697           destination = xrealloc (destination, coding->produced);
7698           if (! destination)
7699             {
7700               record_conversion_result (coding,
7701                                         CODING_RESULT_INSUFFICIENT_MEM);
7702               unbind_to (count, Qnil);
7703               return;
7704             }
7705           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7706             move_gap_both (BEGV, BEGV_BYTE);
7707           memcpy (destination, BEGV_ADDR, coding->produced);
7708           coding->destination = destination;
7709         }
7710     }
7711
7712   if (saved_pt >= 0)
7713     {
7714       /* This is the case of:
7715          (BUFFERP (src_object) && EQ (src_object, dst_object))
7716          As we have moved PT while replacing the original buffer
7717          contents, we must recover it now.  */
7718       set_buffer_internal (XBUFFER (src_object));
7719       current_buffer->text->inhibit_shrinking = 0;
7720       if (saved_pt < from)
7721         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7722       else if (saved_pt < from + chars)
7723         TEMP_SET_PT_BOTH (from, from_byte);
7724       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7725         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7726                           saved_pt_byte + (coding->produced - bytes));
7727       else
7728         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7729                           saved_pt_byte + (coding->produced - bytes));
7730
7731       if (need_marker_adjustment)
7732         {
7733           struct Lisp_Marker *tail;
7734
7735           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7736             if (tail->need_adjustment)
7737               {
7738                 tail->need_adjustment = 0;
7739                 if (tail->insertion_type)
7740                   {
7741                     tail->bytepos = from_byte;
7742                     tail->charpos = from;
7743                   }
7744                 else
7745                   {
7746                     tail->bytepos = from_byte + coding->produced;
7747                     tail->charpos
7748                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7749                          ? tail->bytepos : from + coding->produced_char);
7750                   }
7751               }
7752         }
7753     }
7754
7755   Vdeactivate_mark = old_deactivate_mark;
7756   unbind_to (count, coding->dst_object);
7757 }
7758
7759
7760 void
7761 encode_coding_object (struct coding_system *coding,
7762                       Lisp_Object src_object,
7763                       ptrdiff_t from, ptrdiff_t from_byte,
7764                       ptrdiff_t to, ptrdiff_t to_byte,
7765                       Lisp_Object dst_object)
7766 {
7767   ptrdiff_t count = SPECPDL_INDEX ();
7768   ptrdiff_t chars = to - from;
7769   ptrdiff_t bytes = to_byte - from_byte;
7770   Lisp_Object attrs;
7771   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7772   bool need_marker_adjustment = 0;
7773   bool kill_src_buffer = 0;
7774   Lisp_Object old_deactivate_mark;
7775
7776   old_deactivate_mark = Vdeactivate_mark;
7777
7778   coding->src_object = src_object;
7779   coding->src_chars = chars;
7780   coding->src_bytes = bytes;
7781   coding->src_multibyte = chars < bytes;
7782
7783   attrs = CODING_ID_ATTRS (coding->id);
7784
7785   if (EQ (src_object, dst_object))
7786     {
7787       struct Lisp_Marker *tail;
7788
7789       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7790         {
7791           tail->need_adjustment
7792             = tail->charpos == (tail->insertion_type ? from : to);
7793           need_marker_adjustment |= tail->need_adjustment;
7794         }
7795     }
7796
7797   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7798     {
7799       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7800       set_buffer_internal (XBUFFER (coding->src_object));
7801       if (STRINGP (src_object))
7802         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7803       else if (BUFFERP (src_object))
7804         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7805       else
7806         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7807
7808       if (EQ (src_object, dst_object))
7809         {
7810           set_buffer_internal (XBUFFER (src_object));
7811           saved_pt = PT, saved_pt_byte = PT_BYTE;
7812           del_range_both (from, from_byte, to, to_byte, 1);
7813           set_buffer_internal (XBUFFER (coding->src_object));
7814         }
7815
7816       {
7817         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7818
7819         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7820                 old_deactivate_mark);
7821         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7822                     make_number (BEG), make_number (Z));
7823         UNGCPRO;
7824       }
7825       if (XBUFFER (coding->src_object) != current_buffer)
7826         kill_src_buffer = 1;
7827       coding->src_object = Fcurrent_buffer ();
7828       if (BEG != GPT)
7829         move_gap_both (BEG, BEG_BYTE);
7830       coding->src_chars = Z - BEG;
7831       coding->src_bytes = Z_BYTE - BEG_BYTE;
7832       coding->src_pos = BEG;
7833       coding->src_pos_byte = BEG_BYTE;
7834       coding->src_multibyte = Z < Z_BYTE;
7835     }
7836   else if (STRINGP (src_object))
7837     {
7838       code_conversion_save (0, 0);
7839       coding->src_pos = from;
7840       coding->src_pos_byte = from_byte;
7841     }
7842   else if (BUFFERP (src_object))
7843     {
7844       code_conversion_save (0, 0);
7845       set_buffer_internal (XBUFFER (src_object));
7846       if (EQ (src_object, dst_object))
7847         {
7848           saved_pt = PT, saved_pt_byte = PT_BYTE;
7849           coding->src_object = del_range_1 (from, to, 1, 1);
7850           coding->src_pos = 0;
7851           coding->src_pos_byte = 0;
7852         }
7853       else
7854         {
7855           if (from < GPT && to >= GPT)
7856             move_gap_both (from, from_byte);
7857           coding->src_pos = from;
7858           coding->src_pos_byte = from_byte;
7859         }
7860     }
7861   else
7862     code_conversion_save (0, 0);
7863
7864   if (BUFFERP (dst_object))
7865     {
7866       coding->dst_object = dst_object;
7867       if (EQ (src_object, dst_object))
7868         {
7869           coding->dst_pos = from;
7870           coding->dst_pos_byte = from_byte;
7871         }
7872       else
7873         {
7874           struct buffer *current = current_buffer;
7875
7876           set_buffer_temp (XBUFFER (dst_object));
7877           coding->dst_pos = PT;
7878           coding->dst_pos_byte = PT_BYTE;
7879           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7880           set_buffer_temp (current);
7881         }
7882       coding->dst_multibyte
7883         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7884     }
7885   else if (EQ (dst_object, Qt))
7886     {
7887       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7888       coding->dst_object = Qnil;
7889       coding->destination = xmalloc (dst_bytes);
7890       coding->dst_bytes = dst_bytes;
7891       coding->dst_multibyte = 0;
7892     }
7893   else
7894     {
7895       coding->dst_object = Qnil;
7896       coding->dst_multibyte = 0;
7897     }
7898
7899   encode_coding (coding);
7900
7901   if (EQ (dst_object, Qt))
7902     {
7903       if (BUFFERP (coding->dst_object))
7904         coding->dst_object = Fbuffer_string ();
7905       else
7906         {
7907           coding->dst_object
7908             = make_unibyte_string ((char *) coding->destination,
7909                                    coding->produced);
7910           xfree (coding->destination);
7911         }
7912     }
7913
7914   if (saved_pt >= 0)
7915     {
7916       /* This is the case of:
7917          (BUFFERP (src_object) && EQ (src_object, dst_object))
7918          As we have moved PT while replacing the original buffer
7919          contents, we must recover it now.  */
7920       set_buffer_internal (XBUFFER (src_object));
7921       if (saved_pt < from)
7922         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7923       else if (saved_pt < from + chars)
7924         TEMP_SET_PT_BOTH (from, from_byte);
7925       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7926         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7927                           saved_pt_byte + (coding->produced - bytes));
7928       else
7929         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7930                           saved_pt_byte + (coding->produced - bytes));
7931
7932       if (need_marker_adjustment)
7933         {
7934           struct Lisp_Marker *tail;
7935
7936           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7937             if (tail->need_adjustment)
7938               {
7939                 tail->need_adjustment = 0;
7940                 if (tail->insertion_type)
7941                   {
7942                     tail->bytepos = from_byte;
7943                     tail->charpos = from;
7944                   }
7945                 else
7946                   {
7947                     tail->bytepos = from_byte + coding->produced;
7948                     tail->charpos
7949                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7950                          ? tail->bytepos : from + coding->produced_char);
7951                   }
7952               }
7953         }
7954     }
7955
7956   if (kill_src_buffer)
7957     Fkill_buffer (coding->src_object);
7958
7959   Vdeactivate_mark = old_deactivate_mark;
7960   unbind_to (count, Qnil);
7961 }
7962
7963
7964 Lisp_Object
7965 preferred_coding_system (void)
7966 {
7967   int id = coding_categories[coding_priorities[0]].id;
7968
7969   return CODING_ID_NAME (id);
7970 }
7971
7972 #if defined (WINDOWSNT) || defined (CYGWIN)
7973
7974 Lisp_Object
7975 from_unicode (Lisp_Object str)
7976 {
7977   CHECK_STRING (str);
7978   if (!STRING_MULTIBYTE (str) &&
7979       SBYTES (str) & 1)
7980     {
7981       str = Fsubstring (str, make_number (0), make_number (-1));
7982     }
7983
7984   return code_convert_string_norecord (str, Qutf_16le, 0);
7985 }
7986
7987 wchar_t *
7988 to_unicode (Lisp_Object str, Lisp_Object *buf)
7989 {
7990   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7991   /* We need to make a another copy (in addition to the one made by
7992      code_convert_string_norecord) to ensure that the final string is
7993      _doubly_ zero terminated --- that is, that the string is
7994      terminated by two zero bytes and one utf-16le null character.
7995      Because strings are already terminated with a single zero byte,
7996      we just add one additional zero. */
7997   str = make_uninit_string (SBYTES (*buf) + 1);
7998   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
7999   SDATA (str) [SBYTES (*buf)] = '\0';
8000   *buf = str;
8001   return WCSDATA (*buf);
8002 }
8003
8004 #endif /* WINDOWSNT || CYGWIN */
8005
8006 \f
8007 #ifdef emacs
8008 /*** 8. Emacs Lisp library functions ***/
8009
8010 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8011        doc: /* Return t if OBJECT is nil or a coding-system.
8012 See the documentation of `define-coding-system' for information
8013 about coding-system objects.  */)
8014   (Lisp_Object object)
8015 {
8016   if (NILP (object)
8017       || CODING_SYSTEM_ID (object) >= 0)
8018     return Qt;
8019   if (! SYMBOLP (object)
8020       || NILP (Fget (object, Qcoding_system_define_form)))
8021     return Qnil;
8022   return Qt;
8023 }
8024
8025 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8026        Sread_non_nil_coding_system, 1, 1, 0,
8027        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8028   (Lisp_Object prompt)
8029 {
8030   Lisp_Object val;
8031   do
8032     {
8033       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8034                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8035     }
8036   while (SCHARS (val) == 0);
8037   return (Fintern (val, Qnil));
8038 }
8039
8040 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8041        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8042 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8043 Ignores case when completing coding systems (all Emacs coding systems
8044 are lower-case).  */)
8045   (Lisp_Object prompt, Lisp_Object default_coding_system)
8046 {
8047   Lisp_Object val;
8048   ptrdiff_t count = SPECPDL_INDEX ();
8049
8050   if (SYMBOLP (default_coding_system))
8051     default_coding_system = SYMBOL_NAME (default_coding_system);
8052   specbind (Qcompletion_ignore_case, Qt);
8053   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8054                           Qt, Qnil, Qcoding_system_history,
8055                           default_coding_system, Qnil);
8056   unbind_to (count, Qnil);
8057   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8058 }
8059
8060 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8061        1, 1, 0,
8062        doc: /* Check validity of CODING-SYSTEM.
8063 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8064 It is valid if it is nil or a symbol defined as a coding system by the
8065 function `define-coding-system'.  */)
8066   (Lisp_Object coding_system)
8067 {
8068   Lisp_Object define_form;
8069
8070   define_form = Fget (coding_system, Qcoding_system_define_form);
8071   if (! NILP (define_form))
8072     {
8073       Fput (coding_system, Qcoding_system_define_form, Qnil);
8074       safe_eval (define_form);
8075     }
8076   if (!NILP (Fcoding_system_p (coding_system)))
8077     return coding_system;
8078   xsignal1 (Qcoding_system_error, coding_system);
8079 }
8080
8081 \f
8082 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8083    HIGHEST, return the coding system of the highest
8084    priority among the detected coding systems.  Otherwise return a
8085    list of detected coding systems sorted by their priorities.  If
8086    MULTIBYTEP, it is assumed that the bytes are in correct
8087    multibyte form but contains only ASCII and eight-bit chars.
8088    Otherwise, the bytes are raw bytes.
8089
8090    CODING-SYSTEM controls the detection as below:
8091
8092    If it is nil, detect both text-format and eol-format.  If the
8093    text-format part of CODING-SYSTEM is already specified
8094    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8095    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8096    detect only text-format.  */
8097
8098 Lisp_Object
8099 detect_coding_system (const unsigned char *src,
8100                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8101                       bool highest, bool multibytep,
8102                       Lisp_Object coding_system)
8103 {
8104   const unsigned char *src_end = src + src_bytes;
8105   Lisp_Object attrs, eol_type;
8106   Lisp_Object val = Qnil;
8107   struct coding_system coding;
8108   ptrdiff_t id;
8109   struct coding_detection_info detect_info;
8110   enum coding_category base_category;
8111   bool null_byte_found = 0, eight_bit_found = 0;
8112
8113   if (NILP (coding_system))
8114     coding_system = Qundecided;
8115   setup_coding_system (coding_system, &coding);
8116   attrs = CODING_ID_ATTRS (coding.id);
8117   eol_type = CODING_ID_EOL_TYPE (coding.id);
8118   coding_system = CODING_ATTR_BASE_NAME (attrs);
8119
8120   coding.source = src;
8121   coding.src_chars = src_chars;
8122   coding.src_bytes = src_bytes;
8123   coding.src_multibyte = multibytep;
8124   coding.consumed = 0;
8125   coding.mode |= CODING_MODE_LAST_BLOCK;
8126   coding.head_ascii = 0;
8127
8128   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8129
8130   /* At first, detect text-format if necessary.  */
8131   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8132   if (base_category == coding_category_undecided)
8133     {
8134       enum coding_category category IF_LINT (= 0);
8135       struct coding_system *this IF_LINT (= NULL);
8136       int c, i;
8137
8138       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8139       for (; src < src_end; src++)
8140         {
8141           c = *src;
8142           if (c & 0x80)
8143             {
8144               eight_bit_found = 1;
8145               if (null_byte_found)
8146                 break;
8147             }
8148           else if (c < 0x20)
8149             {
8150               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8151                   && ! inhibit_iso_escape_detection
8152                   && ! detect_info.checked)
8153                 {
8154                   if (detect_coding_iso_2022 (&coding, &detect_info))
8155                     {
8156                       /* We have scanned the whole data.  */
8157                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8158                         {
8159                           /* We didn't find an 8-bit code.  We may
8160                              have found a null-byte, but it's very
8161                              rare that a binary file confirm to
8162                              ISO-2022.  */
8163                           src = src_end;
8164                           coding.head_ascii = src - coding.source;
8165                         }
8166                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8167                       break;
8168                     }
8169                 }
8170               else if (! c && !inhibit_null_byte_detection)
8171                 {
8172                   null_byte_found = 1;
8173                   if (eight_bit_found)
8174                     break;
8175                 }
8176               if (! eight_bit_found)
8177                 coding.head_ascii++;
8178             }
8179           else if (! eight_bit_found)
8180             coding.head_ascii++;
8181         }
8182
8183       if (null_byte_found || eight_bit_found
8184           || coding.head_ascii < coding.src_bytes
8185           || detect_info.found)
8186         {
8187           if (coding.head_ascii == coding.src_bytes)
8188             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8189             for (i = 0; i < coding_category_raw_text; i++)
8190               {
8191                 category = coding_priorities[i];
8192                 this = coding_categories + category;
8193                 if (detect_info.found & (1 << category))
8194                   break;
8195               }
8196           else
8197             {
8198               if (null_byte_found)
8199                 {
8200                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8201                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8202                 }
8203               for (i = 0; i < coding_category_raw_text; i++)
8204                 {
8205                   category = coding_priorities[i];
8206                   this = coding_categories + category;
8207
8208                   if (this->id < 0)
8209                     {
8210                       /* No coding system of this category is defined.  */
8211                       detect_info.rejected |= (1 << category);
8212                     }
8213                   else if (category >= coding_category_raw_text)
8214                     continue;
8215                   else if (detect_info.checked & (1 << category))
8216                     {
8217                       if (highest
8218                           && (detect_info.found & (1 << category)))
8219                         break;
8220                     }
8221                   else if ((*(this->detector)) (&coding, &detect_info)
8222                            && highest
8223                            && (detect_info.found & (1 << category)))
8224                     {
8225                       if (category == coding_category_utf_16_auto)
8226                         {
8227                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8228                             category = coding_category_utf_16_le;
8229                           else
8230                             category = coding_category_utf_16_be;
8231                         }
8232                       break;
8233                     }
8234                 }
8235             }
8236         }
8237
8238       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8239           || null_byte_found)
8240         {
8241           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8242           id = CODING_SYSTEM_ID (Qno_conversion);
8243           val = Fcons (make_number (id), Qnil);
8244         }
8245       else if (! detect_info.rejected && ! detect_info.found)
8246         {
8247           detect_info.found = CATEGORY_MASK_ANY;
8248           id = coding_categories[coding_category_undecided].id;
8249           val = Fcons (make_number (id), Qnil);
8250         }
8251       else if (highest)
8252         {
8253           if (detect_info.found)
8254             {
8255               detect_info.found = 1 << category;
8256               val = Fcons (make_number (this->id), Qnil);
8257             }
8258           else
8259             for (i = 0; i < coding_category_raw_text; i++)
8260               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8261                 {
8262                   detect_info.found = 1 << coding_priorities[i];
8263                   id = coding_categories[coding_priorities[i]].id;
8264                   val = Fcons (make_number (id), Qnil);
8265                   break;
8266                 }
8267         }
8268       else
8269         {
8270           int mask = detect_info.rejected | detect_info.found;
8271           int found = 0;
8272
8273           for (i = coding_category_raw_text - 1; i >= 0; i--)
8274             {
8275               category = coding_priorities[i];
8276               if (! (mask & (1 << category)))
8277                 {
8278                   found |= 1 << category;
8279                   id = coding_categories[category].id;
8280                   if (id >= 0)
8281                     val = Fcons (make_number (id), val);
8282                 }
8283             }
8284           for (i = coding_category_raw_text - 1; i >= 0; i--)
8285             {
8286               category = coding_priorities[i];
8287               if (detect_info.found & (1 << category))
8288                 {
8289                   id = coding_categories[category].id;
8290                   val = Fcons (make_number (id), val);
8291                 }
8292             }
8293           detect_info.found |= found;
8294         }
8295     }
8296   else if (base_category == coding_category_utf_8_auto)
8297     {
8298       if (detect_coding_utf_8 (&coding, &detect_info))
8299         {
8300           struct coding_system *this;
8301
8302           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8303             this = coding_categories + coding_category_utf_8_sig;
8304           else
8305             this = coding_categories + coding_category_utf_8_nosig;
8306           val = Fcons (make_number (this->id), Qnil);
8307         }
8308     }
8309   else if (base_category == coding_category_utf_16_auto)
8310     {
8311       if (detect_coding_utf_16 (&coding, &detect_info))
8312         {
8313           struct coding_system *this;
8314
8315           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8316             this = coding_categories + coding_category_utf_16_le;
8317           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8318             this = coding_categories + coding_category_utf_16_be;
8319           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8320             this = coding_categories + coding_category_utf_16_be_nosig;
8321           else
8322             this = coding_categories + coding_category_utf_16_le_nosig;
8323           val = Fcons (make_number (this->id), Qnil);
8324         }
8325     }
8326   else
8327     {
8328       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8329       val = Fcons (make_number (coding.id), Qnil);
8330     }
8331
8332   /* Then, detect eol-format if necessary.  */
8333   {
8334     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8335     Lisp_Object tail;
8336
8337     if (VECTORP (eol_type))
8338       {
8339         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8340           {
8341             if (null_byte_found)
8342               normal_eol = EOL_SEEN_LF;
8343             else
8344               normal_eol = detect_eol (coding.source, src_bytes,
8345                                        coding_category_raw_text);
8346           }
8347         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8348                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8349           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8350                                       coding_category_utf_16_be);
8351         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8352                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8353           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8354                                       coding_category_utf_16_le);
8355       }
8356     else
8357       {
8358         if (EQ (eol_type, Qunix))
8359           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8360         else if (EQ (eol_type, Qdos))
8361           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8362         else
8363           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8364       }
8365
8366     for (tail = val; CONSP (tail); tail = XCDR (tail))
8367       {
8368         enum coding_category category;
8369         int this_eol;
8370
8371         id = XINT (XCAR (tail));
8372         attrs = CODING_ID_ATTRS (id);
8373         category = XINT (CODING_ATTR_CATEGORY (attrs));
8374         eol_type = CODING_ID_EOL_TYPE (id);
8375         if (VECTORP (eol_type))
8376           {
8377             if (category == coding_category_utf_16_be
8378                 || category == coding_category_utf_16_be_nosig)
8379               this_eol = utf_16_be_eol;
8380             else if (category == coding_category_utf_16_le
8381                      || category == coding_category_utf_16_le_nosig)
8382               this_eol = utf_16_le_eol;
8383             else
8384               this_eol = normal_eol;
8385
8386             if (this_eol == EOL_SEEN_LF)
8387               XSETCAR (tail, AREF (eol_type, 0));
8388             else if (this_eol == EOL_SEEN_CRLF)
8389               XSETCAR (tail, AREF (eol_type, 1));
8390             else if (this_eol == EOL_SEEN_CR)
8391               XSETCAR (tail, AREF (eol_type, 2));
8392             else
8393               XSETCAR (tail, CODING_ID_NAME (id));
8394           }
8395         else
8396           XSETCAR (tail, CODING_ID_NAME (id));
8397       }
8398   }
8399
8400   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8401 }
8402
8403
8404 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8405        2, 3, 0,
8406        doc: /* Detect coding system of the text in the region between START and END.
8407 Return a list of possible coding systems ordered by priority.
8408 The coding systems to try and their priorities follows what
8409 the function `coding-system-priority-list' (which see) returns.
8410
8411 If only ASCII characters are found (except for such ISO-2022 control
8412 characters as ESC), it returns a list of single element `undecided'
8413 or its subsidiary coding system according to a detected end-of-line
8414 format.
8415
8416 If optional argument HIGHEST is non-nil, return the coding system of
8417 highest priority.  */)
8418   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8419 {
8420   ptrdiff_t from, to;
8421   ptrdiff_t from_byte, to_byte;
8422
8423   validate_region (&start, &end);
8424   from = XINT (start), to = XINT (end);
8425   from_byte = CHAR_TO_BYTE (from);
8426   to_byte = CHAR_TO_BYTE (to);
8427
8428   if (from < GPT && to >= GPT)
8429     move_gap_both (to, to_byte);
8430
8431   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8432                                to - from, to_byte - from_byte,
8433                                !NILP (highest),
8434                                !NILP (BVAR (current_buffer
8435                                       , enable_multibyte_characters)),
8436                                Qnil);
8437 }
8438
8439 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8440        1, 2, 0,
8441        doc: /* Detect coding system of the text in STRING.
8442 Return a list of possible coding systems ordered by priority.
8443 The coding systems to try and their priorities follows what
8444 the function `coding-system-priority-list' (which see) returns.
8445
8446 If only ASCII characters are found (except for such ISO-2022 control
8447 characters as ESC), it returns a list of single element `undecided'
8448 or its subsidiary coding system according to a detected end-of-line
8449 format.
8450
8451 If optional argument HIGHEST is non-nil, return the coding system of
8452 highest priority.  */)
8453   (Lisp_Object string, Lisp_Object highest)
8454 {
8455   CHECK_STRING (string);
8456
8457   return detect_coding_system (SDATA (string),
8458                                SCHARS (string), SBYTES (string),
8459                                !NILP (highest), STRING_MULTIBYTE (string),
8460                                Qnil);
8461 }
8462
8463
8464 static bool
8465 char_encodable_p (int c, Lisp_Object attrs)
8466 {
8467   Lisp_Object tail;
8468   struct charset *charset;
8469   Lisp_Object translation_table;
8470
8471   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8472   if (! NILP (translation_table))
8473     c = translate_char (translation_table, c);
8474   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8475        CONSP (tail); tail = XCDR (tail))
8476     {
8477       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8478       if (CHAR_CHARSET_P (c, charset))
8479         break;
8480     }
8481   return (! NILP (tail));
8482 }
8483
8484
8485 /* Return a list of coding systems that safely encode the text between
8486    START and END.  If EXCLUDE is non-nil, it is a list of coding
8487    systems not to check.  The returned list doesn't contain any such
8488    coding systems.  In any case, if the text contains only ASCII or is
8489    unibyte, return t.  */
8490
8491 DEFUN ("find-coding-systems-region-internal",
8492        Ffind_coding_systems_region_internal,
8493        Sfind_coding_systems_region_internal, 2, 3, 0,
8494        doc: /* Internal use only.  */)
8495   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8496 {
8497   Lisp_Object coding_attrs_list, safe_codings;
8498   ptrdiff_t start_byte, end_byte;
8499   const unsigned char *p, *pbeg, *pend;
8500   int c;
8501   Lisp_Object tail, elt, work_table;
8502
8503   if (STRINGP (start))
8504     {
8505       if (!STRING_MULTIBYTE (start)
8506           || SCHARS (start) == SBYTES (start))
8507         return Qt;
8508       start_byte = 0;
8509       end_byte = SBYTES (start);
8510     }
8511   else
8512     {
8513       CHECK_NUMBER_COERCE_MARKER (start);
8514       CHECK_NUMBER_COERCE_MARKER (end);
8515       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8516         args_out_of_range (start, end);
8517       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8518         return Qt;
8519       start_byte = CHAR_TO_BYTE (XINT (start));
8520       end_byte = CHAR_TO_BYTE (XINT (end));
8521       if (XINT (end) - XINT (start) == end_byte - start_byte)
8522         return Qt;
8523
8524       if (XINT (start) < GPT && XINT (end) > GPT)
8525         {
8526           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8527             move_gap_both (XINT (start), start_byte);
8528           else
8529             move_gap_both (XINT (end), end_byte);
8530         }
8531     }
8532
8533   coding_attrs_list = Qnil;
8534   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8535     if (NILP (exclude)
8536         || NILP (Fmemq (XCAR (tail), exclude)))
8537       {
8538         Lisp_Object attrs;
8539
8540         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8541         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8542             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8543           {
8544             ASET (attrs, coding_attr_trans_tbl,
8545                   get_translation_table (attrs, 1, NULL));
8546             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8547           }
8548       }
8549
8550   if (STRINGP (start))
8551     p = pbeg = SDATA (start);
8552   else
8553     p = pbeg = BYTE_POS_ADDR (start_byte);
8554   pend = p + (end_byte - start_byte);
8555
8556   while (p < pend && ASCII_BYTE_P (*p)) p++;
8557   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8558
8559   work_table = Fmake_char_table (Qnil, Qnil);
8560   while (p < pend)
8561     {
8562       if (ASCII_BYTE_P (*p))
8563         p++;
8564       else
8565         {
8566           c = STRING_CHAR_ADVANCE (p);
8567           if (!NILP (char_table_ref (work_table, c)))
8568             /* This character was already checked.  Ignore it.  */
8569             continue;
8570
8571           charset_map_loaded = 0;
8572           for (tail = coding_attrs_list; CONSP (tail);)
8573             {
8574               elt = XCAR (tail);
8575               if (NILP (elt))
8576                 tail = XCDR (tail);
8577               else if (char_encodable_p (c, elt))
8578                 tail = XCDR (tail);
8579               else if (CONSP (XCDR (tail)))
8580                 {
8581                   XSETCAR (tail, XCAR (XCDR (tail)));
8582                   XSETCDR (tail, XCDR (XCDR (tail)));
8583                 }
8584               else
8585                 {
8586                   XSETCAR (tail, Qnil);
8587                   tail = XCDR (tail);
8588                 }
8589             }
8590           if (charset_map_loaded)
8591             {
8592               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8593
8594               if (STRINGP (start))
8595                 pbeg = SDATA (start);
8596               else
8597                 pbeg = BYTE_POS_ADDR (start_byte);
8598               p = pbeg + p_offset;
8599               pend = pbeg + pend_offset;
8600             }
8601           char_table_set (work_table, c, Qt);
8602         }
8603     }
8604
8605   safe_codings = list2 (Qraw_text, Qno_conversion);
8606   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8607     if (! NILP (XCAR (tail)))
8608       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8609
8610   return safe_codings;
8611 }
8612
8613
8614 DEFUN ("unencodable-char-position", Funencodable_char_position,
8615        Sunencodable_char_position, 3, 5, 0,
8616        doc: /*
8617 Return position of first un-encodable character in a region.
8618 START and END specify the region and CODING-SYSTEM specifies the
8619 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8620
8621 If optional 4th argument COUNT is non-nil, it specifies at most how
8622 many un-encodable characters to search.  In this case, the value is a
8623 list of positions.
8624
8625 If optional 5th argument STRING is non-nil, it is a string to search
8626 for un-encodable characters.  In that case, START and END are indexes
8627 to the string.  */)
8628   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8629 {
8630   EMACS_INT n;
8631   struct coding_system coding;
8632   Lisp_Object attrs, charset_list, translation_table;
8633   Lisp_Object positions;
8634   ptrdiff_t from, to;
8635   const unsigned char *p, *stop, *pend;
8636   bool ascii_compatible;
8637
8638   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8639   attrs = CODING_ID_ATTRS (coding.id);
8640   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8641     return Qnil;
8642   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8643   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8644   translation_table = get_translation_table (attrs, 1, NULL);
8645
8646   if (NILP (string))
8647     {
8648       validate_region (&start, &end);
8649       from = XINT (start);
8650       to = XINT (end);
8651       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8652           || (ascii_compatible
8653               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8654         return Qnil;
8655       p = CHAR_POS_ADDR (from);
8656       pend = CHAR_POS_ADDR (to);
8657       if (from < GPT && to >= GPT)
8658         stop = GPT_ADDR;
8659       else
8660         stop = pend;
8661     }
8662   else
8663     {
8664       CHECK_STRING (string);
8665       CHECK_NATNUM (start);
8666       CHECK_NATNUM (end);
8667       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8668         args_out_of_range_3 (string, start, end);
8669       from = XINT (start);
8670       to = XINT (end);
8671       if (! STRING_MULTIBYTE (string))
8672         return Qnil;
8673       p = SDATA (string) + string_char_to_byte (string, from);
8674       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8675       if (ascii_compatible && (to - from) == (pend - p))
8676         return Qnil;
8677     }
8678
8679   if (NILP (count))
8680     n = 1;
8681   else
8682     {
8683       CHECK_NATNUM (count);
8684       n = XINT (count);
8685     }
8686
8687   positions = Qnil;
8688   charset_map_loaded = 0;
8689   while (1)
8690     {
8691       int c;
8692
8693       if (ascii_compatible)
8694         while (p < stop && ASCII_BYTE_P (*p))
8695           p++, from++;
8696       if (p >= stop)
8697         {
8698           if (p >= pend)
8699             break;
8700           stop = pend;
8701           p = GAP_END_ADDR;
8702         }
8703
8704       c = STRING_CHAR_ADVANCE (p);
8705       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8706           && ! char_charset (translate_char (translation_table, c),
8707                              charset_list, NULL))
8708         {
8709           positions = Fcons (make_number (from), positions);
8710           n--;
8711           if (n == 0)
8712             break;
8713         }
8714
8715       from++;
8716       if (charset_map_loaded && NILP (string))
8717         {
8718           p = CHAR_POS_ADDR (from);
8719           pend = CHAR_POS_ADDR (to);
8720           if (from < GPT && to >= GPT)
8721             stop = GPT_ADDR;
8722           else
8723             stop = pend;
8724           charset_map_loaded = 0;
8725         }
8726     }
8727
8728   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8729 }
8730
8731
8732 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8733        Scheck_coding_systems_region, 3, 3, 0,
8734        doc: /* Check if the region is encodable by coding systems.
8735
8736 START and END are buffer positions specifying the region.
8737 CODING-SYSTEM-LIST is a list of coding systems to check.
8738
8739 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8740 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8741 whole region, POS0, POS1, ... are buffer positions where non-encodable
8742 characters are found.
8743
8744 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8745 value is nil.
8746
8747 START may be a string.  In that case, check if the string is
8748 encodable, and the value contains indices to the string instead of
8749 buffer positions.  END is ignored.
8750
8751 If the current buffer (or START if it is a string) is unibyte, the value
8752 is nil.  */)
8753   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8754 {
8755   Lisp_Object list;
8756   ptrdiff_t start_byte, end_byte;
8757   ptrdiff_t pos;
8758   const unsigned char *p, *pbeg, *pend;
8759   int c;
8760   Lisp_Object tail, elt, attrs;
8761
8762   if (STRINGP (start))
8763     {
8764       if (!STRING_MULTIBYTE (start)
8765           || SCHARS (start) == SBYTES (start))
8766         return Qnil;
8767       start_byte = 0;
8768       end_byte = SBYTES (start);
8769       pos = 0;
8770     }
8771   else
8772     {
8773       CHECK_NUMBER_COERCE_MARKER (start);
8774       CHECK_NUMBER_COERCE_MARKER (end);
8775       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8776         args_out_of_range (start, end);
8777       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8778         return Qnil;
8779       start_byte = CHAR_TO_BYTE (XINT (start));
8780       end_byte = CHAR_TO_BYTE (XINT (end));
8781       if (XINT (end) - XINT (start) == end_byte - start_byte)
8782         return Qnil;
8783
8784       if (XINT (start) < GPT && XINT (end) > GPT)
8785         {
8786           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8787             move_gap_both (XINT (start), start_byte);
8788           else
8789             move_gap_both (XINT (end), end_byte);
8790         }
8791       pos = XINT (start);
8792     }
8793
8794   list = Qnil;
8795   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8796     {
8797       elt = XCAR (tail);
8798       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8799       ASET (attrs, coding_attr_trans_tbl,
8800             get_translation_table (attrs, 1, NULL));
8801       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8802     }
8803
8804   if (STRINGP (start))
8805     p = pbeg = SDATA (start);
8806   else
8807     p = pbeg = BYTE_POS_ADDR (start_byte);
8808   pend = p + (end_byte - start_byte);
8809
8810   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8811   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8812
8813   while (p < pend)
8814     {
8815       if (ASCII_BYTE_P (*p))
8816         p++;
8817       else
8818         {
8819           c = STRING_CHAR_ADVANCE (p);
8820
8821           charset_map_loaded = 0;
8822           for (tail = list; CONSP (tail); tail = XCDR (tail))
8823             {
8824               elt = XCDR (XCAR (tail));
8825               if (! char_encodable_p (c, XCAR (elt)))
8826                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8827             }
8828           if (charset_map_loaded)
8829             {
8830               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8831
8832               if (STRINGP (start))
8833                 pbeg = SDATA (start);
8834               else
8835                 pbeg = BYTE_POS_ADDR (start_byte);
8836               p = pbeg + p_offset;
8837               pend = pbeg + pend_offset;
8838             }
8839         }
8840       pos++;
8841     }
8842
8843   tail = list;
8844   list = Qnil;
8845   for (; CONSP (tail); tail = XCDR (tail))
8846     {
8847       elt = XCAR (tail);
8848       if (CONSP (XCDR (XCDR (elt))))
8849         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8850                       list);
8851     }
8852
8853   return list;
8854 }
8855
8856
8857 static Lisp_Object
8858 code_convert_region (Lisp_Object start, Lisp_Object end,
8859                      Lisp_Object coding_system, Lisp_Object dst_object,
8860                      bool encodep, bool norecord)
8861 {
8862   struct coding_system coding;
8863   ptrdiff_t from, from_byte, to, to_byte;
8864   Lisp_Object src_object;
8865
8866   if (NILP (coding_system))
8867     coding_system = Qno_conversion;
8868   else
8869     CHECK_CODING_SYSTEM (coding_system);
8870   src_object = Fcurrent_buffer ();
8871   if (NILP (dst_object))
8872     dst_object = src_object;
8873   else if (! EQ (dst_object, Qt))
8874     CHECK_BUFFER (dst_object);
8875
8876   validate_region (&start, &end);
8877   from = XFASTINT (start);
8878   from_byte = CHAR_TO_BYTE (from);
8879   to = XFASTINT (end);
8880   to_byte = CHAR_TO_BYTE (to);
8881
8882   setup_coding_system (coding_system, &coding);
8883   coding.mode |= CODING_MODE_LAST_BLOCK;
8884
8885   if (encodep)
8886     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8887                           dst_object);
8888   else
8889     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8890                           dst_object);
8891   if (! norecord)
8892     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8893
8894   return (BUFFERP (dst_object)
8895           ? make_number (coding.produced_char)
8896           : coding.dst_object);
8897 }
8898
8899
8900 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8901        3, 4, "r\nzCoding system: ",
8902        doc: /* Decode the current region from the specified coding system.
8903 When called from a program, takes four arguments:
8904         START, END, CODING-SYSTEM, and DESTINATION.
8905 START and END are buffer positions.
8906
8907 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8908 If nil, the region between START and END is replaced by the decoded text.
8909 If buffer, the decoded text is inserted in that buffer after point (point
8910 does not move).
8911 In those cases, the length of the decoded text is returned.
8912 If DESTINATION is t, the decoded text is returned.
8913
8914 This function sets `last-coding-system-used' to the precise coding system
8915 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8916 not fully specified.)  */)
8917   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8918 {
8919   return code_convert_region (start, end, coding_system, destination, 0, 0);
8920 }
8921
8922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8923        3, 4, "r\nzCoding system: ",
8924        doc: /* Encode the current region by specified coding system.
8925 When called from a program, takes four arguments:
8926         START, END, CODING-SYSTEM and DESTINATION.
8927 START and END are buffer positions.
8928
8929 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8930 If nil, the region between START and END is replace by the encoded text.
8931 If buffer, the encoded text is inserted in that buffer after point (point
8932 does not move).
8933 In those cases, the length of the encoded text is returned.
8934 If DESTINATION is t, the encoded text is returned.
8935
8936 This function sets `last-coding-system-used' to the precise coding system
8937 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8938 not fully specified.)  */)
8939   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8940 {
8941   return code_convert_region (start, end, coding_system, destination, 1, 0);
8942 }
8943
8944 Lisp_Object
8945 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8946                      Lisp_Object dst_object, bool encodep, bool nocopy,
8947                      bool norecord)
8948 {
8949   struct coding_system coding;
8950   ptrdiff_t chars, bytes;
8951
8952   CHECK_STRING (string);
8953   if (NILP (coding_system))
8954     {
8955       if (! norecord)
8956         Vlast_coding_system_used = Qno_conversion;
8957       if (NILP (dst_object))
8958         return (nocopy ? Fcopy_sequence (string) : string);
8959     }
8960
8961   if (NILP (coding_system))
8962     coding_system = Qno_conversion;
8963   else
8964     CHECK_CODING_SYSTEM (coding_system);
8965   if (NILP (dst_object))
8966     dst_object = Qt;
8967   else if (! EQ (dst_object, Qt))
8968     CHECK_BUFFER (dst_object);
8969
8970   setup_coding_system (coding_system, &coding);
8971   coding.mode |= CODING_MODE_LAST_BLOCK;
8972   chars = SCHARS (string);
8973   bytes = SBYTES (string);
8974   if (encodep)
8975     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8976   else
8977     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8978   if (! norecord)
8979     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8980
8981   return (BUFFERP (dst_object)
8982           ? make_number (coding.produced_char)
8983           : coding.dst_object);
8984 }
8985
8986
8987 /* Encode or decode STRING according to CODING_SYSTEM.
8988    Do not set Vlast_coding_system_used.
8989
8990    This function is called only from macros DECODE_FILE and
8991    ENCODE_FILE, thus we ignore character composition.  */
8992
8993 Lisp_Object
8994 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8995                               bool encodep)
8996 {
8997   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8998 }
8999
9000
9001 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9002        2, 4, 0,
9003        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9004
9005 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9006 if the decoding operation is trivial.
9007
9008 Optional fourth arg BUFFER non-nil means that the decoded text is
9009 inserted in that buffer after point (point does not move).  In this
9010 case, the return value is the length of the decoded text.
9011
9012 This function sets `last-coding-system-used' to the precise coding system
9013 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9014 not fully specified.)  */)
9015   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9016 {
9017   return code_convert_string (string, coding_system, buffer,
9018                               0, ! NILP (nocopy), 0);
9019 }
9020
9021 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9022        2, 4, 0,
9023        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9024
9025 Optional third arg NOCOPY non-nil means it is OK to return STRING
9026 itself if the encoding operation is trivial.
9027
9028 Optional fourth arg BUFFER non-nil means that the encoded text is
9029 inserted in that buffer after point (point does not move).  In this
9030 case, the return value is the length of the encoded text.
9031
9032 This function sets `last-coding-system-used' to the precise coding system
9033 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9034 not fully specified.)  */)
9035   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9036 {
9037   return code_convert_string (string, coding_system, buffer,
9038                               1, ! NILP (nocopy), 0);
9039 }
9040
9041 \f
9042 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9043        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9044 Return the corresponding character.  */)
9045   (Lisp_Object code)
9046 {
9047   Lisp_Object spec, attrs, val;
9048   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9049   EMACS_INT ch;
9050   int c;
9051
9052   CHECK_NATNUM (code);
9053   ch = XFASTINT (code);
9054   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9055   attrs = AREF (spec, 0);
9056
9057   if (ASCII_BYTE_P (ch)
9058       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9059     return code;
9060
9061   val = CODING_ATTR_CHARSET_LIST (attrs);
9062   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9063   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9065
9066   if (ch <= 0x7F)
9067     {
9068       c = ch;
9069       charset = charset_roman;
9070     }
9071   else if (ch >= 0xA0 && ch < 0xDF)
9072     {
9073       c = ch - 0x80;
9074       charset = charset_kana;
9075     }
9076   else
9077     {
9078       EMACS_INT c1 = ch >> 8;
9079       int c2 = ch & 0xFF;
9080
9081       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9082           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9083         error ("Invalid code: %"pI"d", ch);
9084       c = ch;
9085       SJIS_TO_JIS (c);
9086       charset = charset_kanji;
9087     }
9088   c = DECODE_CHAR (charset, c);
9089   if (c < 0)
9090     error ("Invalid code: %"pI"d", ch);
9091   return make_number (c);
9092 }
9093
9094
9095 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9096        doc: /* Encode a Japanese character CH to shift_jis encoding.
9097 Return the corresponding code in SJIS.  */)
9098   (Lisp_Object ch)
9099 {
9100   Lisp_Object spec, attrs, charset_list;
9101   int c;
9102   struct charset *charset;
9103   unsigned code;
9104
9105   CHECK_CHARACTER (ch);
9106   c = XFASTINT (ch);
9107   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9108   attrs = AREF (spec, 0);
9109
9110   if (ASCII_CHAR_P (c)
9111       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9112     return ch;
9113
9114   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9115   charset = char_charset (c, charset_list, &code);
9116   if (code == CHARSET_INVALID_CODE (charset))
9117     error ("Can't encode by shift_jis encoding: %c", c);
9118   JIS_TO_SJIS (code);
9119
9120   return make_number (code);
9121 }
9122
9123 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9124        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9125 Return the corresponding character.  */)
9126   (Lisp_Object code)
9127 {
9128   Lisp_Object spec, attrs, val;
9129   struct charset *charset_roman, *charset_big5, *charset;
9130   EMACS_INT ch;
9131   int c;
9132
9133   CHECK_NATNUM (code);
9134   ch = XFASTINT (code);
9135   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9136   attrs = AREF (spec, 0);
9137
9138   if (ASCII_BYTE_P (ch)
9139       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140     return code;
9141
9142   val = CODING_ATTR_CHARSET_LIST (attrs);
9143   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9144   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9145
9146   if (ch <= 0x7F)
9147     {
9148       c = ch;
9149       charset = charset_roman;
9150     }
9151   else
9152     {
9153       EMACS_INT b1 = ch >> 8;
9154       int b2 = ch & 0x7F;
9155       if (b1 < 0xA1 || b1 > 0xFE
9156           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9157         error ("Invalid code: %"pI"d", ch);
9158       c = ch;
9159       charset = charset_big5;
9160     }
9161   c = DECODE_CHAR (charset, c);
9162   if (c < 0)
9163     error ("Invalid code: %"pI"d", ch);
9164   return make_number (c);
9165 }
9166
9167 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9168        doc: /* Encode the Big5 character CH to BIG5 coding system.
9169 Return the corresponding character code in Big5.  */)
9170   (Lisp_Object ch)
9171 {
9172   Lisp_Object spec, attrs, charset_list;
9173   struct charset *charset;
9174   int c;
9175   unsigned code;
9176
9177   CHECK_CHARACTER (ch);
9178   c = XFASTINT (ch);
9179   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9180   attrs = AREF (spec, 0);
9181   if (ASCII_CHAR_P (c)
9182       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9183     return ch;
9184
9185   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9186   charset = char_charset (c, charset_list, &code);
9187   if (code == CHARSET_INVALID_CODE (charset))
9188     error ("Can't encode by Big5 encoding: %c", c);
9189
9190   return make_number (code);
9191 }
9192
9193 \f
9194 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9195        Sset_terminal_coding_system_internal, 1, 2, 0,
9196        doc: /* Internal use only.  */)
9197   (Lisp_Object coding_system, Lisp_Object terminal)
9198 {
9199   struct terminal *term = get_terminal (terminal, 1);
9200   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9201   CHECK_SYMBOL (coding_system);
9202   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9203   /* We had better not send unsafe characters to terminal.  */
9204   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9205   /* Character composition should be disabled.  */
9206   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9207   terminal_coding->src_multibyte = 1;
9208   terminal_coding->dst_multibyte = 0;
9209   tset_charset_list
9210     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9211             ? coding_charset_list (terminal_coding)
9212             : Fcons (make_number (charset_ascii), Qnil)));
9213   return Qnil;
9214 }
9215
9216 DEFUN ("set-safe-terminal-coding-system-internal",
9217        Fset_safe_terminal_coding_system_internal,
9218        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9219        doc: /* Internal use only.  */)
9220   (Lisp_Object coding_system)
9221 {
9222   CHECK_SYMBOL (coding_system);
9223   setup_coding_system (Fcheck_coding_system (coding_system),
9224                        &safe_terminal_coding);
9225   /* Character composition should be disabled.  */
9226   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9227   safe_terminal_coding.src_multibyte = 1;
9228   safe_terminal_coding.dst_multibyte = 0;
9229   return Qnil;
9230 }
9231
9232 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9233        Sterminal_coding_system, 0, 1, 0,
9234        doc: /* Return coding system specified for terminal output on the given terminal.
9235 TERMINAL may be a terminal object, a frame, or nil for the selected
9236 frame's terminal device.  */)
9237   (Lisp_Object terminal)
9238 {
9239   struct coding_system *terminal_coding
9240     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9241   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9242
9243   /* For backward compatibility, return nil if it is `undecided'.  */
9244   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9245 }
9246
9247 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9248        Sset_keyboard_coding_system_internal, 1, 2, 0,
9249        doc: /* Internal use only.  */)
9250   (Lisp_Object coding_system, Lisp_Object terminal)
9251 {
9252   struct terminal *t = get_terminal (terminal, 1);
9253   CHECK_SYMBOL (coding_system);
9254   if (NILP (coding_system))
9255     coding_system = Qno_conversion;
9256   else
9257     Fcheck_coding_system (coding_system);
9258   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9259   /* Character composition should be disabled.  */
9260   TERMINAL_KEYBOARD_CODING (t)->common_flags
9261     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9262   return Qnil;
9263 }
9264
9265 DEFUN ("keyboard-coding-system",
9266        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9267        doc: /* Return coding system specified for decoding keyboard input.  */)
9268   (Lisp_Object terminal)
9269 {
9270   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9271                          (get_terminal (terminal, 1))->id);
9272 }
9273
9274 \f
9275 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9276        Sfind_operation_coding_system,  1, MANY, 0,
9277        doc: /* Choose a coding system for an operation based on the target name.
9278 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9279 DECODING-SYSTEM is the coding system to use for decoding
9280 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9281 for encoding (in case OPERATION does encoding).
9282
9283 The first argument OPERATION specifies an I/O primitive:
9284   For file I/O, `insert-file-contents' or `write-region'.
9285   For process I/O, `call-process', `call-process-region', or `start-process'.
9286   For network I/O, `open-network-stream'.
9287
9288 The remaining arguments should be the same arguments that were passed
9289 to the primitive.  Depending on which primitive, one of those arguments
9290 is selected as the TARGET.  For example, if OPERATION does file I/O,
9291 whichever argument specifies the file name is TARGET.
9292
9293 TARGET has a meaning which depends on OPERATION:
9294   For file I/O, TARGET is a file name (except for the special case below).
9295   For process I/O, TARGET is a process name.
9296   For network I/O, TARGET is a service name or a port number.
9297
9298 This function looks up what is specified for TARGET in
9299 `file-coding-system-alist', `process-coding-system-alist',
9300 or `network-coding-system-alist' depending on OPERATION.
9301 They may specify a coding system, a cons of coding systems,
9302 or a function symbol to call.
9303 In the last case, we call the function with one argument,
9304 which is a list of all the arguments given to this function.
9305 If the function can't decide a coding system, it can return
9306 `undecided' so that the normal code-detection is performed.
9307
9308 If OPERATION is `insert-file-contents', the argument corresponding to
9309 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9310 file name to look up, and BUFFER is a buffer that contains the file's
9311 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9312 function to call for FILENAME, that function should examine the
9313 contents of BUFFER instead of reading the file.
9314
9315 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9316   (ptrdiff_t nargs, Lisp_Object *args)
9317 {
9318   Lisp_Object operation, target_idx, target, val;
9319   register Lisp_Object chain;
9320
9321   if (nargs < 2)
9322     error ("Too few arguments");
9323   operation = args[0];
9324   if (!SYMBOLP (operation)
9325       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9326     error ("Invalid first argument");
9327   if (nargs <= 1 + XFASTINT (target_idx))
9328     error ("Too few arguments for operation `%s'",
9329            SDATA (SYMBOL_NAME (operation)));
9330   target = args[XFASTINT (target_idx) + 1];
9331   if (!(STRINGP (target)
9332         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9333             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9334         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9335     error ("Invalid argument %"pI"d of operation `%s'",
9336            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9337   if (CONSP (target))
9338     target = XCAR (target);
9339
9340   chain = ((EQ (operation, Qinsert_file_contents)
9341             || EQ (operation, Qwrite_region))
9342            ? Vfile_coding_system_alist
9343            : (EQ (operation, Qopen_network_stream)
9344               ? Vnetwork_coding_system_alist
9345               : Vprocess_coding_system_alist));
9346   if (NILP (chain))
9347     return Qnil;
9348
9349   for (; CONSP (chain); chain = XCDR (chain))
9350     {
9351       Lisp_Object elt;
9352
9353       elt = XCAR (chain);
9354       if (CONSP (elt)
9355           && ((STRINGP (target)
9356                && STRINGP (XCAR (elt))
9357                && fast_string_match (XCAR (elt), target) >= 0)
9358               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9359         {
9360           val = XCDR (elt);
9361           /* Here, if VAL is both a valid coding system and a valid
9362              function symbol, we return VAL as a coding system.  */
9363           if (CONSP (val))
9364             return val;
9365           if (! SYMBOLP (val))
9366             return Qnil;
9367           if (! NILP (Fcoding_system_p (val)))
9368             return Fcons (val, val);
9369           if (! NILP (Ffboundp (val)))
9370             {
9371               /* We use call1 rather than safe_call1
9372                  so as to get bug reports about functions called here
9373                  which don't handle the current interface.  */
9374               val = call1 (val, Flist (nargs, args));
9375               if (CONSP (val))
9376                 return val;
9377               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9378                 return Fcons (val, val);
9379             }
9380           return Qnil;
9381         }
9382     }
9383   return Qnil;
9384 }
9385
9386 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9387        Sset_coding_system_priority, 0, MANY, 0,
9388        doc: /* Assign higher priority to the coding systems given as arguments.
9389 If multiple coding systems belong to the same category,
9390 all but the first one are ignored.
9391
9392 usage: (set-coding-system-priority &rest coding-systems)  */)
9393   (ptrdiff_t nargs, Lisp_Object *args)
9394 {
9395   ptrdiff_t i, j;
9396   bool changed[coding_category_max];
9397   enum coding_category priorities[coding_category_max];
9398
9399   memset (changed, 0, sizeof changed);
9400
9401   for (i = j = 0; i < nargs; i++)
9402     {
9403       enum coding_category category;
9404       Lisp_Object spec, attrs;
9405
9406       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9407       attrs = AREF (spec, 0);
9408       category = XINT (CODING_ATTR_CATEGORY (attrs));
9409       if (changed[category])
9410         /* Ignore this coding system because a coding system of the
9411            same category already had a higher priority.  */
9412         continue;
9413       changed[category] = 1;
9414       priorities[j++] = category;
9415       if (coding_categories[category].id >= 0
9416           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9417         setup_coding_system (args[i], &coding_categories[category]);
9418       Fset (AREF (Vcoding_category_table, category), args[i]);
9419     }
9420
9421   /* Now we have decided top J priorities.  Reflect the order of the
9422      original priorities to the remaining priorities.  */
9423
9424   for (i = j, j = 0; i < coding_category_max; i++, j++)
9425     {
9426       while (j < coding_category_max
9427              && changed[coding_priorities[j]])
9428         j++;
9429       if (j == coding_category_max)
9430         emacs_abort ();
9431       priorities[i] = coding_priorities[j];
9432     }
9433
9434   memcpy (coding_priorities, priorities, sizeof priorities);
9435
9436   /* Update `coding-category-list'.  */
9437   Vcoding_category_list = Qnil;
9438   for (i = coding_category_max; i-- > 0; )
9439     Vcoding_category_list
9440       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9441                Vcoding_category_list);
9442
9443   return Qnil;
9444 }
9445
9446 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9447        Scoding_system_priority_list, 0, 1, 0,
9448        doc: /* Return a list of coding systems ordered by their priorities.
9449 The list contains a subset of coding systems; i.e. coding systems
9450 assigned to each coding category (see `coding-category-list').
9451
9452 HIGHESTP non-nil means just return the highest priority one.  */)
9453   (Lisp_Object highestp)
9454 {
9455   int i;
9456   Lisp_Object val;
9457
9458   for (i = 0, val = Qnil; i < coding_category_max; i++)
9459     {
9460       enum coding_category category = coding_priorities[i];
9461       int id = coding_categories[category].id;
9462       Lisp_Object attrs;
9463
9464       if (id < 0)
9465         continue;
9466       attrs = CODING_ID_ATTRS (id);
9467       if (! NILP (highestp))
9468         return CODING_ATTR_BASE_NAME (attrs);
9469       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9470     }
9471   return Fnreverse (val);
9472 }
9473
9474 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9475
9476 static Lisp_Object
9477 make_subsidiaries (Lisp_Object base)
9478 {
9479   Lisp_Object subsidiaries;
9480   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9481   char *buf = alloca (base_name_len + 6);
9482   int i;
9483
9484   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9485   subsidiaries = Fmake_vector (make_number (3), Qnil);
9486   for (i = 0; i < 3; i++)
9487     {
9488       strcpy (buf + base_name_len, suffixes[i]);
9489       ASET (subsidiaries, i, intern (buf));
9490     }
9491   return subsidiaries;
9492 }
9493
9494
9495 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9496        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9497        doc: /* For internal use only.
9498 usage: (define-coding-system-internal ...)  */)
9499   (ptrdiff_t nargs, Lisp_Object *args)
9500 {
9501   Lisp_Object name;
9502   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9503   Lisp_Object attrs;            /* Vector of attributes.  */
9504   Lisp_Object eol_type;
9505   Lisp_Object aliases;
9506   Lisp_Object coding_type, charset_list, safe_charsets;
9507   enum coding_category category;
9508   Lisp_Object tail, val;
9509   int max_charset_id = 0;
9510   int i;
9511
9512   if (nargs < coding_arg_max)
9513     goto short_args;
9514
9515   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9516
9517   name = args[coding_arg_name];
9518   CHECK_SYMBOL (name);
9519   ASET (attrs, coding_attr_base_name, name);
9520
9521   val = args[coding_arg_mnemonic];
9522   if (! STRINGP (val))
9523     CHECK_CHARACTER (val);
9524   ASET (attrs, coding_attr_mnemonic, val);
9525
9526   coding_type = args[coding_arg_coding_type];
9527   CHECK_SYMBOL (coding_type);
9528   ASET (attrs, coding_attr_type, coding_type);
9529
9530   charset_list = args[coding_arg_charset_list];
9531   if (SYMBOLP (charset_list))
9532     {
9533       if (EQ (charset_list, Qiso_2022))
9534         {
9535           if (! EQ (coding_type, Qiso_2022))
9536             error ("Invalid charset-list");
9537           charset_list = Viso_2022_charset_list;
9538         }
9539       else if (EQ (charset_list, Qemacs_mule))
9540         {
9541           if (! EQ (coding_type, Qemacs_mule))
9542             error ("Invalid charset-list");
9543           charset_list = Vemacs_mule_charset_list;
9544         }
9545       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9546         {
9547           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9548             error ("Invalid charset-list");
9549           if (max_charset_id < XFASTINT (XCAR (tail)))
9550             max_charset_id = XFASTINT (XCAR (tail));
9551         }
9552     }
9553   else
9554     {
9555       charset_list = Fcopy_sequence (charset_list);
9556       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9557         {
9558           struct charset *charset;
9559
9560           val = XCAR (tail);
9561           CHECK_CHARSET_GET_CHARSET (val, charset);
9562           if (EQ (coding_type, Qiso_2022)
9563               ? CHARSET_ISO_FINAL (charset) < 0
9564               : EQ (coding_type, Qemacs_mule)
9565               ? CHARSET_EMACS_MULE_ID (charset) < 0
9566               : 0)
9567             error ("Can't handle charset `%s'",
9568                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9569
9570           XSETCAR (tail, make_number (charset->id));
9571           if (max_charset_id < charset->id)
9572             max_charset_id = charset->id;
9573         }
9574     }
9575   ASET (attrs, coding_attr_charset_list, charset_list);
9576
9577   safe_charsets = make_uninit_string (max_charset_id + 1);
9578   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9579   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9580     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9581   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9582
9583   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9584
9585   val = args[coding_arg_decode_translation_table];
9586   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9587     CHECK_SYMBOL (val);
9588   ASET (attrs, coding_attr_decode_tbl, val);
9589
9590   val = args[coding_arg_encode_translation_table];
9591   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9592     CHECK_SYMBOL (val);
9593   ASET (attrs, coding_attr_encode_tbl, val);
9594
9595   val = args[coding_arg_post_read_conversion];
9596   CHECK_SYMBOL (val);
9597   ASET (attrs, coding_attr_post_read, val);
9598
9599   val = args[coding_arg_pre_write_conversion];
9600   CHECK_SYMBOL (val);
9601   ASET (attrs, coding_attr_pre_write, val);
9602
9603   val = args[coding_arg_default_char];
9604   if (NILP (val))
9605     ASET (attrs, coding_attr_default_char, make_number (' '));
9606   else
9607     {
9608       CHECK_CHARACTER (val);
9609       ASET (attrs, coding_attr_default_char, val);
9610     }
9611
9612   val = args[coding_arg_for_unibyte];
9613   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9614
9615   val = args[coding_arg_plist];
9616   CHECK_LIST (val);
9617   ASET (attrs, coding_attr_plist, val);
9618
9619   if (EQ (coding_type, Qcharset))
9620     {
9621       /* Generate a lisp vector of 256 elements.  Each element is nil,
9622          integer, or a list of charset IDs.
9623
9624          If Nth element is nil, the byte code N is invalid in this
9625          coding system.
9626
9627          If Nth element is a number NUM, N is the first byte of a
9628          charset whose ID is NUM.
9629
9630          If Nth element is a list of charset IDs, N is the first byte
9631          of one of them.  The list is sorted by dimensions of the
9632          charsets.  A charset of smaller dimension comes first. */
9633       val = Fmake_vector (make_number (256), Qnil);
9634
9635       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9636         {
9637           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9638           int dim = CHARSET_DIMENSION (charset);
9639           int idx = (dim - 1) * 4;
9640
9641           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9642             ASET (attrs, coding_attr_ascii_compat, Qt);
9643
9644           for (i = charset->code_space[idx];
9645                i <= charset->code_space[idx + 1]; i++)
9646             {
9647               Lisp_Object tmp, tmp2;
9648               int dim2;
9649
9650               tmp = AREF (val, i);
9651               if (NILP (tmp))
9652                 tmp = XCAR (tail);
9653               else if (NUMBERP (tmp))
9654                 {
9655                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9656                   if (dim < dim2)
9657                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9658                   else
9659                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9660                 }
9661               else
9662                 {
9663                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9664                     {
9665                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9666                       if (dim < dim2)
9667                         break;
9668                     }
9669                   if (NILP (tmp2))
9670                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9671                   else
9672                     {
9673                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9674                       XSETCAR (tmp2, XCAR (tail));
9675                     }
9676                 }
9677               ASET (val, i, tmp);
9678             }
9679         }
9680       ASET (attrs, coding_attr_charset_valids, val);
9681       category = coding_category_charset;
9682     }
9683   else if (EQ (coding_type, Qccl))
9684     {
9685       Lisp_Object valids;
9686
9687       if (nargs < coding_arg_ccl_max)
9688         goto short_args;
9689
9690       val = args[coding_arg_ccl_decoder];
9691       CHECK_CCL_PROGRAM (val);
9692       if (VECTORP (val))
9693         val = Fcopy_sequence (val);
9694       ASET (attrs, coding_attr_ccl_decoder, val);
9695
9696       val = args[coding_arg_ccl_encoder];
9697       CHECK_CCL_PROGRAM (val);
9698       if (VECTORP (val))
9699         val = Fcopy_sequence (val);
9700       ASET (attrs, coding_attr_ccl_encoder, val);
9701
9702       val = args[coding_arg_ccl_valids];
9703       valids = Fmake_string (make_number (256), make_number (0));
9704       for (tail = val; CONSP (tail); tail = XCDR (tail))
9705         {
9706           int from, to;
9707
9708           val = XCAR (tail);
9709           if (INTEGERP (val))
9710             {
9711               if (! (0 <= XINT (val) && XINT (val) <= 255))
9712                 args_out_of_range_3 (val, make_number (0), make_number (255));
9713               from = to = XINT (val);
9714             }
9715           else
9716             {
9717               CHECK_CONS (val);
9718               CHECK_NATNUM_CAR (val);
9719               CHECK_NUMBER_CDR (val);
9720               if (XINT (XCAR (val)) > 255)
9721                 args_out_of_range_3 (XCAR (val),
9722                                      make_number (0), make_number (255));
9723               from = XINT (XCAR (val));
9724               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9725                 args_out_of_range_3 (XCDR (val),
9726                                      XCAR (val), make_number (255));
9727               to = XINT (XCDR (val));
9728             }
9729           for (i = from; i <= to; i++)
9730             SSET (valids, i, 1);
9731         }
9732       ASET (attrs, coding_attr_ccl_valids, valids);
9733
9734       category = coding_category_ccl;
9735     }
9736   else if (EQ (coding_type, Qutf_16))
9737     {
9738       Lisp_Object bom, endian;
9739
9740       ASET (attrs, coding_attr_ascii_compat, Qnil);
9741
9742       if (nargs < coding_arg_utf16_max)
9743         goto short_args;
9744
9745       bom = args[coding_arg_utf16_bom];
9746       if (! NILP (bom) && ! EQ (bom, Qt))
9747         {
9748           CHECK_CONS (bom);
9749           val = XCAR (bom);
9750           CHECK_CODING_SYSTEM (val);
9751           val = XCDR (bom);
9752           CHECK_CODING_SYSTEM (val);
9753         }
9754       ASET (attrs, coding_attr_utf_bom, bom);
9755
9756       endian = args[coding_arg_utf16_endian];
9757       CHECK_SYMBOL (endian);
9758       if (NILP (endian))
9759         endian = Qbig;
9760       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9761         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9762       ASET (attrs, coding_attr_utf_16_endian, endian);
9763
9764       category = (CONSP (bom)
9765                   ? coding_category_utf_16_auto
9766                   : NILP (bom)
9767                   ? (EQ (endian, Qbig)
9768                      ? coding_category_utf_16_be_nosig
9769                      : coding_category_utf_16_le_nosig)
9770                   : (EQ (endian, Qbig)
9771                      ? coding_category_utf_16_be
9772                      : coding_category_utf_16_le));
9773     }
9774   else if (EQ (coding_type, Qiso_2022))
9775     {
9776       Lisp_Object initial, reg_usage, request, flags;
9777
9778       if (nargs < coding_arg_iso2022_max)
9779         goto short_args;
9780
9781       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9782       CHECK_VECTOR (initial);
9783       for (i = 0; i < 4; i++)
9784         {
9785           val = Faref (initial, make_number (i));
9786           if (! NILP (val))
9787             {
9788               struct charset *charset;
9789
9790               CHECK_CHARSET_GET_CHARSET (val, charset);
9791               ASET (initial, i, make_number (CHARSET_ID (charset)));
9792               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9793                 ASET (attrs, coding_attr_ascii_compat, Qt);
9794             }
9795           else
9796             ASET (initial, i, make_number (-1));
9797         }
9798
9799       reg_usage = args[coding_arg_iso2022_reg_usage];
9800       CHECK_CONS (reg_usage);
9801       CHECK_NUMBER_CAR (reg_usage);
9802       CHECK_NUMBER_CDR (reg_usage);
9803
9804       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9805       for (tail = request; CONSP (tail); tail = XCDR (tail))
9806         {
9807           int id;
9808           Lisp_Object tmp1;
9809
9810           val = XCAR (tail);
9811           CHECK_CONS (val);
9812           tmp1 = XCAR (val);
9813           CHECK_CHARSET_GET_ID (tmp1, id);
9814           CHECK_NATNUM_CDR (val);
9815           if (XINT (XCDR (val)) >= 4)
9816             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9817           XSETCAR (val, make_number (id));
9818         }
9819
9820       flags = args[coding_arg_iso2022_flags];
9821       CHECK_NATNUM (flags);
9822       i = XINT (flags) & INT_MAX;
9823       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9824         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9825       flags = make_number (i);
9826
9827       ASET (attrs, coding_attr_iso_initial, initial);
9828       ASET (attrs, coding_attr_iso_usage, reg_usage);
9829       ASET (attrs, coding_attr_iso_request, request);
9830       ASET (attrs, coding_attr_iso_flags, flags);
9831       setup_iso_safe_charsets (attrs);
9832
9833       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9834         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9835                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9836                     ? coding_category_iso_7_else
9837                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9838                     ? coding_category_iso_7
9839                     : coding_category_iso_7_tight);
9840       else
9841         {
9842           int id = XINT (AREF (initial, 1));
9843
9844           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9845                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9846                        || id < 0)
9847                       ? coding_category_iso_8_else
9848                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9849                       ? coding_category_iso_8_1
9850                       : coding_category_iso_8_2);
9851         }
9852       if (category != coding_category_iso_8_1
9853           && category != coding_category_iso_8_2)
9854         ASET (attrs, coding_attr_ascii_compat, Qnil);
9855     }
9856   else if (EQ (coding_type, Qemacs_mule))
9857     {
9858       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9859         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9860       ASET (attrs, coding_attr_ascii_compat, Qt);
9861       category = coding_category_emacs_mule;
9862     }
9863   else if (EQ (coding_type, Qshift_jis))
9864     {
9865
9866       struct charset *charset;
9867
9868       if (XINT (Flength (charset_list)) != 3
9869           && XINT (Flength (charset_list)) != 4)
9870         error ("There should be three or four charsets");
9871
9872       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9873       if (CHARSET_DIMENSION (charset) != 1)
9874         error ("Dimension of charset %s is not one",
9875                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9876       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9877         ASET (attrs, coding_attr_ascii_compat, Qt);
9878
9879       charset_list = XCDR (charset_list);
9880       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9881       if (CHARSET_DIMENSION (charset) != 1)
9882         error ("Dimension of charset %s is not one",
9883                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9884
9885       charset_list = XCDR (charset_list);
9886       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9887       if (CHARSET_DIMENSION (charset) != 2)
9888         error ("Dimension of charset %s is not two",
9889                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9890
9891       charset_list = XCDR (charset_list);
9892       if (! NILP (charset_list))
9893         {
9894           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9895           if (CHARSET_DIMENSION (charset) != 2)
9896             error ("Dimension of charset %s is not two",
9897                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9898         }
9899
9900       category = coding_category_sjis;
9901       Vsjis_coding_system = name;
9902     }
9903   else if (EQ (coding_type, Qbig5))
9904     {
9905       struct charset *charset;
9906
9907       if (XINT (Flength (charset_list)) != 2)
9908         error ("There should be just two charsets");
9909
9910       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9911       if (CHARSET_DIMENSION (charset) != 1)
9912         error ("Dimension of charset %s is not one",
9913                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9914       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9915         ASET (attrs, coding_attr_ascii_compat, Qt);
9916
9917       charset_list = XCDR (charset_list);
9918       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9919       if (CHARSET_DIMENSION (charset) != 2)
9920         error ("Dimension of charset %s is not two",
9921                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9922
9923       category = coding_category_big5;
9924       Vbig5_coding_system = name;
9925     }
9926   else if (EQ (coding_type, Qraw_text))
9927     {
9928       category = coding_category_raw_text;
9929       ASET (attrs, coding_attr_ascii_compat, Qt);
9930     }
9931   else if (EQ (coding_type, Qutf_8))
9932     {
9933       Lisp_Object bom;
9934
9935       if (nargs < coding_arg_utf8_max)
9936         goto short_args;
9937
9938       bom = args[coding_arg_utf8_bom];
9939       if (! NILP (bom) && ! EQ (bom, Qt))
9940         {
9941           CHECK_CONS (bom);
9942           val = XCAR (bom);
9943           CHECK_CODING_SYSTEM (val);
9944           val = XCDR (bom);
9945           CHECK_CODING_SYSTEM (val);
9946         }
9947       ASET (attrs, coding_attr_utf_bom, bom);
9948       if (NILP (bom))
9949         ASET (attrs, coding_attr_ascii_compat, Qt);
9950
9951       category = (CONSP (bom) ? coding_category_utf_8_auto
9952                   : NILP (bom) ? coding_category_utf_8_nosig
9953                   : coding_category_utf_8_sig);
9954     }
9955   else if (EQ (coding_type, Qundecided))
9956     category = coding_category_undecided;
9957   else
9958     error ("Invalid coding system type: %s",
9959            SDATA (SYMBOL_NAME (coding_type)));
9960
9961   ASET (attrs, coding_attr_category, make_number (category));
9962   ASET (attrs, coding_attr_plist,
9963         Fcons (QCcategory,
9964                Fcons (AREF (Vcoding_category_table, category),
9965                       CODING_ATTR_PLIST (attrs))));
9966   ASET (attrs, coding_attr_plist,
9967         Fcons (QCascii_compatible_p,
9968                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9969                       CODING_ATTR_PLIST (attrs))));
9970
9971   eol_type = args[coding_arg_eol_type];
9972   if (! NILP (eol_type)
9973       && ! EQ (eol_type, Qunix)
9974       && ! EQ (eol_type, Qdos)
9975       && ! EQ (eol_type, Qmac))
9976     error ("Invalid eol-type");
9977
9978   aliases = Fcons (name, Qnil);
9979
9980   if (NILP (eol_type))
9981     {
9982       eol_type = make_subsidiaries (name);
9983       for (i = 0; i < 3; i++)
9984         {
9985           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9986
9987           this_name = AREF (eol_type, i);
9988           this_aliases = Fcons (this_name, Qnil);
9989           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9990           this_spec = Fmake_vector (make_number (3), attrs);
9991           ASET (this_spec, 1, this_aliases);
9992           ASET (this_spec, 2, this_eol_type);
9993           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9994           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9995           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9996           if (NILP (val))
9997             Vcoding_system_alist
9998               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9999                        Vcoding_system_alist);
10000         }
10001     }
10002
10003   spec_vec = Fmake_vector (make_number (3), attrs);
10004   ASET (spec_vec, 1, aliases);
10005   ASET (spec_vec, 2, eol_type);
10006
10007   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10008   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10009   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10010   if (NILP (val))
10011     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10012                                   Vcoding_system_alist);
10013
10014   {
10015     int id = coding_categories[category].id;
10016
10017     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10018       setup_coding_system (name, &coding_categories[category]);
10019   }
10020
10021   return Qnil;
10022
10023  short_args:
10024   return Fsignal (Qwrong_number_of_arguments,
10025                   Fcons (intern ("define-coding-system-internal"),
10026                          make_number (nargs)));
10027 }
10028
10029
10030 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10031        3, 3, 0,
10032        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10033   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10034 {
10035   Lisp_Object spec, attrs;
10036
10037   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10038   attrs = AREF (spec, 0);
10039   if (EQ (prop, QCmnemonic))
10040     {
10041       if (! STRINGP (val))
10042         CHECK_CHARACTER (val);
10043       ASET (attrs, coding_attr_mnemonic, val);
10044     }
10045   else if (EQ (prop, QCdefault_char))
10046     {
10047       if (NILP (val))
10048         val = make_number (' ');
10049       else
10050         CHECK_CHARACTER (val);
10051       ASET (attrs, coding_attr_default_char, val);
10052     }
10053   else if (EQ (prop, QCdecode_translation_table))
10054     {
10055       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10056         CHECK_SYMBOL (val);
10057       ASET (attrs, coding_attr_decode_tbl, val);
10058     }
10059   else if (EQ (prop, QCencode_translation_table))
10060     {
10061       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062         CHECK_SYMBOL (val);
10063       ASET (attrs, coding_attr_encode_tbl, val);
10064     }
10065   else if (EQ (prop, QCpost_read_conversion))
10066     {
10067       CHECK_SYMBOL (val);
10068       ASET (attrs, coding_attr_post_read, val);
10069     }
10070   else if (EQ (prop, QCpre_write_conversion))
10071     {
10072       CHECK_SYMBOL (val);
10073       ASET (attrs, coding_attr_pre_write, val);
10074     }
10075   else if (EQ (prop, QCascii_compatible_p))
10076     {
10077       ASET (attrs, coding_attr_ascii_compat, val);
10078     }
10079
10080   ASET (attrs, coding_attr_plist,
10081         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10082   return val;
10083 }
10084
10085
10086 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10087        Sdefine_coding_system_alias, 2, 2, 0,
10088        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10089   (Lisp_Object alias, Lisp_Object coding_system)
10090 {
10091   Lisp_Object spec, aliases, eol_type, val;
10092
10093   CHECK_SYMBOL (alias);
10094   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10095   aliases = AREF (spec, 1);
10096   /* ALIASES should be a list of length more than zero, and the first
10097      element is a base coding system.  Append ALIAS at the tail of the
10098      list.  */
10099   while (!NILP (XCDR (aliases)))
10100     aliases = XCDR (aliases);
10101   XSETCDR (aliases, Fcons (alias, Qnil));
10102
10103   eol_type = AREF (spec, 2);
10104   if (VECTORP (eol_type))
10105     {
10106       Lisp_Object subsidiaries;
10107       int i;
10108
10109       subsidiaries = make_subsidiaries (alias);
10110       for (i = 0; i < 3; i++)
10111         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10112                                      AREF (eol_type, i));
10113     }
10114
10115   Fputhash (alias, spec, Vcoding_system_hash_table);
10116   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10117   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10118   if (NILP (val))
10119     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10120                                   Vcoding_system_alist);
10121
10122   return Qnil;
10123 }
10124
10125 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10126        1, 1, 0,
10127        doc: /* Return the base of CODING-SYSTEM.
10128 Any alias or subsidiary coding system is not a base coding system.  */)
10129   (Lisp_Object coding_system)
10130 {
10131   Lisp_Object spec, attrs;
10132
10133   if (NILP (coding_system))
10134     return (Qno_conversion);
10135   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10136   attrs = AREF (spec, 0);
10137   return CODING_ATTR_BASE_NAME (attrs);
10138 }
10139
10140 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10141        1, 1, 0,
10142        doc: "Return the property list of CODING-SYSTEM.")
10143   (Lisp_Object coding_system)
10144 {
10145   Lisp_Object spec, attrs;
10146
10147   if (NILP (coding_system))
10148     coding_system = Qno_conversion;
10149   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10150   attrs = AREF (spec, 0);
10151   return CODING_ATTR_PLIST (attrs);
10152 }
10153
10154
10155 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10156        1, 1, 0,
10157        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10158   (Lisp_Object coding_system)
10159 {
10160   Lisp_Object spec;
10161
10162   if (NILP (coding_system))
10163     coding_system = Qno_conversion;
10164   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10165   return AREF (spec, 1);
10166 }
10167
10168 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10169        Scoding_system_eol_type, 1, 1, 0,
10170        doc: /* Return eol-type of CODING-SYSTEM.
10171 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10172
10173 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10174 and CR respectively.
10175
10176 A vector value indicates that a format of end-of-line should be
10177 detected automatically.  Nth element of the vector is the subsidiary
10178 coding system whose eol-type is N.  */)
10179   (Lisp_Object coding_system)
10180 {
10181   Lisp_Object spec, eol_type;
10182   int n;
10183
10184   if (NILP (coding_system))
10185     coding_system = Qno_conversion;
10186   if (! CODING_SYSTEM_P (coding_system))
10187     return Qnil;
10188   spec = CODING_SYSTEM_SPEC (coding_system);
10189   eol_type = AREF (spec, 2);
10190   if (VECTORP (eol_type))
10191     return Fcopy_sequence (eol_type);
10192   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10193   return make_number (n);
10194 }
10195
10196 #endif /* emacs */
10197
10198 \f
10199 /*** 9. Post-amble ***/
10200
10201 void
10202 init_coding_once (void)
10203 {
10204   int i;
10205
10206   for (i = 0; i < coding_category_max; i++)
10207     {
10208       coding_categories[i].id = -1;
10209       coding_priorities[i] = i;
10210     }
10211
10212   /* ISO2022 specific initialize routine.  */
10213   for (i = 0; i < 0x20; i++)
10214     iso_code_class[i] = ISO_control_0;
10215   for (i = 0x21; i < 0x7F; i++)
10216     iso_code_class[i] = ISO_graphic_plane_0;
10217   for (i = 0x80; i < 0xA0; i++)
10218     iso_code_class[i] = ISO_control_1;
10219   for (i = 0xA1; i < 0xFF; i++)
10220     iso_code_class[i] = ISO_graphic_plane_1;
10221   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10222   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10223   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10224   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10225   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10226   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10227   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10228   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10229   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10230
10231   for (i = 0; i < 256; i++)
10232     {
10233       emacs_mule_bytes[i] = 1;
10234     }
10235   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10236   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10237   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10238   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10239 }
10240
10241 #ifdef emacs
10242
10243 void
10244 syms_of_coding (void)
10245 {
10246   staticpro (&Vcoding_system_hash_table);
10247   {
10248     Lisp_Object args[2];
10249     args[0] = QCtest;
10250     args[1] = Qeq;
10251     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10252   }
10253
10254   staticpro (&Vsjis_coding_system);
10255   Vsjis_coding_system = Qnil;
10256
10257   staticpro (&Vbig5_coding_system);
10258   Vbig5_coding_system = Qnil;
10259
10260   staticpro (&Vcode_conversion_reused_workbuf);
10261   Vcode_conversion_reused_workbuf = Qnil;
10262
10263   staticpro (&Vcode_conversion_workbuf_name);
10264   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10265
10266   reused_workbuf_in_use = 0;
10267
10268   DEFSYM (Qcharset, "charset");
10269   DEFSYM (Qtarget_idx, "target-idx");
10270   DEFSYM (Qcoding_system_history, "coding-system-history");
10271   Fset (Qcoding_system_history, Qnil);
10272
10273   /* Target FILENAME is the first argument.  */
10274   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10275   /* Target FILENAME is the third argument.  */
10276   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10277
10278   DEFSYM (Qcall_process, "call-process");
10279   /* Target PROGRAM is the first argument.  */
10280   Fput (Qcall_process, Qtarget_idx, make_number (0));
10281
10282   DEFSYM (Qcall_process_region, "call-process-region");
10283   /* Target PROGRAM is the third argument.  */
10284   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10285
10286   DEFSYM (Qstart_process, "start-process");
10287   /* Target PROGRAM is the third argument.  */
10288   Fput (Qstart_process, Qtarget_idx, make_number (2));
10289
10290   DEFSYM (Qopen_network_stream, "open-network-stream");
10291   /* Target SERVICE is the fourth argument.  */
10292   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10293
10294   DEFSYM (Qcoding_system, "coding-system");
10295   DEFSYM (Qcoding_aliases, "coding-aliases");
10296
10297   DEFSYM (Qeol_type, "eol-type");
10298   DEFSYM (Qunix, "unix");
10299   DEFSYM (Qdos, "dos");
10300   DEFSYM (Qmac, "mac");
10301
10302   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10303   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10304   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10305   DEFSYM (Qdefault_char, "default-char");
10306   DEFSYM (Qundecided, "undecided");
10307   DEFSYM (Qno_conversion, "no-conversion");
10308   DEFSYM (Qraw_text, "raw-text");
10309
10310   DEFSYM (Qiso_2022, "iso-2022");
10311
10312   DEFSYM (Qutf_8, "utf-8");
10313   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10314
10315 #if defined (WINDOWSNT) || defined (CYGWIN)
10316   /* No, not utf-16-le: that one has a BOM.  */
10317   DEFSYM (Qutf_16le, "utf-16le");
10318 #endif
10319
10320   DEFSYM (Qutf_16, "utf-16");
10321   DEFSYM (Qbig, "big");
10322   DEFSYM (Qlittle, "little");
10323
10324   DEFSYM (Qshift_jis, "shift-jis");
10325   DEFSYM (Qbig5, "big5");
10326
10327   DEFSYM (Qcoding_system_p, "coding-system-p");
10328
10329   DEFSYM (Qcoding_system_error, "coding-system-error");
10330   Fput (Qcoding_system_error, Qerror_conditions,
10331         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10332   Fput (Qcoding_system_error, Qerror_message,
10333         build_pure_c_string ("Invalid coding system"));
10334
10335   /* Intern this now in case it isn't already done.
10336      Setting this variable twice is harmless.
10337      But don't staticpro it here--that is done in alloc.c.  */
10338   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10339
10340   DEFSYM (Qtranslation_table, "translation-table");
10341   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10342   DEFSYM (Qtranslation_table_id, "translation-table-id");
10343   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10344   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10345
10346   DEFSYM (Qvalid_codes, "valid-codes");
10347
10348   DEFSYM (Qemacs_mule, "emacs-mule");
10349
10350   DEFSYM (QCcategory, ":category");
10351   DEFSYM (QCmnemonic, ":mnemonic");
10352   DEFSYM (QCdefault_char, ":default-char");
10353   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10354   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10355   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10356   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10357   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10358
10359   Vcoding_category_table
10360     = Fmake_vector (make_number (coding_category_max), Qnil);
10361   staticpro (&Vcoding_category_table);
10362   /* Followings are target of code detection.  */
10363   ASET (Vcoding_category_table, coding_category_iso_7,
10364         intern_c_string ("coding-category-iso-7"));
10365   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10366         intern_c_string ("coding-category-iso-7-tight"));
10367   ASET (Vcoding_category_table, coding_category_iso_8_1,
10368         intern_c_string ("coding-category-iso-8-1"));
10369   ASET (Vcoding_category_table, coding_category_iso_8_2,
10370         intern_c_string ("coding-category-iso-8-2"));
10371   ASET (Vcoding_category_table, coding_category_iso_7_else,
10372         intern_c_string ("coding-category-iso-7-else"));
10373   ASET (Vcoding_category_table, coding_category_iso_8_else,
10374         intern_c_string ("coding-category-iso-8-else"));
10375   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10376         intern_c_string ("coding-category-utf-8-auto"));
10377   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10378         intern_c_string ("coding-category-utf-8"));
10379   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10380         intern_c_string ("coding-category-utf-8-sig"));
10381   ASET (Vcoding_category_table, coding_category_utf_16_be,
10382         intern_c_string ("coding-category-utf-16-be"));
10383   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10384         intern_c_string ("coding-category-utf-16-auto"));
10385   ASET (Vcoding_category_table, coding_category_utf_16_le,
10386         intern_c_string ("coding-category-utf-16-le"));
10387   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10388         intern_c_string ("coding-category-utf-16-be-nosig"));
10389   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10390         intern_c_string ("coding-category-utf-16-le-nosig"));
10391   ASET (Vcoding_category_table, coding_category_charset,
10392         intern_c_string ("coding-category-charset"));
10393   ASET (Vcoding_category_table, coding_category_sjis,
10394         intern_c_string ("coding-category-sjis"));
10395   ASET (Vcoding_category_table, coding_category_big5,
10396         intern_c_string ("coding-category-big5"));
10397   ASET (Vcoding_category_table, coding_category_ccl,
10398         intern_c_string ("coding-category-ccl"));
10399   ASET (Vcoding_category_table, coding_category_emacs_mule,
10400         intern_c_string ("coding-category-emacs-mule"));
10401   /* Followings are NOT target of code detection.  */
10402   ASET (Vcoding_category_table, coding_category_raw_text,
10403         intern_c_string ("coding-category-raw-text"));
10404   ASET (Vcoding_category_table, coding_category_undecided,
10405         intern_c_string ("coding-category-undecided"));
10406
10407   DEFSYM (Qinsufficient_source, "insufficient-source");
10408   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10409   DEFSYM (Qinvalid_source, "invalid-source");
10410   DEFSYM (Qinterrupted, "interrupted");
10411   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10412   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10413
10414   defsubr (&Scoding_system_p);
10415   defsubr (&Sread_coding_system);
10416   defsubr (&Sread_non_nil_coding_system);
10417   defsubr (&Scheck_coding_system);
10418   defsubr (&Sdetect_coding_region);
10419   defsubr (&Sdetect_coding_string);
10420   defsubr (&Sfind_coding_systems_region_internal);
10421   defsubr (&Sunencodable_char_position);
10422   defsubr (&Scheck_coding_systems_region);
10423   defsubr (&Sdecode_coding_region);
10424   defsubr (&Sencode_coding_region);
10425   defsubr (&Sdecode_coding_string);
10426   defsubr (&Sencode_coding_string);
10427   defsubr (&Sdecode_sjis_char);
10428   defsubr (&Sencode_sjis_char);
10429   defsubr (&Sdecode_big5_char);
10430   defsubr (&Sencode_big5_char);
10431   defsubr (&Sset_terminal_coding_system_internal);
10432   defsubr (&Sset_safe_terminal_coding_system_internal);
10433   defsubr (&Sterminal_coding_system);
10434   defsubr (&Sset_keyboard_coding_system_internal);
10435   defsubr (&Skeyboard_coding_system);
10436   defsubr (&Sfind_operation_coding_system);
10437   defsubr (&Sset_coding_system_priority);
10438   defsubr (&Sdefine_coding_system_internal);
10439   defsubr (&Sdefine_coding_system_alias);
10440   defsubr (&Scoding_system_put);
10441   defsubr (&Scoding_system_base);
10442   defsubr (&Scoding_system_plist);
10443   defsubr (&Scoding_system_aliases);
10444   defsubr (&Scoding_system_eol_type);
10445   defsubr (&Scoding_system_priority_list);
10446
10447   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10448                doc: /* List of coding systems.
10449
10450 Do not alter the value of this variable manually.  This variable should be
10451 updated by the functions `define-coding-system' and
10452 `define-coding-system-alias'.  */);
10453   Vcoding_system_list = Qnil;
10454
10455   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10456                doc: /* Alist of coding system names.
10457 Each element is one element list of coding system name.
10458 This variable is given to `completing-read' as COLLECTION argument.
10459
10460 Do not alter the value of this variable manually.  This variable should be
10461 updated by the functions `make-coding-system' and
10462 `define-coding-system-alias'.  */);
10463   Vcoding_system_alist = Qnil;
10464
10465   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10466                doc: /* List of coding-categories (symbols) ordered by priority.
10467
10468 On detecting a coding system, Emacs tries code detection algorithms
10469 associated with each coding-category one by one in this order.  When
10470 one algorithm agrees with a byte sequence of source text, the coding
10471 system bound to the corresponding coding-category is selected.
10472
10473 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10474   {
10475     int i;
10476
10477     Vcoding_category_list = Qnil;
10478     for (i = coding_category_max - 1; i >= 0; i--)
10479       Vcoding_category_list
10480         = Fcons (AREF (Vcoding_category_table, i),
10481                  Vcoding_category_list);
10482   }
10483
10484   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10485                doc: /* Specify the coding system for read operations.
10486 It is useful to bind this variable with `let', but do not set it globally.
10487 If the value is a coding system, it is used for decoding on read operation.
10488 If not, an appropriate element is used from one of the coding system alists.
10489 There are three such tables: `file-coding-system-alist',
10490 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10491   Vcoding_system_for_read = Qnil;
10492
10493   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10494                doc: /* Specify the coding system for write operations.
10495 Programs bind this variable with `let', but you should not set it globally.
10496 If the value is a coding system, it is used for encoding of output,
10497 when writing it to a file and when sending it to a file or subprocess.
10498
10499 If this does not specify a coding system, an appropriate element
10500 is used from one of the coding system alists.
10501 There are three such tables: `file-coding-system-alist',
10502 `process-coding-system-alist', and `network-coding-system-alist'.
10503 For output to files, if the above procedure does not specify a coding system,
10504 the value of `buffer-file-coding-system' is used.  */);
10505   Vcoding_system_for_write = Qnil;
10506
10507   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10508                doc: /*
10509 Coding system used in the latest file or process I/O.  */);
10510   Vlast_coding_system_used = Qnil;
10511
10512   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10513                doc: /*
10514 Error status of the last code conversion.
10515
10516 When an error was detected in the last code conversion, this variable
10517 is set to one of the following symbols.
10518   `insufficient-source'
10519   `inconsistent-eol'
10520   `invalid-source'
10521   `interrupted'
10522   `insufficient-memory'
10523 When no error was detected, the value doesn't change.  So, to check
10524 the error status of a code conversion by this variable, you must
10525 explicitly set this variable to nil before performing code
10526 conversion.  */);
10527   Vlast_code_conversion_error = Qnil;
10528
10529   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10530                doc: /*
10531 *Non-nil means always inhibit code conversion of end-of-line format.
10532 See info node `Coding Systems' and info node `Text and Binary' concerning
10533 such conversion.  */);
10534   inhibit_eol_conversion = 0;
10535
10536   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10537                doc: /*
10538 Non-nil means process buffer inherits coding system of process output.
10539 Bind it to t if the process output is to be treated as if it were a file
10540 read from some filesystem.  */);
10541   inherit_process_coding_system = 0;
10542
10543   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10544                doc: /*
10545 Alist to decide a coding system to use for a file I/O operation.
10546 The format is ((PATTERN . VAL) ...),
10547 where PATTERN is a regular expression matching a file name,
10548 VAL is a coding system, a cons of coding systems, or a function symbol.
10549 If VAL is a coding system, it is used for both decoding and encoding
10550 the file contents.
10551 If VAL is a cons of coding systems, the car part is used for decoding,
10552 and the cdr part is used for encoding.
10553 If VAL is a function symbol, the function must return a coding system
10554 or a cons of coding systems which are used as above.  The function is
10555 called with an argument that is a list of the arguments with which
10556 `find-operation-coding-system' was called.  If the function can't decide
10557 a coding system, it can return `undecided' so that the normal
10558 code-detection is performed.
10559
10560 See also the function `find-operation-coding-system'
10561 and the variable `auto-coding-alist'.  */);
10562   Vfile_coding_system_alist = Qnil;
10563
10564   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10565                doc: /*
10566 Alist to decide a coding system to use for a process I/O operation.
10567 The format is ((PATTERN . VAL) ...),
10568 where PATTERN is a regular expression matching a program name,
10569 VAL is a coding system, a cons of coding systems, or a function symbol.
10570 If VAL is a coding system, it is used for both decoding what received
10571 from the program and encoding what sent to the program.
10572 If VAL is a cons of coding systems, the car part is used for decoding,
10573 and the cdr part is used for encoding.
10574 If VAL is a function symbol, the function must return a coding system
10575 or a cons of coding systems which are used as above.
10576
10577 See also the function `find-operation-coding-system'.  */);
10578   Vprocess_coding_system_alist = Qnil;
10579
10580   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10581                doc: /*
10582 Alist to decide a coding system to use for a network I/O operation.
10583 The format is ((PATTERN . VAL) ...),
10584 where PATTERN is a regular expression matching a network service name
10585 or is a port number to connect to,
10586 VAL is a coding system, a cons of coding systems, or a function symbol.
10587 If VAL is a coding system, it is used for both decoding what received
10588 from the network stream and encoding what sent to the network stream.
10589 If VAL is a cons of coding systems, the car part is used for decoding,
10590 and the cdr part is used for encoding.
10591 If VAL is a function symbol, the function must return a coding system
10592 or a cons of coding systems which are used as above.
10593
10594 See also the function `find-operation-coding-system'.  */);
10595   Vnetwork_coding_system_alist = Qnil;
10596
10597   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10598                doc: /* Coding system to use with system messages.
10599 Also used for decoding keyboard input on X Window system.  */);
10600   Vlocale_coding_system = Qnil;
10601
10602   /* The eol mnemonics are reset in startup.el system-dependently.  */
10603   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10604                doc: /*
10605 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10606   eol_mnemonic_unix = build_pure_c_string (":");
10607
10608   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10609                doc: /*
10610 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10611   eol_mnemonic_dos = build_pure_c_string ("\\");
10612
10613   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10614                doc: /*
10615 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10616   eol_mnemonic_mac = build_pure_c_string ("/");
10617
10618   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10619                doc: /*
10620 *String displayed in mode line when end-of-line format is not yet determined.  */);
10621   eol_mnemonic_undecided = build_pure_c_string (":");
10622
10623   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10624                doc: /*
10625 *Non-nil enables character translation while encoding and decoding.  */);
10626   Venable_character_translation = Qt;
10627
10628   DEFVAR_LISP ("standard-translation-table-for-decode",
10629                Vstandard_translation_table_for_decode,
10630                doc: /* Table for translating characters while decoding.  */);
10631   Vstandard_translation_table_for_decode = Qnil;
10632
10633   DEFVAR_LISP ("standard-translation-table-for-encode",
10634                Vstandard_translation_table_for_encode,
10635                doc: /* Table for translating characters while encoding.  */);
10636   Vstandard_translation_table_for_encode = Qnil;
10637
10638   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10639                doc: /* Alist of charsets vs revision numbers.
10640 While encoding, if a charset (car part of an element) is found,
10641 designate it with the escape sequence identifying revision (cdr part
10642 of the element).  */);
10643   Vcharset_revision_table = Qnil;
10644
10645   DEFVAR_LISP ("default-process-coding-system",
10646                Vdefault_process_coding_system,
10647                doc: /* Cons of coding systems used for process I/O by default.
10648 The car part is used for decoding a process output,
10649 the cdr part is used for encoding a text to be sent to a process.  */);
10650   Vdefault_process_coding_system = Qnil;
10651
10652   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10653                doc: /*
10654 Table of extra Latin codes in the range 128..159 (inclusive).
10655 This is a vector of length 256.
10656 If Nth element is non-nil, the existence of code N in a file
10657 \(or output of subprocess) doesn't prevent it to be detected as
10658 a coding system of ISO 2022 variant which has a flag
10659 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10660 or reading output of a subprocess.
10661 Only 128th through 159th elements have a meaning.  */);
10662   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10663
10664   DEFVAR_LISP ("select-safe-coding-system-function",
10665                Vselect_safe_coding_system_function,
10666                doc: /*
10667 Function to call to select safe coding system for encoding a text.
10668
10669 If set, this function is called to force a user to select a proper
10670 coding system which can encode the text in the case that a default
10671 coding system used in each operation can't encode the text.  The
10672 function should take care that the buffer is not modified while
10673 the coding system is being selected.
10674
10675 The default value is `select-safe-coding-system' (which see).  */);
10676   Vselect_safe_coding_system_function = Qnil;
10677
10678   DEFVAR_BOOL ("coding-system-require-warning",
10679                coding_system_require_warning,
10680                doc: /* Internal use only.
10681 If non-nil, on writing a file, `select-safe-coding-system-function' is
10682 called even if `coding-system-for-write' is non-nil.  The command
10683 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10684   coding_system_require_warning = 0;
10685
10686
10687   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10688                inhibit_iso_escape_detection,
10689                doc: /*
10690 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10691
10692 When Emacs reads text, it tries to detect how the text is encoded.
10693 This code detection is sensitive to escape sequences.  If Emacs sees
10694 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10695 of the ISO2022 encodings, and decodes text by the corresponding coding
10696 system (e.g. `iso-2022-7bit').
10697
10698 However, there may be a case that you want to read escape sequences in
10699 a file as is.  In such a case, you can set this variable to non-nil.
10700 Then the code detection will ignore any escape sequences, and no text is
10701 detected as encoded in some ISO-2022 encoding.  The result is that all
10702 escape sequences become visible in a buffer.
10703
10704 The default value is nil, and it is strongly recommended not to change
10705 it.  That is because many Emacs Lisp source files that contain
10706 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10707 in Emacs's distribution, and they won't be decoded correctly on
10708 reading if you suppress escape sequence detection.
10709
10710 The other way to read escape sequences in a file without decoding is
10711 to explicitly specify some coding system that doesn't use ISO-2022
10712 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10713   inhibit_iso_escape_detection = 0;
10714
10715   DEFVAR_BOOL ("inhibit-null-byte-detection",
10716                inhibit_null_byte_detection,
10717                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10718 By default, Emacs treats it as binary data, and does not attempt to
10719 decode it.  The effect is as if you specified `no-conversion' for
10720 reading that text.
10721
10722 Set this to non-nil when a regular text happens to include null bytes.
10723 Examples are Index nodes of Info files and null-byte delimited output
10724 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10725 decode text as usual.  */);
10726   inhibit_null_byte_detection = 0;
10727
10728   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10729                doc: /* Char table for translating self-inserting characters.
10730 This is applied to the result of input methods, not their input.
10731 See also `keyboard-translate-table'.
10732
10733 Use of this variable for character code unification was rendered
10734 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10735 internal character representation.  */);
10736     Vtranslation_table_for_input = Qnil;
10737
10738   {
10739     Lisp_Object args[coding_arg_max];
10740     Lisp_Object plist[16];
10741     int i;
10742
10743     for (i = 0; i < coding_arg_max; i++)
10744       args[i] = Qnil;
10745
10746     plist[0] = intern_c_string (":name");
10747     plist[1] = args[coding_arg_name] = Qno_conversion;
10748     plist[2] = intern_c_string (":mnemonic");
10749     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10750     plist[4] = intern_c_string (":coding-type");
10751     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10752     plist[6] = intern_c_string (":ascii-compatible-p");
10753     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10754     plist[8] = intern_c_string (":default-char");
10755     plist[9] = args[coding_arg_default_char] = make_number (0);
10756     plist[10] = intern_c_string (":for-unibyte");
10757     plist[11] = args[coding_arg_for_unibyte] = Qt;
10758     plist[12] = intern_c_string (":docstring");
10759     plist[13] = build_pure_c_string ("Do no conversion.\n\
10760 \n\
10761 When you visit a file with this coding, the file is read into a\n\
10762 unibyte buffer as is, thus each byte of a file is treated as a\n\
10763 character.");
10764     plist[14] = intern_c_string (":eol-type");
10765     plist[15] = args[coding_arg_eol_type] = Qunix;
10766     args[coding_arg_plist] = Flist (16, plist);
10767     Fdefine_coding_system_internal (coding_arg_max, args);
10768
10769     plist[1] = args[coding_arg_name] = Qundecided;
10770     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10771     plist[5] = args[coding_arg_coding_type] = Qundecided;
10772     /* This is already set.
10773        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10774     plist[8] = intern_c_string (":charset-list");
10775     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10776     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10777     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10778     plist[15] = args[coding_arg_eol_type] = Qnil;
10779     args[coding_arg_plist] = Flist (16, plist);
10780     Fdefine_coding_system_internal (coding_arg_max, args);
10781   }
10782
10783   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10784
10785   {
10786     int i;
10787
10788     for (i = 0; i < coding_category_max; i++)
10789       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10790   }
10791 #if defined (DOS_NT)
10792   system_eol_type = Qdos;
10793 #else
10794   system_eol_type = Qunix;
10795 #endif
10796   staticpro (&system_eol_type);
10797 }
10798
10799 char *
10800 emacs_strerror (int error_number)
10801 {
10802   char *str;
10803
10804   synchronize_system_messages_locale ();
10805   str = strerror (error_number);
10806
10807   if (! NILP (Vlocale_coding_system))
10808     {
10809       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10810                                                       Vlocale_coding_system,
10811                                                       0);
10812       str = SSDATA (dec);
10813     }
10814
10815   return str;
10816 }
10817
10818 #endif /* emacs */