src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from a coding system symbol to its attributes vector is done by
  62   looking up Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on the operating system.
 123   For instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is a two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...; /* Other locals, e.g. C and the SRC_BASE used by ONE_MORE_BYTE.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* A byte that cannot appear in XXX rejects the category.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX; /* Set only on positive evidence.  */
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode C and store the result at *CHARBUF++.  */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and destination area
 255   overlap, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST and PRODUCED_CHARS.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "character.h"
 292 #include "buffer.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overwrite
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories' */
 519
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* This value is returned if detect_coding_mask () finds nothing other
 570    than ASCII characters.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed by SRC which ends
 653    at SRC_END, and set C to that byte.  If there are not enough bytes
 654    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP and a
 655    multibyte character (other than a two-byte form led by 0xC0 or 0xC1,
 656    which is combined with its next byte into one value) is found at SRC,
 657    set C to the negative value of the character code.  The caller should
 658    declare and set: src, src_end, src_base, multibytep, consumed_chars, coding */
 659
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to 'no_more_source'.  If MULTIBYTEP and a multibyte
 689    character not led by 0xC0 or 0xC1 is found for C2, set C2 to -1 (its
 690    trailing bytes are not skipped).  The caller should declare and set
 691    these variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed by DST and increment DST to the
 726    next free point, and increment PRODUCED_CHARS.  The caller should
 727    assure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 750    store in an appropriate multibyte form (a byte >= 0x80 is first
 751    converted by BYTE8_TO_CHAR).  The caller should declare and set the
 752    variables `dst', `multibytep' and `produced_chars' in advance.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
 809 static void
 810 record_conversion_result (struct coding_system *coding,
 811                           enum coding_result_code result)
 812 {
 813   coding->result = result;
 814   switch (result)
 815     {
 816     case CODING_RESULT_INSUFFICIENT_SRC:
 817       Vlast_code_conversion_error = Qinsufficient_source;
 818       break;
 819     case CODING_RESULT_INCONSISTENT_EOL:
 820       Vlast_code_conversion_error = Qinconsistent_eol;
 821       break;
 822     case CODING_RESULT_INVALID_SRC:
 823       Vlast_code_conversion_error = Qinvalid_source;
 824       break;
 825     case CODING_RESULT_INTERRUPT:
 826       Vlast_code_conversion_error = Qinterrupted;
 827       break;
 828     case CODING_RESULT_INSUFFICIENT_MEM:
 829       Vlast_code_conversion_error = Qinsufficient_memory;
 830       break;
 831     case CODING_RESULT_INSUFFICIENT_DST:
 832       /* Don't record this error in Vlast_code_conversion_error
 833          because it happens just temporarily and is resolved when the
 834          whole conversion is finished.  */
 835       break;
 836     case CODING_RESULT_SUCCESS:
 837       break;
 838     default:
 839       Vlast_code_conversion_error = intern ("Unknown error");
 840     }
 841 }
 842
 843 /* These wrapper macros are used to preserve validity of pointers into
 844    buffer text across calls to decode_char, encode_char, etc, which
 845    could cause relocation of buffers if it loads a charset map,
 846    because loading a charset map allocates large structures.  */
 847
 848 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 849   do {                                                                       \
 850     ptrdiff_t offset;                                                        \
 851                                                                              \
 852     charset_map_loaded = 0;                                                  \
 853     c = DECODE_CHAR (charset, code);                                         \
 854     if (charset_map_loaded                                                   \
 855         && (offset = coding_change_source (coding)))                         \
 856       {                                                                      \
 857         src += offset;                                                       \
 858         src_base += offset;                                                  \
 859         src_end += offset;                                                   \
 860       }                                                                      \
 861   } while (0)
 862
 863 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 864   do {                                                                  \
 865     ptrdiff_t offset;                                                   \
 866                                                                         \
 867     charset_map_loaded = 0;                                             \
 868     code = ENCODE_CHAR (charset, c);                                    \
 869     if (charset_map_loaded                                              \
 870         && (offset = coding_change_destination (coding)))               \
 871       {                                                                 \
 872         dst += offset;                                                  \
 873         dst_end += offset;                                              \
 874       }                                                                 \
 875   } while (0)
 876
 877 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 878   do {                                                                  \
 879     ptrdiff_t offset;                                                   \
 880                                                                         \
 881     charset_map_loaded = 0;                                             \
 882     charset = char_charset (c, charset_list, code_return);              \
 883     if (charset_map_loaded                                              \
 884         && (offset = coding_change_destination (coding)))               \
 885       {                                                                 \
 886         dst += offset;                                                  \
 887         dst_end += offset;                                              \
 888       }                                                                 \
 889   } while (0)
 890
 891 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 892   do {                                                                  \
 893     ptrdiff_t offset;                                                   \
 894                                                                         \
 895     charset_map_loaded = 0;                                             \
 896     result = CHAR_CHARSET_P (c, charset);                               \
 897     if (charset_map_loaded                                              \
 898         && (offset = coding_change_destination (coding)))               \
 899       {                                                                 \
 900         dst += offset;                                                  \
 901         dst_end += offset;                                              \
 902       }                                                                 \
 903   } while (0)
 904
 905
 906 /* If there are at least BYTES length of room at dst, allocate memory
 907    for coding->destination and update dst and dst_end.  We don't have
 908    to take care of coding->source which will be relocated.  It is
 909    handled by calling coding_set_source in encode_coding.  */
 910
 911 #define ASSURE_DESTINATION(bytes)                               \
 912   do {                                                          \
 913     if (dst + (bytes) >= dst_end)                               \
 914       {                                                         \
 915         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 916                                                                 \
 917         dst = alloc_destination (coding, more_bytes, dst);      \
 918         dst_end = coding->destination + coding->dst_bytes;      \
 919       }                                                         \
 920   } while (0)
 921
 922
 923 /* Store multibyte form of the character C in P, and advance P to the
 924    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 925    never calls MAYBE_UNIFY_CHAR.  */
 926
 927 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 928   do {                                          \
 929     if ((c) <= MAX_1_BYTE_CHAR)                 \
 930       *(p)++ = (c);                             \
 931     else if ((c) <= MAX_2_BYTE_CHAR)            \
 932       *(p)++ = (0xC0 | ((c) >> 6)),             \
 933         *(p)++ = (0x80 | ((c) & 0x3F));         \
 934     else if ((c) <= MAX_3_BYTE_CHAR)            \
 935       *(p)++ = (0xE0 | ((c) >> 12)),            \
 936         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 937         *(p)++ = (0x80 | ((c) & 0x3F));         \
 938     else if ((c) <= MAX_4_BYTE_CHAR)            \
 939       *(p)++ = (0xF0 | (c >> 18)),              \
 940         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 941         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 942         *(p)++ = (0x80 | (c & 0x3F));           \
 943     else if ((c) <= MAX_5_BYTE_CHAR)            \
 944       *(p)++ = 0xF8,                            \
 945         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 946         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 947         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 948         *(p)++ = (0x80 | (c & 0x3F));           \
 949     else                                        \
 950       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 951   } while (0)
 952
 953
 954 /* Return the character code of character whose multibyte form is at
 955    P, and advance P to the end of the multibyte form.  This is like
 956    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 957
 958 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 959   (!((p)[0] & 0x80)                                             \
 960    ? *(p)++                                                     \
 961    : ! ((p)[0] & 0x20)                                          \
 962    ? ((p) += 2,                                                 \
 963       ((((p)[-2] & 0x1F) << 6)                                  \
 964        | ((p)[-1] & 0x3F)                                       \
 965        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 966    : ! ((p)[0] & 0x10)                                          \
 967    ? ((p) += 3,                                                 \
 968       ((((p)[-3] & 0x0F) << 12)                                 \
 969        | (((p)[-2] & 0x3F) << 6)                                \
 970        | ((p)[-1] & 0x3F)))                                     \
 971    : ! ((p)[0] & 0x08)                                          \
 972    ? ((p) += 4,                                                 \
 973       ((((p)[-4] & 0xF) << 18)                                  \
 974        | (((p)[-3] & 0x3F) << 12)                               \
 975        | (((p)[-2] & 0x3F) << 6)                                \
 976        | ((p)[-1] & 0x3F)))                                     \
 977    : ((p) += 5,                                                 \
 978       ((((p)[-4] & 0x3F) << 18)                                 \
 979        | (((p)[-3] & 0x3F) << 12)                               \
 980        | (((p)[-2] & 0x3F) << 6)                                \
 981        | ((p)[-1] & 0x3F))))
 982
 983
 984 /* Set coding->source from coding->src_object.  */
 985
 986 static void
 987 coding_set_source (struct coding_system *coding)
 988 {
 989   if (BUFFERP (coding->src_object))
 990     {
 991       struct buffer *buf = XBUFFER (coding->src_object);
 992
 993       if (coding->src_pos < 0)
 994         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 995       else
 996         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 997     }
 998   else if (STRINGP (coding->src_object))
 999     {
1000       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1001     }
1002   else
1003     {
1004       /* Otherwise, the source is C string and is never relocated
1005          automatically.  Thus we don't have to update anything.  */
1006     }
1007 }
1008
1009
1010 /* Set coding->source from coding->src_object, and return how many
1011    bytes coding->source was changed.  */
1012
1013 static ptrdiff_t
1014 coding_change_source (struct coding_system *coding)
1015 {
1016   const unsigned char *orig = coding->source;
1017   coding_set_source (coding);
1018   return coding->source - orig;
1019 }
1020
1021
1022 /* Set coding->destination from coding->dst_object.  */
1023
1024 static void
1025 coding_set_destination (struct coding_system *coding)
1026 {
1027   if (BUFFERP (coding->dst_object))
1028     {
1029       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1030         {
1031           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1032           coding->dst_bytes = (GAP_END_ADDR
1033                                - (coding->src_bytes - coding->consumed)
1034                                - coding->destination);
1035         }
1036       else
1037         {
1038           /* We are sure that coding->dst_pos_byte is before the gap
1039              of the buffer. */
1040           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1041                                  + coding->dst_pos_byte - BEG_BYTE);
1042           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1043                                - coding->destination);
1044         }
1045     }
1046   else
1047     {
1048       /* Otherwise, the destination is C string and is never relocated
1049          automatically.  Thus we don't have to update anything.  */
1050     }
1051 }
1052
1053
1054 /* Set coding->destination from coding->dst_object, and return how
1055    many bytes coding->destination was changed.  */
1056
1057 static ptrdiff_t
1058 coding_change_destination (struct coding_system *coding)
1059 {
1060   const unsigned char *orig = coding->destination;
1061   coding_set_destination (coding);
1062   return coding->destination - orig;
1063 }
1064
1065
1066 static void
1067 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1068 {
1069   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1070     string_overflow ();
1071   coding->destination = xrealloc (coding->destination,
1072                                   coding->dst_bytes + bytes);
1073   coding->dst_bytes += bytes;
1074 }
1075
1076 static void
1077 coding_alloc_by_making_gap (struct coding_system *coding,
1078                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1079 {
1080   if (EQ (coding->src_object, coding->dst_object))
1081     {
1082       /* The gap may contain the produced data at the head and not-yet
1083          consumed data at the tail.  To preserve those data, we at
1084          first make the gap size to zero, then increase the gap
1085          size.  */
1086       ptrdiff_t add = GAP_SIZE;
1087
1088       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1089       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1090       make_gap (bytes);
1091       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1092       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1093     }
1094   else
1095     {
1096       Lisp_Object this_buffer;
1097
1098       this_buffer = Fcurrent_buffer ();
1099       set_buffer_internal (XBUFFER (coding->dst_object));
1100       make_gap (bytes);
1101       set_buffer_internal (XBUFFER (this_buffer));
1102     }
1103 }
1104
1105
1106 static unsigned char *
1107 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1108                    unsigned char *dst)
1109 {
1110   ptrdiff_t offset = dst - coding->destination;
1111
1112   if (BUFFERP (coding->dst_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->dst_object);
1115
1116       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1117     }
1118   else
1119     coding_alloc_by_realloc (coding, nbytes);
1120   coding_set_destination (coding);
1121   dst = coding->destination + offset;
1122   return dst;
1123 }
1124
1125 /** Macros for annotations.  */
1126
1127 /* An annotation data is stored in the array coding->charbuf in this
1128    format:
1129      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1130    LENGTH is the number of elements in the annotation.
1131    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1132    NCHARS is the number of characters in the text annotated.
1133
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1138      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1139
1140    NBYTES is the number of bytes specified in the header part of
1141    old-style emacs-mule encoding, or 0 for the other kind of
1142    composition.
1143
1144    METHOD is one of enum composition_method.
1145
1146    Optional COMPOSITION-COMPONENTS are characters and composition
1147    rules.
1148
1149    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1150    follows.
1151
1152    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1153    recover from an invalid annotation, and should be skipped by
1154    produce_annotation.  */
1155
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Write the three common header elements of an annotation at BUF
   (-LEN, then MASK, then NCHARS), advance BUF past them, and mark
   CODING as annotated.  The caller must have the variable `coding'
   in scope.  BUF must be a modifiable pointer lvalue.

   The expansion deliberately does not end in a semicolon, so that the
   macro behaves as a single statement and can be used in an unbraced
   if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1166
/* Write the header of a composition annotation at BUF and advance
   BUF past it: the common header with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   Every argument is parenthesized so that BUF may be any modifiable
   pointer lvalue expression, consistently with ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1173
1174
/* Write a charset annotation at BUF and advance BUF past it: the
   common header with length 4 and CODING_ANNOTATE_CHARSET_MASK,
   followed by the charset ID.  Every argument is parenthesized so
   that BUF may be any modifiable pointer lvalue expression,
   consistently with ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1180
1181 \f
1182 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1183
1184
1185
1186 \f
1187 /*** 3. UTF-8 ***/
1188
1189 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1190    Return true if a text is encoded in UTF-8.  */
1191
/* Classify the octet C by its high bits.  */

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c) \
  ((c) <= 0x7F)
/* 10xxxxxx: a trailing octet of a longer sequence.  */
#define UTF_8_EXTRA_OCTET_P(c) \
  (0x80 == ((c) & 0xC0))
/* 110xxxxx: the first octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) \
  (0xC0 == ((c) & 0xE0))
/* 1110xxxx: the first octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) \
  (0xE0 == ((c) & 0xF0))
/* 11110xxx: the first octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) \
  (0xF0 == ((c) & 0xF8))
/* 111110xx: the first octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) \
  (0xF8 == ((c) & 0xFC))

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1202
1203 static bool
1204 detect_coding_utf_8 (struct coding_system *coding,
1205                      struct coding_detection_info *detect_info)
1206 {
1207   const unsigned char *src = coding->source, *src_base;
1208   const unsigned char *src_end = coding->source + coding->src_bytes;
1209   bool multibytep = coding->src_multibyte;
1210   ptrdiff_t consumed_chars = 0;
1211   bool bom_found = 0;
1212   bool found = 0;
1213
1214   detect_info->checked |= CATEGORY_MASK_UTF_8;
1215   /* A coding system of this category is always ASCII compatible.  */
1216   src += coding->head_ascii;
1217
1218   while (1)
1219     {
1220       int c, c1, c2, c3, c4;
1221
1222       src_base = src;
1223       ONE_MORE_BYTE (c);
1224       if (c < 0 || UTF_8_1_OCTET_P (c))
1225         continue;
1226       ONE_MORE_BYTE (c1);
1227       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1228         break;
1229       if (UTF_8_2_OCTET_LEADING_P (c))
1230         {
1231           found = 1;
1232           continue;
1233         }
1234       ONE_MORE_BYTE (c2);
1235       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1236         break;
1237       if (UTF_8_3_OCTET_LEADING_P (c))
1238         {
1239           found = 1;
1240           if (src_base == coding->source
1241               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1242             bom_found = 1;
1243           continue;
1244         }
1245       ONE_MORE_BYTE (c3);
1246       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1247         break;
1248       if (UTF_8_4_OCTET_LEADING_P (c))
1249         {
1250           found = 1;
1251           continue;
1252         }
1253       ONE_MORE_BYTE (c4);
1254       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1255         break;
1256       if (UTF_8_5_OCTET_LEADING_P (c))
1257         {
1258           found = 1;
1259           continue;
1260         }
1261       break;
1262     }
1263   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1264   return 0;
1265
1266  no_more_source:
1267   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1268     {
1269       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1270       return 0;
1271     }
1272   if (bom_found)
1273     {
1274       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1275       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1276     }
1277   else
1278     {
1279       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1280       if (found)
1281         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1282     }
1283   return 1;
1284 }
1285
1286
1287 static void
1288 decode_coding_utf_8 (struct coding_system *coding)
1289 {
1290   const unsigned char *src = coding->source + coding->consumed;
1291   const unsigned char *src_end = coding->source + coding->src_bytes;
1292   const unsigned char *src_base;
1293   int *charbuf = coding->charbuf + coding->charbuf_used;
1294   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1295   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1296   bool multibytep = coding->src_multibyte;
1297   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1298   bool eol_dos
1299     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1300   int byte_after_cr = -1;
1301
1302   if (bom != utf_without_bom)
1303     {
1304       int c1, c2, c3;
1305
1306       src_base = src;
1307       ONE_MORE_BYTE (c1);
1308       if (! UTF_8_3_OCTET_LEADING_P (c1))
1309         src = src_base;
1310       else
1311         {
1312           ONE_MORE_BYTE (c2);
1313           if (! UTF_8_EXTRA_OCTET_P (c2))
1314             src = src_base;
1315           else
1316             {
1317               ONE_MORE_BYTE (c3);
1318               if (! UTF_8_EXTRA_OCTET_P (c3))
1319                 src = src_base;
1320               else
1321                 {
1322                   if ((c1 != UTF_8_BOM_1)
1323                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1324                     src = src_base;
1325                   else
1326                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1327                 }
1328             }
1329         }
1330     }
1331   CODING_UTF_8_BOM (coding) = utf_without_bom;
1332
1333   while (1)
1334     {
1335       int c, c1, c2, c3, c4, c5;
1336
1337       src_base = src;
1338       consumed_chars_base = consumed_chars;
1339
1340       if (charbuf >= charbuf_end)
1341         {
1342           if (byte_after_cr >= 0)
1343             src_base--;
1344           break;
1345         }
1346
1347       if (byte_after_cr >= 0)
1348         c1 = byte_after_cr, byte_after_cr = -1;
1349       else
1350         ONE_MORE_BYTE (c1);
1351       if (c1 < 0)
1352         {
1353           c = - c1;
1354         }
1355       else if (UTF_8_1_OCTET_P (c1))
1356         {
1357           if (eol_dos && c1 == '\r')
1358             ONE_MORE_BYTE (byte_after_cr);
1359           c = c1;
1360         }
1361       else
1362         {
1363           ONE_MORE_BYTE (c2);
1364           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1365             goto invalid_code;
1366           if (UTF_8_2_OCTET_LEADING_P (c1))
1367             {
1368               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1369               /* Reject overlong sequences here and below.  Encoders
1370                  producing them are incorrect, they can be misleading,
1371                  and they mess up read/write invariance.  */
1372               if (c < 128)
1373                 goto invalid_code;
1374             }
1375           else
1376             {
1377               ONE_MORE_BYTE (c3);
1378               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1379                 goto invalid_code;
1380               if (UTF_8_3_OCTET_LEADING_P (c1))
1381                 {
1382                   c = (((c1 & 0xF) << 12)
1383                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1384                   if (c < 0x800
1385                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1386                     goto invalid_code;
1387                 }
1388               else
1389                 {
1390                   ONE_MORE_BYTE (c4);
1391                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1392                     goto invalid_code;
1393                   if (UTF_8_4_OCTET_LEADING_P (c1))
1394                     {
1395                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1396                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1397                     if (c < 0x10000)
1398                       goto invalid_code;
1399                     }
1400                   else
1401                     {
1402                       ONE_MORE_BYTE (c5);
1403                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1404                         goto invalid_code;
1405                       if (UTF_8_5_OCTET_LEADING_P (c1))
1406                         {
1407                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1408                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1409                                | (c5 & 0x3F));
1410                           if ((c > MAX_CHAR) || (c < 0x200000))
1411                             goto invalid_code;
1412                         }
1413                       else
1414                         goto invalid_code;
1415                     }
1416                 }
1417             }
1418         }
1419
1420       *charbuf++ = c;
1421       continue;
1422
1423     invalid_code:
1424       src = src_base;
1425       consumed_chars = consumed_chars_base;
1426       ONE_MORE_BYTE (c);
1427       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1428       coding->errors++;
1429     }
1430
1431  no_more_source:
1432   coding->consumed_char += consumed_chars_base;
1433   coding->consumed = src_base - coding->source;
1434   coding->charbuf_used = charbuf - coding->charbuf;
1435 }
1436
1437
1438 static bool
1439 encode_coding_utf_8 (struct coding_system *coding)
1440 {
1441   bool multibytep = coding->dst_multibyte;
1442   int *charbuf = coding->charbuf;
1443   int *charbuf_end = charbuf + coding->charbuf_used;
1444   unsigned char *dst = coding->destination + coding->produced;
1445   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1446   ptrdiff_t produced_chars = 0;
1447   int c;
1448
1449   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1450     {
1451       ASSURE_DESTINATION (3);
1452       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1453       CODING_UTF_8_BOM (coding) = utf_without_bom;
1454     }
1455
1456   if (multibytep)
1457     {
1458       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1459
1460       while (charbuf < charbuf_end)
1461         {
1462           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1463
1464           ASSURE_DESTINATION (safe_room);
1465           c = *charbuf++;
1466           if (CHAR_BYTE8_P (c))
1467             {
1468               c = CHAR_TO_BYTE8 (c);
1469               EMIT_ONE_BYTE (c);
1470             }
1471           else
1472             {
1473               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1474               for (p = str; p < pend; p++)
1475                 EMIT_ONE_BYTE (*p);
1476             }
1477         }
1478     }
1479   else
1480     {
1481       int safe_room = MAX_MULTIBYTE_LENGTH;
1482
1483       while (charbuf < charbuf_end)
1484         {
1485           ASSURE_DESTINATION (safe_room);
1486           c = *charbuf++;
1487           if (CHAR_BYTE8_P (c))
1488             *dst++ = CHAR_TO_BYTE8 (c);
1489           else
1490             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1491           produced_chars++;
1492         }
1493     }
1494   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1495   coding->produced_char += produced_chars;
1496   coding->produced = dst - coding->destination;
1497   return 0;
1498 }
1499
1500
1501 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1502    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1503
/* Nonzero if bits 10..15 of VAL are those of a high surrogate
   (0xD800..0xDBFF).  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* Nonzero if bits 10..15 of VAL are those of a low surrogate
   (0xDC00..0xDFFF).  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
1509
1510
1511 static bool
1512 detect_coding_utf_16 (struct coding_system *coding,
1513                       struct coding_detection_info *detect_info)
1514 {
1515   const unsigned char *src = coding->source;
1516   const unsigned char *src_end = coding->source + coding->src_bytes;
1517   bool multibytep = coding->src_multibyte;
1518   int c1, c2;
1519
1520   detect_info->checked |= CATEGORY_MASK_UTF_16;
1521   if (coding->mode & CODING_MODE_LAST_BLOCK
1522       && (coding->src_chars & 1))
1523     {
1524       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1525       return 0;
1526     }
1527
1528   TWO_MORE_BYTES (c1, c2);
1529   if ((c1 == 0xFF) && (c2 == 0xFE))
1530     {
1531       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1532                              | CATEGORY_MASK_UTF_16_AUTO);
1533       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1534                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1535                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1536     }
1537   else if ((c1 == 0xFE) && (c2 == 0xFF))
1538     {
1539       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1540                              | CATEGORY_MASK_UTF_16_AUTO);
1541       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1542                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1543                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1544     }
1545   else if (c2 < 0)
1546     {
1547       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1548       return 0;
1549     }
1550   else
1551     {
1552       /* We check the dispersion of Eth and Oth bytes where E is even and
1553          O is odd.  If both are high, we assume binary data.*/
1554       unsigned char e[256], o[256];
1555       unsigned e_num = 1, o_num = 1;
1556
1557       memset (e, 0, 256);
1558       memset (o, 0, 256);
1559       e[c1] = 1;
1560       o[c2] = 1;
1561
1562       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1563                                 |CATEGORY_MASK_UTF_16_BE
1564                                 | CATEGORY_MASK_UTF_16_LE);
1565
1566       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1567              != CATEGORY_MASK_UTF_16)
1568         {
1569           TWO_MORE_BYTES (c1, c2);
1570           if (c2 < 0)
1571             break;
1572           if (! e[c1])
1573             {
1574               e[c1] = 1;
1575               e_num++;
1576               if (e_num >= 128)
1577                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1578             }
1579           if (! o[c2])
1580             {
1581               o[c2] = 1;
1582               o_num++;
1583               if (o_num >= 128)
1584                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1585             }
1586         }
1587       return 0;
1588     }
1589
1590  no_more_source:
1591   return 1;
1592 }
1593
1594 static void
1595 decode_coding_utf_16 (struct coding_system *coding)
1596 {
1597   const unsigned char *src = coding->source + coding->consumed;
1598   const unsigned char *src_end = coding->source + coding->src_bytes;
1599   const unsigned char *src_base;
1600   int *charbuf = coding->charbuf + coding->charbuf_used;
1601   /* We may produces at most 3 chars in one loop.  */
1602   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1603   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1604   bool multibytep = coding->src_multibyte;
1605   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1606   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1607   int surrogate = CODING_UTF_16_SURROGATE (coding);
1608   bool eol_dos
1609     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1610   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1611
1612   if (bom == utf_with_bom)
1613     {
1614       int c, c1, c2;
1615
1616       src_base = src;
1617       ONE_MORE_BYTE (c1);
1618       ONE_MORE_BYTE (c2);
1619       c = (c1 << 8) | c2;
1620
1621       if (endian == utf_16_big_endian
1622           ? c != 0xFEFF : c != 0xFFFE)
1623         {
1624           /* The first two bytes are not BOM.  Treat them as bytes
1625              for a normal character.  */
1626           src = src_base;
1627           coding->errors++;
1628         }
1629       CODING_UTF_16_BOM (coding) = utf_without_bom;
1630     }
1631   else if (bom == utf_detect_bom)
1632     {
1633       /* We have already tried to detect BOM and failed in
1634          detect_coding.  */
1635       CODING_UTF_16_BOM (coding) = utf_without_bom;
1636     }
1637
1638   while (1)
1639     {
1640       int c, c1, c2;
1641
1642       src_base = src;
1643       consumed_chars_base = consumed_chars;
1644
1645       if (charbuf >= charbuf_end)
1646         {
1647           if (byte_after_cr1 >= 0)
1648             src_base -= 2;
1649           break;
1650         }
1651
1652       if (byte_after_cr1 >= 0)
1653         c1 = byte_after_cr1, byte_after_cr1 = -1;
1654       else
1655         ONE_MORE_BYTE (c1);
1656       if (c1 < 0)
1657         {
1658           *charbuf++ = -c1;
1659           continue;
1660         }
1661       if (byte_after_cr2 >= 0)
1662         c2 = byte_after_cr2, byte_after_cr2 = -1;
1663       else
1664         ONE_MORE_BYTE (c2);
1665       if (c2 < 0)
1666         {
1667           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1668           *charbuf++ = -c2;
1669           continue;
1670         }
1671       c = (endian == utf_16_big_endian
1672            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1673
1674       if (surrogate)
1675         {
1676           if (! UTF_16_LOW_SURROGATE_P (c))
1677             {
1678               if (endian == utf_16_big_endian)
1679                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1680               else
1681                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1682               *charbuf++ = c1;
1683               *charbuf++ = c2;
1684               coding->errors++;
1685               if (UTF_16_HIGH_SURROGATE_P (c))
1686                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1687               else
1688                 *charbuf++ = c;
1689             }
1690           else
1691             {
1692               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1693               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1694               *charbuf++ = 0x10000 + c;
1695             }
1696         }
1697       else
1698         {
1699           if (UTF_16_HIGH_SURROGATE_P (c))
1700             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1701           else
1702             {
1703               if (eol_dos && c == '\r')
1704                 {
1705                   ONE_MORE_BYTE (byte_after_cr1);
1706                   ONE_MORE_BYTE (byte_after_cr2);
1707                 }
1708               *charbuf++ = c;
1709             }
1710         }
1711     }
1712
1713  no_more_source:
1714   coding->consumed_char += consumed_chars_base;
1715   coding->consumed = src_base - coding->source;
1716   coding->charbuf_used = charbuf - coding->charbuf;
1717 }
1718
1719 static bool
1720 encode_coding_utf_16 (struct coding_system *coding)
1721 {
1722   bool multibytep = coding->dst_multibyte;
1723   int *charbuf = coding->charbuf;
1724   int *charbuf_end = charbuf + coding->charbuf_used;
1725   unsigned char *dst = coding->destination + coding->produced;
1726   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1727   int safe_room = 8;
1728   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1729   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1730   ptrdiff_t produced_chars = 0;
1731   int c;
1732
1733   if (bom != utf_without_bom)
1734     {
1735       ASSURE_DESTINATION (safe_room);
1736       if (big_endian)
1737         EMIT_TWO_BYTES (0xFE, 0xFF);
1738       else
1739         EMIT_TWO_BYTES (0xFF, 0xFE);
1740       CODING_UTF_16_BOM (coding) = utf_without_bom;
1741     }
1742
1743   while (charbuf < charbuf_end)
1744     {
1745       ASSURE_DESTINATION (safe_room);
1746       c = *charbuf++;
1747       if (c > MAX_UNICODE_CHAR)
1748         c = coding->default_char;
1749
1750       if (c < 0x10000)
1751         {
1752           if (big_endian)
1753             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1754           else
1755             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1756         }
1757       else
1758         {
1759           int c1, c2;
1760
1761           c -= 0x10000;
1762           c1 = (c >> 10) + 0xD800;
1763           c2 = (c & 0x3FF) + 0xDC00;
1764           if (big_endian)
1765             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1766           else
1767             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1768         }
1769     }
1770   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1771   coding->produced = dst - coding->destination;
1772   coding->produced_char += produced_chars;
1773   return 0;
1774 }
1775
1776 \f
1777 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1778
1779 /* Emacs' internal format for representation of multiple character
1780    sets is a kind of multi-byte encoding, i.e. characters are
1781    represented by variable-length sequences of one-byte codes.
1782
1783    ASCII characters and control characters (e.g. `tab', `newline') are
1784    represented by one-byte sequences which are their ASCII codes, in
1785    the range 0x00 through 0x7F.
1786
1787    8-bit characters of the range 0x80..0x9F are represented by
1788    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1789    code + 0x20).
1790
1791    8-bit characters of the range 0xA0..0xFF are represented by
1792    one-byte sequences which are their 8-bit code.
1793
1794    The other characters are represented by a sequence of `base
1795    leading-code', optional `extended leading-code', and one or two
1796    `position-code's.  The length of the sequence is determined by the
1797    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1798    whereas extended leading-code and position-code take the range 0xA0
1799    through 0xFF.  See `charset.h' for more details about leading-code
1800    and position-code.
1801
1802    --- CODE RANGE of Emacs' internal format ---
1803    character set        range
1804    -------------        -----
1805    ascii                0x00..0x7F
1806    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1807    eight-bit-graphic    0xA0..0xFF
1808    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1809    ---------------------------------------------
1810
1811    As this is the internal character representation, the format is
1812    usually not used externally (i.e. in a file or in data sent to a
1813    process).  However, it is possible to have text stored externally
1814    in this format (i.e. by encoding with the coding system `emacs-mule').
1815
1816    In that case, a sequence of one-byte codes has a slightly different
1817    form.
1818
1819    At first, all characters in eight-bit-control are represented by
1820    one-byte sequences which are their 8-bit code.
1821
1822    Next, character composition data are represented by the byte
1823    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1824    where,
1825         METHOD is 0xF2 plus one of the composition methods (enum
1826         composition_method),
1827
1828         BYTES is 0xA0 plus a byte length of this composition data,
1829
1830         CHARS is 0xA0 plus a number of characters composed by this
1831         data,
1832
1833         COMPONENTs are characters in multibyte form or composition
1834         rules encoded as two bytes of ASCII codes.
1835
1836    In addition, for backward compatibility, the following formats are
1837    also recognized as composition data on decoding.
1838
1839    0x80 MSEQ ...
1840    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1841
1842    Here,
1843         MSEQ is a multibyte form, but in one of these special formats:
1844           ASCII: 0xA0 ASCII_CODE+0x80,
1845           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1846         RULE is a one byte code of the range 0xA0..0xF0 that
1847         represents a composition rule.
1848   */
1849
/* Table indexed by a byte value.  The emacs-mule detector and decoder
   in this file read emacs_mule_bytes[C] for a leading byte C and use
   it as the total byte length (1 through 4) of the sequence that C
   starts.
   NOTE(review): the table is filled in elsewhere in the file.  */
1850 char emacs_mule_bytes[256];
1851
1852
1853 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1854    Return true if a text is encoded in 'emacs-mule'.  */
1855
1856 static bool
1857 detect_coding_emacs_mule (struct coding_system *coding,
1858                           struct coding_detection_info *detect_info)
1859 {
1860   const unsigned char *src = coding->source, *src_base;
1861   const unsigned char *src_end = coding->source + coding->src_bytes;
1862   bool multibytep = coding->src_multibyte;
1863   ptrdiff_t consumed_chars = 0;
1864   int c;
  /* Category mask to report on success; stays 0 until at least one
     multibyte sequence or composition has been seen.  */
1865   int found = 0;
1866
1867   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1868   /* A coding system of this category is always ASCII compatible.  */
1869   src += coding->head_ascii;
1870
  /* Examine one character per iteration.  The loop is left by `break'
     when the text cannot be emacs-mule, or through the label
     `no_more_source' (NOTE(review): presumably jumped to by
     ONE_MORE_BYTE when the source is exhausted -- the macro is defined
     elsewhere).  */
1871   while (1)
1872     {
      /* Start of the current character; used at `no_more_source' to
         tell whether the source ended in the middle of a character.  */
1873       src_base = src;
1874       ONE_MORE_BYTE (c);
      /* A negative C is simply skipped.  */
1875       if (c < 0)
1876         continue;
1877       if (c == 0x80)
1878         {
1879           /* Perhaps the start of composite character.  We simply skip
1880              it because analyzing it is too heavy for detecting.  But,
1881              at least, we check that the composite character
1882              consists of more than 4 bytes.  */
1883           const unsigned char *src_start;
1884
1885         repeat:
1886           src_start = src;
          /* Skip the run of bytes >= 0xA0; C ends up holding the first
             byte after the run.  */
1887           do
1888             {
1889               ONE_MORE_BYTE (c);
1890             }
1891           while (c >= 0xA0);
1892
1893           if (src - src_start <= 4)
1894             break;
1895           found = CATEGORY_MASK_EMACS_MULE;
          /* Another composition follows immediately.  */
1896           if (c == 0x80)
1897             goto repeat;
1898         }
1899
      /* C is now either the byte read at the top of the loop or the
         byte that terminated a composition above.  */
1900       if (c < 0x80)
1901         {
          /* ESC, SI and SO disqualify the text.  */
1902           if (c < 0x20
1903               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1904             break;
1905         }
1906       else
1907         {
          /* Number of bytes that must follow the leading byte C.  */
1908           int more_bytes = emacs_mule_bytes[c] - 1;
1909
          /* Every following byte must be >= 0xA0.  */
1910           while (more_bytes > 0)
1911             {
1912               ONE_MORE_BYTE (c);
1913               if (c < 0xA0)
1914                 {
1915                   src--;        /* Unread the last byte.  */
1916                   break;
1917                 }
1918               more_bytes--;
1919             }
          /* Nonzero here means the sequence was cut short by a byte
             below 0xA0 (or the table entry was less than 1).  */
1920           if (more_bytes != 0)
1921             break;
1922           found = CATEGORY_MASK_EMACS_MULE;
1923         }
1924     }
  /* Reached only by `break': the text is not emacs-mule.  */
1925   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1926   return 0;
1927
1928  no_more_source:
  /* src_base < src means the source ended after part of a character
     had been read; in the last block that is a rejection.  */
1929   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1930     {
1931       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1932       return 0;
1933     }
1934   detect_info->found |= found;
1935   return 1;
1936 }
1937
1938
1939 /* Parse one emacs-mule multibyte sequence starting at SRC and return
   the decoded character.

   CODING supplies the end of the source (CODING->source +
   CODING->src_bytes) and is passed on to CODING_DECODE_CHAR.  On
   success, *NBYTES is set to the number of source bytes read and
   *NCHARS to the local consumed_chars counter (NOTE(review):
   presumably advanced by ONE_MORE_BYTE, which is defined elsewhere).
   If ID is non-null, *ID receives the id of the charset used.

   If CMP_STATUS indicates that we must expect MSEQ or RULE described
   above (an old-form composition is in progress), decode it and
   return the negative value of the decoded character or rule.  When a
   RULE byte is returned, *NBYTES and *NCHARS are set but *ID is not.

   If an invalid byte is found, return -1.  If SRC is too short,
   return -2.  In both cases *NBYTES, *NCHARS and *ID are left
   untouched.  */
1944
1945 static int
1946 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1947                  int *nbytes, int *nchars, int *id,
1948                  struct composition_status *cmp_status)
1949 {
1950   const unsigned char *src_end = coding->source + coding->src_bytes;
1951   const unsigned char *src_base = src;
1952   bool multibytep = coding->src_multibyte;
1953   int charset_ID;
1954   unsigned code;
1955   int c;
1956   int consumed_chars = 0;
  /* Set when C was read in the old-form MSEQ format; the result is
     then returned negated.  */
1957   bool mseq_found = 0;
1958
1959   ONE_MORE_BYTE (c);
1960   if (c < 0)
1961     {
      /* A negative C is returned as the character -C, attributed to
         the charset emacs_mule_charset[0].  */
1962       c = -c;
1963       charset_ID = emacs_mule_charset[0];
1964     }
1965   else
1966     {
1967       if (c >= 0xA0)
1968         {
          /* A byte >= 0xA0 is accepted as a leading byte only while an
             old-form composition is being decoded.  */
1969           if (cmp_status->state != COMPOSING_NO
1970               && cmp_status->old_form)
1971             {
1972               if (cmp_status->state == COMPOSING_CHAR)
1973                 {
                  /* MSEQ: 0xA0 is followed by ASCII_CODE + 0x80;
                     otherwise C is LEADING_CODE + 0x20.  */
1974                   if (c == 0xA0)
1975                     {
1976                       ONE_MORE_BYTE (c);
1977                       c -= 0x80;
1978                       if (c < 0)
1979                         goto invalid_code;
1980                     }
1981                   else
1982                     c -= 0x20;
1983                   mseq_found = 1;
1984                 }
1985               else
1986                 {
                  /* Not expecting a character, so C is a RULE byte;
                     hand it back negated without decoding.  */
1987                   *nbytes = src - src_base;
1988                   *nchars = consumed_chars;
1989                   return -c;
1990                 }
1991             }
1992           else
1993             goto invalid_code;
1994         }
1995
      /* Dispatch on the total length of the sequence that C starts.
         Every trailing position byte must be >= 0xA0 and contributes
         its low seven bits to CODE.  */
1996       switch (emacs_mule_bytes[c])
1997         {
1998         case 2:
1999           if ((charset_ID = emacs_mule_charset[c]) < 0)
2000             goto invalid_code;
2001           ONE_MORE_BYTE (c);
2002           if (c < 0xA0)
2003             goto invalid_code;
2004           code = c & 0x7F;
2005           break;
2006
2007         case 3:
2008           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2009               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2010             {
              /* The next byte selects the charset; one position byte
                 follows.  */
2011               ONE_MORE_BYTE (c);
2012               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2013                 goto invalid_code;
2014               ONE_MORE_BYTE (c);
2015               if (c < 0xA0)
2016                 goto invalid_code;
2017               code = c & 0x7F;
2018             }
2019           else
2020             {
              /* C itself selects the charset; two position bytes
                 follow.  */
2021               if ((charset_ID = emacs_mule_charset[c]) < 0)
2022                 goto invalid_code;
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0)
2025                 goto invalid_code;
2026               code = (c & 0x7F) << 8;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code |= c & 0x7F;
2031             }
2032           break;
2033
2034         case 4:
          /* The next byte selects the charset; two position bytes
             follow.  */
2035           ONE_MORE_BYTE (c);
2036           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2037             goto invalid_code;
2038           ONE_MORE_BYTE (c);
2039           if (c < 0xA0)
2040             goto invalid_code;
2041           code = (c & 0x7F) << 8;
2042           ONE_MORE_BYTE (c);
2043           if (c < 0xA0)
2044             goto invalid_code;
2045           code |= c & 0x7F;
2046           break;
2047
2048         case 1:
          /* A single byte stands for itself.  */
2049           code = c;
2050           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2051           break;
2052
2053         default:
2054           abort ();
2055         }
      /* Map (charset, CODE) to a character; a negative C afterwards is
         treated as an invalid code.  */
2056       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2057                           CHARSET_FROM_ID (charset_ID), code, c);
2058       if (c < 0)
2059         goto invalid_code;
2060     }
2061   *nbytes = src - src_base;
2062   *nchars = consumed_chars;
2063   if (id)
2064     *id = charset_ID;
2065   return (mseq_found ? -c : c);
2066
2067  no_more_source:
2068   return -2;
2069
2070  invalid_code:
2071   return -1;
2072 }
2073
2074
2075 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2076
2077 /* Handle these composition sequences ('|': the end of header elements,
2078    BYTES and CHARS >= 0xA0):
2079
2080    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2081    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2082    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2083
2084    and these old forms:
2085
2086    (4) relative composition: 0x80 | MSEQ ... MSEQ
2087    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2088
2089    When the starter 0x80 and the following header elements are found,
2090    this annotation header is produced.
2091
2092         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2093
2094    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2095    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2096
2097    Then, upon reading the following elements, these codes are produced
2098    until the composition end is found:
2099
2100    (1) CHAR ... CHAR
2101    (2) ALT ... ALT CHAR ... CHAR
2102    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2103    (4) CHAR ... CHAR
2104    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2105
2106    When the composition end is found, LENGTH and NCHARS in the
2107    annotation header are updated as below:
2108
2109    (1) LENGTH: unchanged, NCHARS: unchanged
2110    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2111    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2112    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2113    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2114
2115    If an error is found while composing, the annotation header is
2116    changed to the original composition header (plus filler -1s) as
2117    below:
2118
2119    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2120    (5)          [ 0x80 0xFF -1 -1 -1 ]
2121
2122    and the sequence [ -2 DECODED-RULE ] is changed to the original
2123    byte sequence as below:
2124         o the original byte sequence is B: [ B -1 ]
2125         o the original byte sequence is B1 B2: [ B1 B2 ]
2126
2127    Most of the routines are implemented by macros because many
2128    variables and labels in the caller decode_coding_emacs_mule must be
2129    accessible, and they are usually called just once (thus they don't
2130    increase the size of the compiled object).  */
2131
/* Turn C, a rule byte of an Emacs 20 style composition sequence, into
   a composition rule and store it in RULE.  C is modified: 0xA0 is
   subtracted from it, and a result outside 0..80 jumps to the label
   `invalid_code' of the expanding function.  The quotient and the
   remainder of C by 9 give the two reference points; the value 4 is
   remapped to 10 for either of them.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    gref = (gref == 4) ? 10 : gref;                     \
    nref = (nref == 4) ? 10 : nref;                     \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2148
2149
/* Decode an Emacs 21 style composition rule that is spread over two
   bytes: C, which has already been read, and the byte that follows it
   at SRC, which is read into C here.  Each byte minus 0x20 must lie
   in 0..80, otherwise control jumps to the label `invalid_code' of
   the expanding function.  Store the decoded rule in RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2167
2168
2169 /* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus an enum composition_method value), which the expanding
   code has already read.  The next two bytes at SRC are BYTES and
   CHARS, each 0xA0 plus a count: BYTES is the byte length of this
   composition information, CHARS is the number of characters
   composed by this composition.

   Jump to `invalid_code' if BYTES - 0xA0 is less than 3 (or is not 4
   for a relative composition), or if CHARS - 0xA0 is not in
   1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise initialize *cmp_status
   for the new composition and append the composition annotation
   header to `charbuf' with ADD_COMPOSITION_DATA.

   Expects `c', `src', `cmp_status' and `charbuf' in the expanding
   scope.  */
2173
2174 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2175   do {                                                                  \
2176     enum composition_method method = c - 0xF2;                          \
2177     int nbytes, nchars;                                                 \
2178                                                                         \
2179     ONE_MORE_BYTE (c);                                                  \
2180     if (c < 0)                                                          \
2181       goto invalid_code;                                                \
2182     nbytes = c - 0xA0;                                                  \
2183     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2184       goto invalid_code;                                                \
2185     ONE_MORE_BYTE (c);                                                  \
2186     nchars = c - 0xA0;                                                  \
2187     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2188       goto invalid_code;                                                \
2189     cmp_status->old_form = 0;                                           \
2190     cmp_status->method = method;                                        \
2191     if (method == COMPOSITION_RELATIVE)                                 \
2192       cmp_status->state = COMPOSING_CHAR;                               \
2193     else                                                                \
2194       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2195     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2196     cmp_status->nchars = nchars;                                        \
2197     cmp_status->ncomps = nbytes - 4;                                    \
2198     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2199   } while (0)
2200
2201
/* Begin an Emacs 20 style (old form) relative composition: reset
   *cmp_status for it and append an annotation header with zero
   NCHARS and NBYTES to `charbuf'.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2213
2214
/* Begin an Emacs 20 style (old form) rule-base composition: reset
   *cmp_status for it and append an annotation header with zero
   NCHARS and NBYTES to `charbuf'.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2226
2227
/* Dispatch on the byte that follows the composition starter 0x80.
   Read one byte into C and:
     - if C is 0xF2 + COMPOSITION_RELATIVE through
       0xF2 + COMPOSITION_WITH_RULE_ALTCHARS, start an Emacs 21 style
       composition;
     - if C is in 0xA0..0xBF, start an Emacs 20 style relative
       composition and rewind SRC so that the byte is read again as a
       composition component;
     - if C is 0xFF, start an Emacs 20 style rule-base composition;
     - otherwise (including a negative C), jump to `invalid_code'.
   Expects `c', `src', `cmp_status' and `charbuf' in the expanding
   scope.  */
2228 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2229   do {                                                  \
2230     const unsigned char *current_src = src;             \
2231                                                         \
2232     ONE_MORE_BYTE (c);                                  \
2233     if (c < 0)                                          \
2234       goto invalid_code;                                \
2235     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2236         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2237       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2238     else if (c < 0xA0)                                  \
2239       goto invalid_code;                                \
2240     else if (c < 0xC0)                                  \
2241       {                                                 \
2242         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2243         /* Re-read C as a composition component.  */    \
2244         src = current_src;                              \
2245       }                                                 \
2246     else if (c == 0xFF)                                 \
2247       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2248     else                                                \
2249       goto invalid_code;                                \
2250   } while (0)
2251
/* Finish the composition being decoded.  The annotation header
   [ -LENGTH MASK NCHARS NBYTES METHOD ] starts cmp_status->length
   elements before `charbuf'.  For an old-form composition, store the
   final character count in its NCHARS slot.  Otherwise, for any
   method above COMPOSITION_RELATIVE, set its first slot to NCHARS
   minus cmp_status->length.  Finally mark that no composition is in
   progress.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *annotation = charbuf - cmp_status->length;             \
                                                                \
    if (cmp_status->old_form)                                   \
      annotation[2] = cmp_status->nchars;                       \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      annotation[0] = annotation[2] - cmp_status->length;       \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2262
2263
2264 static int
2265 emacs_mule_finish_composition (int *charbuf,
2266                                struct composition_status *cmp_status)
2267 {
2268   int idx = - cmp_status->length;
2269   int new_chars;
2270
2271   if (cmp_status->old_form && cmp_status->nchars > 0)
2272     {
2273       charbuf[idx + 2] = cmp_status->nchars;
2274       new_chars = 0;
2275       if (cmp_status->method == COMPOSITION_WITH_RULE
2276           && cmp_status->state == COMPOSING_CHAR)
2277         {
2278           /* The last rule was invalid.  */
2279           int rule = charbuf[-1] + 0xA0;
2280
2281           charbuf[-2] = BYTE8_TO_CHAR (rule);
2282           charbuf[-1] = -1;
2283           new_chars = 1;
2284         }
2285     }
2286   else
2287     {
2288       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2289
2290       if (cmp_status->method == COMPOSITION_WITH_RULE)
2291         {
2292           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2293           charbuf[idx++] = -3;
2294           charbuf[idx++] = 0;
2295           new_chars = 1;
2296         }
2297       else
2298         {
2299           int nchars = charbuf[idx + 1] + 0xA0;
2300           int nbytes = charbuf[idx + 2] + 0xA0;
2301
2302           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2303           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2304           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2305           charbuf[idx++] = -1;
2306           new_chars = 4;
2307         }
2308     }
2309   cmp_status->state = COMPOSING_NO;
2310   return new_chars;
2311 }
2312
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the count it returns to
   `char_offset'.  Expects `cmp_status', `charbuf' and `char_offset'
   in the expanding scope.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state == COMPOSING_NO)                                \
      break;                                                              \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
  } while (0)
2318
2319
2320 static void
2321 decode_coding_emacs_mule (struct coding_system *coding)
2322 {
2323   const unsigned char *src = coding->source + coding->consumed;
2324   const unsigned char *src_end = coding->source + coding->src_bytes;
2325   const unsigned char *src_base;
2326   int *charbuf = coding->charbuf + coding->charbuf_used;
2327   /* We may produce two annotations (charset and composition) in one
2328      loop and one more charset annotation at the end.  */
2329   int *charbuf_end
2330     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2331       /* We can produce up to 2 characters in a loop.  */
2332       - 1;
2333   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2334   bool multibytep = coding->src_multibyte;
2335   ptrdiff_t char_offset = coding->produced_char;
2336   ptrdiff_t last_offset = char_offset;
2337   int last_id = charset_ascii;
2338   bool eol_dos
2339     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2340   int byte_after_cr = -1;
2341   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2342
2343   if (cmp_status->state != COMPOSING_NO)
2344     {
2345       int i;
2346
2347       if (charbuf_end - charbuf < cmp_status->length)
2348         abort ();
2349       for (i = 0; i < cmp_status->length; i++)
2350         *charbuf++ = cmp_status->carryover[i];
2351       coding->annotated = 1;
2352     }
2353
2354   while (1)
2355     {
2356       int c, id IF_LINT (= 0);
2357
2358       src_base = src;
2359       consumed_chars_base = consumed_chars;
2360
2361       if (charbuf >= charbuf_end)
2362         {
2363           if (byte_after_cr >= 0)
2364             src_base--;
2365           break;
2366         }
2367
2368       if (byte_after_cr >= 0)
2369         c = byte_after_cr, byte_after_cr = -1;
2370       else
2371         ONE_MORE_BYTE (c);
2372
2373       if (c < 0 || c == 0x80)
2374         {
2375           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2376           if (c < 0)
2377             {
2378               *charbuf++ = -c;
2379               char_offset++;
2380             }
2381           else
2382             DECODE_EMACS_MULE_COMPOSITION_START ();
2383           continue;
2384         }
2385
2386       if (c < 0x80)
2387         {
2388           if (eol_dos && c == '\r')
2389             ONE_MORE_BYTE (byte_after_cr);
2390           id = charset_ascii;
2391           if (cmp_status->state != COMPOSING_NO)
2392             {
2393               if (cmp_status->old_form)
2394                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2396                 cmp_status->ncomps--;
2397             }
2398         }
2399       else
2400         {
2401           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2402           /* emacs_mule_char can load a charset map from a file, which
2403              allocates a large structure and might cause buffer text
2404              to be relocated as result.  Thus, we need to remember the
2405              original pointer to buffer text, and fix up all related
2406              pointers after the call.  */
2407           const unsigned char *orig = coding->source;
2408           ptrdiff_t offset;
2409
2410           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2411                                cmp_status);
2412           offset = coding->source - orig;
2413           if (offset)
2414             {
2415               src += offset;
2416               src_base += offset;
2417               src_end += offset;
2418             }
2419           if (c < 0)
2420             {
2421               if (c == -1)
2422                 goto invalid_code;
2423               if (c == -2)
2424                 break;
2425             }
2426           src = src_base + nbytes;
2427           consumed_chars = consumed_chars_base + nchars;
2428           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2429             cmp_status->ncomps -= nchars;
2430         }
2431
2432       /* Now if C >= 0, we found a normally encoded character, if C <
2433          0, we found an old-style composition component character or
2434          rule.  */
2435
2436       if (cmp_status->state == COMPOSING_NO)
2437         {
2438           if (last_id != id)
2439             {
2440               if (last_id != charset_ascii)
2441                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2442                                   last_id);
2443               last_id = id;
2444               last_offset = char_offset;
2445             }
2446           *charbuf++ = c;
2447           char_offset++;
2448         }
2449       else if (cmp_status->state == COMPOSING_CHAR)
2450         {
2451           if (cmp_status->old_form)
2452             {
2453               if (c >= 0)
2454                 {
2455                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456                   *charbuf++ = c;
2457                   char_offset++;
2458                 }
2459               else
2460                 {
2461                   *charbuf++ = -c;
2462                   cmp_status->nchars++;
2463                   cmp_status->length++;
2464                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2465                     EMACS_MULE_COMPOSITION_END ();
2466                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2467                     cmp_status->state = COMPOSING_RULE;
2468                 }
2469             }
2470           else
2471             {
2472               *charbuf++ = c;
2473               cmp_status->length++;
2474               cmp_status->nchars--;
2475               if (cmp_status->nchars == 0)
2476                 EMACS_MULE_COMPOSITION_END ();
2477             }
2478         }
2479       else if (cmp_status->state == COMPOSING_RULE)
2480         {
2481           int rule;
2482
2483           if (c >= 0)
2484             {
2485               EMACS_MULE_COMPOSITION_END ();
2486               *charbuf++ = c;
2487               char_offset++;
2488             }
2489           else
2490             {
2491               c = -c;
2492               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2493               if (rule < 0)
2494                 goto invalid_code;
2495               *charbuf++ = -2;
2496               *charbuf++ = rule;
2497               cmp_status->length += 2;
2498               cmp_status->state = COMPOSING_CHAR;
2499             }
2500         }
2501       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2502         {
2503           *charbuf++ = c;
2504           cmp_status->length++;
2505           if (cmp_status->ncomps == 0)
2506             cmp_status->state = COMPOSING_CHAR;
2507           else if (cmp_status->ncomps > 0)
2508             {
2509               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2510                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2511             }
2512           else
2513             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2514         }
2515       else                      /* COMPOSING_COMPONENT_RULE */
2516         {
2517           int rule;
2518
2519           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2520           if (rule < 0)
2521             goto invalid_code;
2522           *charbuf++ = -2;
2523           *charbuf++ = rule;
2524           cmp_status->length += 2;
2525           cmp_status->ncomps--;
2526           if (cmp_status->ncomps > 0)
2527             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2528           else
2529             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2530         }
2531       continue;
2532
2533     invalid_code:
2534       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2535       src = src_base;
2536       consumed_chars = consumed_chars_base;
2537       ONE_MORE_BYTE (c);
2538       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2539       char_offset++;
2540       coding->errors++;
2541     }
2542
2543  no_more_source:
2544   if (cmp_status->state != COMPOSING_NO)
2545     {
2546       if (coding->mode & CODING_MODE_LAST_BLOCK)
2547         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       else
2549         {
2550           int i;
2551
2552           charbuf -= cmp_status->length;
2553           for (i = 0; i < cmp_status->length; i++)
2554             cmp_status->carryover[i] = charbuf[i];
2555         }
2556     }
2557   if (last_id != charset_ascii)
2558     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2559   coding->consumed_char += consumed_chars_base;
2560   coding->consumed = src_base - coding->source;
2561   coding->charbuf_used = charbuf - coding->charbuf;
2562 }
2563
2564
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule id is ID.

   An id below 0xA0 is its own, single leading code; CODES[1] is then
   set to 0 to tell the caller that there is no second leading code.
   Any other id is emitted as CODES[1], preceded in CODES[0] by a
   prefix byte selected by the range of the id:
     0xA0..0xDF -> 0x9A,  0xE0..0xEF -> 0x9B,
     0xF0..0xF4 -> 0x9C,  0xF5 and up -> 0x9D.

   The expansion is a single statement without a trailing semicolon,
   so the macro can be used as the body of an `if' that has an `else'.
   ID and CODES are parenthesized but may be evaluated more than once;
   do not pass expressions with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2578
2579
2580 static bool
2581 encode_coding_emacs_mule (struct coding_system *coding)
2582 {
2583   bool multibytep = coding->dst_multibyte;
2584   int *charbuf = coding->charbuf;
2585   int *charbuf_end = charbuf + coding->charbuf_used;
2586   unsigned char *dst = coding->destination + coding->produced;
2587   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2588   int safe_room = 8;
2589   ptrdiff_t produced_chars = 0;
2590   Lisp_Object attrs, charset_list;
2591   int c;
2592   int preferred_charset_id = -1;
2593
2594   CODING_GET_INFO (coding, attrs, charset_list);
2595   if (! EQ (charset_list, Vemacs_mule_charset_list))
2596     {
2597       charset_list = Vemacs_mule_charset_list;
2598       ASET (attrs, coding_attr_charset_list, charset_list);
2599     }
2600
2601   while (charbuf < charbuf_end)
2602     {
2603       ASSURE_DESTINATION (safe_room);
2604       c = *charbuf++;
2605
2606       if (c < 0)
2607         {
2608           /* Handle an annotation.  */
2609           switch (*charbuf)
2610             {
2611             case CODING_ANNOTATE_COMPOSITION_MASK:
2612               /* Not yet implemented.  */
2613               break;
2614             case CODING_ANNOTATE_CHARSET_MASK:
2615               preferred_charset_id = charbuf[3];
2616               if (preferred_charset_id >= 0
2617                   && NILP (Fmemq (make_number (preferred_charset_id),
2618                                   charset_list)))
2619                 preferred_charset_id = -1;
2620               break;
2621             default:
2622               abort ();
2623             }
2624           charbuf += -c - 1;
2625           continue;
2626         }
2627
2628       if (ASCII_CHAR_P (c))
2629         EMIT_ONE_ASCII_BYTE (c);
2630       else if (CHAR_BYTE8_P (c))
2631         {
2632           c = CHAR_TO_BYTE8 (c);
2633           EMIT_ONE_BYTE (c);
2634         }
2635       else
2636         {
2637           struct charset *charset;
2638           unsigned code;
2639           int dimension;
2640           int emacs_mule_id;
2641           unsigned char leading_codes[2];
2642
2643           if (preferred_charset_id >= 0)
2644             {
2645               bool result;
2646
2647               charset = CHARSET_FROM_ID (preferred_charset_id);
2648               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2649               if (result)
2650                 code = ENCODE_CHAR (charset, c);
2651               else
2652                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2653                                      &code, charset);
2654             }
2655           else
2656             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2657                                  &code, charset);
2658           if (! charset)
2659             {
2660               c = coding->default_char;
2661               if (ASCII_CHAR_P (c))
2662                 {
2663                   EMIT_ONE_ASCII_BYTE (c);
2664                   continue;
2665                 }
2666               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2667                                    &code, charset);
2668             }
2669           dimension = CHARSET_DIMENSION (charset);
2670           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2671           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2672           EMIT_ONE_BYTE (leading_codes[0]);
2673           if (leading_codes[1])
2674             EMIT_ONE_BYTE (leading_codes[1]);
2675           if (dimension == 1)
2676             EMIT_ONE_BYTE (code | 0x80);
2677           else
2678             {
2679               code |= 0x8080;
2680               EMIT_ONE_BYTE (code >> 8);
2681               EMIT_ONE_BYTE (code & 0xFF);
2682             }
2683         }
2684     }
2685   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2686   coding->produced_char += produced_chars;
2687   coding->produced = dst - coding->destination;
2688   return 0;
2689 }
2690
2691 \f
2692 /*** 7. ISO2022 handlers ***/
2693
2694 /* The following note describes the coding system ISO2022 briefly.
2695    Since the intention of this note is to help understand the
2696    functions in this file, some parts are NOT ACCURATE or are OVERLY
2697    SIMPLIFIED.  For thorough understanding, please refer to the
2698    original document of ISO2022.  This is equivalent to the standard
2699    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2700
2701    ISO2022 provides many mechanisms to encode several character sets
2702    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2703    is encoded using bytes less than 128.  This may make the encoded
2704    text a little bit longer, but the text passes more easily through
2705    several types of gateway, some of which strip off the MSB (Most
2706    Significant Bit).
2707
2708    There are two kinds of character sets: control character sets and
2709    graphic character sets.  The former contain control characters such
2710    as `newline' and `escape' to provide control functions (control
2711    functions are also provided by escape sequences).  The latter
2712    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2713    two control character sets and many graphic character sets.
2714
2715    Graphic character sets are classified into one of the following
2716    four classes, according to the number of bytes (DIMENSION) and
2717    number of characters in one dimension (CHARS) of the set:
2718    - DIMENSION1_CHARS94
2719    - DIMENSION1_CHARS96
2720    - DIMENSION2_CHARS94
2721    - DIMENSION2_CHARS96
2722
2723    In addition, each character set is assigned an identification tag,
2724    unique for each set, called the "final character" (denoted as <F>
2725    hereafter).  The <F> of each character set is decided by ECMA(*)
   when it is registered in ISO.  The code range of <F> is 0x30..0x7E
2727    (0x30..0x3F are for private use only).
2728
2729    Note (*): ECMA = European Computer Manufacturers Association
2730
2731    Here are examples of graphic character sets [NAME(<F>)]:
2732         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2733         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2734         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2735         o DIMENSION2_CHARS96 -- none for the moment
2736
2737    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2738         C0 [0x00..0x1F] -- control character plane 0
2739         GL [0x20..0x7F] -- graphic character plane 0
2740         C1 [0x80..0x9F] -- control character plane 1
2741         GR [0xA0..0xFF] -- graphic character plane 1
2742
2743    A control character set is directly designated and invoked to C0 or
2744    C1 by an escape sequence.  The most common case is that:
2745    - ISO646's  control character set is designated/invoked to C0, and
2746    - ISO6429's control character set is designated/invoked to C1,
2747    and usually these designations/invocations are omitted in encoded
2748    text.  In a 7-bit environment, only C0 can be used, and a control
2749    character for C1 is encoded by an appropriate escape sequence to
2750    fit into the environment.  All control characters for C1 are
2751    defined to have corresponding escape sequences.
2752
2753    A graphic character set is at first designated to one of four
2754    graphic registers (G0 through G3), then these graphic registers are
2755    invoked to GL or GR.  These designations and invocations can be
2756    done independently.  The most common case is that G0 is invoked to
2757    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2758    these invocations and designations are omitted in encoded text.
2759    In a 7-bit environment, only GL can be used.
2760
2761    When a graphic character set of CHARS94 is invoked to GL, codes
2762    0x20 and 0x7F of the GL area work as control characters SPACE and
2763    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2764    be used.
2765
2766    There are two ways of invocation: locking-shift and single-shift.
2767    With locking-shift, the invocation lasts until the next different
2768    invocation, whereas with single-shift, the invocation affects the
2769    following character only and doesn't affect the locking-shift
2770    state.  Invocations are done by the following control characters or
2771    escape sequences:
2772
2773    ----------------------------------------------------------------------
2774    abbrev  function                  cntrl escape seq   description
2775    ----------------------------------------------------------------------
2776    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2777    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2778    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2779    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2780    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2781    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2782    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2783    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2784    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2785    ----------------------------------------------------------------------
2786    (*) These are not used by any known coding system.
2787
2788    Control characters for these functions are defined by macros
2789    ISO_CODE_XXX in `coding.h'.
2790
2791    Designations are done by the following escape sequences:
2792    ----------------------------------------------------------------------
2793    escape sequence      description
2794    ----------------------------------------------------------------------
2795    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2796    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2797    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2798    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2799    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2800    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2801    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2802    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2803    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2804    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2805    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2806    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2807    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2808    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2809    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2810    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2811    ----------------------------------------------------------------------
2812
2813    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2814    of dimension 1, chars 94, and final character <F>, etc...
2815
2816    Note (*): Although these designations are not allowed in ISO2022,
2817    Emacs accepts them on decoding, and produces them on encoding
2818    CHARS96 character sets in a coding system which is characterized as
2819    7-bit environment, non-locking-shift, and non-single-shift.
2820
2821    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2822    '(' must be omitted.  We refer to this as "short-form" hereafter.
2823
2824    Now you may notice that there are a lot of ways of encoding the
2825    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2828    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2829    localized platforms), and all of these are variants of ISO2022.
2830
2831    In addition to the above, Emacs handles two more kinds of escape
2832    sequences: ISO6429's direction specification and Emacs' private
2833    sequence for specifying character composition.
2834
2835    ISO6429's direction specification takes the following form:
2836         o CSI ']'      -- end of the current direction
2837         o CSI '0' ']'  -- end of the current direction
2838         o CSI '1' ']'  -- start of left-to-right text
2839         o CSI '2' ']'  -- start of right-to-left text
2840    The control character CSI (0x9B: control sequence introducer) is
2841    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2842
2843    Character composition specification takes the following form:
2844         o ESC '0' -- start relative composition
2845         o ESC '1' -- end composition
2846         o ESC '2' -- start rule-base composition (*)
2847         o ESC '3' -- start relative composition with alternate chars  (**)
2848         o ESC '4' -- start rule-base composition with alternate chars  (**)
2849   Since these are not standard escape sequences of any ISO standard,
2850   the use of them with these meanings is restricted to Emacs only.
2851
2852   (*) This form is used only in Emacs 20.7 and older versions,
2853   but newer versions can safely decode it.
2854   (**) This form is used only in Emacs 21.1 and newer versions,
2855   and older versions can't decode it.
2856
2857   Here's a list of example usages of these composition escape
2858   sequences (categorized by `enum composition_method').
2859
2860   COMPOSITION_RELATIVE:
2861         ESC 0 CHAR [ CHAR ] ESC 1
2862   COMPOSITION_WITH_RULE:
2863         ESC 2 CHAR [ RULE CHAR ] ESC 1
2864   COMPOSITION_WITH_ALTCHARS:
2865         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2866   COMPOSITION_WITH_RULE_ALTCHARS:
2867         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2868
/* Table with one entry for each possible byte value (0..255).
   NOTE(review): presumably it maps a byte to its ISO 2022 code class;
   it is filled in outside of this part of the file -- confirm.  */
static enum iso_code_class_type iso_code_class[256];
2870
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID is a valid index (0..CODING->max_charset_id) into
   CODING->safe_charsets and the entry there is not 255, the filler
   that setup_iso_safe_charsets leaves for a charset to which no
   graphic register was assigned.

   A negative ID (the "no such charset" value found in the charset
   lookup tables) is never safe; it is rejected before the table is
   indexed.  ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2874
2875 static void
2876 setup_iso_safe_charsets (Lisp_Object attrs)
2877 {
2878   Lisp_Object charset_list, safe_charsets;
2879   Lisp_Object request;
2880   Lisp_Object reg_usage;
2881   Lisp_Object tail;
2882   EMACS_INT reg94, reg96;
2883   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2884   int max_charset_id;
2885
2886   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2887   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2888       && ! EQ (charset_list, Viso_2022_charset_list))
2889     {
2890       charset_list = Viso_2022_charset_list;
2891       ASET (attrs, coding_attr_charset_list, charset_list);
2892       ASET (attrs, coding_attr_safe_charsets, Qnil);
2893     }
2894
2895   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2896     return;
2897
2898   max_charset_id = 0;
2899   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2900     {
2901       int id = XINT (XCAR (tail));
2902       if (max_charset_id < id)
2903         max_charset_id = id;
2904     }
2905
2906   safe_charsets = make_uninit_string (max_charset_id + 1);
2907   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2908   request = AREF (attrs, coding_attr_iso_request);
2909   reg_usage = AREF (attrs, coding_attr_iso_usage);
2910   reg94 = XINT (XCAR (reg_usage));
2911   reg96 = XINT (XCDR (reg_usage));
2912
2913   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2914     {
2915       Lisp_Object id;
2916       Lisp_Object reg;
2917       struct charset *charset;
2918
2919       id = XCAR (tail);
2920       charset = CHARSET_FROM_ID (XINT (id));
2921       reg = Fcdr (Fassq (id, request));
2922       if (! NILP (reg))
2923         SSET (safe_charsets, XINT (id), XINT (reg));
2924       else if (charset->iso_chars_96)
2925         {
2926           if (reg96 < 4)
2927             SSET (safe_charsets, XINT (id), reg96);
2928         }
2929       else
2930         {
2931           if (reg94 < 4)
2932             SSET (safe_charsets, XINT (id), reg94);
2933         }
2934     }
2935   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2936 }
2937
2938
2939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2940    Return true if a text is encoded in one of ISO-2022 based coding
2941    systems.  */
2942
2943 static bool
2944 detect_coding_iso_2022 (struct coding_system *coding,
2945                         struct coding_detection_info *detect_info)
2946 {
2947   const unsigned char *src = coding->source, *src_base = src;
2948   const unsigned char *src_end = coding->source + coding->src_bytes;
2949   bool multibytep = coding->src_multibyte;
2950   bool single_shifting = 0;
2951   int id;
2952   int c, c1;
2953   ptrdiff_t consumed_chars = 0;
2954   int i;
2955   int rejected = 0;
2956   int found = 0;
2957   int composition_count = -1;
2958
2959   detect_info->checked |= CATEGORY_MASK_ISO;
2960
2961   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2962     {
2963       struct coding_system *this = &(coding_categories[i]);
2964       Lisp_Object attrs, val;
2965
2966       if (this->id < 0)
2967         continue;
2968       attrs = CODING_ID_ATTRS (this->id);
2969       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2970           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2971         setup_iso_safe_charsets (attrs);
2972       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2973       this->max_charset_id = SCHARS (val) - 1;
2974       this->safe_charsets = SDATA (val);
2975     }
2976
2977   /* A coding system of this category is always ASCII compatible.  */
2978   src += coding->head_ascii;
2979
2980   while (rejected != CATEGORY_MASK_ISO)
2981     {
2982       src_base = src;
2983       ONE_MORE_BYTE (c);
2984       switch (c)
2985         {
2986         case ISO_CODE_ESC:
2987           if (inhibit_iso_escape_detection)
2988             break;
2989           single_shifting = 0;
2990           ONE_MORE_BYTE (c);
2991           if (c == 'N' || c == 'O')
2992             {
2993               /* ESC <Fe> for SS2 or SS3.  */
2994               single_shifting = 1;
2995               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2996             }
2997           else if (c == '1')
2998             {
2999               /* End of composition.  */
3000               if (composition_count < 0
3001                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3002                 /* Invalid */
3003                 break;
3004               composition_count = -1;
3005               found |= CATEGORY_MASK_ISO;
3006             }
3007           else if (c >= '0' && c <= '4')
3008             {
3009               /* ESC <Fp> for start/end composition.  */
3010               composition_count = 0;
3011             }
3012           else
3013             {
3014               if (c >= '(' && c <= '/')
3015                 {
3016                   /* Designation sequence for a charset of dimension 1.  */
3017                   ONE_MORE_BYTE (c1);
3018                   if (c1 < ' ' || c1 >= 0x80
3019                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3020                     /* Invalid designation sequence.  Just ignore.  */
3021                     break;
3022                 }
3023               else if (c == '$')
3024                 {
3025                   /* Designation sequence for a charset of dimension 2.  */
3026                   ONE_MORE_BYTE (c);
3027                   if (c >= '@' && c <= 'B')
3028                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3029                     id = iso_charset_table[1][0][c];
3030                   else if (c >= '(' && c <= '/')
3031                     {
3032                       ONE_MORE_BYTE (c1);
3033                       if (c1 < ' ' || c1 >= 0x80
3034                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3035                         /* Invalid designation sequence.  Just ignore.  */
3036                         break;
3037                     }
3038                   else
3039                     /* Invalid designation sequence.  Just ignore it.  */
3040                     break;
3041                 }
3042               else
3043                 {
3044                   /* Invalid escape sequence.  Just ignore it.  */
3045                   break;
3046                 }
3047
3048               /* We found a valid designation sequence for CHARSET.  */
3049               rejected |= CATEGORY_MASK_ISO_8BIT;
3050               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3051                                   id))
3052                 found |= CATEGORY_MASK_ISO_7;
3053               else
3054                 rejected |= CATEGORY_MASK_ISO_7;
3055               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3056                                   id))
3057                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3058               else
3059                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3060               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3061                                   id))
3062                 found |= CATEGORY_MASK_ISO_7_ELSE;
3063               else
3064                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3065               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3066                                   id))
3067                 found |= CATEGORY_MASK_ISO_8_ELSE;
3068               else
3069                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3070             }
3071           break;
3072
3073         case ISO_CODE_SO:
3074         case ISO_CODE_SI:
3075           /* Locking shift out/in.  */
3076           if (inhibit_iso_escape_detection)
3077             break;
3078           single_shifting = 0;
3079           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3080           break;
3081
3082         case ISO_CODE_CSI:
3083           /* Control sequence introducer.  */
3084           single_shifting = 0;
3085           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3086           found |= CATEGORY_MASK_ISO_8_ELSE;
3087           goto check_extra_latin;
3088
3089         case ISO_CODE_SS2:
3090         case ISO_CODE_SS3:
3091           /* Single shift.   */
3092           if (inhibit_iso_escape_detection)
3093             break;
3094           single_shifting = 0;
3095           rejected |= CATEGORY_MASK_ISO_7BIT;
3096           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3097               & CODING_ISO_FLAG_SINGLE_SHIFT)
3098             {
3099               found |= CATEGORY_MASK_ISO_8_1;
3100               single_shifting = 1;
3101             }
3102           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3103               & CODING_ISO_FLAG_SINGLE_SHIFT)
3104             {
3105               found |= CATEGORY_MASK_ISO_8_2;
3106               single_shifting = 1;
3107             }
3108           if (single_shifting)
3109             break;
3110         check_extra_latin:
3111           if (! VECTORP (Vlatin_extra_code_table)
3112               || NILP (AREF (Vlatin_extra_code_table, c)))
3113             {
3114               rejected = CATEGORY_MASK_ISO;
3115               break;
3116             }
3117           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3118               & CODING_ISO_FLAG_LATIN_EXTRA)
3119             found |= CATEGORY_MASK_ISO_8_1;
3120           else
3121             rejected |= CATEGORY_MASK_ISO_8_1;
3122           rejected |= CATEGORY_MASK_ISO_8_2;
3123           break;
3124
3125         default:
3126           if (c < 0)
3127             continue;
3128           if (c < 0x80)
3129             {
3130               if (composition_count >= 0)
3131                 composition_count++;
3132               single_shifting = 0;
3133               break;
3134             }
3135           if (c >= 0xA0)
3136             {
3137               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3138               found |= CATEGORY_MASK_ISO_8_1;
3139               /* Check the length of succeeding codes of the range
3140                  0xA0..0FF.  If the byte length is even, we include
3141                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3142                  only when we are not single shifting.  */
3143               if (! single_shifting
3144                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3145                 {
3146                   int len = 1;
3147                   while (src < src_end)
3148                     {
3149                       src_base = src;
3150                       ONE_MORE_BYTE (c);
3151                       if (c < 0xA0)
3152                         {
3153                           src = src_base;
3154                           break;
3155                         }
3156                       len++;
3157                     }
3158
3159                   if (len & 1 && src < src_end)
3160                     {
3161                       rejected |= CATEGORY_MASK_ISO_8_2;
3162                       if (composition_count >= 0)
3163                         composition_count += len;
3164                     }
3165                   else
3166                     {
3167                       found |= CATEGORY_MASK_ISO_8_2;
3168                       if (composition_count >= 0)
3169                         composition_count += len / 2;
3170                     }
3171                 }
3172               break;
3173             }
3174         }
3175     }
3176   detect_info->rejected |= CATEGORY_MASK_ISO;
3177   return 0;
3178
3179  no_more_source:
3180   detect_info->rejected |= rejected;
3181   detect_info->found |= (found & ~rejected);
3182   return 1;
3183 }
3184
3185
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   CHARS_96 must be an lvalue.  It is set to -1 when the escape
   sequence should be kept: either FINAL is out of range, no charset
   is found, or the charset is not safe for CODING (REG is then
   marked with -2), or ASCII is being designated to a register whose
   previous designation was marked -2.

   JISX0201-Roman is recorded as ASCII when CODING_ISO_FLAG_USE_ROMAN
   is set, and JISX0208-1978 as JISX0208 when
   CODING_ISO_FLAG_USE_OLDJIS is set.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id, old_id;                                                 \
                                                                        \
    if ((final) >= '0' && (final) < 128                                 \
        && (new_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0     \
        && SAFE_CHARSET_P (coding, new_id))                             \
      {                                                                 \
        /* Remember what REG held before it is overwritten.  */         \
        old_id = CODING_ISO_DESIGNATION (coding, reg);                  \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              new_id = charset_jisx0208;                                \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* ASCII designated right after an invalid designation to the  \
           same register: this sequence has to be kept as well.  */     \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Unusable designation: flag the register and ask the caller  \
           to keep the escape sequence.  */                             \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
3218
3219
/* Handle these composition sequences (ALT: alternate char):
3221
3222    (1) relative composition: ESC 0 CHAR ... ESC 1
3223    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3224    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3225    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3226
3227    When the start sequence (ESC 0/2/3/4) is found, this annotation
3228    header is produced.
3229
3230         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3231
3232    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3233    produced until the end sequence (ESC 1) is found:
3234
3235    (1) CHAR ... CHAR
3236    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3237    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3238    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3239
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3242
3243    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3244    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3245    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3246    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3247
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3252
3253    and the sequence [ -2 DECODED-RULE ] is changed to the original
3254    byte sequence as below:
3255         o the original byte sequence is B: [ B -1 ]
3256         o the original byte sequence is B1 B2: [ B1 B2 ]
3257    and the sequence [ -1 -1 ] is changed to the original byte
3258    sequence:
3259         [ ESC '0' ]
3260 */
3261
/* Decode the composition rule that starts with the byte in C1 and
   store the encoded rule in RULE (an int lvalue).  A first byte of
   113 (32 + 81) or more selects the new two-byte format, in which
   case one more byte is read from the source and 0x100 is added to
   the result to tell it apart from the old one-byte format.  Jump to
   invalid_code if the rule is invalid.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int next_byte;                                                  \
                                                                        \
        ONE_MORE_BYTE (next_byte);                                      \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, next_byte - 32)) \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, next_byte - 32);     \
        rule += 0x100;   /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        /* The old format writes reference point 10 as 4.  */           \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3290
/* Turn the decoded composition rule RULE back into the byte form it
   had in the source, overwriting the two elements charbuf[idx] and
   charbuf[idx + 1].  A rule below 0x100 is in the old format and
   yields one byte followed by -1; otherwise two bytes are produced.
   new_chars is advanced by the number of bytes produced.  The
   variables charbuf, idx and new_chars are taken from the expansion
   site.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_low = rule % 0x100;                                \
    int gref = rule_low / 12;                                   \
    int nref = rule_low % 12;                                   \
                                                                \
    if (rule >= 0x100)          /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
    else                        /* old format */                \
      {                                                         \
        /* The old format writes reference point 10 as 4.  */   \
        if (gref == 10)                                         \
          gref = 4;                                             \
        if (nref == 10)                                         \
          nref = 4;                                             \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
  } while (0)
3310
3311 /* Finish the current composition as invalid.  */
3312
3313 static int
3314 finish_composition (int *charbuf, struct composition_status *cmp_status)
3315 {
3316   int idx = - cmp_status->length;
3317   int new_chars;
3318
3319   /* Recover the original ESC sequence */
3320   charbuf[idx++] = ISO_CODE_ESC;
3321   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3322                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3323                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3324                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3325                     : '4');
3326   charbuf[idx++] = -2;
3327   charbuf[idx++] = 0;
3328   charbuf[idx++] = -1;
3329   new_chars = cmp_status->nchars;
3330   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3331     for (; idx < 0; idx++)
3332       {
3333         int elt = charbuf[idx];
3334
3335         if (elt == -2)
3336           {
3337             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3338             idx++;
3339           }
3340         else if (elt == -1)
3341           {
3342             charbuf[idx++] = ISO_CODE_ESC;
3343             charbuf[idx] = '0';
3344             new_chars += 2;
3345           }
3346       }
3347   cmp_status->state = COMPOSING_NO;
3348   return new_chars;
3349 }
3350
/* Do nothing unless a composition is in progress.  Otherwise finish
   it with finish_composition and add the value that returns to
   char_offset.  Uses cmp_status, charbuf and char_offset from the
   expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3357
/* Handle a composition start sequence; C1 is the byte after ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the components of an ESC 3 or ESC 4
   composition are being read only ends the component list: [ -1 -1 ]
   is stored and the state becomes COMPOSING_CHAR.

   In every other case any composition in progress is finished first,
   and this annotation header is produced:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS               \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)             \
            || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
                && cmp_status->state == COMPOSING_COMPONENT_RULE)))        \
      {                                                                    \
        /* End of the component list; composed chars follow.  */           \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 begin with the list of components.  */          \
        if ((c1) > '2')                                                    \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        else                                                               \
          cmp_status->state = COMPOSING_CHAR;                              \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = 0;                                            \
        cmp_status->ncomps = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3398
3399
/* Handle composition end sequence ESC 1.

   The composition is invalid if no composed char has been stored, or
   if the sequence ends at the wrong kind of element: a rulebase
   composition must not be in state COMPOSING_CHAR, and every other
   method must be.  An invalid composition is finished and control
   jumps to invalid_code.

   Otherwise the annotation header, which lies cmp_status->length
   elements before charbuf, is completed.  Its first element is
   lowered by NCOMPS + 2 for an altchar composition and by NCOMPS * 3
   for an alt&rule composition, and its third element receives
   NCHARS.  char_offset is advanced by NCHARS and the state returns
   to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars == 0                                         \
        || (cmp_status->method == COMPOSITION_WITH_RULE                 \
            ? cmp_status->state == COMPOSING_CHAR                       \
            : cmp_status->state != COMPOSING_CHAR))                     \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    {                                                                   \
      int *cmp_header = charbuf - cmp_status->length;                   \
                                                                        \
      if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)              \
        cmp_header[0] -= cmp_status->ncomps + 2;                        \
      else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)    \
        cmp_header[0] -= cmp_status->ncomps * 3;                        \
      cmp_header[2] = cmp_status->nchars;                               \
    }                                                                   \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3419
/* Append the pair [ -2 RULE ] to charbuf, count the two elements in
   cmp_status->length, and step cmp_status->state back by one.  This
   undoes the increment STORE_COMPOSITION_CHAR applies when a rule is
   expected after a char.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3429
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status.  The element is counted in LENGTH, and in
   NCHARS when the state is COMPOSING_CHAR or in NCOMPS otherwise.
   The state is then advanced by one for a rulebase composition, and
   for an alt&rule composition that is still reading component
   chars.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if ((cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS           \
         && cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
        || cmp_status->method == COMPOSITION_WITH_RULE)                 \
      cmp_status->state++;                                              \
  } while (0)
3446
3447
3448 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3449
3450 static void
3451 decode_coding_iso_2022 (struct coding_system *coding)
3452 {
3453   const unsigned char *src = coding->source + coding->consumed;
3454   const unsigned char *src_end = coding->source + coding->src_bytes;
3455   const unsigned char *src_base;
3456   int *charbuf = coding->charbuf + coding->charbuf_used;
3457   /* We may produce two annotations (charset and composition) in one
3458      loop and one more charset annotation at the end.  */
3459   int *charbuf_end
3460     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3461   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3462   bool multibytep = coding->src_multibyte;
3463   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3464   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3465   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3466   int charset_id_2, charset_id_3;
3467   struct charset *charset;
3468   int c;
3469   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3470   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3471   ptrdiff_t char_offset = coding->produced_char;
3472   ptrdiff_t last_offset = char_offset;
3473   int last_id = charset_ascii;
3474   bool eol_dos
3475     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3476   int byte_after_cr = -1;
3477   int i;
3478
3479   setup_iso_safe_charsets (attrs);
3480   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3481
3482   if (cmp_status->state != COMPOSING_NO)
3483     {
3484       if (charbuf_end - charbuf < cmp_status->length)
3485         abort ();
3486       for (i = 0; i < cmp_status->length; i++)
3487         *charbuf++ = cmp_status->carryover[i];
3488       coding->annotated = 1;
3489     }
3490
3491   while (1)
3492     {
3493       int c1, c2, c3;
3494
3495       src_base = src;
3496       consumed_chars_base = consumed_chars;
3497
3498       if (charbuf >= charbuf_end)
3499         {
3500           if (byte_after_cr >= 0)
3501             src_base--;
3502           break;
3503         }
3504
3505       if (byte_after_cr >= 0)
3506         c1 = byte_after_cr, byte_after_cr = -1;
3507       else
3508         ONE_MORE_BYTE (c1);
3509       if (c1 < 0)
3510         goto invalid_code;
3511
3512       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3513         {
3514           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3515           char_offset++;
3516           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3517           continue;
3518         }
3519
3520       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3521         {
3522           if (c1 == ISO_CODE_ESC)
3523             {
3524               if (src + 1 >= src_end)
3525                 goto no_more_source;
3526               *charbuf++ = ISO_CODE_ESC;
3527               char_offset++;
3528               if (src[0] == '%' && src[1] == '@')
3529                 {
3530                   src += 2;
3531                   consumed_chars += 2;
3532                   char_offset += 2;
3533                   /* We are sure charbuf can contain two more chars. */
3534                   *charbuf++ = '%';
3535                   *charbuf++ = '@';
3536                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3537                 }
3538             }
3539           else
3540             {
3541               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3542               char_offset++;
3543             }
3544           continue;
3545         }
3546
3547       if ((cmp_status->state == COMPOSING_RULE
3548            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3549           && c1 != ISO_CODE_ESC)
3550         {
3551           int rule;
3552
3553           DECODE_COMPOSITION_RULE (rule);
3554           STORE_COMPOSITION_RULE (rule);
3555           continue;
3556         }
3557
3558       /* We produce at most one character.  */
3559       switch (iso_code_class [c1])
3560         {
3561         case ISO_0x20_or_0x7F:
3562           if (charset_id_0 < 0
3563               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3564             /* This is SPACE or DEL.  */
3565             charset = CHARSET_FROM_ID (charset_ascii);
3566           else
3567             charset = CHARSET_FROM_ID (charset_id_0);
3568           break;
3569
3570         case ISO_graphic_plane_0:
3571           if (charset_id_0 < 0)
3572             charset = CHARSET_FROM_ID (charset_ascii);
3573           else
3574             charset = CHARSET_FROM_ID (charset_id_0);
3575           break;
3576
3577         case ISO_0xA0_or_0xFF:
3578           if (charset_id_1 < 0
3579               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3580               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3581             goto invalid_code;
3582           /* This is a graphic character, we fall down ... */
3583
3584         case ISO_graphic_plane_1:
3585           if (charset_id_1 < 0)
3586             goto invalid_code;
3587           charset = CHARSET_FROM_ID (charset_id_1);
3588           break;
3589
3590         case ISO_control_0:
3591           if (eol_dos && c1 == '\r')
3592             ONE_MORE_BYTE (byte_after_cr);
3593           MAYBE_FINISH_COMPOSITION ();
3594           charset = CHARSET_FROM_ID (charset_ascii);
3595           break;
3596
3597         case ISO_control_1:
3598           goto invalid_code;
3599
3600         case ISO_shift_out:
3601           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3602               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3603             goto invalid_code;
3604           CODING_ISO_INVOCATION (coding, 0) = 1;
3605           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3606           continue;
3607
3608         case ISO_shift_in:
3609           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3610             goto invalid_code;
3611           CODING_ISO_INVOCATION (coding, 0) = 0;
3612           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3613           continue;
3614
3615         case ISO_single_shift_2_7:
3616           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3617             goto invalid_code;
3618         case ISO_single_shift_2:
3619           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3620             goto invalid_code;
3621           /* SS2 is handled as an escape sequence of ESC 'N' */
3622           c1 = 'N';
3623           goto label_escape_sequence;
3624
3625         case ISO_single_shift_3:
3626           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3627             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3629           c1 = 'O';
3630           goto label_escape_sequence;
3631
3632         case ISO_control_sequence_introducer:
3633           /* CSI is handled as an escape sequence of ESC '[' ...  */
3634           c1 = '[';
3635           goto label_escape_sequence;
3636
3637         case ISO_escape:
3638           ONE_MORE_BYTE (c1);
3639         label_escape_sequence:
3640           /* Escape sequences handled here are invocation,
3641              designation, direction specification, and character
3642              composition specification.  */
3643           switch (c1)
3644             {
3645             case '&':           /* revision of following character set */
3646               ONE_MORE_BYTE (c1);
3647               if (!(c1 >= '@' && c1 <= '~'))
3648                 goto invalid_code;
3649               ONE_MORE_BYTE (c1);
3650               if (c1 != ISO_CODE_ESC)
3651                 goto invalid_code;
3652               ONE_MORE_BYTE (c1);
3653               goto label_escape_sequence;
3654
3655             case '$':           /* designation of 2-byte character set */
3656               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3657                 goto invalid_code;
3658               {
3659                 int reg, chars96;
3660
3661                 ONE_MORE_BYTE (c1);
3662                 if (c1 >= '@' && c1 <= 'B')
3663                   {     /* designation of JISX0208.1978, GB2312.1980,
3664                            or JISX0208.1980 */
3665                     reg = 0, chars96 = 0;
3666                   }
3667                 else if (c1 >= 0x28 && c1 <= 0x2B)
3668                   { /* designation of DIMENSION2_CHARS94 character set */
3669                     reg = c1 - 0x28, chars96 = 0;
3670                     ONE_MORE_BYTE (c1);
3671                   }
3672                 else if (c1 >= 0x2C && c1 <= 0x2F)
3673                   { /* designation of DIMENSION2_CHARS96 character set */
3674                     reg = c1 - 0x2C, chars96 = 1;
3675                     ONE_MORE_BYTE (c1);
3676                   }
3677                 else
3678                   goto invalid_code;
3679                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3680                 /* We must update these variables now.  */
3681                 if (reg == 0)
3682                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3683                 else if (reg == 1)
3684                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3685                 if (chars96 < 0)
3686                   goto invalid_code;
3687               }
3688               continue;
3689
3690             case 'n':           /* invocation of locking-shift-2 */
3691               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3692                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3693                 goto invalid_code;
3694               CODING_ISO_INVOCATION (coding, 0) = 2;
3695               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3696               continue;
3697
3698             case 'o':           /* invocation of locking-shift-3 */
3699               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3700                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3701                 goto invalid_code;
3702               CODING_ISO_INVOCATION (coding, 0) = 3;
3703               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3704               continue;
3705
3706             case 'N':           /* invocation of single-shift-2 */
3707               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3708                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3709                 goto invalid_code;
3710               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3711               if (charset_id_2 < 0)
3712                 charset = CHARSET_FROM_ID (charset_ascii);
3713               else
3714                 charset = CHARSET_FROM_ID (charset_id_2);
3715               ONE_MORE_BYTE (c1);
3716               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3717                 goto invalid_code;
3718               break;
3719
3720             case 'O':           /* invocation of single-shift-3 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3723                 goto invalid_code;
3724               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3725               if (charset_id_3 < 0)
3726                 charset = CHARSET_FROM_ID (charset_ascii);
3727               else
3728                 charset = CHARSET_FROM_ID (charset_id_3);
3729               ONE_MORE_BYTE (c1);
3730               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3731                 goto invalid_code;
3732               break;
3733
3734             case '0': case '2': case '3': case '4': /* start composition */
3735               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3736                 goto invalid_code;
3737               if (last_id != charset_ascii)
3738                 {
3739                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3740                   last_id = charset_ascii;
3741                   last_offset = char_offset;
3742                 }
3743               DECODE_COMPOSITION_START (c1);
3744               continue;
3745
3746             case '1':           /* end composition */
3747               if (cmp_status->state == COMPOSING_NO)
3748                 goto invalid_code;
3749               DECODE_COMPOSITION_END ();
3750               continue;
3751
3752             case '[':           /* specification of direction */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3754                 goto invalid_code;
3755               /* For the moment, nested direction is not supported.
3756                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3757                  left-to-right, and nonzero means right-to-left.  */
3758               ONE_MORE_BYTE (c1);
3759               switch (c1)
3760                 {
3761                 case ']':       /* end of the current direction */
3762                   coding->mode &= ~CODING_MODE_DIRECTION;
3763
3764                 case '0':       /* end of the current direction */
3765                 case '1':       /* start of left-to-right direction */
3766                   ONE_MORE_BYTE (c1);
3767                   if (c1 == ']')
3768                     coding->mode &= ~CODING_MODE_DIRECTION;
3769                   else
3770                     goto invalid_code;
3771                   break;
3772
3773                 case '2':       /* start of right-to-left direction */
3774                   ONE_MORE_BYTE (c1);
3775                   if (c1 == ']')
3776                     coding->mode |= CODING_MODE_DIRECTION;
3777                   else
3778                     goto invalid_code;
3779                   break;
3780
3781                 default:
3782                   goto invalid_code;
3783                 }
3784               continue;
3785
3786             case '%':
3787               ONE_MORE_BYTE (c1);
3788               if (c1 == '/')
3789                 {
3790                   /* CTEXT extended segment:
3791                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3792                      We keep these bytes as is for the moment.
3793                      They may be decoded by post-read-conversion.  */
3794                   int dim, M, L;
3795                   int size;
3796
3797                   ONE_MORE_BYTE (dim);
3798                   if (dim < '0' || dim > '4')
3799                     goto invalid_code;
3800                   ONE_MORE_BYTE (M);
3801                   if (M < 128)
3802                     goto invalid_code;
3803                   ONE_MORE_BYTE (L);
3804                   if (L < 128)
3805                     goto invalid_code;
3806                   size = ((M - 128) * 128) + (L - 128);
3807                   if (charbuf + 6 > charbuf_end)
3808                     goto break_loop;
3809                   *charbuf++ = ISO_CODE_ESC;
3810                   *charbuf++ = '%';
3811                   *charbuf++ = '/';
3812                   *charbuf++ = dim;
3813                   *charbuf++ = BYTE8_TO_CHAR (M);
3814                   *charbuf++ = BYTE8_TO_CHAR (L);
3815                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3816                 }
3817               else if (c1 == 'G')
3818                 {
3819                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3820                      ESC % G --UTF-8-BYTES-- ESC % @
3821                      We keep these bytes as is for the moment.
3822                      They may be decoded by post-read-conversion.  */
3823                   if (charbuf + 3 > charbuf_end)
3824                     goto break_loop;
3825                   *charbuf++ = ISO_CODE_ESC;
3826                   *charbuf++ = '%';
3827                   *charbuf++ = 'G';
3828                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3829                 }
3830               else
3831                 goto invalid_code;
3832               continue;
3833               break;
3834
3835             default:
3836               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3837                 goto invalid_code;
3838               {
3839                 int reg, chars96;
3840
3841                 if (c1 >= 0x28 && c1 <= 0x2B)
3842                   { /* designation of DIMENSION1_CHARS94 character set */
3843                     reg = c1 - 0x28, chars96 = 0;
3844                     ONE_MORE_BYTE (c1);
3845                   }
3846                 else if (c1 >= 0x2C && c1 <= 0x2F)
3847                   { /* designation of DIMENSION1_CHARS96 character set */
3848                     reg = c1 - 0x2C, chars96 = 1;
3849                     ONE_MORE_BYTE (c1);
3850                   }
3851                 else
3852                   goto invalid_code;
3853                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3854                 /* We must update these variables now.  */
3855                 if (reg == 0)
3856                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3857                 else if (reg == 1)
3858                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3859                 if (chars96 < 0)
3860                   goto invalid_code;
3861               }
3862               continue;
3863             }
3864           break;
3865
3866         default:
3867           abort ();
3868         }
3869
3870       if (cmp_status->state == COMPOSING_NO
3871           && charset->id != charset_ascii
3872           && last_id != charset->id)
3873         {
3874           if (last_id != charset_ascii)
3875             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3876           last_id = charset->id;
3877           last_offset = char_offset;
3878         }
3879
3880       /* Now we know CHARSET and 1st position code C1 of a character.
3881          Produce a decoded character while getting 2nd and 3rd
3882          position codes C2, C3 if necessary.  */
3883       if (CHARSET_DIMENSION (charset) > 1)
3884         {
3885           ONE_MORE_BYTE (c2);
3886           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3887               || ((c1 & 0x80) != (c2 & 0x80)))
3888             /* C2 is not in a valid range.  */
3889             goto invalid_code;
3890           if (CHARSET_DIMENSION (charset) == 2)
3891             c1 = (c1 << 8) | c2;
3892           else
3893             {
3894               ONE_MORE_BYTE (c3);
3895               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3896                   || ((c1 & 0x80) != (c3 & 0x80)))
3897                 /* C3 is not in a valid range.  */
3898                 goto invalid_code;
3899               c1 = (c1 << 16) | (c2 << 8) | c2;
3900             }
3901         }
3902       c1 &= 0x7F7F7F;
3903       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3904       if (c < 0)
3905         {
3906           MAYBE_FINISH_COMPOSITION ();
3907           for (; src_base < src; src_base++, char_offset++)
3908             {
3909               if (ASCII_BYTE_P (*src_base))
3910                 *charbuf++ = *src_base;
3911               else
3912                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3913             }
3914         }
3915       else if (cmp_status->state == COMPOSING_NO)
3916         {
3917           *charbuf++ = c;
3918           char_offset++;
3919         }
3920       else if ((cmp_status->state == COMPOSING_CHAR
3921                 ? cmp_status->nchars
3922                 : cmp_status->ncomps)
3923                >= MAX_COMPOSITION_COMPONENTS)
3924         {
3925           /* Too long composition.  */
3926           MAYBE_FINISH_COMPOSITION ();
3927           *charbuf++ = c;
3928           char_offset++;
3929         }
3930       else
3931         STORE_COMPOSITION_CHAR (c);
3932       continue;
3933
3934     invalid_code:
3935       MAYBE_FINISH_COMPOSITION ();
3936       src = src_base;
3937       consumed_chars = consumed_chars_base;
3938       ONE_MORE_BYTE (c);
3939       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3940       char_offset++;
3941       coding->errors++;
3942       continue;
3943
3944     break_loop:
3945       break;
3946     }
3947
3948  no_more_source:
3949   if (cmp_status->state != COMPOSING_NO)
3950     {
3951       if (coding->mode & CODING_MODE_LAST_BLOCK)
3952         MAYBE_FINISH_COMPOSITION ();
3953       else
3954         {
3955           charbuf -= cmp_status->length;
3956           for (i = 0; i < cmp_status->length; i++)
3957             cmp_status->carryover[i] = charbuf[i];
3958         }
3959     }
3960   else if (last_id != charset_ascii)
3961     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3962   coding->consumed_char += consumed_chars_base;
3963   coding->consumed = src_base - coding->source;
3964   coding->charbuf_used = charbuf - coding->charbuf;
3965 }
3966
3967
3968 /* ISO2022 encoding stuff.  */
3969
3970 /*
3971    For encoding, it is not enough to say just "ISO2022"; we have to
3972    specify more details.  In Emacs, each coding system of ISO2022
3973    variant has the following specifications:
3974         1. Initial designation to G0 thru G3.
3975         2. Allows short-form designation?
3976         3. ASCII should be designated to G0 before control characters?
3977         4. ASCII should be designated to G0 at end of line?
3978         5. 7-bit environment or 8-bit environment?
3979         6. Use locking-shift?
3980         7. Use Single-shift?
3981    And the following two are only for Japanese:
3982         8. Use ASCII in place of JIS0201-1976-Roman?
3983         9. Use JISX0208-1983 in place of JISX0208-1978?
3984    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3985    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3986    details.
3987 */
3988
3989 /* Produce codes (escape sequence) for designating CHARSET to graphic
3990    register REG at DST, and increment DST.  REG indexes the four-byte
3991    intermediate strings below, so it is expected to be 0..3.  A 94-char
3992    multibyte CHARSET gets the short form (no intermediate byte) unless
3993    CODING forces long form, REG is not 0, or <final-char> is not '@'..'B'.  */
3994 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3995   do {                                                                  \
3996     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3997     const char *intermediate_char_94 = "()*+";  /* 94-char sets */      \
3998     const char *intermediate_char_96 = ",-./";  /* 96-char sets */      \
3999     int revision = -1;                                                  \
4000     /* Optional prefix: ESC '&' followed by '@' + revision.  */         \
4001     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4002       revision = CHARSET_ISO_REVISION (charset);                        \
4003                                                                         \
4004     if (revision >= 0)                                                  \
4005       {                                                                 \
4006         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4007         EMIT_ONE_BYTE ('@' + revision);                                 \
4008       }                                                                 \
4009     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4010     if (CHARSET_DIMENSION (charset) == 1)                               \
4011       { /* One-byte set: ESC <intermediate> <final>.  */                \
4012         int b;                                                          \
4013         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4014           b = intermediate_char_94[reg];                                \
4015         else                                                            \
4016           b = intermediate_char_96[reg];                                \
4017         EMIT_ONE_ASCII_BYTE (b);                                        \
4018       }                                                                 \
4019     else                                                                \
4020       { /* Multibyte set: ESC '$' [<intermediate>] <final>.  */         \
4021         EMIT_ONE_ASCII_BYTE ('$');                                      \
4022         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4023           { /* Short form: the intermediate byte may be omitted.  */    \
4024             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4025                 || reg != 0                                             \
4026                 || final_char < '@' || final_char > 'B')                \
4027               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4028           }                                                             \
4029         else                                                            \
4030           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4031       }                                                                 \
4032     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4033     /* Remember what REG now holds.  */                                 \
4034     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4035   } while (0)
4036
4037
4038 /* The following two macros produce codes (control character or escape
4039    sequence) for ISO2022 single-shift functions (single-shift-2 and
4040    single-shift-3).  Both expect a variable named `coding' in scope and
4041    set its single-shifting flag, which ENCODE_ISO_CHARACTER_* clears.  */
4042 #define ENCODE_SINGLE_SHIFT_2                                           \
4043   do {                                                                  \
4044     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4045       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');  /* 7-bit form */       \
4046     else                                                                \
4047       EMIT_ONE_BYTE (ISO_CODE_SS2);  /* single control byte */          \
4048     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4049   } while (0)
4050
4051
4052 #define ENCODE_SINGLE_SHIFT_3 /* single-shift-3; uses caller's `coding' */ \
4053   do {                                                                  \
4054     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4055       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');  /* 7-bit form */       \
4056     else                                                                \
4057       EMIT_ONE_BYTE (ISO_CODE_SS3);  /* single control byte */          \
4058     CODING_ISO_SINGLE_SHIFTING (coding) = 1;  /* next char is shifted */ \
4059   } while (0)
4060
4061
4062 /* The following four macros produce codes (control character or
4063    escape sequence) for ISO2022 locking-shift functions (shift-in,
4064    shift-out, locking-shift-2, and locking-shift-3).  Each also records
4065    in `coding' which graphic register is now invoked to graphic plane 0.  */
4066 #define ENCODE_SHIFT_IN                                 \
4067   do {                                                  \
4068     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4069     CODING_ISO_INVOCATION (coding, 0) = 0;  /* register 0 */ \
4070   } while (0)
4071
4072
4073 #define ENCODE_SHIFT_OUT  /* shift-out; uses caller's `coding' */ \
4074   do {                                                  \
4075     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4076     CODING_ISO_INVOCATION (coding, 0) = 1;  /* plane 0 now invokes register 1 */ \
4077   } while (0)
4078
4079
4080 #define ENCODE_LOCKING_SHIFT_2  /* uses caller's `coding' */ \
4081   do {                                                  \
4082     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');  /* emits ESC 'n' */ \
4083     CODING_ISO_INVOCATION (coding, 0) = 2;  /* plane 0 now invokes register 2 */ \
4084   } while (0)
4085
4086
4087 #define ENCODE_LOCKING_SHIFT_3  /* LS3 is ESC 'o'; ESC 'n' is LS2 */ \
4088   do {                                                  \
4089     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4090     CODING_ISO_INVOCATION (coding, 0) = 3;  /* plane 0 now invokes register 3 */ \
4091   } while (0)
4092
4093
4094 /* Produce codes for a DIMENSION1 character whose character set is
4095    CHARSET and whose position-code is C1.  Designation and invocation
4096    sequences are also produced in advance if necessary.  CHARSET must be
4097    an lvalue; `coding', `dst' and `produced_chars' come from the caller.  */
4098 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4099   do { /* Loop until a branch emits C1 and breaks out.  */              \
4100     int id = CHARSET_ID (charset);                                      \
4101                                                                         \
4102     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4103         && id == charset_ascii)                                         \
4104       { /* Substitute JISX0201-Roman for ASCII.  */                     \
4105         id = charset_jisx0201_roman;                                    \
4106         charset = CHARSET_FROM_ID (id);                                 \
4107       }                                                                 \
4108                                                                         \
4109     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4110       { /* A single-shift is pending: emit C1 and clear the flag.  */   \
4111         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4112           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4113         else                                                            \
4114           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4115         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4116         break;                                                          \
4117       }                                                                 \
4118     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4119       { /* Invoked to graphic plane 0: high bit clear.  */              \
4120         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4121         break;                                                          \
4122       }                                                                 \
4123     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4124       { /* Invoked to graphic plane 1: high bit set.  */                \
4125         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4126         break;                                                          \
4127       }                                                                 \
4128     else                                                                \
4129       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4130          must invoke it, or, at first, designate it to some graphic     \
4131          register.  Then repeat the loop to actually produce the        \
4132          character.  */                                                 \
4133       dst = encode_invocation_designation (charset, coding, dst,        \
4134                                            &produced_chars);            \
4135   } while (1)
4136
4137
4138 /* Produce codes for a DIMENSION2 character whose character set is
4139    CHARSET and whose position-codes are C1 and C2.  Designation and
4140    invocation codes are also produced in advance if necessary.  CHARSET
4141    must be an lvalue; `coding', `dst', `produced_chars' are the caller's.  */
4142 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4143   do { /* Loop until a branch emits C1, C2 and breaks out.  */          \
4144     int id = CHARSET_ID (charset);                                      \
4145                                                                         \
4146     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4147         && id == charset_jisx0208)                                      \
4148       { /* Substitute JISX0208-1978 for JISX0208.  */                   \
4149         id = charset_jisx0208_1978;                                     \
4150         charset = CHARSET_FROM_ID (id);                                 \
4151       }                                                                 \
4152                                                                         \
4153     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4154       { /* A single-shift is pending: emit and clear the flag.  */      \
4155         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4156           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4157         else                                                            \
4158           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4159         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4160         break;                                                          \
4161       }                                                                 \
4162     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4163       { /* Invoked to graphic plane 0: high bits clear.  */             \
4164         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4165         break;                                                          \
4166       }                                                                 \
4167     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4168       { /* Invoked to graphic plane 1: high bits set.  */               \
4169         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4170         break;                                                          \
4171       }                                                                 \
4172     else                                                                \
4173       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4174          must invoke it, or, at first, designate it to some graphic     \
4175          register.  Then repeat the loop to actually produce the        \
4176          character.  */                                                 \
4177       dst = encode_invocation_designation (charset, coding, dst,        \
4178                                            &produced_chars);            \
4179   } while (1)
4180
4181
4182 #define ENCODE_ISO_CHARACTER(charset, c) /* Encode C using CHARSET.  */    \
4183   do {                                                                     \
4184     unsigned code;  /* presumably filled in by CODING_ENCODE_CHAR */       \
4185     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4186     /* NOTE(review): any dimension other than 1 takes the 2-byte path.  */ \
4187     if (CHARSET_DIMENSION (charset) == 1)                                  \
4188       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4189     else                                                                   \
4190       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4191   } while (0)
4192
4193
4194 /* Make CHARSET usable for output: designate it to a graphic register
4195    unless one already holds it, then invoke that register unless a
4196    graphic plane already does.  Codes are produced at DST, and the
4197    element `spec.iso_2022' of *CODING is updated.  Return new DST.  */
4198 static unsigned char *
4199 encode_invocation_designation (struct charset *charset,
4200                                struct coding_system *coding,
4201                                unsigned char *dst, ptrdiff_t *p_nchars)
4202 {
4203   /* The emitting macros used below presumably refer to `multibytep',
4204      `produced_chars' and `dst' by name (TODO confirm), so these three
4205      names are kept as they are.  */
4206   ptrdiff_t produced_chars = *p_nchars;
4207   bool multibytep = coding->dst_multibyte;
4208   int id = CHARSET_ID (charset);
4209   int reg = 0;                  /* graphic register number */
4210
4211   /* Look for a graphic register already designated to CHARSET.  */
4212   while (reg < 4 && id != CODING_ISO_DESIGNATION (coding, reg))
4213     reg++;
4214
4215   if (reg == 4)
4216     {
4217       /* CHARSET is not yet designated to any graphic register.  Take
4218          the register it requests, or graphic register 0 when it
4219          requests none, and designate it there.  */
4220       int requested = CODING_ISO_REQUEST (coding, id);
4221
4222       reg = requested < 0 ? 0 : requested;
4223       ENCODE_DESIGNATION (charset, reg, coding);
4224     }
4225
4226   if (CODING_ISO_INVOCATION (coding, 0) == reg
4227       || CODING_ISO_INVOCATION (coding, 1) == reg)
4228     {
4229       /* REG is already invoked to a graphic plane.  */
4230       *p_nchars = produced_chars;
4231       return dst;
4232     }
4233
4234   /* REG is not invoked to any graphic plane, so invoke it: shift-in
4235      and shift-out for registers 0 and 1; for registers 2 and 3 a
4236      single-shift if CODING allows it, else a locking-shift.  Any
4237      other value of REG produces nothing.  */
4238   if (reg == 0)
4239     ENCODE_SHIFT_IN;
4240   else if (reg == 1)
4241     ENCODE_SHIFT_OUT;
4242   else if (reg == 2)
4243     {
4244       if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4245         ENCODE_SINGLE_SHIFT_2;
4246       else
4247         ENCODE_LOCKING_SHIFT_2;
4248     }
4249   else if (reg == 3)
4250     {
4251       if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4252         ENCODE_SINGLE_SHIFT_3;
4253       else
4254         ENCODE_LOCKING_SHIFT_3;
4255     }
4256
4257   *p_nchars = produced_chars;
4258   return dst;
4259 }
4260
4261
4262 /* Produce codes that reset the graphic planes and registers to their
4263    initial state: shift-in if needed, then redesignate changed registers.  */
4264 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4265   do {                                                                  \
4266     int reg;                                                            \
4267     struct charset *charset;                                            \
4268     /* Only the invocation of graphic plane 0 is checked here.  */      \
4269     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4270       ENCODE_SHIFT_IN;                                                  \
4271     for (reg = 0; reg < 4; reg++)                                       \
4272       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4273           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4274               != CODING_ISO_INITIAL (coding, reg)))                     \
4275         { /* Registers with a negative initial value are skipped.  */   \
4276           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4277           ENCODE_DESIGNATION (charset, reg, coding);                    \
4278         }                                                               \
4279   } while (0)
4280
4281
4282 /* Scan the characters from CHARBUF up to the first newline (or
4283    CHARBUF_END) and produce, at DST, designation sequences for the
4284    charsets in that line which request a graphic register.  Return the
4285    number of produced bytes.  DST should not directly point a buffer
4286    text area which may be relocated by char_charset call.
4287
4288    If the current block ends before any end-of-line, we may fail to
4289    find all the necessary designations.  */
4290 static ptrdiff_t
4291 encode_designation_at_bol (struct coding_system *coding,
4292                            int *charbuf, int *charbuf_end,
4293                            unsigned char *dst)
4294 {
4295   /* ENCODE_DESIGNATION's emitting macros presumably refer to
4296      `multibytep', `produced_chars' and `dst' by name (TODO confirm).  */
4297   bool multibytep = coding->dst_multibyte;
4298   ptrdiff_t produced_chars = 0;
4299   unsigned char *const start = dst;
4300   /* wanted[REG] is the ID of the charset to designate to graphic
4301      register REG, or -1 if nothing was requested for it.  */
4302   int wanted[4] = { -1, -1, -1, -1 };
4303   int nfound = 0;
4304   int reg;
4305   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
4306   Lisp_Object charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4307
4308   if (EQ (charset_list, Qiso_2022))
4309     charset_list = Viso_2022_charset_list;
4310
4311   /* Collect the first charset seen for each requested register.  */
4312   for (; charbuf < charbuf_end && nfound < 4; charbuf++)
4313     {
4314       struct charset *charset;
4315       int c = *charbuf;
4316       int id;
4317
4318       if (c == '\n')
4319         break;
4320       charset = char_charset (c, charset_list, NULL);
4321       id = CHARSET_ID (charset);
4322       reg = CODING_ISO_REQUEST (coding, id);
4323       if (reg < 0 || wanted[reg] >= 0)
4324         continue;
4325       wanted[reg] = id;
4326       nfound++;
4327     }
4328
4329   if (nfound == 0)
4330     return 0;
4331
4332   /* Emit a designation for each register whose content must change.  */
4333   for (reg = 0; reg < 4; reg++)
4334     if (wanted[reg] >= 0
4335         && wanted[reg] != CODING_ISO_DESIGNATION (coding, reg))
4336       ENCODE_DESIGNATION (CHARSET_FROM_ID (wanted[reg]), reg, coding);
4337
4338   return dst - start;
4339 }
4340
4341 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4342    It encodes coding->charbuf as ISO2022 into coding->destination.  */
4343 static bool
4344 encode_coding_iso_2022 (struct coding_system *coding)
4345 {
4346   bool multibytep = coding->dst_multibyte;
4347   int *charbuf = coding->charbuf;
4348   int *charbuf_end = charbuf + coding->charbuf_used;
4349   unsigned char *dst = coding->destination + coding->produced;
4350   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4351   int safe_room = 16;           /* passed to ASSURE_DESTINATION below */
4352   bool bol_designation
4353     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4354        && CODING_ISO_BOL (coding));
4355   ptrdiff_t produced_chars = 0;
4356   Lisp_Object attrs, eol_type, charset_list;
4357   bool ascii_compatible;
4358   int c;
4359   int preferred_charset_id = -1;        /* set by a charset annotation */
4360
4361   CODING_GET_INFO (coding, attrs, charset_list);
4362   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4363   if (VECTORP (eol_type))
4364     eol_type = Qunix;
4365
4366   setup_iso_safe_charsets (attrs);
4367   /* Charset list may have been changed.  */
4368   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4369   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4370
4371   ascii_compatible
4372     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4373        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4374                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4375
4376   while (charbuf < charbuf_end)
4377     {
4378       ASSURE_DESTINATION (safe_room);
4379
4380       if (bol_designation)
4381         {
4382           /* We have to produce designation sequences if any now.  */
4383           unsigned char desig_buf[16];  /* NOTE(review): may be too small if revision prefixes are emitted -- confirm */
4384           int nbytes;
4385           ptrdiff_t offset;
4386
4387           charset_map_loaded = 0;       /* reset so the test below sees a change made by the call */
4388           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4389                                               desig_buf);
4390           if (charset_map_loaded
4391               && (offset = coding_change_destination (coding)))
4392             {
4393               dst += offset;
4394               dst_end += offset;
4395             }
4396           memcpy (dst, desig_buf, nbytes);
4397           dst += nbytes;
4398           /* We are sure that designation sequences are all ASCII bytes.  */
4399           produced_chars += nbytes;
4400           bol_designation = 0;
4401           ASSURE_DESTINATION (safe_room);
4402         }
4403
4404       c = *charbuf++;
4405
4406       if (c < 0)
4407         {
4408           /* Handle an annotation.  */
4409           switch (*charbuf)             /* this element selects the annotation kind */
4410             {
4411             case CODING_ANNOTATE_COMPOSITION_MASK:
4412               /* Not yet implemented.  */
4413               break;
4414             case CODING_ANNOTATE_CHARSET_MASK:
4415               preferred_charset_id = charbuf[2];
4416               if (preferred_charset_id >= 0
4417                   && NILP (Fmemq (make_number (preferred_charset_id),
4418                                   charset_list)))
4419                 preferred_charset_id = -1;      /* not in CHARSET_LIST: ignore it */
4420               break;
4421             default:
4422               abort ();
4423             }
4424           charbuf += -c - 1;            /* presumably -C is the annotation length; skip the rest */
4425           continue;
4426         }
4427
4428       /* Now encode the character C.  */
4429       if (c < 0x20 || c == 0x7F)
4430         {
4431           if (c == '\n'
4432               || (c == '\r' && EQ (eol_type, Qmac)))
4433             {
4434               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4435                 ENCODE_RESET_PLANE_AND_REGISTER ();
4436               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4437                 {
4438                   int i;
4439
4440                   for (i = 0; i < 4; i++)
4441                     CODING_ISO_DESIGNATION (coding, i)
4442                       = CODING_ISO_INITIAL (coding, i);
4443                 }
4444               bol_designation = ((CODING_ISO_FLAGS (coding)
4445                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4446                                  != 0);
4447             }
4448           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4449             ENCODE_RESET_PLANE_AND_REGISTER ();
4450           EMIT_ONE_ASCII_BYTE (c);      /* the control character itself */
4451         }
4452       else if (ASCII_CHAR_P (c))
4453         {
4454           if (ascii_compatible)
4455             EMIT_ONE_ASCII_BYTE (c);
4456           else
4457             {
4458               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4459               ENCODE_ISO_CHARACTER (charset, c);
4460             }
4461         }
4462       else if (CHAR_BYTE8_P (c))
4463         {
4464           c = CHAR_TO_BYTE8 (c);        /* emitted as a single byte, no charset lookup */
4465           EMIT_ONE_BYTE (c);
4466         }
4467       else
4468         {
4469           struct charset *charset;
4470
4471           if (preferred_charset_id >= 0)
4472             {
4473               bool result;
4474
4475               charset = CHARSET_FROM_ID (preferred_charset_id);
4476               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4477               if (! result)
4478                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4479                                      NULL, charset);
4480             }
4481           else
4482             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4483                                  NULL, charset);
4484           if (!charset)                 /* no charset found for C: substitute a character */
4485             {
4486               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4487                 {
4488                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4489                   charset = CHARSET_FROM_ID (charset_ascii);
4490                 }
4491               else
4492                 {
4493                   c = coding->default_char;
4494                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4495                                        charset_list, NULL, charset);
4496                 }
4497             }
4498           ENCODE_ISO_CHARACTER (charset, c);
4499         }
4500     }
4501
4502   if (coding->mode & CODING_MODE_LAST_BLOCK
4503       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4504     {
4505       ASSURE_DESTINATION (safe_room);
4506       ENCODE_RESET_PLANE_AND_REGISTER ();
4507     }
4508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4509   CODING_ISO_BOL (coding) = bol_designation;    /* read back on entry to the next call */
4510   coding->produced_char += produced_chars;
4511   coding->produced = dst - coding->destination;
4512   return 0;
4513 }
4514
4515 \f
4516 /*** 8. SJIS and BIG5 handlers ***/
4517
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
4521
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
4528
4529    --- CODE RANGE of SJIS ---
4530    (character set)      (range)
4531    ASCII                0x00 .. 0x7F
4532    KATAKANA-JISX0201    0xA0 .. 0xDF
4533    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4534             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4535    -------------------------------
4536
4537 */
4538
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4542
4543    --- CODE RANGE of BIG5 ---
4544    (character set)      (range)
4545    ASCII                0x00 .. 0x7F
4546    Big5 (1st byte)      0xA1 .. 0xFE
4547         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4548    --------------------------
4549
4550   */
4551
4552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4553    Return true if a text is encoded in SJIS.  */
4554
4555 static bool
4556 detect_coding_sjis (struct coding_system *coding,
4557                     struct coding_detection_info *detect_info)
4558 {
4559   const unsigned char *src = coding->source, *src_base;
4560   const unsigned char *src_end = coding->source + coding->src_bytes;
4561   bool multibytep = coding->src_multibyte;
4562   ptrdiff_t consumed_chars = 0;
4563   int found = 0;
4564   int c;
4565   Lisp_Object attrs, charset_list;
4566   int max_first_byte_of_2_byte_code;
4567
4568   CODING_GET_INFO (coding, attrs, charset_list);
4569   max_first_byte_of_2_byte_code
4570     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4571
4572   detect_info->checked |= CATEGORY_MASK_SJIS;
4573   /* A coding system of this category is always ASCII compatible.  */
4574   src += coding->head_ascii;
4575
4576   while (1)
4577     {
4578       src_base = src;
4579       ONE_MORE_BYTE (c);
4580       if (c < 0x80)
4581         continue;
4582       if ((c >= 0x81 && c <= 0x9F)
4583           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4584         {
4585           ONE_MORE_BYTE (c);
4586           if (c < 0x40 || c == 0x7F || c > 0xFC)
4587             break;
4588           found = CATEGORY_MASK_SJIS;
4589         }
4590       else if (c >= 0xA0 && c < 0xE0)
4591         found = CATEGORY_MASK_SJIS;
4592       else
4593         break;
4594     }
4595   detect_info->rejected |= CATEGORY_MASK_SJIS;
4596   return 0;
4597
4598  no_more_source:
4599   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4600     {
4601       detect_info->rejected |= CATEGORY_MASK_SJIS;
4602       return 0;
4603     }
4604   detect_info->found |= found;
4605   return 1;
4606 }
4607
4608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4609    Return true if a text is encoded in BIG5.  */
4610
4611 static bool
4612 detect_coding_big5 (struct coding_system *coding,
4613                     struct coding_detection_info *detect_info)
4614 {
4615   const unsigned char *src = coding->source, *src_base;
4616   const unsigned char *src_end = coding->source + coding->src_bytes;
4617   bool multibytep = coding->src_multibyte;
4618   ptrdiff_t consumed_chars = 0;
4619   int found = 0;
4620   int c;
4621
4622   detect_info->checked |= CATEGORY_MASK_BIG5;
4623   /* A coding system of this category is always ASCII compatible.  */
4624   src += coding->head_ascii;
4625
4626   while (1)
4627     {
4628       src_base = src;
4629       ONE_MORE_BYTE (c);
4630       if (c < 0x80)
4631         continue;
4632       if (c >= 0xA1)
4633         {
4634           ONE_MORE_BYTE (c);
4635           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4636             return 0;
4637           found = CATEGORY_MASK_BIG5;
4638         }
4639       else
4640         break;
4641     }
4642   detect_info->rejected |= CATEGORY_MASK_BIG5;
4643   return 0;
4644
4645  no_more_source:
4646   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4647     {
4648       detect_info->rejected |= CATEGORY_MASK_BIG5;
4649       return 0;
4650     }
4651   detect_info->found |= found;
4652   return 1;
4653 }
4654
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SJIS bytes from CODING->source, starting at offset
   CODING->consumed, and append the resulting characters to
   CODING->charbuf starting at index CODING->charbuf_used.  Decoding
   stops when the character buffer is nearly full or the source runs
   out.  A byte that cannot start or complete a valid code is stored
   as a raw byte (via BYTE8_TO_CHAR) and counted in CODING->errors.
   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect the progress made.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code currently being decoded; progress is reported
     relative to this pointer.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the current run of same-charset characters.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list is read positionally: roman, kana, kanji, and an
     optional second kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A byte read ahead after a CR has not been
             decoded yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Use the byte read ahead on the previous iteration, if any.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      /* NOTE(review): a negative C presumably comes from
         ONE_MORE_BYTE on a multibyte source; confirm against the
         macro definition.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS-style EOL, read the byte following a CR right
             away and keep it for the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          /* Lead bytes 0xF0..0xFC are accepted only when the charset
             list supplied a fourth charset.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a switch to a different non-ASCII charset, close the run
         of the previous non-ASCII charset with an annotation and
         start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this code and consume exactly one byte
         of it as a raw byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* NOTE(review): no `goto' to this label is visible here; it is
     presumably the target ONE_MORE_BYTE jumps to when the source is
     exhausted.  The `break' above also falls through to it.  */
 no_more_source:
  /* Flush the annotation for the run still open, then report
     progress up to SRC_BASE, the start of the last code attempted.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4774
/* Decode BIG5 bytes from CODING->source, starting at offset
   CODING->consumed, and append the resulting characters to
   CODING->charbuf starting at index CODING->charbuf_used.  Decoding
   stops when the character buffer is nearly full or the source runs
   out.  A byte that cannot start or complete a valid code is stored
   as a raw byte (via BYTE8_TO_CHAR) and counted in CODING->errors.
   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect the progress made.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code currently being decoded; progress is reported
     relative to this pointer.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
     LAST_ID describe the current run of same-charset characters.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list is read positionally: roman first, then Big5.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A byte read ahead after a CR has not been
             decoded yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Use the byte read ahead on the previous iteration, if any.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* NOTE(review): a negative C presumably comes from
         ONE_MORE_BYTE on a multibyte source; confirm against the
         macro definition.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS-style EOL, read the byte following a CR right
             away and keep it for the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* Lead byte must be in 0xA1..0xFE; trail byte must be in
             0x40..0x7E or 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a switch to a different non-ASCII charset, close the run
         of the previous non-ASCII charset with an annotation and
         start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this code and consume exactly one byte
         of it as a raw byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* NOTE(review): no `goto' to this label is visible here; it is
     presumably the target ONE_MORE_BYTE jumps to when the source is
     exhausted.  The `break' above also falls through to it.  */
 no_more_source:
  /* Flush the annotation for the run still open, then report
     progress up to SRC_BASE, the start of the last code attempted.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4870
4871 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4872    This function can encode charsets `ascii', `katakana-jisx0201',
4873    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4874    are sure that all these charsets are registered as official charset
4875    (i.e. do not have extended leading-codes).  Characters of other
4876    charsets are produced without any encoding.  */
4877
4878 static bool
4879 encode_coding_sjis (struct coding_system *coding)
4880 {
4881   bool multibytep = coding->dst_multibyte;
4882   int *charbuf = coding->charbuf;
4883   int *charbuf_end = charbuf + coding->charbuf_used;
4884   unsigned char *dst = coding->destination + coding->produced;
4885   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4886   int safe_room = 4;
4887   ptrdiff_t produced_chars = 0;
4888   Lisp_Object attrs, charset_list, val;
4889   bool ascii_compatible;
4890   struct charset *charset_kanji, *charset_kana;
4891   struct charset *charset_kanji2;
4892   int c;
4893
4894   CODING_GET_INFO (coding, attrs, charset_list);
4895   val = XCDR (charset_list);
4896   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4897   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4898   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4899
4900   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4901
4902   while (charbuf < charbuf_end)
4903     {
4904       ASSURE_DESTINATION (safe_room);
4905       c = *charbuf++;
4906       /* Now encode the character C.  */
4907       if (ASCII_CHAR_P (c) && ascii_compatible)
4908         EMIT_ONE_ASCII_BYTE (c);
4909       else if (CHAR_BYTE8_P (c))
4910         {
4911           c = CHAR_TO_BYTE8 (c);
4912           EMIT_ONE_BYTE (c);
4913         }
4914       else
4915         {
4916           unsigned code;
4917           struct charset *charset;
4918           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4919                                &code, charset);
4920
4921           if (!charset)
4922             {
4923               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4924                 {
4925                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4926                   charset = CHARSET_FROM_ID (charset_ascii);
4927                 }
4928               else
4929                 {
4930                   c = coding->default_char;
4931                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4932                                        charset_list, &code, charset);
4933                 }
4934             }
4935           if (code == CHARSET_INVALID_CODE (charset))
4936             abort ();
4937           if (charset == charset_kanji)
4938             {
4939               int c1, c2;
4940               JIS_TO_SJIS (code);
4941               c1 = code >> 8, c2 = code & 0xFF;
4942               EMIT_TWO_BYTES (c1, c2);
4943             }
4944           else if (charset == charset_kana)
4945             EMIT_ONE_BYTE (code | 0x80);
4946           else if (charset_kanji2 && charset == charset_kanji2)
4947             {
4948               int c1, c2;
4949
4950               c1 = code >> 8;
4951               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4952                   || c1 == 0x28
4953                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4954                 {
4955                   JIS_TO_SJIS2 (code);
4956                   c1 = code >> 8, c2 = code & 0xFF;
4957                   EMIT_TWO_BYTES (c1, c2);
4958                 }
4959               else
4960                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4961             }
4962           else
4963             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4964         }
4965     }
4966   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4967   coding->produced_char += produced_chars;
4968   coding->produced = dst - coding->destination;
4969   return 0;
4970 }
4971
4972 static bool
4973 encode_coding_big5 (struct coding_system *coding)
4974 {
4975   bool multibytep = coding->dst_multibyte;
4976   int *charbuf = coding->charbuf;
4977   int *charbuf_end = charbuf + coding->charbuf_used;
4978   unsigned char *dst = coding->destination + coding->produced;
4979   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4980   int safe_room = 4;
4981   ptrdiff_t produced_chars = 0;
4982   Lisp_Object attrs, charset_list, val;
4983   bool ascii_compatible;
4984   struct charset *charset_big5;
4985   int c;
4986
4987   CODING_GET_INFO (coding, attrs, charset_list);
4988   val = XCDR (charset_list);
4989   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4990   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4991
4992   while (charbuf < charbuf_end)
4993     {
4994       ASSURE_DESTINATION (safe_room);
4995       c = *charbuf++;
4996       /* Now encode the character C.  */
4997       if (ASCII_CHAR_P (c) && ascii_compatible)
4998         EMIT_ONE_ASCII_BYTE (c);
4999       else if (CHAR_BYTE8_P (c))
5000         {
5001           c = CHAR_TO_BYTE8 (c);
5002           EMIT_ONE_BYTE (c);
5003         }
5004       else
5005         {
5006           unsigned code;
5007           struct charset *charset;
5008           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5009                                &code, charset);
5010
5011           if (! charset)
5012             {
5013               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5014                 {
5015                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5016                   charset = CHARSET_FROM_ID (charset_ascii);
5017                 }
5018               else
5019                 {
5020                   c = coding->default_char;
5021                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5022                                        charset_list, &code, charset);
5023                 }
5024             }
5025           if (code == CHARSET_INVALID_CODE (charset))
5026             abort ();
5027           if (charset == charset_big5)
5028             {
5029               int c1, c2;
5030
5031               c1 = code >> 8, c2 = code & 0xFF;
5032               EMIT_TWO_BYTES (c1, c2);
5033             }
5034           else
5035             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5036         }
5037     }
5038   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5039   coding->produced_char += produced_chars;
5040   coding->produced = dst - coding->destination;
5041   return 0;
5042 }
5043
5044 \f
/*** 9. CCL handlers ***/
5046
5047 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5048    Return true if a text is encoded in a coding system of which
5049    encoder/decoder are written in CCL program.  */
5050
5051 static bool
5052 detect_coding_ccl (struct coding_system *coding,
5053                    struct coding_detection_info *detect_info)
5054 {
5055   const unsigned char *src = coding->source, *src_base;
5056   const unsigned char *src_end = coding->source + coding->src_bytes;
5057   bool multibytep = coding->src_multibyte;
5058   ptrdiff_t consumed_chars = 0;
5059   int found = 0;
5060   unsigned char *valids;
5061   ptrdiff_t head_ascii = coding->head_ascii;
5062   Lisp_Object attrs;
5063
5064   detect_info->checked |= CATEGORY_MASK_CCL;
5065
5066   coding = &coding_categories[coding_category_ccl];
5067   valids = CODING_CCL_VALIDS (coding);
5068   attrs = CODING_ID_ATTRS (coding->id);
5069   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5070     src += head_ascii;
5071
5072   while (1)
5073     {
5074       int c;
5075
5076       src_base = src;
5077       ONE_MORE_BYTE (c);
5078       if (c < 0 || ! valids[c])
5079         break;
5080       if ((valids[c] > 1))
5081         found = CATEGORY_MASK_CCL;
5082     }
5083   detect_info->rejected |= CATEGORY_MASK_CCL;
5084   return 0;
5085
5086  no_more_source:
5087   detect_info->found |= found;
5088   return 1;
5089 }
5090
5091 static void
5092 decode_coding_ccl (struct coding_system *coding)
5093 {
5094   const unsigned char *src = coding->source + coding->consumed;
5095   const unsigned char *src_end = coding->source + coding->src_bytes;
5096   int *charbuf = coding->charbuf + coding->charbuf_used;
5097   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5098   ptrdiff_t consumed_chars = 0;
5099   bool multibytep = coding->src_multibyte;
5100   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5101   int source_charbuf[1024];
5102   int source_byteidx[1025];
5103   Lisp_Object attrs, charset_list;
5104
5105   CODING_GET_INFO (coding, attrs, charset_list);
5106
5107   while (1)
5108     {
5109       const unsigned char *p = src;
5110       int i = 0;
5111
5112       if (multibytep)
5113         {
5114           while (i < 1024 && p < src_end)
5115             {
5116               source_byteidx[i] = p - src;
5117               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5118             }
5119           source_byteidx[i] = p - src;
5120         }
5121       else
5122         while (i < 1024 && p < src_end)
5123           source_charbuf[i++] = *p++;
5124
5125       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5126         ccl->last_block = 1;
5127       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5128                   charset_list);
5129       charbuf += ccl->produced;
5130       if (multibytep)
5131         src += source_byteidx[ccl->consumed];
5132       else
5133         src += ccl->consumed;
5134       consumed_chars += ccl->consumed;
5135       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5136         break;
5137     }
5138
5139   switch (ccl->status)
5140     {
5141     case CCL_STAT_SUSPEND_BY_SRC:
5142       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5143       break;
5144     case CCL_STAT_SUSPEND_BY_DST:
5145       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5146       break;
5147     case CCL_STAT_QUIT:
5148     case CCL_STAT_INVALID_CMD:
5149       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5150       break;
5151     default:
5152       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5153       break;
5154     }
5155   coding->consumed_char += consumed_chars;
5156   coding->consumed = src - coding->source;
5157   coding->charbuf_used = charbuf - coding->charbuf;
5158 }
5159
5160 static bool
5161 encode_coding_ccl (struct coding_system *coding)
5162 {
5163   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5164   bool multibytep = coding->dst_multibyte;
5165   int *charbuf = coding->charbuf;
5166   int *charbuf_end = charbuf + coding->charbuf_used;
5167   unsigned char *dst = coding->destination + coding->produced;
5168   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5169   int destination_charbuf[1024];
5170   ptrdiff_t produced_chars = 0;
5171   int i;
5172   Lisp_Object attrs, charset_list;
5173
5174   CODING_GET_INFO (coding, attrs, charset_list);
5175   if (coding->consumed_char == coding->src_chars
5176       && coding->mode & CODING_MODE_LAST_BLOCK)
5177     ccl->last_block = 1;
5178
5179   do
5180     {
5181       ccl_driver (ccl, charbuf, destination_charbuf,
5182                   charbuf_end - charbuf, 1024, charset_list);
5183       if (multibytep)
5184         {
5185           ASSURE_DESTINATION (ccl->produced * 2);
5186           for (i = 0; i < ccl->produced; i++)
5187             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5188         }
5189       else
5190         {
5191           ASSURE_DESTINATION (ccl->produced);
5192           for (i = 0; i < ccl->produced; i++)
5193             *dst++ = destination_charbuf[i] & 0xFF;
5194           produced_chars += ccl->produced;
5195         }
5196       charbuf += ccl->consumed;
5197       if (ccl->status == CCL_STAT_QUIT
5198           || ccl->status == CCL_STAT_INVALID_CMD)
5199         break;
5200     }
5201   while (charbuf < charbuf_end);
5202
5203   switch (ccl->status)
5204     {
5205     case CCL_STAT_SUSPEND_BY_SRC:
5206       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5207       break;
5208     case CCL_STAT_SUSPEND_BY_DST:
5209       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5210       break;
5211     case CCL_STAT_QUIT:
5212     case CCL_STAT_INVALID_CMD:
5213       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5214       break;
5215     default:
5216       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5217       break;
5218     }
5219
5220   coding->produced_char += produced_chars;
5221   coding->produced = dst - coding->destination;
5222   return 0;
5223 }
5224
5225 \f
5226 /*** 10, 11. no-conversion handlers ***/
5227
5228 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5229
5230 static void
5231 decode_coding_raw_text (struct coding_system *coding)
5232 {
5233   bool eol_dos
5234     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5235
5236   coding->chars_at_source = 1;
5237   coding->consumed_char = coding->src_chars;
5238   coding->consumed = coding->src_bytes;
5239   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5240     {
5241       coding->consumed_char--;
5242       coding->consumed--;
5243       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5244     }
5245   else
5246     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5247 }
5248
5249 static bool
5250 encode_coding_raw_text (struct coding_system *coding)
5251 {
5252   bool multibytep = coding->dst_multibyte;
5253   int *charbuf = coding->charbuf;
5254   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5255   unsigned char *dst = coding->destination + coding->produced;
5256   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5257   ptrdiff_t produced_chars = 0;
5258   int c;
5259
5260   if (multibytep)
5261     {
5262       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5263
5264       if (coding->src_multibyte)
5265         while (charbuf < charbuf_end)
5266           {
5267             ASSURE_DESTINATION (safe_room);
5268             c = *charbuf++;
5269             if (ASCII_CHAR_P (c))
5270               EMIT_ONE_ASCII_BYTE (c);
5271             else if (CHAR_BYTE8_P (c))
5272               {
5273                 c = CHAR_TO_BYTE8 (c);
5274                 EMIT_ONE_BYTE (c);
5275               }
5276             else
5277               {
5278                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5279
5280                 CHAR_STRING_ADVANCE (c, p1);
5281                 do
5282                   {
5283                     EMIT_ONE_BYTE (*p0);
5284                     p0++;
5285                   }
5286                 while (p0 < p1);
5287               }
5288           }
5289       else
5290         while (charbuf < charbuf_end)
5291           {
5292             ASSURE_DESTINATION (safe_room);
5293             c = *charbuf++;
5294             EMIT_ONE_BYTE (c);
5295           }
5296     }
5297   else
5298     {
5299       if (coding->src_multibyte)
5300         {
5301           int safe_room = MAX_MULTIBYTE_LENGTH;
5302
5303           while (charbuf < charbuf_end)
5304             {
5305               ASSURE_DESTINATION (safe_room);
5306               c = *charbuf++;
5307               if (ASCII_CHAR_P (c))
5308                 *dst++ = c;
5309               else if (CHAR_BYTE8_P (c))
5310                 *dst++ = CHAR_TO_BYTE8 (c);
5311               else
5312                 CHAR_STRING_ADVANCE (c, dst);
5313             }
5314         }
5315       else
5316         {
5317           ASSURE_DESTINATION (charbuf_end - charbuf);
5318           while (charbuf < charbuf_end && dst < dst_end)
5319             *dst++ = *charbuf++;
5320         }
5321       produced_chars = dst - (coding->destination + coding->produced);
5322     }
5323   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5324   coding->produced_char += produced_chars;
5325   coding->produced = dst - coding->destination;
5326   return 0;
5327 }
5328
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in a charset-based coding system.

   Check the source bytes of CODING against the `charset_valids' table
   of the coding system registered for the charset category.
   CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
   Return 0, with the mask added to DETECT_INFO->rejected, when a byte
   is not valid or the source ends in the middle of a multi-byte code.
   Otherwise return 1, adding the mask to DETECT_INFO->found if an
   accepted byte >= 0x80 was seen.  */

static bool
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  ptrdiff_t head_ascii = coding->head_ascii;
  bool check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* The source pointers above came from the caller's CODING.  From
     here on CODING names the coding system of the charset
     category.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* For coding systems named iso-8859-* or iso-latin-*, bytes in
     0x80..0x9F are accepted only if Vlatin_extra_code_table allows
     them (see the check in the loop below).  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      /* VAL is nil for an invalid first byte, an integer charset ID,
         or otherwise a list of charset IDs.  */
      val = AREF (valids, c);
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (AREF (Vlatin_extra_code_table, c))))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* A single candidate charset: every remaining byte of the
             code must lie in that charset's code space for its
             position.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          /* The inner loop stopped early on an out-of-range byte.  */
          if (idx < dim)
            break;
        }
      else
        {
          /* Several candidate charsets: try them in list order.  IDX
             is not reset between candidates, so bytes already read
             are not read again.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  /* This candidate matched; setting VAL to nil tells
                     the check below that a match was found.  */
                  val = Qnil;
                  break;
                }
            }
          /* VAL is still a cons only if the list ended without a
             match.  */
          if (CONSP (val))
            break;
        }
    }
  /* Reached by `break' on an invalid byte, or by `goto' when the
     source ends in the middle of a multi-byte code.  */
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

  /* NOTE(review): no `goto' to this label is visible here; it is
     presumably the target ONE_MORE_BYTE jumps to when the source is
     exhausted.  */
 no_more_source:
  detect_info->found |= found;
  return 1;
}
5434
5435 static void
5436 decode_coding_charset (struct coding_system *coding)
5437 {
5438   const unsigned char *src = coding->source + coding->consumed;
5439   const unsigned char *src_end = coding->source + coding->src_bytes;
5440   const unsigned char *src_base;
5441   int *charbuf = coding->charbuf + coding->charbuf_used;
5442   /* We may produce one charset annotation in one loop and one more at
5443      the end.  */
5444   int *charbuf_end
5445     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5446   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5447   bool multibytep = coding->src_multibyte;
5448   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5449   Lisp_Object valids;
5450   ptrdiff_t char_offset = coding->produced_char;
5451   ptrdiff_t last_offset = char_offset;
5452   int last_id = charset_ascii;
5453   bool eol_dos
5454     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5455   int byte_after_cr = -1;
5456
5457   valids = AREF (attrs, coding_attr_charset_valids);
5458
5459   while (1)
5460     {
5461       int c;
5462       Lisp_Object val;
5463       struct charset *charset;
5464       int dim;
5465       int len = 1;
5466       unsigned code;
5467
5468       src_base = src;
5469       consumed_chars_base = consumed_chars;
5470
5471       if (charbuf >= charbuf_end)
5472         {
5473           if (byte_after_cr >= 0)
5474             src_base--;
5475           break;
5476         }
5477
5478       if (byte_after_cr >= 0)
5479         {
5480           c = byte_after_cr;
5481           byte_after_cr = -1;
5482         }
5483       else
5484         {
5485           ONE_MORE_BYTE (c);
5486           if (eol_dos && c == '\r')
5487             ONE_MORE_BYTE (byte_after_cr);
5488         }
5489       if (c < 0)
5490         goto invalid_code;
5491       code = c;
5492
5493       val = AREF (valids, c);
5494       if (! INTEGERP (val) && ! CONSP (val))
5495         goto invalid_code;
5496       if (INTEGERP (val))
5497         {
5498           charset = CHARSET_FROM_ID (XFASTINT (val));
5499           dim = CHARSET_DIMENSION (charset);
5500           while (len < dim)
5501             {
5502               ONE_MORE_BYTE (c);
5503               code = (code << 8) | c;
5504               len++;
5505             }
5506           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5507                               charset, code, c);
5508         }
5509       else
5510         {
5511           /* VAL is a list of charset IDs.  It is assured that the
5512              list is sorted by charset dimensions (smaller one
5513              comes first).  */
5514           while (CONSP (val))
5515             {
5516               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5517               dim = CHARSET_DIMENSION (charset);
5518               while (len < dim)
5519                 {
5520                   ONE_MORE_BYTE (c);
5521                   code = (code << 8) | c;
5522                   len++;
5523                 }
5524               CODING_DECODE_CHAR (coding, src, src_base,
5525                                   src_end, charset, code, c);
5526               if (c >= 0)
5527                 break;
5528               val = XCDR (val);
5529             }
5530         }
5531       if (c < 0)
5532         goto invalid_code;
5533       if (charset->id != charset_ascii
5534           && last_id != charset->id)
5535         {
5536           if (last_id != charset_ascii)
5537             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5538           last_id = charset->id;
5539           last_offset = char_offset;
5540         }
5541
5542       *charbuf++ = c;
5543       char_offset++;
5544       continue;
5545
5546     invalid_code:
5547       src = src_base;
5548       consumed_chars = consumed_chars_base;
5549       ONE_MORE_BYTE (c);
5550       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5551       char_offset++;
5552       coding->errors++;
5553     }
5554
5555  no_more_source:
5556   if (last_id != charset_ascii)
5557     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5558   coding->consumed_char += consumed_chars_base;
5559   coding->consumed = src_base - coding->source;
5560   coding->charbuf_used = charbuf - coding->charbuf;
5561 }
5562
5563 static bool
5564 encode_coding_charset (struct coding_system *coding)
5565 {
5566   bool multibytep = coding->dst_multibyte;
5567   int *charbuf = coding->charbuf;
5568   int *charbuf_end = charbuf + coding->charbuf_used;
5569   unsigned char *dst = coding->destination + coding->produced;
5570   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5571   int safe_room = MAX_MULTIBYTE_LENGTH;
5572   ptrdiff_t produced_chars = 0;
5573   Lisp_Object attrs, charset_list;
5574   bool ascii_compatible;
5575   int c;
5576
5577   CODING_GET_INFO (coding, attrs, charset_list);
5578   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5579
5580   while (charbuf < charbuf_end)
5581     {
5582       struct charset *charset;
5583       unsigned code;
5584
5585       ASSURE_DESTINATION (safe_room);
5586       c = *charbuf++;
5587       if (ascii_compatible && ASCII_CHAR_P (c))
5588         EMIT_ONE_ASCII_BYTE (c);
5589       else if (CHAR_BYTE8_P (c))
5590         {
5591           c = CHAR_TO_BYTE8 (c);
5592           EMIT_ONE_BYTE (c);
5593         }
5594       else
5595         {
5596           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5597                                &code, charset);
5598
5599           if (charset)
5600             {
5601               if (CHARSET_DIMENSION (charset) == 1)
5602                 EMIT_ONE_BYTE (code);
5603               else if (CHARSET_DIMENSION (charset) == 2)
5604                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5605               else if (CHARSET_DIMENSION (charset) == 3)
5606                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5607               else
5608                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5609                                  (code >> 8) & 0xFF, code & 0xFF);
5610             }
5611           else
5612             {
5613               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5614                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5615               else
5616                 c = coding->default_char;
5617               EMIT_ONE_BYTE (c);
5618             }
5619         }
5620     }
5621
5622   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5623   coding->produced_char += produced_chars;
5624   coding->produced = dst - coding->destination;
5625   return 0;
5626 }
5627
5628 \f
/*** 10. C library functions ***/
5630
5631 /* Setup coding context CODING from information about CODING_SYSTEM.
5632    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5633    CODING_SYSTEM is invalid, signal an error.  */
5634
5635 void
5636 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5637 {
5638   Lisp_Object attrs;
5639   Lisp_Object eol_type;
5640   Lisp_Object coding_type;
5641   Lisp_Object val;
5642
5643   if (NILP (coding_system))
5644     coding_system = Qundecided;
5645
5646   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5647
5648   attrs = CODING_ID_ATTRS (coding->id);
5649   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5650
5651   coding->mode = 0;
5652   coding->head_ascii = -1;
5653   if (VECTORP (eol_type))
5654     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5655                             | CODING_REQUIRE_DETECTION_MASK);
5656   else if (! EQ (eol_type, Qunix))
5657     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5658                             | CODING_REQUIRE_ENCODING_MASK);
5659   else
5660     coding->common_flags = 0;
5661   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5662     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5663   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5664     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5665   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5666     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5667
5668   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5669   coding->max_charset_id = SCHARS (val) - 1;
5670   coding->safe_charsets = SDATA (val);
5671   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5672   coding->carryover_bytes = 0;
5673
5674   coding_type = CODING_ATTR_TYPE (attrs);
5675   if (EQ (coding_type, Qundecided))
5676     {
5677       coding->detector = NULL;
5678       coding->decoder = decode_coding_raw_text;
5679       coding->encoder = encode_coding_raw_text;
5680       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5681     }
5682   else if (EQ (coding_type, Qiso_2022))
5683     {
5684       int i;
5685       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5686
5687       /* Invoke graphic register 0 to plane 0.  */
5688       CODING_ISO_INVOCATION (coding, 0) = 0;
5689       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5690       CODING_ISO_INVOCATION (coding, 1)
5691         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5692       /* Setup the initial status of designation.  */
5693       for (i = 0; i < 4; i++)
5694         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5695       /* Not single shifting initially.  */
5696       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5697       /* Beginning of buffer should also be regarded as bol. */
5698       CODING_ISO_BOL (coding) = 1;
5699       coding->detector = detect_coding_iso_2022;
5700       coding->decoder = decode_coding_iso_2022;
5701       coding->encoder = encode_coding_iso_2022;
5702       if (flags & CODING_ISO_FLAG_SAFE)
5703         coding->mode |= CODING_MODE_SAFE_ENCODING;
5704       coding->common_flags
5705         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5706             | CODING_REQUIRE_FLUSHING_MASK);
5707       if (flags & CODING_ISO_FLAG_COMPOSITION)
5708         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5709       if (flags & CODING_ISO_FLAG_DESIGNATION)
5710         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5711       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5712         {
5713           setup_iso_safe_charsets (attrs);
5714           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5715           coding->max_charset_id = SCHARS (val) - 1;
5716           coding->safe_charsets = SDATA (val);
5717         }
5718       CODING_ISO_FLAGS (coding) = flags;
5719       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5720       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5721       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5722       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5723     }
5724   else if (EQ (coding_type, Qcharset))
5725     {
5726       coding->detector = detect_coding_charset;
5727       coding->decoder = decode_coding_charset;
5728       coding->encoder = encode_coding_charset;
5729       coding->common_flags
5730         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5731     }
5732   else if (EQ (coding_type, Qutf_8))
5733     {
5734       val = AREF (attrs, coding_attr_utf_bom);
5735       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5736                                    : EQ (val, Qt) ? utf_with_bom
5737                                    : utf_without_bom);
5738       coding->detector = detect_coding_utf_8;
5739       coding->decoder = decode_coding_utf_8;
5740       coding->encoder = encode_coding_utf_8;
5741       coding->common_flags
5742         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5743       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5744         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5745     }
5746   else if (EQ (coding_type, Qutf_16))
5747     {
5748       val = AREF (attrs, coding_attr_utf_bom);
5749       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5750                                     : EQ (val, Qt) ? utf_with_bom
5751                                     : utf_without_bom);
5752       val = AREF (attrs, coding_attr_utf_16_endian);
5753       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5754                                        : utf_16_little_endian);
5755       CODING_UTF_16_SURROGATE (coding) = 0;
5756       coding->detector = detect_coding_utf_16;
5757       coding->decoder = decode_coding_utf_16;
5758       coding->encoder = encode_coding_utf_16;
5759       coding->common_flags
5760         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5761       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5762         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5763     }
5764   else if (EQ (coding_type, Qccl))
5765     {
5766       coding->detector = detect_coding_ccl;
5767       coding->decoder = decode_coding_ccl;
5768       coding->encoder = encode_coding_ccl;
5769       coding->common_flags
5770         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5771             | CODING_REQUIRE_FLUSHING_MASK);
5772     }
5773   else if (EQ (coding_type, Qemacs_mule))
5774     {
5775       coding->detector = detect_coding_emacs_mule;
5776       coding->decoder = decode_coding_emacs_mule;
5777       coding->encoder = encode_coding_emacs_mule;
5778       coding->common_flags
5779         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5780       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5781           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5782         {
5783           Lisp_Object tail, safe_charsets;
5784           int max_charset_id = 0;
5785
5786           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5787                tail = XCDR (tail))
5788             if (max_charset_id < XFASTINT (XCAR (tail)))
5789               max_charset_id = XFASTINT (XCAR (tail));
5790           safe_charsets = make_uninit_string (max_charset_id + 1);
5791           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5792           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5793                tail = XCDR (tail))
5794             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5795           coding->max_charset_id = max_charset_id;
5796           coding->safe_charsets = SDATA (safe_charsets);
5797         }
5798       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5799       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5800     }
5801   else if (EQ (coding_type, Qshift_jis))
5802     {
5803       coding->detector = detect_coding_sjis;
5804       coding->decoder = decode_coding_sjis;
5805       coding->encoder = encode_coding_sjis;
5806       coding->common_flags
5807         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5808     }
5809   else if (EQ (coding_type, Qbig5))
5810     {
5811       coding->detector = detect_coding_big5;
5812       coding->decoder = decode_coding_big5;
5813       coding->encoder = encode_coding_big5;
5814       coding->common_flags
5815         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5816     }
5817   else                          /* EQ (coding_type, Qraw_text) */
5818     {
5819       coding->detector = NULL;
5820       coding->decoder = decode_coding_raw_text;
5821       coding->encoder = encode_coding_raw_text;
5822       if (! EQ (eol_type, Qunix))
5823         {
5824           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5825           if (! VECTORP (eol_type))
5826             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5827         }
5828
5829     }
5830
5831   return;
5832 }
5833
5834 /* Return a list of charsets supported by CODING.  */
5835
5836 Lisp_Object
5837 coding_charset_list (struct coding_system *coding)
5838 {
5839   Lisp_Object attrs, charset_list;
5840
5841   CODING_GET_INFO (coding, attrs, charset_list);
5842   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5843     {
5844       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5845
5846       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847         charset_list = Viso_2022_charset_list;
5848     }
5849   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5850     {
5851       charset_list = Vemacs_mule_charset_list;
5852     }
5853   return charset_list;
5854 }
5855
5856
5857 /* Return a list of charsets supported by CODING-SYSTEM.  */
5858
5859 Lisp_Object
5860 coding_system_charset_list (Lisp_Object coding_system)
5861 {
5862   ptrdiff_t id;
5863   Lisp_Object attrs, charset_list;
5864
5865   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5866   attrs = CODING_ID_ATTRS (id);
5867
5868   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5869     {
5870       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5871
5872       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5873         charset_list = Viso_2022_charset_list;
5874       else
5875         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5876     }
5877   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5878     {
5879       charset_list = Vemacs_mule_charset_list;
5880     }
5881   else
5882     {
5883       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5884     }
5885   return charset_list;
5886 }
5887
5888
5889 /* Return raw-text or one of its subsidiaries that has the same
5890    eol_type as CODING-SYSTEM.  */
5891
5892 Lisp_Object
5893 raw_text_coding_system (Lisp_Object coding_system)
5894 {
5895   Lisp_Object spec, attrs;
5896   Lisp_Object eol_type, raw_text_eol_type;
5897
5898   if (NILP (coding_system))
5899     return Qraw_text;
5900   spec = CODING_SYSTEM_SPEC (coding_system);
5901   attrs = AREF (spec, 0);
5902
5903   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5904     return coding_system;
5905
5906   eol_type = AREF (spec, 2);
5907   if (VECTORP (eol_type))
5908     return Qraw_text;
5909   spec = CODING_SYSTEM_SPEC (Qraw_text);
5910   raw_text_eol_type = AREF (spec, 2);
5911   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5912           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5913           : AREF (raw_text_eol_type, 2));
5914 }
5915
5916
5917 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5918    the subsidiary that has the same eol-spec as PARENT (if it is not
5919    nil and specifies end-of-line format) or the system's setting
5920    (system_eol_type).  */
5921
5922 Lisp_Object
5923 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5924 {
5925   Lisp_Object spec, eol_type;
5926
5927   if (NILP (coding_system))
5928     coding_system = Qraw_text;
5929   spec = CODING_SYSTEM_SPEC (coding_system);
5930   eol_type = AREF (spec, 2);
5931   if (VECTORP (eol_type))
5932     {
5933       Lisp_Object parent_eol_type;
5934
5935       if (! NILP (parent))
5936         {
5937           Lisp_Object parent_spec;
5938
5939           parent_spec = CODING_SYSTEM_SPEC (parent);
5940           parent_eol_type = AREF (parent_spec, 2);
5941           if (VECTORP (parent_eol_type))
5942             parent_eol_type = system_eol_type;
5943         }
5944       else
5945         parent_eol_type = system_eol_type;
5946       if (EQ (parent_eol_type, Qunix))
5947         coding_system = AREF (eol_type, 0);
5948       else if (EQ (parent_eol_type, Qdos))
5949         coding_system = AREF (eol_type, 1);
5950       else if (EQ (parent_eol_type, Qmac))
5951         coding_system = AREF (eol_type, 2);
5952     }
5953   return coding_system;
5954 }
5955
5956
5957 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5958    decided for writing to a process.  If not, complement them, and
5959    return a new coding system.  */
5960
5961 Lisp_Object
5962 complement_process_encoding_system (Lisp_Object coding_system)
5963 {
5964   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5965   Lisp_Object spec, attrs;
5966   int i;
5967
5968   for (i = 0; i < 3; i++)
5969     {
5970       if (i == 1)
5971         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5972       else if (i == 2)
5973         coding_system = preferred_coding_system ();
5974       spec = CODING_SYSTEM_SPEC (coding_system);
5975       if (NILP (spec))
5976         continue;
5977       attrs = AREF (spec, 0);
5978       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5979         coding_base = CODING_ATTR_BASE_NAME (attrs);
5980       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5981         eol_base = coding_system;
5982       if (! NILP (coding_base) && ! NILP (eol_base))
5983         break;
5984     }
5985
5986   if (i > 0)
5987     /* The original CODING_SYSTEM didn't specify text-conversion or
5988        eol-conversion.  Be sure that we return a fully complemented
5989        coding system.  */
5990     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5991   return coding_system;
5992 }
5993
5994
5995 /* Emacs has a mechanism to automatically detect a coding system if it
5996    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5997    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
6000
6001    o coding-category-emacs-mule
6002
6003         The category for a coding system which has the same code range
6004         as Emacs' internal format.  Assigned the coding-system (Lisp
6005         symbol) `emacs-mule' by default.
6006
6007    o coding-category-sjis
6008
6009         The category for a coding system which has the same code range
6010         as SJIS.  Assigned the coding-system (Lisp
6011         symbol) `japanese-shift-jis' by default.
6012
6013    o coding-category-iso-7
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 7-bit environment.  This doesn't use any locking
6017         shift and single shift functions.  This can encode/decode all
6018         charsets.  Assigned the coding-system (Lisp symbol)
6019         `iso-2022-7bit' by default.
6020
6021    o coding-category-iso-7-tight
6022
6023         Same as coding-category-iso-7 except that this can
6024         encode/decode only the specified charsets.
6025
6026    o coding-category-iso-8-1
6027
6028         The category for a coding system which has the same code range
6029         as ISO2022 of 8-bit environment and graphic plane 1 used only
6030         for DIMENSION1 charset.  This doesn't use any locking shift
6031         and single shift functions.  Assigned the coding-system (Lisp
6032         symbol) `iso-latin-1' by default.
6033
6034    o coding-category-iso-8-2
6035
6036         The category for a coding system which has the same code range
6037         as ISO2022 of 8-bit environment and graphic plane 1 used only
6038         for DIMENSION2 charset.  This doesn't use any locking shift
6039         and single shift functions.  Assigned the coding-system (Lisp
6040         symbol) `japanese-iso-8bit' by default.
6041
6042    o coding-category-iso-7-else
6043
6044         The category for a coding system which has the same code range
6045         as ISO2022 of 7-bit environment but uses locking shift or
6046         single shift functions.  Assigned the coding-system (Lisp
6047         symbol) `iso-2022-7bit-lock' by default.
6048
6049    o coding-category-iso-8-else
6050
6051         The category for a coding system which has the same code range
6052         as ISO2022 of 8-bit environment but uses locking shift or
6053         single shift functions.  Assigned the coding-system (Lisp
6054         symbol) `iso-2022-8bit-ss2' by default.
6055
6056    o coding-category-big5
6057
6058         The category for a coding system which has the same code range
6059         as BIG5.  Assigned the coding-system (Lisp symbol)
6060         `cn-big5' by default.
6061
6062    o coding-category-utf-8
6063
6064         The category for a coding system which has the same code range
6065         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6066         symbol) `utf-8' by default.
6067
6068    o coding-category-utf-16-be
6069
6070         The category for a coding system in which a text has an
6071         Unicode signature (cf. Unicode Standard) in the order of BIG
6072         endian at the head.  Assigned the coding-system (Lisp symbol)
6073         `utf-16-be' by default.
6074
6075    o coding-category-utf-16-le
6076
6077         The category for a coding system in which a text has an
6078         Unicode signature (cf. Unicode Standard) in the order of
6079         LITTLE endian at the head.  Assigned the coding-system (Lisp
6080         symbol) `utf-16-le' by default.
6081
6082    o coding-category-ccl
6083
6084         The category for a coding system of which encoder/decoder is
6085         written in CCL programs.  The default value is nil, i.e., no
6086         coding system is assigned.
6087
6088    o coding-category-binary
6089
6090         The category for a coding system not categorized in any of the
6091         above.  Assigned the coding-system (Lisp symbol)
6092         `no-conversion' by default.
6093
6094    Each of them is a Lisp symbol and the value is an actual
6095    `coding-system's (this is also a Lisp symbol) assigned by a user.
6096    What Emacs does actually is to detect a category of coding system.
6097    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6098    decide only one possible category, it selects a category of the
6099    highest priority.  Priorities of categories are also specified by a
6100    user in a Lisp variable `coding-category-list'.
6101
6102 */
6103
6104 #define EOL_SEEN_NONE   0
6105 #define EOL_SEEN_LF     1
6106 #define EOL_SEEN_CR     2
6107 #define EOL_SEEN_CRLF   4
6108
6109 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6110    SOURCE is encoded.  If CATEGORY is one of
6111    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6112    two-byte, else they are encoded by one-byte.
6113
6114    Return one of EOL_SEEN_XXX.  */
6115
6116 #define MAX_EOL_CHECK_COUNT 3
6117
6118 static int
6119 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6120             enum coding_category category)
6121 {
6122   const unsigned char *src = source, *src_end = src + src_bytes;
6123   unsigned char c;
6124   int total  = 0;
6125   int eol_seen = EOL_SEEN_NONE;
6126
6127   if ((1 << category) & CATEGORY_MASK_UTF_16)
6128     {
6129       bool msb = category == (coding_category_utf_16_le
6130                               | coding_category_utf_16_le_nosig);
6131       bool lsb = !msb;
6132
6133       while (src + 1 < src_end)
6134         {
6135           c = src[lsb];
6136           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6137             {
6138               int this_eol;
6139
6140               if (c == '\n')
6141                 this_eol = EOL_SEEN_LF;
6142               else if (src + 3 >= src_end
6143                        || src[msb + 2] != 0
6144                        || src[lsb + 2] != '\n')
6145                 this_eol = EOL_SEEN_CR;
6146               else
6147                 {
6148                   this_eol = EOL_SEEN_CRLF;
6149                   src += 2;
6150                 }
6151
6152               if (eol_seen == EOL_SEEN_NONE)
6153                 /* This is the first end-of-line.  */
6154                 eol_seen = this_eol;
6155               else if (eol_seen != this_eol)
6156                 {
6157                   /* The found type is different from what found before.
6158                      Allow for stray ^M characters in DOS EOL files.  */
6159                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6160                       || (eol_seen == EOL_SEEN_CRLF
6161                           && this_eol == EOL_SEEN_CR))
6162                     eol_seen = EOL_SEEN_CRLF;
6163                   else
6164                     {
6165                       eol_seen = EOL_SEEN_LF;
6166                       break;
6167                     }
6168                 }
6169               if (++total == MAX_EOL_CHECK_COUNT)
6170                 break;
6171             }
6172           src += 2;
6173         }
6174     }
6175   else
6176     while (src < src_end)
6177       {
6178         c = *src++;
6179         if (c == '\n' || c == '\r')
6180           {
6181             int this_eol;
6182
6183             if (c == '\n')
6184               this_eol = EOL_SEEN_LF;
6185             else if (src >= src_end || *src != '\n')
6186               this_eol = EOL_SEEN_CR;
6187             else
6188               this_eol = EOL_SEEN_CRLF, src++;
6189
6190             if (eol_seen == EOL_SEEN_NONE)
6191               /* This is the first end-of-line.  */
6192               eol_seen = this_eol;
6193             else if (eol_seen != this_eol)
6194               {
6195                 /* The found type is different from what found before.
6196                    Allow for stray ^M characters in DOS EOL files.  */
6197                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6198                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6199                   eol_seen = EOL_SEEN_CRLF;
6200                 else
6201                   {
6202                     eol_seen = EOL_SEEN_LF;
6203                     break;
6204                   }
6205               }
6206             if (++total == MAX_EOL_CHECK_COUNT)
6207               break;
6208           }
6209       }
6210   return eol_seen;
6211 }
6212
6213
6214 static Lisp_Object
6215 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6216 {
6217   Lisp_Object eol_type;
6218
6219   eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   if (eol_seen & EOL_SEEN_LF)
6221     {
6222       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6223       eol_type = Qunix;
6224     }
6225   else if (eol_seen & EOL_SEEN_CRLF)
6226     {
6227       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6228       eol_type = Qdos;
6229     }
6230   else if (eol_seen & EOL_SEEN_CR)
6231     {
6232       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6233       eol_type = Qmac;
6234     }
6235   return eol_type;
6236 }
6237
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   Three cases are handled, depending on CODING->id:
     - the coding type is `undecided': scan the source bytes and run
       the per-category detectors in priority order;
     - the category is utf-8-auto: decide between the two coding
       systems stored in the `utf-bom' attribute via
       detect_coding_utf_8;
     - the category is utf-16-auto: likewise, via
       detect_coding_utf_16.

   The consumed/produced counters and CODING->head_ascii are reset on
   entry, and CODING->mode is restored to its entry value on exit.  */

static void
detect_coding (struct coding_system *coding)
{
  const unsigned char *src, *src_end;
  /* Saved so that the mode can be put back at the end; presumably
     setup_coding_system resets it -- confirm against its
     definition.  */
  unsigned int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      bool null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Pre-scan the source.  CODING->head_ascii counts the 7-bit
         bytes seen before the first byte with the high bit set.  The
         scan stops early once both a null byte and an 8-bit byte
         have been seen, or once the ISO-2022 detector has run
         successfully.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried only while no
                 category has been marked as checked yet.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      /* Reject every category outside the
                         ISO-escape mask.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Run the category selection only if the pre-scan saw
         something other than plain 7-bit text, or a detector already
         reported a match.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, only the UTF-16
                     categories remain candidates: everything else is
                     marked both checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Walk the categories in priority order; the loop
                 breaks at the first category found, leaving I below
                 coding_category_raw_text.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector
                         call; just consult the recorded result.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is narrowed to the
                         LE/BE variant here, but THIS (used below to
                         pick the coding system) is not updated --
                         confirm that this is intended.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* Choose the coding system:
               - a category was found: use its coding system;
               - otherwise, a null byte was seen: no-conversion;
               - otherwise, every category was rejected: raw-text;
               - otherwise, if something was rejected: the first
                 category in priority order that was not rejected.
             If nothing was rejected, CODING is left unchanged.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* The `utf-bom' attribute is expected to be a cons here: the
         car is used when a UTF-8 signature is detected, the cdr
         otherwise.  If it is not a cons, nothing is changed.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* As above, but the car is used for little endian and the cdr
         for big endian; if neither is detected, CODING is left
         unchanged.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6419
6420
6421 static void
6422 decode_eol (struct coding_system *coding)
6423 {
6424   Lisp_Object eol_type;
6425   unsigned char *p, *pbeg, *pend;
6426
6427   eol_type = CODING_ID_EOL_TYPE (coding->id);
6428   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6429     return;
6430
6431   if (NILP (coding->dst_object))
6432     pbeg = coding->destination;
6433   else
6434     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6435   pend = pbeg + coding->produced;
6436
6437   if (VECTORP (eol_type))
6438     {
6439       int eol_seen = EOL_SEEN_NONE;
6440
6441       for (p = pbeg; p < pend; p++)
6442         {
6443           if (*p == '\n')
6444             eol_seen |= EOL_SEEN_LF;
6445           else if (*p == '\r')
6446             {
6447               if (p + 1 < pend && *(p + 1) == '\n')
6448                 {
6449                   eol_seen |= EOL_SEEN_CRLF;
6450                   p++;
6451                 }
6452               else
6453                 eol_seen |= EOL_SEEN_CR;
6454             }
6455         }
6456       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6457       if ((eol_seen & EOL_SEEN_CRLF) != 0
6458           && (eol_seen & EOL_SEEN_CR) != 0
6459           && (eol_seen & EOL_SEEN_LF) == 0)
6460         eol_seen = EOL_SEEN_CRLF;
6461       else if (eol_seen != EOL_SEEN_NONE
6462           && eol_seen != EOL_SEEN_LF
6463           && eol_seen != EOL_SEEN_CRLF
6464           && eol_seen != EOL_SEEN_CR)
6465         eol_seen = EOL_SEEN_LF;
6466       if (eol_seen != EOL_SEEN_NONE)
6467         eol_type = adjust_coding_eol_type (coding, eol_seen);
6468     }
6469
6470   if (EQ (eol_type, Qmac))
6471     {
6472       for (p = pbeg; p < pend; p++)
6473         if (*p == '\r')
6474           *p = '\n';
6475     }
6476   else if (EQ (eol_type, Qdos))
6477     {
6478       ptrdiff_t n = 0;
6479
6480       if (NILP (coding->dst_object))
6481         {
6482           /* Start deleting '\r' from the tail to minimize the memory
6483              movement.  */
6484           for (p = pend - 2; p >= pbeg; p--)
6485             if (*p == '\r')
6486               {
6487                 memmove (p, p + 1, pend-- - p - 1);
6488                 n++;
6489               }
6490         }
6491       else
6492         {
6493           ptrdiff_t pos_byte = coding->dst_pos_byte;
6494           ptrdiff_t pos = coding->dst_pos;
6495           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6496
6497           while (pos < pos_end)
6498             {
6499               p = BYTE_POS_ADDR (pos_byte);
6500               if (*p == '\r' && p[1] == '\n')
6501                 {
6502                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6503                   n++;
6504                   pos_end--;
6505                 }
6506               pos++;
6507               if (coding->dst_multibyte)
6508                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6509               else
6510                 pos_byte++;
6511             }
6512         }
6513       coding->produced -= n;
6514       coding->produced_char -= n;
6515     }
6516 }
6517
6518
6519 /* Return a translation table (or list of them) from coding system
6520    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6521    not ENCODEP). */
6522
6523 static Lisp_Object
6524 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6525 {
6526   Lisp_Object standard, translation_table;
6527   Lisp_Object val;
6528
6529   if (NILP (Venable_character_translation))
6530     {
6531       if (max_lookup)
6532         *max_lookup = 0;
6533       return Qnil;
6534     }
6535   if (encodep)
6536     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6537       standard = Vstandard_translation_table_for_encode;
6538   else
6539     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6540       standard = Vstandard_translation_table_for_decode;
6541   if (NILP (translation_table))
6542     translation_table = standard;
6543   else
6544     {
6545       if (SYMBOLP (translation_table))
6546         translation_table = Fget (translation_table, Qtranslation_table);
6547       else if (CONSP (translation_table))
6548         {
6549           translation_table = Fcopy_sequence (translation_table);
6550           for (val = translation_table; CONSP (val); val = XCDR (val))
6551             if (SYMBOLP (XCAR (val)))
6552               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6553         }
6554       if (CHAR_TABLE_P (standard))
6555         {
6556           if (CONSP (translation_table))
6557             translation_table = nconc2 (translation_table,
6558                                         Fcons (standard, Qnil));
6559           else
6560             translation_table = Fcons (translation_table,
6561                                        Fcons (standard, Qnil));
6562         }
6563     }
6564
6565   if (max_lookup)
6566     {
6567       *max_lookup = 1;
6568       if (CHAR_TABLE_P (translation_table)
6569           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6570         {
6571           val = XCHAR_TABLE (translation_table)->extras[1];
6572           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6573             *max_lookup = XFASTINT (val);
6574         }
6575       else if (CONSP (translation_table))
6576         {
6577           Lisp_Object tail;
6578
6579           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6580             if (CHAR_TABLE_P (XCAR (tail))
6581                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6582               {
6583                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6584                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6585                   *max_lookup = XFASTINT (tailval);
6586               }
6587         }
6588     }
6589   return translation_table;
6590 }
6591
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and store the outcome in TRANS.

   C and TRANS must be lvalues; both are assigned to.  Whenever a
   lookup yields a character, C is replaced by that character and
   TRANS is reset to Qnil.  For a list, the tables are consulted in
   order (elements that are not char-tables are skipped), each lookup
   using the possibly updated C, and the walk stops at the first
   lookup that yields a non-nil value that is not a character; TRANS
   then holds that value.  If TABLE is neither a char-table nor a
   cons, TRANS is simply set to Qnil.

   The arguments are evaluated more than once, and the expansion
   declares a local variable named `tail', so the arguments must not
   have side effects nor refer to a variable of that name.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6616
6617
6618 /* Return a translation of character(s) at BUF according to TRANS.
6619    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6620    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6621    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6622    translation is found, and Qnil if not found..
6623    If BUF is too short to lookup characters in FROM, return Qt.  */
6624
6625 static Lisp_Object
6626 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6627 {
6628
6629   if (INTEGERP (trans))
6630     return trans;
6631   for (; CONSP (trans); trans = XCDR (trans))
6632     {
6633       Lisp_Object val = XCAR (trans);
6634       Lisp_Object from = XCAR (val);
6635       ptrdiff_t len = ASIZE (from);
6636       ptrdiff_t i;
6637
6638       for (i = 0; i < len; i++)
6639         {
6640           if (buf + i == buf_end)
6641             return Qt;
6642           if (XINT (AREF (from, i)) != buf[i])
6643             break;
6644         }
6645       if (i == len)
6646         return val;
6647     }
6648   return Qnil;
6649 }
6650
6651
/* Write decoded characters to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements) and translated via
   TRANSLATION_TABLE; negative elements start annotation data and are
   skipped.  Otherwise the bytes at CODING->source (CODING->consumed
   of them) are copied, converting between unibyte and multibyte
   representation if source and destination differ in that respect.

   LAST_BLOCK nonzero means no more characters will follow, so a
   multi-character translation that cannot be completed is not
   waited for.

   The destination is enlarged with alloc_destination as needed.
   CODING->produced and CODING->produced_char are advanced by what
   was written, and if the destination is a buffer the new text is
   registered with insert_from_gap.

   Return the number of trailing elements of CODING->charbuf that
   were left unprocessed (always 0 in the chars_at_source case).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               bool last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      /* When decoding in place, the output must not run past the
         source bytes that have already been consumed.  */
      if (EQ (coding->src_object, coding->dst_object))
        {
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* Number of source characters consumed and of
                 characters produced by this step.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      /* TRANS is (FROM . TO): FROM tells how many
                         source characters are replaced, TO is a
                         character or a vector of characters.  */
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  /* Not enough characters to decide and more may
                     come: stop here and report the rest as
                     carryover.  */
                  else if (EQ (trans, Qt) && ! last_block)
                    break;
                }

              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Guard against overflow of the size computed for
                     alloc_destination below.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  /* Recompute DST_END against the new destination.  */
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              /* Emit C, then the remaining elements of TRANS when it
                 is a vector of several characters.  */
              for (i = 0; i < to_nchars; i++)
                {
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination.
                 NOTE(review): ONE_MORE_BYTE is defined elsewhere in
                 the file; presumably it uses MULTIBYTEP,
                 CONSUMED_CHARS and SRC_BASE and jumps to
                 `no_more_source' when the source is exhausted --
                 confirm against its definition.  */
              bool multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* In-place conversion: the bytes just read
                         free up room, so first try moving DST_END up
                         to SRC.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          /* Re-base the source pointers after the
                             allocation.  */
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  The test
               against DST_END - 1 keeps room for two bytes per
               source byte.  NOTE(review): EMIT_ONE_BYTE is defined
               elsewhere; presumably it uses MULTIBYTEP and updates
               DST and PRODUCED_CHARS -- confirm.  */
            while (src < src_end)
              {
                bool multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides: plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6850
6851 /* Compose text in CODING->object according to the annotation data at
6852    CHARBUF.  CHARBUF is an array:
6853      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6854  */
6855
6856 static inline void
6857 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6858 {
6859   int len;
6860   ptrdiff_t to;
6861   enum composition_method method;
6862   Lisp_Object components;
6863
6864   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6865   to = pos + charbuf[2];
6866   method = (enum composition_method) (charbuf[4]);
6867
6868   if (method == COMPOSITION_RELATIVE)
6869     components = Qnil;
6870   else
6871     {
6872       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6873       int i, j;
6874
6875       if (method == COMPOSITION_WITH_RULE)
6876         len = charbuf[2] * 3 - 2;
6877       charbuf += MAX_ANNOTATION_LENGTH;
6878       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6879       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6880         {
6881           if (charbuf[i] >= 0)
6882             args[j] = make_number (charbuf[i]);
6883           else
6884             {
6885               i++;
6886               args[j] = make_number (charbuf[i] % 0x100);
6887             }
6888         }
6889       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6890     }
6891   compose_text (pos, to, components, Qnil, coding->dst_object);
6892 }
6893
6894
6895 /* Put `charset' property on text in CODING->object according to
6896    the annotation data at CHARBUF.  CHARBUF is an array:
6897      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6898  */
6899
6900 static inline void
6901 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6902 {
6903   ptrdiff_t from = pos - charbuf[2];
6904   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6905
6906   Fput_text_property (make_number (from), make_number (pos),
6907                       Qcharset, CHARSET_NAME (charset),
6908                       coding->dst_object);
6909 }
6910
6911
/* Number of `int' elements first requested for the conversion work
   area CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf on the stack with alloca
   and record its element count in CODING->charbuf_size.  Allocation
   starts at CHARBUF_SIZE elements and the size is halved after each
   attempt that yields a null pointer, as long as it stays above
   1024.

   If no attempt succeeds, the result CODING_RESULT_INSUFFICIENT_MEM
   is recorded and the macro executes `return;', i.e. it returns from
   the *calling* function.  It can therefore be used only inside
   functions returning void, and any cleanup the caller still has to
   do is skipped on that path.  Because the memory comes from alloca,
   it is valid only until the calling function returns.

   NOTE(review): alloca usually does not report failure by returning
   a null pointer, so the retry loop and the failure path are likely
   never taken in practice -- confirm for the supported
   platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * size);                 \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return;                                                         \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6933
6934
6935 static void
6936 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6937 {
6938   int *charbuf = coding->charbuf;
6939   int *charbuf_end = charbuf + coding->charbuf_used;
6940
6941   if (NILP (coding->dst_object))
6942     return;
6943
6944   while (charbuf < charbuf_end)
6945     {
6946       if (*charbuf >= 0)
6947         pos++, charbuf++;
6948       else
6949         {
6950           int len = -*charbuf;
6951
6952           if (len > 2)
6953             switch (charbuf[1])
6954               {
6955               case CODING_ANNOTATE_COMPOSITION_MASK:
6956                 produce_composition (coding, charbuf, pos);
6957                 break;
6958               case CODING_ANNOTATE_CHARSET_MASK:
6959                 produce_charset (coding, charbuf, pos);
6960                 break;
6961               }
6962           charbuf += len;
6963         }
6964     }
6965 }
6966
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   On return, CODING->consumed is set to CODING->src_bytes whenever
   some source bytes were left undecoded: those bytes are either
   emitted as raw bytes (if CODING_MODE_LAST_BLOCK is set in
   CODING->mode) or saved in CODING->carryover.  End-of-line
   conversion is applied afterwards unless inhibited.
*/

static void
decode_coding (struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the source text in the buffer straddles the gap, move the gap
     to the start of the source (presumably so that the source bytes
     are contiguous in memory).  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);

      /* We must disable undo_list in order to record the whole insert
         transaction via record_insert at the end.  But doing so also
         disables the recording of the first change to the undo_list.
         Therefore we check for first change here and record it via
         record_first_change if needed.  */
      if (MODIFF <= SAVE_MODIFF)
        record_first_change ();

      undo_list = BVAR (current_buffer, undo_list);
      bset_undo_list (current_buffer, Qt);
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* NOTE(review): this macro returns from decode_coding if it fails
     to allocate; on that path the undo list disabled above is not
     restored.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Main loop: decode a chunk into CODING->charbuf, write it to the
     destination, then apply annotations.  Characters that
     produce_chars could not yet translate are moved to the front of
     CODING->charbuf so the next round continues after them.  */
  do
    {
      /* Position of the first character produced in this round.  */
      ptrdiff_t pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Flush characters still waiting for a longer translation match;
     the last argument tells produce_chars not to wait any more.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      /* Some source bytes were left undecoded.  */
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Clamp anyway so that the copy below cannot overrun the
             array.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Re-enable undo recording and record the whole insertion as
         one transaction.  */
      bset_undo_list (current_buffer, undo_list);
      record_insert (coding->dst_pos, coding->produced_char);
    }
}
7123
7124
/* Extract an annotation datum from a composition starting at POS and
   ending before LIMIT of CODING->src_object (buffer or string), store
   the data in BUF, set *STOP to a starting position of the next
   composition (if any) or to LIMIT, and return the address of the
   next element of BUF.

   If such an annotation is not found, set *STOP to a starting
   position of a composition after POS (if any) or to LIMIT, and
   return BUF.

   No bound on BUF is checked here: the caller must leave room for the
   header stored by ADD_COMPOSITION_DATA plus one element per
   composition component.  */

static inline int *
handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
                               struct coding_system *coding, int *buf,
                               ptrdiff_t *stop)
{
  ptrdiff_t start, end;
  Lisp_Object prop;

  /* Nothing to annotate when no composition is found between POS and
     LIMIT, or when the one found extends beyond LIMIT.  */
  if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
      || end > limit)
    *stop = limit;
  /* The composition found begins after POS; come back when we get
     there.  */
  else if (start > pos)
    *stop = start;
  else
    {
      /* Here START <= POS.  Only a composition beginning exactly at
         POS produces annotation data; otherwise we just look for the
         next composition below.  */
      if (start == pos)
        {
          /* We found a composition.  Store the corresponding
             annotation data in BUF.  */
          int *head = buf;
          enum composition_method method = COMPOSITION_METHOD (prop);
          int nchars = COMPOSITION_LENGTH (prop);

          ADD_COMPOSITION_DATA (buf, nchars, 0, method);
          if (method != COMPOSITION_RELATIVE)
            {
              /* Any method other than COMPOSITION_RELATIVE carries
                 explicit components; append each of them to BUF and
                 count them in LEN.  */
              Lisp_Object components;
              ptrdiff_t i, len, i_byte;

              components = COMPOSITION_COMPONENTS (prop);
              if (VECTORP (components))
                {
                  /* A vector of integers.  */
                  len = ASIZE (components);
                  for (i = 0; i < len; i++)
                    *buf++ = XINT (AREF (components, i));
                }
              else if (STRINGP (components))
                {
                  /* A string: one element per character.  The macro
                     advances both I and I_BYTE.  */
                  len = SCHARS (components);
                  i = i_byte = 0;
                  while (i < len)
                    {
                      FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
                      buf++;
                    }
                }
              else if (INTEGERP (components))
                {
                  /* A single integer.  */
                  len = 1;
                  *buf++ = XINT (components);
                }
              else if (CONSP (components))
                {
                  /* A list of integers.  */
                  for (len = 0; CONSP (components);
                       len++, components = XCDR (components))
                    *buf++ = XINT (XCAR (components));
                }
              else
                abort ();
              /* Account for the LEN extra elements in the first
                 element stored by ADD_COMPOSITION_DATA.
                 NOTE(review): presumably that element holds the
                 negated length of the annotation -- confirm against
                 the macro definition.  */
              *head -= len;
            }
        }

      /* Look for the composition following the one just examined, so
         that the caller knows where to stop next.  */
      if (find_composition (end, limit, &start, &end, &prop,
                            coding->src_object)
          && end <= limit)
        *stop = start;
      else
        *stop = limit;
    }
  return buf;
}
7207
7208
7209 /* Extract an annotation datum from a text property `charset' at POS of
7210    CODING->src_object (buffer of string), store the data in BUF, set
7211    *STOP to the position where the value of `charset' property changes
7212    (limiting by LIMIT), and return the address of the next element of
7213    BUF.
7214
7215    If the property value is nil, set *STOP to the position where the
7216    property value is non-nil (limiting by LIMIT), and return BUF.  */
7217
7218 static inline int *
7219 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7220                            struct coding_system *coding, int *buf,
7221                            ptrdiff_t *stop)
7222 {
7223   Lisp_Object val, next;
7224   int id;
7225
7226   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7227   if (! NILP (val) && CHARSETP (val))
7228     id = XINT (CHARSET_SYMBOL_ID (val));
7229   else
7230     id = -1;
7231   ADD_CHARSET_DATA (buf, 0, id);
7232   next = Fnext_single_property_change (make_number (pos), Qcharset,
7233                                        coding->src_object,
7234                                        make_number (limit));
7235   *stop = XINT (next);
7236   return buf;
7237 }
7238
7239
/* Fill CODING->charbuf with characters read from CODING->source,
   starting at byte offset CODING->consumed, in preparation for
   encoding.

   On the way, `\r' is turned into `\n' when CODING->mode has
   CODING_MODE_SELECTIVE_DISPLAY, `\n' is converted according to the
   eol type of CODING (unless inhibit_eol_conversion is set), charset
   annotations are stored in the buffer when CODING->common_flags asks
   for them, and characters are translated through TRANSLATION_TABLE
   when it is non-nil.  MAX_LOOKUP is the maximum number of characters
   handed to get_translation for a single lookup.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect what has been read and stored, and
   CODING->chars_at_source is reset to 0.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  bool multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookup buffer is only needed (and only allocated) when a
     translation table is in effect.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* An eol type that is still a vector is treated like Qunix, i.e. no
     eol conversion is done.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  Clearing the
     flag here means the composition branch below is never taken.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* STOP is the next position at which annotations must be looked
     for; it is the smaller of STOP_COMPOSITION and STOP_CHARSET.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          /* All the source text has been read.  */
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C and advance SRC and POS.  */
      if (! multibytep)
        {
          int bytes;

          /* For a unibyte source POS advances by bytes.  The raw-text
             and CCL encoders take each byte as is; otherwise a valid
             multibyte sequence is read as one character, and any
             other byte is converted by BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For Qdos store an extra `\r' before the `\n'; for any
                 other non-unix eol type replace the `\n' by `\r'.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             characters.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters
             (without consuming them) as context for the lookup.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* The car is a vector whose length is the number of
                 source characters matched; the cdr is either a
                 single character or a vector of characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Not enough room left for the replacement.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters covered by the
             translation.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7374
7375
7376 /* Encode the text at CODING->src_object into CODING->dst_object.
7377    CODING->src_object is a buffer or a string.
7378    CODING->dst_object is a buffer or nil.
7379
7380    If CODING->src_object is a buffer, it must be the current buffer.
7381    In this case, if CODING->src_pos is positive, it is a position of
7382    the source text in the buffer, otherwise. the source text is in the
7383    gap area of the buffer, and coding->src_pos specifies the offset of
7384    the text from GPT (which must be the same as PT).  If this is the
7385    same buffer as CODING->dst_object, CODING->src_pos must be
7386    negative and CODING should not have `pre-write-conversion'.
7387
7388    If CODING->src_object is a string, CODING should not have
7389    `pre-write-conversion'.
7390
7391    If CODING->dst_object is a buffer, the encoded data is inserted at
7392    the current point of that buffer.
7393
7394    If CODING->dst_object is nil, the encoded data is placed at the
7395    memory area specified by CODING->destination.  */
7396
7397 static void
7398 encode_coding (struct coding_system *coding)
7399 {
7400   Lisp_Object attrs;
7401   Lisp_Object translation_table;
7402   int max_lookup;
7403   struct ccl_spec cclspec;
7404
7405   attrs = CODING_ID_ATTRS (coding->id);
7406   if (coding->encoder == encode_coding_raw_text)
7407     translation_table = Qnil, max_lookup = 0;
7408   else
7409     translation_table = get_translation_table (attrs, 1, &max_lookup);
7410
7411   if (BUFFERP (coding->dst_object))
7412     {
7413       set_buffer_internal (XBUFFER (coding->dst_object));
7414       coding->dst_multibyte
7415         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7416     }
7417
7418   coding->consumed = coding->consumed_char = 0;
7419   coding->produced = coding->produced_char = 0;
7420   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7421   coding->errors = 0;
7422
7423   ALLOC_CONVERSION_WORK_AREA (coding);
7424
7425   if (coding->encoder == encode_coding_ccl)
7426     {
7427       coding->spec.ccl = &cclspec;
7428       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7429     }
7430   do {
7431     coding_set_source (coding);
7432     consume_chars (coding, translation_table, max_lookup);
7433     coding_set_destination (coding);
7434     (*(coding->encoder)) (coding);
7435   } while (coding->consumed_char < coding->src_chars);
7436
7437   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7438     insert_from_gap (coding->produced_char, coding->produced);
7439 }
7440
7441
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create for the
   reused work buffer, and to Fgenerate_new_buffer_name for any
   additional work buffer.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  It is
   set by make_conversion_work_buffer and cleared by
   code_conversion_restore.  */
static bool reused_workbuf_in_use;
7454
7455
7456 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7457    multibyteness of returning buffer.  */
7458
7459 static Lisp_Object
7460 make_conversion_work_buffer (bool multibyte)
7461 {
7462   Lisp_Object name, workbuf;
7463   struct buffer *current;
7464
7465   if (reused_workbuf_in_use)
7466     {
7467       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7468       workbuf = Fget_buffer_create (name);
7469     }
7470   else
7471     {
7472       reused_workbuf_in_use = 1;
7473       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7474         Vcode_conversion_reused_workbuf
7475           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7476       workbuf = Vcode_conversion_reused_workbuf;
7477     }
7478   current = current_buffer;
7479   set_buffer_internal (XBUFFER (workbuf));
7480   /* We can't allow modification hooks to run in the work buffer.  For
7481      instance, directory_files_internal assumes that file decoding
7482      doesn't compile new regexps.  */
7483   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7484   Ferase_buffer ();
7485   bset_undo_list (current_buffer, Qt);
7486   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7487   set_buffer_internal (current);
7488   return workbuf;
7489 }
7490
7491
7492 static Lisp_Object
7493 code_conversion_restore (Lisp_Object arg)
7494 {
7495   Lisp_Object current, workbuf;
7496   struct gcpro gcpro1;
7497
7498   GCPRO1 (arg);
7499   current = XCAR (arg);
7500   workbuf = XCDR (arg);
7501   if (! NILP (workbuf))
7502     {
7503       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7504         reused_workbuf_in_use = 0;
7505       else
7506         Fkill_buffer (workbuf);
7507     }
7508   set_buffer_internal (XBUFFER (current));
7509   UNGCPRO;
7510   return Qnil;
7511 }
7512
7513 Lisp_Object
7514 code_conversion_save (bool with_work_buf, bool multibyte)
7515 {
7516   Lisp_Object workbuf = Qnil;
7517
7518   if (with_work_buf)
7519     workbuf = make_conversion_work_buffer (multibyte);
7520   record_unwind_protect (code_conversion_restore,
7521                          Fcons (Fcurrent_buffer (), workbuf));
7522   return workbuf;
7523 }
7524
7525 void
7526 decode_coding_gap (struct coding_system *coding,
7527                    ptrdiff_t chars, ptrdiff_t bytes)
7528 {
7529   ptrdiff_t count = SPECPDL_INDEX ();
7530   Lisp_Object attrs;
7531
7532   code_conversion_save (0, 0);
7533
7534   coding->src_object = Fcurrent_buffer ();
7535   coding->src_chars = chars;
7536   coding->src_bytes = bytes;
7537   coding->src_pos = -chars;
7538   coding->src_pos_byte = -bytes;
7539   coding->src_multibyte = chars < bytes;
7540   coding->dst_object = coding->src_object;
7541   coding->dst_pos = PT;
7542   coding->dst_pos_byte = PT_BYTE;
7543   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7544
7545   if (CODING_REQUIRE_DETECTION (coding))
7546     detect_coding (coding);
7547
7548   coding->mode |= CODING_MODE_LAST_BLOCK;
7549   current_buffer->text->inhibit_shrinking = 1;
7550   decode_coding (coding);
7551   current_buffer->text->inhibit_shrinking = 0;
7552
7553   attrs = CODING_ID_ATTRS (coding->id);
7554   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7555     {
7556       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7557       Lisp_Object val;
7558
7559       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7560       val = call1 (CODING_ATTR_POST_READ (attrs),
7561                    make_number (coding->produced_char));
7562       CHECK_NATNUM (val);
7563       coding->produced_char += Z - prev_Z;
7564       coding->produced += Z_BYTE - prev_Z_BYTE;
7565     }
7566
7567   unbind_to (count, Qnil);
7568 }
7569
7570
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   Vdeactivate_mark is saved on entry and restored before returning.
 */

void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  unsigned char *destination IF_LINT (= NULL);
  ptrdiff_t dst_bytes IF_LINT (= 0);
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the source text is replaced in
     place (SRC_OBJECT and DST_OBJECT are the same buffer).  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the caller's memory area; it is only used when
     DST_OBJECT is nil.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          struct Lisp_Marker *tail;

          /* Flag the markers sitting exactly at the boundary where
             the decoded text will go (FROM for markers with
             insertion_type set, TO for the others), so that they can
             be repositioned after the replacement.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          current_buffer->text->inhibit_shrinking = 1;
          /* Delete the source text from the buffer; from now on it
             is referred to by negative offsets.  */
          del_range_both (from, from_byte, to, to_byte, 1);
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  /* Detection may change CODING->id, so fetch the attributes only
     afterwards.  (NOTE(review): assumed from the ordering here --
     confirm in detect_coding.)  */
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Decode into a work buffer when a string is requested, or when
     the result goes to memory but a post-read-conversion function
     has to run on it first.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      /* Call the post-read-conversion function with point at the
         beginning of the decoded text and the number of decoded
         characters as argument.  */
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
              old_deactivate_mark);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_number (coding->produced_char));
      UNGCPRO;
      CHECK_NATNUM (val);
      /* Account for the text the function added or removed.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a work buffer because of the
         post-read-conversion function.  NOTE(review): the result is
         copied to the caller's memory area only in the branch below,
         i.e. when that area had to be enlarged -- confirm that this
         is intended.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (dst_bytes < coding->produced)
        {
          destination = xrealloc (destination, coding->produced);
          /* NOTE(review): if xrealloc never returns NULL, this check
             is unreachable -- confirm against its definition.  */
          if (! destination)
            {
              record_conversion_result (coding,
                                        CODING_RESULT_INSUFFICIENT_MEM);
              unbind_to (count, Qnil);
              return;
            }
          /* Make the produced text contiguous in memory before
             copying it out.  */
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the replaced region is unaffected; point inside
         it goes to FROM; point after it is shifted by the change in
         size (counted in bytes for a unibyte buffer).  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged above at the beginning
             (insertion_type set) or at the end of the decoded
             text.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
7788
7789
7790 void
7791 encode_coding_object (struct coding_system *coding,
7792                       Lisp_Object src_object,
7793                       ptrdiff_t from, ptrdiff_t from_byte,
7794                       ptrdiff_t to, ptrdiff_t to_byte,
7795                       Lisp_Object dst_object)
7796 {
7797   ptrdiff_t count = SPECPDL_INDEX ();
7798   ptrdiff_t chars = to - from;
7799   ptrdiff_t bytes = to_byte - from_byte;
7800   Lisp_Object attrs;
7801   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7802   bool need_marker_adjustment = 0;
7803   bool kill_src_buffer = 0;
7804   Lisp_Object old_deactivate_mark;
7805
7806   old_deactivate_mark = Vdeactivate_mark;
7807
7808   coding->src_object = src_object;
7809   coding->src_chars = chars;
7810   coding->src_bytes = bytes;
7811   coding->src_multibyte = chars < bytes;
7812
7813   attrs = CODING_ID_ATTRS (coding->id);
7814
7815   if (EQ (src_object, dst_object))
7816     {
7817       struct Lisp_Marker *tail;
7818
7819       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7820         {
7821           tail->need_adjustment
7822             = tail->charpos == (tail->insertion_type ? from : to);
7823           need_marker_adjustment |= tail->need_adjustment;
7824         }
7825     }
7826
7827   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7828     {
7829       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7830       set_buffer_internal (XBUFFER (coding->src_object));
7831       if (STRINGP (src_object))
7832         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7833       else if (BUFFERP (src_object))
7834         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7835       else
7836         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7837
7838       if (EQ (src_object, dst_object))
7839         {
7840           set_buffer_internal (XBUFFER (src_object));
7841           saved_pt = PT, saved_pt_byte = PT_BYTE;
7842           del_range_both (from, from_byte, to, to_byte, 1);
7843           set_buffer_internal (XBUFFER (coding->src_object));
7844         }
7845
7846       {
7847         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7848
7849         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7850                 old_deactivate_mark);
7851         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7852                     make_number (BEG), make_number (Z));
7853         UNGCPRO;
7854       }
7855       if (XBUFFER (coding->src_object) != current_buffer)
7856         kill_src_buffer = 1;
7857       coding->src_object = Fcurrent_buffer ();
7858       if (BEG != GPT)
7859         move_gap_both (BEG, BEG_BYTE);
7860       coding->src_chars = Z - BEG;
7861       coding->src_bytes = Z_BYTE - BEG_BYTE;
7862       coding->src_pos = BEG;
7863       coding->src_pos_byte = BEG_BYTE;
7864       coding->src_multibyte = Z < Z_BYTE;
7865     }
7866   else if (STRINGP (src_object))
7867     {
7868       code_conversion_save (0, 0);
7869       coding->src_pos = from;
7870       coding->src_pos_byte = from_byte;
7871     }
7872   else if (BUFFERP (src_object))
7873     {
7874       code_conversion_save (0, 0);
7875       set_buffer_internal (XBUFFER (src_object));
7876       if (EQ (src_object, dst_object))
7877         {
7878           saved_pt = PT, saved_pt_byte = PT_BYTE;
7879           coding->src_object = del_range_1 (from, to, 1, 1);
7880           coding->src_pos = 0;
7881           coding->src_pos_byte = 0;
7882         }
7883       else
7884         {
7885           if (from < GPT && to >= GPT)
7886             move_gap_both (from, from_byte);
7887           coding->src_pos = from;
7888           coding->src_pos_byte = from_byte;
7889         }
7890     }
7891   else
7892     code_conversion_save (0, 0);
7893
7894   if (BUFFERP (dst_object))
7895     {
7896       coding->dst_object = dst_object;
7897       if (EQ (src_object, dst_object))
7898         {
7899           coding->dst_pos = from;
7900           coding->dst_pos_byte = from_byte;
7901         }
7902       else
7903         {
7904           struct buffer *current = current_buffer;
7905
7906           set_buffer_temp (XBUFFER (dst_object));
7907           coding->dst_pos = PT;
7908           coding->dst_pos_byte = PT_BYTE;
7909           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7910           set_buffer_temp (current);
7911         }
7912       coding->dst_multibyte
7913         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7914     }
7915   else if (EQ (dst_object, Qt))
7916     {
7917       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7918       coding->dst_object = Qnil;
7919       coding->destination = xmalloc (dst_bytes);
7920       coding->dst_bytes = dst_bytes;
7921       coding->dst_multibyte = 0;
7922     }
7923   else
7924     {
7925       coding->dst_object = Qnil;
7926       coding->dst_multibyte = 0;
7927     }
7928
7929   encode_coding (coding);
7930
7931   if (EQ (dst_object, Qt))
7932     {
7933       if (BUFFERP (coding->dst_object))
7934         coding->dst_object = Fbuffer_string ();
7935       else
7936         {
7937           coding->dst_object
7938             = make_unibyte_string ((char *) coding->destination,
7939                                    coding->produced);
7940           xfree (coding->destination);
7941         }
7942     }
7943
7944   if (saved_pt >= 0)
7945     {
7946       /* This is the case of:
7947          (BUFFERP (src_object) && EQ (src_object, dst_object))
7948          As we have moved PT while replacing the original buffer
7949          contents, we must recover it now.  */
7950       set_buffer_internal (XBUFFER (src_object));
7951       if (saved_pt < from)
7952         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7953       else if (saved_pt < from + chars)
7954         TEMP_SET_PT_BOTH (from, from_byte);
7955       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7956         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7957                           saved_pt_byte + (coding->produced - bytes));
7958       else
7959         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7960                           saved_pt_byte + (coding->produced - bytes));
7961
7962       if (need_marker_adjustment)
7963         {
7964           struct Lisp_Marker *tail;
7965
7966           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7967             if (tail->need_adjustment)
7968               {
7969                 tail->need_adjustment = 0;
7970                 if (tail->insertion_type)
7971                   {
7972                     tail->bytepos = from_byte;
7973                     tail->charpos = from;
7974                   }
7975                 else
7976                   {
7977                     tail->bytepos = from_byte + coding->produced;
7978                     tail->charpos
7979                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7980                          ? tail->bytepos : from + coding->produced_char);
7981                   }
7982               }
7983         }
7984     }
7985
7986   if (kill_src_buffer)
7987     Fkill_buffer (coding->src_object);
7988
7989   Vdeactivate_mark = old_deactivate_mark;
7990   unbind_to (count, Qnil);
7991 }
7992
7993
7994 Lisp_Object
7995 preferred_coding_system (void)
7996 {
7997   int id = coding_categories[coding_priorities[0]].id;
7998
7999   return CODING_ID_NAME (id);
8000 }
8001
8002 \f
8003 #ifdef emacs
8004 /*** 8. Emacs Lisp library functions ***/
8005
8006 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8007        doc: /* Return t if OBJECT is nil or a coding-system.
8008 See the documentation of `define-coding-system' for information
8009 about coding-system objects.  */)
8010   (Lisp_Object object)
8011 {
8012   if (NILP (object)
8013       || CODING_SYSTEM_ID (object) >= 0)
8014     return Qt;
8015   if (! SYMBOLP (object)
8016       || NILP (Fget (object, Qcoding_system_define_form)))
8017     return Qnil;
8018   return Qt;
8019 }
8020
8021 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8022        Sread_non_nil_coding_system, 1, 1, 0,
8023        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8024   (Lisp_Object prompt)
8025 {
8026   Lisp_Object val;
8027   do
8028     {
8029       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8030                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8031     }
8032   while (SCHARS (val) == 0);
8033   return (Fintern (val, Qnil));
8034 }
8035
8036 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8037        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8038 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8039 Ignores case when completing coding systems (all Emacs coding systems
8040 are lower-case).  */)
8041   (Lisp_Object prompt, Lisp_Object default_coding_system)
8042 {
8043   Lisp_Object val;
8044   ptrdiff_t count = SPECPDL_INDEX ();
8045
8046   if (SYMBOLP (default_coding_system))
8047     default_coding_system = SYMBOL_NAME (default_coding_system);
8048   specbind (Qcompletion_ignore_case, Qt);
8049   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8050                           Qt, Qnil, Qcoding_system_history,
8051                           default_coding_system, Qnil);
8052   unbind_to (count, Qnil);
8053   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8054 }
8055
8056 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8057        1, 1, 0,
8058        doc: /* Check validity of CODING-SYSTEM.
8059 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8060 It is valid if it is nil or a symbol defined as a coding system by the
8061 function `define-coding-system'.  */)
8062   (Lisp_Object coding_system)
8063 {
8064   Lisp_Object define_form;
8065
8066   define_form = Fget (coding_system, Qcoding_system_define_form);
8067   if (! NILP (define_form))
8068     {
8069       Fput (coding_system, Qcoding_system_define_form, Qnil);
8070       safe_eval (define_form);
8071     }
8072   if (!NILP (Fcoding_system_p (coding_system)))
8073     return coding_system;
8074   xsignal1 (Qcoding_system_error, coding_system);
8075 }
8076
8077 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contains only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is stored in the local coding structure as the character
   count of the source.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   When HIGHEST is set and nothing was collected (for example the
   UTF-8 or UTF-16 auto detector returned zero), the value is nil.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* List of detected coding systems.  It holds coding-system ids
     until the eol-format pass below replaces each id by a name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING-SYSTEM is treated as `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the source to the detectors through CODING.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         While no byte with the high bit set has been seen,
         coding.head_ascii counts the 7-bit bytes passed over.  The
         scan stops early once both a null byte and an 8-bit byte have
         been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is run at most once: only while
                 detect_info.checked is still zero.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category detectors only if the scan above saw
         something other than plain 7-bit text, or the ISO-2022
         detector already reported a match.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            /* Stop at the first category, in priority order, whose
               bit is already set in detect_info.found.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category except the UTF-16 ones as
                     already checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector call;
                         when only the best match is wanted, a
                         recorded hit ends the search.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the UTF-16 auto category, narrow the
                         result to the little- or big-endian category
                         according to what the detector found.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into the result list VAL.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: report
             `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was rejected and nothing was found: report the
             coding system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS are the values left by the
                 detection loops above.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* No positive match: take the first category, in priority
               order, that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops walk from the lowest priority upward and push
             onto the front of VAL, so each group ends up in priority
             order.  This first loop collects the categories that were
             neither rejected nor found.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* The found categories are pushed afterwards, so they
             precede the merely-possible ones in the result.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between the with-signature and without-signature
         UTF-8 categories.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide which UTF-16 category applies.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified: report it as is.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that the eol-format was not examined for that group.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format of CODING-SYSTEM is not fixed, so examine
           the data once per group of categories that was found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is already specified by CODING-SYSTEM.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding-system name: the element of
       the eol-type vector selected by the eol-format seen (index 0
       for LF, 1 for CRLF, 2 for CR), or the name registered for the
       id itself when no eol-format applies.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, return only the first element (or nil when VAL is
     empty); otherwise return the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8398
8399
8400 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8401        2, 3, 0,
8402        doc: /* Detect coding system of the text in the region between START and END.
8403 Return a list of possible coding systems ordered by priority.
8404 The coding systems to try and their priorities follows what
8405 the function `coding-system-priority-list' (which see) returns.
8406
8407 If only ASCII characters are found (except for such ISO-2022 control
8408 characters as ESC), it returns a list of single element `undecided'
8409 or its subsidiary coding system according to a detected end-of-line
8410 format.
8411
8412 If optional argument HIGHEST is non-nil, return the coding system of
8413 highest priority.  */)
8414   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8415 {
8416   ptrdiff_t from, to;
8417   ptrdiff_t from_byte, to_byte;
8418
8419   CHECK_NUMBER_COERCE_MARKER (start);
8420   CHECK_NUMBER_COERCE_MARKER (end);
8421
8422   validate_region (&start, &end);
8423   from = XINT (start), to = XINT (end);
8424   from_byte = CHAR_TO_BYTE (from);
8425   to_byte = CHAR_TO_BYTE (to);
8426
8427   if (from < GPT && to >= GPT)
8428     move_gap_both (to, to_byte);
8429
8430   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8431                                to - from, to_byte - from_byte,
8432                                !NILP (highest),
8433                                !NILP (BVAR (current_buffer
8434                                       , enable_multibyte_characters)),
8435                                Qnil);
8436 }
8437
8438 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8439        1, 2, 0,
8440        doc: /* Detect coding system of the text in STRING.
8441 Return a list of possible coding systems ordered by priority.
8442 The coding systems to try and their priorities follows what
8443 the function `coding-system-priority-list' (which see) returns.
8444
8445 If only ASCII characters are found (except for such ISO-2022 control
8446 characters as ESC), it returns a list of single element `undecided'
8447 or its subsidiary coding system according to a detected end-of-line
8448 format.
8449
8450 If optional argument HIGHEST is non-nil, return the coding system of
8451 highest priority.  */)
8452   (Lisp_Object string, Lisp_Object highest)
8453 {
8454   CHECK_STRING (string);
8455
8456   return detect_coding_system (SDATA (string),
8457                                SCHARS (string), SBYTES (string),
8458                                !NILP (highest), STRING_MULTIBYTE (string),
8459                                Qnil);
8460 }
8461
8462
8463 static inline bool
8464 char_encodable_p (int c, Lisp_Object attrs)
8465 {
8466   Lisp_Object tail;
8467   struct charset *charset;
8468   Lisp_Object translation_table;
8469
8470   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8471   if (! NILP (translation_table))
8472     c = translate_char (translation_table, c);
8473   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8474        CONSP (tail); tail = XCDR (tail))
8475     {
8476       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8477       if (CHAR_CHARSET_P (c, charset))
8478         break;
8479     }
8480   return (! NILP (tail));
8481 }
8482
8483
8484 /* Return a list of coding systems that safely encode the text between
8485    START and END.  If EXCLUDE is non-nil, it is a list of coding
8486    systems not to check.  The returned list doesn't contain any such
8487    coding systems.  In any case, if the text contains only ASCII or is
8488    unibyte, return t.  */
8489
8490 DEFUN ("find-coding-systems-region-internal",
8491        Ffind_coding_systems_region_internal,
8492        Sfind_coding_systems_region_internal, 2, 3, 0,
8493        doc: /* Internal use only.  */)
8494   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8495 {
8496   Lisp_Object coding_attrs_list, safe_codings;
8497   ptrdiff_t start_byte, end_byte;
8498   const unsigned char *p, *pbeg, *pend;
8499   int c;
8500   Lisp_Object tail, elt, work_table;
8501
8502   if (STRINGP (start))
8503     {
8504       if (!STRING_MULTIBYTE (start)
8505           || SCHARS (start) == SBYTES (start))
8506         return Qt;
8507       start_byte = 0;
8508       end_byte = SBYTES (start);
8509     }
8510   else
8511     {
8512       CHECK_NUMBER_COERCE_MARKER (start);
8513       CHECK_NUMBER_COERCE_MARKER (end);
8514       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8515         args_out_of_range (start, end);
8516       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8517         return Qt;
8518       start_byte = CHAR_TO_BYTE (XINT (start));
8519       end_byte = CHAR_TO_BYTE (XINT (end));
8520       if (XINT (end) - XINT (start) == end_byte - start_byte)
8521         return Qt;
8522
8523       if (XINT (start) < GPT && XINT (end) > GPT)
8524         {
8525           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8526             move_gap_both (XINT (start), start_byte);
8527           else
8528             move_gap_both (XINT (end), end_byte);
8529         }
8530     }
8531
8532   coding_attrs_list = Qnil;
8533   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8534     if (NILP (exclude)
8535         || NILP (Fmemq (XCAR (tail), exclude)))
8536       {
8537         Lisp_Object attrs;
8538
8539         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8540         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8541             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8542           {
8543             ASET (attrs, coding_attr_trans_tbl,
8544                   get_translation_table (attrs, 1, NULL));
8545             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8546           }
8547       }
8548
8549   if (STRINGP (start))
8550     p = pbeg = SDATA (start);
8551   else
8552     p = pbeg = BYTE_POS_ADDR (start_byte);
8553   pend = p + (end_byte - start_byte);
8554
8555   while (p < pend && ASCII_BYTE_P (*p)) p++;
8556   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8557
8558   work_table = Fmake_char_table (Qnil, Qnil);
8559   while (p < pend)
8560     {
8561       if (ASCII_BYTE_P (*p))
8562         p++;
8563       else
8564         {
8565           c = STRING_CHAR_ADVANCE (p);
8566           if (!NILP (char_table_ref (work_table, c)))
8567             /* This character was already checked.  Ignore it.  */
8568             continue;
8569
8570           charset_map_loaded = 0;
8571           for (tail = coding_attrs_list; CONSP (tail);)
8572             {
8573               elt = XCAR (tail);
8574               if (NILP (elt))
8575                 tail = XCDR (tail);
8576               else if (char_encodable_p (c, elt))
8577                 tail = XCDR (tail);
8578               else if (CONSP (XCDR (tail)))
8579                 {
8580                   XSETCAR (tail, XCAR (XCDR (tail)));
8581                   XSETCDR (tail, XCDR (XCDR (tail)));
8582                 }
8583               else
8584                 {
8585                   XSETCAR (tail, Qnil);
8586                   tail = XCDR (tail);
8587                 }
8588             }
8589           if (charset_map_loaded)
8590             {
8591               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8592
8593               if (STRINGP (start))
8594                 pbeg = SDATA (start);
8595               else
8596                 pbeg = BYTE_POS_ADDR (start_byte);
8597               p = pbeg + p_offset;
8598               pend = pbeg + pend_offset;
8599             }
8600           char_table_set (work_table, c, Qt);
8601         }
8602     }
8603
8604   safe_codings = list2 (Qraw_text, Qno_conversion);
8605   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8606     if (! NILP (XCAR (tail)))
8607       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8608
8609   return safe_codings;
8610 }
8611
8612
8613 DEFUN ("unencodable-char-position", Funencodable_char_position,
8614        Sunencodable_char_position, 3, 5, 0,
8615        doc: /*
8616 Return position of first un-encodable character in a region.
8617 START and END specify the region and CODING-SYSTEM specifies the
8618 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8619
8620 If optional 4th argument COUNT is non-nil, it specifies at most how
8621 many un-encodable characters to search.  In this case, the value is a
8622 list of positions.
8623
8624 If optional 5th argument STRING is non-nil, it is a string to search
8625 for un-encodable characters.  In that case, START and END are indexes
8626 to the string.  */)
8627   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8628 {
8629   EMACS_INT n;
8630   struct coding_system coding;
8631   Lisp_Object attrs, charset_list, translation_table;
8632   Lisp_Object positions;
8633   ptrdiff_t from, to;
8634   const unsigned char *p, *stop, *pend;
8635   bool ascii_compatible;
8636
8637   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8638   attrs = CODING_ID_ATTRS (coding.id);
8639   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8640     return Qnil;
8641   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8642   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8643   translation_table = get_translation_table (attrs, 1, NULL);
8644
8645   if (NILP (string))
8646     {
8647       validate_region (&start, &end);
8648       from = XINT (start);
8649       to = XINT (end);
8650       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8651           || (ascii_compatible
8652               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8653         return Qnil;
8654       p = CHAR_POS_ADDR (from);
8655       pend = CHAR_POS_ADDR (to);
8656       if (from < GPT && to >= GPT)
8657         stop = GPT_ADDR;
8658       else
8659         stop = pend;
8660     }
8661   else
8662     {
8663       CHECK_STRING (string);
8664       CHECK_NATNUM (start);
8665       CHECK_NATNUM (end);
8666       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8667         args_out_of_range_3 (string, start, end);
8668       from = XINT (start);
8669       to = XINT (end);
8670       if (! STRING_MULTIBYTE (string))
8671         return Qnil;
8672       p = SDATA (string) + string_char_to_byte (string, from);
8673       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8674       if (ascii_compatible && (to - from) == (pend - p))
8675         return Qnil;
8676     }
8677
8678   if (NILP (count))
8679     n = 1;
8680   else
8681     {
8682       CHECK_NATNUM (count);
8683       n = XINT (count);
8684     }
8685
8686   positions = Qnil;
8687   charset_map_loaded = 0;
8688   while (1)
8689     {
8690       int c;
8691
8692       if (ascii_compatible)
8693         while (p < stop && ASCII_BYTE_P (*p))
8694           p++, from++;
8695       if (p >= stop)
8696         {
8697           if (p >= pend)
8698             break;
8699           stop = pend;
8700           p = GAP_END_ADDR;
8701         }
8702
8703       c = STRING_CHAR_ADVANCE (p);
8704       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8705           && ! char_charset (translate_char (translation_table, c),
8706                              charset_list, NULL))
8707         {
8708           positions = Fcons (make_number (from), positions);
8709           n--;
8710           if (n == 0)
8711             break;
8712         }
8713
8714       from++;
8715       if (charset_map_loaded && NILP (string))
8716         {
8717           p = CHAR_POS_ADDR (from);
8718           pend = CHAR_POS_ADDR (to);
8719           if (from < GPT && to >= GPT)
8720             stop = GPT_ADDR;
8721           else
8722             stop = pend;
8723           charset_map_loaded = 0;
8724         }
8725     }
8726
8727   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8728 }
8729
8730
8731 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8732        Scheck_coding_systems_region, 3, 3, 0,
8733        doc: /* Check if the region is encodable by coding systems.
8734
8735 START and END are buffer positions specifying the region.
8736 CODING-SYSTEM-LIST is a list of coding systems to check.
8737
8738 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8739 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8740 whole region, POS0, POS1, ... are buffer positions where non-encodable
8741 characters are found.
8742
8743 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8744 value is nil.
8745
8746 START may be a string.  In that case, check if the string is
8747 encodable, and the value contains indices to the string instead of
8748 buffer positions.  END is ignored.
8749
8750 If the current buffer (or START if it is a string) is unibyte, the value
8751 is nil.  */)
8752   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8753 {
8754   Lisp_Object list;
8755   ptrdiff_t start_byte, end_byte;
8756   ptrdiff_t pos;
8757   const unsigned char *p, *pbeg, *pend;
8758   int c;
8759   Lisp_Object tail, elt, attrs;
8760
8761   if (STRINGP (start))
8762     {
8763       if (!STRING_MULTIBYTE (start)
8764           || SCHARS (start) == SBYTES (start))
8765         return Qnil;
8766       start_byte = 0;
8767       end_byte = SBYTES (start);
8768       pos = 0;
8769     }
8770   else
8771     {
8772       CHECK_NUMBER_COERCE_MARKER (start);
8773       CHECK_NUMBER_COERCE_MARKER (end);
8774       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8775         args_out_of_range (start, end);
8776       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8777         return Qnil;
8778       start_byte = CHAR_TO_BYTE (XINT (start));
8779       end_byte = CHAR_TO_BYTE (XINT (end));
8780       if (XINT (end) - XINT (start) == end_byte - start_byte)
8781         return Qnil;
8782
8783       if (XINT (start) < GPT && XINT (end) > GPT)
8784         {
8785           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8786             move_gap_both (XINT (start), start_byte);
8787           else
8788             move_gap_both (XINT (end), end_byte);
8789         }
8790       pos = XINT (start);
8791     }
8792
8793   list = Qnil;
8794   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8795     {
8796       elt = XCAR (tail);
8797       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8798       ASET (attrs, coding_attr_trans_tbl,
8799             get_translation_table (attrs, 1, NULL));
8800       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8801     }
8802
8803   if (STRINGP (start))
8804     p = pbeg = SDATA (start);
8805   else
8806     p = pbeg = BYTE_POS_ADDR (start_byte);
8807   pend = p + (end_byte - start_byte);
8808
8809   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8810   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8811
8812   while (p < pend)
8813     {
8814       if (ASCII_BYTE_P (*p))
8815         p++;
8816       else
8817         {
8818           c = STRING_CHAR_ADVANCE (p);
8819
8820           charset_map_loaded = 0;
8821           for (tail = list; CONSP (tail); tail = XCDR (tail))
8822             {
8823               elt = XCDR (XCAR (tail));
8824               if (! char_encodable_p (c, XCAR (elt)))
8825                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8826             }
8827           if (charset_map_loaded)
8828             {
8829               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8830
8831               if (STRINGP (start))
8832                 pbeg = SDATA (start);
8833               else
8834                 pbeg = BYTE_POS_ADDR (start_byte);
8835               p = pbeg + p_offset;
8836               pend = pbeg + pend_offset;
8837             }
8838         }
8839       pos++;
8840     }
8841
8842   tail = list;
8843   list = Qnil;
8844   for (; CONSP (tail); tail = XCDR (tail))
8845     {
8846       elt = XCAR (tail);
8847       if (CONSP (XCDR (XCDR (elt))))
8848         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8849                       list);
8850     }
8851
8852   return list;
8853 }
8854
8855
8856 static Lisp_Object
8857 code_convert_region (Lisp_Object start, Lisp_Object end,
8858                      Lisp_Object coding_system, Lisp_Object dst_object,
8859                      bool encodep, bool norecord)
8860 {
8861   struct coding_system coding;
8862   ptrdiff_t from, from_byte, to, to_byte;
8863   Lisp_Object src_object;
8864
8865   CHECK_NUMBER_COERCE_MARKER (start);
8866   CHECK_NUMBER_COERCE_MARKER (end);
8867   if (NILP (coding_system))
8868     coding_system = Qno_conversion;
8869   else
8870     CHECK_CODING_SYSTEM (coding_system);
8871   src_object = Fcurrent_buffer ();
8872   if (NILP (dst_object))
8873     dst_object = src_object;
8874   else if (! EQ (dst_object, Qt))
8875     CHECK_BUFFER (dst_object);
8876
8877   validate_region (&start, &end);
8878   from = XFASTINT (start);
8879   from_byte = CHAR_TO_BYTE (from);
8880   to = XFASTINT (end);
8881   to_byte = CHAR_TO_BYTE (to);
8882
8883   setup_coding_system (coding_system, &coding);
8884   coding.mode |= CODING_MODE_LAST_BLOCK;
8885
8886   if (encodep)
8887     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8888                           dst_object);
8889   else
8890     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8891                           dst_object);
8892   if (! norecord)
8893     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8894
8895   return (BUFFERP (dst_object)
8896           ? make_number (coding.produced_char)
8897           : coding.dst_object);
8898 }
8899
8900
8901 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8902        3, 4, "r\nzCoding system: ",
8903        doc: /* Decode the current region from the specified coding system.
8904 When called from a program, takes four arguments:
8905         START, END, CODING-SYSTEM, and DESTINATION.
8906 START and END are buffer positions.
8907
8908 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8909 If nil, the region between START and END is replaced by the decoded text.
8910 If buffer, the decoded text is inserted in that buffer after point (point
8911 does not move).
8912 In those cases, the length of the decoded text is returned.
8913 If DESTINATION is t, the decoded text is returned.
8914
8915 This function sets `last-coding-system-used' to the precise coding system
8916 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8917 not fully specified.)  */)
8918   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8919 {
8920   return code_convert_region (start, end, coding_system, destination, 0, 0);
8921 }
8922
8923 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8924        3, 4, "r\nzCoding system: ",
8925        doc: /* Encode the current region by specified coding system.
8926 When called from a program, takes four arguments:
8927         START, END, CODING-SYSTEM and DESTINATION.
8928 START and END are buffer positions.
8929
8930 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8931 If nil, the region between START and END is replace by the encoded text.
8932 If buffer, the encoded text is inserted in that buffer after point (point
8933 does not move).
8934 In those cases, the length of the encoded text is returned.
8935 If DESTINATION is t, the encoded text is returned.
8936
8937 This function sets `last-coding-system-used' to the precise coding system
8938 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8939 not fully specified.)  */)
8940   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8941 {
8942   return code_convert_region (start, end, coding_system, destination, 1, 0);
8943 }
8944
8945 Lisp_Object
8946 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8947                      Lisp_Object dst_object, bool encodep, bool nocopy,
8948                      bool norecord)
8949 {
8950   struct coding_system coding;
8951   ptrdiff_t chars, bytes;
8952
8953   CHECK_STRING (string);
8954   if (NILP (coding_system))
8955     {
8956       if (! norecord)
8957         Vlast_coding_system_used = Qno_conversion;
8958       if (NILP (dst_object))
8959         return (nocopy ? Fcopy_sequence (string) : string);
8960     }
8961
8962   if (NILP (coding_system))
8963     coding_system = Qno_conversion;
8964   else
8965     CHECK_CODING_SYSTEM (coding_system);
8966   if (NILP (dst_object))
8967     dst_object = Qt;
8968   else if (! EQ (dst_object, Qt))
8969     CHECK_BUFFER (dst_object);
8970
8971   setup_coding_system (coding_system, &coding);
8972   coding.mode |= CODING_MODE_LAST_BLOCK;
8973   chars = SCHARS (string);
8974   bytes = SBYTES (string);
8975   if (encodep)
8976     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8977   else
8978     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8979   if (! norecord)
8980     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8981
8982   return (BUFFERP (dst_object)
8983           ? make_number (coding.produced_char)
8984           : coding.dst_object);
8985 }
8986
8987
8988 /* Encode or decode STRING according to CODING_SYSTEM.
8989    Do not set Vlast_coding_system_used.
8990
8991    This function is called only from macros DECODE_FILE and
8992    ENCODE_FILE, thus we ignore character composition.  */
8993
8994 Lisp_Object
8995 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8996                               bool encodep)
8997 {
8998   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8999 }
9000
9001
9002 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9003        2, 4, 0,
9004        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9005
9006 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9007 if the decoding operation is trivial.
9008
9009 Optional fourth arg BUFFER non-nil means that the decoded text is
9010 inserted in that buffer after point (point does not move).  In this
9011 case, the return value is the length of the decoded text.
9012
9013 This function sets `last-coding-system-used' to the precise coding system
9014 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9015 not fully specified.)  */)
9016   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9017 {
9018   return code_convert_string (string, coding_system, buffer,
9019                               0, ! NILP (nocopy), 0);
9020 }
9021
9022 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9023        2, 4, 0,
9024        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9025
9026 Optional third arg NOCOPY non-nil means it is OK to return STRING
9027 itself if the encoding operation is trivial.
9028
9029 Optional fourth arg BUFFER non-nil means that the encoded text is
9030 inserted in that buffer after point (point does not move).  In this
9031 case, the return value is the length of the encoded text.
9032
9033 This function sets `last-coding-system-used' to the precise coding system
9034 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9035 not fully specified.)  */)
9036   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9037 {
9038   return code_convert_string (string, coding_system, buffer,
9039                               1, ! NILP (nocopy), 0);
9040 }
9041
9042 \f
9043 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9044        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9045 Return the corresponding character.  */)
9046   (Lisp_Object code)
9047 {
9048   Lisp_Object spec, attrs, val;
9049   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9050   EMACS_INT ch;
9051   int c;
9052
9053   CHECK_NATNUM (code);
9054   ch = XFASTINT (code);
9055   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9056   attrs = AREF (spec, 0);
9057
9058   if (ASCII_BYTE_P (ch)
9059       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9060     return code;
9061
9062   val = CODING_ATTR_CHARSET_LIST (attrs);
9063   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9065   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9066
9067   if (ch <= 0x7F)
9068     {
9069       c = ch;
9070       charset = charset_roman;
9071     }
9072   else if (ch >= 0xA0 && ch < 0xDF)
9073     {
9074       c = ch - 0x80;
9075       charset = charset_kana;
9076     }
9077   else
9078     {
9079       EMACS_INT c1 = ch >> 8;
9080       int c2 = ch & 0xFF;
9081
9082       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9083           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9084         error ("Invalid code: %"pI"d", ch);
9085       c = ch;
9086       SJIS_TO_JIS (c);
9087       charset = charset_kanji;
9088     }
9089   c = DECODE_CHAR (charset, c);
9090   if (c < 0)
9091     error ("Invalid code: %"pI"d", ch);
9092   return make_number (c);
9093 }
9094
9095
9096 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9097        doc: /* Encode a Japanese character CH to shift_jis encoding.
9098 Return the corresponding code in SJIS.  */)
9099   (Lisp_Object ch)
9100 {
9101   Lisp_Object spec, attrs, charset_list;
9102   int c;
9103   struct charset *charset;
9104   unsigned code;
9105
9106   CHECK_CHARACTER (ch);
9107   c = XFASTINT (ch);
9108   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9109   attrs = AREF (spec, 0);
9110
9111   if (ASCII_CHAR_P (c)
9112       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9113     return ch;
9114
9115   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9116   charset = char_charset (c, charset_list, &code);
9117   if (code == CHARSET_INVALID_CODE (charset))
9118     error ("Can't encode by shift_jis encoding: %c", c);
9119   JIS_TO_SJIS (code);
9120
9121   return make_number (code);
9122 }
9123
9124 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9125        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9126 Return the corresponding character.  */)
9127   (Lisp_Object code)
9128 {
9129   Lisp_Object spec, attrs, val;
9130   struct charset *charset_roman, *charset_big5, *charset;
9131   EMACS_INT ch;
9132   int c;
9133
9134   CHECK_NATNUM (code);
9135   ch = XFASTINT (code);
9136   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9137   attrs = AREF (spec, 0);
9138
9139   if (ASCII_BYTE_P (ch)
9140       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9141     return code;
9142
9143   val = CODING_ATTR_CHARSET_LIST (attrs);
9144   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9145   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9146
9147   if (ch <= 0x7F)
9148     {
9149       c = ch;
9150       charset = charset_roman;
9151     }
9152   else
9153     {
9154       EMACS_INT b1 = ch >> 8;
9155       int b2 = ch & 0x7F;
9156       if (b1 < 0xA1 || b1 > 0xFE
9157           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9158         error ("Invalid code: %"pI"d", ch);
9159       c = ch;
9160       charset = charset_big5;
9161     }
9162   c = DECODE_CHAR (charset, c);
9163   if (c < 0)
9164     error ("Invalid code: %"pI"d", ch);
9165   return make_number (c);
9166 }
9167
9168 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9169        doc: /* Encode the Big5 character CH to BIG5 coding system.
9170 Return the corresponding character code in Big5.  */)
9171   (Lisp_Object ch)
9172 {
9173   Lisp_Object spec, attrs, charset_list;
9174   struct charset *charset;
9175   int c;
9176   unsigned code;
9177
9178   CHECK_CHARACTER (ch);
9179   c = XFASTINT (ch);
9180   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9181   attrs = AREF (spec, 0);
9182   if (ASCII_CHAR_P (c)
9183       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9184     return ch;
9185
9186   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9187   charset = char_charset (c, charset_list, &code);
9188   if (code == CHARSET_INVALID_CODE (charset))
9189     error ("Can't encode by Big5 encoding: %c", c);
9190
9191   return make_number (code);
9192 }
9193
9194 \f
9195 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9196        Sset_terminal_coding_system_internal, 1, 2, 0,
9197        doc: /* Internal use only.  */)
9198   (Lisp_Object coding_system, Lisp_Object terminal)
9199 {
9200   struct terminal *term = get_terminal (terminal, 1);
9201   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9202   CHECK_SYMBOL (coding_system);
9203   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9204   /* We had better not send unsafe characters to terminal.  */
9205   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9206   /* Character composition should be disabled.  */
9207   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9208   terminal_coding->src_multibyte = 1;
9209   terminal_coding->dst_multibyte = 0;
9210   tset_charset_list
9211     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9212             ? coding_charset_list (terminal_coding)
9213             : Fcons (make_number (charset_ascii), Qnil)));
9214   return Qnil;
9215 }
9216
9217 DEFUN ("set-safe-terminal-coding-system-internal",
9218        Fset_safe_terminal_coding_system_internal,
9219        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9220        doc: /* Internal use only.  */)
9221   (Lisp_Object coding_system)
9222 {
9223   CHECK_SYMBOL (coding_system);
9224   setup_coding_system (Fcheck_coding_system (coding_system),
9225                        &safe_terminal_coding);
9226   /* Character composition should be disabled.  */
9227   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9228   safe_terminal_coding.src_multibyte = 1;
9229   safe_terminal_coding.dst_multibyte = 0;
9230   return Qnil;
9231 }
9232
9233 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9234        Sterminal_coding_system, 0, 1, 0,
9235        doc: /* Return coding system specified for terminal output on the given terminal.
9236 TERMINAL may be a terminal object, a frame, or nil for the selected
9237 frame's terminal device.  */)
9238   (Lisp_Object terminal)
9239 {
9240   struct coding_system *terminal_coding
9241     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9242   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9243
9244   /* For backward compatibility, return nil if it is `undecided'.  */
9245   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9246 }
9247
9248 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9249        Sset_keyboard_coding_system_internal, 1, 2, 0,
9250        doc: /* Internal use only.  */)
9251   (Lisp_Object coding_system, Lisp_Object terminal)
9252 {
9253   struct terminal *t = get_terminal (terminal, 1);
9254   CHECK_SYMBOL (coding_system);
9255   if (NILP (coding_system))
9256     coding_system = Qno_conversion;
9257   else
9258     Fcheck_coding_system (coding_system);
9259   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9260   /* Character composition should be disabled.  */
9261   TERMINAL_KEYBOARD_CODING (t)->common_flags
9262     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9263   return Qnil;
9264 }
9265
9266 DEFUN ("keyboard-coding-system",
9267        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9268        doc: /* Return coding system specified for decoding keyboard input.  */)
9269   (Lisp_Object terminal)
9270 {
9271   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9272                          (get_terminal (terminal, 1))->id);
9273 }
9274
9275 \f
9276 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9277        Sfind_operation_coding_system,  1, MANY, 0,
9278        doc: /* Choose a coding system for an operation based on the target name.
9279 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9280 DECODING-SYSTEM is the coding system to use for decoding
9281 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9282 for encoding (in case OPERATION does encoding).
9283
9284 The first argument OPERATION specifies an I/O primitive:
9285   For file I/O, `insert-file-contents' or `write-region'.
9286   For process I/O, `call-process', `call-process-region', or `start-process'.
9287   For network I/O, `open-network-stream'.
9288
9289 The remaining arguments should be the same arguments that were passed
9290 to the primitive.  Depending on which primitive, one of those arguments
9291 is selected as the TARGET.  For example, if OPERATION does file I/O,
9292 whichever argument specifies the file name is TARGET.
9293
9294 TARGET has a meaning which depends on OPERATION:
9295   For file I/O, TARGET is a file name (except for the special case below).
9296   For process I/O, TARGET is a process name.
9297   For network I/O, TARGET is a service name or a port number.
9298
9299 This function looks up what is specified for TARGET in
9300 `file-coding-system-alist', `process-coding-system-alist',
9301 or `network-coding-system-alist' depending on OPERATION.
9302 They may specify a coding system, a cons of coding systems,
9303 or a function symbol to call.
9304 In the last case, we call the function with one argument,
9305 which is a list of all the arguments given to this function.
9306 If the function can't decide a coding system, it can return
9307 `undecided' so that the normal code-detection is performed.
9308
9309 If OPERATION is `insert-file-contents', the argument corresponding to
9310 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9311 file name to look up, and BUFFER is a buffer that contains the file's
9312 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9313 function to call for FILENAME, that function should examine the
9314 contents of BUFFER instead of reading the file.
9315
9316 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9317   (ptrdiff_t nargs, Lisp_Object *args)
9318 {
9319   Lisp_Object operation, target_idx, target, val;
9320   register Lisp_Object chain;
9321
9322   if (nargs < 2)
9323     error ("Too few arguments");
9324   operation = args[0];
9325   if (!SYMBOLP (operation)
9326       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9327     error ("Invalid first argument");
9328   if (nargs <= 1 + XFASTINT (target_idx))
9329     error ("Too few arguments for operation `%s'",
9330            SDATA (SYMBOL_NAME (operation)));
9331   target = args[XFASTINT (target_idx) + 1];
9332   if (!(STRINGP (target)
9333         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9334             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9335         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9336     error ("Invalid argument %"pI"d of operation `%s'",
9337            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9338   if (CONSP (target))
9339     target = XCAR (target);
9340
9341   chain = ((EQ (operation, Qinsert_file_contents)
9342             || EQ (operation, Qwrite_region))
9343            ? Vfile_coding_system_alist
9344            : (EQ (operation, Qopen_network_stream)
9345               ? Vnetwork_coding_system_alist
9346               : Vprocess_coding_system_alist));
9347   if (NILP (chain))
9348     return Qnil;
9349
9350   for (; CONSP (chain); chain = XCDR (chain))
9351     {
9352       Lisp_Object elt;
9353
9354       elt = XCAR (chain);
9355       if (CONSP (elt)
9356           && ((STRINGP (target)
9357                && STRINGP (XCAR (elt))
9358                && fast_string_match (XCAR (elt), target) >= 0)
9359               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9360         {
9361           val = XCDR (elt);
9362           /* Here, if VAL is both a valid coding system and a valid
9363              function symbol, we return VAL as a coding system.  */
9364           if (CONSP (val))
9365             return val;
9366           if (! SYMBOLP (val))
9367             return Qnil;
9368           if (! NILP (Fcoding_system_p (val)))
9369             return Fcons (val, val);
9370           if (! NILP (Ffboundp (val)))
9371             {
9372               /* We use call1 rather than safe_call1
9373                  so as to get bug reports about functions called here
9374                  which don't handle the current interface.  */
9375               val = call1 (val, Flist (nargs, args));
9376               if (CONSP (val))
9377                 return val;
9378               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9379                 return Fcons (val, val);
9380             }
9381           return Qnil;
9382         }
9383     }
9384   return Qnil;
9385 }
9386
9387 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9388        Sset_coding_system_priority, 0, MANY, 0,
9389        doc: /* Assign higher priority to the coding systems given as arguments.
9390 If multiple coding systems belong to the same category,
9391 all but the first one are ignored.
9392
9393 usage: (set-coding-system-priority &rest coding-systems)  */)
9394   (ptrdiff_t nargs, Lisp_Object *args)
9395 {
9396   ptrdiff_t i, j;
9397   bool changed[coding_category_max];
9398   enum coding_category priorities[coding_category_max];
9399
9400   memset (changed, 0, sizeof changed);
9401
9402   for (i = j = 0; i < nargs; i++)
9403     {
9404       enum coding_category category;
9405       Lisp_Object spec, attrs;
9406
9407       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9408       attrs = AREF (spec, 0);
9409       category = XINT (CODING_ATTR_CATEGORY (attrs));
9410       if (changed[category])
9411         /* Ignore this coding system because a coding system of the
9412            same category already had a higher priority.  */
9413         continue;
9414       changed[category] = 1;
9415       priorities[j++] = category;
9416       if (coding_categories[category].id >= 0
9417           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9418         setup_coding_system (args[i], &coding_categories[category]);
9419       Fset (AREF (Vcoding_category_table, category), args[i]);
9420     }
9421
9422   /* Now we have decided top J priorities.  Reflect the order of the
9423      original priorities to the remaining priorities.  */
9424
9425   for (i = j, j = 0; i < coding_category_max; i++, j++)
9426     {
9427       while (j < coding_category_max
9428              && changed[coding_priorities[j]])
9429         j++;
9430       if (j == coding_category_max)
9431         abort ();
9432       priorities[i] = coding_priorities[j];
9433     }
9434
9435   memcpy (coding_priorities, priorities, sizeof priorities);
9436
9437   /* Update `coding-category-list'.  */
9438   Vcoding_category_list = Qnil;
9439   for (i = coding_category_max; i-- > 0; )
9440     Vcoding_category_list
9441       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9442                Vcoding_category_list);
9443
9444   return Qnil;
9445 }
9446
9447 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9448        Scoding_system_priority_list, 0, 1, 0,
9449        doc: /* Return a list of coding systems ordered by their priorities.
9450 The list contains a subset of coding systems; i.e. coding systems
9451 assigned to each coding category (see `coding-category-list').
9452
9453 HIGHESTP non-nil means just return the highest priority one.  */)
9454   (Lisp_Object highestp)
9455 {
9456   int i;
9457   Lisp_Object val;
9458
9459   for (i = 0, val = Qnil; i < coding_category_max; i++)
9460     {
9461       enum coding_category category = coding_priorities[i];
9462       int id = coding_categories[category].id;
9463       Lisp_Object attrs;
9464
9465       if (id < 0)
9466         continue;
9467       attrs = CODING_ID_ATTRS (id);
9468       if (! NILP (highestp))
9469         return CODING_ATTR_BASE_NAME (attrs);
9470       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9471     }
9472   return Fnreverse (val);
9473 }
9474
9475 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9476
9477 static Lisp_Object
9478 make_subsidiaries (Lisp_Object base)
9479 {
9480   Lisp_Object subsidiaries;
9481   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9482   char *buf = alloca (base_name_len + 6);
9483   int i;
9484
9485   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9486   subsidiaries = Fmake_vector (make_number (3), Qnil);
9487   for (i = 0; i < 3; i++)
9488     {
9489       strcpy (buf + base_name_len, suffixes[i]);
9490       ASET (subsidiaries, i, intern (buf));
9491     }
9492   return subsidiaries;
9493 }
9494
9495
9496 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9497        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9498        doc: /* For internal use only.
9499 usage: (define-coding-system-internal ...)  */)
9500   (ptrdiff_t nargs, Lisp_Object *args)
9501 {
9502   Lisp_Object name;
9503   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9504   Lisp_Object attrs;            /* Vector of attributes.  */
9505   Lisp_Object eol_type;
9506   Lisp_Object aliases;
9507   Lisp_Object coding_type, charset_list, safe_charsets;
9508   enum coding_category category;
9509   Lisp_Object tail, val;
9510   int max_charset_id = 0;
9511   int i;
9512
9513   if (nargs < coding_arg_max)
9514     goto short_args;
9515
9516   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9517
9518   name = args[coding_arg_name];
9519   CHECK_SYMBOL (name);
9520   ASET (attrs, coding_attr_base_name, name);
9521
9522   val = args[coding_arg_mnemonic];
9523   if (! STRINGP (val))
9524     CHECK_CHARACTER (val);
9525   ASET (attrs, coding_attr_mnemonic, val);
9526
9527   coding_type = args[coding_arg_coding_type];
9528   CHECK_SYMBOL (coding_type);
9529   ASET (attrs, coding_attr_type, coding_type);
9530
9531   charset_list = args[coding_arg_charset_list];
9532   if (SYMBOLP (charset_list))
9533     {
9534       if (EQ (charset_list, Qiso_2022))
9535         {
9536           if (! EQ (coding_type, Qiso_2022))
9537             error ("Invalid charset-list");
9538           charset_list = Viso_2022_charset_list;
9539         }
9540       else if (EQ (charset_list, Qemacs_mule))
9541         {
9542           if (! EQ (coding_type, Qemacs_mule))
9543             error ("Invalid charset-list");
9544           charset_list = Vemacs_mule_charset_list;
9545         }
9546       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9547         {
9548           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9549             error ("Invalid charset-list");
9550           if (max_charset_id < XFASTINT (XCAR (tail)))
9551             max_charset_id = XFASTINT (XCAR (tail));
9552         }
9553     }
9554   else
9555     {
9556       charset_list = Fcopy_sequence (charset_list);
9557       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9558         {
9559           struct charset *charset;
9560
9561           val = XCAR (tail);
9562           CHECK_CHARSET_GET_CHARSET (val, charset);
9563           if (EQ (coding_type, Qiso_2022)
9564               ? CHARSET_ISO_FINAL (charset) < 0
9565               : EQ (coding_type, Qemacs_mule)
9566               ? CHARSET_EMACS_MULE_ID (charset) < 0
9567               : 0)
9568             error ("Can't handle charset `%s'",
9569                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9570
9571           XSETCAR (tail, make_number (charset->id));
9572           if (max_charset_id < charset->id)
9573             max_charset_id = charset->id;
9574         }
9575     }
9576   ASET (attrs, coding_attr_charset_list, charset_list);
9577
9578   safe_charsets = make_uninit_string (max_charset_id + 1);
9579   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9580   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9581     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9582   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9583
9584   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9585
9586   val = args[coding_arg_decode_translation_table];
9587   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9588     CHECK_SYMBOL (val);
9589   ASET (attrs, coding_attr_decode_tbl, val);
9590
9591   val = args[coding_arg_encode_translation_table];
9592   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9593     CHECK_SYMBOL (val);
9594   ASET (attrs, coding_attr_encode_tbl, val);
9595
9596   val = args[coding_arg_post_read_conversion];
9597   CHECK_SYMBOL (val);
9598   ASET (attrs, coding_attr_post_read, val);
9599
9600   val = args[coding_arg_pre_write_conversion];
9601   CHECK_SYMBOL (val);
9602   ASET (attrs, coding_attr_pre_write, val);
9603
9604   val = args[coding_arg_default_char];
9605   if (NILP (val))
9606     ASET (attrs, coding_attr_default_char, make_number (' '));
9607   else
9608     {
9609       CHECK_CHARACTER (val);
9610       ASET (attrs, coding_attr_default_char, val);
9611     }
9612
9613   val = args[coding_arg_for_unibyte];
9614   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9615
9616   val = args[coding_arg_plist];
9617   CHECK_LIST (val);
9618   ASET (attrs, coding_attr_plist, val);
9619
9620   if (EQ (coding_type, Qcharset))
9621     {
9622       /* Generate a lisp vector of 256 elements.  Each element is nil,
9623          integer, or a list of charset IDs.
9624
9625          If Nth element is nil, the byte code N is invalid in this
9626          coding system.
9627
9628          If Nth element is a number NUM, N is the first byte of a
9629          charset whose ID is NUM.
9630
9631          If Nth element is a list of charset IDs, N is the first byte
9632          of one of them.  The list is sorted by dimensions of the
9633          charsets.  A charset of smaller dimension comes first. */
9634       val = Fmake_vector (make_number (256), Qnil);
9635
9636       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9637         {
9638           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9639           int dim = CHARSET_DIMENSION (charset);
9640           int idx = (dim - 1) * 4;
9641
9642           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9643             ASET (attrs, coding_attr_ascii_compat, Qt);
9644
9645           for (i = charset->code_space[idx];
9646                i <= charset->code_space[idx + 1]; i++)
9647             {
9648               Lisp_Object tmp, tmp2;
9649               int dim2;
9650
9651               tmp = AREF (val, i);
9652               if (NILP (tmp))
9653                 tmp = XCAR (tail);
9654               else if (NUMBERP (tmp))
9655                 {
9656                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9657                   if (dim < dim2)
9658                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9659                   else
9660                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9661                 }
9662               else
9663                 {
9664                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9665                     {
9666                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9667                       if (dim < dim2)
9668                         break;
9669                     }
9670                   if (NILP (tmp2))
9671                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9672                   else
9673                     {
9674                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9675                       XSETCAR (tmp2, XCAR (tail));
9676                     }
9677                 }
9678               ASET (val, i, tmp);
9679             }
9680         }
9681       ASET (attrs, coding_attr_charset_valids, val);
9682       category = coding_category_charset;
9683     }
9684   else if (EQ (coding_type, Qccl))
9685     {
9686       Lisp_Object valids;
9687
9688       if (nargs < coding_arg_ccl_max)
9689         goto short_args;
9690
9691       val = args[coding_arg_ccl_decoder];
9692       CHECK_CCL_PROGRAM (val);
9693       if (VECTORP (val))
9694         val = Fcopy_sequence (val);
9695       ASET (attrs, coding_attr_ccl_decoder, val);
9696
9697       val = args[coding_arg_ccl_encoder];
9698       CHECK_CCL_PROGRAM (val);
9699       if (VECTORP (val))
9700         val = Fcopy_sequence (val);
9701       ASET (attrs, coding_attr_ccl_encoder, val);
9702
9703       val = args[coding_arg_ccl_valids];
9704       valids = Fmake_string (make_number (256), make_number (0));
9705       for (tail = val; CONSP (tail); tail = XCDR (tail))
9706         {
9707           int from, to;
9708
9709           val = XCAR (tail);
9710           if (INTEGERP (val))
9711             {
9712               if (! (0 <= XINT (val) && XINT (val) <= 255))
9713                 args_out_of_range_3 (val, make_number (0), make_number (255));
9714               from = to = XINT (val);
9715             }
9716           else
9717             {
9718               CHECK_CONS (val);
9719               CHECK_NATNUM_CAR (val);
9720               CHECK_NUMBER_CDR (val);
9721               if (XINT (XCAR (val)) > 255)
9722                 args_out_of_range_3 (XCAR (val),
9723                                      make_number (0), make_number (255));
9724               from = XINT (XCAR (val));
9725               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9726                 args_out_of_range_3 (XCDR (val),
9727                                      XCAR (val), make_number (255));
9728               to = XINT (XCDR (val));
9729             }
9730           for (i = from; i <= to; i++)
9731             SSET (valids, i, 1);
9732         }
9733       ASET (attrs, coding_attr_ccl_valids, valids);
9734
9735       category = coding_category_ccl;
9736     }
9737   else if (EQ (coding_type, Qutf_16))
9738     {
9739       Lisp_Object bom, endian;
9740
9741       ASET (attrs, coding_attr_ascii_compat, Qnil);
9742
9743       if (nargs < coding_arg_utf16_max)
9744         goto short_args;
9745
9746       bom = args[coding_arg_utf16_bom];
9747       if (! NILP (bom) && ! EQ (bom, Qt))
9748         {
9749           CHECK_CONS (bom);
9750           val = XCAR (bom);
9751           CHECK_CODING_SYSTEM (val);
9752           val = XCDR (bom);
9753           CHECK_CODING_SYSTEM (val);
9754         }
9755       ASET (attrs, coding_attr_utf_bom, bom);
9756
9757       endian = args[coding_arg_utf16_endian];
9758       CHECK_SYMBOL (endian);
9759       if (NILP (endian))
9760         endian = Qbig;
9761       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9762         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9763       ASET (attrs, coding_attr_utf_16_endian, endian);
9764
9765       category = (CONSP (bom)
9766                   ? coding_category_utf_16_auto
9767                   : NILP (bom)
9768                   ? (EQ (endian, Qbig)
9769                      ? coding_category_utf_16_be_nosig
9770                      : coding_category_utf_16_le_nosig)
9771                   : (EQ (endian, Qbig)
9772                      ? coding_category_utf_16_be
9773                      : coding_category_utf_16_le));
9774     }
9775   else if (EQ (coding_type, Qiso_2022))
9776     {
9777       Lisp_Object initial, reg_usage, request, flags;
9778
9779       if (nargs < coding_arg_iso2022_max)
9780         goto short_args;
9781
9782       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9783       CHECK_VECTOR (initial);
9784       for (i = 0; i < 4; i++)
9785         {
9786           val = Faref (initial, make_number (i));
9787           if (! NILP (val))
9788             {
9789               struct charset *charset;
9790
9791               CHECK_CHARSET_GET_CHARSET (val, charset);
9792               ASET (initial, i, make_number (CHARSET_ID (charset)));
9793               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9794                 ASET (attrs, coding_attr_ascii_compat, Qt);
9795             }
9796           else
9797             ASET (initial, i, make_number (-1));
9798         }
9799
9800       reg_usage = args[coding_arg_iso2022_reg_usage];
9801       CHECK_CONS (reg_usage);
9802       CHECK_NUMBER_CAR (reg_usage);
9803       CHECK_NUMBER_CDR (reg_usage);
9804
9805       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9806       for (tail = request; CONSP (tail); tail = XCDR (tail))
9807         {
9808           int id;
9809           Lisp_Object tmp1;
9810
9811           val = XCAR (tail);
9812           CHECK_CONS (val);
9813           tmp1 = XCAR (val);
9814           CHECK_CHARSET_GET_ID (tmp1, id);
9815           CHECK_NATNUM_CDR (val);
9816           if (XINT (XCDR (val)) >= 4)
9817             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9818           XSETCAR (val, make_number (id));
9819         }
9820
9821       flags = args[coding_arg_iso2022_flags];
9822       CHECK_NATNUM (flags);
9823       i = XINT (flags) & INT_MAX;
9824       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9825         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9826       flags = make_number (i);
9827
9828       ASET (attrs, coding_attr_iso_initial, initial);
9829       ASET (attrs, coding_attr_iso_usage, reg_usage);
9830       ASET (attrs, coding_attr_iso_request, request);
9831       ASET (attrs, coding_attr_iso_flags, flags);
9832       setup_iso_safe_charsets (attrs);
9833
9834       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9835         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9836                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9837                     ? coding_category_iso_7_else
9838                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9839                     ? coding_category_iso_7
9840                     : coding_category_iso_7_tight);
9841       else
9842         {
9843           int id = XINT (AREF (initial, 1));
9844
9845           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9846                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9847                        || id < 0)
9848                       ? coding_category_iso_8_else
9849                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9850                       ? coding_category_iso_8_1
9851                       : coding_category_iso_8_2);
9852         }
9853       if (category != coding_category_iso_8_1
9854           && category != coding_category_iso_8_2)
9855         ASET (attrs, coding_attr_ascii_compat, Qnil);
9856     }
9857   else if (EQ (coding_type, Qemacs_mule))
9858     {
9859       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9860         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9861       ASET (attrs, coding_attr_ascii_compat, Qt);
9862       category = coding_category_emacs_mule;
9863     }
9864   else if (EQ (coding_type, Qshift_jis))
9865     {
9866
9867       struct charset *charset;
9868
9869       if (XINT (Flength (charset_list)) != 3
9870           && XINT (Flength (charset_list)) != 4)
9871         error ("There should be three or four charsets");
9872
9873       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9874       if (CHARSET_DIMENSION (charset) != 1)
9875         error ("Dimension of charset %s is not one",
9876                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9877       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9878         ASET (attrs, coding_attr_ascii_compat, Qt);
9879
9880       charset_list = XCDR (charset_list);
9881       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9882       if (CHARSET_DIMENSION (charset) != 1)
9883         error ("Dimension of charset %s is not one",
9884                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9885
9886       charset_list = XCDR (charset_list);
9887       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9888       if (CHARSET_DIMENSION (charset) != 2)
9889         error ("Dimension of charset %s is not two",
9890                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9891
9892       charset_list = XCDR (charset_list);
9893       if (! NILP (charset_list))
9894         {
9895           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9896           if (CHARSET_DIMENSION (charset) != 2)
9897             error ("Dimension of charset %s is not two",
9898                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9899         }
9900
9901       category = coding_category_sjis;
9902       Vsjis_coding_system = name;
9903     }
9904   else if (EQ (coding_type, Qbig5))
9905     {
9906       struct charset *charset;
9907
9908       if (XINT (Flength (charset_list)) != 2)
9909         error ("There should be just two charsets");
9910
9911       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9912       if (CHARSET_DIMENSION (charset) != 1)
9913         error ("Dimension of charset %s is not one",
9914                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9915       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9916         ASET (attrs, coding_attr_ascii_compat, Qt);
9917
9918       charset_list = XCDR (charset_list);
9919       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9920       if (CHARSET_DIMENSION (charset) != 2)
9921         error ("Dimension of charset %s is not two",
9922                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9923
9924       category = coding_category_big5;
9925       Vbig5_coding_system = name;
9926     }
9927   else if (EQ (coding_type, Qraw_text))
9928     {
9929       category = coding_category_raw_text;
9930       ASET (attrs, coding_attr_ascii_compat, Qt);
9931     }
9932   else if (EQ (coding_type, Qutf_8))
9933     {
9934       Lisp_Object bom;
9935
9936       if (nargs < coding_arg_utf8_max)
9937         goto short_args;
9938
9939       bom = args[coding_arg_utf8_bom];
9940       if (! NILP (bom) && ! EQ (bom, Qt))
9941         {
9942           CHECK_CONS (bom);
9943           val = XCAR (bom);
9944           CHECK_CODING_SYSTEM (val);
9945           val = XCDR (bom);
9946           CHECK_CODING_SYSTEM (val);
9947         }
9948       ASET (attrs, coding_attr_utf_bom, bom);
9949       if (NILP (bom))
9950         ASET (attrs, coding_attr_ascii_compat, Qt);
9951
9952       category = (CONSP (bom) ? coding_category_utf_8_auto
9953                   : NILP (bom) ? coding_category_utf_8_nosig
9954                   : coding_category_utf_8_sig);
9955     }
9956   else if (EQ (coding_type, Qundecided))
9957     category = coding_category_undecided;
9958   else
9959     error ("Invalid coding system type: %s",
9960            SDATA (SYMBOL_NAME (coding_type)));
9961
9962   ASET (attrs, coding_attr_category, make_number (category));
9963   ASET (attrs, coding_attr_plist,
9964         Fcons (QCcategory,
9965                Fcons (AREF (Vcoding_category_table, category),
9966                       CODING_ATTR_PLIST (attrs))));
9967   ASET (attrs, coding_attr_plist,
9968         Fcons (QCascii_compatible_p,
9969                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9970                       CODING_ATTR_PLIST (attrs))));
9971
9972   eol_type = args[coding_arg_eol_type];
9973   if (! NILP (eol_type)
9974       && ! EQ (eol_type, Qunix)
9975       && ! EQ (eol_type, Qdos)
9976       && ! EQ (eol_type, Qmac))
9977     error ("Invalid eol-type");
9978
9979   aliases = Fcons (name, Qnil);
9980
9981   if (NILP (eol_type))
9982     {
9983       eol_type = make_subsidiaries (name);
9984       for (i = 0; i < 3; i++)
9985         {
9986           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9987
9988           this_name = AREF (eol_type, i);
9989           this_aliases = Fcons (this_name, Qnil);
9990           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9991           this_spec = Fmake_vector (make_number (3), attrs);
9992           ASET (this_spec, 1, this_aliases);
9993           ASET (this_spec, 2, this_eol_type);
9994           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9995           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9996           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9997           if (NILP (val))
9998             Vcoding_system_alist
9999               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10000                        Vcoding_system_alist);
10001         }
10002     }
10003
10004   spec_vec = Fmake_vector (make_number (3), attrs);
10005   ASET (spec_vec, 1, aliases);
10006   ASET (spec_vec, 2, eol_type);
10007
10008   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10009   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10010   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10011   if (NILP (val))
10012     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10013                                   Vcoding_system_alist);
10014
10015   {
10016     int id = coding_categories[category].id;
10017
10018     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10019       setup_coding_system (name, &coding_categories[category]);
10020   }
10021
10022   return Qnil;
10023
10024  short_args:
10025   return Fsignal (Qwrong_number_of_arguments,
10026                   Fcons (intern ("define-coding-system-internal"),
10027                          make_number (nargs)));
10028 }
10029
10030
10031 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10032        3, 3, 0,
10033        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10034   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10035 {
10036   Lisp_Object spec, attrs;
10037
10038   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10039   attrs = AREF (spec, 0);
10040   if (EQ (prop, QCmnemonic))
10041     {
10042       if (! STRINGP (val))
10043         CHECK_CHARACTER (val);
10044       ASET (attrs, coding_attr_mnemonic, val);
10045     }
10046   else if (EQ (prop, QCdefault_char))
10047     {
10048       if (NILP (val))
10049         val = make_number (' ');
10050       else
10051         CHECK_CHARACTER (val);
10052       ASET (attrs, coding_attr_default_char, val);
10053     }
10054   else if (EQ (prop, QCdecode_translation_table))
10055     {
10056       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10057         CHECK_SYMBOL (val);
10058       ASET (attrs, coding_attr_decode_tbl, val);
10059     }
10060   else if (EQ (prop, QCencode_translation_table))
10061     {
10062       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10063         CHECK_SYMBOL (val);
10064       ASET (attrs, coding_attr_encode_tbl, val);
10065     }
10066   else if (EQ (prop, QCpost_read_conversion))
10067     {
10068       CHECK_SYMBOL (val);
10069       ASET (attrs, coding_attr_post_read, val);
10070     }
10071   else if (EQ (prop, QCpre_write_conversion))
10072     {
10073       CHECK_SYMBOL (val);
10074       ASET (attrs, coding_attr_pre_write, val);
10075     }
10076   else if (EQ (prop, QCascii_compatible_p))
10077     {
10078       ASET (attrs, coding_attr_ascii_compat, val);
10079     }
10080
10081   ASET (attrs, coding_attr_plist,
10082         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10083   return val;
10084 }
10085
10086
10087 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10088        Sdefine_coding_system_alias, 2, 2, 0,
10089        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10090   (Lisp_Object alias, Lisp_Object coding_system)
10091 {
10092   Lisp_Object spec, aliases, eol_type, val;
10093
10094   CHECK_SYMBOL (alias);
10095   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10096   aliases = AREF (spec, 1);
10097   /* ALIASES should be a list of length more than zero, and the first
10098      element is a base coding system.  Append ALIAS at the tail of the
10099      list.  */
10100   while (!NILP (XCDR (aliases)))
10101     aliases = XCDR (aliases);
10102   XSETCDR (aliases, Fcons (alias, Qnil));
10103
10104   eol_type = AREF (spec, 2);
10105   if (VECTORP (eol_type))
10106     {
10107       Lisp_Object subsidiaries;
10108       int i;
10109
10110       subsidiaries = make_subsidiaries (alias);
10111       for (i = 0; i < 3; i++)
10112         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10113                                      AREF (eol_type, i));
10114     }
10115
10116   Fputhash (alias, spec, Vcoding_system_hash_table);
10117   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10118   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10119   if (NILP (val))
10120     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10121                                   Vcoding_system_alist);
10122
10123   return Qnil;
10124 }
10125
10126 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10127        1, 1, 0,
10128        doc: /* Return the base of CODING-SYSTEM.
10129 Any alias or subsidiary coding system is not a base coding system.  */)
10130   (Lisp_Object coding_system)
10131 {
10132   Lisp_Object spec, attrs;
10133
10134   if (NILP (coding_system))
10135     return (Qno_conversion);
10136   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10137   attrs = AREF (spec, 0);
10138   return CODING_ATTR_BASE_NAME (attrs);
10139 }
10140
10141 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10142        1, 1, 0,
10143        doc: "Return the property list of CODING-SYSTEM.")
10144   (Lisp_Object coding_system)
10145 {
10146   Lisp_Object spec, attrs;
10147
10148   if (NILP (coding_system))
10149     coding_system = Qno_conversion;
10150   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10151   attrs = AREF (spec, 0);
10152   return CODING_ATTR_PLIST (attrs);
10153 }
10154
10155
10156 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10157        1, 1, 0,
10158        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10159   (Lisp_Object coding_system)
10160 {
10161   Lisp_Object spec;
10162
10163   if (NILP (coding_system))
10164     coding_system = Qno_conversion;
10165   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10166   return AREF (spec, 1);
10167 }
10168
10169 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10170        Scoding_system_eol_type, 1, 1, 0,
10171        doc: /* Return eol-type of CODING-SYSTEM.
10172 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10173
10174 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10175 and CR respectively.
10176
10177 A vector value indicates that a format of end-of-line should be
10178 detected automatically.  Nth element of the vector is the subsidiary
10179 coding system whose eol-type is N.  */)
10180   (Lisp_Object coding_system)
10181 {
10182   Lisp_Object spec, eol_type;
10183   int n;
10184
10185   if (NILP (coding_system))
10186     coding_system = Qno_conversion;
10187   if (! CODING_SYSTEM_P (coding_system))
10188     return Qnil;
10189   spec = CODING_SYSTEM_SPEC (coding_system);
10190   eol_type = AREF (spec, 2);
10191   if (VECTORP (eol_type))
10192     return Fcopy_sequence (eol_type);
10193   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10194   return make_number (n);
10195 }
10196
10197 #endif /* emacs */
10198
10199 \f
/*** 12. Postamble ***/
10201
10202 void
10203 init_coding_once (void)
10204 {
10205   int i;
10206
10207   for (i = 0; i < coding_category_max; i++)
10208     {
10209       coding_categories[i].id = -1;
10210       coding_priorities[i] = i;
10211     }
10212
10213   /* ISO2022 specific initialize routine.  */
10214   for (i = 0; i < 0x20; i++)
10215     iso_code_class[i] = ISO_control_0;
10216   for (i = 0x21; i < 0x7F; i++)
10217     iso_code_class[i] = ISO_graphic_plane_0;
10218   for (i = 0x80; i < 0xA0; i++)
10219     iso_code_class[i] = ISO_control_1;
10220   for (i = 0xA1; i < 0xFF; i++)
10221     iso_code_class[i] = ISO_graphic_plane_1;
10222   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10223   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10224   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10225   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10226   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10227   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10228   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10229   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10230   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10231
10232   for (i = 0; i < 256; i++)
10233     {
10234       emacs_mule_bytes[i] = 1;
10235     }
10236   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10237   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10238   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10239   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10240 }
10241
10242 #ifdef emacs
10243
10244 void
10245 syms_of_coding (void)
10246 {
10247   staticpro (&Vcoding_system_hash_table);
10248   {
10249     Lisp_Object args[2];
10250     args[0] = QCtest;
10251     args[1] = Qeq;
10252     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10253   }
10254
10255   staticpro (&Vsjis_coding_system);
10256   Vsjis_coding_system = Qnil;
10257
10258   staticpro (&Vbig5_coding_system);
10259   Vbig5_coding_system = Qnil;
10260
10261   staticpro (&Vcode_conversion_reused_workbuf);
10262   Vcode_conversion_reused_workbuf = Qnil;
10263
10264   staticpro (&Vcode_conversion_workbuf_name);
10265   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10266
10267   reused_workbuf_in_use = 0;
10268
10269   DEFSYM (Qcharset, "charset");
10270   DEFSYM (Qtarget_idx, "target-idx");
10271   DEFSYM (Qcoding_system_history, "coding-system-history");
10272   Fset (Qcoding_system_history, Qnil);
10273
10274   /* Target FILENAME is the first argument.  */
10275   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10276   /* Target FILENAME is the third argument.  */
10277   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10278
10279   DEFSYM (Qcall_process, "call-process");
10280   /* Target PROGRAM is the first argument.  */
10281   Fput (Qcall_process, Qtarget_idx, make_number (0));
10282
10283   DEFSYM (Qcall_process_region, "call-process-region");
10284   /* Target PROGRAM is the third argument.  */
10285   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10286
10287   DEFSYM (Qstart_process, "start-process");
10288   /* Target PROGRAM is the third argument.  */
10289   Fput (Qstart_process, Qtarget_idx, make_number (2));
10290
10291   DEFSYM (Qopen_network_stream, "open-network-stream");
10292   /* Target SERVICE is the fourth argument.  */
10293   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10294
10295   DEFSYM (Qcoding_system, "coding-system");
10296   DEFSYM (Qcoding_aliases, "coding-aliases");
10297
10298   DEFSYM (Qeol_type, "eol-type");
10299   DEFSYM (Qunix, "unix");
10300   DEFSYM (Qdos, "dos");
10301
10302   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10303   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10304   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10305   DEFSYM (Qdefault_char, "default-char");
10306   DEFSYM (Qundecided, "undecided");
10307   DEFSYM (Qno_conversion, "no-conversion");
10308   DEFSYM (Qraw_text, "raw-text");
10309
10310   DEFSYM (Qiso_2022, "iso-2022");
10311
10312   DEFSYM (Qutf_8, "utf-8");
10313   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10314
10315   DEFSYM (Qutf_16, "utf-16");
10316   DEFSYM (Qbig, "big");
10317   DEFSYM (Qlittle, "little");
10318
10319   DEFSYM (Qshift_jis, "shift-jis");
10320   DEFSYM (Qbig5, "big5");
10321
10322   DEFSYM (Qcoding_system_p, "coding-system-p");
10323
10324   DEFSYM (Qcoding_system_error, "coding-system-error");
10325   Fput (Qcoding_system_error, Qerror_conditions,
10326         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10327   Fput (Qcoding_system_error, Qerror_message,
10328         build_pure_c_string ("Invalid coding system"));
10329
10330   /* Intern this now in case it isn't already done.
10331      Setting this variable twice is harmless.
10332      But don't staticpro it here--that is done in alloc.c.  */
10333   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10334
10335   DEFSYM (Qtranslation_table, "translation-table");
10336   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10337   DEFSYM (Qtranslation_table_id, "translation-table-id");
10338   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10339   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10340
10341   DEFSYM (Qvalid_codes, "valid-codes");
10342
10343   DEFSYM (Qemacs_mule, "emacs-mule");
10344
10345   DEFSYM (QCcategory, ":category");
10346   DEFSYM (QCmnemonic, ":mnemonic");
10347   DEFSYM (QCdefault_char, ":default-char");
10348   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10349   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10350   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10351   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10352   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10353
10354   Vcoding_category_table
10355     = Fmake_vector (make_number (coding_category_max), Qnil);
10356   staticpro (&Vcoding_category_table);
10357   /* Followings are target of code detection.  */
10358   ASET (Vcoding_category_table, coding_category_iso_7,
10359         intern_c_string ("coding-category-iso-7"));
10360   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10361         intern_c_string ("coding-category-iso-7-tight"));
10362   ASET (Vcoding_category_table, coding_category_iso_8_1,
10363         intern_c_string ("coding-category-iso-8-1"));
10364   ASET (Vcoding_category_table, coding_category_iso_8_2,
10365         intern_c_string ("coding-category-iso-8-2"));
10366   ASET (Vcoding_category_table, coding_category_iso_7_else,
10367         intern_c_string ("coding-category-iso-7-else"));
10368   ASET (Vcoding_category_table, coding_category_iso_8_else,
10369         intern_c_string ("coding-category-iso-8-else"));
10370   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10371         intern_c_string ("coding-category-utf-8-auto"));
10372   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10373         intern_c_string ("coding-category-utf-8"));
10374   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10375         intern_c_string ("coding-category-utf-8-sig"));
10376   ASET (Vcoding_category_table, coding_category_utf_16_be,
10377         intern_c_string ("coding-category-utf-16-be"));
10378   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10379         intern_c_string ("coding-category-utf-16-auto"));
10380   ASET (Vcoding_category_table, coding_category_utf_16_le,
10381         intern_c_string ("coding-category-utf-16-le"));
10382   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10383         intern_c_string ("coding-category-utf-16-be-nosig"));
10384   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10385         intern_c_string ("coding-category-utf-16-le-nosig"));
10386   ASET (Vcoding_category_table, coding_category_charset,
10387         intern_c_string ("coding-category-charset"));
10388   ASET (Vcoding_category_table, coding_category_sjis,
10389         intern_c_string ("coding-category-sjis"));
10390   ASET (Vcoding_category_table, coding_category_big5,
10391         intern_c_string ("coding-category-big5"));
10392   ASET (Vcoding_category_table, coding_category_ccl,
10393         intern_c_string ("coding-category-ccl"));
10394   ASET (Vcoding_category_table, coding_category_emacs_mule,
10395         intern_c_string ("coding-category-emacs-mule"));
10396   /* Followings are NOT target of code detection.  */
10397   ASET (Vcoding_category_table, coding_category_raw_text,
10398         intern_c_string ("coding-category-raw-text"));
10399   ASET (Vcoding_category_table, coding_category_undecided,
10400         intern_c_string ("coding-category-undecided"));
10401
10402   DEFSYM (Qinsufficient_source, "insufficient-source");
10403   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10404   DEFSYM (Qinvalid_source, "invalid-source");
10405   DEFSYM (Qinterrupted, "interrupted");
10406   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10407   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10408
10409   defsubr (&Scoding_system_p);
10410   defsubr (&Sread_coding_system);
10411   defsubr (&Sread_non_nil_coding_system);
10412   defsubr (&Scheck_coding_system);
10413   defsubr (&Sdetect_coding_region);
10414   defsubr (&Sdetect_coding_string);
10415   defsubr (&Sfind_coding_systems_region_internal);
10416   defsubr (&Sunencodable_char_position);
10417   defsubr (&Scheck_coding_systems_region);
10418   defsubr (&Sdecode_coding_region);
10419   defsubr (&Sencode_coding_region);
10420   defsubr (&Sdecode_coding_string);
10421   defsubr (&Sencode_coding_string);
10422   defsubr (&Sdecode_sjis_char);
10423   defsubr (&Sencode_sjis_char);
10424   defsubr (&Sdecode_big5_char);
10425   defsubr (&Sencode_big5_char);
10426   defsubr (&Sset_terminal_coding_system_internal);
10427   defsubr (&Sset_safe_terminal_coding_system_internal);
10428   defsubr (&Sterminal_coding_system);
10429   defsubr (&Sset_keyboard_coding_system_internal);
10430   defsubr (&Skeyboard_coding_system);
10431   defsubr (&Sfind_operation_coding_system);
10432   defsubr (&Sset_coding_system_priority);
10433   defsubr (&Sdefine_coding_system_internal);
10434   defsubr (&Sdefine_coding_system_alias);
10435   defsubr (&Scoding_system_put);
10436   defsubr (&Scoding_system_base);
10437   defsubr (&Scoding_system_plist);
10438   defsubr (&Scoding_system_aliases);
10439   defsubr (&Scoding_system_eol_type);
10440   defsubr (&Scoding_system_priority_list);
10441
10442   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10443                doc: /* List of coding systems.
10444
10445 Do not alter the value of this variable manually.  This variable should be
10446 updated by the functions `define-coding-system' and
10447 `define-coding-system-alias'.  */);
10448   Vcoding_system_list = Qnil;
10449
10450   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10451                doc: /* Alist of coding system names.
10452 Each element is one element list of coding system name.
10453 This variable is given to `completing-read' as COLLECTION argument.
10454
10455 Do not alter the value of this variable manually.  This variable should be
10456 updated by the functions `make-coding-system' and
10457 `define-coding-system-alias'.  */);
10458   Vcoding_system_alist = Qnil;
10459
10460   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10461                doc: /* List of coding-categories (symbols) ordered by priority.
10462
10463 On detecting a coding system, Emacs tries code detection algorithms
10464 associated with each coding-category one by one in this order.  When
10465 one algorithm agrees with a byte sequence of source text, the coding
10466 system bound to the corresponding coding-category is selected.
10467
10468 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10469   {
10470     int i;
10471
10472     Vcoding_category_list = Qnil;
10473     for (i = coding_category_max - 1; i >= 0; i--)
10474       Vcoding_category_list
10475         = Fcons (AREF (Vcoding_category_table, i),
10476                  Vcoding_category_list);
10477   }
10478
10479   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10480                doc: /* Specify the coding system for read operations.
10481 It is useful to bind this variable with `let', but do not set it globally.
10482 If the value is a coding system, it is used for decoding on read operation.
10483 If not, an appropriate element is used from one of the coding system alists.
10484 There are three such tables: `file-coding-system-alist',
10485 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10486   Vcoding_system_for_read = Qnil;
10487
10488   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10489                doc: /* Specify the coding system for write operations.
10490 Programs bind this variable with `let', but you should not set it globally.
10491 If the value is a coding system, it is used for encoding of output,
10492 when writing it to a file and when sending it to a file or subprocess.
10493
10494 If this does not specify a coding system, an appropriate element
10495 is used from one of the coding system alists.
10496 There are three such tables: `file-coding-system-alist',
10497 `process-coding-system-alist', and `network-coding-system-alist'.
10498 For output to files, if the above procedure does not specify a coding system,
10499 the value of `buffer-file-coding-system' is used.  */);
10500   Vcoding_system_for_write = Qnil;
10501
10502   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10503                doc: /*
10504 Coding system used in the latest file or process I/O.  */);
10505   Vlast_coding_system_used = Qnil;
10506
10507   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10508                doc: /*
10509 Error status of the last code conversion.
10510
10511 When an error was detected in the last code conversion, this variable
10512 is set to one of the following symbols.
10513   `insufficient-source'
10514   `inconsistent-eol'
10515   `invalid-source'
10516   `interrupted'
10517   `insufficient-memory'
10518 When no error was detected, the value doesn't change.  So, to check
10519 the error status of a code conversion by this variable, you must
10520 explicitly set this variable to nil before performing code
10521 conversion.  */);
10522   Vlast_code_conversion_error = Qnil;
10523
10524   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10525                doc: /*
10526 *Non-nil means always inhibit code conversion of end-of-line format.
10527 See info node `Coding Systems' and info node `Text and Binary' concerning
10528 such conversion.  */);
10529   inhibit_eol_conversion = 0;
10530
10531   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10532                doc: /*
10533 Non-nil means process buffer inherits coding system of process output.
10534 Bind it to t if the process output is to be treated as if it were a file
10535 read from some filesystem.  */);
10536   inherit_process_coding_system = 0;
10537
10538   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10539                doc: /*
10540 Alist to decide a coding system to use for a file I/O operation.
10541 The format is ((PATTERN . VAL) ...),
10542 where PATTERN is a regular expression matching a file name,
10543 VAL is a coding system, a cons of coding systems, or a function symbol.
10544 If VAL is a coding system, it is used for both decoding and encoding
10545 the file contents.
10546 If VAL is a cons of coding systems, the car part is used for decoding,
10547 and the cdr part is used for encoding.
10548 If VAL is a function symbol, the function must return a coding system
10549 or a cons of coding systems which are used as above.  The function is
10550 called with an argument that is a list of the arguments with which
10551 `find-operation-coding-system' was called.  If the function can't decide
10552 a coding system, it can return `undecided' so that the normal
10553 code-detection is performed.
10554
10555 See also the function `find-operation-coding-system'
10556 and the variable `auto-coding-alist'.  */);
10557   Vfile_coding_system_alist = Qnil;
10558
10559   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10560                doc: /*
10561 Alist to decide a coding system to use for a process I/O operation.
10562 The format is ((PATTERN . VAL) ...),
10563 where PATTERN is a regular expression matching a program name,
10564 VAL is a coding system, a cons of coding systems, or a function symbol.
10565 If VAL is a coding system, it is used for both decoding what received
10566 from the program and encoding what sent to the program.
10567 If VAL is a cons of coding systems, the car part is used for decoding,
10568 and the cdr part is used for encoding.
10569 If VAL is a function symbol, the function must return a coding system
10570 or a cons of coding systems which are used as above.
10571
10572 See also the function `find-operation-coding-system'.  */);
10573   Vprocess_coding_system_alist = Qnil;
10574
10575   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10576                doc: /*
10577 Alist to decide a coding system to use for a network I/O operation.
10578 The format is ((PATTERN . VAL) ...),
10579 where PATTERN is a regular expression matching a network service name
10580 or is a port number to connect to,
10581 VAL is a coding system, a cons of coding systems, or a function symbol.
10582 If VAL is a coding system, it is used for both decoding what received
10583 from the network stream and encoding what sent to the network stream.
10584 If VAL is a cons of coding systems, the car part is used for decoding,
10585 and the cdr part is used for encoding.
10586 If VAL is a function symbol, the function must return a coding system
10587 or a cons of coding systems which are used as above.
10588
10589 See also the function `find-operation-coding-system'.  */);
10590   Vnetwork_coding_system_alist = Qnil;
10591
10592   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10593                doc: /* Coding system to use with system messages.
10594 Also used for decoding keyboard input on X Window system.  */);
10595   Vlocale_coding_system = Qnil;
10596
10597   /* The eol mnemonics are reset in startup.el system-dependently.  */
10598   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10599                doc: /*
10600 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10601   eol_mnemonic_unix = build_pure_c_string (":");
10602
10603   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10604                doc: /*
10605 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10606   eol_mnemonic_dos = build_pure_c_string ("\\");
10607
10608   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10609                doc: /*
10610 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10611   eol_mnemonic_mac = build_pure_c_string ("/");
10612
10613   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10614                doc: /*
10615 *String displayed in mode line when end-of-line format is not yet determined.  */);
10616   eol_mnemonic_undecided = build_pure_c_string (":");
10617
10618   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10619                doc: /*
10620 *Non-nil enables character translation while encoding and decoding.  */);
10621   Venable_character_translation = Qt;
10622
10623   DEFVAR_LISP ("standard-translation-table-for-decode",
10624                Vstandard_translation_table_for_decode,
10625                doc: /* Table for translating characters while decoding.  */);
10626   Vstandard_translation_table_for_decode = Qnil;
10627
10628   DEFVAR_LISP ("standard-translation-table-for-encode",
10629                Vstandard_translation_table_for_encode,
10630                doc: /* Table for translating characters while encoding.  */);
10631   Vstandard_translation_table_for_encode = Qnil;
10632
10633   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10634                doc: /* Alist of charsets vs revision numbers.
10635 While encoding, if a charset (car part of an element) is found,
10636 designate it with the escape sequence identifying revision (cdr part
10637 of the element).  */);
10638   Vcharset_revision_table = Qnil;
10639
10640   DEFVAR_LISP ("default-process-coding-system",
10641                Vdefault_process_coding_system,
10642                doc: /* Cons of coding systems used for process I/O by default.
10643 The car part is used for decoding a process output,
10644 the cdr part is used for encoding a text to be sent to a process.  */);
10645   Vdefault_process_coding_system = Qnil;
10646
10647   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10648                doc: /*
10649 Table of extra Latin codes in the range 128..159 (inclusive).
10650 This is a vector of length 256.
10651 If Nth element is non-nil, the existence of code N in a file
10652 \(or output of subprocess) doesn't prevent it to be detected as
10653 a coding system of ISO 2022 variant which has a flag
10654 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10655 or reading output of a subprocess.
10656 Only 128th through 159th elements have a meaning.  */);
10657   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10658
10659   DEFVAR_LISP ("select-safe-coding-system-function",
10660                Vselect_safe_coding_system_function,
10661                doc: /*
10662 Function to call to select safe coding system for encoding a text.
10663
10664 If set, this function is called to force a user to select a proper
10665 coding system which can encode the text in the case that a default
10666 coding system used in each operation can't encode the text.  The
10667 function should take care that the buffer is not modified while
10668 the coding system is being selected.
10669
10670 The default value is `select-safe-coding-system' (which see).  */);
10671   Vselect_safe_coding_system_function = Qnil;
10672
10673   DEFVAR_BOOL ("coding-system-require-warning",
10674                coding_system_require_warning,
10675                doc: /* Internal use only.
10676 If non-nil, on writing a file, `select-safe-coding-system-function' is
10677 called even if `coding-system-for-write' is non-nil.  The command
10678 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10679   coding_system_require_warning = 0;
10680
10681
10682   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10683                inhibit_iso_escape_detection,
10684                doc: /*
10685 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10686
10687 When Emacs reads text, it tries to detect how the text is encoded.
10688 This code detection is sensitive to escape sequences.  If Emacs sees
10689 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10690 of the ISO2022 encodings, and decodes text by the corresponding coding
10691 system (e.g. `iso-2022-7bit').
10692
10693 However, there may be a case that you want to read escape sequences in
10694 a file as is.  In such a case, you can set this variable to non-nil.
10695 Then the code detection will ignore any escape sequences, and no text is
10696 detected as encoded in some ISO-2022 encoding.  The result is that all
10697 escape sequences become visible in a buffer.
10698
10699 The default value is nil, and it is strongly recommended not to change
10700 it.  That is because many Emacs Lisp source files that contain
10701 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10702 in Emacs's distribution, and they won't be decoded correctly on
10703 reading if you suppress escape sequence detection.
10704
10705 The other way to read escape sequences in a file without decoding is
10706 to explicitly specify some coding system that doesn't use ISO-2022
10707 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10708   inhibit_iso_escape_detection = 0;
10709
10710   DEFVAR_BOOL ("inhibit-null-byte-detection",
10711                inhibit_null_byte_detection,
10712                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10713 By default, Emacs treats it as binary data, and does not attempt to
10714 decode it.  The effect is as if you specified `no-conversion' for
10715 reading that text.
10716
10717 Set this to non-nil when a regular text happens to include null bytes.
10718 Examples are Index nodes of Info files and null-byte delimited output
10719 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10720 decode text as usual.  */);
10721   inhibit_null_byte_detection = 0;
10722
10723   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10724                doc: /* Char table for translating self-inserting characters.
10725 This is applied to the result of input methods, not their input.
10726 See also `keyboard-translate-table'.
10727
10728 Use of this variable for character code unification was rendered
10729 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10730 internal character representation.  */);
10731     Vtranslation_table_for_input = Qnil;
10732
10733   {
10734     Lisp_Object args[coding_arg_max];
10735     Lisp_Object plist[16];
10736     int i;
10737
10738     for (i = 0; i < coding_arg_max; i++)
10739       args[i] = Qnil;
10740
10741     plist[0] = intern_c_string (":name");
10742     plist[1] = args[coding_arg_name] = Qno_conversion;
10743     plist[2] = intern_c_string (":mnemonic");
10744     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10745     plist[4] = intern_c_string (":coding-type");
10746     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10747     plist[6] = intern_c_string (":ascii-compatible-p");
10748     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10749     plist[8] = intern_c_string (":default-char");
10750     plist[9] = args[coding_arg_default_char] = make_number (0);
10751     plist[10] = intern_c_string (":for-unibyte");
10752     plist[11] = args[coding_arg_for_unibyte] = Qt;
10753     plist[12] = intern_c_string (":docstring");
10754     plist[13] = build_pure_c_string ("Do no conversion.\n\
10755 \n\
10756 When you visit a file with this coding, the file is read into a\n\
10757 unibyte buffer as is, thus each byte of a file is treated as a\n\
10758 character.");
10759     plist[14] = intern_c_string (":eol-type");
10760     plist[15] = args[coding_arg_eol_type] = Qunix;
10761     args[coding_arg_plist] = Flist (16, plist);
10762     Fdefine_coding_system_internal (coding_arg_max, args);
10763
10764     plist[1] = args[coding_arg_name] = Qundecided;
10765     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10766     plist[5] = args[coding_arg_coding_type] = Qundecided;
10767     /* This is already set.
10768        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10769     plist[8] = intern_c_string (":charset-list");
10770     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10771     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10772     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10773     plist[15] = args[coding_arg_eol_type] = Qnil;
10774     args[coding_arg_plist] = Flist (16, plist);
10775     Fdefine_coding_system_internal (coding_arg_max, args);
10776   }
10777
10778   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10779
10780   {
10781     int i;
10782
10783     for (i = 0; i < coding_category_max; i++)
10784       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10785   }
10786 #if defined (DOS_NT)
10787   system_eol_type = Qdos;
10788 #else
10789   system_eol_type = Qunix;
10790 #endif
10791   staticpro (&system_eol_type);
10792 }
10793
10794 char *
10795 emacs_strerror (int error_number)
10796 {
10797   char *str;
10798
10799   synchronize_system_messages_locale ();
10800   str = strerror (error_number);
10801
10802   if (! NILP (Vlocale_coding_system))
10803     {
10804       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10805                                                       Vlocale_coding_system,
10806                                                       0);
10807       str = SSDATA (dec);
10808     }
10809
10810   return str;
10811 }
10812
10813 #endif /* emacs */