src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from a coding system symbol to its attributes vector is done by
  62   looking up Vcoding_system_hash_table with the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How a text end-of-line is encoded depends on the operating system.
 123   For instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is a two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165   /* FOUND stays 0 until a byte is seen that strongly suggests XXX.  */
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there are not enough source bytes, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode C and store the result at CHARBUF.  */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and destination area
 255   overlap, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "character.h"
 292 #include "buffer.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365 /* XINT of element REG of CODING's coding_attr_iso_initial attribute.  */
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370 /* The safe_charsets entry for CHARSET_ID, or -1 if that entry is 255
 371    or CHARSET_ID is greater than CODING's max_charset_id.  */
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379 /* Accessors for the ISO2022 state kept in CODING->spec.iso_2022.  */
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overwrite
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories' */
 519
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* This value is returned if detect_coding_mask () finds nothing other
 570    than ASCII characters.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637 /* min and max evaluate one argument twice; avoid side effects.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644 /* Set ATTRS and CHARSET_LIST from the attributes of CODING's id.  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Get one byte from the source text pointed by SRC which ends at
 653    SRC_END, set C to it, and increment CONSUMED_CHARS.  At SRC_END, jump
 654    to 'no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC
 655    if SRC_BASE < SRC.  If MULTIBYTEP: a byte 0xC0 or 0xC1 is combined
 656    with the next byte; any other byte >= 0x80 makes C the negated
 657    string_char result and records CODING_RESULT_INVALID_SRC.  Needs in
 658    scope: coding, src, src_base, src_end, multibytep, consumed_chars */
 659
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to 'no_more_source'.  If MULTIBYTEP and a multibyte
 689    character other than a 0xC0/0xC1 two-byte form is found for C2, set
 690    C2 to -1 (its trailing bytes are not skipped).  The caller should
 691    declare and set these variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed by DST and increment DST to the
 726    next free point, and increment PRODUCED_CHARS.  The caller should
 727    ensure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE, but store C1 then C2 (PRODUCED_CHARS += 2).  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP, pass
 750    C through BYTE8_TO_CHAR when it is >= 0x80 and store the result with
 751    CHAR_STRING_ADVANCE.  The caller should declare and set the variables
 752    `dst', `multibytep' and `produced_chars' appropriately in advance.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
 809 static void
 810 record_conversion_result (struct coding_system *coding,
 811                           enum coding_result_code result)
 812 {
 813   coding->result = result;
 814   switch (result)
 815     {
 816     case CODING_RESULT_INSUFFICIENT_SRC:
 817       Vlast_code_conversion_error = Qinsufficient_source;
 818       break;
 819     case CODING_RESULT_INCONSISTENT_EOL:
 820       Vlast_code_conversion_error = Qinconsistent_eol;
 821       break;
 822     case CODING_RESULT_INVALID_SRC:
 823       Vlast_code_conversion_error = Qinvalid_source;
 824       break;
 825     case CODING_RESULT_INTERRUPT:
 826       Vlast_code_conversion_error = Qinterrupted;
 827       break;
 828     case CODING_RESULT_INSUFFICIENT_MEM:
 829       Vlast_code_conversion_error = Qinsufficient_memory;
 830       break;
 831     case CODING_RESULT_INSUFFICIENT_DST:
 832       /* Don't record this error in Vlast_code_conversion_error
 833          because it happens just temporarily and is resolved when the
 834          whole conversion is finished.  */
 835       break;
 836     case CODING_RESULT_SUCCESS:
 837       break;
 838     default:
 839       Vlast_code_conversion_error = intern ("Unknown error");
 840     }
 841 }
 842
 843 /* These wrapper macros are used to preserve validity of pointers into
 844    buffer text across calls to decode_char, encode_char, etc, which
 845    could cause relocation of buffers if it loads a charset map,
 846    because loading a charset map allocates large structures.  */
 847
 848 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 849   do {                                                                       \
 850     ptrdiff_t offset;                                                        \
 851                                                                              \
 852     charset_map_loaded = 0;                                                  \
 853     c = DECODE_CHAR (charset, code);                                         \
 854     if (charset_map_loaded                                                   \
 855         && (offset = coding_change_source (coding)))                         \
 856       {                                                                      \
 857         src += offset;                                                       \
 858         src_base += offset;                                                  \
 859         src_end += offset;                                                   \
 860       }                                                                      \
 861   } while (0)
 862
 863 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 864   do {                                                                  \
 865     ptrdiff_t offset;                                                   \
 866                                                                         \
 867     charset_map_loaded = 0;                                             \
 868     code = ENCODE_CHAR (charset, c);                                    \
 869     if (charset_map_loaded                                              \
 870         && (offset = coding_change_destination (coding)))               \
 871       {                                                                 \
 872         dst += offset;                                                  \
 873         dst_end += offset;                                              \
 874       }                                                                 \
 875   } while (0)
 876
 877 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 878   do {                                                                  \
 879     ptrdiff_t offset;                                                   \
 880                                                                         \
 881     charset_map_loaded = 0;                                             \
 882     charset = char_charset (c, charset_list, code_return);              \
 883     if (charset_map_loaded                                              \
 884         && (offset = coding_change_destination (coding)))               \
 885       {                                                                 \
 886         dst += offset;                                                  \
 887         dst_end += offset;                                              \
 888       }                                                                 \
 889   } while (0)
 890
 891 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 892   do {                                                                  \
 893     ptrdiff_t offset;                                                   \
 894                                                                         \
 895     charset_map_loaded = 0;                                             \
 896     result = CHAR_CHARSET_P (c, charset);                               \
 897     if (charset_map_loaded                                              \
 898         && (offset = coding_change_destination (coding)))               \
 899       {                                                                 \
 900         dst += offset;                                                  \
 901         dst_end += offset;                                              \
 902       }                                                                 \
 903   } while (0)
 904
 905
 906 /* If there are at least BYTES length of room at dst, allocate memory
 907    for coding->destination and update dst and dst_end.  We don't have
 908    to take care of coding->source which will be relocated.  It is
 909    handled by calling coding_set_source in encode_coding.  */
 910
 911 #define ASSURE_DESTINATION(bytes)                               \
 912   do {                                                          \
 913     if (dst + (bytes) >= dst_end)                               \
 914       {                                                         \
 915         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 916                                                                 \
 917         dst = alloc_destination (coding, more_bytes, dst);      \
 918         dst_end = coding->destination + coding->dst_bytes;      \
 919       }                                                         \
 920   } while (0)
 921
 922
 923 /* Store multibyte form of the character C in P, and advance P to the
 924    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 925    never calls MAYBE_UNIFY_CHAR.  */
 926
 927 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 928   do {                                          \
 929     if ((c) <= MAX_1_BYTE_CHAR)                 \
 930       *(p)++ = (c);                             \
 931     else if ((c) <= MAX_2_BYTE_CHAR)            \
 932       *(p)++ = (0xC0 | ((c) >> 6)),             \
 933         *(p)++ = (0x80 | ((c) & 0x3F));         \
 934     else if ((c) <= MAX_3_BYTE_CHAR)            \
 935       *(p)++ = (0xE0 | ((c) >> 12)),            \
 936         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 937         *(p)++ = (0x80 | ((c) & 0x3F));         \
 938     else if ((c) <= MAX_4_BYTE_CHAR)            \
 939       *(p)++ = (0xF0 | (c >> 18)),              \
 940         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 941         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 942         *(p)++ = (0x80 | (c & 0x3F));           \
 943     else if ((c) <= MAX_5_BYTE_CHAR)            \
 944       *(p)++ = 0xF8,                            \
 945         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 946         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 947         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 948         *(p)++ = (0x80 | (c & 0x3F));           \
 949     else                                        \
 950       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 951   } while (0)
 952
 953
 954 /* Return the character code of character whose multibyte form is at
 955    P, and advance P to the end of the multibyte form.  This is like
 956    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 957
 958 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 959   (!((p)[0] & 0x80)                                             \
 960    ? *(p)++                                                     \
 961    : ! ((p)[0] & 0x20)                                          \
 962    ? ((p) += 2,                                                 \
 963       ((((p)[-2] & 0x1F) << 6)                                  \
 964        | ((p)[-1] & 0x3F)                                       \
 965        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
 966    : ! ((p)[0] & 0x10)                                          \
 967    ? ((p) += 3,                                                 \
 968       ((((p)[-3] & 0x0F) << 12)                                 \
 969        | (((p)[-2] & 0x3F) << 6)                                \
 970        | ((p)[-1] & 0x3F)))                                     \
 971    : ! ((p)[0] & 0x08)                                          \
 972    ? ((p) += 4,                                                 \
 973       ((((p)[-4] & 0xF) << 18)                                  \
 974        | (((p)[-3] & 0x3F) << 12)                               \
 975        | (((p)[-2] & 0x3F) << 6)                                \
 976        | ((p)[-1] & 0x3F)))                                     \
 977    : ((p) += 5,                                                 \
 978       ((((p)[-4] & 0x3F) << 18)                                 \
 979        | (((p)[-3] & 0x3F) << 12)                               \
 980        | (((p)[-2] & 0x3F) << 6)                                \
 981        | ((p)[-1] & 0x3F))))
 982
 983
 984 /* Set coding->source from coding->src_object.  */
 985
 986 static void
 987 coding_set_source (struct coding_system *coding)
 988 {
 989   if (BUFFERP (coding->src_object))
 990     {
 991       struct buffer *buf = XBUFFER (coding->src_object);
 992
 993       if (coding->src_pos < 0)
 994         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 995       else
 996         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 997     }
 998   else if (STRINGP (coding->src_object))
 999     {
1000       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1001     }
1002   else
1003     {
1004       /* Otherwise, the source is C string and is never relocated
1005          automatically.  Thus we don't have to update anything.  */
1006     }
1007 }
1008
1009
1010 /* Set coding->source from coding->src_object, and return how many
1011    bytes coding->source was changed.  */
1012
1013 static ptrdiff_t
1014 coding_change_source (struct coding_system *coding)
1015 {
1016   const unsigned char *orig = coding->source;
1017   coding_set_source (coding);
1018   return coding->source - orig;
1019 }
1020
1021
1022 /* Set coding->destination from coding->dst_object.  */
1023
1024 static void
1025 coding_set_destination (struct coding_system *coding)
1026 {
1027   if (BUFFERP (coding->dst_object))
1028     {
1029       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1030         {
1031           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1032           coding->dst_bytes = (GAP_END_ADDR
1033                                - (coding->src_bytes - coding->consumed)
1034                                - coding->destination);
1035         }
1036       else
1037         {
1038           /* We are sure that coding->dst_pos_byte is before the gap
1039              of the buffer. */
1040           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1041                                  + coding->dst_pos_byte - BEG_BYTE);
1042           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1043                                - coding->destination);
1044         }
1045     }
1046   else
1047     {
1048       /* Otherwise, the destination is C string and is never relocated
1049          automatically.  Thus we don't have to update anything.  */
1050     }
1051 }
1052
1053
1054 /* Set coding->destination from coding->dst_object, and return how
1055    many bytes coding->destination was changed.  */
1056
1057 static ptrdiff_t
1058 coding_change_destination (struct coding_system *coding)
1059 {
1060   const unsigned char *orig = coding->destination;
1061   coding_set_destination (coding);
1062   return coding->destination - orig;
1063 }
1064
1065
1066 static void
1067 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1068 {
1069   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1070     string_overflow ();
1071   coding->destination = xrealloc (coding->destination,
1072                                   coding->dst_bytes + bytes);
1073   coding->dst_bytes += bytes;
1074 }
1075
1076 static void
1077 coding_alloc_by_making_gap (struct coding_system *coding,
1078                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1079 {
1080   if (EQ (coding->src_object, coding->dst_object))
1081     {
1082       /* The gap may contain the produced data at the head and not-yet
1083          consumed data at the tail.  To preserve those data, we at
1084          first make the gap size to zero, then increase the gap
1085          size.  */
1086       ptrdiff_t add = GAP_SIZE;
1087
1088       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1089       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1090       make_gap (bytes);
1091       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1092       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1093     }
1094   else
1095     {
1096       Lisp_Object this_buffer;
1097
1098       this_buffer = Fcurrent_buffer ();
1099       set_buffer_internal (XBUFFER (coding->dst_object));
1100       make_gap (bytes);
1101       set_buffer_internal (XBUFFER (this_buffer));
1102     }
1103 }
1104
1105
1106 static unsigned char *
1107 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1108                    unsigned char *dst)
1109 {
1110   ptrdiff_t offset = dst - coding->destination;
1111
1112   if (BUFFERP (coding->dst_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->dst_object);
1115
1116       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1117     }
1118   else
1119     coding_alloc_by_realloc (coding, nbytes);
1120   coding_set_destination (coding);
1121   dst = coding->destination + offset;
1122   return dst;
1123 }
1124
1125 /** Macros for annotations.  */
1126
1127 /* An annotation data is stored in the array coding->charbuf in this
1128    format:
1129      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1130    LENGTH is the number of elements in the annotation.
1131    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1132    NCHARS is the number of characters in the text annotated.
1133
1134    The format of the following elements depend on ANNOTATION_MASK.
1135
1136    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1137    follows:
1138      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1139
1140    NBYTES is the number of bytes specified in the header part of
1141    old-style emacs-mule encoding, or 0 for the other kind of
1142    composition.
1143
1144    METHOD is one of enum composition_method.
1145
1146    Optional COMPOSITION-COMPONENTS are characters and composition
1147    rules.
1148
1149    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1150    follows.
1151
1152    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1153    recover from an invalid annotation, and should be skipped by
1154    produce_annotation.  */
1155
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three common header elements of an annotation to BUF
   (the negated length LEN, then MASK, then NCHARS), advancing BUF
   past them, and mark the coding system as annotated.  The caller
   must have a variable `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an unbraced `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1166
/* Append a composition annotation header to BUF and advance BUF past
   it: the common header written by ADD_ANNOTATION_DATA (with length 5
   and CODING_ANNOTATE_COMPOSITION_MASK), followed by NBYTES and
   METHOD.  Every argument is parenthesized, so BUF may be any
   modifiable lvalue expression (such as `*pp').  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1173
1174
/* Append a charset annotation to BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  Every
   argument is parenthesized, so BUF may be any modifiable lvalue
   expression (such as `*pp').  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1180
1181 \f
1182 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1183
1184
1185
1186 \f
1187 /*** 3. UTF-8 ***/
1188
1189 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1190    Return true if a text is encoded in UTF-8.  */
1191
/* Classify the octet C by its high-order bits.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)            /* below 0x80 */
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))  /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))  /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))  /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))  /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))  /* 111110xx */

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1202
/* Examine the source text of CODING and record in DETECT_INFO whether
   it can be UTF-8.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  When
   a malformed multi-octet sequence is seen, or when the last block
   ends in the middle of a sequence, CATEGORY_MASK_UTF_8 is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is returned
   after DETECT_INFO->found and DETECT_INFO->rejected have been
   updated for the signature (BOM) and non-signature categories.

   NOTE(review): ONE_MORE_BYTE is defined outside this section.  It
   is assumed to fetch the next source byte into its argument and to
   jump to the label `no_more_source' when the source is exhausted;
   confirm against its definition.  */
static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible, so
     the first coding->head_ascii bytes are skipped.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* SRC_BASE marks the start of the sequence being examined.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values and single-octet characters need no further
         checking.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every further octet must be a trailing (10xxxxxx) octet;
         leaving the loop with `break' rejects UTF-8.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a signature at the very beginning of the source
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet.  */
      break;
    }
  /* Reached only through `break': the text is not UTF-8.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ran out.  If that happened in the middle of a sequence
     and this is the last block, the sequence can never be completed,
     so reject UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean a
         BOM, so both the signature and the non-signature categories
         are reported as found.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      /* Report the non-signature category only if at least one
         multi-octet sequence was actually seen.  */
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1285
1286
/* Decode UTF-8 source bytes of CODING, starting at offset
   coding->consumed, into character codes appended to coding->charbuf.

   Decoding stops when the source is exhausted or coding->charbuf is
   full.  On return coding->consumed, coding->consumed_char and
   coding->charbuf_used describe how far decoding got; a sequence that
   was only partly available is not counted as consumed.  A byte that
   does not start a valid sequence is stored by itself (as an ASCII or
   eight-bit character) and counted in coding->errors.

   NOTE(review): ONE_MORE_BYTE is defined outside this section.  It
   is assumed to fetch the next source byte into its argument, to
   update consumed_chars, and to jump to the label `no_more_source'
   when the source is exhausted; confirm against its definition.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* A byte read ahead after a CR, or -1 if there is none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip a leading signature EF BB BF if present; anything else
         is left in place by resetting SRC to SRC_BASE.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* The signature is looked for only once per coding system.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this character starts, so that an incomplete
         or invalid sequence can be rolled back.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The buffer is full.  Give back the byte that was read
             ahead after a CR, if any.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* NOTE(review): a negative value is produced by
             ONE_MORE_BYTE, which is defined outside this section; its
             magnitude is stored as the character.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line conversion, read the byte following
             a CR now and keep it for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and store just its
         first byte, as an ASCII or an eight-bit character; the rest
         is decoded again from the next byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress only up to the start of the last character that
     was begun, so an incomplete sequence is not counted.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1436
1437
1438 static bool
1439 encode_coding_utf_8 (struct coding_system *coding)
1440 {
1441   bool multibytep = coding->dst_multibyte;
1442   int *charbuf = coding->charbuf;
1443   int *charbuf_end = charbuf + coding->charbuf_used;
1444   unsigned char *dst = coding->destination + coding->produced;
1445   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1446   ptrdiff_t produced_chars = 0;
1447   int c;
1448
1449   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1450     {
1451       ASSURE_DESTINATION (3);
1452       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1453       CODING_UTF_8_BOM (coding) = utf_without_bom;
1454     }
1455
1456   if (multibytep)
1457     {
1458       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1459
1460       while (charbuf < charbuf_end)
1461         {
1462           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1463
1464           ASSURE_DESTINATION (safe_room);
1465           c = *charbuf++;
1466           if (CHAR_BYTE8_P (c))
1467             {
1468               c = CHAR_TO_BYTE8 (c);
1469               EMIT_ONE_BYTE (c);
1470             }
1471           else
1472             {
1473               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1474               for (p = str; p < pend; p++)
1475                 EMIT_ONE_BYTE (*p);
1476             }
1477         }
1478     }
1479   else
1480     {
1481       int safe_room = MAX_MULTIBYTE_LENGTH;
1482
1483       while (charbuf < charbuf_end)
1484         {
1485           ASSURE_DESTINATION (safe_room);
1486           c = *charbuf++;
1487           if (CHAR_BYTE8_P (c))
1488             *dst++ = CHAR_TO_BYTE8 (c);
1489           else
1490             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1491           produced_chars++;
1492         }
1493     }
1494   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1495   coding->produced_char += produced_chars;
1496   coding->produced = dst - coding->destination;
1497   return 0;
1498 }
1499
1500
1501 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1502    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1503
/* True if the 16-bit unit VAL lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* True if the 16-bit unit VAL lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
1509
1510
1511 static bool
1512 detect_coding_utf_16 (struct coding_system *coding,
1513                       struct coding_detection_info *detect_info)
1514 {
       /* SRC, SRC_END and MULTIBYTEP are not referenced by name below;
          presumably TWO_MORE_BYTES (defined outside this part of the
          file) reads and advances them -- confirm against the macro.  */
1515   const unsigned char *src = coding->source;
1516   const unsigned char *src_end = coding->source + coding->src_bytes;
1517   bool multibytep = coding->src_multibyte;
1518   int c1, c2;
1519
1520   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* In the last block, an odd coding->src_chars rejects every
          UTF-16 category at once.  */
1521   if (coding->mode & CODING_MODE_LAST_BLOCK
1522       && (coding->src_chars & 1))
1523     {
1524       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1525       return 0;
1526     }
1527
       /* Look at the first two bytes for a byte order mark.  */
1528   TWO_MORE_BYTES (c1, c2);
1529   if ((c1 == 0xFF) && (c2 == 0xFE))
1530     {
           /* FF FE: little-endian BOM.  */
1531       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1532                              | CATEGORY_MASK_UTF_16_AUTO);
1533       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1534                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1535                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1536     }
1537   else if ((c1 == 0xFE) && (c2 == 0xFF))
1538     {
           /* FE FF: big-endian BOM.  */
1539       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1540                              | CATEGORY_MASK_UTF_16_AUTO);
1541       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1542                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1543                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1544     }
1545   else if (c2 < 0)
1546     {
           /* NOTE(review): a negative C2 is presumably how
              TWO_MORE_BYTES reports something that is not a plain
              byte; the macro is not visible here -- confirm.  */
1547       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1548       return 0;
1549     }
1550   else
1551     {
1552       /* We check the dispersion of Eth and Oth bytes where E is even and
1553          O is odd.  If both are high, we assume binary data.*/
           /* E[B] / O[B] record whether byte value B has been seen at an
              even / odd position; E_NUM / O_NUM count the distinct
              values seen so far (the first pair is already counted).  */
1554       unsigned char e[256], o[256];
1555       unsigned e_num = 1, o_num = 1;
1556
1557       memset (e, 0, 256);
1558       memset (o, 0, 256);
1559       e[c1] = 1;
1560       o[c2] = 1;
1561
           /* No BOM was found, so only the *_NOSIG categories remain
              candidates.  */
1562       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1563                                 |CATEGORY_MASK_UTF_16_BE
1564                                 | CATEGORY_MASK_UTF_16_LE);
1565
           /* Scan until every UTF-16 category is rejected or a negative
              C2 is seen.  */
1566       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1567              != CATEGORY_MASK_UTF_16)
1568         {
1569           TWO_MORE_BYTES (c1, c2);
1570           if (c2 < 0)
1571             break;
1572           if (! e[c1])
1573             {
1574               e[c1] = 1;
1575               e_num++;
                   /* 128 or more distinct even-position bytes rejects
                      BOM-less big-endian.  */
1576               if (e_num >= 128)
1577                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1578             }
1579           if (! o[c2])
1580             {
1581               o[c2] = 1;
1582               o_num++;
                   /* 128 or more distinct odd-position bytes rejects
                      BOM-less little-endian.  */
1583               if (o_num >= 128)
1584                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1585             }
1586         }
1587       return 0;
1588     }
1589
       /* Reached by falling through after a BOM was recognized, and
          presumably by TWO_MORE_BYTES when fewer than two bytes remain
          (the macro is not visible here -- confirm).  */
1590  no_more_source:
1591   return 1;
1592 }
1593
     /* Decode the UTF-16 source of CODING, starting at
        coding->source + coding->consumed, and append the decoded
        characters to coding->charbuf starting at index
        coding->charbuf_used.  On exit coding->consumed,
        coding->consumed_char and coding->charbuf_used are updated to the
        last point at which a whole loop iteration had completed.

        A high surrogate waiting for its low surrogate is kept in
        CODING_UTF_16_SURROGATE (coding), so a pair may straddle two
        calls.  */
1594 static void
1595 decode_coding_utf_16 (struct coding_system *coding)
1596 {
1597   const unsigned char *src = coding->source + coding->consumed;
1598   const unsigned char *src_end = coding->source + coding->src_bytes;
1599   const unsigned char *src_base;
1600   int *charbuf = coding->charbuf + coding->charbuf_used;
1601   /* We may produce at most 3 chars in one loop.  */
1602   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1603   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1604   bool multibytep = coding->src_multibyte;
1605   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1606   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1607   int surrogate = CODING_UTF_16_SURROGATE (coding);
1608   bool eol_dos
1609     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When a CR is decoded under EOL_DOS, the two bytes that follow it
          are read ahead into these; -1 means nothing is held.  */
1610   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1611
1612   if (bom == utf_with_bom)
1613     {
1614       int c, c1, c2;
1615
1616       src_base = src;
1617       ONE_MORE_BYTE (c1);
1618       ONE_MORE_BYTE (c2);
           /* The BOM is compared in source byte order: FE FF is expected
              for big endian, FF FE otherwise.  */
1619       c = (c1 << 8) | c2;
1620
1621       if (endian == utf_16_big_endian
1622           ? c != 0xFEFF : c != 0xFFFE)
1623         {
1624           /* The first two bytes are not BOM.  Treat them as bytes
1625              for a normal character.  */
1626           src = src_base;
1627           coding->errors++;
1628         }
           /* Do not look for a BOM again on later calls.  */
1629       CODING_UTF_16_BOM (coding) = utf_without_bom;
1630     }
1631   else if (bom == utf_detect_bom)
1632     {
1633       /* We have already tried to detect BOM and failed in
1634          detect_coding.  */
1635       CODING_UTF_16_BOM (coding) = utf_without_bom;
1636     }
1637
1638   while (1)
1639     {
1640       int c, c1, c2;
1641
           /* Remember where this iteration started; these are the values
              committed at no_more_source.  */
1642       src_base = src;
1643       consumed_chars_base = consumed_chars;
1644
1645       if (charbuf >= charbuf_end)
1646         {
               /* The two bytes read ahead after a CR were not used, so
                  report them as unconsumed.  */
1647           if (byte_after_cr1 >= 0)
1648             src_base -= 2;
1649           break;
1650         }
1651
1652       if (byte_after_cr1 >= 0)
1653         c1 = byte_after_cr1, byte_after_cr1 = -1;
1654       else
1655         ONE_MORE_BYTE (c1);
           /* NOTE(review): a negative value is presumably ONE_MORE_BYTE's
              way of returning a negated non-byte character from a
              multibyte source (macro not visible here -- confirm); it is
              stored with the sign restored.  */
1656       if (c1 < 0)
1657         {
1658           *charbuf++ = -c1;
1659           continue;
1660         }
1661       if (byte_after_cr2 >= 0)
1662         c2 = byte_after_cr2, byte_after_cr2 = -1;
1663       else
1664         ONE_MORE_BYTE (c2);
1665       if (c2 < 0)
1666         {
               /* C1 has no partner byte: emit it alone (as a raw 8-bit
                  byte unless it is ASCII), then the character in C2.  */
1667           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1668           *charbuf++ = -c2;
1669           continue;
1670         }
1671       c = (endian == utf_16_big_endian
1672            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1673
1674       if (surrogate)
1675         {
               /* A high surrogate is pending from an earlier unit.  */
1676           if (! UTF_16_LOW_SURROGATE_P (c))
1677             {
                   /* Unpaired high surrogate: emit its two bytes in
                      source order and count an error.  */
1678               if (endian == utf_16_big_endian)
1679                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1680               else
1681                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1682               *charbuf++ = c1;
1683               *charbuf++ = c2;
1684               coding->errors++;
                   /* The pending surrogate has now been flushed, so it
                      must not stay pending.  If C is itself a high
                      surrogate it becomes the new pending one; otherwise
                      clear the state and emit C.  Leaving the old value
                      in place would re-emit its bytes before every
                      following unit and pair it with a later stray low
                      surrogate.  */
1685               CODING_UTF_16_SURROGATE (coding) = surrogate
1686                 = UTF_16_HIGH_SURROGATE_P (c) ? c : 0;
1687               if (! surrogate)
1688                 *charbuf++ = c;
1689             }
1690           else
1691             {
                   /* Valid pair: combine into a character at or above
                      0x10000 and clear the pending state.  */
1692               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1693               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1694               *charbuf++ = 0x10000 + c;
1695             }
1696         }
1697       else
1698         {
1699           if (UTF_16_HIGH_SURROGATE_P (c))
1700             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1701           else
1702             {
1703               if (eol_dos && c == '\r')
1704                 {
                       /* Read the unit after CR ahead of time; the next
                          iteration takes its bytes from here.  */
1705                   ONE_MORE_BYTE (byte_after_cr1);
1706                   ONE_MORE_BYTE (byte_after_cr2);
1707                 }
1708               *charbuf++ = c;
1709             }
1710         }
1711     }
1712
       /* Also the target of ONE_MORE_BYTE when the source runs out
          (presumably -- the macro is defined outside this part of the
          file).  Only what was complete at the start of the current
          iteration is committed.  */
1713  no_more_source:
1714   coding->consumed_char += consumed_chars_base;
1715   coding->consumed = src_base - coding->source;
1716   coding->charbuf_used = charbuf - coding->charbuf;
1717 }
1718
     /* Encode the characters in coding->charbuf (coding->charbuf_used of
        them) as UTF-16, writing at coding->destination + coding->produced.
        A byte order mark is written first unless the BOM setting of
        CODING is utf_without_bom.  On completion update coding->produced
        and coding->produced_char, record CODING_RESULT_SUCCESS, and
        return 0.  */
1719 static bool
1720 encode_coding_utf_16 (struct coding_system *coding)
1721 {
1722   bool multibytep = coding->dst_multibyte;
1723   int *charbuf = coding->charbuf;
1724   int *charbuf_end = charbuf + coding->charbuf_used;
1725   unsigned char *dst = coding->destination + coding->produced;
1726   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each emit; presumably the
          room that must be available in the destination (macro not
          visible here -- confirm).  */
1727   int safe_room = 8;
1728   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1729   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1730   ptrdiff_t produced_chars = 0;
1731   int c;
1732
1733   if (bom != utf_without_bom)
1734     {
1735       ASSURE_DESTINATION (safe_room);
           /* BOM bytes: FE FF for big endian, FF FE for little endian.  */
1736       if (big_endian)
1737         EMIT_TWO_BYTES (0xFE, 0xFF);
1738       else
1739         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* So that the BOM is not written again on a later call.  */
1740       CODING_UTF_16_BOM (coding) = utf_without_bom;
1741     }
1742
1743   while (charbuf < charbuf_end)
1744     {
1745       ASSURE_DESTINATION (safe_room);
1746       c = *charbuf++;
           /* Characters above MAX_UNICODE_CHAR are replaced by the coding
              system's default character.  */
1747       if (c > MAX_UNICODE_CHAR)
1748         c = coding->default_char;
1749
1750       if (c < 0x10000)
1751         {
               /* One 16-bit unit.  */
1752           if (big_endian)
1753             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1754           else
1755             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1756         }
1757       else
1758         {
               /* Surrogate pair: C1 carries the bits above the low ten of
                  (C - 0x10000), C2 the low ten bits.  */
1759           int c1, c2;
1760
1761           c -= 0x10000;
1762           c1 = (c >> 10) + 0xD800;
1763           c2 = (c & 0x3FF) + 0xDC00;
1764           if (big_endian)
1765             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1766           else
1767             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1768         }
1769     }
1770   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1771   coding->produced = dst - coding->destination;
1772   coding->produced_char += produced_chars;
1773   return 0;
1774 }
1775
1776 \f
1777 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1778
1779 /* Emacs' internal format for representation of multiple character
1780    sets is a kind of multi-byte encoding, i.e. characters are
1781    represented by variable-length sequences of one-byte codes.
1782
1783    ASCII characters and control characters (e.g. `tab', `newline') are
1784    represented by one-byte sequences which are their ASCII codes, in
1785    the range 0x00 through 0x7F.
1786
1787    8-bit characters of the range 0x80..0x9F are represented by
1788    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1789    code + 0x20).
1790
1791    8-bit characters of the range 0xA0..0xFF are represented by
1792    one-byte sequences which are their 8-bit code.
1793
1794    The other characters are represented by a sequence of `base
1795    leading-code', optional `extended leading-code', and one or two
1796    `position-code's.  The length of the sequence is determined by the
1797    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1798    whereas extended leading-code and position-code take the range 0xA0
1799    through 0xFF.  See `charset.h' for more details about leading-code
1800    and position-code.
1801
1802    --- CODE RANGE of Emacs' internal format ---
1803    character set        range
1804    -------------        -----
1805    ascii                0x00..0x7F
1806    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1807    eight-bit-graphic    0xA0..0xFF
1808    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1809    ---------------------------------------------
1810
1811    As this is the internal character representation, the format is
1812    usually not used externally (i.e. in a file or in data sent to a
1813    process).  But it is possible to have text externally in this
1814    format (e.g. by encoding with the coding system `emacs-mule').
1815
1816    In that case, a sequence of one-byte codes has a slightly different
1817    form.
1818
1819    At first, all characters in eight-bit-control are represented by
1820    one-byte sequences which are their 8-bit code.
1821
1822    Next, character composition data are represented by the byte
1823    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1824    where,
1825         METHOD is 0xF2 plus one of composition method (enum
1826         composition_method),
1827
1828         BYTES is 0xA0 plus a byte length of this composition data,
1829
1830         CHARS is 0xA0 plus a number of characters composed by this
1831         data,
1832
1833         COMPONENTs are characters of multibyte form or composition
1834         rules encoded by two-byte of ASCII codes.
1835
1836    In addition, for backward compatibility, the following formats are
1837    also recognized as composition data on decoding.
1838
1839    0x80 MSEQ ...
1840    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1841
1842    Here,
1843         MSEQ is a multibyte form in one of these special formats:
1844           ASCII: 0xA0 ASCII_CODE+0x80,
1845           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1846         RULE is a one byte code of the range 0xA0..0xF0 that
1847         represents a composition rule.
1848   */
1849
     /* Table indexed by a byte value.  As used in this part of the file,
        emacs_mule_bytes[C] is the total length in bytes (1 to 4) of the
        emacs-mule sequence whose first byte is C.  It is not filled in
        here.  */
1850 char emacs_mule_bytes[256];
1851
1852
1853 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1854    Return true if a text is encoded in 'emacs-mule'.
     
        Concretely: return 0 with CATEGORY_MASK_EMACS_MULE added to
        DETECT_INFO->rejected as soon as an invalid sequence (or ESC, SI,
        SO) is seen, or when the last block ends in the middle of a
        sequence.  Otherwise return 1 when the source is exhausted, adding
        CATEGORY_MASK_EMACS_MULE to DETECT_INFO->found only if at least
        one multibyte or composition sequence was seen.  */
1855
1856 static bool
1857 detect_coding_emacs_mule (struct coding_system *coding,
1858                           struct coding_detection_info *detect_info)
1859 {
1860   const unsigned char *src = coding->source, *src_base;
1861   const unsigned char *src_end = coding->source + coding->src_bytes;
1862   bool multibytep = coding->src_multibyte;
1863   ptrdiff_t consumed_chars = 0;
1864   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a valid non-ASCII
          sequence has been seen.  */
1865   int found = 0;
1866
1867   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1868   /* A coding system of this category is always ASCII compatible.  */
1869   src += coding->head_ascii;
1870
1871   while (1)
1872     {
           /* Start of the sequence being examined; compared with SRC at
              no_more_source to detect a truncated sequence.  */
1873       src_base = src;
1874       ONE_MORE_BYTE (c);
1875       if (c < 0)
1876         continue;
1877       if (c == 0x80)
1878         {
1879           /* Perhaps the start of composite character.  We simply skip
1880              it because analyzing it is too heavy for detecting.  But,
1881              at least, we check that the composite character
1882              consists of more than 4 bytes.  */
1883           const unsigned char *src_start;
1884
1885         repeat:
1886           src_start = src;
               /* Skip bytes >= 0xA0; the byte that ends the run is left
                  in C and is also counted in SRC - SRC_START.  */
1887           do
1888             {
1889               ONE_MORE_BYTE (c);
1890             }
1891           while (c >= 0xA0);
1892
1893           if (src - src_start <= 4)
1894             break;
1895           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition starts immediately.  */
1896           if (c == 0x80)
1897             goto repeat;
1898         }
1899
1900       if (c < 0x80)
1901         {
               /* ESC, SI and SO reject this category.  */
1902           if (c < 0x20
1903               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1904             break;
1905         }
1906       else
1907         {
               /* C is a leading code; every remaining byte of its
                  sequence must be >= 0xA0.  */
1908           int more_bytes = emacs_mule_bytes[c] - 1;
1909
1910           while (more_bytes > 0)
1911             {
1912               ONE_MORE_BYTE (c);
1913               if (c < 0xA0)
1914                 {
1915                   src--;        /* Unread the last byte.  */
1916                   break;
1917                 }
1918               more_bytes--;
1919             }
1920           if (more_bytes != 0)
1921             break;
1922           found = CATEGORY_MASK_EMACS_MULE;
1923         }
1924     }
       /* Every `break' out of the loop above means an invalid sequence.  */
1925   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1926   return 0;
1927
1928  no_more_source:
       /* Bytes consumed past SRC_BASE mean the source ended inside a
          sequence; that is an error only in the last block.  */
1929   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1930     {
1931       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1932       return 0;
1933     }
1934   detect_info->found |= found;
1935   return 1;
1936 }
1937
1938
1939 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1940    character.  If CMP_STATUS indicates that we must expect MSEQ or
1941    RULE described above, decode it and return the negative value of
1942    the decoded character or rule.  If an invalid byte is found, return
1943    -1.  If SRC is too short, return -2.
     
        On success store in *NBYTES the number of source bytes consumed,
        in *NCHARS the local consumed_chars count, and, if ID is non-null,
        in *ID the id of the character's charset.  When a RULE byte is
        returned, *NBYTES and *NCHARS are set but *ID is left untouched.
        Nothing is stored on the -1 and -2 returns.  */
1944
1945 static int
1946 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1947                  int *nbytes, int *nchars, int *id,
1948                  struct composition_status *cmp_status)
1949 {
1950   const unsigned char *src_end = coding->source + coding->src_bytes;
1951   const unsigned char *src_base = src;
1952   bool multibytep = coding->src_multibyte;
1953   int charset_ID;
1954   unsigned code;
1955   int c;
1956   int consumed_chars = 0;
       /* Set when the sequence was an old-form MSEQ, in which case the
          result is returned negated.  */
1957   bool mseq_found = 0;
1958
1959   ONE_MORE_BYTE (c);
1960   if (c < 0)
1961     {
           /* NOTE(review): a negative value is presumably a negated
              character delivered by ONE_MORE_BYTE (macro not visible
              here -- confirm); it is returned as is, with the charset
              taken from emacs_mule_charset[0].  */
1962       c = -c;
1963       charset_ID = emacs_mule_charset[0];
1964     }
1965   else
1966     {
1967       if (c >= 0xA0)
1968         {
               /* A first byte >= 0xA0 is accepted only inside an
                  old-form composition.  */
1969           if (cmp_status->state != COMPOSING_NO
1970               && cmp_status->old_form)
1971             {
1972               if (cmp_status->state == COMPOSING_CHAR)
1973                 {
                       /* MSEQ: 0xA0 followed by (ASCII code + 0x80), or
                          (leading code + 0x20) followed by the rest.  */
1974                   if (c == 0xA0)
1975                     {
1976                       ONE_MORE_BYTE (c);
1977                       c -= 0x80;
1978                       if (c < 0)
1979                         goto invalid_code;
1980                     }
1981                   else
1982                     c -= 0x20;
1983                   mseq_found = 1;
1984                 }
1985               else
1986                 {
                       /* Not expecting a character, so C is a RULE byte;
                          return it negated without decoding.  */
1987                   *nbytes = src - src_base;
1988                   *nchars = consumed_chars;
1989                   return -c;
1990                 }
1991             }
1992           else
1993             goto invalid_code;
1994         }
1995
           /* Dispatch on the total length of the sequence led by C.  */
1996       switch (emacs_mule_bytes[c])
1997         {
1998         case 2:
               /* Leading code, one position code.  */
1999           if ((charset_ID = emacs_mule_charset[c]) < 0)
2000             goto invalid_code;
2001           ONE_MORE_BYTE (c);
2002           if (c < 0xA0)
2003             goto invalid_code;
2004           code = c & 0x7F;
2005           break;
2006
2007         case 3:
2008           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2009               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2010             {
                   /* Private leading code, extended leading code that
                      selects the charset, one position code.  */
2011               ONE_MORE_BYTE (c);
2012               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2013                 goto invalid_code;
2014               ONE_MORE_BYTE (c);
2015               if (c < 0xA0)
2016                 goto invalid_code;
2017               code = c & 0x7F;
2018             }
2019           else
2020             {
                   /* Leading code, two position codes.  */
2021               if ((charset_ID = emacs_mule_charset[c]) < 0)
2022                 goto invalid_code;
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0)
2025                 goto invalid_code;
2026               code = (c & 0x7F) << 8;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code |= c & 0x7F;
2031             }
2032           break;
2033
2034         case 4:
               /* Leading code, a byte that selects the charset, two
                  position codes.  */
2035           ONE_MORE_BYTE (c);
2036           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2037             goto invalid_code;
2038           ONE_MORE_BYTE (c);
2039           if (c < 0xA0)
2040             goto invalid_code;
2041           code = (c & 0x7F) << 8;
2042           ONE_MORE_BYTE (c);
2043           if (c < 0xA0)
2044             goto invalid_code;
2045           code |= c & 0x7F;
2046           break;
2047
2048         case 1:
               /* Single byte: ASCII or a raw 8-bit byte.  */
2049           code = c;
2050           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2051           break;
2052
2053         default:
2054           abort ();
2055         }
           /* Map (charset, CODE) to a character in C; a negative result
              is treated as an invalid code.  */
2056       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2057                           CHARSET_FROM_ID (charset_ID), code, c);
2058       if (c < 0)
2059         goto invalid_code;
2060     }
2061   *nbytes = src - src_base;
2062   *nchars = consumed_chars;
2063   if (id)
2064     *id = charset_ID;
2065   return (mseq_found ? -c : c);
2066
2067  no_more_source:
2068   return -2;
2069
2070  invalid_code:
2071   return -1;
2072 }
2073
2074
2075 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2076
2077 /* Handle these composition sequences ('|': the end of header elements,
2078    BYTES and CHARS >= 0xA0):
2079
2080    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2081    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2082    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2083
2084    and these old forms:
2085
2086    (4) relative composition: 0x80 | MSEQ ... MSEQ
2087    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2088
2089    When the starter 0x80 and the following header elements are found,
2090    this annotation header is produced.
2091
2092         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2093
2094    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2095    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2096
2097    Then, upon reading the following elements, these codes are produced
2098    until the composition end is found:
2099
2100    (1) CHAR ... CHAR
2101    (2) ALT ... ALT CHAR ... CHAR
2102    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2103    (4) CHAR ... CHAR
2104    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2105
2106    When the composition end is found, LENGTH and NCHARS in the
2107    annotation header are updated as below:
2108
2109    (1) LENGTH: unchanged, NCHARS: unchanged
2110    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2111    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2112    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2113    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2114
2115    If an error is found while composing, the annotation header is
2116    changed to the original composition header (plus filler -1s) as
2117    below:
2118
2119    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2120    (5)          [ 0x80 0xFF -1 -1 -1 ]
2121
2122    and the sequence [ -2 DECODED-RULE ] is changed to the original
2123    byte sequence as below:
2124         o the original byte sequence is B: [ B -1 ]
2125         o the original byte sequence is B1 B2: [ B1 B2 ]
2126
2127    Most of the routines are implemented as macros because many
2128    variables and labels in the caller decode_coding_emacs_mule must be
2129    accessible, and they are usually called just once (thus they don't
2130    increase the size of the compiled object).  */
2131
2132 /* Decode a composition rule represented by C as a component of
2133    composition sequence of Emacs 20 style.  Set RULE to the decoded
2134    rule.
     
        C is modified: 0xA0 is subtracted and the result must be in 0..80,
        otherwise control jumps to the caller's `invalid_code' label.  The
        result is split as GREF = C / 9 and NREF = C % 9, a value of 4 in
        either being replaced by 10, and the two are combined with
        COMPOSITION_ENCODE_RULE.  */
2135
2136 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2137   do {                                                  \
2138     int gref, nref;                                     \
2139                                                         \
2140     c -= 0xA0;                                          \
2141     if (c < 0 || c >= 81)                               \
2142       goto invalid_code;                                \
2143     gref = c / 9, nref = c % 9;                         \
2144     if (gref == 4) gref = 10;                           \
2145     if (nref == 4) nref = 10;                           \
2146     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2147   } while (0)
2148
2149
2150 /* Decode a composition rule represented by C and the following byte
2151    at SRC as a component of composition sequence of Emacs 21 style.
2152    Set RULE to the decoded rule.
     
        GREF is C - 0x20 and NREF is the next byte (read with
        ONE_MORE_BYTE into C) - 0x20.  Each must be in 0..80, otherwise
        control jumps to the caller's `invalid_code' label.  */
2153
2154 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2155   do {                                                  \
2156     int gref, nref;                                     \
2157                                                         \
2158     gref = c - 0x20;                                    \
2159     if (gref < 0 || gref >= 81)                         \
2160       goto invalid_code;                                \
2161     ONE_MORE_BYTE (c);                                  \
2162     nref = c - 0x20;                                    \
2163     if (nref < 0 || nref >= 81)                         \
2164       goto invalid_code;                                \
2165     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2166   } while (0)
2167
2168
2169 /* Start of Emacs 21 style format.  On entry C holds the already-read
2170    METHOD byte (0xF2 + composition method); the next two bytes at SRC
2171    are BYTES (0xA0 + byte length of this composition information) and
2172    CHARS (0xA0 + number of characters composed by this composition).
     
        Jump to the caller's `invalid_code' label if BYTES - 0xA0 is less
        than 3 (or is not exactly 4 for a relative composition), or if
        CHARS - 0xA0 is not in 1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise
        initialize *CMP_STATUS for a new-form composition (state
        COMPOSING_CHAR for a relative composition, else
        COMPOSING_COMPONENT_CHAR; ncomps = BYTES - 0xA0 - 4) and pass
        the header values to ADD_COMPOSITION_DATA.  */
2173
2174 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2175   do {                                                                  \
2176     enum composition_method method = c - 0xF2;                          \
2177     int nbytes, nchars;                                                 \
2178                                                                         \
2179     ONE_MORE_BYTE (c);                                                  \
2180     if (c < 0)                                                          \
2181       goto invalid_code;                                                \
2182     nbytes = c - 0xA0;                                                  \
2183     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2184       goto invalid_code;                                                \
2185     ONE_MORE_BYTE (c);                                                  \
2186     nchars = c - 0xA0;                                                  \
2187     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2188       goto invalid_code;                                                \
2189     cmp_status->old_form = 0;                                           \
2190     cmp_status->method = method;                                        \
2191     if (method == COMPOSITION_RELATIVE)                                 \
2192       cmp_status->state = COMPOSING_CHAR;                               \
2193     else                                                                \
2194       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2195     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2196     cmp_status->nchars = nchars;                                        \
2197     cmp_status->ncomps = nbytes - 4;                                    \
2198     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2199   } while (0)
2200
2201
2202 /* Start of Emacs 20 style format for relative composition.
     
        Mark *CMP_STATUS as an old-form COMPOSITION_RELATIVE composition in
        state COMPOSING_CHAR with zero nchars and ncomps, and pass zero
        NCHARS and NBYTES to ADD_COMPOSITION_DATA.  */
2203
2204 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2205   do {                                                          \
2206     cmp_status->old_form = 1;                                   \
2207     cmp_status->method = COMPOSITION_RELATIVE;                  \
2208     cmp_status->state = COMPOSING_CHAR;                         \
2209     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2210     cmp_status->nchars = cmp_status->ncomps = 0;                \
2211     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2212   } while (0)
2213
2214
2215 /* Start of Emacs 20 style format for rule-base composition.
     
        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that the
        method recorded is COMPOSITION_WITH_RULE.  */
2216
2217 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2218   do {                                                          \
2219     cmp_status->old_form = 1;                                   \
2220     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2221     cmp_status->state = COMPOSING_CHAR;                         \
2222     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2223     cmp_status->nchars = cmp_status->ncomps = 0;                \
2224     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2225   } while (0)
2226
2227
     /* Called after the composition starter 0x80 has been read.  Read the
        next byte into C and choose the composition format from it:
     
          0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
                     -> Emacs 21 style header
          0xA0..0xBF -> Emacs 20 relative composition; SRC is moved back so
                        that the byte is read again as the first component
          0xFF       -> Emacs 20 rule-base composition
     
        A negative C or any other value jumps to the caller's
        `invalid_code' label.  */
2228 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2229   do {                                                  \
2230     const unsigned char *current_src = src;             \
2231                                                         \
2232     ONE_MORE_BYTE (c);                                  \
2233     if (c < 0)                                          \
2234       goto invalid_code;                                \
2235     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2236         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2237       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2238     else if (c < 0xA0)                                  \
2239       goto invalid_code;                                \
2240     else if (c < 0xC0)                                  \
2241       {                                                 \
2242         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2243         /* Re-read C as a composition component.  */    \
2244         src = current_src;                              \
2245       }                                                 \
2246     else if (c == 0xFF)                                 \
2247       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2248     else                                                \
2249       goto invalid_code;                                \
2250   } while (0)
2251
     /* Close the current composition.  CHARBUF[-cmp_status->length] is
        taken as the start of the annotation header, in which (per the
        header layout described earlier in this file) slot 0 is -LENGTH
        and slot 2 is NCHARS.  For an old-form composition store the
        counted number of characters in NCHARS; for a new-form composition
        other than relative, set slot 0 to NCHARS - cmp_status->length.
        Then reset the state to COMPOSING_NO.  */
2252 #define EMACS_MULE_COMPOSITION_END()                            \
2253   do {                                                          \
2254     int idx = - cmp_status->length;                             \
2255                                                                 \
2256     if (cmp_status->old_form)                                   \
2257       charbuf[idx + 2] = cmp_status->nchars;                    \
2258     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2259       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2260     cmp_status->state = COMPOSING_NO;                           \
2261   } while (0)
2262
2263
     /* Terminate a composition that is still in progress.  CHARBUF points
        just past the last element produced, so CHARBUF[-cmp_status->length]
        is the start of the composition's annotation header.
     
        An old-form composition that already has at least one character is
        kept: its character count is stored in the header.  Any other
        composition is cancelled by overwriting the header with the raw
        bytes of the original composition header.
     
        Reset CMP_STATUS->state to COMPOSING_NO and return a count that
        EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to the caller's
        char_offset.  */
2264 static int
2265 emacs_mule_finish_composition (int *charbuf,
2266                                struct composition_status *cmp_status)
2267 {
2268   int idx = - cmp_status->length;
2269   int new_chars;
2270
2271   if (cmp_status->old_form && cmp_status->nchars > 0)
2272     {
           /* Keep the composition; slot 2 of the header is NCHARS.  */
2273       charbuf[idx + 2] = cmp_status->nchars;
2274       new_chars = 0;
2275       if (cmp_status->method == COMPOSITION_WITH_RULE
2276           && cmp_status->state == COMPOSING_CHAR)
2277         {
2278           /* The last rule was invalid.  */
               /* Turn the trailing two elements back into the original
                  rule byte followed by a -1 filler.  */
2279           int rule = charbuf[-1] + 0xA0;
2280
2281           charbuf[-2] = BYTE8_TO_CHAR (rule);
2282           charbuf[-1] = -1;
2283           new_chars = 1;
2284         }
2285     }
2286   else
2287     {
           /* Cancel: the header starts again with the raw starter byte.  */
2288       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2289
2290       if (cmp_status->method == COMPOSITION_WITH_RULE)
2291         {
               /* NOTE(review): the format description earlier in this
                  file shows -1 fillers after 0x80 0xFF, whereas -3 and 0
                  are stored here, and two raw bytes are stored while
                  NEW_CHARS is 1 -- confirm which is intended.  */
2292           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2293           charbuf[idx++] = -3;
2294           charbuf[idx++] = 0;
2295           new_chars = 1;
2296         }
2297       else
2298         {
               /* IDX is now 1 past the header start, so these read the
                  NCHARS and NBYTES slots before they are overwritten.  */
2299           int nchars = charbuf[idx + 1] + 0xA0;
2300           int nbytes = charbuf[idx + 2] + 0xA0;
2301
2302           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2303           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2304           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2305           charbuf[idx++] = -1;
2306           new_chars = 4;
2307         }
2308     }
2309   cmp_status->state = COMPOSING_NO;
2310   return new_chars;
2311 }
2312
     /* If a composition is in progress, finish it with
        emacs_mule_finish_composition and add the value it returns to the
        caller's char_offset.  Uses the caller's charbuf, cmp_status and
        char_offset variables.  */
2313 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2314   do {                                                                    \
2315     if (cmp_status->state != COMPOSING_NO)                                \
2316       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2317   } while (0)
2318
2319
2320 static void
2321 decode_coding_emacs_mule (struct coding_system *coding)
2322 {
2323   const unsigned char *src = coding->source + coding->consumed;
2324   const unsigned char *src_end = coding->source + coding->src_bytes;
2325   const unsigned char *src_base;
2326   int *charbuf = coding->charbuf + coding->charbuf_used;
2327   /* We may produce two annotations (charset and composition) in one
2328      loop and one more charset annotation at the end.  */
2329   int *charbuf_end
2330     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2331       /* We can produce up to 2 characters in a loop.  */
2332       - 1;
2333   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2334   bool multibytep = coding->src_multibyte;
2335   ptrdiff_t char_offset = coding->produced_char;
2336   ptrdiff_t last_offset = char_offset;
2337   int last_id = charset_ascii;
2338   bool eol_dos
2339     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2340   int byte_after_cr = -1;
2341   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2342
2343   if (cmp_status->state != COMPOSING_NO)
2344     {
2345       int i;
2346
2347       if (charbuf_end - charbuf < cmp_status->length)
2348         abort ();
2349       for (i = 0; i < cmp_status->length; i++)
2350         *charbuf++ = cmp_status->carryover[i];
2351       coding->annotated = 1;
2352     }
2353
2354   while (1)
2355     {
2356       int c, id IF_LINT (= 0);
2357
2358       src_base = src;
2359       consumed_chars_base = consumed_chars;
2360
2361       if (charbuf >= charbuf_end)
2362         {
2363           if (byte_after_cr >= 0)
2364             src_base--;
2365           break;
2366         }
2367
2368       if (byte_after_cr >= 0)
2369         c = byte_after_cr, byte_after_cr = -1;
2370       else
2371         ONE_MORE_BYTE (c);
2372
2373       if (c < 0 || c == 0x80)
2374         {
2375           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2376           if (c < 0)
2377             {
2378               *charbuf++ = -c;
2379               char_offset++;
2380             }
2381           else
2382             DECODE_EMACS_MULE_COMPOSITION_START ();
2383           continue;
2384         }
2385
2386       if (c < 0x80)
2387         {
2388           if (eol_dos && c == '\r')
2389             ONE_MORE_BYTE (byte_after_cr);
2390           id = charset_ascii;
2391           if (cmp_status->state != COMPOSING_NO)
2392             {
2393               if (cmp_status->old_form)
2394                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2395               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2396                 cmp_status->ncomps--;
2397             }
2398         }
2399       else
2400         {
2401           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2402           /* emacs_mule_char can load a charset map from a file, which
2403              allocates a large structure and might cause buffer text
2404              to be relocated as result.  Thus, we need to remember the
2405              original pointer to buffer text, and fix up all related
2406              pointers after the call.  */
2407           const unsigned char *orig = coding->source;
2408           ptrdiff_t offset;
2409
2410           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2411                                cmp_status);
2412           offset = coding->source - orig;
2413           if (offset)
2414             {
2415               src += offset;
2416               src_base += offset;
2417               src_end += offset;
2418             }
2419           if (c < 0)
2420             {
2421               if (c == -1)
2422                 goto invalid_code;
2423               if (c == -2)
2424                 break;
2425             }
2426           src = src_base + nbytes;
2427           consumed_chars = consumed_chars_base + nchars;
2428           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2429             cmp_status->ncomps -= nchars;
2430         }
2431
2432       /* Now if C >= 0, we found a normally encoded character, if C <
2433          0, we found an old-style composition component character or
2434          rule.  */
2435
2436       if (cmp_status->state == COMPOSING_NO)
2437         {
2438           if (last_id != id)
2439             {
2440               if (last_id != charset_ascii)
2441                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2442                                   last_id);
2443               last_id = id;
2444               last_offset = char_offset;
2445             }
2446           *charbuf++ = c;
2447           char_offset++;
2448         }
2449       else if (cmp_status->state == COMPOSING_CHAR)
2450         {
2451           if (cmp_status->old_form)
2452             {
2453               if (c >= 0)
2454                 {
2455                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456                   *charbuf++ = c;
2457                   char_offset++;
2458                 }
2459               else
2460                 {
2461                   *charbuf++ = -c;
2462                   cmp_status->nchars++;
2463                   cmp_status->length++;
2464                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2465                     EMACS_MULE_COMPOSITION_END ();
2466                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2467                     cmp_status->state = COMPOSING_RULE;
2468                 }
2469             }
2470           else
2471             {
2472               *charbuf++ = c;
2473               cmp_status->length++;
2474               cmp_status->nchars--;
2475               if (cmp_status->nchars == 0)
2476                 EMACS_MULE_COMPOSITION_END ();
2477             }
2478         }
2479       else if (cmp_status->state == COMPOSING_RULE)
2480         {
2481           int rule;
2482
2483           if (c >= 0)
2484             {
2485               EMACS_MULE_COMPOSITION_END ();
2486               *charbuf++ = c;
2487               char_offset++;
2488             }
2489           else
2490             {
2491               c = -c;
2492               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2493               if (rule < 0)
2494                 goto invalid_code;
2495               *charbuf++ = -2;
2496               *charbuf++ = rule;
2497               cmp_status->length += 2;
2498               cmp_status->state = COMPOSING_CHAR;
2499             }
2500         }
2501       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2502         {
2503           *charbuf++ = c;
2504           cmp_status->length++;
2505           if (cmp_status->ncomps == 0)
2506             cmp_status->state = COMPOSING_CHAR;
2507           else if (cmp_status->ncomps > 0)
2508             {
2509               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2510                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2511             }
2512           else
2513             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2514         }
2515       else                      /* COMPOSING_COMPONENT_RULE */
2516         {
2517           int rule;
2518
2519           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2520           if (rule < 0)
2521             goto invalid_code;
2522           *charbuf++ = -2;
2523           *charbuf++ = rule;
2524           cmp_status->length += 2;
2525           cmp_status->ncomps--;
2526           if (cmp_status->ncomps > 0)
2527             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2528           else
2529             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2530         }
2531       continue;
2532
2533     invalid_code:
2534       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2535       src = src_base;
2536       consumed_chars = consumed_chars_base;
2537       ONE_MORE_BYTE (c);
2538       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2539       char_offset++;
2540       coding->errors++;
2541     }
2542
2543  no_more_source:
2544   if (cmp_status->state != COMPOSING_NO)
2545     {
2546       if (coding->mode & CODING_MODE_LAST_BLOCK)
2547         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       else
2549         {
2550           int i;
2551
2552           charbuf -= cmp_status->length;
2553           for (i = 0; i < cmp_status->length; i++)
2554             cmp_status->carryover[i] = charbuf[i];
2555         }
2556     }
2557   if (last_id != charset_ascii)
2558     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2559   coding->consumed_char += consumed_chars_base;
2560   coding->consumed = src_base - coding->source;
2561   coding->charbuf_used = charbuf - coding->charbuf;
2562 }
2563
2564
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading code(s) for the charset whose emacs-mule id is ID.

   An id below 0xA0 is its own single leading code, and CODES[1] is set
   to 0 to mark the absence of a second one.  Any other id is emitted
   as the second byte after a prefix chosen by range:
     0xA0..0xDF -> 0x9A,  0xE0..0xEF -> 0x9B,
     0xF0..0xF4 -> 0x9C,  0xF5 and above -> 0x9D.

   The expansion is a single statement without a trailing semicolon, so
   it can be used as the body of an unbraced `if' that has an `else'.
   ID is evaluated exactly once.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    int leading_id_ = (id);                             \
    if (leading_id_ < 0xA0)                             \
      (codes)[0] = leading_id_, (codes)[1] = 0;         \
    else if (leading_id_ < 0xE0)                        \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF0)                        \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_;      \
    else if (leading_id_ < 0xF5)                        \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_;      \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_;      \
  } while (0)
2578
2579
/* Encode the CODING->charbuf_used elements of CODING->charbuf into
   emacs-mule byte sequences, writing them to CODING->destination
   starting at offset CODING->produced.

   Negative elements of the buffer introduce annotations.  Only the
   charset annotation is acted upon (it selects a preferred charset for
   the characters that follow); composition annotations are skipped.

   On completion the conversion result is recorded as
   CODING_RESULT_SUCCESS, CODING->produced_char is advanced by the local
   `produced_chars' (presumably maintained by the EMIT_* macros --
   confirm against their definitions) and CODING->produced is set to
   the number of bytes now in the destination.  Always returns 0.  */
2580 static bool
2581 encode_coding_emacs_mule (struct coding_system *coding)
2582 {
2583   bool multibytep = coding->dst_multibyte;
2584   int *charbuf = coding->charbuf;
2585   int *charbuf_end = charbuf + coding->charbuf_used;
2586   unsigned char *dst = coding->destination + coding->produced;
2587   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Handed to ASSURE_DESTINATION on every iteration; presumably the
          number of destination bytes to guarantee before one character is
          emitted -- confirm against the macro.  */
2588   int safe_room = 8;
2589   ptrdiff_t produced_chars = 0;
2590   Lisp_Object attrs, charset_list;
2591   int c;
       /* Charset id taken from the latest charset annotation, or -1 when
          there is no usable preference.  */
2592   int preferred_charset_id = -1;
2593
2594   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; if the
          attributes name a different list, overwrite it there as well.  */
2595   if (! EQ (charset_list, Vemacs_mule_charset_list))
2596     {
2597       charset_list = Vemacs_mule_charset_list;
2598       ASET (attrs, coding_attr_charset_list, charset_list);
2599     }
2600
2601   while (charbuf < charbuf_end)
2602     {
2603       ASSURE_DESTINATION (safe_room);
2604       c = *charbuf++;
2605
2606       if (c < 0)
2607         {
2608           /* Handle an annotation.  */
2609           switch (*charbuf)
2610             {
2611             case CODING_ANNOTATE_COMPOSITION_MASK:
2612               /* Not yet implemented.  */
2613               break;
2614             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points one element past
                      the length word, so charbuf[3] is the fifth element
                      counted from the start of the annotation.  Confirm
                      this index against the layout written by
                      ADD_CHARSET_DATA; if a charset annotation is only
                      four elements long this reads the element after
                      it.  */
2615               preferred_charset_id = charbuf[3];
                   /* Drop the preference when that charset is not a
                      member of CHARSET_LIST.  */
2616               if (preferred_charset_id >= 0
2617                   && NILP (Fmemq (make_number (preferred_charset_id),
2618                                   charset_list)))
2619                 preferred_charset_id = -1;
2620               break;
2621             default:
2622               abort ();
2623             }
               /* -C is the total length of the annotation in charbuf
                  elements, including the length word consumed above;
                  hence the `- 1' when skipping the remainder.  */
2624           charbuf += -c - 1;
2625           continue;
2626         }
2627
2628       if (ASCII_CHAR_P (c))
2629         EMIT_ONE_ASCII_BYTE (c);
2630       else if (CHAR_BYTE8_P (c))
2631         {
               /* A raw-byte character is written as the byte obtained
                  from CHAR_TO_BYTE8.  */
2632           c = CHAR_TO_BYTE8 (c);
2633           EMIT_ONE_BYTE (c);
2634         }
2635       else
2636         {
2637           struct charset *charset;
2638           unsigned code;
2639           int dimension;
2640           int emacs_mule_id;
2641           unsigned char leading_codes[2];
2642
               /* Try the preferred charset first; if CODING_CHAR_CHARSET_P
                  leaves RESULT false, or there is no preference, look the
                  character up in CHARSET_LIST instead.  */
2643           if (preferred_charset_id >= 0)
2644             {
2645               bool result;
2646
2647               charset = CHARSET_FROM_ID (preferred_charset_id);
2648               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2649               if (result)
2650                 code = ENCODE_CHAR (charset, c);
2651               else
2652                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2653                                      &code, charset);
2654             }
2655           else
2656             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2657                                  &code, charset);
               /* No charset was found for C: substitute
                  CODING->default_char.  NOTE(review): CHARSET is not
                  checked again after the second lookup below; verify that
                  a non-ASCII default_char is always encodable, since
                  CHARSET is used unconditionally afterwards.  */
2658           if (! charset)
2659             {
2660               c = coding->default_char;
2661               if (ASCII_CHAR_P (c))
2662                 {
2663                   EMIT_ONE_ASCII_BYTE (c);
2664                   continue;
2665                 }
2666               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2667                                    &code, charset);
2668             }
               /* Emit the leading code(s), then the code point with the
                  high bit of each byte set: one byte when DIMENSION is 1,
                  two bytes (high byte first) otherwise.  */
2669           dimension = CHARSET_DIMENSION (charset);
2670           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2671           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2672           EMIT_ONE_BYTE (leading_codes[0]);
2673           if (leading_codes[1])
2674             EMIT_ONE_BYTE (leading_codes[1]);
2675           if (dimension == 1)
2676             EMIT_ONE_BYTE (code | 0x80);
2677           else
2678             {
2679               code |= 0x8080;
2680               EMIT_ONE_BYTE (code >> 8);
2681               EMIT_ONE_BYTE (code & 0xFF);
2682             }
2683         }
2684     }
2685   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2686   coding->produced_char += produced_chars;
2687   coding->produced = dst - coding->destination;
2688   return 0;
2689 }
2690
2691 \f
2692 /*** 7. ISO2022 handlers ***/
2693
2694 /* The following note describes the coding system ISO2022 briefly.
2695    Since the intention of this note is to help understand the
2696    functions in this file, some parts are NOT ACCURATE or are OVERLY
2697    SIMPLIFIED.  For thorough understanding, please refer to the
2698    original document of ISO2022.  This is equivalent to the standard
2699    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2700
2701    ISO2022 provides many mechanisms to encode several character sets
2702    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2703    is encoded using bytes less than 128.  This may make the encoded
2704    text a little bit longer, but the text passes more easily through
2705    several types of gateway, some of which strip off the MSB (Most
2706    Significant Bit).
2707
2708    There are two kinds of character sets: control character sets and
2709    graphic character sets.  The former contain control characters such
2710    as `newline' and `escape' to provide control functions (control
2711    functions are also provided by escape sequences).  The latter
2712    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2713    two control character sets and many graphic character sets.
2714
2715    Graphic character sets are classified into one of the following
2716    four classes, according to the number of bytes (DIMENSION) and
2717    number of characters in one dimension (CHARS) of the set:
2718    - DIMENSION1_CHARS94
2719    - DIMENSION1_CHARS96
2720    - DIMENSION2_CHARS94
2721    - DIMENSION2_CHARS96
2722
2723    In addition, each character set is assigned an identification tag,
2724    unique for each set, called the "final character" (denoted as <F>
2725    hereafter).  The <F> of each character set is decided by ECMA(*)
2726    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2727    (0x30..0x3F are for private use only).
2728
2729    Note (*): ECMA = European Computer Manufacturers Association
2730
2731    Here are examples of graphic character sets [NAME(<F>)]:
2732         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2733         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2734         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2735         o DIMENSION2_CHARS96 -- none for the moment
2736
2737    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2738         C0 [0x00..0x1F] -- control character plane 0
2739         GL [0x20..0x7F] -- graphic character plane 0
2740         C1 [0x80..0x9F] -- control character plane 1
2741         GR [0xA0..0xFF] -- graphic character plane 1
2742
2743    A control character set is directly designated and invoked to C0 or
2744    C1 by an escape sequence.  The most common case is that:
2745    - ISO646's  control character set is designated/invoked to C0, and
2746    - ISO6429's control character set is designated/invoked to C1,
2747    and usually these designations/invocations are omitted in encoded
2748    text.  In a 7-bit environment, only C0 can be used, and a control
2749    character for C1 is encoded by an appropriate escape sequence to
2750    fit into the environment.  All control characters for C1 are
2751    defined to have corresponding escape sequences.
2752
2753    A graphic character set is at first designated to one of four
2754    graphic registers (G0 through G3), then these graphic registers are
2755    invoked to GL or GR.  These designations and invocations can be
2756    done independently.  The most common case is that G0 is invoked to
2757    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2758    these invocations and designations are omitted in encoded text.
2759    In a 7-bit environment, only GL can be used.
2760
2761    When a graphic character set of CHARS94 is invoked to GL, codes
2762    0x20 and 0x7F of the GL area work as control characters SPACE and
2763    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2764    be used.
2765
2766    There are two ways of invocation: locking-shift and single-shift.
2767    With locking-shift, the invocation lasts until the next different
2768    invocation, whereas with single-shift, the invocation affects the
2769    following character only and doesn't affect the locking-shift
2770    state.  Invocations are done by the following control characters or
2771    escape sequences:
2772
2773    ----------------------------------------------------------------------
2774    abbrev  function                  cntrl escape seq   description
2775    ----------------------------------------------------------------------
2776    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2777    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2778    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2779    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2780    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2781    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2782    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2783    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2784    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2785    ----------------------------------------------------------------------
2786    (*) These are not used by any known coding system.
2787
2788    Control characters for these functions are defined by macros
2789    ISO_CODE_XXX in `coding.h'.
2790
2791    Designations are done by the following escape sequences:
2792    ----------------------------------------------------------------------
2793    escape sequence      description
2794    ----------------------------------------------------------------------
2795    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2796    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2797    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2798    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2799    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2800    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2801    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2802    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2803    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2804    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2805    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2806    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2807    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2808    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2809    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2810    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2811    ----------------------------------------------------------------------
2812
2813    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2814    of dimension 1, chars 94, and final character <F>, etc...
2815
2816    Note (*): Although these designations are not allowed in ISO2022,
2817    Emacs accepts them on decoding, and produces them on encoding
2818    CHARS96 character sets in a coding system which is characterized as
2819    7-bit environment, non-locking-shift, and non-single-shift.
2820
2821    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2822    '(' must be omitted.  We refer to this as "short-form" hereafter.
2823
2824    Now you may notice that there are a lot of ways of encoding the
2825    same multilingual text in ISO2022.  Actually, there exist many
2826    coding systems such as Compound Text (used in X11's inter-client
2827    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2828    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2829    localized platforms), and all of these are variants of ISO2022.
2830
2831    In addition to the above, Emacs handles two more kinds of escape
2832    sequences: ISO6429's direction specification and Emacs' private
2833    sequence for specifying character composition.
2834
2835    ISO6429's direction specification takes the following form:
2836         o CSI ']'      -- end of the current direction
2837         o CSI '0' ']'  -- end of the current direction
2838         o CSI '1' ']'  -- start of left-to-right text
2839         o CSI '2' ']'  -- start of right-to-left text
2840    The control character CSI (0x9B: control sequence introducer) is
2841    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2842
2843    Character composition specification takes the following form:
2844         o ESC '0' -- start relative composition
2845         o ESC '1' -- end composition
2846         o ESC '2' -- start rule-base composition (*)
2847         o ESC '3' -- start relative composition with alternate chars  (**)
2848         o ESC '4' -- start rule-base composition with alternate chars  (**)
2849   Since these are not standard escape sequences of any ISO standard,
2850   the use of them with these meanings is restricted to Emacs only.
2851
2852   (*) This form is used only in Emacs 20.7 and older versions,
2853   but newer versions can safely decode it.
2854   (**) This form is used only in Emacs 21.1 and newer versions,
2855   and older versions can't decode it.
2856
2857   Here's a list of example usages of these composition escape
2858   sequences (categorized by `enum composition_method').
2859
2860   COMPOSITION_RELATIVE:
2861         ESC 0 CHAR [ CHAR ] ESC 1
2862   COMPOSITION_WITH_RULE:
2863         ESC 2 CHAR [ RULE CHAR ] ESC 1
2864   COMPOSITION_WITH_ALTCHARS:
2865         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2866   COMPOSITION_WITH_RULE_ALTCHARS:
2867         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2868
/* File-local table of 256 `enum iso_code_class_type' entries.
   NOTE(review): presumably indexed by byte value to give the ISO-2022
   class of each byte -- confirm where the table is filled in.  */
2869 static enum iso_code_class_type iso_code_class[256];
2870
/* Return nonzero if the charset whose id is ID is usable with CODING,
   i.e. ID lies within 0 .. CODING->max_charset_id and its entry in
   CODING->safe_charsets is not the "unsupported" marker 255.

   A negative ID (the "no such charset" result of a table lookup) is
   reported as not safe instead of indexing before the start of the
   safe_charsets array.  ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2874
2875 static void
2876 setup_iso_safe_charsets (Lisp_Object attrs)
2877 {
2878   Lisp_Object charset_list, safe_charsets;
2879   Lisp_Object request;
2880   Lisp_Object reg_usage;
2881   Lisp_Object tail;
2882   EMACS_INT reg94, reg96;
2883   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2884   int max_charset_id;
2885
2886   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2887   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2888       && ! EQ (charset_list, Viso_2022_charset_list))
2889     {
2890       charset_list = Viso_2022_charset_list;
2891       ASET (attrs, coding_attr_charset_list, charset_list);
2892       ASET (attrs, coding_attr_safe_charsets, Qnil);
2893     }
2894
2895   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2896     return;
2897
2898   max_charset_id = 0;
2899   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2900     {
2901       int id = XINT (XCAR (tail));
2902       if (max_charset_id < id)
2903         max_charset_id = id;
2904     }
2905
2906   safe_charsets = make_uninit_string (max_charset_id + 1);
2907   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2908   request = AREF (attrs, coding_attr_iso_request);
2909   reg_usage = AREF (attrs, coding_attr_iso_usage);
2910   reg94 = XINT (XCAR (reg_usage));
2911   reg96 = XINT (XCDR (reg_usage));
2912
2913   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2914     {
2915       Lisp_Object id;
2916       Lisp_Object reg;
2917       struct charset *charset;
2918
2919       id = XCAR (tail);
2920       charset = CHARSET_FROM_ID (XINT (id));
2921       reg = Fcdr (Fassq (id, request));
2922       if (! NILP (reg))
2923         SSET (safe_charsets, XINT (id), XINT (reg));
2924       else if (charset->iso_chars_96)
2925         {
2926           if (reg96 < 4)
2927             SSET (safe_charsets, XINT (id), reg96);
2928         }
2929       else
2930         {
2931           if (reg94 < 4)
2932             SSET (safe_charsets, XINT (id), reg94);
2933         }
2934     }
2935   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2936 }
2937
2938
2939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2940    Return true if a text is encoded in one of ISO-2022 based coding
2941    systems.

        Scanning starts after the first CODING->head_ascii bytes of
        CODING->source.  While scanning, the ISO categories the text is
        compatible with are accumulated in the local mask `found' and the
        incompatible ones in `rejected'.

        Returns 0, after marking every ISO category in
        DETECT_INFO->rejected, as soon as all of them have been rejected.
        Returns 1 through the `no_more_source' label, after merging
        `rejected' into DETECT_INFO->rejected and the non-rejected part of
        `found' into DETECT_INFO->found.  NOTE(review): that label is not
        the target of any visible `goto'; presumably ONE_MORE_BYTE jumps
        there when the source is exhausted -- confirm against the macro.  */
2942
2943 static bool
2944 detect_coding_iso_2022 (struct coding_system *coding,
2945                         struct coding_detection_info *detect_info)
2946 {
2947   const unsigned char *src = coding->source, *src_base = src;
2948   const unsigned char *src_end = coding->source + coding->src_bytes;
2949   bool multibytep = coding->src_multibyte;
2950   bool single_shifting = 0;
2951   int id;
2952   int c, c1;
2953   ptrdiff_t consumed_chars = 0;
2954   int i;
2955   int rejected = 0;
2956   int found = 0;
       /* -1 while outside a composition sequence; otherwise a running
          count of the units seen since the composition started.  */
2957   int composition_count = -1;
2958
2959   detect_info->checked |= CATEGORY_MASK_ISO;
2960
       /* Refresh the cached safe-charset table (max_charset_id and
          safe_charsets) of every configured ISO category from its
          attributes.  Categories whose id is negative are skipped.  */
2961   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2962     {
2963       struct coding_system *this = &(coding_categories[i]);
2964       Lisp_Object attrs, val;
2965
2966       if (this->id < 0)
2967         continue;
2968       attrs = CODING_ID_ATTRS (this->id);
2969       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2970           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2971         setup_iso_safe_charsets (attrs);
2972       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2973       this->max_charset_id = SCHARS (val) - 1;
2974       this->safe_charsets = SDATA (val);
2975     }
2976
2977   /* A coding system of this category is always ASCII compatible.  */
2978   src += coding->head_ascii;
2979
2980   while (rejected != CATEGORY_MASK_ISO)
2981     {
2982       src_base = src;
2983       ONE_MORE_BYTE (c);
2984       switch (c)
2985         {
2986         case ISO_CODE_ESC:
               /* With escape detection inhibited, ESC changes no state.  */
2987           if (inhibit_iso_escape_detection)
2988             break;
2989           single_shifting = 0;
2990           ONE_MORE_BYTE (c);
2991           if (c == 'N' || c == 'O')
2992             {
2993               /* ESC <Fe> for SS2 or SS3.  */
2994               single_shifting = 1;
2995               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2996             }
2997           else if (c == '1')
2998             {
2999               /* End of composition.  */
3000               if (composition_count < 0
3001                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3002                 /* Invalid */
3003                 break;
3004               composition_count = -1;
3005               found |= CATEGORY_MASK_ISO;
3006             }
               /* '1' was handled above, so C is '0', '2', '3' or '4'
                  here.  */
3007           else if (c >= '0' && c <= '4')
3008             {
3009               /* ESC <Fp> for start/end composition.  */
3010               composition_count = 0;
3011             }
3012           else
3013             {
                   /* In the table lookups below the second index is 1 for
                      the intermediates ',' .. '/' and 0 for '(' .. '+'.  */
3014               if (c >= '(' && c <= '/')
3015                 {
3016                   /* Designation sequence for a charset of dimension 1.  */
3017                   ONE_MORE_BYTE (c1);
3018                   if (c1 < ' ' || c1 >= 0x80
3019                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3020                     /* Invalid designation sequence.  Just ignore.  */
3021                     break;
3022                 }
3023               else if (c == '$')
3024                 {
3025                   /* Designation sequence for a charset of dimension 2.  */
3026                   ONE_MORE_BYTE (c);
3027                   if (c >= '@' && c <= 'B')
3028                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other lookups, ID is
                            not tested for < 0 here before it reaches
                            SAFE_CHARSET_P below; confirm these three
                            table entries can never be negative.  */
3029                     id = iso_charset_table[1][0][c];
3030                   else if (c >= '(' && c <= '/')
3031                     {
3032                       ONE_MORE_BYTE (c1);
3033                       if (c1 < ' ' || c1 >= 0x80
3034                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3035                         /* Invalid designation sequence.  Just ignore.  */
3036                         break;
3037                     }
3038                   else
3039                     /* Invalid designation sequence.  Just ignore it.  */
3040                     break;
3041                 }
3042               else
3043                 {
3044                   /* Invalid escape sequence.  Just ignore it.  */
3045                   break;
3046                 }
3047
3048               /* We found a valid designation sequence for CHARSET.  */
                   /* Each 7-bit category and ISO_8_ELSE is marked found or
                      rejected according to whether the designated charset
                      is in its safe-charset table.  */
3049               rejected |= CATEGORY_MASK_ISO_8BIT;
3050               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3051                                   id))
3052                 found |= CATEGORY_MASK_ISO_7;
3053               else
3054                 rejected |= CATEGORY_MASK_ISO_7;
3055               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3056                                   id))
3057                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3058               else
3059                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3060               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3061                                   id))
3062                 found |= CATEGORY_MASK_ISO_7_ELSE;
3063               else
3064                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3065               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3066                                   id))
3067                 found |= CATEGORY_MASK_ISO_8_ELSE;
3068               else
3069                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3070             }
3071           break;
3072
3073         case ISO_CODE_SO:
3074         case ISO_CODE_SI:
3075           /* Locking shift out/in.  */
3076           if (inhibit_iso_escape_detection)
3077             break;
3078           single_shifting = 0;
3079           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3080           break;
3081
3082         case ISO_CODE_CSI:
3083           /* Control sequence introducer.  */
3084           single_shifting = 0;
3085           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3086           found |= CATEGORY_MASK_ISO_8_ELSE;
               /* The byte must additionally pass the latin-extra check in
                  the SS2/SS3 case below.  */
3087           goto check_extra_latin;
3088
3089         case ISO_CODE_SS2:
3090         case ISO_CODE_SS3:
3091           /* Single shift.   */
3092           if (inhibit_iso_escape_detection)
3093             break;
3094           single_shifting = 0;
3095           rejected |= CATEGORY_MASK_ISO_7BIT;
3096           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3097               & CODING_ISO_FLAG_SINGLE_SHIFT)
3098             {
3099               found |= CATEGORY_MASK_ISO_8_1;
3100               single_shifting = 1;
3101             }
3102           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3103               & CODING_ISO_FLAG_SINGLE_SHIFT)
3104             {
3105               found |= CATEGORY_MASK_ISO_8_2;
3106               single_shifting = 1;
3107             }
3108           if (single_shifting)
3109             break;
               /* Neither 8-bit category accepts single shifts: fall
                  through and treat the byte as a possible latin-extra
                  code.  */
3110         check_extra_latin:
               /* The byte is acceptable only if Vlatin_extra_code_table is
                  a vector with a non-nil entry for it; otherwise every ISO
                  category is rejected, which ends the scan.  */
3111           if (! VECTORP (Vlatin_extra_code_table)
3112               || NILP (AREF (Vlatin_extra_code_table, c)))
3113             {
3114               rejected = CATEGORY_MASK_ISO;
3115               break;
3116             }
3117           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3118               & CODING_ISO_FLAG_LATIN_EXTRA)
3119             found |= CATEGORY_MASK_ISO_8_1;
3120           else
3121             rejected |= CATEGORY_MASK_ISO_8_1;
3122           rejected |= CATEGORY_MASK_ISO_8_2;
3123           break;
3124
3125         default:
               /* A negative value from ONE_MORE_BYTE is skipped without
                  touching any state.  Bytes in 0x80..0x9F that have no
                  case label of their own match none of the tests below
                  and are likewise ignored.  */
3126           if (c < 0)
3127             continue;
3128           if (c < 0x80)
3129             {
3130               if (composition_count >= 0)
3131                 composition_count++;
3132               single_shifting = 0;
3133               break;
3134             }
3135           if (c >= 0xA0)
3136             {
3137               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3138               found |= CATEGORY_MASK_ISO_8_1;
3139               /* Check the length of succeeding codes of the range
3140                  0xA0..0xFF.  If the byte length is even, we include
3141                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3142                  only when we are not single shifting.  */
3143               if (! single_shifting
3144                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3145                 {
                       /* LEN counts the run of bytes >= 0xA0, starting
                          with the one already read.  */
3146                   int len = 1;
3147                   while (src < src_end)
3148                     {
3149                       src_base = src;
3150                       ONE_MORE_BYTE (c);
3151                       if (c < 0xA0)
3152                         {
                               /* Un-read the byte that ended the run so
                                  the outer loop examines it.  */
3153                           src = src_base;
3154                           break;
3155                         }
3156                       len++;
3157                     }
3158
                       /* An odd-length run rejects ISO_8_2 only when it
                          ended before the end of the source; a run cut
                          off by the end of the source is given the
                          benefit of the doubt.  */
3159                   if (len & 1 && src < src_end)
3160                     {
3161                       rejected |= CATEGORY_MASK_ISO_8_2;
3162                       if (composition_count >= 0)
3163                         composition_count += len;
3164                     }
3165                   else
3166                     {
3167                       found |= CATEGORY_MASK_ISO_8_2;
3168                       if (composition_count >= 0)
3169                         composition_count += len / 2;
3170                     }
3171                 }
3172               break;
3173             }
3174         }
3175     }
       /* The loop ends only when every ISO category has been rejected.  */
3176   detect_info->rejected |= CATEGORY_MASK_ISO;
3177   return 0;
3178
3179  no_more_source:
3180   detect_info->rejected |= rejected;
3181   detect_info->found |= (found & ~rejected);
3182   return 1;
3183 }
3184
3185
/* Set designation state into CODING: designate to graphic register
   REG of `coding' (taken from the expansion site) the charset that
   ISO_CHARSET_TABLE gives for dimension DIM, chars-96 flag CHARS_96
   and final byte FINAL.

   CHARS_96 must be a modifiable lvalue.  It is set to -1 if the
   escape sequence should be kept, which happens in two cases:
     o FINAL is outside '0'..127, or ISO_CHARSET_TABLE yields a
       negative id, or the id fails SAFE_CHARSET_P; the designation of
       REG is then set to -2.
     o charset_ascii is designated to a REG whose previous designation
       was -2.

   With CODING_ISO_FLAG_USE_ROMAN set, charset_jisx0201_roman is
   designated as charset_ascii; with CODING_ISO_FLAG_USE_OLDJIS set,
   charset_jisx0208_1978 is designated as charset_jisx0208.

   FINAL is evaluated exactly once, so it may be any int expression.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int final_byte_ = (final);                                          \
    int id, prev;                                                       \
                                                                        \
    if (final_byte_ < '0' || final_byte_ >= 128                         \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final_byte_)) < 0)  \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      chars_96 = -1;                                                    \
  } while (0)
3218
3219
3220 /* Handle these composition sequences (ALT: alternate char):
3221
3222    (1) relative composition: ESC 0 CHAR ... ESC 1
3223    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3224    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3225    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3226
3227    When the start sequence (ESC 0/2/3/4) is found, this annotation
3228    header is produced.
3229
3230         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3231
3232    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3233    produced until the end sequence (ESC 1) is found:
3234
3235    (1) CHAR ... CHAR
3236    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3237    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3238    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3239
3240    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3241    annotation header are updated as below:
3242
3243    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3244    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3245    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3246    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3247
3248    If an error is found while composing, the annotation header is
3249    changed to:
3250
3251         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3252
3253    and the sequence [ -2 DECODED-RULE ] is changed to the original
3254    byte sequence as below:
3255         o the original byte sequence is B: [ B -1 ]
3256         o the original byte sequence is B1 B2: [ B1 B2 ]
3257    and the sequence [ -1 -1 ] is changed to the original byte
3258    sequence:
3259         [ ESC '0' ]
3260 */
3261
/* Turn the composition rule that starts with the byte C1 (taken from
   the expansion site) into its encoded form and assign it to RULE,
   which must be a modifiable lvalue.  The new format needs a second
   byte, which is fetched from the source with ONE_MORE_BYTE.  An
   invalid rule makes control jump to the label invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte;                                                \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81,                 \
                                             second_byte - 32))         \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);   \
        /* Distinguish it from the old format.  */                      \
        rule += 0x100;                                                  \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int first_ref = (rule) / 9;                                     \
        int second_ref = (rule) % 9;                                    \
                                                                        \
        /* The old format's value 4 is stored as 10.  */                \
        if (first_ref == 4)                                             \
          first_ref = 10;                                               \
        if (second_ref == 4)                                            \
          second_ref = 10;                                              \
        rule = COMPOSITION_ENCODE_RULE (first_ref, second_ref);         \
      }                                                                 \
  } while (0)
3290
/* Turn the encoded composition rule RULE back into byte form, storing
   the result in charbuf[idx] and charbuf[idx + 1]; `charbuf', `idx'
   and `new_chars' are taken from the expansion site.

   A rule below 0x100 is in the old format: one byte is stored,
   followed by -1, and new_chars grows by one.  Otherwise the rule is
   in the new format: two bytes are stored and new_chars grows by two.

   RULE is evaluated exactly once, before anything is stored, so it
   may be any int expression, including one that reads the charbuf
   slots written here.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded_rule_ = (rule);                                 \
    int gref = (encoded_rule_ % 0x100) / 12;                    \
    int nref = (encoded_rule_ % 0x100) % 12;                    \
                                                                \
    if (encoded_rule_ < 0x100)  /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3310
3311 /* Finish the current composition as invalid.  */
3312
3313 static int
3314 finish_composition (int *charbuf, struct composition_status *cmp_status)
3315 {
3316   int idx = - cmp_status->length;
3317   int new_chars;
3318
3319   /* Recover the original ESC sequence */
3320   charbuf[idx++] = ISO_CODE_ESC;
3321   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3322                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3323                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3324                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3325                     : '4');
3326   charbuf[idx++] = -2;
3327   charbuf[idx++] = 0;
3328   charbuf[idx++] = -1;
3329   new_chars = cmp_status->nchars;
3330   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3331     for (; idx < 0; idx++)
3332       {
3333         int elt = charbuf[idx];
3334
3335         if (elt == -2)
3336           {
3337             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3338             idx++;
3339           }
3340         else if (elt == -1)
3341           {
3342             charbuf[idx++] = ISO_CODE_ESC;
3343             charbuf[idx] = '0';
3344             new_chars += 2;
3345           }
3346       }
3347   cmp_status->state = COMPOSING_NO;
3348   return new_chars;
3349 }
3350
/* Do nothing unless a composition is in progress; otherwise finish it
   with finish_composition and advance char_offset by the value that
   call returns.  `cmp_status', `charbuf' and `char_offset' are taken
   from the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3357
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 met in state COMPOSING_COMPONENT_CHAR of an ESC 3 composition,
   or in state COMPOSING_COMPONENT_RULE of an ESC 4 composition, does
   not start a new composition: it only stores the marker [ -1 -1 ],
   adds 2 to cmp_status->length and switches to COMPOSING_CHAR.

   Otherwise any composition still in progress is finished by
   MAYBE_FINISH_COMPOSITION, cmp_status is reset for the new method
   (length MAX_ANNOTATION_LENGTH, no chars, no components) and this
   annotation sequence is produced now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   C1 is evaluated exactly once, so it may be any int expression.
   `cmp_status', `charbuf' and `coding' are taken from the expansion
   site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int start_code_ = (c1);                                                \
                                                                           \
    if (start_code_ == '0'                                                 \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method                                                 \
          = (start_code_ == '0' ? COMPOSITION_RELATIVE                     \
             : start_code_ == '2' ? COMPOSITION_WITH_RULE                  \
             : start_code_ == '3' ? COMPOSITION_WITH_ALTCHARS              \
             : COMPOSITION_WITH_RULE_ALTCHARS);                            \
        /* ESC 0 and ESC 2 start with composed chars, ESC 3 and ESC 4      \
           with component chars.  */                                       \
        cmp_status->state                                                  \
          = (start_code_ <= '2' ? COMPOSING_CHAR                           \
             : COMPOSING_COMPONENT_CHAR);                                  \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3398
3399
/* Handle composition end sequence ESC 1.

   The sequence is rejected (the composition is finished through
   MAYBE_FINISH_COMPOSITION and control jumps to invalid_code) when no
   composed char has been stored, or when "the state is
   COMPOSING_CHAR" and "the method is COMPOSITION_WITH_RULE" are
   either both true or both false.

   Otherwise the annotation header, which sits cmp_status->length
   elements before charbuf, is completed: its first element is lowered
   by the number of elements the components took (ncomps + 2 for
   COMPOSITION_WITH_ALTCHARS, ncomps * 3 for
   COMPOSITION_WITH_RULE_ALTCHARS), its third element receives nchars,
   char_offset advances by nchars, and the state returns to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int composing_char_ = cmp_status->state == COMPOSING_CHAR;          \
    int with_rule_ = cmp_status->method == COMPOSITION_WITH_RULE;       \
    int *header_;                                                       \
                                                                        \
    if (cmp_status->nchars == 0 || composing_char_ == with_rule_)       \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header_ = charbuf - cmp_status->length;                             \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header_[0] -= cmp_status->ncomps + 2;                             \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header_[0] -= cmp_status->ncomps * 3;                             \
    header_[2] = cmp_status->nchars;                                    \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3419
/* Append the pair [ -2 RULE ] to charbuf, count its two elements in
   cmp_status->length, and step cmp_status->state back by one.
   `charbuf' and `cmp_status' are taken from the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    *charbuf = -2;                      \
    charbuf++;                          \
    *charbuf = rule;                    \
    charbuf++;                          \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3429
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: the length grows by one, and nchars is counted
   in state COMPOSING_CHAR while ncomps is counted in any other state.
   The state then advances by one for COMPOSITION_WITH_RULE, and for
   COMPOSITION_WITH_RULE_ALTCHARS while in COMPOSING_COMPONENT_CHAR
   (presumably to the state that expects a rule; the enum order is not
   visible here).  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3446
3447
3448 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3449
3450 static void
3451 decode_coding_iso_2022 (struct coding_system *coding)
3452 {
3453   const unsigned char *src = coding->source + coding->consumed;
3454   const unsigned char *src_end = coding->source + coding->src_bytes;
3455   const unsigned char *src_base;
3456   int *charbuf = coding->charbuf + coding->charbuf_used;
3457   /* We may produce two annotations (charset and composition) in one
3458      loop and one more charset annotation at the end.  */
3459   int *charbuf_end
3460     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3461   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3462   bool multibytep = coding->src_multibyte;
3463   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3464   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3465   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3466   int charset_id_2, charset_id_3;
3467   struct charset *charset;
3468   int c;
3469   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3470   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3471   ptrdiff_t char_offset = coding->produced_char;
3472   ptrdiff_t last_offset = char_offset;
3473   int last_id = charset_ascii;
3474   bool eol_dos
3475     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3476   int byte_after_cr = -1;
3477   int i;
3478
3479   setup_iso_safe_charsets (attrs);
3480   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3481
3482   if (cmp_status->state != COMPOSING_NO)
3483     {
3484       if (charbuf_end - charbuf < cmp_status->length)
3485         abort ();
3486       for (i = 0; i < cmp_status->length; i++)
3487         *charbuf++ = cmp_status->carryover[i];
3488       coding->annotated = 1;
3489     }
3490
3491   while (1)
3492     {
3493       int c1, c2, c3;
3494
3495       src_base = src;
3496       consumed_chars_base = consumed_chars;
3497
3498       if (charbuf >= charbuf_end)
3499         {
3500           if (byte_after_cr >= 0)
3501             src_base--;
3502           break;
3503         }
3504
3505       if (byte_after_cr >= 0)
3506         c1 = byte_after_cr, byte_after_cr = -1;
3507       else
3508         ONE_MORE_BYTE (c1);
3509       if (c1 < 0)
3510         goto invalid_code;
3511
3512       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3513         {
3514           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3515           char_offset++;
3516           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3517           continue;
3518         }
3519
3520       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3521         {
3522           if (c1 == ISO_CODE_ESC)
3523             {
3524               if (src + 1 >= src_end)
3525                 goto no_more_source;
3526               *charbuf++ = ISO_CODE_ESC;
3527               char_offset++;
3528               if (src[0] == '%' && src[1] == '@')
3529                 {
3530                   src += 2;
3531                   consumed_chars += 2;
3532                   char_offset += 2;
3533                   /* We are sure charbuf can contain two more chars. */
3534                   *charbuf++ = '%';
3535                   *charbuf++ = '@';
3536                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3537                 }
3538             }
3539           else
3540             {
3541               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3542               char_offset++;
3543             }
3544           continue;
3545         }
3546
3547       if ((cmp_status->state == COMPOSING_RULE
3548            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3549           && c1 != ISO_CODE_ESC)
3550         {
3551           int rule;
3552
3553           DECODE_COMPOSITION_RULE (rule);
3554           STORE_COMPOSITION_RULE (rule);
3555           continue;
3556         }
3557
3558       /* We produce at most one character.  */
3559       switch (iso_code_class [c1])
3560         {
3561         case ISO_0x20_or_0x7F:
3562           if (charset_id_0 < 0
3563               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3564             /* This is SPACE or DEL.  */
3565             charset = CHARSET_FROM_ID (charset_ascii);
3566           else
3567             charset = CHARSET_FROM_ID (charset_id_0);
3568           break;
3569
3570         case ISO_graphic_plane_0:
3571           if (charset_id_0 < 0)
3572             charset = CHARSET_FROM_ID (charset_ascii);
3573           else
3574             charset = CHARSET_FROM_ID (charset_id_0);
3575           break;
3576
3577         case ISO_0xA0_or_0xFF:
3578           if (charset_id_1 < 0
3579               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3580               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3581             goto invalid_code;
3582           /* This is a graphic character, we fall down ... */
3583
3584         case ISO_graphic_plane_1:
3585           if (charset_id_1 < 0)
3586             goto invalid_code;
3587           charset = CHARSET_FROM_ID (charset_id_1);
3588           break;
3589
3590         case ISO_control_0:
3591           if (eol_dos && c1 == '\r')
3592             ONE_MORE_BYTE (byte_after_cr);
3593           MAYBE_FINISH_COMPOSITION ();
3594           charset = CHARSET_FROM_ID (charset_ascii);
3595           break;
3596
3597         case ISO_control_1:
3598           goto invalid_code;
3599
3600         case ISO_shift_out:
3601           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3602               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3603             goto invalid_code;
3604           CODING_ISO_INVOCATION (coding, 0) = 1;
3605           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3606           continue;
3607
3608         case ISO_shift_in:
3609           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3610             goto invalid_code;
3611           CODING_ISO_INVOCATION (coding, 0) = 0;
3612           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3613           continue;
3614
3615         case ISO_single_shift_2_7:
3616           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3617             goto invalid_code;
3618         case ISO_single_shift_2:
3619           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3620             goto invalid_code;
3621           /* SS2 is handled as an escape sequence of ESC 'N' */
3622           c1 = 'N';
3623           goto label_escape_sequence;
3624
3625         case ISO_single_shift_3:
3626           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3627             goto invalid_code;
3628           /* SS3 is handled as an escape sequence of ESC 'O' */
3629           c1 = 'O';
3630           goto label_escape_sequence;
3631
3632         case ISO_control_sequence_introducer:
3633           /* CSI is handled as an escape sequence of ESC '[' ...  */
3634           c1 = '[';
3635           goto label_escape_sequence;
3636
3637         case ISO_escape:
3638           ONE_MORE_BYTE (c1);
3639         label_escape_sequence:
3640           /* Escape sequences handled here are invocation,
3641              designation, direction specification, and character
3642              composition specification.  */
3643           switch (c1)
3644             {
3645             case '&':           /* revision of following character set */
3646               ONE_MORE_BYTE (c1);
3647               if (!(c1 >= '@' && c1 <= '~'))
3648                 goto invalid_code;
3649               ONE_MORE_BYTE (c1);
3650               if (c1 != ISO_CODE_ESC)
3651                 goto invalid_code;
3652               ONE_MORE_BYTE (c1);
3653               goto label_escape_sequence;
3654
3655             case '$':           /* designation of 2-byte character set */
3656               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3657                 goto invalid_code;
3658               {
3659                 int reg, chars96;
3660
3661                 ONE_MORE_BYTE (c1);
3662                 if (c1 >= '@' && c1 <= 'B')
3663                   {     /* designation of JISX0208.1978, GB2312.1980,
3664                            or JISX0208.1980 */
3665                     reg = 0, chars96 = 0;
3666                   }
3667                 else if (c1 >= 0x28 && c1 <= 0x2B)
3668                   { /* designation of DIMENSION2_CHARS94 character set */
3669                     reg = c1 - 0x28, chars96 = 0;
3670                     ONE_MORE_BYTE (c1);
3671                   }
3672                 else if (c1 >= 0x2C && c1 <= 0x2F)
3673                   { /* designation of DIMENSION2_CHARS96 character set */
3674                     reg = c1 - 0x2C, chars96 = 1;
3675                     ONE_MORE_BYTE (c1);
3676                   }
3677                 else
3678                   goto invalid_code;
3679                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3680                 /* We must update these variables now.  */
3681                 if (reg == 0)
3682                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3683                 else if (reg == 1)
3684                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3685                 if (chars96 < 0)
3686                   goto invalid_code;
3687               }
3688               continue;
3689
3690             case 'n':           /* invocation of locking-shift-2 */
3691               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3692                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3693                 goto invalid_code;
3694               CODING_ISO_INVOCATION (coding, 0) = 2;
3695               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3696               continue;
3697
3698             case 'o':           /* invocation of locking-shift-3 */
3699               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3700                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3701                 goto invalid_code;
3702               CODING_ISO_INVOCATION (coding, 0) = 3;
3703               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3704               continue;
3705
3706             case 'N':           /* invocation of single-shift-2 */
3707               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3708                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3709                 goto invalid_code;
3710               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3711               if (charset_id_2 < 0)
3712                 charset = CHARSET_FROM_ID (charset_ascii);
3713               else
3714                 charset = CHARSET_FROM_ID (charset_id_2);
3715               ONE_MORE_BYTE (c1);
3716               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3717                 goto invalid_code;
3718               break;
3719
3720             case 'O':           /* invocation of single-shift-3 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3723                 goto invalid_code;
3724               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3725               if (charset_id_3 < 0)
3726                 charset = CHARSET_FROM_ID (charset_ascii);
3727               else
3728                 charset = CHARSET_FROM_ID (charset_id_3);
3729               ONE_MORE_BYTE (c1);
3730               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3731                 goto invalid_code;
3732               break;
3733
3734             case '0': case '2': case '3': case '4': /* start composition */
3735               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3736                 goto invalid_code;
3737               if (last_id != charset_ascii)
3738                 {
3739                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3740                   last_id = charset_ascii;
3741                   last_offset = char_offset;
3742                 }
3743               DECODE_COMPOSITION_START (c1);
3744               continue;
3745
3746             case '1':           /* end composition */
3747               if (cmp_status->state == COMPOSING_NO)
3748                 goto invalid_code;
3749               DECODE_COMPOSITION_END ();
3750               continue;
3751
3752             case '[':           /* specification of direction */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3754                 goto invalid_code;
3755               /* For the moment, nested direction is not supported.
3756                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3757                  left-to-right, and nonzero means right-to-left.  */
3758               ONE_MORE_BYTE (c1);
3759               switch (c1)
3760                 {
3761                 case ']':       /* end of the current direction */
3762                   coding->mode &= ~CODING_MODE_DIRECTION;
3763
3764                 case '0':       /* end of the current direction */
3765                 case '1':       /* start of left-to-right direction */
3766                   ONE_MORE_BYTE (c1);
3767                   if (c1 == ']')
3768                     coding->mode &= ~CODING_MODE_DIRECTION;
3769                   else
3770                     goto invalid_code;
3771                   break;
3772
3773                 case '2':       /* start of right-to-left direction */
3774                   ONE_MORE_BYTE (c1);
3775                   if (c1 == ']')
3776                     coding->mode |= CODING_MODE_DIRECTION;
3777                   else
3778                     goto invalid_code;
3779                   break;
3780
3781                 default:
3782                   goto invalid_code;
3783                 }
3784               continue;
3785
3786             case '%':
3787               ONE_MORE_BYTE (c1);
3788               if (c1 == '/')
3789                 {
3790                   /* CTEXT extended segment:
3791                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3792                      We keep these bytes as is for the moment.
3793                      They may be decoded by post-read-conversion.  */
3794                   int dim, M, L;
3795                   int size;
3796
3797                   ONE_MORE_BYTE (dim);
3798                   if (dim < '0' || dim > '4')
3799                     goto invalid_code;
3800                   ONE_MORE_BYTE (M);
3801                   if (M < 128)
3802                     goto invalid_code;
3803                   ONE_MORE_BYTE (L);
3804                   if (L < 128)
3805                     goto invalid_code;
3806                   size = ((M - 128) * 128) + (L - 128);
3807                   if (charbuf + 6 > charbuf_end)
3808                     goto break_loop;
3809                   *charbuf++ = ISO_CODE_ESC;
3810                   *charbuf++ = '%';
3811                   *charbuf++ = '/';
3812                   *charbuf++ = dim;
3813                   *charbuf++ = BYTE8_TO_CHAR (M);
3814                   *charbuf++ = BYTE8_TO_CHAR (L);
3815                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3816                 }
3817               else if (c1 == 'G')
3818                 {
3819                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3820                      ESC % G --UTF-8-BYTES-- ESC % @
3821                      We keep these bytes as is for the moment.
3822                      They may be decoded by post-read-conversion.  */
3823                   if (charbuf + 3 > charbuf_end)
3824                     goto break_loop;
3825                   *charbuf++ = ISO_CODE_ESC;
3826                   *charbuf++ = '%';
3827                   *charbuf++ = 'G';
3828                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3829                 }
3830               else
3831                 goto invalid_code;
3832               continue;
3833               break;
3834
3835             default:
3836               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3837                 goto invalid_code;
3838               {
3839                 int reg, chars96;
3840
3841                 if (c1 >= 0x28 && c1 <= 0x2B)
3842                   { /* designation of DIMENSION1_CHARS94 character set */
3843                     reg = c1 - 0x28, chars96 = 0;
3844                     ONE_MORE_BYTE (c1);
3845                   }
3846                 else if (c1 >= 0x2C && c1 <= 0x2F)
3847                   { /* designation of DIMENSION1_CHARS96 character set */
3848                     reg = c1 - 0x2C, chars96 = 1;
3849                     ONE_MORE_BYTE (c1);
3850                   }
3851                 else
3852                   goto invalid_code;
3853                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3854                 /* We must update these variables now.  */
3855                 if (reg == 0)
3856                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3857                 else if (reg == 1)
3858                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3859                 if (chars96 < 0)
3860                   goto invalid_code;
3861               }
3862               continue;
3863             }
3864           break;
3865
3866         default:
3867           abort ();
3868         }
3869
3870       if (cmp_status->state == COMPOSING_NO
3871           && charset->id != charset_ascii
3872           && last_id != charset->id)
3873         {
3874           if (last_id != charset_ascii)
3875             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3876           last_id = charset->id;
3877           last_offset = char_offset;
3878         }
3879
3880       /* Now we know CHARSET and 1st position code C1 of a character.
3881          Produce a decoded character while getting 2nd and 3rd
3882          position codes C2, C3 if necessary.  */
3883       if (CHARSET_DIMENSION (charset) > 1)
3884         {
3885           ONE_MORE_BYTE (c2);
3886           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3887               || ((c1 & 0x80) != (c2 & 0x80)))
3888             /* C2 is not in a valid range.  */
3889             goto invalid_code;
3890           if (CHARSET_DIMENSION (charset) == 2)
3891             c1 = (c1 << 8) | c2;
3892           else
3893             {
3894               ONE_MORE_BYTE (c3);
3895               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3896                   || ((c1 & 0x80) != (c3 & 0x80)))
3897                 /* C3 is not in a valid range.  */
3898                 goto invalid_code;
3899               c1 = (c1 << 16) | (c2 << 8) | c2;
3900             }
3901         }
3902       c1 &= 0x7F7F7F;
3903       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3904       if (c < 0)
3905         {
3906           MAYBE_FINISH_COMPOSITION ();
3907           for (; src_base < src; src_base++, char_offset++)
3908             {
3909               if (ASCII_BYTE_P (*src_base))
3910                 *charbuf++ = *src_base;
3911               else
3912                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3913             }
3914         }
3915       else if (cmp_status->state == COMPOSING_NO)
3916         {
3917           *charbuf++ = c;
3918           char_offset++;
3919         }
3920       else if ((cmp_status->state == COMPOSING_CHAR
3921                 ? cmp_status->nchars
3922                 : cmp_status->ncomps)
3923                >= MAX_COMPOSITION_COMPONENTS)
3924         {
3925           /* Too long composition.  */
3926           MAYBE_FINISH_COMPOSITION ();
3927           *charbuf++ = c;
3928           char_offset++;
3929         }
3930       else
3931         STORE_COMPOSITION_CHAR (c);
3932       continue;
3933
3934     invalid_code:
3935       MAYBE_FINISH_COMPOSITION ();
3936       src = src_base;
3937       consumed_chars = consumed_chars_base;
3938       ONE_MORE_BYTE (c);
3939       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3940       char_offset++;
3941       coding->errors++;
3942       continue;
3943
3944     break_loop:
3945       break;
3946     }
3947
3948  no_more_source:
3949   if (cmp_status->state != COMPOSING_NO)
3950     {
3951       if (coding->mode & CODING_MODE_LAST_BLOCK)
3952         MAYBE_FINISH_COMPOSITION ();
3953       else
3954         {
3955           charbuf -= cmp_status->length;
3956           for (i = 0; i < cmp_status->length; i++)
3957             cmp_status->carryover[i] = charbuf[i];
3958         }
3959     }
3960   else if (last_id != charset_ascii)
3961     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3962   coding->consumed_char += consumed_chars_base;
3963   coding->consumed = src_base - coding->source;
3964   coding->charbuf_used = charbuf - coding->charbuf;
3965 }
3966
3967
3968 /* ISO2022 encoding stuff.  */
3969
3970 /*
3971    It is not enough to say just "ISO2022" on encoding, we have to
3972    specify more details.  In Emacs, each coding system of ISO2022
3973    variant has the following specifications:
3974         1. Initial designation to G0 thru G3.
3975         2. Allows short-form designation?
3976         3. ASCII should be designated to G0 before control characters?
3977         4. ASCII should be designated to G0 at end of line?
3978         5. 7-bit environment or 8-bit environment?
3979         6. Use locking-shift?
3980         7. Use Single-shift?
3981    And the following two are only for Japanese:
3982         8. Use ASCII in place of JIS0201-1976-Roman?
3983         9. Use JISX0208-1983 in place of JISX0208-1978?
3984    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3985    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3986    details.
3987 */
3988
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING_ISO_DESIGNATION (CODING, REG).

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative ISO revision number, the designation is preceded by
   `ESC & <'@' + revision>'.

   For a charset whose dimension is not 1 and which is a 94-charset,
   the intermediate byte is omitted (short-form designation) when REG
   is 0, <final-char> of CHARSET is '@', 'A', or 'B', and CODING does
   not set CODING_ISO_FLAG_LONG_FORM.

   The bytes are produced through the EMIT_* macros, so the variables
   those macros refer to must be in scope where this macro is used.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes for G0..G3, indexed by REG.  */               \
    const char *const inter_94_ = "()*+";                               \
    const char *const inter_96_ = ",-./";                               \
    unsigned char final_byte_ = CHARSET_ISO_FINAL (charset);            \
    int revision_ = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision_ = CHARSET_ISO_REVISION (charset);                       \
    if (revision_ >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision_);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (inter_96_[reg]);                       \
          }                                                             \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_byte_ < '@' || final_byte_ > 'B')             \
          {                                                             \
            /* Short form is not applicable; be explicit.  */           \
            EMIT_ONE_ASCII_BYTE (inter_94_[reg]);                       \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int inter_ = (CHARSET_ISO_CHARS_96 (charset)                    \
                      ? inter_96_[reg] : inter_94_[reg]);               \
        EMIT_ONE_ASCII_BYTE (inter_);                                   \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte_);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4036
4037
/* The following two macros produce the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3
   respectively) and then set CODING_ISO_SINGLE_SHIFTING (coding).
   When the coding system has CODING_ISO_FLAG_SEVEN_BITS set, the
   function is written as a two-byte escape sequence (ESC N or
   ESC O); otherwise the single control byte ISO_CODE_SS2 or
   ISO_CODE_SS3 is written.  Both use the variable `coding' of the
   place where they are expanded.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4060
4061
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING_ISO_INVOCATION (coding, 0) which graphic register
   is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          SI       graphic register 0
     ENCODE_SHIFT_OUT         SO       graphic register 1
     ENCODE_LOCKING_SHIFT_2   ESC n    graphic register 2
     ENCODE_LOCKING_SHIFT_3   ESC o    graphic register 3

   They use the variable `coding' of the place where they are
   expanded.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* The final byte must be 'o' here: `ESC n' is locking-shift-2, and
   emitting it while recording register 3 as invoked would make the
   produced bytes disagree with the encoder's own state.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4092
4093
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  C1 may be any integer expression;
   it is parenthesized wherever it is used.

   The macro uses the variables `coding', `dst' and `produced_chars'
   of the place where it is expanded.  The body loops until the
   character has actually been emitted: if CHARSET is not invoked to
   either graphic plane, encode_invocation_designation is called and
   the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift function was just emitted; it covers only     \
           this one character.  */                                      \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4136
4137
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when the coding system has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208-1978.  The macro uses the variables `coding',
   `dst' and `produced_chars' of the place where it is expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id_ = CHARSET_ID (charset);                                  \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && cs_id_ == charset_jisx0208)                                  \
      {                                                                 \
        cs_id_ = charset_jisx0208_1978;                                 \
        charset = CHARSET_FROM_ID (cs_id_);                             \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to this character only.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 0: use 7-bit codes.  */      \
    if (cs_id_ == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 1: set the 8th bit.  */      \
    if (cs_id_ == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke it   \
       (designating it to a graphic register first if need be), then    \
       go around the loop again to produce the character itself.  */    \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4180
4181
/* Produce codes for character C of character set CHARSET.  The code
   point of C in CHARSET is obtained with CODING_ENCODE_CHAR and then
   handed to ENCODE_ISO_CHARACTER_DIMENSION1, or, when the dimension
   of CHARSET is not 1, split into its high part (code >> 8) and low
   byte for ENCODE_ISO_CHARACTER_DIMENSION2.  The macro uses the
   variables `coding', `dst' and `dst_end' of the place where it is
   expanded.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4192
4193
4194 /* Produce designation and invocation codes at a place pointed by DST
4195    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4196    Return new DST.  */
4197
4198 static unsigned char *
4199 encode_invocation_designation (struct charset *charset,
4200                                struct coding_system *coding,
4201                                unsigned char *dst, ptrdiff_t *p_nchars)
4202 {
4203   bool multibytep = coding->dst_multibyte;
4204   ptrdiff_t produced_chars = *p_nchars;
4205   int reg;                      /* graphic register number */
4206   int id = CHARSET_ID (charset);
4207
4208   /* At first, check designations.  */
4209   for (reg = 0; reg < 4; reg++)
4210     if (id == CODING_ISO_DESIGNATION (coding, reg))
4211       break;
4212
4213   if (reg >= 4)
4214     {
4215       /* CHARSET is not yet designated to any graphic registers.  */
4216       /* At first check the requested designation.  */
4217       reg = CODING_ISO_REQUEST (coding, id);
4218       if (reg < 0)
4219         /* Since CHARSET requests no special designation, designate it
4220            to graphic register 0.  */
4221         reg = 0;
4222
4223       ENCODE_DESIGNATION (charset, reg, coding);
4224     }
4225
4226   if (CODING_ISO_INVOCATION (coding, 0) != reg
4227       && CODING_ISO_INVOCATION (coding, 1) != reg)
4228     {
4229       /* Since the graphic register REG is not invoked to any graphic
4230          planes, invoke it to graphic plane 0.  */
4231       switch (reg)
4232         {
4233         case 0:                 /* graphic register 0 */
4234           ENCODE_SHIFT_IN;
4235           break;
4236
4237         case 1:                 /* graphic register 1 */
4238           ENCODE_SHIFT_OUT;
4239           break;
4240
4241         case 2:                 /* graphic register 2 */
4242           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4243             ENCODE_SINGLE_SHIFT_2;
4244           else
4245             ENCODE_LOCKING_SHIFT_2;
4246           break;
4247
4248         case 3:                 /* graphic register 3 */
4249           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4250             ENCODE_SINGLE_SHIFT_3;
4251           else
4252             ENCODE_LOCKING_SHIFT_3;
4253           break;
4254         }
4255     }
4256
4257   *p_nchars = produced_chars;
4258   return dst;
4259 }
4260
4261
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not
   currently have register 0 invoked, followed by a designation for
   every graphic register that has an initial charset
   (CODING_ISO_INITIAL >= 0) different from the one currently
   designated to it.  Uses the variable `coding' of the place where it
   is expanded.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *initial_cs_;                                        \
    int gr_ = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (gr_ < 4)                                                     \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, gr_) >= 0                       \
            && (CODING_ISO_DESIGNATION (coding, gr_)                    \
                != CODING_ISO_INITIAL (coding, gr_)))                   \
          {                                                             \
            initial_cs_                                                 \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr_));     \
            ENCODE_DESIGNATION (initial_cs_, gr_, coding);              \
          }                                                             \
        gr_++;                                                          \
      }                                                                 \
  } while (0)
4280
4281
4282 /* Produce designation sequences of charsets in the line started from
4283    CHARBUF to a place pointed by DST, and return the number of
4284    produced bytes.  DST should not directly point a buffer text area
4285    which may be relocated by char_charset call.
4286
4287    If the current block ends before any end-of-line, we may fail to
4288    find all the necessary designations.  */
4289
4290 static ptrdiff_t
4291 encode_designation_at_bol (struct coding_system *coding,
4292                            int *charbuf, int *charbuf_end,
4293                            unsigned char *dst)
4294 {
4295   unsigned char *orig = dst;
4296   struct charset *charset;
4297   /* Table of charsets to be designated to each graphic register.  */
4298   int r[4];
4299   int c, found = 0, reg;
4300   ptrdiff_t produced_chars = 0;
4301   bool multibytep = coding->dst_multibyte;
4302   Lisp_Object attrs;
4303   Lisp_Object charset_list;
4304
4305   attrs = CODING_ID_ATTRS (coding->id);
4306   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4307   if (EQ (charset_list, Qiso_2022))
4308     charset_list = Viso_2022_charset_list;
4309
4310   for (reg = 0; reg < 4; reg++)
4311     r[reg] = -1;
4312
4313   while (charbuf < charbuf_end && found < 4)
4314     {
4315       int id;
4316
4317       c = *charbuf++;
4318       if (c == '\n')
4319         break;
4320       charset = char_charset (c, charset_list, NULL);
4321       id = CHARSET_ID (charset);
4322       reg = CODING_ISO_REQUEST (coding, id);
4323       if (reg >= 0 && r[reg] < 0)
4324         {
4325           found++;
4326           r[reg] = id;
4327         }
4328     }
4329
4330   if (found)
4331     {
4332       for (reg = 0; reg < 4; reg++)
4333         if (r[reg] >= 0
4334             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4335           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4336     }
4337
4338   return dst - orig;
4339 }
4340
4341 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
/* Encode the elements of CODING->charbuf (CODING->charbuf_used of
   them) as ISO 2022 bytes, writing them to CODING->destination
   starting at offset CODING->produced.

   On return CODING->produced and CODING->produced_char have been
   updated, CODING_ISO_BOL (coding) records whether a
   beginning-of-line designation is still pending, and the conversion
   result has been recorded as CODING_RESULT_SUCCESS.  The function
   always returns 0.

   Several locals (DST, DST_END, CHARBUF, CHARBUF_END, PRODUCED_CHARS,
   MULTIBYTEP) are used by name inside the ENCODE_* macros visible in
   this file, and presumably inside the EMIT_* and ASSURE_DESTINATION
   macros as well, so they must not be renamed.  */
4342
4343 static bool
4344 encode_coding_iso_2022 (struct coding_system *coding)
4345 {
4346   bool multibytep = coding->dst_multibyte;
4347   int *charbuf = coding->charbuf;
4348   int *charbuf_end = charbuf + coding->charbuf_used;
4349   unsigned char *dst = coding->destination + coding->produced;
4350   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount of room requested through ASSURE_DESTINATION before each
          element of CHARBUF is handled.  */
4351   int safe_room = 16;
       /* True if designation sequences must be produced before the next
          character: the coding system designates at beginning of line,
          and the state saved in CODING_ISO_BOL (see the end of this
          function) says such a designation is pending.  */
4352   bool bol_designation
4353     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4354        && CODING_ISO_BOL (coding));
4355   ptrdiff_t produced_chars = 0;
4356   Lisp_Object attrs, eol_type, charset_list;
4357   bool ascii_compatible;
4358   int c;
       /* Charset id taken from the most recent charset annotation, or -1
          if there is none or it is not in CHARSET_LIST.  */
4359   int preferred_charset_id = -1;
4360
4361   CODING_GET_INFO (coding, attrs, charset_list);
4362   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued EOL type is treated as Qunix.  */
4363   if (VECTORP (eol_type))
4364     eol_type = Qunix;
4365
4366   setup_iso_safe_charsets (attrs);
4367   /* Charset list may have been changed.  */
4368   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4369   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4370
       /* ASCII characters may be written as is only if the coding system
          is marked ASCII compatible and uses neither designation nor
          locking shift.  */
4371   ascii_compatible
4372     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4373        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4374                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4375
4376   while (charbuf < charbuf_end)
4377     {
4378       ASSURE_DESTINATION (safe_room);
4379
4380       if (bol_designation)
4381         {
4382           /* We have to produce designation sequences if any now.  */
               /* They are built in the local DESIG_BUF rather than at DST.
                  If a charset map was loaded meanwhile,
                  coding_change_destination is consulted and DST and
                  DST_END are shifted by the offset it returns
                  (presumably because the destination was relocated)
                  before the bytes are copied.  */
4383           unsigned char desig_buf[16];
4384           int nbytes;
4385           ptrdiff_t offset;
4386
4387           charset_map_loaded = 0;
4388           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4389                                               desig_buf);
4390           if (charset_map_loaded
4391               && (offset = coding_change_destination (coding)))
4392             {
4393               dst += offset;
4394               dst_end += offset;
4395             }
4396           memcpy (dst, desig_buf, nbytes);
4397           dst += nbytes;
4398           /* We are sure that designation sequences are all ASCII bytes.  */
4399           produced_chars += nbytes;
4400           bol_designation = 0;
4401           ASSURE_DESTINATION (safe_room);
4402         }
4403
4404       c = *charbuf++;
4405
4406       if (c < 0)
4407         {
4408           /* Handle an annotation.  */
               /* A negative element starts an annotation of -C elements
                  in total, this one included.  The element after it
                  tells the kind of annotation; for a charset annotation
                  the charset id is read from CHARBUF[2].  */
4409           switch (*charbuf)
4410             {
4411             case CODING_ANNOTATE_COMPOSITION_MASK:
4412               /* Not yet implemented.  */
4413               break;
4414             case CODING_ANNOTATE_CHARSET_MASK:
4415               preferred_charset_id = charbuf[2];
                   /* Ignore a preferred charset that this coding system
                      does not list.  */
4416               if (preferred_charset_id >= 0
4417                   && NILP (Fmemq (make_number (preferred_charset_id),
4418                                   charset_list)))
4419                 preferred_charset_id = -1;
4420               break;
4421             default:
4422               abort ();
4423             }
               /* Skip the rest of the annotation.  */
4424           charbuf += -c - 1;
4425           continue;
4426         }
4427
4428       /* Now encode the character C.  */
4429       if (c < 0x20 || c == 0x7F)
4430         {
               /* C is a control character below 0x20, or 0x7F.  */
4431           if (c == '\n'
4432               || (c == '\r' && EQ (eol_type, Qmac)))
4433             {
                   /* End of line: reset planes and registers and/or
                      forget the current designations as the flags
                      require, and arrange for designation at the
                      beginning of the next line.  */
4434               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4435                 ENCODE_RESET_PLANE_AND_REGISTER ();
4436               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4437                 {
4438                   int i;
4439
4440                   for (i = 0; i < 4; i++)
4441                     CODING_ISO_DESIGNATION (coding, i)
4442                       = CODING_ISO_INITIAL (coding, i);
4443                 }
4444               bol_designation = ((CODING_ISO_FLAGS (coding)
4445                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4446                                  != 0);
4447             }
4448           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4449             ENCODE_RESET_PLANE_AND_REGISTER ();
4450           EMIT_ONE_ASCII_BYTE (c);
4451         }
4452       else if (ASCII_CHAR_P (c))
4453         {
4454           if (ascii_compatible)
4455             EMIT_ONE_ASCII_BYTE (c);
4456           else
4457             {
                   /* ASCII must go through the designation and invocation
                      machinery like any other charset.  */
4458               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4459               ENCODE_ISO_CHARACTER (charset, c);
4460             }
4461         }
4462       else if (CHAR_BYTE8_P (c))
4463         {
               /* Write the byte that C stands for without any
                  designation.  */
4464           c = CHAR_TO_BYTE8 (c);
4465           EMIT_ONE_BYTE (c);
4466         }
4467       else
4468         {
4469           struct charset *charset;
4470
               /* Try the preferred charset first, and look C up in
                  CHARSET_LIST if that charset does not contain C.  */
4471           if (preferred_charset_id >= 0)
4472             {
4473               bool result;
4474
4475               charset = CHARSET_FROM_ID (preferred_charset_id);
4476               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4477               if (! result)
4478                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4479                                      NULL, charset);
4480             }
4481           else
4482             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4483                                  NULL, charset);
4484           if (!charset)
4485             {
                   /* No charset was found for C.  Substitute
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION (as ASCII) in
                      safe-encoding mode, else the coding system's
                      default character.  */
4486               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4487                 {
4488                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4489                   charset = CHARSET_FROM_ID (charset_ascii);
4490                 }
4491               else
4492                 {
4493                   c = coding->default_char;
4494                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4495                                        charset_list, NULL, charset);
4496                 }
4497             }
4498           ENCODE_ISO_CHARACTER (charset, c);
4499         }
4500     }
4501
       /* At the end of the last block, reset planes and registers if the
          coding system resets them at end of line.  */
4502   if (coding->mode & CODING_MODE_LAST_BLOCK
4503       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4504     {
4505       ASSURE_DESTINATION (safe_room);
4506       ENCODE_RESET_PLANE_AND_REGISTER ();
4507     }
4508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Save the pending beginning-of-line state for the next call.  */
4509   CODING_ISO_BOL (coding) = bol_designation;
4510   coding->produced_char += produced_chars;
4511   coding->produced = dst - coding->destination;
4512   return 0;
4513 }
4514
4515 \f
4516 /*** 8. SJIS and BIG5 handlers ***/
4517
4518 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4519    quite widely.  So, for the moment, Emacs supports them directly in
4520    C code.  In the future, however, they may be supported only by CCL.  */
4521
4522 /* SJIS is a coding system encoding three character sets: ASCII, right
4523    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4524    as is.  A character of charset katakana-jisx0201 is encoded by
4525    "position-code + 0x80".  A character of charset japanese-jisx0208
4526    is encoded in two bytes, with its two position-codes divided and
4527    shifted so that they fit in the ranges below.
4528
4529    --- CODE RANGE of SJIS ---
4530    (character set)      (range)
4531    ASCII                0x00 .. 0x7F
4532    KATAKANA-JISX0201    0xA0 .. 0xDF
4533    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4534             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4535    -------------------------------
4536
4537 */
4538
4539 /* BIG5 is a coding system encoding two character sets: ASCII and
4540    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4541    character set and is encoded in two bytes.
4542
4543    --- CODE RANGE of BIG5 ---
4544    (character set)      (range)
4545    ASCII                0x00 .. 0x7F
4546    Big5 (1st byte)      0xA1 .. 0xFE
4547         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4548    --------------------------
4549
4550   */
4551
4552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4553    Return true if a text is encoded in SJIS.
        Set CATEGORY_MASK_SJIS in DETECT_INFO->checked, then scan the
        source of CODING past its first CODING->head_ascii bytes.  On a
        byte sequence that is not accepted as SJIS, set
        CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return false.
        If the source is exhausted first, merge the mask collected so
        far into DETECT_INFO->found and return true; that mask is still
        zero when no byte >= 0x80 was accepted.  The exception is a
        source that ends inside a two-byte code while CODING is in
        CODING_MODE_LAST_BLOCK mode: that is rejected as well.  */
4554
4555 static bool
4556 detect_coding_sjis (struct coding_system *coding,
4557                     struct coding_detection_info *detect_info)
4558 {
4559   const unsigned char *src = coding->source, *src_base;
4560   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced directly
          below; presumably the ONE_MORE_BYTE macro, which is defined
          outside this part of the file, uses them.  */
4561   bool multibytep = coding->src_multibyte;
4562   ptrdiff_t consumed_chars = 0;
4563   int found = 0;
4564   int c;
4565   Lisp_Object attrs, charset_list;
4566   int max_first_byte_of_2_byte_code;
4567
4568   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list with more than three entries widens the range
          of accepted lead bytes from 0xE0..0xEF to 0xE0..0xFC.
          decode_coding_sjis uses a fourth entry as a second kanji
          charset.  */
4569   max_first_byte_of_2_byte_code
4570     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4571
4572   detect_info->checked |= CATEGORY_MASK_SJIS;
4573   /* A coding system of this category is always ASCII compatible.  */
4574   src += coding->head_ascii;
4575
4576   while (1)
4577     {
           /* Remember where this code starts.  ONE_MORE_BYTE presumably
              stores the next source byte in C and jumps to
              no_more_source when the source is exhausted.  */
4578       src_base = src;
4579       ONE_MORE_BYTE (c);
4580       if (c < 0x80)
4581         continue;
4582       if ((c >= 0x81 && c <= 0x9F)
4583           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4584         {
               /* Lead byte of a two-byte code: the second byte must be
                  in 0x40..0xFC and must not be 0x7F.  */
4585           ONE_MORE_BYTE (c);
4586           if (c < 0x40 || c == 0x7F || c > 0xFC)
4587             break;
4588           found = CATEGORY_MASK_SJIS;
4589         }
           /* A single byte in 0xA0..0xDF is accepted.
              NOTE(review): decode_coding_sjis treats 0xA0 as an invalid
              code while this test accepts it -- confirm this is
              intended.  */
4590       else if (c >= 0xA0 && c < 0xE0)
4591         found = CATEGORY_MASK_SJIS;
4592       else
4593         break;
4594     }
       /* Reached only through `break': the text is not SJIS.  */
4595   detect_info->rejected |= CATEGORY_MASK_SJIS;
4596   return 0;
4597
4598  no_more_source:
       /* SRC_BASE < SRC presumably means the source ran out after the
          lead byte of a two-byte code.  That is fatal only if no more
          data will follow.  */
4599   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4600     {
4601       detect_info->rejected |= CATEGORY_MASK_SJIS;
4602       return 0;
4603     }
4604   detect_info->found |= found;
4605   return 1;
4606 }
4607
4608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4609    Return true if a text is encoded in BIG5.
        Set CATEGORY_MASK_BIG5 in DETECT_INFO->checked, then scan the
        source of CODING past its first CODING->head_ascii bytes.  A
        byte in 0x80..0xA0 in lead position sets CATEGORY_MASK_BIG5 in
        DETECT_INFO->rejected and returns false.  An unacceptable second
        byte also returns false, but leaves DETECT_INFO->rejected
        untouched.  If the source is exhausted first, merge the mask
        collected so far into DETECT_INFO->found and return true, unless
        the source ends inside a two-byte code while CODING is in
        CODING_MODE_LAST_BLOCK mode, which is rejected.  */
4610
4611 static bool
4612 detect_coding_big5 (struct coding_system *coding,
4613                     struct coding_detection_info *detect_info)
4614 {
4615   const unsigned char *src = coding->source, *src_base;
4616   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced directly
          below; presumably the ONE_MORE_BYTE macro, which is defined
          outside this part of the file, uses them.  */
4617   bool multibytep = coding->src_multibyte;
4618   ptrdiff_t consumed_chars = 0;
4619   int found = 0;
4620   int c;
4621
4622   detect_info->checked |= CATEGORY_MASK_BIG5;
4623   /* A coding system of this category is always ASCII compatible.  */
4624   src += coding->head_ascii;
4625
4626   while (1)
4627     {
           /* Remember where this code starts.  ONE_MORE_BYTE presumably
              stores the next source byte in C and jumps to
              no_more_source when the source is exhausted.  */
4628       src_base = src;
4629       ONE_MORE_BYTE (c);
4630       if (c < 0x80)
4631         continue;
4632       if (c >= 0xA1)
4633         {
               /* Lead byte of a two-byte code: the second byte must be
                  at least 0x40 and outside 0x7F..0xA0.
                  NOTE(review): this path returns without setting
                  CATEGORY_MASK_BIG5 in DETECT_INFO->rejected, unlike
                  the `break' used by detect_coding_sjis for a bad
                  second byte -- confirm this is intended.  Also, 0xFF
                  is accepted here as either byte, whereas
                  decode_coding_big5 treats bytes above 0xFE as
                  invalid.  */
4634           ONE_MORE_BYTE (c);
4635           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4636             return 0;
4637           found = CATEGORY_MASK_BIG5;
4638         }
4639       else
4640         break;
4641     }
       /* Reached only through `break': a byte in 0x80..0xA0 appeared
          in lead position.  */
4642   detect_info->rejected |= CATEGORY_MASK_BIG5;
4643   return 0;
4644
4645  no_more_source:
       /* SRC_BASE < SRC presumably means the source ran out after the
          lead byte of a two-byte code.  That is fatal only if no more
          data will follow.  */
4646   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4647     {
4648       detect_info->rejected |= CATEGORY_MASK_BIG5;
4649       return 0;
4650     }
4651   detect_info->found |= found;
4652   return 1;
4653 }
4654
4655 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the SJIS bytes of CODING's source, starting at offset
        CODING->consumed, and append the resulting characters to
        CODING->charbuf.  The charsets are taken from the charset list
        of CODING.  An invalid byte sequence contributes its first byte
        as a raw 8-bit character and increments CODING->errors.  On
        return, CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  */
4656
4657 static void
4658 decode_coding_sjis (struct coding_system *coding)
4659 {
4660   const unsigned char *src = coding->source + coding->consumed;
4661   const unsigned char *src_end = coding->source + coding->src_bytes;
4662   const unsigned char *src_base;
4663   int *charbuf = coding->charbuf + coding->charbuf_used;
4664   /* We may produce one charset annotation in one loop and one more at
4665      the end.  */
4666   int *charbuf_end
4667     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4668   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4669   bool multibytep = coding->src_multibyte;
4670   struct charset *charset_roman, *charset_kanji, *charset_kana;
4671   struct charset *charset_kanji2;
4672   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts produced characters.  LAST_OFFSET and
          LAST_ID describe the current run of characters of one
          non-ASCII charset, for the charset annotations.  */
4673   ptrdiff_t char_offset = coding->produced_char;
4674   ptrdiff_t last_offset = char_offset;
4675   int last_id = charset_ascii;
       /* With DOS-style EOL, the byte following a CR is read ahead into
          BYTE_AFTER_CR; -1 means that no such byte is pending.  */
4676   bool eol_dos
4677     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4678   int byte_after_cr = -1;
4679
4680   CODING_GET_INFO (coding, attrs, charset_list);
4681
       /* The charset list holds, in order, the roman, kana and kanji
          charsets, optionally followed by a second kanji charset.  */
4682   val = charset_list;
4683   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4684   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4685   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4686   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4687
4688   while (1)
4689     {
4690       int c, c1;
4691       struct charset *charset;
4692
4693       src_base = src;
4694       consumed_chars_base = consumed_chars;
4695
           /* Stop when CHARBUF is full.  A pending look-ahead byte is
              given back by moving SRC_BASE before it, so that
              CODING->consumed, computed from SRC_BASE below, excludes
              it.  */
4696       if (charbuf >= charbuf_end)
4697         {
4698           if (byte_after_cr >= 0)
4699             src_base--;
4700           break;
4701         }
4702
4703       if (byte_after_cr >= 0)
4704         c = byte_after_cr, byte_after_cr = -1;
4705       else
4706         ONE_MORE_BYTE (c);
4707       if (c < 0)
4708         goto invalid_code;
4709       if (c < 0x80)
4710         {
               /* Read the byte after a CR right away.  If none is
                  available, ONE_MORE_BYTE presumably jumps to
                  no_more_source (the macro is defined outside this part
                  of the file); SRC_BASE then still points at the CR, so
                  the CR is not counted as consumed.  */
4711           if (eol_dos && c == '\r')
4712             ONE_MORE_BYTE (byte_after_cr);
4713           charset = charset_roman;
4714         }
4715       else if (c == 0x80 || c == 0xA0)
4716         goto invalid_code;
4717       else if (c >= 0xA1 && c <= 0xDF)
4718         {
4719           /* SJIS -> JISX0201-Kana */
4720           c &= 0x7F;
4721           charset = charset_kana;
4722         }
           /* Here C is in 0x81..0x9F or 0xE0..0xEF: the first byte of a
              two-byte code.  */
4723       else if (c <= 0xEF)
4724         {
4725           /* SJIS -> JISX0208 */
4726           ONE_MORE_BYTE (c1);
4727           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4728             goto invalid_code;
4729           c = (c << 8) | c1;
4730           SJIS_TO_JIS (c);
4731           charset = charset_kanji;
4732         }
           /* Lead bytes 0xF0..0xFC are accepted only when the charset
              list has a fourth entry.  */
4733       else if (c <= 0xFC && charset_kanji2)
4734         {
4735           /* SJIS -> JISX0213-2 */
4736           ONE_MORE_BYTE (c1);
4737           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4738             goto invalid_code;
4739           c = (c << 8) | c1;
4740           SJIS_TO_JIS2 (c);
4741           charset = charset_kanji2;
4742         }
4743       else
4744         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the
              previous non-ASCII run: ADD_CHARSET_DATA presumably
              appends an annotation saying that the preceding
              CHAR_OFFSET - LAST_OFFSET characters belong to charset
              LAST_ID.  Then start a new run here.  */
4745       if (charset->id != charset_ascii
4746           && last_id != charset->id)
4747         {
4748           if (last_id != charset_ascii)
4749             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4750           last_id = charset->id;
4751           last_offset = char_offset;
4752         }
           /* Presumably converts code C of CHARSET into a character and
              stores it back into C; CODING_DECODE_CHAR is defined
              outside this part of the file.  */
4753       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4754       *charbuf++ = c;
4755       char_offset++;
4756       continue;
4757
4758     invalid_code:
           /* Rewind to the start of the offending sequence and pass its
              first byte through: as BYTE8_TO_CHAR (C), or as -C when
              ONE_MORE_BYTE yielded a negative value.  Count one
              error.  */
4759       src = src_base;
4760       consumed_chars = consumed_chars_base;
4761       ONE_MORE_BYTE (c);
4762       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4763       char_offset++;
4764       coding->errors++;
4765     }
4766
4767  no_more_source:
       /* Close the last open charset run, then record how much was
          consumed and produced.  Only input before SRC_BASE counts as
          consumed, so an incomplete trailing sequence is left for a
          later call.  */
4768   if (last_id != charset_ascii)
4769     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4770   coding->consumed_char += consumed_chars_base;
4771   coding->consumed = src_base - coding->source;
4772   coding->charbuf_used = charbuf - coding->charbuf;
4773 }
4774
     /* Decode the BIG5 bytes of CODING's source, starting at offset
        CODING->consumed, and append the resulting characters to
        CODING->charbuf.  The first two entries of CODING's charset list
        are used as the roman and the Big5 charsets.  An invalid byte
        sequence contributes its first byte as a raw 8-bit character and
        increments CODING->errors.  On return, CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated.  */
4775 static void
4776 decode_coding_big5 (struct coding_system *coding)
4777 {
4778   const unsigned char *src = coding->source + coding->consumed;
4779   const unsigned char *src_end = coding->source + coding->src_bytes;
4780   const unsigned char *src_base;
4781   int *charbuf = coding->charbuf + coding->charbuf_used;
4782   /* We may produce one charset annotation in one loop and one more at
4783      the end.  */
4784   int *charbuf_end
4785     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4786   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4787   bool multibytep = coding->src_multibyte;
4788   struct charset *charset_roman, *charset_big5;
4789   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts produced characters.  LAST_OFFSET and
          LAST_ID describe the current run of characters of one
          non-ASCII charset, for the charset annotations.  */
4790   ptrdiff_t char_offset = coding->produced_char;
4791   ptrdiff_t last_offset = char_offset;
4792   int last_id = charset_ascii;
       /* With DOS-style EOL, the byte following a CR is read ahead into
          BYTE_AFTER_CR; -1 means that no such byte is pending.  */
4793   bool eol_dos
4794     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4795   int byte_after_cr = -1;
4796
4797   CODING_GET_INFO (coding, attrs, charset_list);
4798   val = charset_list;
4799   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4800   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4801
4802   while (1)
4803     {
4804       int c, c1;
4805       struct charset *charset;
4806
4807       src_base = src;
4808       consumed_chars_base = consumed_chars;
4809
           /* Stop when CHARBUF is full.  A pending look-ahead byte is
              given back by moving SRC_BASE before it, so that
              CODING->consumed, computed from SRC_BASE below, excludes
              it.  */
4810       if (charbuf >= charbuf_end)
4811         {
4812           if (byte_after_cr >= 0)
4813             src_base--;
4814           break;
4815         }
4816
4817       if (byte_after_cr >= 0)
4818         c = byte_after_cr, byte_after_cr = -1;
4819       else
4820         ONE_MORE_BYTE (c);
4821
4822       if (c < 0)
4823         goto invalid_code;
4824       if (c < 0x80)
4825         {
               /* Read the byte after a CR right away.  If none is
                  available, ONE_MORE_BYTE presumably jumps to
                  no_more_source (the macro is defined outside this part
                  of the file); SRC_BASE then still points at the CR, so
                  the CR is not counted as consumed.  */
4826           if (eol_dos && c == '\r')
4827             ONE_MORE_BYTE (byte_after_cr);
4828           charset = charset_roman;
4829         }
4830       else
4831         {
4832           /* BIG5 -> Big5 */
               /* The first byte must be in 0xA1..0xFE and the second in
                  0x40..0x7E or 0xA1..0xFE; the code is the two bytes
                  combined, first byte high.  */
4833           if (c < 0xA1 || c > 0xFE)
4834             goto invalid_code;
4835           ONE_MORE_BYTE (c1);
4836           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4837             goto invalid_code;
4838           c = c << 8 | c1;
4839           charset = charset_big5;
4840         }
           /* On a switch to a different non-ASCII charset, close the
              previous non-ASCII run: ADD_CHARSET_DATA presumably
              appends an annotation saying that the preceding
              CHAR_OFFSET - LAST_OFFSET characters belong to charset
              LAST_ID.  Then start a new run here.  */
4841       if (charset->id != charset_ascii
4842           && last_id != charset->id)
4843         {
4844           if (last_id != charset_ascii)
4845             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4846           last_id = charset->id;
4847           last_offset = char_offset;
4848         }
           /* Presumably converts code C of CHARSET into a character and
              stores it back into C; CODING_DECODE_CHAR is defined
              outside this part of the file.  */
4849       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4850       *charbuf++ = c;
4851       char_offset++;
4852       continue;
4853
4854     invalid_code:
           /* Rewind to the start of the offending sequence and pass its
              first byte through: as BYTE8_TO_CHAR (C), or as -C when
              ONE_MORE_BYTE yielded a negative value.  Count one
              error.  */
4855       src = src_base;
4856       consumed_chars = consumed_chars_base;
4857       ONE_MORE_BYTE (c);
4858       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4859       char_offset++;
4860       coding->errors++;
4861     }
4862
4863  no_more_source:
       /* Close the last open charset run, then record how much was
          consumed and produced.  Only input before SRC_BASE counts as
          consumed, so an incomplete trailing sequence is left for a
          later call.  */
4864   if (last_id != charset_ascii)
4865     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4866   coding->consumed_char += consumed_chars_base;
4867   coding->consumed = src_base - coding->source;
4868   coding->charbuf_used = charbuf - coding->charbuf;
4869 }
4870
4871 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4872    This function can encode charsets `ascii', `katakana-jisx0201',
4873    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4874    are sure that all these charsets are registered as official charset
4875    (i.e. do not have extended leading-codes).  Characters of other
4876    charsets are produced without any encoding.
        NOTE(review): the code below does not check charset names; it
        takes the kana, kanji and optional second kanji charsets from
        the second and following entries of CODING's charset list --
        confirm that the list of names above is still accurate.
        Encode the characters in CODING->charbuf and append the bytes to
        CODING's destination.  A character found in none of the listed
        charsets is replaced by CODING->default_char, or handled through
        CODING_INHIBIT_CHARACTER_SUBSTITUTION when CODING is in
        CODING_MODE_SAFE_ENCODING mode.  Record
        CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
4877
4878 static bool
4879 encode_coding_sjis (struct coding_system *coding)
4880 {
4881   bool multibytep = coding->dst_multibyte;
4882   int *charbuf = coding->charbuf;
4883   int *charbuf_end = charbuf + coding->charbuf_used;
4884   unsigned char *dst = coding->destination + coding->produced;
4885   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character; that macro
          is defined outside this part of the file and presumably makes
          sure that many destination bytes are available.  */
4886   int safe_room = 4;
4887   ptrdiff_t produced_chars = 0;
4888   Lisp_Object attrs, charset_list, val;
4889   bool ascii_compatible;
4890   struct charset *charset_kanji, *charset_kana;
4891   struct charset *charset_kanji2;
4892   int c;
4893
4894   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first entry of the charset list; the following ones
          are the kana and kanji charsets, optionally followed by a
          second kanji charset.  */
4895   val = XCDR (charset_list);
4896   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4897   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4898   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4899
4900   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4901
4902   while (charbuf < charbuf_end)
4903     {
4904       ASSURE_DESTINATION (safe_room);
4905       c = *charbuf++;
4906       /* Now encode the character C.  */
4907       if (ASCII_CHAR_P (c) && ascii_compatible)
4908         EMIT_ONE_ASCII_BYTE (c);
4909       else if (CHAR_BYTE8_P (c))
4910         {
               /* A raw 8-bit character is emitted as its byte value.  */
4911           c = CHAR_TO_BYTE8 (c);
4912           EMIT_ONE_BYTE (c);
4913         }
4914       else
4915         {
4916           unsigned code;
4917           struct charset *charset;
               /* Presumably looks C up in the charsets of CHARSET_LIST
                  and sets CHARSET and CODE accordingly;
                  CODING_CHAR_CHARSET is defined outside this part of
                  the file.  */
4918           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4919                                &code, charset);
4920
               /* CHARSET is null: no charset was found for C.  */
4921           if (!charset)
4922             {
4923               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4924                 {
4925                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4926                   charset = CHARSET_FROM_ID (charset_ascii);
4927                 }
4928               else
4929                 {
                       /* Substitute the default character and look it
                          up instead.  */
4930                   c = coding->default_char;
4931                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4932                                        charset_list, &code, charset);
4933                 }
4934             }
4935           if (code == CHARSET_INVALID_CODE (charset))
4936             abort ();
4937           if (charset == charset_kanji)
4938             {
4939               int c1, c2;
4940               JIS_TO_SJIS (code);
4941               c1 = code >> 8, c2 = code & 0xFF;
4942               EMIT_TWO_BYTES (c1, c2);
4943             }
4944           else if (charset == charset_kana)
4945             EMIT_ONE_BYTE (code | 0x80);
4946           else if (charset_kanji2 && charset == charset_kanji2)
4947             {
4948               int c1, c2;
4949
                   /* Only the rows tested here (the high byte of CODE)
                      are converted with JIS_TO_SJIS2; for any other row
                      a single byte CODE & 0x7F is emitted.  */
4950               c1 = code >> 8;
4951               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4952                   || c1 == 0x28
4953                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4954                 {
4955                   JIS_TO_SJIS2 (code);
4956                   c1 = code >> 8, c2 = code & 0xFF;
4957                   EMIT_TWO_BYTES (c1, c2);
4958                 }
4959               else
4960                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4961             }
               /* Any other charset, including the ASCII charset chosen
                  in safe-encoding mode above: emit the low seven bits
                  of CODE.  */
4962           else
4963             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4964         }
4965     }
4966   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4967   coding->produced_char += produced_chars;
4968   coding->produced = dst - coding->destination;
4969   return 0;
4970 }
4971
     /* Encode the characters in CODING->charbuf as BIG5 and append the
        bytes to CODING's destination.  The Big5 charset is the second
        entry of CODING's charset list.  A character found in none of
        the listed charsets is replaced by CODING->default_char, or
        handled through CODING_INHIBIT_CHARACTER_SUBSTITUTION when
        CODING is in CODING_MODE_SAFE_ENCODING mode.  Record
        CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
4972 static bool
4973 encode_coding_big5 (struct coding_system *coding)
4974 {
4975   bool multibytep = coding->dst_multibyte;
4976   int *charbuf = coding->charbuf;
4977   int *charbuf_end = charbuf + coding->charbuf_used;
4978   unsigned char *dst = coding->destination + coding->produced;
4979   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character; that macro
          is defined outside this part of the file and presumably makes
          sure that many destination bytes are available.  */
4980   int safe_room = 4;
4981   ptrdiff_t produced_chars = 0;
4982   Lisp_Object attrs, charset_list, val;
4983   bool ascii_compatible;
4984   struct charset *charset_big5;
4985   int c;
4986
4987   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first entry of the charset list; the second one is
          the Big5 charset.  */
4988   val = XCDR (charset_list);
4989   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4990   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4991
4992   while (charbuf < charbuf_end)
4993     {
4994       ASSURE_DESTINATION (safe_room);
4995       c = *charbuf++;
4996       /* Now encode the character C.  */
4997       if (ASCII_CHAR_P (c) && ascii_compatible)
4998         EMIT_ONE_ASCII_BYTE (c);
4999       else if (CHAR_BYTE8_P (c))
5000         {
               /* A raw 8-bit character is emitted as its byte value.  */
5001           c = CHAR_TO_BYTE8 (c);
5002           EMIT_ONE_BYTE (c);
5003         }
5004       else
5005         {
5006           unsigned code;
5007           struct charset *charset;
               /* Presumably looks C up in the charsets of CHARSET_LIST
                  and sets CHARSET and CODE accordingly;
                  CODING_CHAR_CHARSET is defined outside this part of
                  the file.  */
5008           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5009                                &code, charset);
5010
               /* CHARSET is null: no charset was found for C.  */
5011           if (! charset)
5012             {
5013               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5014                 {
5015                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5016                   charset = CHARSET_FROM_ID (charset_ascii);
5017                 }
5018               else
5019                 {
                       /* Substitute the default character and look it
                          up instead.  */
5020                   c = coding->default_char;
5021                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5022                                        charset_list, &code, charset);
5023                 }
5024             }
5025           if (code == CHARSET_INVALID_CODE (charset))
5026             abort ();
5027           if (charset == charset_big5)
5028             {
5029               int c1, c2;
5030
                   /* The code is emitted as is, high byte first.  */
5031               c1 = code >> 8, c2 = code & 0xFF;
5032               EMIT_TWO_BYTES (c1, c2);
5033             }
               /* Any other charset, including the ASCII charset chosen
                  in safe-encoding mode above: emit the low seven bits
                  of CODE.  */
5034           else
5035             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5036         }
5037     }
5038   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5039   coding->produced_char += produced_chars;
5040   coding->produced = dst - coding->destination;
5041   return 0;
5042 }
5043
5044 \f
5045 /*** 9. CCL handlers ***/
5046
5047 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5048    Return true if a text is encoded in a coding system of which
5049    encoder/decoder are written in CCL program.
        Set CATEGORY_MASK_CCL in DETECT_INFO->checked.  Each source byte
        is looked up in the table of valid bytes of the coding system
        registered for coding_category_ccl.  A byte whose entry is zero
        (or a negative value from ONE_MORE_BYTE) sets CATEGORY_MASK_CCL
        in DETECT_INFO->rejected and returns false.  If the source is
        exhausted first, merge the collected mask into
        DETECT_INFO->found and return true; the mask is
        CATEGORY_MASK_CCL only if some byte had an entry greater
        than 1.  */
5050
5051 static bool
5052 detect_coding_ccl (struct coding_system *coding,
5053                    struct coding_detection_info *detect_info)
5054 {
       /* The source text and its properties are captured from the
          CODING argument here, before CODING is redirected below.  */
5055   const unsigned char *src = coding->source, *src_base;
5056   const unsigned char *src_end = coding->source + coding->src_bytes;
5057   bool multibytep = coding->src_multibyte;
5058   ptrdiff_t consumed_chars = 0;
5059   int found = 0;
5060   unsigned char *valids;
5061   ptrdiff_t head_ascii = coding->head_ascii;
5062   Lisp_Object attrs;
5063
5064   detect_info->checked |= CATEGORY_MASK_CCL;
5065
       /* From here on CODING is the coding system of the CCL category,
          not the argument.  */
5066   coding = &coding_categories[coding_category_ccl];
5067   valids = CODING_CCL_VALIDS (coding);
5068   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes are skipped only for an
          ASCII-compatible coding system.  */
5069   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5070     src += head_ascii;
5071
5072   while (1)
5073     {
5074       int c;
5075
           /* ONE_MORE_BYTE presumably stores the next source byte in C
              and jumps to no_more_source when the source is exhausted;
              it is defined outside this part of the file.  */
5076       src_base = src;
5077       ONE_MORE_BYTE (c);
5078       if (c < 0 || ! valids[c])
5079         break;
5080       if ((valids[c] > 1))
5081         found = CATEGORY_MASK_CCL;
5082     }
       /* Reached only through `break': an invalid byte was seen.  */
5083   detect_info->rejected |= CATEGORY_MASK_CCL;
5084   return 0;
5085
5086  no_more_source:
5087   detect_info->found |= found;
5088   return 1;
5089 }
5090
     /* Decode CODING's source, starting at offset CODING->consumed, by
        running the CCL program CODING->spec.ccl->ccl on it, and append
        the output to CODING->charbuf.  The source is fed to ccl_driver
        in pieces of at most 1024 elements.  The final status of the
        CCL program is translated into a conversion result, and
        CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  */
5091 static void
5092 decode_coding_ccl (struct coding_system *coding)
5093 {
5094   const unsigned char *src = coding->source + coding->consumed;
5095   const unsigned char *src_end = coding->source + coding->src_bytes;
5096   int *charbuf = coding->charbuf + coding->charbuf_used;
5097   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5098   ptrdiff_t consumed_chars = 0;
5099   bool multibytep = coding->src_multibyte;
5100   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* SOURCE_CHARBUF holds one piece of input for ccl_driver.  For a
          multibyte source, SOURCE_BYTEIDX[I] is the byte offset from
          SRC of element I; it has one extra slot for the offset just
          past the last element.  */
5101   int source_charbuf[1024];
5102   int source_byteidx[1025];
5103   Lisp_Object attrs, charset_list;
5104
5105   CODING_GET_INFO (coding, attrs, charset_list);
5106
5107   while (1)
5108     {
5109       const unsigned char *p = src;
5110       int i = 0;
5111
           /* Fill SOURCE_CHARBUF: one character per element for a
              multibyte source, one byte per element otherwise.  */
5112       if (multibytep)
5113         {
5114           while (i < 1024 && p < src_end)
5115             {
5116               source_byteidx[i] = p - src;
5117               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5118             }
5119           source_byteidx[i] = p - src;
5120         }
5121       else
5122         while (i < 1024 && p < src_end)
5123           source_charbuf[i++] = *p++;
5124
           /* Tell the CCL program when this piece reaches the end of
              the last block of source.  */
5125       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5126         ccl->last_block = 1;
5127       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5128                   charset_list);
5129       charbuf += ccl->produced;
           /* ccl->consumed counts elements of SOURCE_CHARBUF; for a
              multibyte source it is mapped back to a byte offset.  */
5130       if (multibytep)
5131         src += source_byteidx[ccl->consumed];
5132       else
5133         src += ccl->consumed;
5134       consumed_chars += ccl->consumed;
           /* Go on only if the CCL program stopped for lack of source
              and there is still source that was not loaded.  */
5135       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5136         break;
5137     }
5138
5139   switch (ccl->status)
5140     {
5141     case CCL_STAT_SUSPEND_BY_SRC:
5142       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5143       break;
5144     case CCL_STAT_SUSPEND_BY_DST:
5145       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5146       break;
5147     case CCL_STAT_QUIT:
5148     case CCL_STAT_INVALID_CMD:
5149       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5150       break;
5151     default:
5152       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5153       break;
5154     }
5155   coding->consumed_char += consumed_chars;
5156   coding->consumed = src - coding->source;
5157   coding->charbuf_used = charbuf - coding->charbuf;
5158 }
5159
     /* Encode the characters in CODING->charbuf by running the CCL
        program CODING->spec.ccl->ccl on them, and append the low eight
        bits of each output element to CODING's destination.  The final
        status of the CCL program is translated into a conversion
        result, CODING->produced and CODING->produced_char are updated,
        and 0 is returned.  */
5160 static bool
5161 encode_coding_ccl (struct coding_system *coding)
5162 {
5163   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5164   bool multibytep = coding->dst_multibyte;
5165   int *charbuf = coding->charbuf;
5166   int *charbuf_end = charbuf + coding->charbuf_used;
5167   unsigned char *dst = coding->destination + coding->produced;
5168   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Receives the output of one ccl_driver call.  */
5169   int destination_charbuf[1024];
5170   ptrdiff_t produced_chars = 0;
5171   int i;
5172   Lisp_Object attrs, charset_list;
5173
5174   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5175   if (coding->consumed_char == coding->src_chars
5176       && coding->mode & CODING_MODE_LAST_BLOCK)
5177     ccl->last_block = 1;
5178
5179   do
5180     {
5181       ccl_driver (ccl, charbuf, destination_charbuf,
5182                   charbuf_end - charbuf, 1024, charset_list);
5183       if (multibytep)
5184         {
               /* Two destination bytes are requested per output
                  element.  PRODUCED_CHARS is not updated directly in
                  this branch; presumably EMIT_ONE_BYTE, defined outside
                  this part of the file, does it.  */
5185           ASSURE_DESTINATION (ccl->produced * 2);
5186           for (i = 0; i < ccl->produced; i++)
5187             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5188         }
5189       else
5190         {
5191           ASSURE_DESTINATION (ccl->produced);
5192           for (i = 0; i < ccl->produced; i++)
5193             *dst++ = destination_charbuf[i] & 0xFF;
5194           produced_chars += ccl->produced;
5195         }
5196       charbuf += ccl->consumed;
           /* Give up when the CCL program quit or hit an invalid
              command.  */
5197       if (ccl->status == CCL_STAT_QUIT
5198           || ccl->status == CCL_STAT_INVALID_CMD)
5199         break;
5200     }
5201   while (charbuf < charbuf_end);
5202
5203   switch (ccl->status)
5204     {
5205     case CCL_STAT_SUSPEND_BY_SRC:
5206       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5207       break;
5208     case CCL_STAT_SUSPEND_BY_DST:
5209       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5210       break;
5211     case CCL_STAT_QUIT:
5212     case CCL_STAT_INVALID_CMD:
5213       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5214       break;
5215     default:
5216       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5217       break;
5218     }
5219
5220   coding->produced_char += produced_chars;
5221   coding->produced = dst - coding->destination;
5222   return 0;
5223 }
5224
5225 \f
5226 /*** 10, 11. no-conversion handlers ***/
5227
5228 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        No conversion is done here: set CODING->chars_at_source and mark
        the whole source of CODING as consumed.  The exception is a
        non-empty source that ends in CR while DOS-style EOL conversion
        is in effect: that CR is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded, presumably so that a
        later call can see whether LF follows.  In every other case,
        including an empty source, CODING_RESULT_SUCCESS is
        recorded.  */
5229
5230 static void
5231 decode_coding_raw_text (struct coding_system *coding)
5232 {
5233   bool eol_dos
5234     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5235
5236   coding->chars_at_source = 1;
5237   coding->consumed_char = coding->src_chars;
5238   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source the subscript below would be -1.  */
5239   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5240     {
5241       coding->consumed_char--;
5242       coding->consumed--;
5243       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5244     }
5245   else
5246     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5247 }
5248
     /* Encode the characters in CODING->charbuf as raw text and append
        the bytes to CODING's destination.  ASCII characters and raw
        8-bit characters become one byte each; when the source is
        multibyte, any other character is written in its multibyte
        form.  Record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
5249 static bool
5250 encode_coding_raw_text (struct coding_system *coding)
5251 {
5252   bool multibytep = coding->dst_multibyte;
5253   int *charbuf = coding->charbuf;
5254   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5255   unsigned char *dst = coding->destination + coding->produced;
5256   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5257   ptrdiff_t produced_chars = 0;
5258   int c;
5259
       /* Multibyte destination: every byte goes through the EMIT_*
          macros, which are defined outside this part of the file and
          presumably also maintain PRODUCED_CHARS.  */
5260   if (multibytep)
5261     {
5262       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5263
5264       if (coding->src_multibyte)
5265         while (charbuf < charbuf_end)
5266           {
5267             ASSURE_DESTINATION (safe_room);
5268             c = *charbuf++;
5269             if (ASCII_CHAR_P (c))
5270               EMIT_ONE_ASCII_BYTE (c);
5271             else if (CHAR_BYTE8_P (c))
5272               {
5273                 c = CHAR_TO_BYTE8 (c);
5274                 EMIT_ONE_BYTE (c);
5275               }
5276             else
5277               {
                     /* Build the multibyte form of C in STR, then emit
                        each of its bytes.  */
5278                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5279
5280                 CHAR_STRING_ADVANCE (c, p1);
5281                 do
5282                   {
5283                     EMIT_ONE_BYTE (*p0);
5284                     p0++;
5285                   }
5286                 while (p0 < p1);
5287               }
5288           }
5289       else
             /* Unibyte source: each element of CHARBUF is emitted as
                one byte.  */
5290         while (charbuf < charbuf_end)
5291           {
5292             ASSURE_DESTINATION (safe_room);
5293             c = *charbuf++;
5294             EMIT_ONE_BYTE (c);
5295           }
5296     }
       /* Unibyte destination: bytes are stored through DST directly.  */
5297   else
5298     {
5299       if (coding->src_multibyte)
5300         {
5301           int safe_room = MAX_MULTIBYTE_LENGTH;
5302
5303           while (charbuf < charbuf_end)
5304             {
5305               ASSURE_DESTINATION (safe_room);
5306               c = *charbuf++;
5307               if (ASCII_CHAR_P (c))
5308                 *dst++ = c;
5309               else if (CHAR_BYTE8_P (c))
5310                 *dst++ = CHAR_TO_BYTE8 (c);
5311               else
5312                 CHAR_STRING_ADVANCE (c, dst);
5313             }
5314         }
5315       else
5316         {
               /* Room for the whole of CHARBUF is requested up front,
                  then the elements are copied one per byte.  */
5317           ASSURE_DESTINATION (charbuf_end - charbuf);
5318           while (charbuf < charbuf_end && dst < dst_end)
5319             *dst++ = *charbuf++;
5320         }
           /* For a unibyte destination, the number of produced
              characters is the number of bytes written.  */
5321       produced_chars = dst - (coding->destination + coding->produced);
5322     }
5323   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5324   coding->produced_char += produced_chars;
5325   coding->produced = dst - coding->destination;
5326   return 0;
5327 }
5328
5329 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5330    Return true if a text is encoded in a charset-based coding system.
        Set CATEGORY_MASK_CHARSET in DETECT_INFO->checked.  The source
        bytes of CODING are checked against the coding system registered
        for coding_category_charset: its `valids' vector maps a first
        byte to nil, to a charset ID, or to a list of charset IDs.  On
        a byte sequence that fits none of the charsets, or when the
        source ends in the middle of a multi-byte code, set
        CATEGORY_MASK_CHARSET in DETECT_INFO->rejected and return
        false.  If the source is exhausted at a code boundary, merge
        the collected mask into DETECT_INFO->found and return true; the
        mask is CATEGORY_MASK_CHARSET only if some first byte >= 0x80
        was accepted.  */
5331
5332 static bool
5333 detect_coding_charset (struct coding_system *coding,
5334                        struct coding_detection_info *detect_info)
5335 {
       /* The source text and its properties are captured from the
          CODING argument here, before CODING is redirected below.  */
5336   const unsigned char *src = coding->source, *src_base;
5337   const unsigned char *src_end = coding->source + coding->src_bytes;
5338   bool multibytep = coding->src_multibyte;
5339   ptrdiff_t consumed_chars = 0;
5340   Lisp_Object attrs, valids, name;
5341   int found = 0;
5342   ptrdiff_t head_ascii = coding->head_ascii;
5343   bool check_latin_extra = 0;
5344
5345   detect_info->checked |= CATEGORY_MASK_CHARSET;
5346
       /* From here on CODING is the coding system of the charset
          category, not the argument.  */
5347   coding = &coding_categories[coding_category_charset];
5348   attrs = CODING_ID_ATTRS (coding->id);
5349   valids = AREF (attrs, coding_attr_charset_valids);
5350   name = CODING_ID_NAME (coding->id);
       /* Coding systems named "iso-8859-..." or "iso-latin-..." get an
          extra check of bytes in 0x80..0x9F against
          Vlatin_extra_code_table, below.  */
5351   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5352                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5353       || strncmp (SSDATA (SYMBOL_NAME (name)),
5354                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5355     check_latin_extra = 1;
5356
       /* The leading ASCII bytes are skipped only for an
          ASCII-compatible coding system.  */
5357   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5358     src += head_ascii;
5359
5360   while (1)
5361     {
5362       int c;
5363       Lisp_Object val;
5364       struct charset *charset;
5365       int dim, idx;
5366
           /* ONE_MORE_BYTE presumably stores the next source byte in C
              and jumps to no_more_source when the source is exhausted;
              it is defined outside this part of the file.  A negative
              C is skipped.  */
5367       src_base = src;
5368       ONE_MORE_BYTE (c);
5369       if (c < 0)
5370         continue;
5371       val = AREF (valids, c);
           /* No charset accepts C as a first byte.  */
5372       if (NILP (val))
5373         break;
5374       if (c >= 0x80)
5375         {
               /* With CHECK_LATIN_EXTRA, a byte in 0x80..0x9F is
                  accepted only if Vlatin_extra_code_table is a vector
                  with a non-nil entry for it.  */
5376           if (c < 0xA0
5377               && check_latin_extra
5378               && (!VECTORP (Vlatin_extra_code_table)
5379                   || NILP (AREF (Vlatin_extra_code_table, c))))
5380             break;
5381           found = CATEGORY_MASK_CHARSET;
5382         }
5383       if (INTEGERP (val))
5384         {
               /* A single candidate charset: each of the remaining
                  DIM - 1 bytes must lie within the bounds stored in
                  that charset's code_space.  */
5385           charset = CHARSET_FROM_ID (XFASTINT (val));
5386           dim = CHARSET_DIMENSION (charset);
5387           for (idx = 1; idx < dim; idx++)
5388             {
5389               if (src == src_end)
5390                 goto too_short;
5391               ONE_MORE_BYTE (c);
5392               if (c < charset->code_space[(dim - 1 - idx) * 4]
5393                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5394                 break;
5395             }
5396           if (idx < dim)
5397             break;
5398         }
5399       else
5400         {
               /* Several candidate charsets: try them in order.  IDX is
                  not reset between candidates, so bytes already read
                  and accepted are not re-checked against the next
                  candidate.  VAL is set to nil as soon as one candidate
                  matches completely.  */
5401           idx = 1;
5402           for (; CONSP (val); val = XCDR (val))
5403             {
5404               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5405               dim = CHARSET_DIMENSION (charset);
5406               while (idx < dim)
5407                 {
5408                   if (src == src_end)
5409                     goto too_short;
5410                   ONE_MORE_BYTE (c);
5411                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5412                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5413                     break;
5414                   idx++;
5415                 }
5416               if (idx == dim)
5417                 {
5418                   val = Qnil;
5419                   break;
5420                 }
5421             }
               /* VAL is still a cons only when the loop above was left
                  through its inner `break' paths without a match.
                  NOTE(review): when the list runs out without a match,
                  VAL is nil here as well and scanning continues --
                  confirm this is intended.  */
5422           if (CONSP (val))
5423             break;
5424         }
5425     }
       /* Reached through `break' (invalid sequence) or through
          `goto too_short' (source ended inside a multi-byte code).  */
5426  too_short:
5427   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5428   return 0;
5429
5430  no_more_source:
5431   detect_info->found |= found;
5432   return 1;
5433 }
5434
5435 static void
5436 decode_coding_charset (struct coding_system *coding)
5437 {
5438   const unsigned char *src = coding->source + coding->consumed;
5439   const unsigned char *src_end = coding->source + coding->src_bytes;
5440   const unsigned char *src_base;
5441   int *charbuf = coding->charbuf + coding->charbuf_used;
5442   /* We may produce one charset annotation in one loop and one more at
5443      the end.  */
5444   int *charbuf_end
5445     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5446   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5447   bool multibytep = coding->src_multibyte;
5448   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5449   Lisp_Object valids;
5450   ptrdiff_t char_offset = coding->produced_char;
5451   ptrdiff_t last_offset = char_offset;
5452   int last_id = charset_ascii;
5453   bool eol_dos
5454     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5455   int byte_after_cr = -1;
5456
5457   valids = AREF (attrs, coding_attr_charset_valids);
5458
5459   while (1)
5460     {
5461       int c;
5462       Lisp_Object val;
5463       struct charset *charset;
5464       int dim;
5465       int len = 1;
5466       unsigned code;
5467
5468       src_base = src;
5469       consumed_chars_base = consumed_chars;
5470
5471       if (charbuf >= charbuf_end)
5472         {
5473           if (byte_after_cr >= 0)
5474             src_base--;
5475           break;
5476         }
5477
5478       if (byte_after_cr >= 0)
5479         {
5480           c = byte_after_cr;
5481           byte_after_cr = -1;
5482         }
5483       else
5484         {
5485           ONE_MORE_BYTE (c);
5486           if (eol_dos && c == '\r')
5487             ONE_MORE_BYTE (byte_after_cr);
5488         }
5489       if (c < 0)
5490         goto invalid_code;
5491       code = c;
5492
5493       val = AREF (valids, c);
5494       if (! INTEGERP (val) && ! CONSP (val))
5495         goto invalid_code;
5496       if (INTEGERP (val))
5497         {
5498           charset = CHARSET_FROM_ID (XFASTINT (val));
5499           dim = CHARSET_DIMENSION (charset);
5500           while (len < dim)
5501             {
5502               ONE_MORE_BYTE (c);
5503               code = (code << 8) | c;
5504               len++;
5505             }
5506           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5507                               charset, code, c);
5508         }
5509       else
5510         {
5511           /* VAL is a list of charset IDs.  It is assured that the
5512              list is sorted by charset dimensions (smaller one
5513              comes first).  */
5514           while (CONSP (val))
5515             {
5516               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5517               dim = CHARSET_DIMENSION (charset);
5518               while (len < dim)
5519                 {
5520                   ONE_MORE_BYTE (c);
5521                   code = (code << 8) | c;
5522                   len++;
5523                 }
5524               CODING_DECODE_CHAR (coding, src, src_base,
5525                                   src_end, charset, code, c);
5526               if (c >= 0)
5527                 break;
5528               val = XCDR (val);
5529             }
5530         }
5531       if (c < 0)
5532         goto invalid_code;
5533       if (charset->id != charset_ascii
5534           && last_id != charset->id)
5535         {
5536           if (last_id != charset_ascii)
5537             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5538           last_id = charset->id;
5539           last_offset = char_offset;
5540         }
5541
5542       *charbuf++ = c;
5543       char_offset++;
5544       continue;
5545
5546     invalid_code:
5547       src = src_base;
5548       consumed_chars = consumed_chars_base;
5549       ONE_MORE_BYTE (c);
5550       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5551       char_offset++;
5552       coding->errors++;
5553     }
5554
5555  no_more_source:
5556   if (last_id != charset_ascii)
5557     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5558   coding->consumed_char += consumed_chars_base;
5559   coding->consumed = src_base - coding->source;
5560   coding->charbuf_used = charbuf - coding->charbuf;
5561 }
5562
5563 static bool
5564 encode_coding_charset (struct coding_system *coding)
5565 {
5566   bool multibytep = coding->dst_multibyte;
5567   int *charbuf = coding->charbuf;
5568   int *charbuf_end = charbuf + coding->charbuf_used;
5569   unsigned char *dst = coding->destination + coding->produced;
5570   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5571   int safe_room = MAX_MULTIBYTE_LENGTH;
5572   ptrdiff_t produced_chars = 0;
5573   Lisp_Object attrs, charset_list;
5574   bool ascii_compatible;
5575   int c;
5576
5577   CODING_GET_INFO (coding, attrs, charset_list);
5578   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5579
5580   while (charbuf < charbuf_end)
5581     {
5582       struct charset *charset;
5583       unsigned code;
5584
5585       ASSURE_DESTINATION (safe_room);
5586       c = *charbuf++;
5587       if (ascii_compatible && ASCII_CHAR_P (c))
5588         EMIT_ONE_ASCII_BYTE (c);
5589       else if (CHAR_BYTE8_P (c))
5590         {
5591           c = CHAR_TO_BYTE8 (c);
5592           EMIT_ONE_BYTE (c);
5593         }
5594       else
5595         {
5596           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5597                                &code, charset);
5598
5599           if (charset)
5600             {
5601               if (CHARSET_DIMENSION (charset) == 1)
5602                 EMIT_ONE_BYTE (code);
5603               else if (CHARSET_DIMENSION (charset) == 2)
5604                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5605               else if (CHARSET_DIMENSION (charset) == 3)
5606                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5607               else
5608                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5609                                  (code >> 8) & 0xFF, code & 0xFF);
5610             }
5611           else
5612             {
5613               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5614                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5615               else
5616                 c = coding->default_char;
5617               EMIT_ONE_BYTE (c);
5618             }
5619         }
5620     }
5621
5622   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5623   coding->produced_char += produced_chars;
5624   coding->produced = dst - coding->destination;
5625   return 0;
5626 }
5627
5628 \f
5629 /*** 7. C library functions ***/
5630
5631 /* Setup coding context CODING from information about CODING_SYSTEM.
5632    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5633    CODING_SYSTEM is invalid, signal an error.  */
5634
5635 void
5636 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5637 {
5638   Lisp_Object attrs;
5639   Lisp_Object eol_type;
5640   Lisp_Object coding_type;
5641   Lisp_Object val;
5642
5643   if (NILP (coding_system))
5644     coding_system = Qundecided;
5645
5646   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5647
5648   attrs = CODING_ID_ATTRS (coding->id);
5649   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5650
5651   coding->mode = 0;
5652   coding->head_ascii = -1;
5653   if (VECTORP (eol_type))
5654     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5655                             | CODING_REQUIRE_DETECTION_MASK);
5656   else if (! EQ (eol_type, Qunix))
5657     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5658                             | CODING_REQUIRE_ENCODING_MASK);
5659   else
5660     coding->common_flags = 0;
5661   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5662     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5663   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5664     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5665   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5666     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5667
5668   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5669   coding->max_charset_id = SCHARS (val) - 1;
5670   coding->safe_charsets = SDATA (val);
5671   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5672   coding->carryover_bytes = 0;
5673
5674   coding_type = CODING_ATTR_TYPE (attrs);
5675   if (EQ (coding_type, Qundecided))
5676     {
5677       coding->detector = NULL;
5678       coding->decoder = decode_coding_raw_text;
5679       coding->encoder = encode_coding_raw_text;
5680       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5681     }
5682   else if (EQ (coding_type, Qiso_2022))
5683     {
5684       int i;
5685       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5686
5687       /* Invoke graphic register 0 to plane 0.  */
5688       CODING_ISO_INVOCATION (coding, 0) = 0;
5689       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5690       CODING_ISO_INVOCATION (coding, 1)
5691         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5692       /* Setup the initial status of designation.  */
5693       for (i = 0; i < 4; i++)
5694         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5695       /* Not single shifting initially.  */
5696       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5697       /* Beginning of buffer should also be regarded as bol. */
5698       CODING_ISO_BOL (coding) = 1;
5699       coding->detector = detect_coding_iso_2022;
5700       coding->decoder = decode_coding_iso_2022;
5701       coding->encoder = encode_coding_iso_2022;
5702       if (flags & CODING_ISO_FLAG_SAFE)
5703         coding->mode |= CODING_MODE_SAFE_ENCODING;
5704       coding->common_flags
5705         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5706             | CODING_REQUIRE_FLUSHING_MASK);
5707       if (flags & CODING_ISO_FLAG_COMPOSITION)
5708         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5709       if (flags & CODING_ISO_FLAG_DESIGNATION)
5710         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5711       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5712         {
5713           setup_iso_safe_charsets (attrs);
5714           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5715           coding->max_charset_id = SCHARS (val) - 1;
5716           coding->safe_charsets = SDATA (val);
5717         }
5718       CODING_ISO_FLAGS (coding) = flags;
5719       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5720       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5721       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5722       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5723     }
5724   else if (EQ (coding_type, Qcharset))
5725     {
5726       coding->detector = detect_coding_charset;
5727       coding->decoder = decode_coding_charset;
5728       coding->encoder = encode_coding_charset;
5729       coding->common_flags
5730         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5731     }
5732   else if (EQ (coding_type, Qutf_8))
5733     {
5734       val = AREF (attrs, coding_attr_utf_bom);
5735       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5736                                    : EQ (val, Qt) ? utf_with_bom
5737                                    : utf_without_bom);
5738       coding->detector = detect_coding_utf_8;
5739       coding->decoder = decode_coding_utf_8;
5740       coding->encoder = encode_coding_utf_8;
5741       coding->common_flags
5742         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5743       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5744         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5745     }
5746   else if (EQ (coding_type, Qutf_16))
5747     {
5748       val = AREF (attrs, coding_attr_utf_bom);
5749       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5750                                     : EQ (val, Qt) ? utf_with_bom
5751                                     : utf_without_bom);
5752       val = AREF (attrs, coding_attr_utf_16_endian);
5753       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5754                                        : utf_16_little_endian);
5755       CODING_UTF_16_SURROGATE (coding) = 0;
5756       coding->detector = detect_coding_utf_16;
5757       coding->decoder = decode_coding_utf_16;
5758       coding->encoder = encode_coding_utf_16;
5759       coding->common_flags
5760         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5761       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5762         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5763     }
5764   else if (EQ (coding_type, Qccl))
5765     {
5766       coding->detector = detect_coding_ccl;
5767       coding->decoder = decode_coding_ccl;
5768       coding->encoder = encode_coding_ccl;
5769       coding->common_flags
5770         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5771             | CODING_REQUIRE_FLUSHING_MASK);
5772     }
5773   else if (EQ (coding_type, Qemacs_mule))
5774     {
5775       coding->detector = detect_coding_emacs_mule;
5776       coding->decoder = decode_coding_emacs_mule;
5777       coding->encoder = encode_coding_emacs_mule;
5778       coding->common_flags
5779         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5780       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5781           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5782         {
5783           Lisp_Object tail, safe_charsets;
5784           int max_charset_id = 0;
5785
5786           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5787                tail = XCDR (tail))
5788             if (max_charset_id < XFASTINT (XCAR (tail)))
5789               max_charset_id = XFASTINT (XCAR (tail));
5790           safe_charsets = make_uninit_string (max_charset_id + 1);
5791           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5792           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5793                tail = XCDR (tail))
5794             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5795           coding->max_charset_id = max_charset_id;
5796           coding->safe_charsets = SDATA (safe_charsets);
5797         }
5798       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5799       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5800     }
5801   else if (EQ (coding_type, Qshift_jis))
5802     {
5803       coding->detector = detect_coding_sjis;
5804       coding->decoder = decode_coding_sjis;
5805       coding->encoder = encode_coding_sjis;
5806       coding->common_flags
5807         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5808     }
5809   else if (EQ (coding_type, Qbig5))
5810     {
5811       coding->detector = detect_coding_big5;
5812       coding->decoder = decode_coding_big5;
5813       coding->encoder = encode_coding_big5;
5814       coding->common_flags
5815         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5816     }
5817   else                          /* EQ (coding_type, Qraw_text) */
5818     {
5819       coding->detector = NULL;
5820       coding->decoder = decode_coding_raw_text;
5821       coding->encoder = encode_coding_raw_text;
5822       if (! EQ (eol_type, Qunix))
5823         {
5824           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5825           if (! VECTORP (eol_type))
5826             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5827         }
5828
5829     }
5830
5831   return;
5832 }
5833
5834 /* Return a list of charsets supported by CODING.  */
5835
5836 Lisp_Object
5837 coding_charset_list (struct coding_system *coding)
5838 {
5839   Lisp_Object attrs, charset_list;
5840
5841   CODING_GET_INFO (coding, attrs, charset_list);
5842   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5843     {
5844       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5845
5846       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847         charset_list = Viso_2022_charset_list;
5848     }
5849   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5850     {
5851       charset_list = Vemacs_mule_charset_list;
5852     }
5853   return charset_list;
5854 }
5855
5856
5857 /* Return a list of charsets supported by CODING-SYSTEM.  */
5858
5859 Lisp_Object
5860 coding_system_charset_list (Lisp_Object coding_system)
5861 {
5862   ptrdiff_t id;
5863   Lisp_Object attrs, charset_list;
5864
5865   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5866   attrs = CODING_ID_ATTRS (id);
5867
5868   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5869     {
5870       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5871
5872       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5873         charset_list = Viso_2022_charset_list;
5874       else
5875         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5876     }
5877   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5878     {
5879       charset_list = Vemacs_mule_charset_list;
5880     }
5881   else
5882     {
5883       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5884     }
5885   return charset_list;
5886 }
5887
5888
5889 /* Return raw-text or one of its subsidiaries that has the same
5890    eol_type as CODING-SYSTEM.  */
5891
5892 Lisp_Object
5893 raw_text_coding_system (Lisp_Object coding_system)
5894 {
5895   Lisp_Object spec, attrs;
5896   Lisp_Object eol_type, raw_text_eol_type;
5897
5898   if (NILP (coding_system))
5899     return Qraw_text;
5900   spec = CODING_SYSTEM_SPEC (coding_system);
5901   attrs = AREF (spec, 0);
5902
5903   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5904     return coding_system;
5905
5906   eol_type = AREF (spec, 2);
5907   if (VECTORP (eol_type))
5908     return Qraw_text;
5909   spec = CODING_SYSTEM_SPEC (Qraw_text);
5910   raw_text_eol_type = AREF (spec, 2);
5911   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5912           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5913           : AREF (raw_text_eol_type, 2));
5914 }
5915
5916
5917 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5918    the subsidiary that has the same eol-spec as PARENT (if it is not
5919    nil and specifies end-of-line format) or the system's setting
5920    (system_eol_type).  */
5921
5922 Lisp_Object
5923 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5924 {
5925   Lisp_Object spec, eol_type;
5926
5927   if (NILP (coding_system))
5928     coding_system = Qraw_text;
5929   spec = CODING_SYSTEM_SPEC (coding_system);
5930   eol_type = AREF (spec, 2);
5931   if (VECTORP (eol_type))
5932     {
5933       Lisp_Object parent_eol_type;
5934
5935       if (! NILP (parent))
5936         {
5937           Lisp_Object parent_spec;
5938
5939           parent_spec = CODING_SYSTEM_SPEC (parent);
5940           parent_eol_type = AREF (parent_spec, 2);
5941           if (VECTORP (parent_eol_type))
5942             parent_eol_type = system_eol_type;
5943         }
5944       else
5945         parent_eol_type = system_eol_type;
5946       if (EQ (parent_eol_type, Qunix))
5947         coding_system = AREF (eol_type, 0);
5948       else if (EQ (parent_eol_type, Qdos))
5949         coding_system = AREF (eol_type, 1);
5950       else if (EQ (parent_eol_type, Qmac))
5951         coding_system = AREF (eol_type, 2);
5952     }
5953   return coding_system;
5954 }
5955
5956
5957 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5958    decided for writing to a process.  If not, complement them, and
5959    return a new coding system.  */
5960
5961 Lisp_Object
5962 complement_process_encoding_system (Lisp_Object coding_system)
5963 {
5964   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5965   Lisp_Object spec, attrs;
5966   int i;
5967
5968   for (i = 0; i < 3; i++)
5969     {
5970       if (i == 1)
5971         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5972       else if (i == 2)
5973         coding_system = preferred_coding_system ();
5974       spec = CODING_SYSTEM_SPEC (coding_system);
5975       if (NILP (spec))
5976         continue;
5977       attrs = AREF (spec, 0);
5978       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5979         coding_base = CODING_ATTR_BASE_NAME (attrs);
5980       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5981         eol_base = coding_system;
5982       if (! NILP (coding_base) && ! NILP (eol_base))
5983         break;
5984     }
5985
5986   if (i > 0)
5987     /* The original CODING_SYSTEM didn't specify text-conversion or
5988        eol-conversion.  Be sure that we return a fully complemented
5989        coding system.  */
5990     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5991   return coding_system;
5992 }
5993
5994
5995 /* Emacs has a mechanism to automatically detect a coding system if it
5996    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5997    it's impossible to distinguish some coding systems accurately
5998    because they use the same range of codes.  So, at first, coding
5999    systems are classified into the following categories:
6000
6001    o coding-category-emacs-mule
6002
6003         The category for a coding system which has the same code range
6004         as Emacs' internal format.  Assigned the coding-system (Lisp
6005         symbol) `emacs-mule' by default.
6006
6007    o coding-category-sjis
6008
6009         The category for a coding system which has the same code range
6010         as SJIS.  Assigned the coding-system (Lisp
6011         symbol) `japanese-shift-jis' by default.
6012
6013    o coding-category-iso-7
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 7-bit environment.  This doesn't use any locking
6017         shift and single shift functions.  This can encode/decode all
6018         charsets.  Assigned the coding-system (Lisp symbol)
6019         `iso-2022-7bit' by default.
6020
6021    o coding-category-iso-7-tight
6022
6023         Same as coding-category-iso-7 except that this can
6024         encode/decode only the specified charsets.
6025
6026    o coding-category-iso-8-1
6027
6028         The category for a coding system which has the same code range
6029         as ISO2022 of 8-bit environment and graphic plane 1 used only
6030         for DIMENSION1 charset.  This doesn't use any locking shift
6031         and single shift functions.  Assigned the coding-system (Lisp
6032         symbol) `iso-latin-1' by default.
6033
6034    o coding-category-iso-8-2
6035
6036         The category for a coding system which has the same code range
6037         as ISO2022 of 8-bit environment and graphic plane 1 used only
6038         for DIMENSION2 charset.  This doesn't use any locking shift
6039         and single shift functions.  Assigned the coding-system (Lisp
6040         symbol) `japanese-iso-8bit' by default.
6041
6042    o coding-category-iso-7-else
6043
6044         The category for a coding system which has the same code range
6045         as ISO2022 of 7-bit environment but uses locking shift or
6046         single shift functions.  Assigned the coding-system (Lisp
6047         symbol) `iso-2022-7bit-lock' by default.
6048
6049    o coding-category-iso-8-else
6050
6051         The category for a coding system which has the same code range
6052         as ISO2022 of 8-bit environment but uses locking shift or
6053         single shift functions.  Assigned the coding-system (Lisp
6054         symbol) `iso-2022-8bit-ss2' by default.
6055
6056    o coding-category-big5
6057
6058         The category for a coding system which has the same code range
6059         as BIG5.  Assigned the coding-system (Lisp symbol)
6060         `cn-big5' by default.
6061
6062    o coding-category-utf-8
6063
6064         The category for a coding system which has the same code range
6065         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6066         symbol) `utf-8' by default.
6067
6068    o coding-category-utf-16-be
6069
6070         The category for a coding system in which a text has a
6071         Unicode signature (cf. Unicode Standard) in BIG endian byte
6072         order at the head.  Assigned the coding-system (Lisp symbol)
6073         `utf-16-be' by default.
6074
6075    o coding-category-utf-16-le
6076
6077         The category for a coding system in which a text has a
6078         Unicode signature (cf. Unicode Standard) in LITTLE endian
6079         byte order at the head.  Assigned the coding-system (Lisp
6080         symbol) `utf-16-le' by default.
6081
6082    o coding-category-ccl
6083
6084         The category for a coding system of which encoder/decoder is
6085         written in CCL programs.  The default value is nil, i.e., no
6086         coding system is assigned.
6087
6088    o coding-category-binary
6089
6090         The category for a coding system not categorized in any of the
6091         above.  Assigned the coding-system (Lisp symbol)
6092         `no-conversion' by default.
6093
6094    Each of them is a Lisp symbol whose value is the actual
6095    `coding-system' (also a Lisp symbol) assigned to it by the user.
6096    What Emacs actually does is detect the category of a coding system,
6097    and then use the `coding-system' assigned to that category.  If Emacs
6098    can't narrow the candidates down to a single category, it selects the
6099    category of the highest priority.  Priorities of categories are also
6100    specified by the user in the Lisp variable `coding-category-list'.
6101
6102 */
6103
6104 #define EOL_SEEN_NONE   0
6105 #define EOL_SEEN_LF     1
6106 #define EOL_SEEN_CR     2
6107 #define EOL_SEEN_CRLF   4
6108
6109 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6110    SOURCE is encoded.  If CATEGORY is one of
6111    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6112    two-byte, else they are encoded by one-byte.
6113
6114    Return one of EOL_SEEN_XXX.  */
6115
6116 #define MAX_EOL_CHECK_COUNT 3
6117
6118 static int
6119 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6120             enum coding_category category)
6121 {
6122   const unsigned char *src = source, *src_end = src + src_bytes;
6123   unsigned char c;
6124   int total  = 0;
6125   int eol_seen = EOL_SEEN_NONE;
6126
6127   if ((1 << category) & CATEGORY_MASK_UTF_16)
6128     {
6129       bool msb = category == (coding_category_utf_16_le
6130                               | coding_category_utf_16_le_nosig);
6131       bool lsb = !msb;
6132
6133       while (src + 1 < src_end)
6134         {
6135           c = src[lsb];
6136           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6137             {
6138               int this_eol;
6139
6140               if (c == '\n')
6141                 this_eol = EOL_SEEN_LF;
6142               else if (src + 3 >= src_end
6143                        || src[msb + 2] != 0
6144                        || src[lsb + 2] != '\n')
6145                 this_eol = EOL_SEEN_CR;
6146               else
6147                 {
6148                   this_eol = EOL_SEEN_CRLF;
6149                   src += 2;
6150                 }
6151
6152               if (eol_seen == EOL_SEEN_NONE)
6153                 /* This is the first end-of-line.  */
6154                 eol_seen = this_eol;
6155               else if (eol_seen != this_eol)
6156                 {
6157                   /* The found type is different from what found before.
6158                      Allow for stray ^M characters in DOS EOL files.  */
6159                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6160                       || (eol_seen == EOL_SEEN_CRLF
6161                           && this_eol == EOL_SEEN_CR))
6162                     eol_seen = EOL_SEEN_CRLF;
6163                   else
6164                     {
6165                       eol_seen = EOL_SEEN_LF;
6166                       break;
6167                     }
6168                 }
6169               if (++total == MAX_EOL_CHECK_COUNT)
6170                 break;
6171             }
6172           src += 2;
6173         }
6174     }
6175   else
6176     while (src < src_end)
6177       {
6178         c = *src++;
6179         if (c == '\n' || c == '\r')
6180           {
6181             int this_eol;
6182
6183             if (c == '\n')
6184               this_eol = EOL_SEEN_LF;
6185             else if (src >= src_end || *src != '\n')
6186               this_eol = EOL_SEEN_CR;
6187             else
6188               this_eol = EOL_SEEN_CRLF, src++;
6189
6190             if (eol_seen == EOL_SEEN_NONE)
6191               /* This is the first end-of-line.  */
6192               eol_seen = this_eol;
6193             else if (eol_seen != this_eol)
6194               {
6195                 /* The found type is different from what found before.
6196                    Allow for stray ^M characters in DOS EOL files.  */
6197                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6198                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6199                   eol_seen = EOL_SEEN_CRLF;
6200                 else
6201                   {
6202                     eol_seen = EOL_SEEN_LF;
6203                     break;
6204                   }
6205               }
6206             if (++total == MAX_EOL_CHECK_COUNT)
6207               break;
6208           }
6209       }
6210   return eol_seen;
6211 }
6212
6213
6214 static Lisp_Object
6215 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6216 {
6217   Lisp_Object eol_type;
6218
6219   eol_type = CODING_ID_EOL_TYPE (coding->id);
6220   if (eol_seen & EOL_SEEN_LF)
6221     {
6222       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6223       eol_type = Qunix;
6224     }
6225   else if (eol_seen & EOL_SEEN_CRLF)
6226     {
6227       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6228       eol_type = Qdos;
6229     }
6230   else if (eol_seen & EOL_SEEN_CR)
6231     {
6232       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6233       eol_type = Qmac;
6234     }
6235   return eol_type;
6236 }
6237
6238 /* Detect how a text specified in CODING is encoded.  If a coding
6239    system is detected, update fields of CODING by the detected coding
6240    system.  */
6241
6242 static void
6243 detect_coding (struct coding_system *coding)
6244 {
6245   const unsigned char *src, *src_end;
6246   unsigned int saved_mode = coding->mode;
6247
6248   coding->consumed = coding->consumed_char = 0;
6249   coding->produced = coding->produced_char = 0;
6250   coding_set_source (coding);
6251
6252   src_end = coding->source + coding->src_bytes;
6253   coding->head_ascii = 0;
6254
6255   /* If we have not yet decided the text encoding type, detect it
6256      now.  */
6257   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6258     {
6259       int c, i;
6260       struct coding_detection_info detect_info;
6261       bool null_byte_found = 0, eight_bit_found = 0;
6262
6263       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6264       for (src = coding->source; src < src_end; src++)
6265         {
6266           c = *src;
6267           if (c & 0x80)
6268             {
6269               eight_bit_found = 1;
6270               if (null_byte_found)
6271                 break;
6272             }
6273           else if (c < 0x20)
6274             {
6275               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6276                   && ! inhibit_iso_escape_detection
6277                   && ! detect_info.checked)
6278                 {
6279                   if (detect_coding_iso_2022 (coding, &detect_info))
6280                     {
6281                       /* We have scanned the whole data.  */
6282                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6283                         {
6284                           /* We didn't find an 8-bit code.  We may
6285                              have found a null-byte, but it's very
6286                              rare that a binary file conforms to
6287                              ISO-2022.  */
6288                           src = src_end;
6289                           coding->head_ascii = src - coding->source;
6290                         }
6291                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6292                       break;
6293                     }
6294                 }
6295               else if (! c && !inhibit_null_byte_detection)
6296                 {
6297                   null_byte_found = 1;
6298                   if (eight_bit_found)
6299                     break;
6300                 }
6301               if (! eight_bit_found)
6302                 coding->head_ascii++;
6303             }
6304           else if (! eight_bit_found)
6305             coding->head_ascii++;
6306         }
6307
6308       if (null_byte_found || eight_bit_found
6309           || coding->head_ascii < coding->src_bytes
6310           || detect_info.found)
6311         {
6312           enum coding_category category;
6313           struct coding_system *this;
6314
6315           if (coding->head_ascii == coding->src_bytes)
6316             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6317             for (i = 0; i < coding_category_raw_text; i++)
6318               {
6319                 category = coding_priorities[i];
6320                 this = coding_categories + category;
6321                 if (detect_info.found & (1 << category))
6322                   break;
6323               }
6324           else
6325             {
6326               if (null_byte_found)
6327                 {
6328                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6329                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6330                 }
6331               for (i = 0; i < coding_category_raw_text; i++)
6332                 {
6333                   category = coding_priorities[i];
6334                   this = coding_categories + category;
6335                   if (this->id < 0)
6336                     {
6337                       /* No coding system of this category is defined.  */
6338                       detect_info.rejected |= (1 << category);
6339                     }
6340                   else if (category >= coding_category_raw_text)
6341                     continue;
6342                   else if (detect_info.checked & (1 << category))
6343                     {
6344                       if (detect_info.found & (1 << category))
6345                         break;
6346                     }
6347                   else if ((*(this->detector)) (coding, &detect_info)
6348                            && detect_info.found & (1 << category))
6349                     {
6350                       if (category == coding_category_utf_16_auto)
6351                         {
6352                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6353                             category = coding_category_utf_16_le;
6354                           else
6355                             category = coding_category_utf_16_be;
6356                         }
6357                       break;
6358                     }
6359                 }
6360             }
6361
6362           if (i < coding_category_raw_text)
6363             setup_coding_system (CODING_ID_NAME (this->id), coding);
6364           else if (null_byte_found)
6365             setup_coding_system (Qno_conversion, coding);
6366           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6367                    == CATEGORY_MASK_ANY)
6368             setup_coding_system (Qraw_text, coding);
6369           else if (detect_info.rejected)
6370             for (i = 0; i < coding_category_raw_text; i++)
6371               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6372                 {
6373                   this = coding_categories + coding_priorities[i];
6374                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6375                   break;
6376                 }
6377         }
6378     }
6379   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6380            == coding_category_utf_8_auto)
6381     {
6382       Lisp_Object coding_systems;
6383       struct coding_detection_info detect_info;
6384
6385       coding_systems
6386         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6387       detect_info.found = detect_info.rejected = 0;
6388       coding->head_ascii = 0;
6389       if (CONSP (coding_systems)
6390           && detect_coding_utf_8 (coding, &detect_info))
6391         {
6392           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6393             setup_coding_system (XCAR (coding_systems), coding);
6394           else
6395             setup_coding_system (XCDR (coding_systems), coding);
6396         }
6397     }
6398   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6399            == coding_category_utf_16_auto)
6400     {
6401       Lisp_Object coding_systems;
6402       struct coding_detection_info detect_info;
6403
6404       coding_systems
6405         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6406       detect_info.found = detect_info.rejected = 0;
6407       coding->head_ascii = 0;
6408       if (CONSP (coding_systems)
6409           && detect_coding_utf_16 (coding, &detect_info))
6410         {
6411           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6412             setup_coding_system (XCAR (coding_systems), coding);
6413           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6414             setup_coding_system (XCDR (coding_systems), coding);
6415         }
6416     }
6417   coding->mode = saved_mode;
6418 }
6419
6420
6421 static void
6422 decode_eol (struct coding_system *coding)
6423 {
6424   Lisp_Object eol_type;
6425   unsigned char *p, *pbeg, *pend;
6426
6427   eol_type = CODING_ID_EOL_TYPE (coding->id);
6428   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6429     return;
6430
6431   if (NILP (coding->dst_object))
6432     pbeg = coding->destination;
6433   else
6434     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6435   pend = pbeg + coding->produced;
6436
6437   if (VECTORP (eol_type))
6438     {
6439       int eol_seen = EOL_SEEN_NONE;
6440
6441       for (p = pbeg; p < pend; p++)
6442         {
6443           if (*p == '\n')
6444             eol_seen |= EOL_SEEN_LF;
6445           else if (*p == '\r')
6446             {
6447               if (p + 1 < pend && *(p + 1) == '\n')
6448                 {
6449                   eol_seen |= EOL_SEEN_CRLF;
6450                   p++;
6451                 }
6452               else
6453                 eol_seen |= EOL_SEEN_CR;
6454             }
6455         }
6456       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6457       if ((eol_seen & EOL_SEEN_CRLF) != 0
6458           && (eol_seen & EOL_SEEN_CR) != 0
6459           && (eol_seen & EOL_SEEN_LF) == 0)
6460         eol_seen = EOL_SEEN_CRLF;
6461       else if (eol_seen != EOL_SEEN_NONE
6462           && eol_seen != EOL_SEEN_LF
6463           && eol_seen != EOL_SEEN_CRLF
6464           && eol_seen != EOL_SEEN_CR)
6465         eol_seen = EOL_SEEN_LF;
6466       if (eol_seen != EOL_SEEN_NONE)
6467         eol_type = adjust_coding_eol_type (coding, eol_seen);
6468     }
6469
6470   if (EQ (eol_type, Qmac))
6471     {
6472       for (p = pbeg; p < pend; p++)
6473         if (*p == '\r')
6474           *p = '\n';
6475     }
6476   else if (EQ (eol_type, Qdos))
6477     {
6478       ptrdiff_t n = 0;
6479
6480       if (NILP (coding->dst_object))
6481         {
6482           /* Start deleting '\r' from the tail to minimize the memory
6483              movement.  */
6484           for (p = pend - 2; p >= pbeg; p--)
6485             if (*p == '\r')
6486               {
6487                 memmove (p, p + 1, pend-- - p - 1);
6488                 n++;
6489               }
6490         }
6491       else
6492         {
6493           ptrdiff_t pos_byte = coding->dst_pos_byte;
6494           ptrdiff_t pos = coding->dst_pos;
6495           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6496
6497           while (pos < pos_end)
6498             {
6499               p = BYTE_POS_ADDR (pos_byte);
6500               if (*p == '\r' && p[1] == '\n')
6501                 {
6502                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6503                   n++;
6504                   pos_end--;
6505                 }
6506               pos++;
6507               if (coding->dst_multibyte)
6508                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6509               else
6510                 pos_byte++;
6511             }
6512         }
6513       coding->produced -= n;
6514       coding->produced_char -= n;
6515     }
6516 }
6517
6518
6519 /* Return a translation table (or list of them) from coding system
6520    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6521    not ENCODEP). */
6522
6523 static Lisp_Object
6524 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6525 {
6526   Lisp_Object standard, translation_table;
6527   Lisp_Object val;
6528
6529   if (NILP (Venable_character_translation))
6530     {
6531       if (max_lookup)
6532         *max_lookup = 0;
6533       return Qnil;
6534     }
6535   if (encodep)
6536     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6537       standard = Vstandard_translation_table_for_encode;
6538   else
6539     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6540       standard = Vstandard_translation_table_for_decode;
6541   if (NILP (translation_table))
6542     translation_table = standard;
6543   else
6544     {
6545       if (SYMBOLP (translation_table))
6546         translation_table = Fget (translation_table, Qtranslation_table);
6547       else if (CONSP (translation_table))
6548         {
6549           translation_table = Fcopy_sequence (translation_table);
6550           for (val = translation_table; CONSP (val); val = XCDR (val))
6551             if (SYMBOLP (XCAR (val)))
6552               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6553         }
6554       if (CHAR_TABLE_P (standard))
6555         {
6556           if (CONSP (translation_table))
6557             translation_table = nconc2 (translation_table,
6558                                         Fcons (standard, Qnil));
6559           else
6560             translation_table = Fcons (translation_table,
6561                                        Fcons (standard, Qnil));
6562         }
6563     }
6564
6565   if (max_lookup)
6566     {
6567       *max_lookup = 1;
6568       if (CHAR_TABLE_P (translation_table)
6569           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6570         {
6571           val = XCHAR_TABLE (translation_table)->extras[1];
6572           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6573             *max_lookup = XFASTINT (val);
6574         }
6575       else if (CONSP (translation_table))
6576         {
6577           Lisp_Object tail;
6578
6579           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6580             if (CHAR_TABLE_P (XCAR (tail))
6581                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6582               {
6583                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6584                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6585                   *max_lookup = XFASTINT (tailval);
6586               }
6587         }
6588     }
6589   return translation_table;
6590 }
6591
/* Look up character C in TABLE, which is a char-table or a list whose
   elements may be char-tables.  TRANS is first set to Qnil.

   If a char-table maps C to a character, C is replaced by that
   character and TRANS stays Qnil; in a list, the lookup then goes on
   with the following tables using the new C.  If a char-table maps C
   to some other non-nil value, TRANS is set to that value (and, in a
   list, the lookup stops there).

   C and TRANS must be lvalues; TABLE and C are evaluated more than
   once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        tail = table;                                           \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6616
6617
6618 /* Return a translation of character(s) at BUF according to TRANS.
6619    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6620    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6621    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6622    translation is found, and Qnil if not found..
6623    If BUF is too short to lookup characters in FROM, return Qt.  */
6624
6625 static Lisp_Object
6626 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6627 {
6628
6629   if (INTEGERP (trans))
6630     return trans;
6631   for (; CONSP (trans); trans = XCDR (trans))
6632     {
6633       Lisp_Object val = XCAR (trans);
6634       Lisp_Object from = XCAR (val);
6635       ptrdiff_t len = ASIZE (from);
6636       ptrdiff_t i;
6637
6638       for (i = 0; i < len; i++)
6639         {
6640           if (buf + i == buf_end)
6641             return Qt;
6642           if (XINT (AREF (from, i)) != buf[i])
6643             break;
6644         }
6645       if (i == len)
6646         return val;
6647     }
6648   return Qnil;
6649 }
6650
6651
/* Store decoded characters into the destination of CODING, starting
   at CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements).  Each non-negative
   element is looked up in TRANSLATION_TABLE and the (possibly
   translated) character(s) are written out; a negative element -N
   starts an annotation datum of N elements, which is skipped here.
   Otherwise the CODING->consumed bytes at CODING->source are copied,
   converting between the multibyte and unibyte representations when
   CODING->src_multibyte and CODING->dst_multibyte differ.

   LAST_BLOCK nonzero means a translation that would need characters
   beyond the end of the charbuf is not deferred.

   CODING->produced and CODING->produced_char are advanced, and if
   CODING->dst_object is a buffer and at least one character was
   produced, insert_from_gap is called.  The return value is the
   number of charbuf elements left unprocessed (always 0 when
   CODING->chars_at_source is nonzero).  */
6652 static int
6653 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6654                bool last_block)
6655 {
6656   unsigned char *dst = coding->destination + coding->produced;
6657   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6658   ptrdiff_t produced;
6659   ptrdiff_t produced_chars = 0;
6660   int carryover = 0;
6661
6662   if (! coding->chars_at_source)
6663     {
6664       /* Source characters are in coding->charbuf.  */
6665       int *buf = coding->charbuf;
6666       int *buf_end = buf + coding->charbuf_used;
6667
       /* When source and destination are the same object, the
          writable area is bounded by the end of the source bytes
          consumed so far.  */
6668       if (EQ (coding->src_object, coding->dst_object))
6669         {
6670           coding_set_source (coding);
6671           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6672         }
6673
6674       while (buf < buf_end)
6675         {
6676           int c = *buf;
6677           ptrdiff_t i;
6678
6679           if (c >= 0)
6680             {
               /* FROM_NCHARS charbuf elements are replaced by
                  TO_NCHARS output characters.  */
6681               ptrdiff_t from_nchars = 1, to_nchars = 1;
6682               Lisp_Object trans = Qnil;
6683
6684               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6685               if (! NILP (trans))
6686                 {
6687                   trans = get_translation (trans, buf, buf_end);
6688                   if (INTEGERP (trans))
6689                     c = XINT (trans);
6690                   else if (CONSP (trans))
6691                     {
6692                       from_nchars = ASIZE (XCAR (trans));
6693                       trans = XCDR (trans);
6694                       if (INTEGERP (trans))
6695                         c = XINT (trans);
6696                       else
6697                         {
6698                           to_nchars = ASIZE (trans);
6699                           c = XINT (AREF (trans, 0));
6700                         }
6701                     }
                   /* Qt: the charbuf ended before a FROM sequence
                      could be compared.  Unless this is the last
                      block, stop here; the remaining elements are
                      reported to the caller as carryover.  */
6702                   else if (EQ (trans, Qt) && ! last_block)
6703                     break;
6704                 }
6705
               /* Not enough room for TO_NCHARS characters of maximum
                  length: ask alloc_destination for more.  */
6706               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6707                 {
                   /* Make sure the size passed to alloc_destination
                      below cannot overflow.  */
6708                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6709                        / MAX_MULTIBYTE_LENGTH)
6710                       < to_nchars)
6711                     memory_full (SIZE_MAX);
6712                   dst = alloc_destination (coding,
6713                                            buf_end - buf
6714                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6715                                            dst);
6716                   if (EQ (coding->src_object, coding->dst_object))
6717                     {
6718                       coding_set_source (coding);
6719                       dst_end = (((unsigned char *) coding->source)
6720                                  + coding->consumed);
6721                     }
6722                   else
6723                     dst_end = coding->destination + coding->dst_bytes;
6724                 }
6725
               /* C holds the first output character; the others, if
                  any, are fetched from the vector TRANS.  */
6726               for (i = 0; i < to_nchars; i++)
6727                 {
6728                   if (i > 0)
6729                     c = XINT (AREF (trans, i));
6730                   if (coding->dst_multibyte
6731                       || ! CHAR_BYTE8_P (c))
6732                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6733                   else
6734                     *dst++ = CHAR_TO_BYTE8 (c);
6735                 }
6736               produced_chars += to_nchars;
6737               buf += from_nchars;
6738             }
6739           else
6740             /* This is an annotation datum.  (-C) is the length.  */
6741             buf += -c;
6742         }
6743       carryover = buf_end - buf;
6744     }
6745   else
6746     {
6747       /* Source characters are at coding->source.  */
6748       const unsigned char *src = coding->source;
6749       const unsigned char *src_end = src + coding->consumed;
6750
6751       if (EQ (coding->dst_object, coding->src_object))
6752         dst_end = (unsigned char *) src;
6753       if (coding->src_multibyte != coding->dst_multibyte)
6754         {
6755           if (coding->src_multibyte)
6756             {
               /* NOTE(review): MULTIBYTEP, CONSUMED_CHARS, SRC_BASE and
                  the label `no_more_source' are not referenced
                  directly in this function; presumably ONE_MORE_BYTE
                  (defined elsewhere) uses them -- confirm before
                  renaming or removing any of them.  */
6757               bool multibytep = 1;
6758               ptrdiff_t consumed_chars = 0;
6759
6760               while (1)
6761                 {
6762                   const unsigned char *src_base = src;
6763                   int c;
6764
6765                   ONE_MORE_BYTE (c);
6766                   if (dst == dst_end)
6767                     {
6768                       if (EQ (coding->src_object, coding->dst_object))
6769                         dst_end = (unsigned char *) src;
6770                       if (dst == dst_end)
6771                         {
                           /* Remember SRC as an offset and recompute
                              the pointers after alloc_destination and
                              coding_set_source.  */
6772                           ptrdiff_t offset = src - coding->source;
6773
6774                           dst = alloc_destination (coding, src_end - src + 1,
6775                                                    dst);
6776                           dst_end = coding->destination + coding->dst_bytes;
6777                           coding_set_source (coding);
6778                           src = coding->source + offset;
6779                           src_end = coding->source + coding->consumed;
6780                           if (EQ (coding->src_object, coding->dst_object))
6781                             dst_end = (unsigned char *) src;
6782                         }
6783                     }
6784                   *dst++ = c;
6785                   produced_chars++;
6786                 }
6787             no_more_source:
6788               ;
6789             }
6790           else
6791             while (src < src_end)
6792               {
                 /* NOTE(review): MULTIBYTEP is presumably read by
                    EMIT_ONE_BYTE (defined elsewhere) -- confirm.  */
6793                 bool multibytep = 1;
6794                 int c = *src++;
6795
6796                 if (dst >= dst_end - 1)
6797                   {
6798                     if (EQ (coding->src_object, coding->dst_object))
6799                       dst_end = (unsigned char *) src;
6800                     if (dst >= dst_end - 1)
6801                       {
6802                         ptrdiff_t offset = src - coding->source;
6803                         ptrdiff_t more_bytes;
6804
6805                         if (EQ (coding->src_object, coding->dst_object))
6806                           more_bytes = ((src_end - src) / 2) + 2;
6807                         else
6808                           more_bytes = src_end - src + 2;
6809                         dst = alloc_destination (coding, more_bytes, dst);
6810                         dst_end = coding->destination + coding->dst_bytes;
6811                         coding_set_source (coding);
6812                         src = coding->source + offset;
6813                         src_end = coding->source + coding->consumed;
6814                         if (EQ (coding->src_object, coding->dst_object))
6815                           dst_end = (unsigned char *) src;
6816                       }
6817                   }
6818                 EMIT_ONE_BYTE (c);
6819               }
6820         }
6821       else
6822         {
           /* Source and destination use the same representation: the
              consumed bytes are copied unchanged.  */
6823           if (!EQ (coding->src_object, coding->dst_object))
6824             {
6825               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6826
6827               if (require > 0)
6828                 {
6829                   ptrdiff_t offset = src - coding->source;
6830
6831                   dst = alloc_destination (coding, require, dst);
6832                   coding_set_source (coding);
6833                   src = coding->source + offset;
6834                   src_end = coding->source + coding->consumed;
6835                 }
6836             }
6837           produced_chars = coding->consumed_char;
6838           while (src < src_end)
6839             *dst++ = *src++;
6840         }
6841     }
6842
     /* Number of bytes written by this call.  */
6843   produced = dst - (coding->destination + coding->produced);
6844   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6845     insert_from_gap (produced_chars, produced);
6846   coding->produced += produced;
6847   coding->produced_char += produced_chars;
6848   return carryover;
6849 }
6850
6851 /* Compose text in CODING->object according to the annotation data at
6852    CHARBUF.  CHARBUF is an array:
6853      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6854  */
6855
6856 static inline void
6857 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6858 {
6859   int len;
6860   ptrdiff_t to;
6861   enum composition_method method;
6862   Lisp_Object components;
6863
6864   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6865   to = pos + charbuf[2];
6866   method = (enum composition_method) (charbuf[4]);
6867
6868   if (method == COMPOSITION_RELATIVE)
6869     components = Qnil;
6870   else
6871     {
6872       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6873       int i, j;
6874
6875       if (method == COMPOSITION_WITH_RULE)
6876         len = charbuf[2] * 3 - 2;
6877       charbuf += MAX_ANNOTATION_LENGTH;
6878       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6879       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6880         {
6881           if (charbuf[i] >= 0)
6882             args[j] = make_number (charbuf[i]);
6883           else
6884             {
6885               i++;
6886               args[j] = make_number (charbuf[i] % 0x100);
6887             }
6888         }
6889       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6890     }
6891   compose_text (pos, to, components, Qnil, coding->dst_object);
6892 }
6893
6894
6895 /* Put `charset' property on text in CODING->object according to
6896    the annotation data at CHARBUF.  CHARBUF is an array:
6897      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6898  */
6899
6900 static inline void
6901 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6902 {
6903   ptrdiff_t from = pos - charbuf[2];
6904   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6905
6906   Fput_text_property (make_number (from), make_number (pos),
6907                       Qcharset, CHARSET_NAME (charset),
6908                       coding->dst_object);
6909 }
6910
6911
/* Number of `int' elements first requested for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its element count
   in CODING->charbuf_size.  The request starts at CHARBUF_SIZE
   elements and is halved after each null result for as long as it
   stays above 1024.  If no attempt gives a non-null pointer,
   CODING_RESULT_INSUFFICIENT_MEM is recorded and the macro executes
   `return;', so it can be used only inside a function returning
   void.  Since the memory comes from alloca, it lives in the stack
   frame of the function that uses this macro.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * size);                 \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return;                                                         \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6933
6934
6935 static void
6936 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6937 {
6938   int *charbuf = coding->charbuf;
6939   int *charbuf_end = charbuf + coding->charbuf_used;
6940
6941   if (NILP (coding->dst_object))
6942     return;
6943
6944   while (charbuf < charbuf_end)
6945     {
6946       if (*charbuf >= 0)
6947         pos++, charbuf++;
6948       else
6949         {
6950           int len = -*charbuf;
6951
6952           if (len > 2)
6953             switch (charbuf[1])
6954               {
6955               case CODING_ANNOTATE_COMPOSITION_MASK:
6956                 produce_composition (coding, charbuf, pos);
6957                 break;
6958               case CODING_ANNOTATE_CHARSET_MASK:
6959                 produce_charset (coding, charbuf, pos);
6960                 break;
6961               }
6962           charbuf += len;
6963         }
6964     }
6965 }
6966
6967 /* Decode the data at CODING->src_object into CODING->dst_object.
6968    CODING->src_object is a buffer, a string, or nil.
6969    CODING->dst_object is a buffer.
6970
6971    If CODING->src_object is a buffer, it must be the current buffer.
6972    In this case, if CODING->src_pos is positive, it is a position of
6973    the source text in the buffer, otherwise, the source text is in the
6974    gap area of the buffer, and CODING->src_pos specifies the offset of
6975    the text from GPT (which must be the same as PT).  If this is the
6976    same buffer as CODING->dst_object, CODING->src_pos must be
6977    negative.
6978
6979    If CODING->src_object is a string, CODING->src_pos is an index to
6980    that string.
6981
6982    If CODING->src_object is nil, CODING->source must already point to
6983    the non-relocatable memory area.  In this case, CODING->src_pos is
6984    an offset from CODING->source.
6985
6986    The decoded data is inserted at the current point of the buffer
6987    CODING->dst_object.
6988 */
6989
6990 static void
6991 decode_coding (struct coding_system *coding)
6992 {
6993   Lisp_Object attrs;
6994   Lisp_Object undo_list;
6995   Lisp_Object translation_table;
6996   struct ccl_spec cclspec;
6997   int carryover;
6998   int i;
6999
7000   if (BUFFERP (coding->src_object)
7001       && coding->src_pos > 0
7002       && coding->src_pos < GPT
7003       && coding->src_pos + coding->src_chars > GPT)
7004     move_gap_both (coding->src_pos, coding->src_pos_byte);
7005
7006   undo_list = Qt;
7007   if (BUFFERP (coding->dst_object))
7008     {
7009       if (current_buffer != XBUFFER (coding->dst_object))
7010         set_buffer_internal (XBUFFER (coding->dst_object));
7011       if (GPT != PT)
7012         move_gap_both (PT, PT_BYTE);
7013
7014       /* We must disable undo_list in order to record the whole insert
7015          transaction via record_insert at the end.  But doing so also
7016          disables the recording of the first change to the undo_list.
7017          Therefore we check for first change here and record it via
7018          record_first_change if needed.  */
7019       if (MODIFF <= SAVE_MODIFF)
7020         record_first_change ();
7021
7022       undo_list = BVAR (current_buffer, undo_list);
7023       bset_undo_list (current_buffer, Qt);
7024     }
7025
7026   coding->consumed = coding->consumed_char = 0;
7027   coding->produced = coding->produced_char = 0;
7028   coding->chars_at_source = 0;
7029   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7030   coding->errors = 0;
7031
7032   ALLOC_CONVERSION_WORK_AREA (coding);
7033
7034   attrs = CODING_ID_ATTRS (coding->id);
7035   translation_table = get_translation_table (attrs, 0, NULL);
7036
7037   carryover = 0;
7038   if (coding->decoder == decode_coding_ccl)
7039     {
7040       coding->spec.ccl = &cclspec;
7041       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7042     }
7043   do
7044     {
7045       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7046
7047       coding_set_source (coding);
7048       coding->annotated = 0;
7049       coding->charbuf_used = carryover;
7050       (*(coding->decoder)) (coding);
7051       coding_set_destination (coding);
7052       carryover = produce_chars (coding, translation_table, 0);
7053       if (coding->annotated)
7054         produce_annotation (coding, pos);
7055       for (i = 0; i < carryover; i++)
7056         coding->charbuf[i]
7057           = coding->charbuf[coding->charbuf_used - carryover + i];
7058     }
7059   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7060          || (coding->consumed < coding->src_bytes
7061              && (coding->result == CODING_RESULT_SUCCESS
7062                  || coding->result == CODING_RESULT_INVALID_SRC)));
7063
7064   if (carryover > 0)
7065     {
7066       coding_set_destination (coding);
7067       coding->charbuf_used = carryover;
7068       produce_chars (coding, translation_table, 1);
7069     }
7070
7071   coding->carryover_bytes = 0;
7072   if (coding->consumed < coding->src_bytes)
7073     {
7074       int nbytes = coding->src_bytes - coding->consumed;
7075       const unsigned char *src;
7076
7077       coding_set_source (coding);
7078       coding_set_destination (coding);
7079       src = coding->source + coding->consumed;
7080
7081       if (coding->mode & CODING_MODE_LAST_BLOCK)
7082         {
7083           /* Flush out unprocessed data as binary chars.  We are sure
7084              that the number of data is less than the size of
7085              coding->charbuf.  */
7086           coding->charbuf_used = 0;
7087           coding->chars_at_source = 0;
7088
7089           while (nbytes-- > 0)
7090             {
7091               int c = *src++;
7092
7093               if (c & 0x80)
7094                 c = BYTE8_TO_CHAR (c);
7095               coding->charbuf[coding->charbuf_used++] = c;
7096             }
7097           produce_chars (coding, Qnil, 1);
7098         }
7099       else
7100         {
7101           /* Record unprocessed bytes in coding->carryover.  We are
7102              sure that the number of data is less than the size of
7103              coding->carryover.  */
7104           unsigned char *p = coding->carryover;
7105
7106           if (nbytes > sizeof coding->carryover)
7107             nbytes = sizeof coding->carryover;
7108           coding->carryover_bytes = nbytes;
7109           while (nbytes-- > 0)
7110             *p++ = *src++;
7111         }
7112       coding->consumed = coding->src_bytes;
7113     }
7114
7115   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7116       && !inhibit_eol_conversion)
7117     decode_eol (coding);
7118   if (BUFFERP (coding->dst_object))
7119     {
7120       bset_undo_list (current_buffer, undo_list);
7121       record_insert (coding->dst_pos, coding->produced_char);
7122     }
7123 }
7124
7125
7126 /* Extract an annotation datum from a composition starting at POS and
7127    ending before LIMIT of CODING->src_object (buffer or string), store
7128    the data in BUF, set *STOP to a starting position of the next
7129    composition (if any) or to LIMIT, and return the address of the
7130    next element of BUF.
7131
7132    If such an annotation is not found, set *STOP to a starting
7133    position of a composition after POS (if any) or to LIMIT, and
7134    return BUF.  */
7135
7136 static inline int *
7137 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7138                                struct coding_system *coding, int *buf,
7139                                ptrdiff_t *stop)
7140 {
7141   ptrdiff_t start, end;
7142   Lisp_Object prop;
7143
7144   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7145       || end > limit)
7146     *stop = limit;
7147   else if (start > pos)
7148     *stop = start;
7149   else
7150     {
7151       if (start == pos)
7152         {
7153           /* We found a composition.  Store the corresponding
7154              annotation data in BUF.  */
7155           int *head = buf;
7156           enum composition_method method = COMPOSITION_METHOD (prop);
7157           int nchars = COMPOSITION_LENGTH (prop);
7158
7159           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7160           if (method != COMPOSITION_RELATIVE)
7161             {
7162               Lisp_Object components;
7163               ptrdiff_t i, len, i_byte;
7164
7165               components = COMPOSITION_COMPONENTS (prop);
7166               if (VECTORP (components))
7167                 {
7168                   len = ASIZE (components);
7169                   for (i = 0; i < len; i++)
7170                     *buf++ = XINT (AREF (components, i));
7171                 }
7172               else if (STRINGP (components))
7173                 {
7174                   len = SCHARS (components);
7175                   i = i_byte = 0;
7176                   while (i < len)
7177                     {
7178                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7179                       buf++;
7180                     }
7181                 }
7182               else if (INTEGERP (components))
7183                 {
7184                   len = 1;
7185                   *buf++ = XINT (components);
7186                 }
7187               else if (CONSP (components))
7188                 {
7189                   for (len = 0; CONSP (components);
7190                        len++, components = XCDR (components))
7191                     *buf++ = XINT (XCAR (components));
7192                 }
7193               else
7194                 abort ();
7195               *head -= len;
7196             }
7197         }
7198
7199       if (find_composition (end, limit, &start, &end, &prop,
7200                             coding->src_object)
7201           && end <= limit)
7202         *stop = start;
7203       else
7204         *stop = limit;
7205     }
7206   return buf;
7207 }
7208
7209
7210 /* Extract an annotation datum from a text property `charset' at POS of
7211    CODING->src_object (buffer of string), store the data in BUF, set
7212    *STOP to the position where the value of `charset' property changes
7213    (limiting by LIMIT), and return the address of the next element of
7214    BUF.
7215
7216    If the property value is nil, set *STOP to the position where the
7217    property value is non-nil (limiting by LIMIT), and return BUF.  */
7218
7219 static inline int *
7220 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7221                            struct coding_system *coding, int *buf,
7222                            ptrdiff_t *stop)
7223 {
7224   Lisp_Object val, next;
7225   int id;
7226
7227   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7228   if (! NILP (val) && CHARSETP (val))
7229     id = XINT (CHARSET_SYMBOL_ID (val));
7230   else
7231     id = -1;
7232   ADD_CHARSET_DATA (buf, 0, id);
7233   next = Fnext_single_property_change (make_number (pos), Qcharset,
7234                                        coding->src_object,
7235                                        make_number (limit));
7236   *stop = XINT (next);
7237   return buf;
7238 }
7239
7240
7241 static void
7242 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7243                int max_lookup)
7244 {
7245   int *buf = coding->charbuf;
7246   int *buf_end = coding->charbuf + coding->charbuf_size;
7247   const unsigned char *src = coding->source + coding->consumed;
7248   const unsigned char *src_end = coding->source + coding->src_bytes;
7249   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7250   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7251   bool multibytep = coding->src_multibyte;
7252   Lisp_Object eol_type;
7253   int c;
7254   ptrdiff_t stop, stop_composition, stop_charset;
7255   int *lookup_buf = NULL;
7256
7257   if (! NILP (translation_table))
7258     lookup_buf = alloca (sizeof (int) * max_lookup);
7259
7260   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7261   if (VECTORP (eol_type))
7262     eol_type = Qunix;
7263
7264   /* Note: composition handling is not yet implemented.  */
7265   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7266
7267   if (NILP (coding->src_object))
7268     stop = stop_composition = stop_charset = end_pos;
7269   else
7270     {
7271       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7272         stop = stop_composition = pos;
7273       else
7274         stop = stop_composition = end_pos;
7275       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7276         stop = stop_charset = pos;
7277       else
7278         stop_charset = end_pos;
7279     }
7280
7281   /* Compensate for CRLF and conversion.  */
7282   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7283   while (buf < buf_end)
7284     {
7285       Lisp_Object trans;
7286
7287       if (pos == stop)
7288         {
7289           if (pos == end_pos)
7290             break;
7291           if (pos == stop_composition)
7292             buf = handle_composition_annotation (pos, end_pos, coding,
7293                                                  buf, &stop_composition);
7294           if (pos == stop_charset)
7295             buf = handle_charset_annotation (pos, end_pos, coding,
7296                                              buf, &stop_charset);
7297           stop = (stop_composition < stop_charset
7298                   ? stop_composition : stop_charset);
7299         }
7300
7301       if (! multibytep)
7302         {
7303           int bytes;
7304
7305           if (coding->encoder == encode_coding_raw_text
7306               || coding->encoder == encode_coding_ccl)
7307             c = *src++, pos++;
7308           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7309             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7310           else
7311             c = BYTE8_TO_CHAR (*src), src++, pos++;
7312         }
7313       else
7314         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7315       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7316         c = '\n';
7317       if (! EQ (eol_type, Qunix))
7318         {
7319           if (c == '\n')
7320             {
7321               if (EQ (eol_type, Qdos))
7322                 *buf++ = '\r';
7323               else
7324                 c = '\r';
7325             }
7326         }
7327
7328       trans = Qnil;
7329       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7330       if (NILP (trans))
7331         *buf++ = c;
7332       else
7333         {
7334           ptrdiff_t from_nchars = 1, to_nchars = 1;
7335           int *lookup_buf_end;
7336           const unsigned char *p = src;
7337           int i;
7338
7339           lookup_buf[0] = c;
7340           for (i = 1; i < max_lookup && p < src_end; i++)
7341             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7342           lookup_buf_end = lookup_buf + i;
7343           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7344           if (INTEGERP (trans))
7345             c = XINT (trans);
7346           else if (CONSP (trans))
7347             {
7348               from_nchars = ASIZE (XCAR (trans));
7349               trans = XCDR (trans);
7350               if (INTEGERP (trans))
7351                 c = XINT (trans);
7352               else
7353                 {
7354                   to_nchars = ASIZE (trans);
7355                   if (buf_end - buf < to_nchars)
7356                     break;
7357                   c = XINT (AREF (trans, 0));
7358                 }
7359             }
7360           else
7361             break;
7362           *buf++ = c;
7363           for (i = 1; i < to_nchars; i++)
7364             *buf++ = XINT (AREF (trans, i));
7365           for (i = 1; i < from_nchars; i++, pos++)
7366             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7367         }
7368     }
7369
7370   coding->consumed = src - coding->source;
7371   coding->consumed_char = pos - coding->src_pos;
7372   coding->charbuf_used = buf - coding->charbuf;
7373   coding->chars_at_source = 0;
7374 }
7375
7376
7377 /* Encode the text at CODING->src_object into CODING->dst_object.
7378    CODING->src_object is a buffer or a string.
7379    CODING->dst_object is a buffer or nil.
7380
7381    If CODING->src_object is a buffer, it must be the current buffer.
7382    In this case, if CODING->src_pos is positive, it is a position of
7383    the source text in the buffer, otherwise. the source text is in the
7384    gap area of the buffer, and coding->src_pos specifies the offset of
7385    the text from GPT (which must be the same as PT).  If this is the
7386    same buffer as CODING->dst_object, CODING->src_pos must be
7387    negative and CODING should not have `pre-write-conversion'.
7388
7389    If CODING->src_object is a string, CODING should not have
7390    `pre-write-conversion'.
7391
7392    If CODING->dst_object is a buffer, the encoded data is inserted at
7393    the current point of that buffer.
7394
7395    If CODING->dst_object is nil, the encoded data is placed at the
7396    memory area specified by CODING->destination.  */
7397
7398 static void
7399 encode_coding (struct coding_system *coding)
7400 {
7401   Lisp_Object attrs;
7402   Lisp_Object translation_table;
7403   int max_lookup;
7404   struct ccl_spec cclspec;
7405
7406   attrs = CODING_ID_ATTRS (coding->id);
7407   if (coding->encoder == encode_coding_raw_text)
7408     translation_table = Qnil, max_lookup = 0;
7409   else
7410     translation_table = get_translation_table (attrs, 1, &max_lookup);
7411
7412   if (BUFFERP (coding->dst_object))
7413     {
7414       set_buffer_internal (XBUFFER (coding->dst_object));
7415       coding->dst_multibyte
7416         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7417     }
7418
7419   coding->consumed = coding->consumed_char = 0;
7420   coding->produced = coding->produced_char = 0;
7421   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7422   coding->errors = 0;
7423
7424   ALLOC_CONVERSION_WORK_AREA (coding);
7425
7426   if (coding->encoder == encode_coding_ccl)
7427     {
7428       coding->spec.ccl = &cclspec;
7429       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7430     }
7431   do {
7432     coding_set_source (coding);
7433     consume_chars (coding, translation_table, max_lookup);
7434     coding_set_destination (coding);
7435     (*(coding->encoder)) (coding);
7436   } while (coding->consumed_char < coding->src_chars);
7437
7438   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7439     insert_from_gap (coding->produced_char, coding->produced);
7440 }
7441
7442
/* Name (or base name) of the work buffers used for code conversion.
   Names of additional work buffers are generated from it.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* The work buffer used by the top-level conversion.  Once created it
   is kept and reused rather than killed after each conversion; its
   name is the value of Vcode_conversion_workbuf_name.  Any other work
   buffer is killed when its conversion finishes, and its name is a
   modified version of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is currently in use, in
   which case a conversion needing a work buffer must create a fresh
   one instead.  */
static bool reused_workbuf_in_use;
7455
7456
7457 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7458    multibyteness of returning buffer.  */
7459
7460 static Lisp_Object
7461 make_conversion_work_buffer (bool multibyte)
7462 {
7463   Lisp_Object name, workbuf;
7464   struct buffer *current;
7465
7466   if (reused_workbuf_in_use)
7467     {
7468       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7469       workbuf = Fget_buffer_create (name);
7470     }
7471   else
7472     {
7473       reused_workbuf_in_use = 1;
7474       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7475         Vcode_conversion_reused_workbuf
7476           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7477       workbuf = Vcode_conversion_reused_workbuf;
7478     }
7479   current = current_buffer;
7480   set_buffer_internal (XBUFFER (workbuf));
7481   /* We can't allow modification hooks to run in the work buffer.  For
7482      instance, directory_files_internal assumes that file decoding
7483      doesn't compile new regexps.  */
7484   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7485   Ferase_buffer ();
7486   bset_undo_list (current_buffer, Qt);
7487   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7488   set_buffer_internal (current);
7489   return workbuf;
7490 }
7491
7492
7493 static Lisp_Object
7494 code_conversion_restore (Lisp_Object arg)
7495 {
7496   Lisp_Object current, workbuf;
7497   struct gcpro gcpro1;
7498
7499   GCPRO1 (arg);
7500   current = XCAR (arg);
7501   workbuf = XCDR (arg);
7502   if (! NILP (workbuf))
7503     {
7504       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7505         reused_workbuf_in_use = 0;
7506       else
7507         Fkill_buffer (workbuf);
7508     }
7509   set_buffer_internal (XBUFFER (current));
7510   UNGCPRO;
7511   return Qnil;
7512 }
7513
7514 Lisp_Object
7515 code_conversion_save (bool with_work_buf, bool multibyte)
7516 {
7517   Lisp_Object workbuf = Qnil;
7518
7519   if (with_work_buf)
7520     workbuf = make_conversion_work_buffer (multibyte);
7521   record_unwind_protect (code_conversion_restore,
7522                          Fcons (Fcurrent_buffer (), workbuf));
7523   return workbuf;
7524 }
7525
7526 void
7527 decode_coding_gap (struct coding_system *coding,
7528                    ptrdiff_t chars, ptrdiff_t bytes)
7529 {
7530   ptrdiff_t count = SPECPDL_INDEX ();
7531   Lisp_Object attrs;
7532
7533   code_conversion_save (0, 0);
7534
7535   coding->src_object = Fcurrent_buffer ();
7536   coding->src_chars = chars;
7537   coding->src_bytes = bytes;
7538   coding->src_pos = -chars;
7539   coding->src_pos_byte = -bytes;
7540   coding->src_multibyte = chars < bytes;
7541   coding->dst_object = coding->src_object;
7542   coding->dst_pos = PT;
7543   coding->dst_pos_byte = PT_BYTE;
7544   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7545
7546   if (CODING_REQUIRE_DETECTION (coding))
7547     detect_coding (coding);
7548
7549   coding->mode |= CODING_MODE_LAST_BLOCK;
7550   current_buffer->text->inhibit_shrinking = 1;
7551   decode_coding (coding);
7552   current_buffer->text->inhibit_shrinking = 0;
7553
7554   attrs = CODING_ID_ATTRS (coding->id);
7555   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7556     {
7557       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7558       Lisp_Object val;
7559
7560       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7561       val = call1 (CODING_ATTR_POST_READ (attrs),
7562                    make_number (coding->produced_char));
7563       CHECK_NATNUM (val);
7564       coding->produced_char += Z - prev_Z;
7565       coding->produced += Z_BYTE - prev_Z_BYTE;
7566     }
7567
7568   unbind_to (count, Qnil);
7569 }
7570
7571
7572 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7573    SRC_OBJECT into DST_OBJECT by coding context CODING.
7574
7575    SRC_OBJECT is a buffer, a string, or Qnil.
7576
7577    If it is a buffer, the text is at point of the buffer.  FROM and TO
7578    are positions in the buffer.
7579
7580    If it is a string, the text is at the beginning of the string.
7581    FROM and TO are indices to the string.
7582
7583    If it is nil, the text is at coding->source.  FROM and TO are
7584    indices to coding->source.
7585
7586    DST_OBJECT is a buffer, Qt, or Qnil.
7587
7588    If it is a buffer, the decoded text is inserted at point of the
7589    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7590    is deleted.
7591
7592    If it is Qt, a string is made from the decoded text, and
7593    set in CODING->dst_object.
7594
7595    If it is Qnil, the decoded text is stored at CODING->destination.
7596    The caller must allocate CODING->dst_bytes bytes at
7597    CODING->destination by xmalloc.  If the decoded text is longer than
7598    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7599  */
7600
7601 void
7602 decode_coding_object (struct coding_system *coding,
7603                       Lisp_Object src_object,
7604                       ptrdiff_t from, ptrdiff_t from_byte,
7605                       ptrdiff_t to, ptrdiff_t to_byte,
7606                       Lisp_Object dst_object)
7607 {
7608   ptrdiff_t count = SPECPDL_INDEX ();
7609   unsigned char *destination IF_LINT (= NULL);
7610   ptrdiff_t dst_bytes IF_LINT (= 0);
7611   ptrdiff_t chars = to - from;
7612   ptrdiff_t bytes = to_byte - from_byte;
7613   Lisp_Object attrs;
7614   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7615   bool need_marker_adjustment = 0;
7616   Lisp_Object old_deactivate_mark;
7617
7618   old_deactivate_mark = Vdeactivate_mark;
7619
7620   if (NILP (dst_object))
7621     {
7622       destination = coding->destination;
7623       dst_bytes = coding->dst_bytes;
7624     }
7625
7626   coding->src_object = src_object;
7627   coding->src_chars = chars;
7628   coding->src_bytes = bytes;
7629   coding->src_multibyte = chars < bytes;
7630
7631   if (STRINGP (src_object))
7632     {
7633       coding->src_pos = from;
7634       coding->src_pos_byte = from_byte;
7635     }
7636   else if (BUFFERP (src_object))
7637     {
7638       set_buffer_internal (XBUFFER (src_object));
7639       if (from != GPT)
7640         move_gap_both (from, from_byte);
7641       if (EQ (src_object, dst_object))
7642         {
7643           struct Lisp_Marker *tail;
7644
7645           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7646             {
7647               tail->need_adjustment
7648                 = tail->charpos == (tail->insertion_type ? from : to);
7649               need_marker_adjustment |= tail->need_adjustment;
7650             }
7651           saved_pt = PT, saved_pt_byte = PT_BYTE;
7652           TEMP_SET_PT_BOTH (from, from_byte);
7653           current_buffer->text->inhibit_shrinking = 1;
7654           del_range_both (from, from_byte, to, to_byte, 1);
7655           coding->src_pos = -chars;
7656           coding->src_pos_byte = -bytes;
7657         }
7658       else
7659         {
7660           coding->src_pos = from;
7661           coding->src_pos_byte = from_byte;
7662         }
7663     }
7664
7665   if (CODING_REQUIRE_DETECTION (coding))
7666     detect_coding (coding);
7667   attrs = CODING_ID_ATTRS (coding->id);
7668
7669   if (EQ (dst_object, Qt)
7670       || (! NILP (CODING_ATTR_POST_READ (attrs))
7671           && NILP (dst_object)))
7672     {
7673       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7674       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7675       coding->dst_pos = BEG;
7676       coding->dst_pos_byte = BEG_BYTE;
7677     }
7678   else if (BUFFERP (dst_object))
7679     {
7680       code_conversion_save (0, 0);
7681       coding->dst_object = dst_object;
7682       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7683       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7684       coding->dst_multibyte
7685         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7686     }
7687   else
7688     {
7689       code_conversion_save (0, 0);
7690       coding->dst_object = Qnil;
7691       /* Most callers presume this will return a multibyte result, and they
7692          won't use `binary' or `raw-text' anyway, so let's not worry about
7693          CODING_FOR_UNIBYTE.  */
7694       coding->dst_multibyte = 1;
7695     }
7696
7697   decode_coding (coding);
7698
7699   if (BUFFERP (coding->dst_object))
7700     set_buffer_internal (XBUFFER (coding->dst_object));
7701
7702   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7703     {
7704       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7705       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7706       Lisp_Object val;
7707
7708       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7709       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7710               old_deactivate_mark);
7711       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7712                         make_number (coding->produced_char));
7713       UNGCPRO;
7714       CHECK_NATNUM (val);
7715       coding->produced_char += Z - prev_Z;
7716       coding->produced += Z_BYTE - prev_Z_BYTE;
7717     }
7718
7719   if (EQ (dst_object, Qt))
7720     {
7721       coding->dst_object = Fbuffer_string ();
7722     }
7723   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7724     {
7725       set_buffer_internal (XBUFFER (coding->dst_object));
7726       if (dst_bytes < coding->produced)
7727         {
7728           destination = xrealloc (destination, coding->produced);
7729           if (! destination)
7730             {
7731               record_conversion_result (coding,
7732                                         CODING_RESULT_INSUFFICIENT_MEM);
7733               unbind_to (count, Qnil);
7734               return;
7735             }
7736           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7737             move_gap_both (BEGV, BEGV_BYTE);
7738           memcpy (destination, BEGV_ADDR, coding->produced);
7739           coding->destination = destination;
7740         }
7741     }
7742
7743   if (saved_pt >= 0)
7744     {
7745       /* This is the case of:
7746          (BUFFERP (src_object) && EQ (src_object, dst_object))
7747          As we have moved PT while replacing the original buffer
7748          contents, we must recover it now.  */
7749       set_buffer_internal (XBUFFER (src_object));
7750       current_buffer->text->inhibit_shrinking = 0;
7751       if (saved_pt < from)
7752         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7753       else if (saved_pt < from + chars)
7754         TEMP_SET_PT_BOTH (from, from_byte);
7755       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7756         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7757                           saved_pt_byte + (coding->produced - bytes));
7758       else
7759         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7760                           saved_pt_byte + (coding->produced - bytes));
7761
7762       if (need_marker_adjustment)
7763         {
7764           struct Lisp_Marker *tail;
7765
7766           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7767             if (tail->need_adjustment)
7768               {
7769                 tail->need_adjustment = 0;
7770                 if (tail->insertion_type)
7771                   {
7772                     tail->bytepos = from_byte;
7773                     tail->charpos = from;
7774                   }
7775                 else
7776                   {
7777                     tail->bytepos = from_byte + coding->produced;
7778                     tail->charpos
7779                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7780                          ? tail->bytepos : from + coding->produced_char);
7781                   }
7782               }
7783         }
7784     }
7785
7786   Vdeactivate_mark = old_deactivate_mark;
7787   unbind_to (count, coding->dst_object);
7788 }
7789
7790
7791 void
7792 encode_coding_object (struct coding_system *coding,
7793                       Lisp_Object src_object,
7794                       ptrdiff_t from, ptrdiff_t from_byte,
7795                       ptrdiff_t to, ptrdiff_t to_byte,
7796                       Lisp_Object dst_object)
7797 {
7798   ptrdiff_t count = SPECPDL_INDEX ();
7799   ptrdiff_t chars = to - from;
7800   ptrdiff_t bytes = to_byte - from_byte;
7801   Lisp_Object attrs;
7802   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7803   bool need_marker_adjustment = 0;
7804   bool kill_src_buffer = 0;
7805   Lisp_Object old_deactivate_mark;
7806
7807   old_deactivate_mark = Vdeactivate_mark;
7808
7809   coding->src_object = src_object;
7810   coding->src_chars = chars;
7811   coding->src_bytes = bytes;
7812   coding->src_multibyte = chars < bytes;
7813
7814   attrs = CODING_ID_ATTRS (coding->id);
7815
7816   if (EQ (src_object, dst_object))
7817     {
7818       struct Lisp_Marker *tail;
7819
7820       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7821         {
7822           tail->need_adjustment
7823             = tail->charpos == (tail->insertion_type ? from : to);
7824           need_marker_adjustment |= tail->need_adjustment;
7825         }
7826     }
7827
7828   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7829     {
7830       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7831       set_buffer_internal (XBUFFER (coding->src_object));
7832       if (STRINGP (src_object))
7833         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7834       else if (BUFFERP (src_object))
7835         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7836       else
7837         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7838
7839       if (EQ (src_object, dst_object))
7840         {
7841           set_buffer_internal (XBUFFER (src_object));
7842           saved_pt = PT, saved_pt_byte = PT_BYTE;
7843           del_range_both (from, from_byte, to, to_byte, 1);
7844           set_buffer_internal (XBUFFER (coding->src_object));
7845         }
7846
7847       {
7848         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7849
7850         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7851                 old_deactivate_mark);
7852         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7853                     make_number (BEG), make_number (Z));
7854         UNGCPRO;
7855       }
7856       if (XBUFFER (coding->src_object) != current_buffer)
7857         kill_src_buffer = 1;
7858       coding->src_object = Fcurrent_buffer ();
7859       if (BEG != GPT)
7860         move_gap_both (BEG, BEG_BYTE);
7861       coding->src_chars = Z - BEG;
7862       coding->src_bytes = Z_BYTE - BEG_BYTE;
7863       coding->src_pos = BEG;
7864       coding->src_pos_byte = BEG_BYTE;
7865       coding->src_multibyte = Z < Z_BYTE;
7866     }
7867   else if (STRINGP (src_object))
7868     {
7869       code_conversion_save (0, 0);
7870       coding->src_pos = from;
7871       coding->src_pos_byte = from_byte;
7872     }
7873   else if (BUFFERP (src_object))
7874     {
7875       code_conversion_save (0, 0);
7876       set_buffer_internal (XBUFFER (src_object));
7877       if (EQ (src_object, dst_object))
7878         {
7879           saved_pt = PT, saved_pt_byte = PT_BYTE;
7880           coding->src_object = del_range_1 (from, to, 1, 1);
7881           coding->src_pos = 0;
7882           coding->src_pos_byte = 0;
7883         }
7884       else
7885         {
7886           if (from < GPT && to >= GPT)
7887             move_gap_both (from, from_byte);
7888           coding->src_pos = from;
7889           coding->src_pos_byte = from_byte;
7890         }
7891     }
7892   else
7893     code_conversion_save (0, 0);
7894
7895   if (BUFFERP (dst_object))
7896     {
7897       coding->dst_object = dst_object;
7898       if (EQ (src_object, dst_object))
7899         {
7900           coding->dst_pos = from;
7901           coding->dst_pos_byte = from_byte;
7902         }
7903       else
7904         {
7905           struct buffer *current = current_buffer;
7906
7907           set_buffer_temp (XBUFFER (dst_object));
7908           coding->dst_pos = PT;
7909           coding->dst_pos_byte = PT_BYTE;
7910           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7911           set_buffer_temp (current);
7912         }
7913       coding->dst_multibyte
7914         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7915     }
7916   else if (EQ (dst_object, Qt))
7917     {
7918       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7919       coding->dst_object = Qnil;
7920       coding->destination = xmalloc (dst_bytes);
7921       coding->dst_bytes = dst_bytes;
7922       coding->dst_multibyte = 0;
7923     }
7924   else
7925     {
7926       coding->dst_object = Qnil;
7927       coding->dst_multibyte = 0;
7928     }
7929
7930   encode_coding (coding);
7931
7932   if (EQ (dst_object, Qt))
7933     {
7934       if (BUFFERP (coding->dst_object))
7935         coding->dst_object = Fbuffer_string ();
7936       else
7937         {
7938           coding->dst_object
7939             = make_unibyte_string ((char *) coding->destination,
7940                                    coding->produced);
7941           xfree (coding->destination);
7942         }
7943     }
7944
7945   if (saved_pt >= 0)
7946     {
7947       /* This is the case of:
7948          (BUFFERP (src_object) && EQ (src_object, dst_object))
7949          As we have moved PT while replacing the original buffer
7950          contents, we must recover it now.  */
7951       set_buffer_internal (XBUFFER (src_object));
7952       if (saved_pt < from)
7953         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7954       else if (saved_pt < from + chars)
7955         TEMP_SET_PT_BOTH (from, from_byte);
7956       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7957         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7958                           saved_pt_byte + (coding->produced - bytes));
7959       else
7960         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7961                           saved_pt_byte + (coding->produced - bytes));
7962
7963       if (need_marker_adjustment)
7964         {
7965           struct Lisp_Marker *tail;
7966
7967           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7968             if (tail->need_adjustment)
7969               {
7970                 tail->need_adjustment = 0;
7971                 if (tail->insertion_type)
7972                   {
7973                     tail->bytepos = from_byte;
7974                     tail->charpos = from;
7975                   }
7976                 else
7977                   {
7978                     tail->bytepos = from_byte + coding->produced;
7979                     tail->charpos
7980                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7981                          ? tail->bytepos : from + coding->produced_char);
7982                   }
7983               }
7984         }
7985     }
7986
7987   if (kill_src_buffer)
7988     Fkill_buffer (coding->src_object);
7989
7990   Vdeactivate_mark = old_deactivate_mark;
7991   unbind_to (count, Qnil);
7992 }
7993
7994 /* Return the name of the coding system of the first-priority category.  */
7995 Lisp_Object
7996 preferred_coding_system (void)
7997 {
7998   enum coding_category first = coding_priorities[0];
7999   int first_id = coding_categories[first].id;
8000   return CODING_ID_NAME (first_id);
8001 }
8002
8003 \f
8004 #ifdef emacs
8005 /*** 8. Emacs Lisp library functions ***/
8006
8007 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8008        doc: /* Return t if OBJECT is nil or a coding-system.
8009 See the documentation of `define-coding-system' for information
8010 about coding-system objects.  */)
8011   (Lisp_Object object)
8012 {
8013   /* nil, and any OBJECT with a non-negative CODING_SYSTEM_ID, qualify.  */
8014   if (NILP (object) || CODING_SYSTEM_ID (object) >= 0)
8015     return Qt;
8016   /* Otherwise OBJECT must be a symbol that has a stored define form.  */
8017   return ((SYMBOLP (object)
8018            && ! NILP (Fget (object, Qcoding_system_define_form)))
8019           ? Qt : Qnil);
8020 }
8021
8022 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8023        Sread_non_nil_coding_system, 1, 1, 0,
8024        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8025   (Lisp_Object prompt)
8026 {
8027   Lisp_Object name;
8028   for (;;)   /* Re-prompt until the user enters a non-empty name.  */
8029     {
8030       name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
8031                                Qnil, Qcoding_system_history, Qnil, Qnil);
8032       if (SCHARS (name) != 0)
8033         return Fintern (name, Qnil);
8034     }
8035 }
8036
8037 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8038        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8039 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8040 Ignores case when completing coding systems (all Emacs coding systems
8041 are lower-case).  */)
8042   (Lisp_Object prompt, Lisp_Object default_coding_system)
8043 {
8044   ptrdiff_t specpdl_mark = SPECPDL_INDEX ();
8045   Lisp_Object fallback = default_coding_system, answer;
8046
8047   /* A symbol default is handed to Fcompleting_read as its name string.  */
8048   if (SYMBOLP (fallback))
8049     fallback = SYMBOL_NAME (fallback);
8050   specbind (Qcompletion_ignore_case, Qt);
8051   answer = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8052                              Qcoding_system_history, fallback, Qnil);
8053   unbind_to (specpdl_mark, Qnil);
8054   return SCHARS (answer) != 0 ? Fintern (answer, Qnil) : Qnil;
8055 }
8056
8057 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8058        1, 1, 0,
8059        doc: /* Check validity of CODING-SYSTEM.
8060 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8061 It is valid if it is nil or a symbol defined as a coding system by the
8062 function `define-coding-system'.  */)
8063   (Lisp_Object coding_system)
8064 {
8065   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8066
8067   /* Clear a stored define form before evaluating it.  */
8068   if (! NILP (pending))
8069     {
8070       Fput (coding_system, Qcoding_system_define_form, Qnil);
8071       safe_eval (pending);
8072     }
8073   if (NILP (Fcoding_system_p (coding_system)))
8074     xsignal1 (Qcoding_system_error, coding_system);
8075   return coding_system;
8076 }
8077
8078 \f
8079 /* Detect how the SRC_BYTES bytes at SRC (SRC_CHARS characters) are
8080    encoded.  If HIGHEST, return the coding system of the highest
8081    priority among the detected coding systems, or nil if none was
8082    detected.  Otherwise return a list of the detected coding systems
8083    sorted by their priorities.  If MULTIBYTEP, it is assumed that the
8084    bytes are in correct multibyte form but contain only ASCII and
8085    eight-bit chars.  Otherwise, the bytes are raw bytes.
8086
8087    CODING_SYSTEM controls the detection as below:
8088
8089    If it is nil, detect both text-format and eol-format.  If the
8090    text-format part of CODING_SYSTEM is already specified
8091    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8092    part of CODING_SYSTEM is already specified (e.g. `undecided-unix'),
8093    detect only text-format.  */
8094
8095 Lisp_Object
8096 detect_coding_system (const unsigned char *src,
8097                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8098                       bool highest, bool multibytep,
8099                       Lisp_Object coding_system)
8100 {
8101   const unsigned char *src_end = src + src_bytes;
8102   Lisp_Object attrs, eol_type;
8103   Lisp_Object val = Qnil;
8104   struct coding_system coding;
8105   ptrdiff_t id;
8106   struct coding_detection_info detect_info;
8107   enum coding_category base_category;
8108   bool null_byte_found = 0, eight_bit_found = 0;
8109
8110   if (NILP (coding_system))
8111     coding_system = Qundecided;
8112   setup_coding_system (coding_system, &coding);
8113   attrs = CODING_ID_ATTRS (coding.id);
8114   eol_type = CODING_ID_EOL_TYPE (coding.id);
8115   coding_system = CODING_ATTR_BASE_NAME (attrs);
8116
8117   coding.source = src;
8118   coding.src_chars = src_chars;
8119   coding.src_bytes = src_bytes;
8120   coding.src_multibyte = multibytep;
8121   coding.consumed = 0;
8122   coding.mode |= CODING_MODE_LAST_BLOCK;
8123   coding.head_ascii = 0;  /* Counts the leading 7-bit bytes.  */
8124
8125   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8126
8127   /* At first, detect text-format if necessary.  */
8128   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8129   if (base_category == coding_category_undecided)
8130     {
8131       enum coding_category category IF_LINT (= 0);
8132       struct coding_system *this IF_LINT (= NULL);
8133       int c, i;
8134
8135       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8136       for (; src < src_end; src++)
8137         {
8138           c = *src;
8139           if (c & 0x80)
8140             {
8141               eight_bit_found = 1;
8142               if (null_byte_found)
8143                 break;
8144             }
8145           else if (c < 0x20)
8146             {
8147               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8148                   && ! inhibit_iso_escape_detection
8149                   && ! detect_info.checked)
8150                 {
8151                   if (detect_coding_iso_2022 (&coding, &detect_info))
8152                     {
8153                       /* We have scanned the whole data.  */
8154                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8155                         {
8156                           /* We didn't find an 8-bit code.  We may
8157                              have found a null-byte, but it's very
8158                              rare that a binary file conforms to
8159                              ISO-2022.  */
8160                           src = src_end;
8161                           coding.head_ascii = src - coding.source;
8162                         }
8163                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8164                       break;
8165                     }
8166                 }
8167               else if (! c && !inhibit_null_byte_detection)
8168                 {
8169                   null_byte_found = 1;
8170                   if (eight_bit_found)
8171                     break;
8172                 }
8173               if (! eight_bit_found)
8174                 coding.head_ascii++;
8175             }
8176           else if (! eight_bit_found)
8177             coding.head_ascii++;
8178         }
8179
8180       if (null_byte_found || eight_bit_found
8181           || coding.head_ascii < coding.src_bytes
8182           || detect_info.found)
8183         {
8184           if (coding.head_ascii == coding.src_bytes)
8185             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8186             for (i = 0; i < coding_category_raw_text; i++)
8187               {
8188                 category = coding_priorities[i];
8189                 this = coding_categories + category;
8190                 if (detect_info.found & (1 << category))
8191                   break;
8192               }
8193           else
8194             {
8195               if (null_byte_found)  /* Only UTF-16 categories remain candidates.  */
8196                 {
8197                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8198                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8199                 }
8200               for (i = 0; i < coding_category_raw_text; i++)
8201                 {
8202                   category = coding_priorities[i];
8203                   this = coding_categories + category;
8204
8205                   if (this->id < 0)
8206                     {
8207                       /* No coding system of this category is defined.  */
8208                       detect_info.rejected |= (1 << category);
8209                     }
8210                   else if (category >= coding_category_raw_text)
8211                     continue;
8212                   else if (detect_info.checked & (1 << category))
8213                     {
8214                       if (highest
8215                           && (detect_info.found & (1 << category)))
8216                         break;
8217                     }
8218                   else if ((*(this->detector)) (&coding, &detect_info)
8219                            && highest
8220                            && (detect_info.found & (1 << category)))
8221                     {
8222                       if (category == coding_category_utf_16_auto)
8223                         {
8224                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8225                             category = coding_category_utf_16_le;
8226                           else
8227                             category = coding_category_utf_16_be;
8228                         }
8229                       break;
8230                     }
8231                 }
8232             }
8233         }
8234       /* Build VAL, a list of coding system ids, from the result.  */
8235       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8236           || null_byte_found)
8237         {
8238           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8239           id = CODING_SYSTEM_ID (Qno_conversion);
8240           val = Fcons (make_number (id), Qnil);
8241         }
8242       else if (! detect_info.rejected && ! detect_info.found)
8243         {
8244           detect_info.found = CATEGORY_MASK_ANY;
8245           id = coding_categories[coding_category_undecided].id;
8246           val = Fcons (make_number (id), Qnil);
8247         }
8248       else if (highest)
8249         {
8250           if (detect_info.found)
8251             {
8252               detect_info.found = 1 << category;
8253               val = Fcons (make_number (this->id), Qnil);
8254             }
8255           else
8256             for (i = 0; i < coding_category_raw_text; i++)
8257               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8258                 {
8259                   detect_info.found = 1 << coding_priorities[i];
8260                   id = coding_categories[coding_priorities[i]].id;
8261                   val = Fcons (make_number (id), Qnil);
8262                   break;
8263                 }
8264         }
8265       else
8266         {
8267           int mask = detect_info.rejected | detect_info.found;
8268           int found = 0;
8269
8270           for (i = coding_category_raw_text - 1; i >= 0; i--)
8271             {
8272               category = coding_priorities[i];
8273               if (! (mask & (1 << category)))
8274                 {
8275                   found |= 1 << category;
8276                   id = coding_categories[category].id;
8277                   if (id >= 0)
8278                     val = Fcons (make_number (id), val);
8279                 }
8280             }
8281           for (i = coding_category_raw_text - 1; i >= 0; i--)
8282             {
8283               category = coding_priorities[i];
8284               if (detect_info.found & (1 << category))
8285                 {
8286                   id = coding_categories[category].id;
8287                   val = Fcons (make_number (id), val);
8288                 }
8289             }
8290           detect_info.found |= found;
8291         }
8292     }
8293   else if (base_category == coding_category_utf_8_auto)
8294     {
8295       if (detect_coding_utf_8 (&coding, &detect_info))
8296         {
8297           struct coding_system *this;
8298
8299           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8300             this = coding_categories + coding_category_utf_8_sig;
8301           else
8302             this = coding_categories + coding_category_utf_8_nosig;
8303           val = Fcons (make_number (this->id), Qnil);
8304         }
8305     }
8306   else if (base_category == coding_category_utf_16_auto)
8307     {
8308       if (detect_coding_utf_16 (&coding, &detect_info))
8309         {
8310           struct coding_system *this;
8311
8312           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8313             this = coding_categories + coding_category_utf_16_le;
8314           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8315             this = coding_categories + coding_category_utf_16_be;
8316           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8317             this = coding_categories + coding_category_utf_16_be_nosig;
8318           else
8319             this = coding_categories + coding_category_utf_16_le_nosig;
8320           val = Fcons (make_number (this->id), Qnil);
8321         }
8322     }
8323   else
8324     {
8325       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8326       val = Fcons (make_number (coding.id), Qnil);
8327     }
8328
8329   /* Then, detect eol-format if necessary.  */
8330   {
8331     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8332     Lisp_Object tail;
8333     /* A vector eol-type means the eol format must still be detected.  */
8334     if (VECTORP (eol_type))
8335       {
8336         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8337           {
8338             if (null_byte_found)
8339               normal_eol = EOL_SEEN_LF;
8340             else
8341               normal_eol = detect_eol (coding.source, src_bytes,
8342                                        coding_category_raw_text);
8343           }
8344         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8345                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8346           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8347                                       coding_category_utf_16_be);
8348         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8349                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8350           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8351                                       coding_category_utf_16_le);
8352       }
8353     else
8354       {
8355         if (EQ (eol_type, Qunix))
8356           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8357         else if (EQ (eol_type, Qdos))
8358           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8359         else
8360           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8361       }
8362     /* Replace each id in VAL by a coding system matching the eol result.  */
8363     for (tail = val; CONSP (tail); tail = XCDR (tail))
8364       {
8365         enum coding_category category;
8366         int this_eol;
8367
8368         id = XINT (XCAR (tail));
8369         attrs = CODING_ID_ATTRS (id);
8370         category = XINT (CODING_ATTR_CATEGORY (attrs));
8371         eol_type = CODING_ID_EOL_TYPE (id);
8372         if (VECTORP (eol_type))
8373           {
8374             if (category == coding_category_utf_16_be
8375                 || category == coding_category_utf_16_be_nosig)
8376               this_eol = utf_16_be_eol;
8377             else if (category == coding_category_utf_16_le
8378                      || category == coding_category_utf_16_le_nosig)
8379               this_eol = utf_16_le_eol;
8380             else
8381               this_eol = normal_eol;
8382
8383             if (this_eol == EOL_SEEN_LF)
8384               XSETCAR (tail, AREF (eol_type, 0));
8385             else if (this_eol == EOL_SEEN_CRLF)
8386               XSETCAR (tail, AREF (eol_type, 1));
8387             else if (this_eol == EOL_SEEN_CR)
8388               XSETCAR (tail, AREF (eol_type, 2));
8389             else
8390               XSETCAR (tail, CODING_ID_NAME (id));
8391           }
8392         else
8393           XSETCAR (tail, CODING_ID_NAME (id));
8394       }
8395   }
8396
8397   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8398 }
8399
8400
8401 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8402        2, 3, 0,
8403        doc: /* Detect coding system of the text in the region between START and END.
8404 Return a list of possible coding systems ordered by priority.
8405 The coding systems to try and their priorities follows what
8406 the function `coding-system-priority-list' (which see) returns.
8407
8408 If only ASCII characters are found (except for such ISO-2022 control
8409 characters as ESC), it returns a list of single element `undecided'
8410 or its subsidiary coding system according to a detected end-of-line
8411 format.
8412
8413 If optional argument HIGHEST is non-nil, return the coding system of
8414 highest priority.  */)
8415   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8416 {
8417   ptrdiff_t beg, beg_byte, fin, fin_byte;
8418   bool multibyte;
8419
8420   CHECK_NUMBER_COERCE_MARKER (start);
8421   CHECK_NUMBER_COERCE_MARKER (end);
8422   validate_region (&start, &end);
8423   beg = XINT (start);
8424   fin = XINT (end);
8425   beg_byte = CHAR_TO_BYTE (beg);
8426   fin_byte = CHAR_TO_BYTE (fin);
8427
8428   /* detect_coding_system reads its input as one contiguous array, so
8429      move the gap to the end of the region when it lies inside it.  */
8430   if (beg < GPT && GPT <= fin)
8431     move_gap_both (fin, fin_byte);
8432
8433   multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8434   return detect_coding_system (BYTE_POS_ADDR (beg_byte),
8435                                fin - beg, fin_byte - beg_byte,
8436                                ! NILP (highest), multibyte, Qnil);
8437 }
8438
8439 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8440        1, 2, 0,
8441        doc: /* Detect coding system of the text in STRING.
8442 Return a list of possible coding systems ordered by priority.
8443 The coding systems to try and their priorities follows what
8444 the function `coding-system-priority-list' (which see) returns.
8445
8446 If only ASCII characters are found (except for such ISO-2022 control
8447 characters as ESC), it returns a list of single element `undecided'
8448 or its subsidiary coding system according to a detected end-of-line
8449 format.
8450
8451 If optional argument HIGHEST is non-nil, return the coding system of
8452 highest priority.  */)
8453   (Lisp_Object string, Lisp_Object highest)
8454 {
8455   bool want_highest = ! NILP (highest);
8456
8457   CHECK_STRING (string);
8458   /* A nil coding system requests both text-format and eol-format.  */
8459   return detect_coding_system (SDATA (string), SCHARS (string), SBYTES (string),
8460                                want_highest, STRING_MULTIBYTE (string), Qnil);
8461 }
8462
8463
8464 static inline bool
8465 char_encodable_p (int c, Lisp_Object attrs)
8466 {
8467   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8468   Lisp_Object rest;
8469
8470   /* Apply the translation table stored in ATTRS, if there is one.  */
8471   if (! NILP (table))
8472     c = translate_char (table, c);
8473   /* Succeed once CHAR_CHARSET_P accepts C for a charset in the list.  */
8474   for (rest = CODING_ATTR_CHARSET_LIST (attrs); CONSP (rest); rest = XCDR (rest))
8475     {
8476       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
8477
8478       if (CHAR_CHARSET_P (c, cs))
8479         return 1;
8480     }
8481   return ! NILP (rest);
8482 }
8483
8484
8485 /* Return a list of coding systems that safely encode the text between
8486    START and END, or the string START.  If EXCLUDE is non-nil, it is a
8487    list of coding systems not to check; the returned list contains none
8488    of them.  In any case, if the text contains only ASCII or is
8489    unibyte, return t.  */
8490
8491 DEFUN ("find-coding-systems-region-internal",
8492        Ffind_coding_systems_region_internal,
8493        Sfind_coding_systems_region_internal, 2, 3, 0,
8494        doc: /* Internal use only.  */)
8495   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8496 {
8497   Lisp_Object coding_attrs_list, safe_codings;
8498   ptrdiff_t start_byte, end_byte;
8499   const unsigned char *p, *pbeg, *pend;
8500   int c;
8501   Lisp_Object tail, elt, work_table;
8502
8503   if (STRINGP (start))
8504     {
8505       if (!STRING_MULTIBYTE (start)
8506           || SCHARS (start) == SBYTES (start))
8507         return Qt;
8508       start_byte = 0;
8509       end_byte = SBYTES (start);
8510     }
8511   else
8512     {
8513       CHECK_NUMBER_COERCE_MARKER (start);
8514       CHECK_NUMBER_COERCE_MARKER (end);
8515       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8516         args_out_of_range (start, end);
8517       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8518         return Qt;
8519       start_byte = CHAR_TO_BYTE (XINT (start));
8520       end_byte = CHAR_TO_BYTE (XINT (end));
8521       if (XINT (end) - XINT (start) == end_byte - start_byte)
8522         return Qt;
8523
8524       if (XINT (start) < GPT && XINT (end) > GPT)
8525         {
8526           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8527             move_gap_both (XINT (start), start_byte);
8528           else
8529             move_gap_both (XINT (end), end_byte);
8530         }
8531     }
8532   /* Collect the attribute vectors of the candidate base coding systems.  */
8533   coding_attrs_list = Qnil;
8534   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8535     if (NILP (exclude)
8536         || NILP (Fmemq (XCAR (tail), exclude)))
8537       {
8538         Lisp_Object attrs;
8539
8540         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8541         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8542             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8543           {
8544             ASET (attrs, coding_attr_trans_tbl,
8545                   get_translation_table (attrs, 1, NULL));
8546             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8547           }
8548       }
8549
8550   if (STRINGP (start))
8551     p = pbeg = SDATA (start);
8552   else
8553     p = pbeg = BYTE_POS_ADDR (start_byte);
8554   pend = p + (end_byte - start_byte);
8555   /* Trim the ASCII bytes at both ends of the text.  */
8556   while (p < pend && ASCII_BYTE_P (*p)) p++;
8557   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8558
8559   work_table = Fmake_char_table (Qnil, Qnil);
8560   while (p < pend)
8561     {
8562       if (ASCII_BYTE_P (*p))
8563         p++;
8564       else
8565         {
8566           c = STRING_CHAR_ADVANCE (p);
8567           if (!NILP (char_table_ref (work_table, c)))
8568             /* This character was already checked.  Ignore it.  */
8569             continue;
8570           /* Remove from the list each candidate that cannot encode C.  */
8571           charset_map_loaded = 0;
8572           for (tail = coding_attrs_list; CONSP (tail);)
8573             {
8574               elt = XCAR (tail);
8575               if (NILP (elt))
8576                 tail = XCDR (tail);
8577               else if (char_encodable_p (c, elt))
8578                 tail = XCDR (tail);
8579               else if (CONSP (XCDR (tail)))
8580                 {
8581                   XSETCAR (tail, XCAR (XCDR (tail)));
8582                   XSETCDR (tail, XCDR (XCDR (tail)));
8583                 }
8584               else
8585                 {
8586                   XSETCAR (tail, Qnil);
8587                   tail = XCDR (tail);
8588                 }
8589             }
8590           if (charset_map_loaded)
8591             {
8592               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8593               /* Rebase P and PEND on a freshly computed PBEG.  */
8594               if (STRINGP (start))
8595                 pbeg = SDATA (start);
8596               else
8597                 pbeg = BYTE_POS_ADDR (start_byte);
8598               p = pbeg + p_offset;
8599               pend = pbeg + pend_offset;
8600             }
8601           char_table_set (work_table, c, Qt);
8602         }
8603     }
8604
8605   safe_codings = list2 (Qraw_text, Qno_conversion);
8606   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8607     if (! NILP (XCAR (tail)))
8608       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8609
8610   return safe_codings;
8611 }
8612
8613
8614 DEFUN ("unencodable-char-position", Funencodable_char_position,
8615        Sunencodable_char_position, 3, 5, 0,
8616        doc: /*
8617 Return position of first un-encodable character in a region.
8618 START and END specify the region and CODING-SYSTEM specifies the
8619 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8620
8621 If optional 4th argument COUNT is non-nil, it specifies at most how
8622 many un-encodable characters to search.  In this case, the value is a
8623 list of positions.
8624
8625 If optional 5th argument STRING is non-nil, it is a string to search
8626 for un-encodable characters.  In that case, START and END are indexes
8627 to the string.  */)
8628   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8629 {
8630   EMACS_INT n;
8631   struct coding_system coding;
8632   Lisp_Object attrs, charset_list, translation_table;
8633   Lisp_Object positions;
8634   ptrdiff_t from, to;
8635   const unsigned char *p, *stop, *pend;
8636   bool ascii_compatible;
8637
8638   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8639   attrs = CODING_ID_ATTRS (coding.id);
8640   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8641     return Qnil;
8642   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8643   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8644   translation_table = get_translation_table (attrs, 1, NULL);
8645   /* Set P, STOP and PEND to delimit the text, in the buffer or in STRING.  */
8646   if (NILP (string))
8647     {
8648       validate_region (&start, &end);
8649       from = XINT (start);
8650       to = XINT (end);
8651       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8652           || (ascii_compatible
8653               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8654         return Qnil;
8655       p = CHAR_POS_ADDR (from);
8656       pend = CHAR_POS_ADDR (to);
8657       if (from < GPT && to >= GPT)
8658         stop = GPT_ADDR;
8659       else
8660         stop = pend;
8661     }
8662   else
8663     {
8664       CHECK_STRING (string);
8665       CHECK_NATNUM (start);
8666       CHECK_NATNUM (end);
8667       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8668         args_out_of_range_3 (string, start, end);
8669       from = XINT (start);
8670       to = XINT (end);
8671       if (! STRING_MULTIBYTE (string))
8672         return Qnil;
8673       p = SDATA (string) + string_char_to_byte (string, from);
8674       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8675       if (ascii_compatible && (to - from) == (pend - p))
8676         return Qnil;
8677     }
8678   /* N is the number of positions to collect; it is 1 when COUNT is nil.  */
8679   if (NILP (count))
8680     n = 1;
8681   else
8682     {
8683       CHECK_NATNUM (count);
8684       n = XINT (count);
8685     }
8686
8687   positions = Qnil;
8688   charset_map_loaded = 0;
8689   while (1)
8690     {
8691       int c;
8692       /* Skip a run of ASCII bytes; each one is also one character.  */
8693       if (ascii_compatible)
8694         while (p < stop && ASCII_BYTE_P (*p))
8695           p++, from++;
8696       if (p >= stop)
8697         {
8698           if (p >= pend)
8699             break;
8700           stop = pend;
8701           p = GAP_END_ADDR;  /* Resume scanning at GAP_END_ADDR.  */
8702         }
8703
8704       c = STRING_CHAR_ADVANCE (p);
8705       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8706           && ! char_charset (translate_char (translation_table, c),
8707                              charset_list, NULL))
8708         {
8709           positions = Fcons (make_number (from), positions);
8710           n--;
8711           if (n == 0)
8712             break;
8713         }
8714       /* Step past C; recompute buffer pointers if a charset map was loaded.  */
8715       from++;
8716       if (charset_map_loaded && NILP (string))
8717         {
8718           p = CHAR_POS_ADDR (from);
8719           pend = CHAR_POS_ADDR (to);
8720           if (from < GPT && to >= GPT)
8721             stop = GPT_ADDR;
8722           else
8723             stop = pend;
8724           charset_map_loaded = 0;
8725         }
8726     }
8727
8728   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8729 }
8730
8731
8732 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8733        Scheck_coding_systems_region, 3, 3, 0,
8734        doc: /* Check if the region is encodable by coding systems.
8735
8736 START and END are buffer positions specifying the region.
8737 CODING-SYSTEM-LIST is a list of coding systems to check.
8738
8739 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8740 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8741 whole region, POS0, POS1, ... are buffer positions where non-encodable
8742 characters are found.
8743
8744 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8745 value is nil.
8746
8747 START may be a string.  In that case, check if the string is
8748 encodable, and the value contains indices to the string instead of
8749 buffer positions.  END is ignored.
8750
8751 If the current buffer (or START if it is a string) is unibyte, the value
8752 is nil.  */)
8753   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8754 {
8755   Lisp_Object list;
8756   ptrdiff_t start_byte, end_byte;
8757   ptrdiff_t pos;
8758   const unsigned char *p, *pbeg, *pend;
8759   int c;
8760   Lisp_Object tail, elt, attrs;
8761
8762   if (STRINGP (start))
8763     {
8764       if (!STRING_MULTIBYTE (start)
8765           || SCHARS (start) == SBYTES (start))
8766         return Qnil;
8767       start_byte = 0;
8768       end_byte = SBYTES (start);
8769       pos = 0;
8770     }
8771   else
8772     {
8773       CHECK_NUMBER_COERCE_MARKER (start);
8774       CHECK_NUMBER_COERCE_MARKER (end);
8775       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8776         args_out_of_range (start, end);
8777       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8778         return Qnil;
8779       start_byte = CHAR_TO_BYTE (XINT (start));
8780       end_byte = CHAR_TO_BYTE (XINT (end));
8781       if (XINT (end) - XINT (start) == end_byte - start_byte)
8782         return Qnil;
8783
8784       if (XINT (start) < GPT && XINT (end) > GPT)
8785         {
8786           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8787             move_gap_both (XINT (start), start_byte);
8788           else
8789             move_gap_both (XINT (end), end_byte);
8790         }
8791       pos = XINT (start);
8792     }
8793
8794   list = Qnil;
8795   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8796     {
8797       elt = XCAR (tail);
8798       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8799       ASET (attrs, coding_attr_trans_tbl,
8800             get_translation_table (attrs, 1, NULL));
8801       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8802     }
8803
8804   if (STRINGP (start))
8805     p = pbeg = SDATA (start);
8806   else
8807     p = pbeg = BYTE_POS_ADDR (start_byte);
8808   pend = p + (end_byte - start_byte);
8809
8810   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8811   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8812
8813   while (p < pend)
8814     {
8815       if (ASCII_BYTE_P (*p))
8816         p++;
8817       else
8818         {
8819           c = STRING_CHAR_ADVANCE (p);
8820
8821           charset_map_loaded = 0;
8822           for (tail = list; CONSP (tail); tail = XCDR (tail))
8823             {
8824               elt = XCDR (XCAR (tail));
8825               if (! char_encodable_p (c, XCAR (elt)))
8826                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8827             }
8828           if (charset_map_loaded)
8829             {
8830               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8831
8832               if (STRINGP (start))
8833                 pbeg = SDATA (start);
8834               else
8835                 pbeg = BYTE_POS_ADDR (start_byte);
8836               p = pbeg + p_offset;
8837               pend = pbeg + pend_offset;
8838             }
8839         }
8840       pos++;
8841     }
8842
8843   tail = list;
8844   list = Qnil;
8845   for (; CONSP (tail); tail = XCDR (tail))
8846     {
8847       elt = XCAR (tail);
8848       if (CONSP (XCDR (XCDR (elt))))
8849         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8850                       list);
8851     }
8852
8853   return list;
8854 }
8855
8856
8857 static Lisp_Object
8858 code_convert_region (Lisp_Object start, Lisp_Object end,
8859                      Lisp_Object coding_system, Lisp_Object dst_object,
8860                      bool encodep, bool norecord)
8861 {
8862   struct coding_system coding;
8863   ptrdiff_t from, from_byte, to, to_byte;
8864   Lisp_Object src_object;
8865
8866   CHECK_NUMBER_COERCE_MARKER (start);
8867   CHECK_NUMBER_COERCE_MARKER (end);
8868   if (NILP (coding_system))
8869     coding_system = Qno_conversion;
8870   else
8871     CHECK_CODING_SYSTEM (coding_system);
8872   src_object = Fcurrent_buffer ();
8873   if (NILP (dst_object))
8874     dst_object = src_object;
8875   else if (! EQ (dst_object, Qt))
8876     CHECK_BUFFER (dst_object);
8877
8878   validate_region (&start, &end);
8879   from = XFASTINT (start);
8880   from_byte = CHAR_TO_BYTE (from);
8881   to = XFASTINT (end);
8882   to_byte = CHAR_TO_BYTE (to);
8883
8884   setup_coding_system (coding_system, &coding);
8885   coding.mode |= CODING_MODE_LAST_BLOCK;
8886
8887   if (encodep)
8888     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8889                           dst_object);
8890   else
8891     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8892                           dst_object);
8893   if (! norecord)
8894     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8895
8896   return (BUFFERP (dst_object)
8897           ? make_number (coding.produced_char)
8898           : coding.dst_object);
8899 }
8900
8901
8902 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8903        3, 4, "r\nzCoding system: ",
8904        doc: /* Decode the current region from the specified coding system.
8905 When called from a program, takes four arguments:
8906         START, END, CODING-SYSTEM, and DESTINATION.
8907 START and END are buffer positions.
8908
8909 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8910 If nil, the region between START and END is replaced by the decoded text.
8911 If buffer, the decoded text is inserted in that buffer after point (point
8912 does not move).
8913 In those cases, the length of the decoded text is returned.
8914 If DESTINATION is t, the decoded text is returned.
8915
8916 This function sets `last-coding-system-used' to the precise coding system
8917 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8918 not fully specified.)  */)
8919   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8920 {
8921   return code_convert_region (start, end, coding_system, destination, 0, 0);
8922 }
8923
8924 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8925        3, 4, "r\nzCoding system: ",
8926        doc: /* Encode the current region by specified coding system.
8927 When called from a program, takes four arguments:
8928         START, END, CODING-SYSTEM and DESTINATION.
8929 START and END are buffer positions.
8930
8931 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8932 If nil, the region between START and END is replace by the encoded text.
8933 If buffer, the encoded text is inserted in that buffer after point (point
8934 does not move).
8935 In those cases, the length of the encoded text is returned.
8936 If DESTINATION is t, the encoded text is returned.
8937
8938 This function sets `last-coding-system-used' to the precise coding system
8939 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8940 not fully specified.)  */)
8941   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8942 {
8943   return code_convert_region (start, end, coding_system, destination, 1, 0);
8944 }
8945
8946 Lisp_Object
8947 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8948                      Lisp_Object dst_object, bool encodep, bool nocopy,
8949                      bool norecord)
8950 {
8951   struct coding_system coding;
8952   ptrdiff_t chars, bytes;
8953
8954   CHECK_STRING (string);
8955   if (NILP (coding_system))
8956     {
8957       if (! norecord)
8958         Vlast_coding_system_used = Qno_conversion;
8959       if (NILP (dst_object))
8960         return (nocopy ? Fcopy_sequence (string) : string);
8961     }
8962
8963   if (NILP (coding_system))
8964     coding_system = Qno_conversion;
8965   else
8966     CHECK_CODING_SYSTEM (coding_system);
8967   if (NILP (dst_object))
8968     dst_object = Qt;
8969   else if (! EQ (dst_object, Qt))
8970     CHECK_BUFFER (dst_object);
8971
8972   setup_coding_system (coding_system, &coding);
8973   coding.mode |= CODING_MODE_LAST_BLOCK;
8974   chars = SCHARS (string);
8975   bytes = SBYTES (string);
8976   if (encodep)
8977     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8978   else
8979     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8980   if (! norecord)
8981     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8982
8983   return (BUFFERP (dst_object)
8984           ? make_number (coding.produced_char)
8985           : coding.dst_object);
8986 }
8987
8988
8989 /* Encode or decode STRING according to CODING_SYSTEM.
8990    Do not set Vlast_coding_system_used.
8991
8992    This function is called only from macros DECODE_FILE and
8993    ENCODE_FILE, thus we ignore character composition.  */
8994
8995 Lisp_Object
8996 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8997                               bool encodep)
8998 {
8999   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9000 }
9001
9002
9003 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9004        2, 4, 0,
9005        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9006
9007 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9008 if the decoding operation is trivial.
9009
9010 Optional fourth arg BUFFER non-nil means that the decoded text is
9011 inserted in that buffer after point (point does not move).  In this
9012 case, the return value is the length of the decoded text.
9013
9014 This function sets `last-coding-system-used' to the precise coding system
9015 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9016 not fully specified.)  */)
9017   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9018 {
9019   return code_convert_string (string, coding_system, buffer,
9020                               0, ! NILP (nocopy), 0);
9021 }
9022
9023 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9024        2, 4, 0,
9025        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9026
9027 Optional third arg NOCOPY non-nil means it is OK to return STRING
9028 itself if the encoding operation is trivial.
9029
9030 Optional fourth arg BUFFER non-nil means that the encoded text is
9031 inserted in that buffer after point (point does not move).  In this
9032 case, the return value is the length of the encoded text.
9033
9034 This function sets `last-coding-system-used' to the precise coding system
9035 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9036 not fully specified.)  */)
9037   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9038 {
9039   return code_convert_string (string, coding_system, buffer,
9040                               1, ! NILP (nocopy), 0);
9041 }
9042
9043 \f
9044 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9045        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9046 Return the corresponding character.  */)
9047   (Lisp_Object code)
9048 {
9049   Lisp_Object spec, attrs, val;
9050   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9051   EMACS_INT ch;
9052   int c;
9053
9054   CHECK_NATNUM (code);
9055   ch = XFASTINT (code);
9056   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9057   attrs = AREF (spec, 0);
9058
9059   if (ASCII_BYTE_P (ch)
9060       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9061     return code;
9062
9063   val = CODING_ATTR_CHARSET_LIST (attrs);
9064   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9065   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9066   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9067
9068   if (ch <= 0x7F)
9069     {
9070       c = ch;
9071       charset = charset_roman;
9072     }
9073   else if (ch >= 0xA0 && ch < 0xDF)
9074     {
9075       c = ch - 0x80;
9076       charset = charset_kana;
9077     }
9078   else
9079     {
9080       EMACS_INT c1 = ch >> 8;
9081       int c2 = ch & 0xFF;
9082
9083       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9084           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9085         error ("Invalid code: %"pI"d", ch);
9086       c = ch;
9087       SJIS_TO_JIS (c);
9088       charset = charset_kanji;
9089     }
9090   c = DECODE_CHAR (charset, c);
9091   if (c < 0)
9092     error ("Invalid code: %"pI"d", ch);
9093   return make_number (c);
9094 }
9095
9096
9097 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9098        doc: /* Encode a Japanese character CH to shift_jis encoding.
9099 Return the corresponding code in SJIS.  */)
9100   (Lisp_Object ch)
9101 {
9102   Lisp_Object spec, attrs, charset_list;
9103   int c;
9104   struct charset *charset;
9105   unsigned code;
9106
9107   CHECK_CHARACTER (ch);
9108   c = XFASTINT (ch);
9109   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9110   attrs = AREF (spec, 0);
9111
9112   if (ASCII_CHAR_P (c)
9113       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9114     return ch;
9115
9116   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9117   charset = char_charset (c, charset_list, &code);
9118   if (code == CHARSET_INVALID_CODE (charset))
9119     error ("Can't encode by shift_jis encoding: %c", c);
9120   JIS_TO_SJIS (code);
9121
9122   return make_number (code);
9123 }
9124
9125 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9126        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9127 Return the corresponding character.  */)
9128   (Lisp_Object code)
9129 {
9130   Lisp_Object spec, attrs, val;
9131   struct charset *charset_roman, *charset_big5, *charset;
9132   EMACS_INT ch;
9133   int c;
9134
9135   CHECK_NATNUM (code);
9136   ch = XFASTINT (code);
9137   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9138   attrs = AREF (spec, 0);
9139
9140   if (ASCII_BYTE_P (ch)
9141       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9142     return code;
9143
9144   val = CODING_ATTR_CHARSET_LIST (attrs);
9145   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9146   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9147
9148   if (ch <= 0x7F)
9149     {
9150       c = ch;
9151       charset = charset_roman;
9152     }
9153   else
9154     {
9155       EMACS_INT b1 = ch >> 8;
9156       int b2 = ch & 0x7F;
9157       if (b1 < 0xA1 || b1 > 0xFE
9158           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9159         error ("Invalid code: %"pI"d", ch);
9160       c = ch;
9161       charset = charset_big5;
9162     }
9163   c = DECODE_CHAR (charset, c);
9164   if (c < 0)
9165     error ("Invalid code: %"pI"d", ch);
9166   return make_number (c);
9167 }
9168
9169 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9170        doc: /* Encode the Big5 character CH to BIG5 coding system.
9171 Return the corresponding character code in Big5.  */)
9172   (Lisp_Object ch)
9173 {
9174   Lisp_Object spec, attrs, charset_list;
9175   struct charset *charset;
9176   int c;
9177   unsigned code;
9178
9179   CHECK_CHARACTER (ch);
9180   c = XFASTINT (ch);
9181   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9182   attrs = AREF (spec, 0);
9183   if (ASCII_CHAR_P (c)
9184       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9185     return ch;
9186
9187   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9188   charset = char_charset (c, charset_list, &code);
9189   if (code == CHARSET_INVALID_CODE (charset))
9190     error ("Can't encode by Big5 encoding: %c", c);
9191
9192   return make_number (code);
9193 }
9194
9195 \f
9196 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9197        Sset_terminal_coding_system_internal, 1, 2, 0,
9198        doc: /* Internal use only.  */)
9199   (Lisp_Object coding_system, Lisp_Object terminal)
9200 {
9201   struct terminal *term = get_terminal (terminal, 1);
9202   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9203   CHECK_SYMBOL (coding_system);
9204   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9205   /* We had better not send unsafe characters to terminal.  */
9206   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9207   /* Character composition should be disabled.  */
9208   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9209   terminal_coding->src_multibyte = 1;
9210   terminal_coding->dst_multibyte = 0;
9211   tset_charset_list
9212     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9213             ? coding_charset_list (terminal_coding)
9214             : Fcons (make_number (charset_ascii), Qnil)));
9215   return Qnil;
9216 }
9217
9218 DEFUN ("set-safe-terminal-coding-system-internal",
9219        Fset_safe_terminal_coding_system_internal,
9220        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9221        doc: /* Internal use only.  */)
9222   (Lisp_Object coding_system)
9223 {
9224   CHECK_SYMBOL (coding_system);
9225   setup_coding_system (Fcheck_coding_system (coding_system),
9226                        &safe_terminal_coding);
9227   /* Character composition should be disabled.  */
9228   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9229   safe_terminal_coding.src_multibyte = 1;
9230   safe_terminal_coding.dst_multibyte = 0;
9231   return Qnil;
9232 }
9233
9234 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9235        Sterminal_coding_system, 0, 1, 0,
9236        doc: /* Return coding system specified for terminal output on the given terminal.
9237 TERMINAL may be a terminal object, a frame, or nil for the selected
9238 frame's terminal device.  */)
9239   (Lisp_Object terminal)
9240 {
9241   struct coding_system *terminal_coding
9242     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9243   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9244
9245   /* For backward compatibility, return nil if it is `undecided'.  */
9246   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9247 }
9248
9249 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9250        Sset_keyboard_coding_system_internal, 1, 2, 0,
9251        doc: /* Internal use only.  */)
9252   (Lisp_Object coding_system, Lisp_Object terminal)
9253 {
9254   struct terminal *t = get_terminal (terminal, 1);
9255   CHECK_SYMBOL (coding_system);
9256   if (NILP (coding_system))
9257     coding_system = Qno_conversion;
9258   else
9259     Fcheck_coding_system (coding_system);
9260   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9261   /* Character composition should be disabled.  */
9262   TERMINAL_KEYBOARD_CODING (t)->common_flags
9263     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9264   return Qnil;
9265 }
9266
9267 DEFUN ("keyboard-coding-system",
9268        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9269        doc: /* Return coding system specified for decoding keyboard input.  */)
9270   (Lisp_Object terminal)
9271 {
9272   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9273                          (get_terminal (terminal, 1))->id);
9274 }
9275
9276 \f
9277 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9278        Sfind_operation_coding_system,  1, MANY, 0,
9279        doc: /* Choose a coding system for an operation based on the target name.
9280 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9281 DECODING-SYSTEM is the coding system to use for decoding
9282 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9283 for encoding (in case OPERATION does encoding).
9284
9285 The first argument OPERATION specifies an I/O primitive:
9286   For file I/O, `insert-file-contents' or `write-region'.
9287   For process I/O, `call-process', `call-process-region', or `start-process'.
9288   For network I/O, `open-network-stream'.
9289
9290 The remaining arguments should be the same arguments that were passed
9291 to the primitive.  Depending on which primitive, one of those arguments
9292 is selected as the TARGET.  For example, if OPERATION does file I/O,
9293 whichever argument specifies the file name is TARGET.
9294
9295 TARGET has a meaning which depends on OPERATION:
9296   For file I/O, TARGET is a file name (except for the special case below).
9297   For process I/O, TARGET is a process name.
9298   For network I/O, TARGET is a service name or a port number.
9299
9300 This function looks up what is specified for TARGET in
9301 `file-coding-system-alist', `process-coding-system-alist',
9302 or `network-coding-system-alist' depending on OPERATION.
9303 They may specify a coding system, a cons of coding systems,
9304 or a function symbol to call.
9305 In the last case, we call the function with one argument,
9306 which is a list of all the arguments given to this function.
9307 If the function can't decide a coding system, it can return
9308 `undecided' so that the normal code-detection is performed.
9309
9310 If OPERATION is `insert-file-contents', the argument corresponding to
9311 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9312 file name to look up, and BUFFER is a buffer that contains the file's
9313 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9314 function to call for FILENAME, that function should examine the
9315 contents of BUFFER instead of reading the file.
9316
9317 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9318   (ptrdiff_t nargs, Lisp_Object *args)
9319 {
9320   Lisp_Object operation, target_idx, target, val;
9321   register Lisp_Object chain;
9322
9323   if (nargs < 2)
9324     error ("Too few arguments");
9325   operation = args[0];
9326   if (!SYMBOLP (operation)
9327       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9328     error ("Invalid first argument");
9329   if (nargs <= 1 + XFASTINT (target_idx))
9330     error ("Too few arguments for operation `%s'",
9331            SDATA (SYMBOL_NAME (operation)));
9332   target = args[XFASTINT (target_idx) + 1];
9333   if (!(STRINGP (target)
9334         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9335             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9336         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9337     error ("Invalid argument %"pI"d of operation `%s'",
9338            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9339   if (CONSP (target))
9340     target = XCAR (target);
9341
9342   chain = ((EQ (operation, Qinsert_file_contents)
9343             || EQ (operation, Qwrite_region))
9344            ? Vfile_coding_system_alist
9345            : (EQ (operation, Qopen_network_stream)
9346               ? Vnetwork_coding_system_alist
9347               : Vprocess_coding_system_alist));
9348   if (NILP (chain))
9349     return Qnil;
9350
9351   for (; CONSP (chain); chain = XCDR (chain))
9352     {
9353       Lisp_Object elt;
9354
9355       elt = XCAR (chain);
9356       if (CONSP (elt)
9357           && ((STRINGP (target)
9358                && STRINGP (XCAR (elt))
9359                && fast_string_match (XCAR (elt), target) >= 0)
9360               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9361         {
9362           val = XCDR (elt);
9363           /* Here, if VAL is both a valid coding system and a valid
9364              function symbol, we return VAL as a coding system.  */
9365           if (CONSP (val))
9366             return val;
9367           if (! SYMBOLP (val))
9368             return Qnil;
9369           if (! NILP (Fcoding_system_p (val)))
9370             return Fcons (val, val);
9371           if (! NILP (Ffboundp (val)))
9372             {
9373               /* We use call1 rather than safe_call1
9374                  so as to get bug reports about functions called here
9375                  which don't handle the current interface.  */
9376               val = call1 (val, Flist (nargs, args));
9377               if (CONSP (val))
9378                 return val;
9379               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9380                 return Fcons (val, val);
9381             }
9382           return Qnil;
9383         }
9384     }
9385   return Qnil;
9386 }
9387
9388 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9389        Sset_coding_system_priority, 0, MANY, 0,
9390        doc: /* Assign higher priority to the coding systems given as arguments.
9391 If multiple coding systems belong to the same category,
9392 all but the first one are ignored.
9393
9394 usage: (set-coding-system-priority &rest coding-systems)  */)
9395   (ptrdiff_t nargs, Lisp_Object *args)
9396 {
9397   ptrdiff_t i, j;
9398   bool changed[coding_category_max];
9399   enum coding_category priorities[coding_category_max];
9400
9401   memset (changed, 0, sizeof changed);
9402
9403   for (i = j = 0; i < nargs; i++)
9404     {
9405       enum coding_category category;
9406       Lisp_Object spec, attrs;
9407
9408       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9409       attrs = AREF (spec, 0);
9410       category = XINT (CODING_ATTR_CATEGORY (attrs));
9411       if (changed[category])
9412         /* Ignore this coding system because a coding system of the
9413            same category already had a higher priority.  */
9414         continue;
9415       changed[category] = 1;
9416       priorities[j++] = category;
9417       if (coding_categories[category].id >= 0
9418           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9419         setup_coding_system (args[i], &coding_categories[category]);
9420       Fset (AREF (Vcoding_category_table, category), args[i]);
9421     }
9422
9423   /* Now we have decided top J priorities.  Reflect the order of the
9424      original priorities to the remaining priorities.  */
9425
9426   for (i = j, j = 0; i < coding_category_max; i++, j++)
9427     {
9428       while (j < coding_category_max
9429              && changed[coding_priorities[j]])
9430         j++;
9431       if (j == coding_category_max)
9432         abort ();
9433       priorities[i] = coding_priorities[j];
9434     }
9435
9436   memcpy (coding_priorities, priorities, sizeof priorities);
9437
9438   /* Update `coding-category-list'.  */
9439   Vcoding_category_list = Qnil;
9440   for (i = coding_category_max; i-- > 0; )
9441     Vcoding_category_list
9442       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9443                Vcoding_category_list);
9444
9445   return Qnil;
9446 }
9447
9448 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9449        Scoding_system_priority_list, 0, 1, 0,
9450        doc: /* Return a list of coding systems ordered by their priorities.
9451 The list contains a subset of coding systems; i.e. coding systems
9452 assigned to each coding category (see `coding-category-list').
9453
9454 HIGHESTP non-nil means just return the highest priority one.  */)
9455   (Lisp_Object highestp)
9456 {
9457   int i;
9458   Lisp_Object val;
9459
9460   for (i = 0, val = Qnil; i < coding_category_max; i++)
9461     {
9462       enum coding_category category = coding_priorities[i];
9463       int id = coding_categories[category].id;
9464       Lisp_Object attrs;
9465
9466       if (id < 0)
9467         continue;
9468       attrs = CODING_ID_ATTRS (id);
9469       if (! NILP (highestp))
9470         return CODING_ATTR_BASE_NAME (attrs);
9471       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9472     }
9473   return Fnreverse (val);
9474 }
9475
9476 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9477
9478 static Lisp_Object
9479 make_subsidiaries (Lisp_Object base)
9480 {
9481   Lisp_Object subsidiaries;
9482   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9483   char *buf = alloca (base_name_len + 6);
9484   int i;
9485
9486   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9487   subsidiaries = Fmake_vector (make_number (3), Qnil);
9488   for (i = 0; i < 3; i++)
9489     {
9490       strcpy (buf + base_name_len, suffixes[i]);
9491       ASET (subsidiaries, i, intern (buf));
9492     }
9493   return subsidiaries;
9494 }
9495
9496
9497 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9498        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9499        doc: /* For internal use only.
9500 usage: (define-coding-system-internal ...)  */)
9501   (ptrdiff_t nargs, Lisp_Object *args)
9502 {
9503   Lisp_Object name;
9504   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9505   Lisp_Object attrs;            /* Vector of attributes.  */
9506   Lisp_Object eol_type;
9507   Lisp_Object aliases;
9508   Lisp_Object coding_type, charset_list, safe_charsets;
9509   enum coding_category category;
9510   Lisp_Object tail, val;
9511   int max_charset_id = 0;
9512   int i;
9513
9514   if (nargs < coding_arg_max)
9515     goto short_args;
9516
9517   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9518
9519   name = args[coding_arg_name];
9520   CHECK_SYMBOL (name);
9521   ASET (attrs, coding_attr_base_name, name);
9522
9523   val = args[coding_arg_mnemonic];
9524   if (! STRINGP (val))
9525     CHECK_CHARACTER (val);
9526   ASET (attrs, coding_attr_mnemonic, val);
9527
9528   coding_type = args[coding_arg_coding_type];
9529   CHECK_SYMBOL (coding_type);
9530   ASET (attrs, coding_attr_type, coding_type);
9531
9532   charset_list = args[coding_arg_charset_list];
9533   if (SYMBOLP (charset_list))
9534     {
9535       if (EQ (charset_list, Qiso_2022))
9536         {
9537           if (! EQ (coding_type, Qiso_2022))
9538             error ("Invalid charset-list");
9539           charset_list = Viso_2022_charset_list;
9540         }
9541       else if (EQ (charset_list, Qemacs_mule))
9542         {
9543           if (! EQ (coding_type, Qemacs_mule))
9544             error ("Invalid charset-list");
9545           charset_list = Vemacs_mule_charset_list;
9546         }
9547       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9548         {
9549           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9550             error ("Invalid charset-list");
9551           if (max_charset_id < XFASTINT (XCAR (tail)))
9552             max_charset_id = XFASTINT (XCAR (tail));
9553         }
9554     }
9555   else
9556     {
9557       charset_list = Fcopy_sequence (charset_list);
9558       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9559         {
9560           struct charset *charset;
9561
9562           val = XCAR (tail);
9563           CHECK_CHARSET_GET_CHARSET (val, charset);
9564           if (EQ (coding_type, Qiso_2022)
9565               ? CHARSET_ISO_FINAL (charset) < 0
9566               : EQ (coding_type, Qemacs_mule)
9567               ? CHARSET_EMACS_MULE_ID (charset) < 0
9568               : 0)
9569             error ("Can't handle charset `%s'",
9570                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9571
9572           XSETCAR (tail, make_number (charset->id));
9573           if (max_charset_id < charset->id)
9574             max_charset_id = charset->id;
9575         }
9576     }
9577   ASET (attrs, coding_attr_charset_list, charset_list);
9578
9579   safe_charsets = make_uninit_string (max_charset_id + 1);
9580   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9581   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9582     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9583   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9584
9585   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9586
9587   val = args[coding_arg_decode_translation_table];
9588   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9589     CHECK_SYMBOL (val);
9590   ASET (attrs, coding_attr_decode_tbl, val);
9591
9592   val = args[coding_arg_encode_translation_table];
9593   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9594     CHECK_SYMBOL (val);
9595   ASET (attrs, coding_attr_encode_tbl, val);
9596
9597   val = args[coding_arg_post_read_conversion];
9598   CHECK_SYMBOL (val);
9599   ASET (attrs, coding_attr_post_read, val);
9600
9601   val = args[coding_arg_pre_write_conversion];
9602   CHECK_SYMBOL (val);
9603   ASET (attrs, coding_attr_pre_write, val);
9604
9605   val = args[coding_arg_default_char];
9606   if (NILP (val))
9607     ASET (attrs, coding_attr_default_char, make_number (' '));
9608   else
9609     {
9610       CHECK_CHARACTER (val);
9611       ASET (attrs, coding_attr_default_char, val);
9612     }
9613
9614   val = args[coding_arg_for_unibyte];
9615   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9616
9617   val = args[coding_arg_plist];
9618   CHECK_LIST (val);
9619   ASET (attrs, coding_attr_plist, val);
9620
9621   if (EQ (coding_type, Qcharset))
9622     {
9623       /* Generate a lisp vector of 256 elements.  Each element is nil,
9624          integer, or a list of charset IDs.
9625
9626          If Nth element is nil, the byte code N is invalid in this
9627          coding system.
9628
9629          If Nth element is a number NUM, N is the first byte of a
9630          charset whose ID is NUM.
9631
9632          If Nth element is a list of charset IDs, N is the first byte
9633          of one of them.  The list is sorted by dimensions of the
9634          charsets.  A charset of smaller dimension comes first. */
9635       val = Fmake_vector (make_number (256), Qnil);
9636
9637       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9638         {
9639           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9640           int dim = CHARSET_DIMENSION (charset);
9641           int idx = (dim - 1) * 4;
9642
9643           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9644             ASET (attrs, coding_attr_ascii_compat, Qt);
9645
9646           for (i = charset->code_space[idx];
9647                i <= charset->code_space[idx + 1]; i++)
9648             {
9649               Lisp_Object tmp, tmp2;
9650               int dim2;
9651
9652               tmp = AREF (val, i);
9653               if (NILP (tmp))
9654                 tmp = XCAR (tail);
9655               else if (NUMBERP (tmp))
9656                 {
9657                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9658                   if (dim < dim2)
9659                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9660                   else
9661                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9662                 }
9663               else
9664                 {
9665                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9666                     {
9667                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9668                       if (dim < dim2)
9669                         break;
9670                     }
9671                   if (NILP (tmp2))
9672                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9673                   else
9674                     {
9675                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9676                       XSETCAR (tmp2, XCAR (tail));
9677                     }
9678                 }
9679               ASET (val, i, tmp);
9680             }
9681         }
9682       ASET (attrs, coding_attr_charset_valids, val);
9683       category = coding_category_charset;
9684     }
9685   else if (EQ (coding_type, Qccl))
9686     {
9687       Lisp_Object valids;
9688
9689       if (nargs < coding_arg_ccl_max)
9690         goto short_args;
9691
9692       val = args[coding_arg_ccl_decoder];
9693       CHECK_CCL_PROGRAM (val);
9694       if (VECTORP (val))
9695         val = Fcopy_sequence (val);
9696       ASET (attrs, coding_attr_ccl_decoder, val);
9697
9698       val = args[coding_arg_ccl_encoder];
9699       CHECK_CCL_PROGRAM (val);
9700       if (VECTORP (val))
9701         val = Fcopy_sequence (val);
9702       ASET (attrs, coding_attr_ccl_encoder, val);
9703
9704       val = args[coding_arg_ccl_valids];
9705       valids = Fmake_string (make_number (256), make_number (0));
9706       for (tail = val; CONSP (tail); tail = XCDR (tail))
9707         {
9708           int from, to;
9709
9710           val = XCAR (tail);
9711           if (INTEGERP (val))
9712             {
9713               if (! (0 <= XINT (val) && XINT (val) <= 255))
9714                 args_out_of_range_3 (val, make_number (0), make_number (255));
9715               from = to = XINT (val);
9716             }
9717           else
9718             {
9719               CHECK_CONS (val);
9720               CHECK_NATNUM_CAR (val);
9721               CHECK_NUMBER_CDR (val);
9722               if (XINT (XCAR (val)) > 255)
9723                 args_out_of_range_3 (XCAR (val),
9724                                      make_number (0), make_number (255));
9725               from = XINT (XCAR (val));
9726               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9727                 args_out_of_range_3 (XCDR (val),
9728                                      XCAR (val), make_number (255));
9729               to = XINT (XCDR (val));
9730             }
9731           for (i = from; i <= to; i++)
9732             SSET (valids, i, 1);
9733         }
9734       ASET (attrs, coding_attr_ccl_valids, valids);
9735
9736       category = coding_category_ccl;
9737     }
9738   else if (EQ (coding_type, Qutf_16))
9739     {
9740       Lisp_Object bom, endian;
9741
9742       ASET (attrs, coding_attr_ascii_compat, Qnil);
9743
9744       if (nargs < coding_arg_utf16_max)
9745         goto short_args;
9746
9747       bom = args[coding_arg_utf16_bom];
9748       if (! NILP (bom) && ! EQ (bom, Qt))
9749         {
9750           CHECK_CONS (bom);
9751           val = XCAR (bom);
9752           CHECK_CODING_SYSTEM (val);
9753           val = XCDR (bom);
9754           CHECK_CODING_SYSTEM (val);
9755         }
9756       ASET (attrs, coding_attr_utf_bom, bom);
9757
9758       endian = args[coding_arg_utf16_endian];
9759       CHECK_SYMBOL (endian);
9760       if (NILP (endian))
9761         endian = Qbig;
9762       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9763         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9764       ASET (attrs, coding_attr_utf_16_endian, endian);
9765
9766       category = (CONSP (bom)
9767                   ? coding_category_utf_16_auto
9768                   : NILP (bom)
9769                   ? (EQ (endian, Qbig)
9770                      ? coding_category_utf_16_be_nosig
9771                      : coding_category_utf_16_le_nosig)
9772                   : (EQ (endian, Qbig)
9773                      ? coding_category_utf_16_be
9774                      : coding_category_utf_16_le));
9775     }
9776   else if (EQ (coding_type, Qiso_2022))
9777     {
9778       Lisp_Object initial, reg_usage, request, flags;
9779
9780       if (nargs < coding_arg_iso2022_max)
9781         goto short_args;
9782
9783       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9784       CHECK_VECTOR (initial);
9785       for (i = 0; i < 4; i++)
9786         {
9787           val = Faref (initial, make_number (i));
9788           if (! NILP (val))
9789             {
9790               struct charset *charset;
9791
9792               CHECK_CHARSET_GET_CHARSET (val, charset);
9793               ASET (initial, i, make_number (CHARSET_ID (charset)));
9794               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9795                 ASET (attrs, coding_attr_ascii_compat, Qt);
9796             }
9797           else
9798             ASET (initial, i, make_number (-1));
9799         }
9800
9801       reg_usage = args[coding_arg_iso2022_reg_usage];
9802       CHECK_CONS (reg_usage);
9803       CHECK_NUMBER_CAR (reg_usage);
9804       CHECK_NUMBER_CDR (reg_usage);
9805
9806       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9807       for (tail = request; CONSP (tail); tail = XCDR (tail))
9808         {
9809           int id;
9810           Lisp_Object tmp1;
9811
9812           val = XCAR (tail);
9813           CHECK_CONS (val);
9814           tmp1 = XCAR (val);
9815           CHECK_CHARSET_GET_ID (tmp1, id);
9816           CHECK_NATNUM_CDR (val);
9817           if (XINT (XCDR (val)) >= 4)
9818             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9819           XSETCAR (val, make_number (id));
9820         }
9821
9822       flags = args[coding_arg_iso2022_flags];
9823       CHECK_NATNUM (flags);
9824       i = XINT (flags) & INT_MAX;
9825       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9826         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9827       flags = make_number (i);
9828
9829       ASET (attrs, coding_attr_iso_initial, initial);
9830       ASET (attrs, coding_attr_iso_usage, reg_usage);
9831       ASET (attrs, coding_attr_iso_request, request);
9832       ASET (attrs, coding_attr_iso_flags, flags);
9833       setup_iso_safe_charsets (attrs);
9834
9835       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9836         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9837                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9838                     ? coding_category_iso_7_else
9839                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9840                     ? coding_category_iso_7
9841                     : coding_category_iso_7_tight);
9842       else
9843         {
9844           int id = XINT (AREF (initial, 1));
9845
9846           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9847                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9848                        || id < 0)
9849                       ? coding_category_iso_8_else
9850                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9851                       ? coding_category_iso_8_1
9852                       : coding_category_iso_8_2);
9853         }
9854       if (category != coding_category_iso_8_1
9855           && category != coding_category_iso_8_2)
9856         ASET (attrs, coding_attr_ascii_compat, Qnil);
9857     }
9858   else if (EQ (coding_type, Qemacs_mule))
9859     {
9860       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9861         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9862       ASET (attrs, coding_attr_ascii_compat, Qt);
9863       category = coding_category_emacs_mule;
9864     }
9865   else if (EQ (coding_type, Qshift_jis))
9866     {
9867
9868       struct charset *charset;
9869
9870       if (XINT (Flength (charset_list)) != 3
9871           && XINT (Flength (charset_list)) != 4)
9872         error ("There should be three or four charsets");
9873
9874       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9875       if (CHARSET_DIMENSION (charset) != 1)
9876         error ("Dimension of charset %s is not one",
9877                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9878       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9879         ASET (attrs, coding_attr_ascii_compat, Qt);
9880
9881       charset_list = XCDR (charset_list);
9882       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9883       if (CHARSET_DIMENSION (charset) != 1)
9884         error ("Dimension of charset %s is not one",
9885                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9886
9887       charset_list = XCDR (charset_list);
9888       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9889       if (CHARSET_DIMENSION (charset) != 2)
9890         error ("Dimension of charset %s is not two",
9891                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9892
9893       charset_list = XCDR (charset_list);
9894       if (! NILP (charset_list))
9895         {
9896           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9897           if (CHARSET_DIMENSION (charset) != 2)
9898             error ("Dimension of charset %s is not two",
9899                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9900         }
9901
9902       category = coding_category_sjis;
9903       Vsjis_coding_system = name;
9904     }
9905   else if (EQ (coding_type, Qbig5))
9906     {
9907       struct charset *charset;
9908
9909       if (XINT (Flength (charset_list)) != 2)
9910         error ("There should be just two charsets");
9911
9912       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9913       if (CHARSET_DIMENSION (charset) != 1)
9914         error ("Dimension of charset %s is not one",
9915                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9916       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9917         ASET (attrs, coding_attr_ascii_compat, Qt);
9918
9919       charset_list = XCDR (charset_list);
9920       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9921       if (CHARSET_DIMENSION (charset) != 2)
9922         error ("Dimension of charset %s is not two",
9923                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9924
9925       category = coding_category_big5;
9926       Vbig5_coding_system = name;
9927     }
9928   else if (EQ (coding_type, Qraw_text))
9929     {
9930       category = coding_category_raw_text;
9931       ASET (attrs, coding_attr_ascii_compat, Qt);
9932     }
9933   else if (EQ (coding_type, Qutf_8))
9934     {
9935       Lisp_Object bom;
9936
9937       if (nargs < coding_arg_utf8_max)
9938         goto short_args;
9939
9940       bom = args[coding_arg_utf8_bom];
9941       if (! NILP (bom) && ! EQ (bom, Qt))
9942         {
9943           CHECK_CONS (bom);
9944           val = XCAR (bom);
9945           CHECK_CODING_SYSTEM (val);
9946           val = XCDR (bom);
9947           CHECK_CODING_SYSTEM (val);
9948         }
9949       ASET (attrs, coding_attr_utf_bom, bom);
9950       if (NILP (bom))
9951         ASET (attrs, coding_attr_ascii_compat, Qt);
9952
9953       category = (CONSP (bom) ? coding_category_utf_8_auto
9954                   : NILP (bom) ? coding_category_utf_8_nosig
9955                   : coding_category_utf_8_sig);
9956     }
9957   else if (EQ (coding_type, Qundecided))
9958     category = coding_category_undecided;
9959   else
9960     error ("Invalid coding system type: %s",
9961            SDATA (SYMBOL_NAME (coding_type)));
9962
9963   ASET (attrs, coding_attr_category, make_number (category));
9964   ASET (attrs, coding_attr_plist,
9965         Fcons (QCcategory,
9966                Fcons (AREF (Vcoding_category_table, category),
9967                       CODING_ATTR_PLIST (attrs))));
9968   ASET (attrs, coding_attr_plist,
9969         Fcons (QCascii_compatible_p,
9970                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9971                       CODING_ATTR_PLIST (attrs))));
9972
9973   eol_type = args[coding_arg_eol_type];
9974   if (! NILP (eol_type)
9975       && ! EQ (eol_type, Qunix)
9976       && ! EQ (eol_type, Qdos)
9977       && ! EQ (eol_type, Qmac))
9978     error ("Invalid eol-type");
9979
9980   aliases = Fcons (name, Qnil);
9981
9982   if (NILP (eol_type))
9983     {
9984       eol_type = make_subsidiaries (name);
9985       for (i = 0; i < 3; i++)
9986         {
9987           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9988
9989           this_name = AREF (eol_type, i);
9990           this_aliases = Fcons (this_name, Qnil);
9991           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9992           this_spec = Fmake_vector (make_number (3), attrs);
9993           ASET (this_spec, 1, this_aliases);
9994           ASET (this_spec, 2, this_eol_type);
9995           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9996           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9997           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9998           if (NILP (val))
9999             Vcoding_system_alist
10000               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10001                        Vcoding_system_alist);
10002         }
10003     }
10004
10005   spec_vec = Fmake_vector (make_number (3), attrs);
10006   ASET (spec_vec, 1, aliases);
10007   ASET (spec_vec, 2, eol_type);
10008
10009   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10010   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10011   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10012   if (NILP (val))
10013     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10014                                   Vcoding_system_alist);
10015
10016   {
10017     int id = coding_categories[category].id;
10018
10019     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10020       setup_coding_system (name, &coding_categories[category]);
10021   }
10022
10023   return Qnil;
10024
10025  short_args:
10026   return Fsignal (Qwrong_number_of_arguments,
10027                   Fcons (intern ("define-coding-system-internal"),
10028                          make_number (nargs)));
10029 }
10030
10031
10032 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10033        3, 3, 0,
10034        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10035   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10036 {
10037   Lisp_Object spec, attrs;
10038
10039   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10040   attrs = AREF (spec, 0);
10041   if (EQ (prop, QCmnemonic))
10042     {
10043       if (! STRINGP (val))
10044         CHECK_CHARACTER (val);
10045       ASET (attrs, coding_attr_mnemonic, val);
10046     }
10047   else if (EQ (prop, QCdefault_char))
10048     {
10049       if (NILP (val))
10050         val = make_number (' ');
10051       else
10052         CHECK_CHARACTER (val);
10053       ASET (attrs, coding_attr_default_char, val);
10054     }
10055   else if (EQ (prop, QCdecode_translation_table))
10056     {
10057       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10058         CHECK_SYMBOL (val);
10059       ASET (attrs, coding_attr_decode_tbl, val);
10060     }
10061   else if (EQ (prop, QCencode_translation_table))
10062     {
10063       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10064         CHECK_SYMBOL (val);
10065       ASET (attrs, coding_attr_encode_tbl, val);
10066     }
10067   else if (EQ (prop, QCpost_read_conversion))
10068     {
10069       CHECK_SYMBOL (val);
10070       ASET (attrs, coding_attr_post_read, val);
10071     }
10072   else if (EQ (prop, QCpre_write_conversion))
10073     {
10074       CHECK_SYMBOL (val);
10075       ASET (attrs, coding_attr_pre_write, val);
10076     }
10077   else if (EQ (prop, QCascii_compatible_p))
10078     {
10079       ASET (attrs, coding_attr_ascii_compat, val);
10080     }
10081
10082   ASET (attrs, coding_attr_plist,
10083         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10084   return val;
10085 }
10086
10087
10088 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10089        Sdefine_coding_system_alias, 2, 2, 0,
10090        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10091   (Lisp_Object alias, Lisp_Object coding_system)
10092 {
10093   Lisp_Object spec, aliases, eol_type, val;
10094
10095   CHECK_SYMBOL (alias);
10096   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10097   aliases = AREF (spec, 1);
10098   /* ALIASES should be a list of length more than zero, and the first
10099      element is a base coding system.  Append ALIAS at the tail of the
10100      list.  */
10101   while (!NILP (XCDR (aliases)))
10102     aliases = XCDR (aliases);
10103   XSETCDR (aliases, Fcons (alias, Qnil));
10104
10105   eol_type = AREF (spec, 2);
10106   if (VECTORP (eol_type))
10107     {
10108       Lisp_Object subsidiaries;
10109       int i;
10110
10111       subsidiaries = make_subsidiaries (alias);
10112       for (i = 0; i < 3; i++)
10113         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10114                                      AREF (eol_type, i));
10115     }
10116
10117   Fputhash (alias, spec, Vcoding_system_hash_table);
10118   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10119   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10120   if (NILP (val))
10121     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10122                                   Vcoding_system_alist);
10123
10124   return Qnil;
10125 }
10126
10127 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10128        1, 1, 0,
10129        doc: /* Return the base of CODING-SYSTEM.
10130 Any alias or subsidiary coding system is not a base coding system.  */)
10131   (Lisp_Object coding_system)
10132 {
10133   Lisp_Object spec, attrs;
10134
10135   if (NILP (coding_system))
10136     return (Qno_conversion);
10137   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10138   attrs = AREF (spec, 0);
10139   return CODING_ATTR_BASE_NAME (attrs);
10140 }
10141
10142 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10143        1, 1, 0,
10144        doc: "Return the property list of CODING-SYSTEM.")
10145   (Lisp_Object coding_system)
10146 {
10147   Lisp_Object spec, attrs;
10148
10149   if (NILP (coding_system))
10150     coding_system = Qno_conversion;
10151   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10152   attrs = AREF (spec, 0);
10153   return CODING_ATTR_PLIST (attrs);
10154 }
10155
10156
10157 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10158        1, 1, 0,
10159        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10160   (Lisp_Object coding_system)
10161 {
10162   Lisp_Object spec;
10163
10164   if (NILP (coding_system))
10165     coding_system = Qno_conversion;
10166   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10167   return AREF (spec, 1);
10168 }
10169
10170 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10171        Scoding_system_eol_type, 1, 1, 0,
10172        doc: /* Return eol-type of CODING-SYSTEM.
10173 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10174
10175 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10176 and CR respectively.
10177
10178 A vector value indicates that a format of end-of-line should be
10179 detected automatically.  Nth element of the vector is the subsidiary
10180 coding system whose eol-type is N.  */)
10181   (Lisp_Object coding_system)
10182 {
10183   Lisp_Object spec, eol_type;
10184   int n;
10185
10186   if (NILP (coding_system))
10187     coding_system = Qno_conversion;
10188   if (! CODING_SYSTEM_P (coding_system))
10189     return Qnil;
10190   spec = CODING_SYSTEM_SPEC (coding_system);
10191   eol_type = AREF (spec, 2);
10192   if (VECTORP (eol_type))
10193     return Fcopy_sequence (eol_type);
10194   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10195   return make_number (n);
10196 }
10197
10198 #endif /* emacs */
10199
10200 \f
/*** 12. Postamble ***/
10202
10203 void
10204 init_coding_once (void)
10205 {
10206   int i;
10207
10208   for (i = 0; i < coding_category_max; i++)
10209     {
10210       coding_categories[i].id = -1;
10211       coding_priorities[i] = i;
10212     }
10213
10214   /* ISO2022 specific initialize routine.  */
10215   for (i = 0; i < 0x20; i++)
10216     iso_code_class[i] = ISO_control_0;
10217   for (i = 0x21; i < 0x7F; i++)
10218     iso_code_class[i] = ISO_graphic_plane_0;
10219   for (i = 0x80; i < 0xA0; i++)
10220     iso_code_class[i] = ISO_control_1;
10221   for (i = 0xA1; i < 0xFF; i++)
10222     iso_code_class[i] = ISO_graphic_plane_1;
10223   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10224   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10225   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10226   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10227   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10228   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10229   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10230   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10231   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10232
10233   for (i = 0; i < 256; i++)
10234     {
10235       emacs_mule_bytes[i] = 1;
10236     }
10237   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10238   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10239   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10240   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10241 }
10242
10243 #ifdef emacs
10244
10245 void
10246 syms_of_coding (void)
10247 {
10248   staticpro (&Vcoding_system_hash_table);
10249   {
10250     Lisp_Object args[2];
10251     args[0] = QCtest;
10252     args[1] = Qeq;
10253     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10254   }
10255
10256   staticpro (&Vsjis_coding_system);
10257   Vsjis_coding_system = Qnil;
10258
10259   staticpro (&Vbig5_coding_system);
10260   Vbig5_coding_system = Qnil;
10261
10262   staticpro (&Vcode_conversion_reused_workbuf);
10263   Vcode_conversion_reused_workbuf = Qnil;
10264
10265   staticpro (&Vcode_conversion_workbuf_name);
10266   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10267
10268   reused_workbuf_in_use = 0;
10269
10270   DEFSYM (Qcharset, "charset");
10271   DEFSYM (Qtarget_idx, "target-idx");
10272   DEFSYM (Qcoding_system_history, "coding-system-history");
10273   Fset (Qcoding_system_history, Qnil);
10274
10275   /* Target FILENAME is the first argument.  */
10276   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10277   /* Target FILENAME is the third argument.  */
10278   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10279
10280   DEFSYM (Qcall_process, "call-process");
10281   /* Target PROGRAM is the first argument.  */
10282   Fput (Qcall_process, Qtarget_idx, make_number (0));
10283
10284   DEFSYM (Qcall_process_region, "call-process-region");
10285   /* Target PROGRAM is the third argument.  */
10286   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10287
10288   DEFSYM (Qstart_process, "start-process");
10289   /* Target PROGRAM is the third argument.  */
10290   Fput (Qstart_process, Qtarget_idx, make_number (2));
10291
10292   DEFSYM (Qopen_network_stream, "open-network-stream");
10293   /* Target SERVICE is the fourth argument.  */
10294   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10295
10296   DEFSYM (Qcoding_system, "coding-system");
10297   DEFSYM (Qcoding_aliases, "coding-aliases");
10298
10299   DEFSYM (Qeol_type, "eol-type");
10300   DEFSYM (Qunix, "unix");
10301   DEFSYM (Qdos, "dos");
10302
10303   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10304   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10305   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10306   DEFSYM (Qdefault_char, "default-char");
10307   DEFSYM (Qundecided, "undecided");
10308   DEFSYM (Qno_conversion, "no-conversion");
10309   DEFSYM (Qraw_text, "raw-text");
10310
10311   DEFSYM (Qiso_2022, "iso-2022");
10312
10313   DEFSYM (Qutf_8, "utf-8");
10314   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10315
10316   DEFSYM (Qutf_16, "utf-16");
10317   DEFSYM (Qbig, "big");
10318   DEFSYM (Qlittle, "little");
10319
10320   DEFSYM (Qshift_jis, "shift-jis");
10321   DEFSYM (Qbig5, "big5");
10322
10323   DEFSYM (Qcoding_system_p, "coding-system-p");
10324
10325   DEFSYM (Qcoding_system_error, "coding-system-error");
10326   Fput (Qcoding_system_error, Qerror_conditions,
10327         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10328   Fput (Qcoding_system_error, Qerror_message,
10329         build_pure_c_string ("Invalid coding system"));
10330
10331   /* Intern this now in case it isn't already done.
10332      Setting this variable twice is harmless.
10333      But don't staticpro it here--that is done in alloc.c.  */
10334   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10335
10336   DEFSYM (Qtranslation_table, "translation-table");
10337   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10338   DEFSYM (Qtranslation_table_id, "translation-table-id");
10339   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10340   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10341
10342   DEFSYM (Qvalid_codes, "valid-codes");
10343
10344   DEFSYM (Qemacs_mule, "emacs-mule");
10345
10346   DEFSYM (QCcategory, ":category");
10347   DEFSYM (QCmnemonic, ":mnemonic");
10348   DEFSYM (QCdefault_char, ":default-char");
10349   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10350   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10351   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10352   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10353   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10354
10355   Vcoding_category_table
10356     = Fmake_vector (make_number (coding_category_max), Qnil);
10357   staticpro (&Vcoding_category_table);
10358   /* Followings are target of code detection.  */
10359   ASET (Vcoding_category_table, coding_category_iso_7,
10360         intern_c_string ("coding-category-iso-7"));
10361   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10362         intern_c_string ("coding-category-iso-7-tight"));
10363   ASET (Vcoding_category_table, coding_category_iso_8_1,
10364         intern_c_string ("coding-category-iso-8-1"));
10365   ASET (Vcoding_category_table, coding_category_iso_8_2,
10366         intern_c_string ("coding-category-iso-8-2"));
10367   ASET (Vcoding_category_table, coding_category_iso_7_else,
10368         intern_c_string ("coding-category-iso-7-else"));
10369   ASET (Vcoding_category_table, coding_category_iso_8_else,
10370         intern_c_string ("coding-category-iso-8-else"));
10371   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10372         intern_c_string ("coding-category-utf-8-auto"));
10373   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10374         intern_c_string ("coding-category-utf-8"));
10375   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10376         intern_c_string ("coding-category-utf-8-sig"));
10377   ASET (Vcoding_category_table, coding_category_utf_16_be,
10378         intern_c_string ("coding-category-utf-16-be"));
10379   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10380         intern_c_string ("coding-category-utf-16-auto"));
10381   ASET (Vcoding_category_table, coding_category_utf_16_le,
10382         intern_c_string ("coding-category-utf-16-le"));
10383   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10384         intern_c_string ("coding-category-utf-16-be-nosig"));
10385   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10386         intern_c_string ("coding-category-utf-16-le-nosig"));
10387   ASET (Vcoding_category_table, coding_category_charset,
10388         intern_c_string ("coding-category-charset"));
10389   ASET (Vcoding_category_table, coding_category_sjis,
10390         intern_c_string ("coding-category-sjis"));
10391   ASET (Vcoding_category_table, coding_category_big5,
10392         intern_c_string ("coding-category-big5"));
10393   ASET (Vcoding_category_table, coding_category_ccl,
10394         intern_c_string ("coding-category-ccl"));
10395   ASET (Vcoding_category_table, coding_category_emacs_mule,
10396         intern_c_string ("coding-category-emacs-mule"));
10397   /* Followings are NOT target of code detection.  */
10398   ASET (Vcoding_category_table, coding_category_raw_text,
10399         intern_c_string ("coding-category-raw-text"));
10400   ASET (Vcoding_category_table, coding_category_undecided,
10401         intern_c_string ("coding-category-undecided"));
10402
10403   DEFSYM (Qinsufficient_source, "insufficient-source");
10404   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10405   DEFSYM (Qinvalid_source, "invalid-source");
10406   DEFSYM (Qinterrupted, "interrupted");
10407   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10408   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10409
10410   defsubr (&Scoding_system_p);
10411   defsubr (&Sread_coding_system);
10412   defsubr (&Sread_non_nil_coding_system);
10413   defsubr (&Scheck_coding_system);
10414   defsubr (&Sdetect_coding_region);
10415   defsubr (&Sdetect_coding_string);
10416   defsubr (&Sfind_coding_systems_region_internal);
10417   defsubr (&Sunencodable_char_position);
10418   defsubr (&Scheck_coding_systems_region);
10419   defsubr (&Sdecode_coding_region);
10420   defsubr (&Sencode_coding_region);
10421   defsubr (&Sdecode_coding_string);
10422   defsubr (&Sencode_coding_string);
10423   defsubr (&Sdecode_sjis_char);
10424   defsubr (&Sencode_sjis_char);
10425   defsubr (&Sdecode_big5_char);
10426   defsubr (&Sencode_big5_char);
10427   defsubr (&Sset_terminal_coding_system_internal);
10428   defsubr (&Sset_safe_terminal_coding_system_internal);
10429   defsubr (&Sterminal_coding_system);
10430   defsubr (&Sset_keyboard_coding_system_internal);
10431   defsubr (&Skeyboard_coding_system);
10432   defsubr (&Sfind_operation_coding_system);
10433   defsubr (&Sset_coding_system_priority);
10434   defsubr (&Sdefine_coding_system_internal);
10435   defsubr (&Sdefine_coding_system_alias);
10436   defsubr (&Scoding_system_put);
10437   defsubr (&Scoding_system_base);
10438   defsubr (&Scoding_system_plist);
10439   defsubr (&Scoding_system_aliases);
10440   defsubr (&Scoding_system_eol_type);
10441   defsubr (&Scoding_system_priority_list);
10442
10443   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10444                doc: /* List of coding systems.
10445
10446 Do not alter the value of this variable manually.  This variable should be
10447 updated by the functions `define-coding-system' and
10448 `define-coding-system-alias'.  */);
10449   Vcoding_system_list = Qnil;
10450
10451   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10452                doc: /* Alist of coding system names.
10453 Each element is a one-element list containing a coding system name.
10454 This variable is given to `completing-read' as COLLECTION argument.
10455
10456 Do not alter the value of this variable manually.  This variable should be
10457 updated by the functions `define-coding-system' and
10458 `define-coding-system-alias'.  */);
10459   Vcoding_system_alist = Qnil;
10460
10461   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10462                doc: /* List of coding-categories (symbols) ordered by priority.
10463
10464 On detecting a coding system, Emacs tries code detection algorithms
10465 associated with each coding-category one by one in this order.  When
10466 one algorithm agrees with a byte sequence of source text, the coding
10467 system bound to the corresponding coding-category is selected.
10468
10469 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10470   {
10471     int i;
10472
10473     Vcoding_category_list = Qnil;
10474     for (i = coding_category_max - 1; i >= 0; i--)
10475       Vcoding_category_list
10476         = Fcons (AREF (Vcoding_category_table, i),
10477                  Vcoding_category_list);
10478   }
10479
10480   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10481                doc: /* Specify the coding system for read operations.
10482 It is useful to bind this variable with `let', but do not set it globally.
10483 If the value is a coding system, it is used for decoding on read operation.
10484 If not, an appropriate element is used from one of the coding system alists.
10485 There are three such tables: `file-coding-system-alist',
10486 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10487   Vcoding_system_for_read = Qnil;
10488
10489   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10490                doc: /* Specify the coding system for write operations.
10491 Programs bind this variable with `let', but you should not set it globally.
10492 If the value is a coding system, it is used for encoding of output,
10493 when writing it to a file and when sending it to a file or subprocess.
10494
10495 If this does not specify a coding system, an appropriate element
10496 is used from one of the coding system alists.
10497 There are three such tables: `file-coding-system-alist',
10498 `process-coding-system-alist', and `network-coding-system-alist'.
10499 For output to files, if the above procedure does not specify a coding system,
10500 the value of `buffer-file-coding-system' is used.  */);
10501   Vcoding_system_for_write = Qnil;
10502
10503   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10504                doc: /*
10505 Coding system used in the latest file or process I/O.  */);
10506   Vlast_coding_system_used = Qnil;
10507
10508   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10509                doc: /*
10510 Error status of the last code conversion.
10511
10512 When an error was detected in the last code conversion, this variable
10513 is set to one of the following symbols.
10514   `insufficient-source'
10515   `inconsistent-eol'
10516   `invalid-source'
10517   `interrupted'
10518   `insufficient-memory'
10519 When no error was detected, the value doesn't change.  So, to check
10520 the error status of a code conversion by this variable, you must
10521 explicitly set this variable to nil before performing code
10522 conversion.  */);
10523   Vlast_code_conversion_error = Qnil;
10524
10525   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10526                doc: /*
10527 *Non-nil means always inhibit code conversion of end-of-line format.
10528 See info node `Coding Systems' and info node `Text and Binary' concerning
10529 such conversion.  */);
10530   inhibit_eol_conversion = 0;
10531
10532   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10533                doc: /*
10534 Non-nil means process buffer inherits coding system of process output.
10535 Bind it to t if the process output is to be treated as if it were a file
10536 read from some filesystem.  */);
10537   inherit_process_coding_system = 0;
10538
10539   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10540                doc: /*
10541 Alist to decide a coding system to use for a file I/O operation.
10542 The format is ((PATTERN . VAL) ...),
10543 where PATTERN is a regular expression matching a file name,
10544 VAL is a coding system, a cons of coding systems, or a function symbol.
10545 If VAL is a coding system, it is used for both decoding and encoding
10546 the file contents.
10547 If VAL is a cons of coding systems, the car part is used for decoding,
10548 and the cdr part is used for encoding.
10549 If VAL is a function symbol, the function must return a coding system
10550 or a cons of coding systems which are used as above.  The function is
10551 called with an argument that is a list of the arguments with which
10552 `find-operation-coding-system' was called.  If the function can't decide
10553 a coding system, it can return `undecided' so that the normal
10554 code-detection is performed.
10555
10556 See also the function `find-operation-coding-system'
10557 and the variable `auto-coding-alist'.  */);
10558   Vfile_coding_system_alist = Qnil;
10559
10560   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10561                doc: /*
10562 Alist to decide a coding system to use for a process I/O operation.
10563 The format is ((PATTERN . VAL) ...),
10564 where PATTERN is a regular expression matching a program name,
10565 VAL is a coding system, a cons of coding systems, or a function symbol.
10566 If VAL is a coding system, it is used for both decoding what is received
10567 from the program and encoding what is sent to the program.
10568 If VAL is a cons of coding systems, the car part is used for decoding,
10569 and the cdr part is used for encoding.
10570 If VAL is a function symbol, the function must return a coding system
10571 or a cons of coding systems which are used as above.
10572
10573 See also the function `find-operation-coding-system'.  */);
10574   Vprocess_coding_system_alist = Qnil;
10575
10576   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10577                doc: /*
10578 Alist to decide a coding system to use for a network I/O operation.
10579 The format is ((PATTERN . VAL) ...),
10580 where PATTERN is a regular expression matching a network service name
10581 or is a port number to connect to,
10582 VAL is a coding system, a cons of coding systems, or a function symbol.
10583 If VAL is a coding system, it is used for both decoding what is received
10584 from the network stream and encoding what is sent to the network stream.
10585 If VAL is a cons of coding systems, the car part is used for decoding,
10586 and the cdr part is used for encoding.
10587 If VAL is a function symbol, the function must return a coding system
10588 or a cons of coding systems which are used as above.
10589
10590 See also the function `find-operation-coding-system'.  */);
10591   Vnetwork_coding_system_alist = Qnil;
10592
10593   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10594                doc: /* Coding system to use with system messages.
10595 Also used for decoding keyboard input on X Window system.  */);
10596   Vlocale_coding_system = Qnil;
10597
10598   /* The eol mnemonics are reset in startup.el system-dependently.  */
10599   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10600                doc: /*
10601 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10602   eol_mnemonic_unix = build_pure_c_string (":");
10603
10604   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10605                doc: /*
10606 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10607   eol_mnemonic_dos = build_pure_c_string ("\\");
10608
10609   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10610                doc: /*
10611 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10612   eol_mnemonic_mac = build_pure_c_string ("/");
10613
10614   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10615                doc: /*
10616 *String displayed in mode line when end-of-line format is not yet determined.  */);
10617   eol_mnemonic_undecided = build_pure_c_string (":");
10618
10619   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10620                doc: /*
10621 *Non-nil enables character translation while encoding and decoding.  */);
10622   Venable_character_translation = Qt;
10623
10624   DEFVAR_LISP ("standard-translation-table-for-decode",
10625                Vstandard_translation_table_for_decode,
10626                doc: /* Table for translating characters while decoding.  */);
10627   Vstandard_translation_table_for_decode = Qnil;
10628
10629   DEFVAR_LISP ("standard-translation-table-for-encode",
10630                Vstandard_translation_table_for_encode,
10631                doc: /* Table for translating characters while encoding.  */);
10632   Vstandard_translation_table_for_encode = Qnil;
10633
10634   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10635                doc: /* Alist of charsets vs revision numbers.
10636 While encoding, if a charset (car part of an element) is found,
10637 designate it with the escape sequence identifying revision (cdr part
10638 of the element).  */);
10639   Vcharset_revision_table = Qnil;
10640
10641   DEFVAR_LISP ("default-process-coding-system",
10642                Vdefault_process_coding_system,
10643                doc: /* Cons of coding systems used for process I/O by default.
10644 The car part is used for decoding a process output,
10645 the cdr part is used for encoding a text to be sent to a process.  */);
10646   Vdefault_process_coding_system = Qnil;
10647
10648   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10649                doc: /*
10650 Table of extra Latin codes in the range 128..159 (inclusive).
10651 This is a vector of length 256.
10652 If Nth element is non-nil, the existence of code N in a file
10653 \(or output of subprocess) doesn't prevent it from being detected as
10654 a coding system of ISO 2022 variant which has a flag
10655 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10656 or reading output of a subprocess.
10657 Only 128th through 159th elements have a meaning.  */);
10658   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10659
10660   DEFVAR_LISP ("select-safe-coding-system-function",
10661                Vselect_safe_coding_system_function,
10662                doc: /*
10663 Function to call to select safe coding system for encoding a text.
10664
10665 If set, this function is called to force a user to select a proper
10666 coding system which can encode the text in the case that a default
10667 coding system used in each operation can't encode the text.  The
10668 function should take care that the buffer is not modified while
10669 the coding system is being selected.
10670
10671 The default value is `select-safe-coding-system' (which see).  */);
10672   Vselect_safe_coding_system_function = Qnil;
10673
10674   DEFVAR_BOOL ("coding-system-require-warning",
10675                coding_system_require_warning,
10676                doc: /* Internal use only.
10677 If non-nil, on writing a file, `select-safe-coding-system-function' is
10678 called even if `coding-system-for-write' is non-nil.  The command
10679 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10680   coding_system_require_warning = 0;
10681
10682
10683   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10684                inhibit_iso_escape_detection,
10685                doc: /*
10686 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10687
10688 When Emacs reads text, it tries to detect how the text is encoded.
10689 This code detection is sensitive to escape sequences.  If Emacs sees
10690 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10691 of the ISO2022 encodings, and decodes text by the corresponding coding
10692 system (e.g. `iso-2022-7bit').
10693
10694 However, there may be a case that you want to read escape sequences in
10695 a file as is.  In such a case, you can set this variable to non-nil.
10696 Then the code detection will ignore any escape sequences, and no text is
10697 detected as encoded in some ISO-2022 encoding.  The result is that all
10698 escape sequences become visible in a buffer.
10699
10700 The default value is nil, and it is strongly recommended not to change
10701 it.  That is because many Emacs Lisp source files that contain
10702 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10703 in Emacs's distribution, and they won't be decoded correctly on
10704 reading if you suppress escape sequence detection.
10705
10706 The other way to read escape sequences in a file without decoding is
10707 to explicitly specify some coding system that doesn't use ISO-2022
10708 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10709   inhibit_iso_escape_detection = 0;
10710
10711   DEFVAR_BOOL ("inhibit-null-byte-detection",
10712                inhibit_null_byte_detection,
10713                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10714 By default, Emacs treats text containing null bytes as binary data and
10715 does not attempt to decode it.  The effect is as if you specified
10716 `no-conversion' for reading that text.
10717
10718 Set this to non-nil when a regular text happens to include null bytes.
10719 Examples are Index nodes of Info files and null-byte delimited output
10720 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10721 decode text as usual.  */);
10722   inhibit_null_byte_detection = 0;
10723
10724   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10725                doc: /* Char table for translating self-inserting characters.
10726 This is applied to the result of input methods, not their input.
10727 See also `keyboard-translate-table'.
10728
10729 Use of this variable for character code unification was rendered
10730 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10731 internal character representation.  */);
10732     Vtranslation_table_for_input = Qnil;
10733
10734   {
10735     Lisp_Object args[coding_arg_max];
10736     Lisp_Object plist[16];
10737     int i;
10738
10739     for (i = 0; i < coding_arg_max; i++)
10740       args[i] = Qnil;
10741
10742     plist[0] = intern_c_string (":name");
10743     plist[1] = args[coding_arg_name] = Qno_conversion;
10744     plist[2] = intern_c_string (":mnemonic");
10745     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10746     plist[4] = intern_c_string (":coding-type");
10747     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10748     plist[6] = intern_c_string (":ascii-compatible-p");
10749     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10750     plist[8] = intern_c_string (":default-char");
10751     plist[9] = args[coding_arg_default_char] = make_number (0);
10752     plist[10] = intern_c_string (":for-unibyte");
10753     plist[11] = args[coding_arg_for_unibyte] = Qt;
10754     plist[12] = intern_c_string (":docstring");
10755     plist[13] = build_pure_c_string ("Do no conversion.\n\
10756 \n\
10757 When you visit a file with this coding, the file is read into a\n\
10758 unibyte buffer as is, thus each byte of a file is treated as a\n\
10759 character.");
10760     plist[14] = intern_c_string (":eol-type");
10761     plist[15] = args[coding_arg_eol_type] = Qunix;
10762     args[coding_arg_plist] = Flist (16, plist);
10763     Fdefine_coding_system_internal (coding_arg_max, args);
10764
10765     plist[1] = args[coding_arg_name] = Qundecided;
10766     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10767     plist[5] = args[coding_arg_coding_type] = Qundecided;
10768     /* This is already set.
10769        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10770     plist[8] = intern_c_string (":charset-list");
10771     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10772     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10773     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10774     plist[15] = args[coding_arg_eol_type] = Qnil;
10775     args[coding_arg_plist] = Flist (16, plist);
10776     Fdefine_coding_system_internal (coding_arg_max, args);
10777   }
10778
10779   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10780
10781   {
10782     int i;
10783
10784     for (i = 0; i < coding_category_max; i++)
10785       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10786   }
10787 #if defined (DOS_NT)
10788   system_eol_type = Qdos;
10789 #else
10790   system_eol_type = Qunix;
10791 #endif
10792   staticpro (&system_eol_type);
10793 }
10794
10795 char *
10796 emacs_strerror (int error_number)
10797 {
10798   char *str;
10799
10800   synchronize_system_messages_locale ();
10801   str = strerror (error_number);
10802
10803   if (! NILP (Vlocale_coding_system))
10804     {
10805       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10806                                                       Vlocale_coding_system,
10807                                                       0);
10808       str = SSDATA (dec);
10809     }
10810
10811   return str;
10812 }
10813
10814 #endif /* emacs */