src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Stop at the first invalid byte; remember any strong evidence.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301 /* NOTE(review): presumably maps a coding system symbol to its attribute vector; confirm in coding.h.  */
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by the system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 /* NOTE(review): a comment here used to announce "three variables" for
 347    handing coding systems between Lisp and C; only one is defined below.  */
 348 /* Coding system to be used to encode text for terminal display when
 349    terminal coding system is nil.  */
 350 struct coding_system safe_terminal_coding;
 351
 352 #endif /* emacs */
 353
 354 Lisp_Object Qtranslation_table;
 355 Lisp_Object Qtranslation_table_id;
 356 static Lisp_Object Qtranslation_table_for_decode;
 357 static Lisp_Object Qtranslation_table_for_encode;
 358
 359 /* Two special coding systems.  */
 360 static Lisp_Object Vsjis_coding_system;
 361 static Lisp_Object Vbig5_coding_system;
 362
 363 /* ISO2022 section */
 364 /* Element REG of the `coding_attr_iso_initial' attribute of CODING, passed through XINT.  */
 365 #define CODING_ISO_INITIAL(coding, reg)                 \
 366   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 367                      coding_attr_iso_initial),          \
 368                reg)))
 369 /* CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is greater
 370    than CODING->max_charset_id or the stored value is 255.  */
 371 #define CODING_ISO_REQUEST(coding, charset_id)          \
 372   (((charset_id) <= (coding)->max_charset_id            \
 373     ? ((coding)->safe_charsets[charset_id] != 255       \
 374        ? (coding)->safe_charsets[charset_id]            \
 375        : -1)                                            \
 376     : -1))
 377
 378 /* Accessors for the members of CODING->spec.iso_2022.  */
 379 #define CODING_ISO_FLAGS(coding)        \
 380   ((coding)->spec.iso_2022.flags)
 381 #define CODING_ISO_DESIGNATION(coding, reg)     \
 382   ((coding)->spec.iso_2022.current_designation[reg])
 383 #define CODING_ISO_INVOCATION(coding, plane)    \
 384   ((coding)->spec.iso_2022.current_invocation[plane])
 385 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 386   ((coding)->spec.iso_2022.single_shifting)
 387 #define CODING_ISO_BOL(coding)  \
 388   ((coding)->spec.iso_2022.bol)
 389 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 390   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 391 #define CODING_ISO_CMP_STATUS(coding)   \
 392   (&(coding)->spec.iso_2022.cmp_status)
 393 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 394   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 395 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 396   ((coding)->spec.iso_2022.embedded_utf_8)
 397
 398 /* Control characters of ISO2022.  */
 399                         /* code */      /* function */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
 408 /* Every (1-byte) code of ISO2022 is classified into one of the
 409    following classes.  */
 410 enum iso_code_class_type
 411   {
 412     ISO_control_0,              /* Control codes in the range
 413                                    0x00..0x1F and 0x7F, except for the
 414                                    following 4 codes.  */
 415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 418     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 419     ISO_control_1,              /* Control codes in the range
 420                                    0x80..0x9F, except for the
 421                                    following 3 codes.  */
 422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 429   };
 430
 431 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overwrite
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481 /* Bit 0x4000 is not defined: its only definition is commented out.  */
 482 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section: these read slots of CODING's attribute vector.  */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories'.  Each value
 518    is also the bit number of the matching CATEGORY_MASK_XXX.  */
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Flag bits used in detect_coding_XXXX; bit N stands for category N.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
 568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  It omits only CATEGORY_MASK_RAW_TEXT.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591 /* Unions of the masks above, grouped by family.  */
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority (coding_category_max slots).  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category (an enum coding_category value).  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
 636 /* min and max evaluate one argument twice; avoid side effects in A and B.  */
 637 #ifndef min
 638 #define min(a, b) ((a) < (b) ? (a) : (b))
 639 #endif
 640 #ifndef max
 641 #define max(a, b) ((a) > (b) ? (a) : (b))
 642 #endif
 643 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to the CODING_ATTR_CHARSET_LIST of ATTRS.  */
 644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 645   do {                                                  \
 646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 648   } while (0)
 649
 650
 651 /* Safely get one byte from the source text pointed by SRC which ends
 652    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
 653    the source is exhausted, jump to 'no_more_source', first recording
 654    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If MULTIBYTEP,
 655    a two-byte sequence led by 0xC0 or 0xC1 is folded into one value in
 656    C; any other multibyte character at SRC sets C to the negative of
 657    its code and records CODING_RESULT_INVALID_SRC.  Variables needed:
 658         src, src_end, src_base, multibytep, consumed_chars, coding */
 659 #define ONE_MORE_BYTE(c)                                \
 660   do {                                                  \
 661     if (src == src_end)                                 \
 662       {                                                 \
 663         if (src_base < src)                             \
 664           record_conversion_result                      \
 665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 666         goto no_more_source;                            \
 667       }                                                 \
 668     c = *src++;                                         \
 669     if (multibytep && (c & 0x80))                       \
 670       {                                                 \
 671         if ((c & 0xFE) == 0xC0)                         \
 672           c = ((c & 1) << 6) | *src++;                  \
 673         else                                            \
 674           {                                             \
 675             src--;                                      \
 676             c = - string_char (src, &src, NULL);        \
 677             record_conversion_result                    \
 678               (coding, CODING_RESULT_INVALID_SRC);      \
 679           }                                             \
 680       }                                                 \
 681     consumed_chars++;                                   \
 682   } while (0)
 683
 684 /* Safely get two bytes from the source text pointed by SRC which ends
 685    at SRC_END, and set C1 and C2 to those bytes while skipping the
 686    heading multibyte characters.  If there are not enough bytes in the
 687    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 688    a multibyte character (other than a 0xC0/0xC1-led pair) is found for
 689    C2, set C2 to -1.  Unlike ONE_MORE_BYTE, this neither counts consumed
 690    characters nor records a result.  Variables needed in advance:
 691         src, src_end, multibytep
 692    It is intended that this macro is used in detect_coding_utf_16.  */
 693
 694 #define TWO_MORE_BYTES(c1, c2)                          \
 695   do {                                                  \
 696     do {                                                \
 697       if (src == src_end)                               \
 698         goto no_more_source;                            \
 699       c1 = *src++;                                      \
 700       if (multibytep && (c1 & 0x80))                    \
 701         {                                               \
 702           if ((c1 & 0xFE) == 0xC0)                      \
 703             c1 = ((c1 & 1) << 6) | *src++;              \
 704           else                                          \
 705             {                                           \
 706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 707               c1 = -1;                                  \
 708             }                                           \
 709         }                                               \
 710     } while (c1 < 0);                                   \
 711     if (src == src_end)                                 \
 712       goto no_more_source;                              \
 713     c2 = *src++;                                        \
 714     if (multibytep && (c2 & 0x80))                      \
 715       {                                                 \
 716         if ((c2 & 0xFE) == 0xC0)                        \
 717           c2 = ((c2 & 1) << 6) | *src++;                \
 718         else                                            \
 719           c2 = -1;                                      \
 720       }                                                 \
 721   } while (0)
 722
 723
 724 /* Store a byte C in the place pointed by DST and increment DST to the
 725    next free point, and increment PRODUCED_CHARS.  The caller should
 726    assure that C is 0..127, and declare and set the variables `dst'
 727    and `produced_chars' appropriately in advance.
 728 */
 729
 730
 731 #define EMIT_ONE_ASCII_BYTE(c)  \
 732   do {                          \
 733     produced_chars++;           \
 734     *dst++ = (c);               \
 735   } while (0)
 736
 737
 738 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
 739    PRODUCED_CHARS is increased by two.  */
 740 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 741   do {                                  \
 742     produced_chars += 2;                \
 743     *dst++ = (c1), *dst++ = (c2);       \
 744   } while (0)
 745
 746
 747 /* Store a byte C in the place pointed by DST and increment DST to the
 748    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 749    store in multibyte form (C >= 0x80 goes through BYTE8_TO_CHAR).  The
 750    caller should declare and set the variables `dst', `multibytep' and
 751    `produced_chars' appropriately in advance.  */
 752
 753 #define EMIT_ONE_BYTE(c)                \
 754   do {                                  \
 755     produced_chars++;                   \
 756     if (multibytep)                     \
 757       {                                 \
 758         unsigned ch = (c);              \
 759         if (ch >= 0x80)                 \
 760           ch = BYTE8_TO_CHAR (ch);      \
 761         CHAR_STRING_ADVANCE (ch, dst);  \
 762       }                                 \
 763     else                                \
 764       *dst++ = (c);                     \
 765   } while (0)
 766
 767
 768 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
 769    is increased by two.  */
 770 #define EMIT_TWO_BYTES(c1, c2)          \
 771   do {                                  \
 772     produced_chars += 2;                \
 773     if (multibytep)                     \
 774       {                                 \
 775         unsigned ch;                    \
 776                                         \
 777         ch = (c1);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781         ch = (c2);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785       }                                 \
 786     else                                \
 787       {                                 \
 788         *dst++ = (c1);                  \
 789         *dst++ = (c2);                  \
 790       }                                 \
 791   } while (0)
 792
 793
 794 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 795   do {                                  \
 796     EMIT_ONE_BYTE (c1);                 \
 797     EMIT_TWO_BYTES (c2, c3);            \
 798   } while (0)
 799
 800
 801 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 802   do {                                          \
 803     EMIT_TWO_BYTES (c1, c2);                    \
 804     EMIT_TWO_BYTES (c3, c4);                    \
 805   } while (0)
 806
 807
 808 static void
 809 record_conversion_result (struct coding_system *coding,
 810                           enum coding_result_code result)
 811 {
 812   coding->result = result;
 813   switch (result)
 814     {
 815     case CODING_RESULT_INSUFFICIENT_SRC:
 816       Vlast_code_conversion_error = Qinsufficient_source;
 817       break;
 818     case CODING_RESULT_INCONSISTENT_EOL:
 819       Vlast_code_conversion_error = Qinconsistent_eol;
 820       break;
 821     case CODING_RESULT_INVALID_SRC:
 822       Vlast_code_conversion_error = Qinvalid_source;
 823       break;
 824     case CODING_RESULT_INTERRUPT:
 825       Vlast_code_conversion_error = Qinterrupted;
 826       break;
 827     case CODING_RESULT_INSUFFICIENT_MEM:
 828       Vlast_code_conversion_error = Qinsufficient_memory;
 829       break;
 830     case CODING_RESULT_INSUFFICIENT_DST:
 831       /* Don't record this error in Vlast_code_conversion_error
 832          because it happens just temporarily and is resolved when the
 833          whole conversion is finished.  */
 834       break;
 835     case CODING_RESULT_SUCCESS:
 836       break;
 837     default:
 838       Vlast_code_conversion_error = intern ("Unknown error");
 839     }
 840 }
 841
 842 /* These wrapper macros are used to preserve validity of pointers into
 843    buffer text across calls to decode_char, encode_char, etc, which
 844    could cause relocation of buffers if it loads a charset map,
 845    because loading a charset map allocates large structures.  */
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     ptrdiff_t offset;                                                        \
 850                                                                              \
 851     charset_map_loaded = 0;                                                  \
 852     c = DECODE_CHAR (charset, code);                                         \
 853     if (charset_map_loaded                                                   \
 854         && (offset = coding_change_source (coding)))                         \
 855       {                                                                      \
 856         src += offset;                                                       \
 857         src_base += offset;                                                  \
 858         src_end += offset;                                                   \
 859       }                                                                      \
 860   } while (0)
 861
 862 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 863   do {                                                                  \
 864     ptrdiff_t offset;                                                   \
 865                                                                         \
 866     charset_map_loaded = 0;                                             \
 867     code = ENCODE_CHAR (charset, c);                                    \
 868     if (charset_map_loaded                                              \
 869         && (offset = coding_change_destination (coding)))               \
 870       {                                                                 \
 871         dst += offset;                                                  \
 872         dst_end += offset;                                              \
 873       }                                                                 \
 874   } while (0)
 875
 876 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 877   do {                                                                  \
 878     ptrdiff_t offset;                                                   \
 879                                                                         \
 880     charset_map_loaded = 0;                                             \
 881     charset = char_charset (c, charset_list, code_return);              \
 882     if (charset_map_loaded                                              \
 883         && (offset = coding_change_destination (coding)))               \
 884       {                                                                 \
 885         dst += offset;                                                  \
 886         dst_end += offset;                                              \
 887       }                                                                 \
 888   } while (0)
 889
 890 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 891   do {                                                                  \
 892     ptrdiff_t offset;                                                   \
 893                                                                         \
 894     charset_map_loaded = 0;                                             \
 895     result = CHAR_CHARSET_P (c, charset);                               \
 896     if (charset_map_loaded                                              \
 897         && (offset = coding_change_destination (coding)))               \
 898       {                                                                 \
 899         dst += offset;                                                  \
 900         dst_end += offset;                                              \
 901       }                                                                 \
 902   } while (0)
 903
 904
 905 /* If there are at least BYTES length of room at dst, allocate memory
 906    for coding->destination and update dst and dst_end.  We don't have
 907    to take care of coding->source which will be relocated.  It is
 908    handled by calling coding_set_source in encode_coding.  */
 909
 910 #define ASSURE_DESTINATION(bytes)                               \
 911   do {                                                          \
 912     if (dst + (bytes) >= dst_end)                               \
 913       {                                                         \
 914         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 915                                                                 \
 916         dst = alloc_destination (coding, more_bytes, dst);      \
 917         dst_end = coding->destination + coding->dst_bytes;      \
 918       }                                                         \
 919   } while (0)
 920
 921
 922 /* Store multibyte form of the character C in P, and advance P to the
 923    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 924    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 925    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 926
 927 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 928
 929 /* Return the character code of character whose multibyte form is at
 930    P, and advance P to the end of the multibyte form.  This used to be
 931    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 932    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 933
 934 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 935
 936 /* Set coding->source from coding->src_object.  */
 937
 938 static void
 939 coding_set_source (struct coding_system *coding)
 940 {
 941   if (BUFFERP (coding->src_object))
 942     {
 943       struct buffer *buf = XBUFFER (coding->src_object);
 944
 945       if (coding->src_pos < 0)
 946         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 947       else
 948         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 949     }
 950   else if (STRINGP (coding->src_object))
 951     {
 952       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 953     }
 954   else
 955     {
 956       /* Otherwise, the source is C string and is never relocated
 957          automatically.  Thus we don't have to update anything.  */
 958     }
 959 }
 960
 961
 962 /* Set coding->source from coding->src_object, and return how many
 963    bytes coding->source was changed.  */
 964
 965 static ptrdiff_t
 966 coding_change_source (struct coding_system *coding)
 967 {
 968   const unsigned char *orig = coding->source;
 969   coding_set_source (coding);
 970   return coding->source - orig;
 971 }
 972
 973
 974 /* Set coding->destination from coding->dst_object.  */
 975
 976 static void
 977 coding_set_destination (struct coding_system *coding)
 978 {
 979   if (BUFFERP (coding->dst_object))
 980     {
 981       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 982         {
 983           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 984           coding->dst_bytes = (GAP_END_ADDR
 985                                - (coding->src_bytes - coding->consumed)
 986                                - coding->destination);
 987         }
 988       else
 989         {
 990           /* We are sure that coding->dst_pos_byte is before the gap
 991              of the buffer. */
 992           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 993                                  + coding->dst_pos_byte - BEG_BYTE);
 994           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 995                                - coding->destination);
 996         }
 997     }
 998   else
 999     {
1000       /* Otherwise, the destination is C string and is never relocated
1001          automatically.  Thus we don't have to update anything.  */
1002     }
1003 }
1004
1005
1006 /* Set coding->destination from coding->dst_object, and return how
1007    many bytes coding->destination was changed.  */
1008
1009 static ptrdiff_t
1010 coding_change_destination (struct coding_system *coding)
1011 {
1012   const unsigned char *orig = coding->destination;
1013   coding_set_destination (coding);
1014   return coding->destination - orig;
1015 }
1016
1017
1018 static void
1019 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1020 {
1021   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1022     string_overflow ();
1023   coding->destination = xrealloc (coding->destination,
1024                                   coding->dst_bytes + bytes);
1025   coding->dst_bytes += bytes;
1026 }
1027
1028 static void
1029 coding_alloc_by_making_gap (struct coding_system *coding,
1030                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1031 {
1032   if (EQ (coding->src_object, coding->dst_object))
1033     {
1034       /* The gap may contain the produced data at the head and not-yet
1035          consumed data at the tail.  To preserve those data, we at
1036          first make the gap size to zero, then increase the gap
1037          size.  */
1038       ptrdiff_t add = GAP_SIZE;
1039
1040       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1041       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1042       make_gap (bytes);
1043       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1044       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1045     }
1046   else
1047     {
1048       Lisp_Object this_buffer;
1049
1050       this_buffer = Fcurrent_buffer ();
1051       set_buffer_internal (XBUFFER (coding->dst_object));
1052       make_gap (bytes);
1053       set_buffer_internal (XBUFFER (this_buffer));
1054     }
1055 }
1056
1057
1058 static unsigned char *
1059 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1060                    unsigned char *dst)
1061 {
1062   ptrdiff_t offset = dst - coding->destination;
1063
1064   if (BUFFERP (coding->dst_object))
1065     {
1066       struct buffer *buf = XBUFFER (coding->dst_object);
1067
1068       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1069     }
1070   else
1071     coding_alloc_by_realloc (coding, nbytes);
1072   coding_set_destination (coding);
1073   dst = coding->destination + offset;
1074   return dst;
1075 }
1076
1077 /** Macros for annotations.  */
1078
1079 /* An annotation data is stored in the array coding->charbuf in this
1080    format:
1081      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1082    LENGTH is the number of elements in the annotation.
1083    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1084    NCHARS is the number of characters in the text annotated.
1085
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1090      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1091
1092    NBYTES is the number of bytes specified in the header part of
1093    old-style emacs-mule encoding, or 0 for the other kind of
1094    composition.
1095
1096    METHOD is one of enum composition_method.
1097
1098    Optional COMPOSITION-COMPONENTS are characters and composition
1099    rules.
1100
1101    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102    follows.
1103
   If ANNOTATION_MASK is 0, this annotation is just a placeholder used
   to recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1107
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  BUF must be a
   modifiable pointer lvalue; it is evaluated three times.  The
   expansion uses the caller's `coding' variable.

   The expansion deliberately has no trailing semicolon, so that the
   macro behaves as a single statement and can be used as the body of
   an `if' that is followed by `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1118
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  BUF must be a modifiable
   pointer lvalue and is evaluated several times.  All arguments are
   parenthesized in the expansion so that an argument such as `*pp'
   advances the intended object.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1125
1126
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF must be a modifiable pointer
   lvalue and is evaluated several times.  All arguments are
   parenthesized in the expansion so that an argument such as `*pp'
   advances the intended object.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1132
1133 \f
1134 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1135
1136
1137
1138 \f
1139 /*** 3. UTF-8 ***/
1140
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in UTF-8.  (This describes
   detect_coding_utf_8 below; the helper macros come first.)  */

/* Classification of a single octet C of a UTF-8 sequence.  Each
   predicate tests only the fixed high bits of its octet class:
   0xxxxxxx single octet, 10xxxxxx trailing octet, 110xxxxx, 1110xxxx,
   11110xxx and 111110xx leading octets of 2- to 5-octet sequences.  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     ((((c) ^ 0x80) & 0xC0) == 0)
#define UTF_8_2_OCTET_LEADING_P(c) ((((c) ^ 0xC0) & 0xE0) == 0)
#define UTF_8_3_OCTET_LEADING_P(c) ((((c) ^ 0xE0) & 0xF0) == 0)
#define UTF_8_4_OCTET_LEADING_P(c) ((((c) ^ 0xF0) & 0xF8) == 0)
#define UTF_8_5_OCTET_LEADING_P(c) ((((c) ^ 0xF8) & 0xFC) == 0)

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1154
1155 static bool
1156 detect_coding_utf_8 (struct coding_system *coding,
1157                      struct coding_detection_info *detect_info)
1158 {
1159   const unsigned char *src = coding->source, *src_base;
1160   const unsigned char *src_end = coding->source + coding->src_bytes;
1161   bool multibytep = coding->src_multibyte;
1162   ptrdiff_t consumed_chars = 0;
1163   bool bom_found = 0;
1164   bool found = 0;
1165
1166   detect_info->checked |= CATEGORY_MASK_UTF_8;
1167   /* A coding system of this category is always ASCII compatible.  */
1168   src += coding->head_ascii;
1169
1170   while (1)
1171     {
1172       int c, c1, c2, c3, c4;
1173
1174       src_base = src;
1175       ONE_MORE_BYTE (c);
1176       if (c < 0 || UTF_8_1_OCTET_P (c))
1177         continue;
1178       ONE_MORE_BYTE (c1);
1179       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1180         break;
1181       if (UTF_8_2_OCTET_LEADING_P (c))
1182         {
1183           found = 1;
1184           continue;
1185         }
1186       ONE_MORE_BYTE (c2);
1187       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1188         break;
1189       if (UTF_8_3_OCTET_LEADING_P (c))
1190         {
1191           found = 1;
1192           if (src_base == coding->source
1193               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1194             bom_found = 1;
1195           continue;
1196         }
1197       ONE_MORE_BYTE (c3);
1198       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1199         break;
1200       if (UTF_8_4_OCTET_LEADING_P (c))
1201         {
1202           found = 1;
1203           continue;
1204         }
1205       ONE_MORE_BYTE (c4);
1206       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1207         break;
1208       if (UTF_8_5_OCTET_LEADING_P (c))
1209         {
1210           found = 1;
1211           continue;
1212         }
1213       break;
1214     }
1215   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1216   return 0;
1217
1218  no_more_source:
1219   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1220     {
1221       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1222       return 0;
1223     }
1224   if (bom_found)
1225     {
1226       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1227       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1228     }
1229   else
1230     {
1231       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1232       if (found)
1233         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1234     }
1235   return 1;
1236 }
1237
1238
1239 static void
1240 decode_coding_utf_8 (struct coding_system *coding)
1241 {
1242   const unsigned char *src = coding->source + coding->consumed;
1243   const unsigned char *src_end = coding->source + coding->src_bytes;
1244   const unsigned char *src_base;
1245   int *charbuf = coding->charbuf + coding->charbuf_used;
1246   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1247   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1248   bool multibytep = coding->src_multibyte;
1249   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1250   bool eol_dos
1251     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1252   int byte_after_cr = -1;
1253
1254   if (bom != utf_without_bom)
1255     {
1256       int c1, c2, c3;
1257
1258       src_base = src;
1259       ONE_MORE_BYTE (c1);
1260       if (! UTF_8_3_OCTET_LEADING_P (c1))
1261         src = src_base;
1262       else
1263         {
1264           ONE_MORE_BYTE (c2);
1265           if (! UTF_8_EXTRA_OCTET_P (c2))
1266             src = src_base;
1267           else
1268             {
1269               ONE_MORE_BYTE (c3);
1270               if (! UTF_8_EXTRA_OCTET_P (c3))
1271                 src = src_base;
1272               else
1273                 {
1274                   if ((c1 != UTF_8_BOM_1)
1275                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1276                     src = src_base;
1277                   else
1278                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1279                 }
1280             }
1281         }
1282     }
1283   CODING_UTF_8_BOM (coding) = utf_without_bom;
1284
1285   while (1)
1286     {
1287       int c, c1, c2, c3, c4, c5;
1288
1289       src_base = src;
1290       consumed_chars_base = consumed_chars;
1291
1292       if (charbuf >= charbuf_end)
1293         {
1294           if (byte_after_cr >= 0)
1295             src_base--;
1296           break;
1297         }
1298
1299       if (byte_after_cr >= 0)
1300         c1 = byte_after_cr, byte_after_cr = -1;
1301       else
1302         ONE_MORE_BYTE (c1);
1303       if (c1 < 0)
1304         {
1305           c = - c1;
1306         }
1307       else if (UTF_8_1_OCTET_P (c1))
1308         {
1309           if (eol_dos && c1 == '\r')
1310             ONE_MORE_BYTE (byte_after_cr);
1311           c = c1;
1312         }
1313       else
1314         {
1315           ONE_MORE_BYTE (c2);
1316           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1317             goto invalid_code;
1318           if (UTF_8_2_OCTET_LEADING_P (c1))
1319             {
1320               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1321               /* Reject overlong sequences here and below.  Encoders
1322                  producing them are incorrect, they can be misleading,
1323                  and they mess up read/write invariance.  */
1324               if (c < 128)
1325                 goto invalid_code;
1326             }
1327           else
1328             {
1329               ONE_MORE_BYTE (c3);
1330               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1331                 goto invalid_code;
1332               if (UTF_8_3_OCTET_LEADING_P (c1))
1333                 {
1334                   c = (((c1 & 0xF) << 12)
1335                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1336                   if (c < 0x800
1337                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1338                     goto invalid_code;
1339                 }
1340               else
1341                 {
1342                   ONE_MORE_BYTE (c4);
1343                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1344                     goto invalid_code;
1345                   if (UTF_8_4_OCTET_LEADING_P (c1))
1346                     {
1347                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1348                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1349                     if (c < 0x10000)
1350                       goto invalid_code;
1351                     }
1352                   else
1353                     {
1354                       ONE_MORE_BYTE (c5);
1355                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1356                         goto invalid_code;
1357                       if (UTF_8_5_OCTET_LEADING_P (c1))
1358                         {
1359                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1360                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1361                                | (c5 & 0x3F));
1362                           if ((c > MAX_CHAR) || (c < 0x200000))
1363                             goto invalid_code;
1364                         }
1365                       else
1366                         goto invalid_code;
1367                     }
1368                 }
1369             }
1370         }
1371
1372       *charbuf++ = c;
1373       continue;
1374
1375     invalid_code:
1376       src = src_base;
1377       consumed_chars = consumed_chars_base;
1378       ONE_MORE_BYTE (c);
1379       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1380       coding->errors++;
1381     }
1382
1383  no_more_source:
1384   coding->consumed_char += consumed_chars_base;
1385   coding->consumed = src_base - coding->source;
1386   coding->charbuf_used = charbuf - coding->charbuf;
1387 }
1388
1389
1390 static bool
1391 encode_coding_utf_8 (struct coding_system *coding)
1392 {
1393   bool multibytep = coding->dst_multibyte;
1394   int *charbuf = coding->charbuf;
1395   int *charbuf_end = charbuf + coding->charbuf_used;
1396   unsigned char *dst = coding->destination + coding->produced;
1397   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1398   ptrdiff_t produced_chars = 0;
1399   int c;
1400
1401   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1402     {
1403       ASSURE_DESTINATION (3);
1404       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1405       CODING_UTF_8_BOM (coding) = utf_without_bom;
1406     }
1407
1408   if (multibytep)
1409     {
1410       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1411
1412       while (charbuf < charbuf_end)
1413         {
1414           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1415
1416           ASSURE_DESTINATION (safe_room);
1417           c = *charbuf++;
1418           if (CHAR_BYTE8_P (c))
1419             {
1420               c = CHAR_TO_BYTE8 (c);
1421               EMIT_ONE_BYTE (c);
1422             }
1423           else
1424             {
1425               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1426               for (p = str; p < pend; p++)
1427                 EMIT_ONE_BYTE (*p);
1428             }
1429         }
1430     }
1431   else
1432     {
1433       int safe_room = MAX_MULTIBYTE_LENGTH;
1434
1435       while (charbuf < charbuf_end)
1436         {
1437           ASSURE_DESTINATION (safe_room);
1438           c = *charbuf++;
1439           if (CHAR_BYTE8_P (c))
1440             *dst++ = CHAR_TO_BYTE8 (c);
1441           else
1442             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1443           produced_chars++;
1444         }
1445     }
1446   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1447   coding->produced_char += produced_chars;
1448   coding->produced = dst - coding->destination;
1449   return 0;
1450 }
1451
1452
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in one of UTF-16 based coding systems.
   (This describes detect_coding_utf_16 below; the helper macros come
   first.)  */

/* True if bits 10..15 of VAL are those of a high (leading)
   surrogate, i.e. VAL masked with 0xFC00 equals 0xD800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  ((((val) ^ 0xD800) & 0xFC00) == 0)

/* True if bits 10..15 of VAL are those of a low (trailing)
   surrogate, i.e. VAL masked with 0xFC00 equals 0xDC00.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  ((((val) ^ 0xDC00) & 0xFC00) == 0)
1461
1462
1463 static bool
1464 detect_coding_utf_16 (struct coding_system *coding,
1465                       struct coding_detection_info *detect_info)
1466 {
1467   const unsigned char *src = coding->source;
1468   const unsigned char *src_end = coding->source + coding->src_bytes;
1469   bool multibytep = coding->src_multibyte;
1470   int c1, c2;
1471
1472   detect_info->checked |= CATEGORY_MASK_UTF_16;
1473   if (coding->mode & CODING_MODE_LAST_BLOCK
1474       && (coding->src_chars & 1))
1475     {
1476       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1477       return 0;
1478     }
1479
1480   TWO_MORE_BYTES (c1, c2);
1481   if ((c1 == 0xFF) && (c2 == 0xFE))
1482     {
1483       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1484                              | CATEGORY_MASK_UTF_16_AUTO);
1485       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1486                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1487                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1488     }
1489   else if ((c1 == 0xFE) && (c2 == 0xFF))
1490     {
1491       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1492                              | CATEGORY_MASK_UTF_16_AUTO);
1493       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1494                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1495                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1496     }
1497   else if (c2 < 0)
1498     {
1499       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1500       return 0;
1501     }
1502   else
1503     {
1504       /* We check the dispersion of Eth and Oth bytes where E is even and
1505          O is odd.  If both are high, we assume binary data.*/
1506       unsigned char e[256], o[256];
1507       unsigned e_num = 1, o_num = 1;
1508
1509       memset (e, 0, 256);
1510       memset (o, 0, 256);
1511       e[c1] = 1;
1512       o[c2] = 1;
1513
1514       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1515                                 |CATEGORY_MASK_UTF_16_BE
1516                                 | CATEGORY_MASK_UTF_16_LE);
1517
1518       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1519              != CATEGORY_MASK_UTF_16)
1520         {
1521           TWO_MORE_BYTES (c1, c2);
1522           if (c2 < 0)
1523             break;
1524           if (! e[c1])
1525             {
1526               e[c1] = 1;
1527               e_num++;
1528               if (e_num >= 128)
1529                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1530             }
1531           if (! o[c2])
1532             {
1533               o[c2] = 1;
1534               o_num++;
1535               if (o_num >= 128)
1536                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1537             }
1538         }
1539       return 0;
1540     }
1541
1542  no_more_source:
1543   return 1;
1544 }
1545
1546 static void
1547 decode_coding_utf_16 (struct coding_system *coding)
1548 {
1549   const unsigned char *src = coding->source + coding->consumed;
1550   const unsigned char *src_end = coding->source + coding->src_bytes;
1551   const unsigned char *src_base;
1552   int *charbuf = coding->charbuf + coding->charbuf_used;
1553   /* We may produces at most 3 chars in one loop.  */
1554   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1555   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1556   bool multibytep = coding->src_multibyte;
1557   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1558   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1559   int surrogate = CODING_UTF_16_SURROGATE (coding);
1560   bool eol_dos
1561     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1562   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1563
1564   if (bom == utf_with_bom)
1565     {
1566       int c, c1, c2;
1567
1568       src_base = src;
1569       ONE_MORE_BYTE (c1);
1570       ONE_MORE_BYTE (c2);
1571       c = (c1 << 8) | c2;
1572
1573       if (endian == utf_16_big_endian
1574           ? c != 0xFEFF : c != 0xFFFE)
1575         {
1576           /* The first two bytes are not BOM.  Treat them as bytes
1577              for a normal character.  */
1578           src = src_base;
1579           coding->errors++;
1580         }
1581       CODING_UTF_16_BOM (coding) = utf_without_bom;
1582     }
1583   else if (bom == utf_detect_bom)
1584     {
1585       /* We have already tried to detect BOM and failed in
1586          detect_coding.  */
1587       CODING_UTF_16_BOM (coding) = utf_without_bom;
1588     }
1589
1590   while (1)
1591     {
1592       int c, c1, c2;
1593
1594       src_base = src;
1595       consumed_chars_base = consumed_chars;
1596
1597       if (charbuf >= charbuf_end)
1598         {
1599           if (byte_after_cr1 >= 0)
1600             src_base -= 2;
1601           break;
1602         }
1603
1604       if (byte_after_cr1 >= 0)
1605         c1 = byte_after_cr1, byte_after_cr1 = -1;
1606       else
1607         ONE_MORE_BYTE (c1);
1608       if (c1 < 0)
1609         {
1610           *charbuf++ = -c1;
1611           continue;
1612         }
1613       if (byte_after_cr2 >= 0)
1614         c2 = byte_after_cr2, byte_after_cr2 = -1;
1615       else
1616         ONE_MORE_BYTE (c2);
1617       if (c2 < 0)
1618         {
1619           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1620           *charbuf++ = -c2;
1621           continue;
1622         }
1623       c = (endian == utf_16_big_endian
1624            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1625
1626       if (surrogate)
1627         {
1628           if (! UTF_16_LOW_SURROGATE_P (c))
1629             {
1630               if (endian == utf_16_big_endian)
1631                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1632               else
1633                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1634               *charbuf++ = c1;
1635               *charbuf++ = c2;
1636               coding->errors++;
1637               if (UTF_16_HIGH_SURROGATE_P (c))
1638                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1639               else
1640                 *charbuf++ = c;
1641             }
1642           else
1643             {
1644               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1645               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1646               *charbuf++ = 0x10000 + c;
1647             }
1648         }
1649       else
1650         {
1651           if (UTF_16_HIGH_SURROGATE_P (c))
1652             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1653           else
1654             {
1655               if (eol_dos && c == '\r')
1656                 {
1657                   ONE_MORE_BYTE (byte_after_cr1);
1658                   ONE_MORE_BYTE (byte_after_cr2);
1659                 }
1660               *charbuf++ = c;
1661             }
1662         }
1663     }
1664
1665  no_more_source:
1666   coding->consumed_char += consumed_chars_base;
1667   coding->consumed = src_base - coding->source;
1668   coding->charbuf_used = charbuf - coding->charbuf;
1669 }
1670
1671 static bool
1672 encode_coding_utf_16 (struct coding_system *coding)
1673 {
1674   bool multibytep = coding->dst_multibyte;
1675   int *charbuf = coding->charbuf;
1676   int *charbuf_end = charbuf + coding->charbuf_used;
1677   unsigned char *dst = coding->destination + coding->produced;
1678   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1679   int safe_room = 8;
1680   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1681   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1682   ptrdiff_t produced_chars = 0;
1683   int c;
1684
1685   if (bom != utf_without_bom)
1686     {
1687       ASSURE_DESTINATION (safe_room);
1688       if (big_endian)
1689         EMIT_TWO_BYTES (0xFE, 0xFF);
1690       else
1691         EMIT_TWO_BYTES (0xFF, 0xFE);
1692       CODING_UTF_16_BOM (coding) = utf_without_bom;
1693     }
1694
1695   while (charbuf < charbuf_end)
1696     {
1697       ASSURE_DESTINATION (safe_room);
1698       c = *charbuf++;
1699       if (c > MAX_UNICODE_CHAR)
1700         c = coding->default_char;
1701
1702       if (c < 0x10000)
1703         {
1704           if (big_endian)
1705             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1706           else
1707             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1708         }
1709       else
1710         {
1711           int c1, c2;
1712
1713           c -= 0x10000;
1714           c1 = (c >> 10) + 0xD800;
1715           c2 = (c & 0x3FF) + 0xDC00;
1716           if (big_endian)
1717             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1718           else
1719             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1720         }
1721     }
1722   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1723   coding->produced = dst - coding->destination;
1724   coding->produced_char += produced_chars;
1725   return 0;
1726 }
1727
1728 \f
1729 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1730
1731 /* Emacs' internal format for representation of multiple character
1732    sets is a kind of multi-byte encoding, i.e. characters are
1733    represented by variable-length sequences of one-byte codes.
1734
1735    ASCII characters and control characters (e.g. `tab', `newline') are
1736    represented by one-byte sequences which are their ASCII codes, in
1737    the range 0x00 through 0x7F.
1738
1739    8-bit characters of the range 0x80..0x9F are represented by
1740    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1741    code + 0x20).
1742
1743    8-bit characters of the range 0xA0..0xFF are represented by
1744    one-byte sequences which are their 8-bit code.
1745
1746    The other characters are represented by a sequence of `base
1747    leading-code', optional `extended leading-code', and one or two
1748    `position-code's.  The length of the sequence is determined by the
1749    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1750    whereas extended leading-code and position-code take the range 0xA0
1751    through 0xFF.  See `charset.h' for more details about leading-code
1752    and position-code.
1753
1754    --- CODE RANGE of Emacs' internal format ---
1755    character set        range
1756    -------------        -----
1757    ascii                0x00..0x7F
1758    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1760    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1761    ---------------------------------------------
1762
1763    As this is the internal character representation, the format is
   usually not used externally (i.e. in a file or in data sent to a
   process).  But it is possible to have text externally in this
   format (i.e. by encoding with the coding system `emacs-mule').
1767
1768    In that case, a sequence of one-byte codes has a slightly different
1769    form.
1770
   First, all characters in eight-bit-control are represented by
1772    one-byte sequences which are their 8-bit code.
1773
1774    Next, character composition data are represented by the byte
1775    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1776    where,
        METHOD is 0xF2 plus one of the composition methods (enum
        composition_method),
1779
1780         BYTES is 0xA0 plus a byte length of this composition data,
1781
1782         CHARS is 0xA0 plus a number of characters composed by this
1783         data,
1784
1785         COMPONENTs are characters of multibyte form or composition
1786         rules encoded by two-byte of ASCII codes.
1787
1788    In addition, for backward compatibility, the following formats are
1789    also recognized as composition data on decoding.
1790
1791    0x80 MSEQ ...
1792    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1793
1794    Here,
        MSEQ is a multibyte form, but in this special format:
1796           ASCII: 0xA0 ASCII_CODE+0x80,
1797           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1798         RULE is a one byte code of the range 0xA0..0xF0 that
1799         represents a composition rule.
1800   */
1801
/* Table indexed by a byte value C.  The code in this file reads
   entry C as the total length in bytes (1 to 4) of the emacs-mule
   sequence whose leading byte is C.  NOTE(review): the table is
   filled in elsewhere; confirm what non-leading bytes map to.  */
char emacs_mule_bytes[256];
1803
1804
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806    Return true if a text is encoded in 'emacs-mule'.  */
1807
1808 static bool
1809 detect_coding_emacs_mule (struct coding_system *coding,
1810                           struct coding_detection_info *detect_info)
1811 {
1812   const unsigned char *src = coding->source, *src_base;
1813   const unsigned char *src_end = coding->source + coding->src_bytes;
1814   bool multibytep = coding->src_multibyte;
1815   ptrdiff_t consumed_chars = 0;
1816   int c;
1817   int found = 0;
1818
1819   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1820   /* A coding system of this category is always ASCII compatible.  */
1821   src += coding->head_ascii;
1822
1823   while (1)
1824     {
1825       src_base = src;
1826       ONE_MORE_BYTE (c);
1827       if (c < 0)
1828         continue;
1829       if (c == 0x80)
1830         {
1831           /* Perhaps the start of composite character.  We simply skip
1832              it because analyzing it is too heavy for detecting.  But,
1833              at least, we check that the composite character
1834              constitutes of more than 4 bytes.  */
1835           const unsigned char *src_start;
1836
1837         repeat:
1838           src_start = src;
1839           do
1840             {
1841               ONE_MORE_BYTE (c);
1842             }
1843           while (c >= 0xA0);
1844
1845           if (src - src_start <= 4)
1846             break;
1847           found = CATEGORY_MASK_EMACS_MULE;
1848           if (c == 0x80)
1849             goto repeat;
1850         }
1851
1852       if (c < 0x80)
1853         {
1854           if (c < 0x20
1855               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1856             break;
1857         }
1858       else
1859         {
1860           int more_bytes = emacs_mule_bytes[c] - 1;
1861
1862           while (more_bytes > 0)
1863             {
1864               ONE_MORE_BYTE (c);
1865               if (c < 0xA0)
1866                 {
1867                   src--;        /* Unread the last byte.  */
1868                   break;
1869                 }
1870               more_bytes--;
1871             }
1872           if (more_bytes != 0)
1873             break;
1874           found = CATEGORY_MASK_EMACS_MULE;
1875         }
1876     }
1877   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1878   return 0;
1879
1880  no_more_source:
1881   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1882     {
1883       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1884       return 0;
1885     }
1886   detect_info->found |= found;
1887   return 1;
1888 }
1889
1890
1891 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1892    character.  If CMP_STATUS indicates that we must expect MSEQ or
1893    RULE described above, decode it and return the negative value of
1894    the decoded character or rule.  If an invalid byte is found, return
1895    -1.  If SRC is too short, return -2.  */
1896
1897 static int
1898 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1899                  int *nbytes, int *nchars, int *id,
1900                  struct composition_status *cmp_status)
1901 {
1902   const unsigned char *src_end = coding->source + coding->src_bytes;
1903   const unsigned char *src_base = src;
1904   bool multibytep = coding->src_multibyte;
1905   int charset_ID;
1906   unsigned code;
1907   int c;
1908   int consumed_chars = 0;
1909   bool mseq_found = 0;
1910
1911   ONE_MORE_BYTE (c);
1912   if (c < 0)
1913     {
1914       c = -c;
1915       charset_ID = emacs_mule_charset[0];
1916     }
1917   else
1918     {
1919       if (c >= 0xA0)
1920         {
1921           if (cmp_status->state != COMPOSING_NO
1922               && cmp_status->old_form)
1923             {
1924               if (cmp_status->state == COMPOSING_CHAR)
1925                 {
1926                   if (c == 0xA0)
1927                     {
1928                       ONE_MORE_BYTE (c);
1929                       c -= 0x80;
1930                       if (c < 0)
1931                         goto invalid_code;
1932                     }
1933                   else
1934                     c -= 0x20;
1935                   mseq_found = 1;
1936                 }
1937               else
1938                 {
1939                   *nbytes = src - src_base;
1940                   *nchars = consumed_chars;
1941                   return -c;
1942                 }
1943             }
1944           else
1945             goto invalid_code;
1946         }
1947
1948       switch (emacs_mule_bytes[c])
1949         {
1950         case 2:
1951           if ((charset_ID = emacs_mule_charset[c]) < 0)
1952             goto invalid_code;
1953           ONE_MORE_BYTE (c);
1954           if (c < 0xA0)
1955             goto invalid_code;
1956           code = c & 0x7F;
1957           break;
1958
1959         case 3:
1960           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1961               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1962             {
1963               ONE_MORE_BYTE (c);
1964               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1965                 goto invalid_code;
1966               ONE_MORE_BYTE (c);
1967               if (c < 0xA0)
1968                 goto invalid_code;
1969               code = c & 0x7F;
1970             }
1971           else
1972             {
1973               if ((charset_ID = emacs_mule_charset[c]) < 0)
1974                 goto invalid_code;
1975               ONE_MORE_BYTE (c);
1976               if (c < 0xA0)
1977                 goto invalid_code;
1978               code = (c & 0x7F) << 8;
1979               ONE_MORE_BYTE (c);
1980               if (c < 0xA0)
1981                 goto invalid_code;
1982               code |= c & 0x7F;
1983             }
1984           break;
1985
1986         case 4:
1987           ONE_MORE_BYTE (c);
1988           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1989             goto invalid_code;
1990           ONE_MORE_BYTE (c);
1991           if (c < 0xA0)
1992             goto invalid_code;
1993           code = (c & 0x7F) << 8;
1994           ONE_MORE_BYTE (c);
1995           if (c < 0xA0)
1996             goto invalid_code;
1997           code |= c & 0x7F;
1998           break;
1999
2000         case 1:
2001           code = c;
2002           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2003           break;
2004
2005         default:
2006           emacs_abort ();
2007         }
2008       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2009                           CHARSET_FROM_ID (charset_ID), code, c);
2010       if (c < 0)
2011         goto invalid_code;
2012     }
2013   *nbytes = src - src_base;
2014   *nchars = consumed_chars;
2015   if (id)
2016     *id = charset_ID;
2017   return (mseq_found ? -c : c);
2018
2019  no_more_source:
2020   return -2;
2021
2022  invalid_code:
2023   return -1;
2024 }
2025
2026
2027 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2028
/* Handle these composition sequences ('|': the end of header
   elements, BYTES and CHARS >= 0xA0):
2031
2032    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2033    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2034    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2035
   and these old forms:
2037
2038    (4) relative composition: 0x80 | MSEQ ... MSEQ
2039    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2040
2041    When the starter 0x80 and the following header elements are found,
2042    this annotation header is produced.
2043
2044         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2045
2046    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2047    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2048
2049    Then, upon reading the following elements, these codes are produced
2050    until the composition end is found:
2051
2052    (1) CHAR ... CHAR
2053    (2) ALT ... ALT CHAR ... CHAR
2054    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2055    (4) CHAR ... CHAR
2056    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2057
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2060
2061    (1) LENGTH: unchanged, NCHARS: unchanged
2062    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2063    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2064    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2065    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2066
2067    If an error is found while composing, the annotation header is
2068    changed to the original composition header (plus filler -1s) as
2069    below:
2070
2071    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2072    (5)          [ 0x80 0xFF -1 -1- -1 ]
2073
2074    and the sequence [ -2 DECODED-RULE ] is changed to the original
2075    byte sequence as below:
2076         o the original byte sequence is B: [ B -1 ]
2077         o the original byte sequence is B1 B2: [ B1 B2 ]
2078
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2083
/* Decode the Emacs 20 style composition rule byte held in C and set
   RULE to the result.  C is reduced by 0xA0 in place and must then
   lie in 0..80, otherwise control jumps to invalid_code.  The value
   is split into two base-9 digits; a digit of 4 is replaced by 10
   before the pair is passed to COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    (c) -= 0xA0;                                        \
    if (! (0 <= (c) && (c) < 81))                       \
      goto invalid_code;                                \
    gref = (c) / 9 == 4 ? 10 : (c) / 9;                 \
    nref = (c) % 9 == 4 ? 10 : (c) % 9;                 \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2100
2101
/* Decode an Emacs 21 style composition rule and set RULE to the
   result.  The rule is two bytes, each 0x20 plus a value in 0..80:
   the first is already in C, the second is read from SRC into C with
   ONE_MORE_BYTE.  An out-of-range byte jumps to invalid_code.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = (c) - 0x20;                              \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = (c) - 0x20;                                  \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2119
2120
/* Start of Emacs 21 style format.  C holds the METHOD byte (0xF2 plus
   the composition method).  The next two source bytes are BYTES (0xA0
   plus the byte length of this composition data) and CHARS (0xA0 plus
   the number of characters composed).

   The byte length must be at least 3, and exactly 4 for a relative
   composition; the character count must be positive and less than
   MAX_COMPOSITION_COMPONENTS.  Anything else jumps to invalid_code.
   On success CMP_STATUS is set up for the new composition and the
   annotation header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
                                                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2152
2153
/* Start of Emacs 20 style format for relative composition.  Mark
   CMP_STATUS as an old-form relative composition with no characters
   or components read yet, and add an annotation header with zero
   NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_RELATIVE); \
  } while (0)
2165
2166
/* Start of Emacs 20 style format for rule-base composition.  Mark
   CMP_STATUS as an old-form composition with rules, with no
   characters or components read yet, and add an annotation header
   with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->method = COMPOSITION_WITH_RULE;                         \
    cmp_status->old_form = 1;                                           \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_WITH_RULE);        \
  } while (0)
2178
2179
/* Called just after the composition starter 0x80 has been read.
   Read the next byte into C and begin the composition it announces:

     0xF2 + a composition method  Emacs 21 style header
     0xFF                         Emacs 20 style rule-base composition
     0xA0..0xBF                   Emacs 20 style relative composition;
                                  the byte is already the first
                                  component, so SRC is rewound to
                                  read it again

   Any other byte jumps to invalid_code.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *current_src = src;                     \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (COMPOSITION_RELATIVE <= c - 0xF2                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else if (0xA0 <= c && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = current_src;                                      \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2203
/* Finish the current composition normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of characters read in the header's
   NCHARS slot.  For a new-form composition other than a relative one,
   set the header's first slot to NCHARS minus CMP_STATUS->length.
   Then leave composing state.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2214
2215
2216 static int
2217 emacs_mule_finish_composition (int *charbuf,
2218                                struct composition_status *cmp_status)
2219 {
2220   int idx = - cmp_status->length;
2221   int new_chars;
2222
2223   if (cmp_status->old_form && cmp_status->nchars > 0)
2224     {
2225       charbuf[idx + 2] = cmp_status->nchars;
2226       new_chars = 0;
2227       if (cmp_status->method == COMPOSITION_WITH_RULE
2228           && cmp_status->state == COMPOSING_CHAR)
2229         {
2230           /* The last rule was invalid.  */
2231           int rule = charbuf[-1] + 0xA0;
2232
2233           charbuf[-2] = BYTE8_TO_CHAR (rule);
2234           charbuf[-1] = -1;
2235           new_chars = 1;
2236         }
2237     }
2238   else
2239     {
2240       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2241
2242       if (cmp_status->method == COMPOSITION_WITH_RULE)
2243         {
2244           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2245           charbuf[idx++] = -3;
2246           charbuf[idx++] = 0;
2247           new_chars = 1;
2248         }
2249       else
2250         {
2251           int nchars = charbuf[idx + 1] + 0xA0;
2252           int nbytes = charbuf[idx + 2] + 0xA0;
2253
2254           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2255           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2256           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2257           charbuf[idx++] = -1;
2258           new_chars = 4;
2259         }
2260     }
2261   cmp_status->state = COMPOSING_NO;
2262   return new_chars;
2263 }
2264
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the count it returns to
   CHAR_OFFSET.  Do nothing otherwise.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2270
2271
2272 static void
2273 decode_coding_emacs_mule (struct coding_system *coding)
2274 {
2275   const unsigned char *src = coding->source + coding->consumed;
2276   const unsigned char *src_end = coding->source + coding->src_bytes;
2277   const unsigned char *src_base;
2278   int *charbuf = coding->charbuf + coding->charbuf_used;
2279   /* We may produce two annotations (charset and composition) in one
2280      loop and one more charset annotation at the end.  */
2281   int *charbuf_end
2282     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2283       /* We can produce up to 2 characters in a loop.  */
2284       - 1;
2285   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2286   bool multibytep = coding->src_multibyte;
2287   ptrdiff_t char_offset = coding->produced_char;
2288   ptrdiff_t last_offset = char_offset;
2289   int last_id = charset_ascii;
2290   bool eol_dos
2291     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2292   int byte_after_cr = -1;
2293   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2294
2295   if (cmp_status->state != COMPOSING_NO)
2296     {
2297       int i;
2298
2299       if (charbuf_end - charbuf < cmp_status->length)
2300         emacs_abort ();
2301       for (i = 0; i < cmp_status->length; i++)
2302         *charbuf++ = cmp_status->carryover[i];
2303       coding->annotated = 1;
2304     }
2305
2306   while (1)
2307     {
2308       int c, id IF_LINT (= 0);
2309
2310       src_base = src;
2311       consumed_chars_base = consumed_chars;
2312
2313       if (charbuf >= charbuf_end)
2314         {
2315           if (byte_after_cr >= 0)
2316             src_base--;
2317           break;
2318         }
2319
2320       if (byte_after_cr >= 0)
2321         c = byte_after_cr, byte_after_cr = -1;
2322       else
2323         ONE_MORE_BYTE (c);
2324
2325       if (c < 0 || c == 0x80)
2326         {
2327           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2328           if (c < 0)
2329             {
2330               *charbuf++ = -c;
2331               char_offset++;
2332             }
2333           else
2334             DECODE_EMACS_MULE_COMPOSITION_START ();
2335           continue;
2336         }
2337
2338       if (c < 0x80)
2339         {
2340           if (eol_dos && c == '\r')
2341             ONE_MORE_BYTE (byte_after_cr);
2342           id = charset_ascii;
2343           if (cmp_status->state != COMPOSING_NO)
2344             {
2345               if (cmp_status->old_form)
2346                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2347               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2348                 cmp_status->ncomps--;
2349             }
2350         }
2351       else
2352         {
2353           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2354           /* emacs_mule_char can load a charset map from a file, which
2355              allocates a large structure and might cause buffer text
2356              to be relocated as result.  Thus, we need to remember the
2357              original pointer to buffer text, and fix up all related
2358              pointers after the call.  */
2359           const unsigned char *orig = coding->source;
2360           ptrdiff_t offset;
2361
2362           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2363                                cmp_status);
2364           offset = coding->source - orig;
2365           if (offset)
2366             {
2367               src += offset;
2368               src_base += offset;
2369               src_end += offset;
2370             }
2371           if (c < 0)
2372             {
2373               if (c == -1)
2374                 goto invalid_code;
2375               if (c == -2)
2376                 break;
2377             }
2378           src = src_base + nbytes;
2379           consumed_chars = consumed_chars_base + nchars;
2380           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2381             cmp_status->ncomps -= nchars;
2382         }
2383
2384       /* Now if C >= 0, we found a normally encoded character, if C <
2385          0, we found an old-style composition component character or
2386          rule.  */
2387
2388       if (cmp_status->state == COMPOSING_NO)
2389         {
2390           if (last_id != id)
2391             {
2392               if (last_id != charset_ascii)
2393                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2394                                   last_id);
2395               last_id = id;
2396               last_offset = char_offset;
2397             }
2398           *charbuf++ = c;
2399           char_offset++;
2400         }
2401       else if (cmp_status->state == COMPOSING_CHAR)
2402         {
2403           if (cmp_status->old_form)
2404             {
2405               if (c >= 0)
2406                 {
2407                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408                   *charbuf++ = c;
2409                   char_offset++;
2410                 }
2411               else
2412                 {
2413                   *charbuf++ = -c;
2414                   cmp_status->nchars++;
2415                   cmp_status->length++;
2416                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2417                     EMACS_MULE_COMPOSITION_END ();
2418                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2419                     cmp_status->state = COMPOSING_RULE;
2420                 }
2421             }
2422           else
2423             {
2424               *charbuf++ = c;
2425               cmp_status->length++;
2426               cmp_status->nchars--;
2427               if (cmp_status->nchars == 0)
2428                 EMACS_MULE_COMPOSITION_END ();
2429             }
2430         }
2431       else if (cmp_status->state == COMPOSING_RULE)
2432         {
2433           int rule;
2434
2435           if (c >= 0)
2436             {
2437               EMACS_MULE_COMPOSITION_END ();
2438               *charbuf++ = c;
2439               char_offset++;
2440             }
2441           else
2442             {
2443               c = -c;
2444               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2445               if (rule < 0)
2446                 goto invalid_code;
2447               *charbuf++ = -2;
2448               *charbuf++ = rule;
2449               cmp_status->length += 2;
2450               cmp_status->state = COMPOSING_CHAR;
2451             }
2452         }
2453       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2454         {
2455           *charbuf++ = c;
2456           cmp_status->length++;
2457           if (cmp_status->ncomps == 0)
2458             cmp_status->state = COMPOSING_CHAR;
2459           else if (cmp_status->ncomps > 0)
2460             {
2461               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2462                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2463             }
2464           else
2465             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2466         }
2467       else                      /* COMPOSING_COMPONENT_RULE */
2468         {
2469           int rule;
2470
2471           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2472           if (rule < 0)
2473             goto invalid_code;
2474           *charbuf++ = -2;
2475           *charbuf++ = rule;
2476           cmp_status->length += 2;
2477           cmp_status->ncomps--;
2478           if (cmp_status->ncomps > 0)
2479             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2480           else
2481             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482         }
2483       continue;
2484
2485     invalid_code:
2486       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2487       src = src_base;
2488       consumed_chars = consumed_chars_base;
2489       ONE_MORE_BYTE (c);
2490       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2491       char_offset++;
2492       coding->errors++;
2493     }
2494
2495  no_more_source:
2496   if (cmp_status->state != COMPOSING_NO)
2497     {
2498       if (coding->mode & CODING_MODE_LAST_BLOCK)
2499         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2500       else
2501         {
2502           int i;
2503
2504           charbuf -= cmp_status->length;
2505           for (i = 0; i < cmp_status->length; i++)
2506             cmp_status->carryover[i] = charbuf[i];
2507         }
2508     }
2509   if (last_id != charset_ascii)
2510     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2511   coding->consumed_char += consumed_chars_base;
2512   coding->consumed = src_base - coding->source;
2513   coding->charbuf_used = charbuf - coding->charbuf;
2514 }
2515
2516
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the charset whose emacs-mule ID is ID.

   An ID below 0xA0 is itself the single leading byte; CODES[1] is
   then set to 0, meaning that no second leading byte follows.  A
   larger ID is stored in CODES[1] and CODES[0] receives one of the
   prefix bytes 0x9A, 0x9B, 0x9C or 0x9D, chosen by the range that ID
   falls in (below 0xE0, below 0xF0, below 0xF5, anything else).

   Both arguments are evaluated more than once, so they must not have
   side effects.  The expansion is one statement that relies on the
   semicolon written by the caller; the definition itself must not end
   in a semicolon, otherwise the macro could not be used as the
   unbraced body of an if that has an else.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2530
2531
2532 static bool
2533 encode_coding_emacs_mule (struct coding_system *coding)
2534 {
       /* Encode the CODING->charbuf_used elements of CODING->charbuf in
          the emacs-mule format.  Output goes through DST, which starts
          at CODING->destination + CODING->produced.  A non-negative
          element is a character; a negative element starts an
          annotation.  At the end the result is recorded as
          CODING_RESULT_SUCCESS, CODING->produced_char and
          CODING->produced are updated, and 0 is returned.

          NOTE(review): multibytep is never read and produced_chars is
          never incremented by the statements visible here; presumably
          ASSURE_DESTINATION and the EMIT_* macros use these locals by
          name -- confirm before renaming or removing them.  */
2535   bool multibytep = coding->dst_multibyte;
2536   int *charbuf = coding->charbuf;
2537   int *charbuf_end = charbuf + coding->charbuf_used;
2538   unsigned char *dst = coding->destination + coding->produced;
2539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each element.
          NOTE(review): presumably the largest number of bytes a single
          element can produce -- confirm.  */
2540   int safe_room = 8;
2541   ptrdiff_t produced_chars = 0;
2542   Lisp_Object attrs, charset_list;
2543   int c;
       /* Charset ID taken from the most recent charset annotation, or -1
          when there is none (or it is not in CHARSET_LIST).  */
2544   int preferred_charset_id = -1;
2545
2546   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; if the
          coding system carried a different list, replace it in ATTRS.  */
2547   if (! EQ (charset_list, Vemacs_mule_charset_list))
2548     {
2549       charset_list = Vemacs_mule_charset_list;
2550       ASET (attrs, coding_attr_charset_list, charset_list);
2551     }
2552
2553   while (charbuf < charbuf_end)
2554     {
2555       ASSURE_DESTINATION (safe_room);
2556       c = *charbuf++;
2557
2558       if (c < 0)
2559         {
2560           /* Handle an annotation.  */
               /* CHARBUF now points at the element after the negative
                  one; that element selects the annotation type.  */
2561           switch (*charbuf)
2562             {
2563             case CODING_ANNOTATE_COMPOSITION_MASK:
2564               /* Not yet implemented.  */
2565               break;
2566             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): this reads the element three past the
                      annotation type element; confirm that this matches
                      the layout written by ADD_CHARSET_DATA.  */
2567               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in
                      CHARSET_LIST.  */
2568               if (preferred_charset_id >= 0
2569                   && NILP (Fmemq (make_number (preferred_charset_id),
2570                                   charset_list)))
2571                 preferred_charset_id = -1;
2572               break;
2573             default:
2574               emacs_abort ();
2575             }
               /* One element of the annotation has been consumed; skip
                  the remaining -C - 1, i.e. -C is taken as the total
                  length of the annotation.  */
2576           charbuf += -c - 1;
2577           continue;
2578         }
2579
2580       if (ASCII_CHAR_P (c))
2581         EMIT_ONE_ASCII_BYTE (c);
2582       else if (CHAR_BYTE8_P (c))
2583         {
               /* Convert the character back to its byte value and emit
                  that byte.  */
2584           c = CHAR_TO_BYTE8 (c);
2585           EMIT_ONE_BYTE (c);
2586         }
2587       else
2588         {
2589           struct charset *charset;
2590           unsigned code;
2591           int dimension;
2592           int emacs_mule_id;
2593           unsigned char leading_codes[2];
2594
               /* Choose CHARSET and CODE for C: try the preferred charset
                  first when there is one, otherwise (or if that test
                  fails) let CODING_CHAR_CHARSET pick from
                  CHARSET_LIST.  */
2595           if (preferred_charset_id >= 0)
2596             {
2597               bool result;
2598
2599               charset = CHARSET_FROM_ID (preferred_charset_id);
2600               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2601               if (result)
2602                 code = ENCODE_CHAR (charset, c);
2603               else
2604                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2605                                      &code, charset);
2606             }
2607           else
2608             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609                                  &code, charset);
2610           if (! charset)
2611             {
                   /* No charset was found for C: encode
                      CODING->default_char in its place.  */
2612               c = coding->default_char;
2613               if (ASCII_CHAR_P (c))
2614                 {
2615                   EMIT_ONE_ASCII_BYTE (c);
2616                   continue;
2617                 }
                   /* NOTE(review): CHARSET is not checked again after this
                      second lookup; presumably the default character can
                      always be encoded -- confirm.  */
2618               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2619                                    &code, charset);
2620             }
2621           dimension = CHARSET_DIMENSION (charset);
2622           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading bytes (the second only when it is
                  nonzero), then the code with the high bit of each byte
                  set: one byte for a dimension-1 charset, two bytes
                  otherwise.  */
2623           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2624           EMIT_ONE_BYTE (leading_codes[0]);
2625           if (leading_codes[1])
2626             EMIT_ONE_BYTE (leading_codes[1]);
2627           if (dimension == 1)
2628             EMIT_ONE_BYTE (code | 0x80);
2629           else
2630             {
2631               code |= 0x8080;
2632               EMIT_ONE_BYTE (code >> 8);
2633               EMIT_ONE_BYTE (code & 0xFF);
2634             }
2635         }
2636     }
2637   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2638   coding->produced_char += produced_chars;
2639   coding->produced = dst - coding->destination;
2640   return 0;
2641 }
2642
2643 \f
2644 /*** 7. ISO2022 handlers ***/
2645
2646 /* The following note describes the coding system ISO2022 briefly.
2647    Since the intention of this note is to help understand the
2648    functions in this file, some parts are NOT ACCURATE or are OVERLY
2649    SIMPLIFIED.  For thorough understanding, please refer to the
2650    original document of ISO2022.  This is equivalent to the standard
2651    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2652
2653    ISO2022 provides many mechanisms to encode several character sets
2654    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2655    is encoded using bytes less than 128.  This may make the encoded
2656    text a little bit longer, but the text passes more easily through
2657    several types of gateway, some of which strip off the MSB (Most
2658    Significant Bit).
2659
2660    There are two kinds of character sets: control character sets and
2661    graphic character sets.  The former contain control characters such
2662    as `newline' and `escape' to provide control functions (control
2663    functions are also provided by escape sequences).  The latter
2664    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2665    two control character sets and many graphic character sets.
2666
2667    Graphic character sets are classified into one of the following
2668    four classes, according to the number of bytes (DIMENSION) and
2669    number of characters in one dimension (CHARS) of the set:
2670    - DIMENSION1_CHARS94
2671    - DIMENSION1_CHARS96
2672    - DIMENSION2_CHARS94
2673    - DIMENSION2_CHARS96
2674
2675    In addition, each character set is assigned an identification tag,
2676    unique for each set, called the "final character" (denoted as <F>
2677    hereafter).  The <F> of each character set is decided by ECMA(*)
2678    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2679    (0x30..0x3F are for private use only).
2680
2681    Note (*): ECMA = European Computer Manufacturers Association
2682
2683    Here are examples of graphic character sets [NAME(<F>)]:
2684         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2685         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2686         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2687         o DIMENSION2_CHARS96 -- none for the moment
2688
2689    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2690         C0 [0x00..0x1F] -- control character plane 0
2691         GL [0x20..0x7F] -- graphic character plane 0
2692         C1 [0x80..0x9F] -- control character plane 1
2693         GR [0xA0..0xFF] -- graphic character plane 1
2694
2695    A control character set is directly designated and invoked to C0 or
2696    C1 by an escape sequence.  The most common case is that:
2697    - ISO646's  control character set is designated/invoked to C0, and
2698    - ISO6429's control character set is designated/invoked to C1,
2699    and usually these designations/invocations are omitted in encoded
2700    text.  In a 7-bit environment, only C0 can be used, and a control
2701    character for C1 is encoded by an appropriate escape sequence to
2702    fit into the environment.  All control characters for C1 are
2703    defined to have corresponding escape sequences.
2704
2705    A graphic character set is at first designated to one of four
2706    graphic registers (G0 through G3), then these graphic registers are
2707    invoked to GL or GR.  These designations and invocations can be
2708    done independently.  The most common case is that G0 is invoked to
2709    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2710    these invocations and designations are omitted in encoded text.
2711    In a 7-bit environment, only GL can be used.
2712
2713    When a graphic character set of CHARS94 is invoked to GL, codes
2714    0x20 and 0x7F of the GL area work as control characters SPACE and
2715    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2716    be used.
2717
2718    There are two ways of invocation: locking-shift and single-shift.
2719    With locking-shift, the invocation lasts until the next different
2720    invocation, whereas with single-shift, the invocation affects the
2721    following character only and doesn't affect the locking-shift
2722    state.  Invocations are done by the following control characters or
2723    escape sequences:
2724
2725    ----------------------------------------------------------------------
2726    abbrev  function                  cntrl escape seq   description
2727    ----------------------------------------------------------------------
2728    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2729    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2730    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2731    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2732    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2733    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2734    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2735    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2736    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2737    ----------------------------------------------------------------------
2738    (*) These are not used by any known coding system.
2739
2740    Control characters for these functions are defined by macros
2741    ISO_CODE_XXX in `coding.h'.
2742
2743    Designations are done by the following escape sequences:
2744    ----------------------------------------------------------------------
2745    escape sequence      description
2746    ----------------------------------------------------------------------
2747    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2748    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2749    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2750    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2751    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2752    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2753    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2754    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2755    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2756    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2757    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2758    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2759    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2760    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2761    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2762    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2763    ----------------------------------------------------------------------
2764
2765    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2766    of dimension 1, chars 94, and final character <F>, etc...
2767
2768    Note (*): Although these designations are not allowed in ISO2022,
2769    Emacs accepts them on decoding, and produces them on encoding
2770    CHARS96 character sets in a coding system which is characterized as
2771    7-bit environment, non-locking-shift, and non-single-shift.
2772
2773    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2774    '(' must be omitted.  We refer to this as "short-form" hereafter.
2775
2776    Now you may notice that there are a lot of ways of encoding the
2777    same multilingual text in ISO2022.  Actually, there exist many
2778    coding systems such as Compound Text (used in X11's inter client
2779    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2780    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2781    localized platforms), and all of these are variants of ISO2022.
2782
2783    In addition to the above, Emacs handles two more kinds of escape
2784    sequences: ISO6429's direction specification and Emacs' private
2785    sequence for specifying character composition.
2786
2787    ISO6429's direction specification takes the following form:
2788         o CSI ']'      -- end of the current direction
2789         o CSI '0' ']'  -- end of the current direction
2790         o CSI '1' ']'  -- start of left-to-right text
2791         o CSI '2' ']'  -- start of right-to-left text
2792    The control character CSI (0x9B: control sequence introducer) is
2793    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2794
2795    Character composition specification takes the following form:
2796         o ESC '0' -- start relative composition
2797         o ESC '1' -- end composition
2798         o ESC '2' -- start rule-base composition (*)
2799         o ESC '3' -- start relative composition with alternate chars  (**)
2800         o ESC '4' -- start rule-base composition with alternate chars  (**)
2801   Since these are not standard escape sequences of any ISO standard,
2802   the use of them with these meanings is restricted to Emacs only.
2803
2804   (*) This form is used only in Emacs 20.7 and older versions,
2805   but newer versions can safely decode it.
2806   (**) This form is used only in Emacs 21.1 and newer versions,
2807   and older versions can't decode it.
2808
2809   Here's a list of example usages of these composition escape
2810   sequences (categorized by `enum composition_method').
2811
2812   COMPOSITION_RELATIVE:
2813         ESC 0 CHAR [ CHAR ] ESC 1
2814   COMPOSITION_WITH_RULE:
2815         ESC 2 CHAR [ RULE CHAR ] ESC 1
2816   COMPOSITION_WITH_ALTCHARS:
2817         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2818   COMPOSITION_WITH_RULE_ALTCHARS:
2819         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2820
2821 static enum iso_code_class_type iso_code_class[256];
     /* NOTE(review): 256 entries, so presumably one class per possible
        byte value; the code that fills and reads this table is not next
        to this declaration -- confirm before relying on that.  */
2822
/* Return nonzero if the charset whose ID is ID is safe for CODING:
   ID must lie in 0..CODING->max_charset_id and the byte stored for it
   in CODING->safe_charsets must not be 255, the value that
   setup_iso_safe_charsets fills the table with before recording
   registers.

   A negative ID is reported as not safe rather than being used as an
   index; the designation code in this file treats a negative charset
   table entry as an invalid designation, and not every caller checks
   for that before using this macro.  ID is evaluated more than once,
   so it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2826
2827 static void
2828 setup_iso_safe_charsets (Lisp_Object attrs)
2829 {
2830   Lisp_Object charset_list, safe_charsets;
2831   Lisp_Object request;
2832   Lisp_Object reg_usage;
2833   Lisp_Object tail;
2834   EMACS_INT reg94, reg96;
2835   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2836   int max_charset_id;
2837
2838   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2839   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2840       && ! EQ (charset_list, Viso_2022_charset_list))
2841     {
2842       charset_list = Viso_2022_charset_list;
2843       ASET (attrs, coding_attr_charset_list, charset_list);
2844       ASET (attrs, coding_attr_safe_charsets, Qnil);
2845     }
2846
2847   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2848     return;
2849
2850   max_charset_id = 0;
2851   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2852     {
2853       int id = XINT (XCAR (tail));
2854       if (max_charset_id < id)
2855         max_charset_id = id;
2856     }
2857
2858   safe_charsets = make_uninit_string (max_charset_id + 1);
2859   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2860   request = AREF (attrs, coding_attr_iso_request);
2861   reg_usage = AREF (attrs, coding_attr_iso_usage);
2862   reg94 = XINT (XCAR (reg_usage));
2863   reg96 = XINT (XCDR (reg_usage));
2864
2865   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2866     {
2867       Lisp_Object id;
2868       Lisp_Object reg;
2869       struct charset *charset;
2870
2871       id = XCAR (tail);
2872       charset = CHARSET_FROM_ID (XINT (id));
2873       reg = Fcdr (Fassq (id, request));
2874       if (! NILP (reg))
2875         SSET (safe_charsets, XINT (id), XINT (reg));
2876       else if (charset->iso_chars_96)
2877         {
2878           if (reg96 < 4)
2879             SSET (safe_charsets, XINT (id), reg96);
2880         }
2881       else
2882         {
2883           if (reg94 < 4)
2884             SSET (safe_charsets, XINT (id), reg94);
2885         }
2886     }
2887   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2888 }
2889
2890
2891 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2892    Return true if a text is encoded in one of ISO-2022 based coding
2893    systems.

        Scanning starts after the first CODING->head_ascii bytes of
        CODING->source.  Category bits are collected in the locals FOUND
        and REJECTED.  If every ISO category becomes rejected the scan
        stops, CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0
        is returned.  Otherwise control reaches the no_more_source label
        (NOTE(review): presumably ONE_MORE_BYTE jumps there when the
        source is exhausted -- confirm), REJECTED and the part of FOUND
        that was not rejected are merged into DETECT_INFO, and 1 is
        returned.  */
2894
2895 static bool
2896 detect_coding_iso_2022 (struct coding_system *coding,
2897                         struct coding_detection_info *detect_info)
2898 {
2899   const unsigned char *src = coding->source, *src_base = src;
2900   const unsigned char *src_end = coding->source + coding->src_bytes;
2901   bool multibytep = coding->src_multibyte;
2902   bool single_shifting = 0;
2903   int id;
2904   int c, c1;
2905   ptrdiff_t consumed_chars = 0;
2906   int i;
2907   int rejected = 0;
2908   int found = 0;
       /* -1 outside a composition sequence; otherwise the number of
          components counted since the start sequence.  */
2909   int composition_count = -1;
2910
2911   detect_info->checked |= CATEGORY_MASK_ISO;
2912
       /* For each ISO category whose id is not negative, reload
          max_charset_id and safe_charsets from the safe-charsets string
          of its attributes, rebuilding that string first when the
          category has full support but a charset list other than
          Viso_2022_charset_list.  */
2913   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2914     {
2915       struct coding_system *this = &(coding_categories[i]);
2916       Lisp_Object attrs, val;
2917
2918       if (this->id < 0)
2919         continue;
2920       attrs = CODING_ID_ATTRS (this->id);
2921       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2922           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2923         setup_iso_safe_charsets (attrs);
2924       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2925       this->max_charset_id = SCHARS (val) - 1;
2926       this->safe_charsets = SDATA (val);
2927     }
2928
2929   /* A coding system of this category is always ASCII compatible.  */
2930   src += coding->head_ascii;
2931
2932   while (rejected != CATEGORY_MASK_ISO)
2933     {
2934       src_base = src;
2935       ONE_MORE_BYTE (c);
2936       switch (c)
2937         {
2938         case ISO_CODE_ESC:
               /* Escape sequences are not examined at all when
                  inhibit_iso_escape_detection is set.  */
2939           if (inhibit_iso_escape_detection)
2940             break;
2941           single_shifting = 0;
2942           ONE_MORE_BYTE (c);
2943           if (c == 'N' || c == 'O')
2944             {
2945               /* ESC <Fe> for SS2 or SS3.  */
2946               single_shifting = 1;
2947               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2948             }
2949           else if (c == '1')
2950             {
2951               /* End of composition.  */
                   /* It only counts as evidence when a start sequence was
                      seen and the number of components is within
                      MAX_COMPOSITION_COMPONENTS.  */
2952               if (composition_count < 0
2953                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2954                 /* Invalid */
2955                 break;
2956               composition_count = -1;
2957               found |= CATEGORY_MASK_ISO;
2958             }
2959           else if (c >= '0' && c <= '4')
2960             {
2961               /* ESC <Fp> for start/end composition.  */
                   /* '1' was handled above, so C is '0', '2', '3' or '4'
                      here: restart the component count.  */
2962               composition_count = 0;
2963             }
2964           else
2965             {
2966               if (c >= '(' && c <= '/')
2967                 {
2968                   /* Designation sequence for a charset of dimension 1.  */
                       /* The middle index is 0 for '(' .. '+' and 1 for
                          ',' .. '/'.  */
2969                   ONE_MORE_BYTE (c1);
2970                   if (c1 < ' ' || c1 >= 0x80
2971                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2972                     /* Invalid designation sequence.  Just ignore.  */
2973                     break;
2974                 }
2975               else if (c == '$')
2976                 {
2977                   /* Designation sequence for a charset of dimension 2.  */
2978                   ONE_MORE_BYTE (c);
2979                   if (c >= '@' && c <= 'B')
2980                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the two other lookups, this
                            table entry is not checked for being negative
                            before it is passed to SAFE_CHARSET_P below --
                            confirm that it cannot be.  */
2981                     id = iso_charset_table[1][0][c];
2982                   else if (c >= '(' && c <= '/')
2983                     {
2984                       ONE_MORE_BYTE (c1);
2985                       if (c1 < ' ' || c1 >= 0x80
2986                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2987                         /* Invalid designation sequence.  Just ignore.  */
2988                         break;
2989                     }
2990                   else
2991                     /* Invalid designation sequence.  Just ignore it.  */
2992                     break;
2993                 }
2994               else
2995                 {
2996                   /* Invalid escape sequence.  Just ignore it.  */
2997                   break;
2998                 }
2999
3000               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found when the charset ID passes SAFE_CHARSET_P for
                      it, and rejected otherwise.  */
3001               rejected |= CATEGORY_MASK_ISO_8BIT;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_7_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3017               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3018                                   id))
3019                 found |= CATEGORY_MASK_ISO_8_ELSE;
3020               else
3021                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3022             }
3023           break;
3024
3025         case ISO_CODE_SO:
3026         case ISO_CODE_SI:
3027           /* Locking shift out/in.  */
3028           if (inhibit_iso_escape_detection)
3029             break;
3030           single_shifting = 0;
3031           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3032           break;
3033
3034         case ISO_CODE_CSI:
3035           /* Control sequence introducer.  */
3036           single_shifting = 0;
3037           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3038           found |= CATEGORY_MASK_ISO_8_ELSE;
3039           goto check_extra_latin;
3040
3041         case ISO_CODE_SS2:
3042         case ISO_CODE_SS3:
3043           /* Single shift.   */
3044           if (inhibit_iso_escape_detection)
3045             break;
3046           single_shifting = 0;
3047           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte is accepted as a single shift if either the
                  iso_8_1 or the iso_8_2 category has
                  CODING_ISO_FLAG_SINGLE_SHIFT set; otherwise fall into the
                  extra-Latin check below.  */
3048           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3049               & CODING_ISO_FLAG_SINGLE_SHIFT)
3050             {
3051               found |= CATEGORY_MASK_ISO_8_1;
3052               single_shifting = 1;
3053             }
3054           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3055               & CODING_ISO_FLAG_SINGLE_SHIFT)
3056             {
3057               found |= CATEGORY_MASK_ISO_8_2;
3058               single_shifting = 1;
3059             }
3060           if (single_shifting)
3061             break;
3062         check_extra_latin:
               /* C is still the byte (CSI, SS2 or SS3) that led here.
                  Unless Vlatin_extra_code_table is a vector with a non-nil
                  entry for it, every ISO category is rejected, which also
                  ends the scanning loop.  */
3063           if (! VECTORP (Vlatin_extra_code_table)
3064               || NILP (AREF (Vlatin_extra_code_table, c)))
3065             {
3066               rejected = CATEGORY_MASK_ISO;
3067               break;
3068             }
3069           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3070               & CODING_ISO_FLAG_LATIN_EXTRA)
3071             found |= CATEGORY_MASK_ISO_8_1;
3072           else
3073             rejected |= CATEGORY_MASK_ISO_8_1;
3074           rejected |= CATEGORY_MASK_ISO_8_2;
3075           break;
3076
3077         default:
3078           if (c < 0)
3079             continue;
3080           if (c < 0x80)
3081             {
                   /* Any other byte below 0x80 counts as one composition
                      component when inside a composition.  */
3082               if (composition_count >= 0)
3083                 composition_count++;
3084               single_shifting = 0;
3085               break;
3086             }
3087           if (c >= 0xA0)
3088             {
3089               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3090               found |= CATEGORY_MASK_ISO_8_1;
3091               /* Check the length of succeeding codes of the range
3092                  0xA0..0xFF.  If the byte length is even, we include
3093                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3094                  only when we are not single shifting.  */
3095               if (! single_shifting
3096                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3097                 {
                       /* LEN counts the run of bytes that are at least
                          0xA0, starting with the one already in C.  The
                          first value below 0xA0 ends the run and is pushed
                          back by resetting SRC to SRC_BASE.  */
3098                   int len = 1;
3099                   while (src < src_end)
3100                     {
3101                       src_base = src;
3102                       ONE_MORE_BYTE (c);
3103                       if (c < 0xA0)
3104                         {
3105                           src = src_base;
3106                           break;
3107                         }
3108                       len++;
3109                     }
3110
                       /* A run of odd length that stopped before the end of
                          the source rejects iso_8_2.  Otherwise iso_8_2 is
                          marked found and the run is counted as LEN / 2
                          components.  */
3111                   if (len & 1 && src < src_end)
3112                     {
3113                       rejected |= CATEGORY_MASK_ISO_8_2;
3114                       if (composition_count >= 0)
3115                         composition_count += len;
3116                     }
3117                   else
3118                     {
3119                       found |= CATEGORY_MASK_ISO_8_2;
3120                       if (composition_count >= 0)
3121                         composition_count += len / 2;
3122                     }
3123                 }
3124               break;
3125             }
3126         }
3127     }
       /* The loop above ended because every ISO category was rejected.  */
3128   detect_info->rejected |= CATEGORY_MASK_ISO;
3129   return 0;
3130
3131  no_more_source:
       /* Report what was collected; a category that was both found and
          rejected is reported as rejected only.  */
3132   detect_info->rejected |= rejected;
3133   detect_info->found |= (found & ~rejected);
3134   return 1;
3135 }
3136
3137
3138 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3139    escape sequence should be kept.

        REG selects which designation of CODING is updated.  DIM,
        CHARS_96 and FINAL are handed to ISO_CHARSET_TABLE to look up
        the charset ID.

        If FINAL is below '0' or is 128 or more, if the table entry is
        negative, or if the charset does not pass SAFE_CHARSET_P for
        CODING, the designation of REG is set to -2, CHARS_96 is set to
        -1 and the rest of the macro is skipped (the break leaves the
        enclosing do-while).

        Otherwise the ID is stored as the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, or charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
        CHARS_96 is also set to -1 when the previous designation of REG
        was -2 and the new one is charset_ascii.

        The expansion uses the variable named coding at the point of
        use, assigns to CHARS_96 (which must therefore be an lvalue)
        and declares locals named id and prev, so the arguments should
        not mention those two names.  The arguments are not
        parenthesized and FINAL is evaluated more than once.  */
3140 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3141   do {                                                                  \
3142     int id, prev;                                                       \
3143                                                                         \
3144     if (final < '0' || final >= 128                                     \
3145         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3146         || !SAFE_CHARSET_P (coding, id))                                \
3147       {                                                                 \
3148         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3149         chars_96 = -1;                                                  \
3150         break;                                                          \
3151       }                                                                 \
3152     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3153     if (id == charset_jisx0201_roman)                                   \
3154       {                                                                 \
3155         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3156           id = charset_ascii;                                           \
3157       }                                                                 \
3158     else if (id == charset_jisx0208_1978)                               \
3159       {                                                                 \
3160         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3161           id = charset_jisx0208;                                        \
3162       }                                                                 \
3163     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3164     /* If there was an invalid designation to REG previously, and this  \
3165        designation is ASCII to REG, we should keep this designation     \
3166        sequence.  */                                                    \
3167     if (prev == -2 && id == charset_ascii)                              \
3168       chars_96 = -1;                                                    \
3169   } while (0)
3170
3171
3172 /* Handle these composition sequences (ALT: alternate char):
3173
3174    (1) relative composition: ESC 0 CHAR ... ESC 1
3175    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3176    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3177    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3178
3179    When the start sequence (ESC 0/2/3/4) is found, this annotation
3180    header is produced.
3181
3182         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3183
3184    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3185    produced until the end sequence (ESC 1) is found:
3186
3187    (1) CHAR ... CHAR
3188    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3189    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3190    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3191
3192    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3193    annotation header are updated as below:
3194
3195    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3196    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3197    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3198    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3199
3200    If an error is found while composing, the annotation header is
3201    changed to:
3202
3203         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3204
3205    and the sequence [ -2 DECODED-RULE ] is changed to the original
3206    byte sequence as below:
3207         o the original byte sequence is B: [ B -1 ]
3208         o the original byte sequence is B1 B2: [ B1 B2 ]
3209    and the sequence [ -1 -1 ] is changed to the original byte
3210    sequence:
3211         [ ESC '0' ]
3212 */
3213
/* Turn the composition-rule byte held in C1 (plus, for the new
   format, one further byte fetched with ONE_MORE_BYTE) into an
   encoded composition rule and assign it to RULE.  RULE must be a
   modifiable int lvalue.  Control jumps to invalid_code when the rule
   is not acceptable.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        /* The 0x100 bit tells a new-format rule from an old one.  */   \
        rule += 0x100;                                                  \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9, nref = (rule) % 9;                       \
                                                                        \
        /* Reference point 4 of the old format is stored as 10.  */     \
        rule = COMPOSITION_ENCODE_RULE ((gref == 4 ? 10 : gref),        \
                                        (nref == 4 ? 10 : nref));       \
      }                                                                 \
  } while (0)
3242
/* Convert the decoded composition rule RULE back to bytes, storing
   them in charbuf[idx] and charbuf[idx + 1] of the expansion context
   (finish_composition uses this to overwrite a [ -2 DECODED-RULE ]
   pair in place).

   A rule below 0x100 is in the old one-byte format: the byte is
   stored, followed by -1, and `new_chars' is incremented.  Otherwise
   the rule is in the new two-byte format: both bytes are stored and
   `new_chars' is advanced by 2.

   RULE is evaluated exactly once, before anything is stored, so it
   may safely be an expression that reads charbuf[idx + 1].  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded_rule = (rule);                                  \
    int gref = (encoded_rule % 0x100) / 12;                     \
    int nref = (encoded_rule % 0x100) % 12;                     \
                                                                \
    if (encoded_rule < 0x100)   /* old format */                \
      {                                                         \
        /* Reference point 10 is written as 4 in this format. */\
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3262
3263 /* Finish the current composition as invalid.  */
3264
3265 static int
3266 finish_composition (int *charbuf, struct composition_status *cmp_status)
3267 {
3268   int idx = - cmp_status->length;
3269   int new_chars;
3270
3271   /* Recover the original ESC sequence */
3272   charbuf[idx++] = ISO_CODE_ESC;
3273   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3274                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3275                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3276                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3277                     : '4');
3278   charbuf[idx++] = -2;
3279   charbuf[idx++] = 0;
3280   charbuf[idx++] = -1;
3281   new_chars = cmp_status->nchars;
3282   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3283     for (; idx < 0; idx++)
3284       {
3285         int elt = charbuf[idx];
3286
3287         if (elt == -2)
3288           {
3289             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3290             idx++;
3291           }
3292         else if (elt == -1)
3293           {
3294             charbuf[idx++] = ISO_CODE_ESC;
3295             charbuf[idx] = '0';
3296             new_chars += 2;
3297           }
3298       }
3299   cmp_status->state = COMPOSING_NO;
3300   return new_chars;
3301 }
3302
/* Do nothing unless a composition is in progress; in that case finish
   it with finish_composition and add the character count it returns
   to `char_offset'.  Uses `cmp_status', `charbuf' and `char_offset'
   of the expansion context.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3309
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate characters of an ESC 3 or
   ESC 4 composition are being read does not start a new composition:
   it is recorded as [ -1 -1 ] and the characters that follow are
   collected as the composed CHARs.

   Otherwise any composition still in progress is finished as invalid
   and this annotation sequence is produced for the new one:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   NOTE(review): the five-element layout is taken from the
   composition-sequence comment earlier in this file and from
   finish_composition, which overwrites five header elements; confirm
   against ADD_COMPOSITION_DATA and MAX_ANNOTATION_LENGTH.

   Uses `cmp_status', `charbuf', `char_offset' and `coding' of the
   expansion context.  C1 is evaluated more than once, so it should be
   free of side effects.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the ALT part; the composed CHARs follow.  */             \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE           \
                              : (c1) == '2' ? COMPOSITION_WITH_RULE        \
                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS    \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        /* ESC 3 and ESC 4 begin with the alternate characters.  */        \
        cmp_status->state                                                  \
          = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);     \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3350
3351
/* Handle composition end sequence ESC 1.

   The sequence is rejected (the composition is finished as invalid and
   control jumps to invalid_code) when no composed character has been
   collected, when a rulebase composition ends in state COMPOSING_CHAR,
   or when any other composition ends in a state other than
   COMPOSING_CHAR.

   Otherwise the annotation header that starts cmp_status->length
   elements before `charbuf' is completed: its first element grows in
   magnitude by NCOMPS + 2 (altchar) or NCOMPS * 3 (alt&rule), its
   third element receives the number of composed characters, that
   number is added to `char_offset', and composing ends.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *cmp_header;                                                    \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->method == COMPOSITION_WITH_RULE)               \
            == (cmp_status->state == COMPOSING_CHAR)))                  \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)           \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    else if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3371
/* Append the pair [ -2 RULE ] to charbuf, account for the two new
   elements in cmp_status->length, and step cmp_status->state back by
   one (NOTE(review): presumably from a *_RULE state to the matching
   *_CHAR state; confirm against the state enum).  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
3381
/* Append C to charbuf as one more element of the current composition.
   C counts as a composed char (cmp_status->nchars) when the state is
   COMPOSING_CHAR and as a component (cmp_status->ncomps) otherwise.
   For a rulebase composition, and for the component part of an
   alt&rule composition, the state is then advanced by one
   (NOTE(review): presumably to the matching *_RULE state, which
   STORE_COMPOSITION_RULE steps back from; confirm against the state
   enum).  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length++;                                               \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3398
3399
3400 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3401
3402 static void
3403 decode_coding_iso_2022 (struct coding_system *coding)
3404 {
3405   const unsigned char *src = coding->source + coding->consumed;
3406   const unsigned char *src_end = coding->source + coding->src_bytes;
3407   const unsigned char *src_base;
3408   int *charbuf = coding->charbuf + coding->charbuf_used;
3409   /* We may produce two annotations (charset and composition) in one
3410      loop and one more charset annotation at the end.  */
3411   int *charbuf_end
3412     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3414   bool multibytep = coding->src_multibyte;
3415   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3416   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3417   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3418   int charset_id_2, charset_id_3;
3419   struct charset *charset;
3420   int c;
3421   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3422   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3423   ptrdiff_t char_offset = coding->produced_char;
3424   ptrdiff_t last_offset = char_offset;
3425   int last_id = charset_ascii;
3426   bool eol_dos
3427     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3428   int byte_after_cr = -1;
3429   int i;
3430
3431   setup_iso_safe_charsets (attrs);
3432   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3433
3434   if (cmp_status->state != COMPOSING_NO)
3435     {
3436       if (charbuf_end - charbuf < cmp_status->length)
3437         emacs_abort ();
3438       for (i = 0; i < cmp_status->length; i++)
3439         *charbuf++ = cmp_status->carryover[i];
3440       coding->annotated = 1;
3441     }
3442
3443   while (1)
3444     {
3445       int c1, c2, c3;
3446
3447       src_base = src;
3448       consumed_chars_base = consumed_chars;
3449
3450       if (charbuf >= charbuf_end)
3451         {
3452           if (byte_after_cr >= 0)
3453             src_base--;
3454           break;
3455         }
3456
3457       if (byte_after_cr >= 0)
3458         c1 = byte_after_cr, byte_after_cr = -1;
3459       else
3460         ONE_MORE_BYTE (c1);
3461       if (c1 < 0)
3462         goto invalid_code;
3463
3464       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3465         {
3466           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3467           char_offset++;
3468           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3469           continue;
3470         }
3471
3472       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3473         {
3474           if (c1 == ISO_CODE_ESC)
3475             {
3476               if (src + 1 >= src_end)
3477                 goto no_more_source;
3478               *charbuf++ = ISO_CODE_ESC;
3479               char_offset++;
3480               if (src[0] == '%' && src[1] == '@')
3481                 {
3482                   src += 2;
3483                   consumed_chars += 2;
3484                   char_offset += 2;
3485                   /* We are sure charbuf can contain two more chars. */
3486                   *charbuf++ = '%';
3487                   *charbuf++ = '@';
3488                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3489                 }
3490             }
3491           else
3492             {
3493               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3494               char_offset++;
3495             }
3496           continue;
3497         }
3498
3499       if ((cmp_status->state == COMPOSING_RULE
3500            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3501           && c1 != ISO_CODE_ESC)
3502         {
3503           int rule;
3504
3505           DECODE_COMPOSITION_RULE (rule);
3506           STORE_COMPOSITION_RULE (rule);
3507           continue;
3508         }
3509
3510       /* We produce at most one character.  */
3511       switch (iso_code_class [c1])
3512         {
3513         case ISO_0x20_or_0x7F:
3514           if (charset_id_0 < 0
3515               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3516             /* This is SPACE or DEL.  */
3517             charset = CHARSET_FROM_ID (charset_ascii);
3518           else
3519             charset = CHARSET_FROM_ID (charset_id_0);
3520           break;
3521
3522         case ISO_graphic_plane_0:
3523           if (charset_id_0 < 0)
3524             charset = CHARSET_FROM_ID (charset_ascii);
3525           else
3526             charset = CHARSET_FROM_ID (charset_id_0);
3527           break;
3528
3529         case ISO_0xA0_or_0xFF:
3530           if (charset_id_1 < 0
3531               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3532               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3533             goto invalid_code;
3534           /* This is a graphic character, we fall down ... */
3535
3536         case ISO_graphic_plane_1:
3537           if (charset_id_1 < 0)
3538             goto invalid_code;
3539           charset = CHARSET_FROM_ID (charset_id_1);
3540           break;
3541
3542         case ISO_control_0:
3543           if (eol_dos && c1 == '\r')
3544             ONE_MORE_BYTE (byte_after_cr);
3545           MAYBE_FINISH_COMPOSITION ();
3546           charset = CHARSET_FROM_ID (charset_ascii);
3547           break;
3548
3549         case ISO_control_1:
3550           goto invalid_code;
3551
3552         case ISO_shift_out:
3553           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3554               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3555             goto invalid_code;
3556           CODING_ISO_INVOCATION (coding, 0) = 1;
3557           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3558           continue;
3559
3560         case ISO_shift_in:
3561           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3562             goto invalid_code;
3563           CODING_ISO_INVOCATION (coding, 0) = 0;
3564           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3565           continue;
3566
3567         case ISO_single_shift_2_7:
3568           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3569             goto invalid_code;
3570         case ISO_single_shift_2:
3571           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3572             goto invalid_code;
3573           /* SS2 is handled as an escape sequence of ESC 'N' */
3574           c1 = 'N';
3575           goto label_escape_sequence;
3576
3577         case ISO_single_shift_3:
3578           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3579             goto invalid_code;
3580           /* SS3 is handled as an escape sequence of ESC 'O' */
3581           c1 = 'O';
3582           goto label_escape_sequence;
3583
3584         case ISO_control_sequence_introducer:
3585           /* CSI is handled as an escape sequence of ESC '[' ...  */
3586           c1 = '[';
3587           goto label_escape_sequence;
3588
3589         case ISO_escape:
3590           ONE_MORE_BYTE (c1);
3591         label_escape_sequence:
3592           /* Escape sequences handled here are invocation,
3593              designation, direction specification, and character
3594              composition specification.  */
3595           switch (c1)
3596             {
3597             case '&':           /* revision of following character set */
3598               ONE_MORE_BYTE (c1);
3599               if (!(c1 >= '@' && c1 <= '~'))
3600                 goto invalid_code;
3601               ONE_MORE_BYTE (c1);
3602               if (c1 != ISO_CODE_ESC)
3603                 goto invalid_code;
3604               ONE_MORE_BYTE (c1);
3605               goto label_escape_sequence;
3606
3607             case '$':           /* designation of 2-byte character set */
3608               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3609                 goto invalid_code;
3610               {
3611                 int reg, chars96;
3612
3613                 ONE_MORE_BYTE (c1);
3614                 if (c1 >= '@' && c1 <= 'B')
3615                   {     /* designation of JISX0208.1978, GB2312.1980,
3616                            or JISX0208.1980 */
3617                     reg = 0, chars96 = 0;
3618                   }
3619                 else if (c1 >= 0x28 && c1 <= 0x2B)
3620                   { /* designation of DIMENSION2_CHARS94 character set */
3621                     reg = c1 - 0x28, chars96 = 0;
3622                     ONE_MORE_BYTE (c1);
3623                   }
3624                 else if (c1 >= 0x2C && c1 <= 0x2F)
3625                   { /* designation of DIMENSION2_CHARS96 character set */
3626                     reg = c1 - 0x2C, chars96 = 1;
3627                     ONE_MORE_BYTE (c1);
3628                   }
3629                 else
3630                   goto invalid_code;
3631                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3632                 /* We must update these variables now.  */
3633                 if (reg == 0)
3634                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3635                 else if (reg == 1)
3636                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3637                 if (chars96 < 0)
3638                   goto invalid_code;
3639               }
3640               continue;
3641
3642             case 'n':           /* invocation of locking-shift-2 */
3643               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3644                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3645                 goto invalid_code;
3646               CODING_ISO_INVOCATION (coding, 0) = 2;
3647               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3648               continue;
3649
3650             case 'o':           /* invocation of locking-shift-3 */
3651               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3652                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3653                 goto invalid_code;
3654               CODING_ISO_INVOCATION (coding, 0) = 3;
3655               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3656               continue;
3657
3658             case 'N':           /* invocation of single-shift-2 */
3659               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3660                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3661                 goto invalid_code;
3662               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3663               if (charset_id_2 < 0)
3664                 charset = CHARSET_FROM_ID (charset_ascii);
3665               else
3666                 charset = CHARSET_FROM_ID (charset_id_2);
3667               ONE_MORE_BYTE (c1);
3668               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3669                 goto invalid_code;
3670               break;
3671
3672             case 'O':           /* invocation of single-shift-3 */
3673               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3674                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3675                 goto invalid_code;
3676               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3677               if (charset_id_3 < 0)
3678                 charset = CHARSET_FROM_ID (charset_ascii);
3679               else
3680                 charset = CHARSET_FROM_ID (charset_id_3);
3681               ONE_MORE_BYTE (c1);
3682               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3683                 goto invalid_code;
3684               break;
3685
3686             case '0': case '2': case '3': case '4': /* start composition */
3687               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3688                 goto invalid_code;
3689               if (last_id != charset_ascii)
3690                 {
3691                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3692                   last_id = charset_ascii;
3693                   last_offset = char_offset;
3694                 }
3695               DECODE_COMPOSITION_START (c1);
3696               continue;
3697
3698             case '1':           /* end composition */
3699               if (cmp_status->state == COMPOSING_NO)
3700                 goto invalid_code;
3701               DECODE_COMPOSITION_END ();
3702               continue;
3703
3704             case '[':           /* specification of direction */
3705               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3706                 goto invalid_code;
3707               /* For the moment, nested direction is not supported.
3708                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3709                  left-to-right, and nonzero means right-to-left.  */
3710               ONE_MORE_BYTE (c1);
3711               switch (c1)
3712                 {
3713                 case ']':       /* end of the current direction */
3714                   coding->mode &= ~CODING_MODE_DIRECTION;
3715
3716                 case '0':       /* end of the current direction */
3717                 case '1':       /* start of left-to-right direction */
3718                   ONE_MORE_BYTE (c1);
3719                   if (c1 == ']')
3720                     coding->mode &= ~CODING_MODE_DIRECTION;
3721                   else
3722                     goto invalid_code;
3723                   break;
3724
3725                 case '2':       /* start of right-to-left direction */
3726                   ONE_MORE_BYTE (c1);
3727                   if (c1 == ']')
3728                     coding->mode |= CODING_MODE_DIRECTION;
3729                   else
3730                     goto invalid_code;
3731                   break;
3732
3733                 default:
3734                   goto invalid_code;
3735                 }
3736               continue;
3737
3738             case '%':
3739               ONE_MORE_BYTE (c1);
3740               if (c1 == '/')
3741                 {
3742                   /* CTEXT extended segment:
3743                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3744                      We keep these bytes as is for the moment.
3745                      They may be decoded by post-read-conversion.  */
3746                   int dim, M, L;
3747                   int size;
3748
3749                   ONE_MORE_BYTE (dim);
3750                   if (dim < '0' || dim > '4')
3751                     goto invalid_code;
3752                   ONE_MORE_BYTE (M);
3753                   if (M < 128)
3754                     goto invalid_code;
3755                   ONE_MORE_BYTE (L);
3756                   if (L < 128)
3757                     goto invalid_code;
3758                   size = ((M - 128) * 128) + (L - 128);
3759                   if (charbuf + 6 > charbuf_end)
3760                     goto break_loop;
3761                   *charbuf++ = ISO_CODE_ESC;
3762                   *charbuf++ = '%';
3763                   *charbuf++ = '/';
3764                   *charbuf++ = dim;
3765                   *charbuf++ = BYTE8_TO_CHAR (M);
3766                   *charbuf++ = BYTE8_TO_CHAR (L);
3767                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3768                 }
3769               else if (c1 == 'G')
3770                 {
3771                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3772                      ESC % G --UTF-8-BYTES-- ESC % @
3773                      We keep these bytes as is for the moment.
3774                      They may be decoded by post-read-conversion.  */
3775                   if (charbuf + 3 > charbuf_end)
3776                     goto break_loop;
3777                   *charbuf++ = ISO_CODE_ESC;
3778                   *charbuf++ = '%';
3779                   *charbuf++ = 'G';
3780                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3781                 }
3782               else
3783                 goto invalid_code;
3784               continue;
3785               break;
3786
3787             default:
3788               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3789                 goto invalid_code;
3790               {
3791                 int reg, chars96;
3792
3793                 if (c1 >= 0x28 && c1 <= 0x2B)
3794                   { /* designation of DIMENSION1_CHARS94 character set */
3795                     reg = c1 - 0x28, chars96 = 0;
3796                     ONE_MORE_BYTE (c1);
3797                   }
3798                 else if (c1 >= 0x2C && c1 <= 0x2F)
3799                   { /* designation of DIMENSION1_CHARS96 character set */
3800                     reg = c1 - 0x2C, chars96 = 1;
3801                     ONE_MORE_BYTE (c1);
3802                   }
3803                 else
3804                   goto invalid_code;
3805                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3806                 /* We must update these variables now.  */
3807                 if (reg == 0)
3808                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3809                 else if (reg == 1)
3810                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3811                 if (chars96 < 0)
3812                   goto invalid_code;
3813               }
3814               continue;
3815             }
3816           break;
3817
3818         default:
3819           emacs_abort ();
3820         }
3821
3822       if (cmp_status->state == COMPOSING_NO
3823           && charset->id != charset_ascii
3824           && last_id != charset->id)
3825         {
3826           if (last_id != charset_ascii)
3827             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3828           last_id = charset->id;
3829           last_offset = char_offset;
3830         }
3831
3832       /* Now we know CHARSET and 1st position code C1 of a character.
3833          Produce a decoded character while getting 2nd and 3rd
3834          position codes C2, C3 if necessary.  */
3835       if (CHARSET_DIMENSION (charset) > 1)
3836         {
3837           ONE_MORE_BYTE (c2);
3838           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3839               || ((c1 & 0x80) != (c2 & 0x80)))
3840             /* C2 is not in a valid range.  */
3841             goto invalid_code;
3842           if (CHARSET_DIMENSION (charset) == 2)
3843             c1 = (c1 << 8) | c2;
3844           else
3845             {
3846               ONE_MORE_BYTE (c3);
3847               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3848                   || ((c1 & 0x80) != (c3 & 0x80)))
3849                 /* C3 is not in a valid range.  */
3850                 goto invalid_code;
3851               c1 = (c1 << 16) | (c2 << 8) | c2;
3852             }
3853         }
3854       c1 &= 0x7F7F7F;
3855       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3856       if (c < 0)
3857         {
3858           MAYBE_FINISH_COMPOSITION ();
3859           for (; src_base < src; src_base++, char_offset++)
3860             {
3861               if (ASCII_BYTE_P (*src_base))
3862                 *charbuf++ = *src_base;
3863               else
3864                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3865             }
3866         }
3867       else if (cmp_status->state == COMPOSING_NO)
3868         {
3869           *charbuf++ = c;
3870           char_offset++;
3871         }
3872       else if ((cmp_status->state == COMPOSING_CHAR
3873                 ? cmp_status->nchars
3874                 : cmp_status->ncomps)
3875                >= MAX_COMPOSITION_COMPONENTS)
3876         {
3877           /* Too long composition.  */
3878           MAYBE_FINISH_COMPOSITION ();
3879           *charbuf++ = c;
3880           char_offset++;
3881         }
3882       else
3883         STORE_COMPOSITION_CHAR (c);
3884       continue;
3885
3886     invalid_code:
3887       MAYBE_FINISH_COMPOSITION ();
3888       src = src_base;
3889       consumed_chars = consumed_chars_base;
3890       ONE_MORE_BYTE (c);
3891       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3892       char_offset++;
3893       coding->errors++;
3894       continue;
3895
3896     break_loop:
3897       break;
3898     }
3899
3900  no_more_source:
3901   if (cmp_status->state != COMPOSING_NO)
3902     {
3903       if (coding->mode & CODING_MODE_LAST_BLOCK)
3904         MAYBE_FINISH_COMPOSITION ();
3905       else
3906         {
3907           charbuf -= cmp_status->length;
3908           for (i = 0; i < cmp_status->length; i++)
3909             cmp_status->carryover[i] = charbuf[i];
3910         }
3911     }
3912   else if (last_id != charset_ascii)
3913     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3914   coding->consumed_char += consumed_chars_base;
3915   coding->consumed = src_base - coding->source;
3916   coding->charbuf_used = charbuf - coding->charbuf;
3917 }
3918
3919
3920 /* ISO2022 encoding stuff.  */
3921
/*
   It is not enough to say just "ISO2022" when encoding; we have to
   specify more details.  In Emacs, each coding system of the ISO2022
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
3940
/* Emit at DST (advancing it) the escape sequence that designates
   CHARSET to graphic register REG, and record the designation in
   CODING.  The sequence is

     [ESC '&' '@'+REVISION]  ESC  ['$']  [INTERMEDIATE]  FINAL

   where the revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION and CHARSET has a non-negative revision,
   '$' is produced only when the dimension of CHARSET is not 1, and
   INTERMEDIATE is the REG'th character of "()*+" (94-char sets) or
   ",-./" (96-char sets).  INTERMEDIATE is omitted (short form) only
   for a 94-char set of dimension other than 1 designated to register
   0 whose <final-char> is '@', 'A', or 'B', and only when CODING
   does not have CODING_ISO_FLAG_LONG_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    bool multi_byte_set = CHARSET_DIMENSION (charset) != 1;             \
    bool short_form                                                     \
      = (multi_byte_set                                                 \
         && ! CHARSET_ISO_CHARS_96 (charset)                            \
         && ! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)   \
         && (reg) == 0                                                  \
         && iso_final >= '@' && iso_final <= 'B');                      \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      {                                                                 \
        int rev = CHARSET_ISO_REVISION (charset);                       \
                                                                        \
        if (rev >= 0)                                                   \
          {                                                             \
            EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                   \
            EMIT_ONE_BYTE ('@' + rev);                                  \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (multi_byte_set)                                                 \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    if (! short_form)                                                   \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3988
3989
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3) and flag CODING as being in single-shift state.  */

/* Single-shift-2: the one-byte control ISO_CODE_SS2, or ESC 'N' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4002
4003
/* Single-shift-3: the one-byte control ISO_CODE_SS3, or ESC 'O' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS.  CODING is then flagged as
   being in single-shift state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4012
4013
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  */

/* Shift-in (ISO_CODE_SI): invoke G0 to plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out (ISO_CODE_SO): invoke G1 to plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2 (ESC 'n'): invoke G2 to plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3 (ESC 'o'): invoke G3 to plane 0.  The final byte
   must be 'o'; ESC 'n' is locking-shift-2 and would make a decoder
   invoke G2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4044
4045
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   and CHARSET is ASCII, it is replaced by the JISX0201-Roman charset.
   C1 may be any expression; it is parenthesized wherever it is used.

   The body loops until the character has been emitted: while a
   single-shift is pending the byte is emitted at once (and the
   pending state is cleared); otherwise the byte is emitted with the
   high bit clear if CHARSET is invoked to graphic plane 0, or set if
   it is invoked to graphic plane 1.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4088
4089
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is JISX0208, it is replaced
   by the JISX0208-1978 charset.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; the character follows it    \
           directly and ends the single-shift state.  */                \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke      \
       it (designating it to a graphic register first if needed),       \
       then go round the loop again to produce the character.  */       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4132
4133
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then hand it to the DIMENSION1 producer when
   the dimension of CHARSET is 1, and otherwise to the DIMENSION2
   producer with the code split into (code >> 8) and (code & 0xFF).
   CHARSET must be an lvalue, because those producers may replace
   it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4144
4145
4146 /* Produce designation and invocation codes at a place pointed by DST
4147    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4148    Return new DST.  */
4149
4150 static unsigned char *
4151 encode_invocation_designation (struct charset *charset,
4152                                struct coding_system *coding,
4153                                unsigned char *dst, ptrdiff_t *p_nchars)
4154 {
4155   bool multibytep = coding->dst_multibyte;
4156   ptrdiff_t produced_chars = *p_nchars;
4157   int reg;                      /* graphic register number */
4158   int id = CHARSET_ID (charset);
4159
4160   /* At first, check designations.  */
4161   for (reg = 0; reg < 4; reg++)
4162     if (id == CODING_ISO_DESIGNATION (coding, reg))
4163       break;
4164
4165   if (reg >= 4)
4166     {
4167       /* CHARSET is not yet designated to any graphic registers.  */
4168       /* At first check the requested designation.  */
4169       reg = CODING_ISO_REQUEST (coding, id);
4170       if (reg < 0)
4171         /* Since CHARSET requests no special designation, designate it
4172            to graphic register 0.  */
4173         reg = 0;
4174
4175       ENCODE_DESIGNATION (charset, reg, coding);
4176     }
4177
4178   if (CODING_ISO_INVOCATION (coding, 0) != reg
4179       && CODING_ISO_INVOCATION (coding, 1) != reg)
4180     {
4181       /* Since the graphic register REG is not invoked to any graphic
4182          planes, invoke it to graphic plane 0.  */
4183       switch (reg)
4184         {
4185         case 0:                 /* graphic register 0 */
4186           ENCODE_SHIFT_IN;
4187           break;
4188
4189         case 1:                 /* graphic register 1 */
4190           ENCODE_SHIFT_OUT;
4191           break;
4192
4193         case 2:                 /* graphic register 2 */
4194           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4195             ENCODE_SINGLE_SHIFT_2;
4196           else
4197             ENCODE_LOCKING_SHIFT_2;
4198           break;
4199
4200         case 3:                 /* graphic register 3 */
4201           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4202             ENCODE_SINGLE_SHIFT_3;
4203           else
4204             ENCODE_LOCKING_SHIFT_3;
4205           break;
4206         }
4207     }
4208
4209   *p_nchars = produced_chars;
4210   return dst;
4211 }
4212
4213
/* Produce codes that bring the graphic planes and registers back to
   their initial state: a shift-in if something other than graphic
   register 0 is invoked to graphic plane 0, then a designation for
   every graphic register that has an initial charset (non-negative
   CODING_ISO_INITIAL) different from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int greg;                                                           \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (greg = 0; greg < 4; greg++)                                    \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, greg);             \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, greg))     \
          {                                                             \
            struct charset *initial_charset                             \
              = CHARSET_FROM_ID (initial_id);                           \
                                                                        \
            ENCODE_DESIGNATION (initial_charset, greg, coding);         \
          }                                                             \
      }                                                                 \
  } while (0)
4232
4233
4234 /* Produce designation sequences of charsets in the line started from
4235    CHARBUF to a place pointed by DST, and return the number of
4236    produced bytes.  DST should not directly point a buffer text area
4237    which may be relocated by char_charset call.
4238
4239    If the current block ends before any end-of-line, we may fail to
4240    find all the necessary designations.  */
4241
4242 static ptrdiff_t
4243 encode_designation_at_bol (struct coding_system *coding,
4244                            int *charbuf, int *charbuf_end,
4245                            unsigned char *dst)
4246 {
4247   unsigned char *orig = dst;
4248   struct charset *charset;
4249   /* Table of charsets to be designated to each graphic register.  */
4250   int r[4];
4251   int c, found = 0, reg;
4252   ptrdiff_t produced_chars = 0;
4253   bool multibytep = coding->dst_multibyte;
4254   Lisp_Object attrs;
4255   Lisp_Object charset_list;
4256
4257   attrs = CODING_ID_ATTRS (coding->id);
4258   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4259   if (EQ (charset_list, Qiso_2022))
4260     charset_list = Viso_2022_charset_list;
4261
4262   for (reg = 0; reg < 4; reg++)
4263     r[reg] = -1;
4264
4265   while (charbuf < charbuf_end && found < 4)
4266     {
4267       int id;
4268
4269       c = *charbuf++;
4270       if (c == '\n')
4271         break;
4272       charset = char_charset (c, charset_list, NULL);
4273       id = CHARSET_ID (charset);
4274       reg = CODING_ISO_REQUEST (coding, id);
4275       if (reg >= 0 && r[reg] < 0)
4276         {
4277           found++;
4278           r[reg] = id;
4279         }
4280     }
4281
4282   if (found)
4283     {
4284       for (reg = 0; reg < 4; reg++)
4285         if (r[reg] >= 0
4286             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4287           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4288     }
4289
4290   return dst - orig;
4291 }
4292
4293 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4294
4295 static bool
4296 encode_coding_iso_2022 (struct coding_system *coding)
4297 {
4298   bool multibytep = coding->dst_multibyte;
4299   int *charbuf = coding->charbuf;
4300   int *charbuf_end = charbuf + coding->charbuf_used;
4301   unsigned char *dst = coding->destination + coding->produced;
4302   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4303   int safe_room = 16;
4304   bool bol_designation
4305     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4306        && CODING_ISO_BOL (coding));
4307   ptrdiff_t produced_chars = 0;
4308   Lisp_Object attrs, eol_type, charset_list;
4309   bool ascii_compatible;
4310   int c;
4311   int preferred_charset_id = -1;
4312
4313   CODING_GET_INFO (coding, attrs, charset_list);
4314   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4315   if (VECTORP (eol_type))
4316     eol_type = Qunix;
4317
4318   setup_iso_safe_charsets (attrs);
4319   /* Charset list may have been changed.  */
4320   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4321   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4322
4323   ascii_compatible
4324     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4325        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4326                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4327
4328   while (charbuf < charbuf_end)
4329     {
4330       ASSURE_DESTINATION (safe_room);
4331
4332       if (bol_designation)
4333         {
4334           /* We have to produce designation sequences if any now.  */
4335           unsigned char desig_buf[16];
4336           int nbytes;
4337           ptrdiff_t offset;
4338
4339           charset_map_loaded = 0;
4340           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4341                                               desig_buf);
4342           if (charset_map_loaded
4343               && (offset = coding_change_destination (coding)))
4344             {
4345               dst += offset;
4346               dst_end += offset;
4347             }
4348           memcpy (dst, desig_buf, nbytes);
4349           dst += nbytes;
4350           /* We are sure that designation sequences are all ASCII bytes.  */
4351           produced_chars += nbytes;
4352           bol_designation = 0;
4353           ASSURE_DESTINATION (safe_room);
4354         }
4355
4356       c = *charbuf++;
4357
4358       if (c < 0)
4359         {
4360           /* Handle an annotation.  */
4361           switch (*charbuf)
4362             {
4363             case CODING_ANNOTATE_COMPOSITION_MASK:
4364               /* Not yet implemented.  */
4365               break;
4366             case CODING_ANNOTATE_CHARSET_MASK:
4367               preferred_charset_id = charbuf[2];
4368               if (preferred_charset_id >= 0
4369                   && NILP (Fmemq (make_number (preferred_charset_id),
4370                                   charset_list)))
4371                 preferred_charset_id = -1;
4372               break;
4373             default:
4374               emacs_abort ();
4375             }
4376           charbuf += -c - 1;
4377           continue;
4378         }
4379
4380       /* Now encode the character C.  */
4381       if (c < 0x20 || c == 0x7F)
4382         {
4383           if (c == '\n'
4384               || (c == '\r' && EQ (eol_type, Qmac)))
4385             {
4386               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4387                 ENCODE_RESET_PLANE_AND_REGISTER ();
4388               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4389                 {
4390                   int i;
4391
4392                   for (i = 0; i < 4; i++)
4393                     CODING_ISO_DESIGNATION (coding, i)
4394                       = CODING_ISO_INITIAL (coding, i);
4395                 }
4396               bol_designation = ((CODING_ISO_FLAGS (coding)
4397                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4398                                  != 0);
4399             }
4400           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4401             ENCODE_RESET_PLANE_AND_REGISTER ();
4402           EMIT_ONE_ASCII_BYTE (c);
4403         }
4404       else if (ASCII_CHAR_P (c))
4405         {
4406           if (ascii_compatible)
4407             EMIT_ONE_ASCII_BYTE (c);
4408           else
4409             {
4410               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4411               ENCODE_ISO_CHARACTER (charset, c);
4412             }
4413         }
4414       else if (CHAR_BYTE8_P (c))
4415         {
4416           c = CHAR_TO_BYTE8 (c);
4417           EMIT_ONE_BYTE (c);
4418         }
4419       else
4420         {
4421           struct charset *charset;
4422
4423           if (preferred_charset_id >= 0)
4424             {
4425               bool result;
4426
4427               charset = CHARSET_FROM_ID (preferred_charset_id);
4428               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4429               if (! result)
4430                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431                                      NULL, charset);
4432             }
4433           else
4434             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435                                  NULL, charset);
4436           if (!charset)
4437             {
4438               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4439                 {
4440                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4441                   charset = CHARSET_FROM_ID (charset_ascii);
4442                 }
4443               else
4444                 {
4445                   c = coding->default_char;
4446                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4447                                        charset_list, NULL, charset);
4448                 }
4449             }
4450           ENCODE_ISO_CHARACTER (charset, c);
4451         }
4452     }
4453
4454   if (coding->mode & CODING_MODE_LAST_BLOCK
4455       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4456     {
4457       ASSURE_DESTINATION (safe_room);
4458       ENCODE_RESET_PLANE_AND_REGISTER ();
4459     }
4460   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4461   CODING_ISO_BOL (coding) = bol_designation;
4462   coding->produced_char += produced_chars;
4463   coding->produced = dst - coding->destination;
4464   return 0;
4465 }
4466
4467 \f
/*** 8. Shift-JIS and BIG5 handlers ***/
4469
4470 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4471    quite widely.  So, for the moment, Emacs supports them in the bare
4472    C code.  But, in the future, they may be supported only by CCL.  */
4473
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset
   japanese-jisx0208 is encoded in two bytes, but the two
   position-codes are divided and shifted so that they fit in the
   ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4490
4491 /* BIG5 is a coding system encoding two character sets: ASCII and
4492    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4493    character set and is encoded in two-byte.
4494
4495    --- CODE RANGE of BIG5 ---
4496    (character set)      (range)
4497    ASCII                0x00 .. 0x7F
4498    Big5 (1st byte)      0xA1 .. 0xFE
4499         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4500    --------------------------
4501
4502   */
4503
4504 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4505    Return true if a text is encoded in SJIS.  */
4506
4507 static bool
4508 detect_coding_sjis (struct coding_system *coding,
4509                     struct coding_detection_info *detect_info)
4510 {
4511   const unsigned char *src = coding->source, *src_base;
4512   const unsigned char *src_end = coding->source + coding->src_bytes;
4513   bool multibytep = coding->src_multibyte;
4514   ptrdiff_t consumed_chars = 0;
4515   int found = 0;
4516   int c;
4517   Lisp_Object attrs, charset_list;
4518   int max_first_byte_of_2_byte_code;
4519
4520   CODING_GET_INFO (coding, attrs, charset_list);
4521   max_first_byte_of_2_byte_code
4522     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4523
4524   detect_info->checked |= CATEGORY_MASK_SJIS;
4525   /* A coding system of this category is always ASCII compatible.  */
4526   src += coding->head_ascii;
4527
4528   while (1)
4529     {
4530       src_base = src;
4531       ONE_MORE_BYTE (c);
4532       if (c < 0x80)
4533         continue;
4534       if ((c >= 0x81 && c <= 0x9F)
4535           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4536         {
4537           ONE_MORE_BYTE (c);
4538           if (c < 0x40 || c == 0x7F || c > 0xFC)
4539             break;
4540           found = CATEGORY_MASK_SJIS;
4541         }
4542       else if (c >= 0xA0 && c < 0xE0)
4543         found = CATEGORY_MASK_SJIS;
4544       else
4545         break;
4546     }
4547   detect_info->rejected |= CATEGORY_MASK_SJIS;
4548   return 0;
4549
4550  no_more_source:
4551   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4552     {
4553       detect_info->rejected |= CATEGORY_MASK_SJIS;
4554       return 0;
4555     }
4556   detect_info->found |= found;
4557   return 1;
4558 }
4559
4560 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4561    Return true if a text is encoded in BIG5.  */
4562
4563 static bool
4564 detect_coding_big5 (struct coding_system *coding,
4565                     struct coding_detection_info *detect_info)
4566 {
4567   const unsigned char *src = coding->source, *src_base;
4568   const unsigned char *src_end = coding->source + coding->src_bytes;
4569   bool multibytep = coding->src_multibyte;
4570   ptrdiff_t consumed_chars = 0;
4571   int found = 0;
4572   int c;
4573
4574   detect_info->checked |= CATEGORY_MASK_BIG5;
4575   /* A coding system of this category is always ASCII compatible.  */
4576   src += coding->head_ascii;
4577
4578   while (1)
4579     {
4580       src_base = src;
4581       ONE_MORE_BYTE (c);
4582       if (c < 0x80)
4583         continue;
4584       if (c >= 0xA1)
4585         {
4586           ONE_MORE_BYTE (c);
4587           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4588             return 0;
4589           found = CATEGORY_MASK_BIG5;
4590         }
4591       else
4592         break;
4593     }
4594   detect_info->rejected |= CATEGORY_MASK_BIG5;
4595   return 0;
4596
4597  no_more_source:
4598   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4599     {
4600       detect_info->rejected |= CATEGORY_MASK_BIG5;
4601       return 0;
4602     }
4603   detect_info->found |= found;
4604   return 1;
4605 }
4606
4607 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4608
4609 static void
4610 decode_coding_sjis (struct coding_system *coding)
4611 {
4612   const unsigned char *src = coding->source + coding->consumed;
4613   const unsigned char *src_end = coding->source + coding->src_bytes;
4614   const unsigned char *src_base;
4615   int *charbuf = coding->charbuf + coding->charbuf_used;
4616   /* We may produce one charset annotation in one loop and one more at
4617      the end.  */
4618   int *charbuf_end
4619     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4620   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4621   bool multibytep = coding->src_multibyte;
4622   struct charset *charset_roman, *charset_kanji, *charset_kana;
4623   struct charset *charset_kanji2;
4624   Lisp_Object attrs, charset_list, val;
4625   ptrdiff_t char_offset = coding->produced_char;
4626   ptrdiff_t last_offset = char_offset;
4627   int last_id = charset_ascii;
4628   bool eol_dos
4629     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4630   int byte_after_cr = -1;
4631
4632   CODING_GET_INFO (coding, attrs, charset_list);
4633
4634   val = charset_list;
4635   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4637   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4638   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4639
4640   while (1)
4641     {
4642       int c, c1;
4643       struct charset *charset;
4644
4645       src_base = src;
4646       consumed_chars_base = consumed_chars;
4647
4648       if (charbuf >= charbuf_end)
4649         {
4650           if (byte_after_cr >= 0)
4651             src_base--;
4652           break;
4653         }
4654
4655       if (byte_after_cr >= 0)
4656         c = byte_after_cr, byte_after_cr = -1;
4657       else
4658         ONE_MORE_BYTE (c);
4659       if (c < 0)
4660         goto invalid_code;
4661       if (c < 0x80)
4662         {
4663           if (eol_dos && c == '\r')
4664             ONE_MORE_BYTE (byte_after_cr);
4665           charset = charset_roman;
4666         }
4667       else if (c == 0x80 || c == 0xA0)
4668         goto invalid_code;
4669       else if (c >= 0xA1 && c <= 0xDF)
4670         {
4671           /* SJIS -> JISX0201-Kana */
4672           c &= 0x7F;
4673           charset = charset_kana;
4674         }
4675       else if (c <= 0xEF)
4676         {
4677           /* SJIS -> JISX0208 */
4678           ONE_MORE_BYTE (c1);
4679           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4680             goto invalid_code;
4681           c = (c << 8) | c1;
4682           SJIS_TO_JIS (c);
4683           charset = charset_kanji;
4684         }
4685       else if (c <= 0xFC && charset_kanji2)
4686         {
4687           /* SJIS -> JISX0213-2 */
4688           ONE_MORE_BYTE (c1);
4689           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4690             goto invalid_code;
4691           c = (c << 8) | c1;
4692           SJIS_TO_JIS2 (c);
4693           charset = charset_kanji2;
4694         }
4695       else
4696         goto invalid_code;
4697       if (charset->id != charset_ascii
4698           && last_id != charset->id)
4699         {
4700           if (last_id != charset_ascii)
4701             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4702           last_id = charset->id;
4703           last_offset = char_offset;
4704         }
4705       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4706       *charbuf++ = c;
4707       char_offset++;
4708       continue;
4709
4710     invalid_code:
4711       src = src_base;
4712       consumed_chars = consumed_chars_base;
4713       ONE_MORE_BYTE (c);
4714       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4715       char_offset++;
4716       coding->errors++;
4717     }
4718
4719  no_more_source:
4720   if (last_id != charset_ascii)
4721     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4722   coding->consumed_char += consumed_chars_base;
4723   coding->consumed = src_base - coding->source;
4724   coding->charbuf_used = charbuf - coding->charbuf;
4725 }
4726
4727 static void
4728 decode_coding_big5 (struct coding_system *coding)
4729 {
4730   const unsigned char *src = coding->source + coding->consumed;
4731   const unsigned char *src_end = coding->source + coding->src_bytes;
4732   const unsigned char *src_base;
4733   int *charbuf = coding->charbuf + coding->charbuf_used;
4734   /* We may produce one charset annotation in one loop and one more at
4735      the end.  */
4736   int *charbuf_end
4737     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4738   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4739   bool multibytep = coding->src_multibyte;
4740   struct charset *charset_roman, *charset_big5;
4741   Lisp_Object attrs, charset_list, val;
4742   ptrdiff_t char_offset = coding->produced_char;
4743   ptrdiff_t last_offset = char_offset;
4744   int last_id = charset_ascii;
4745   bool eol_dos
4746     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4747   int byte_after_cr = -1;
4748
4749   CODING_GET_INFO (coding, attrs, charset_list);
4750   val = charset_list;
4751   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4752   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4753
4754   while (1)
4755     {
4756       int c, c1;
4757       struct charset *charset;
4758
4759       src_base = src;
4760       consumed_chars_base = consumed_chars;
4761
4762       if (charbuf >= charbuf_end)
4763         {
4764           if (byte_after_cr >= 0)
4765             src_base--;
4766           break;
4767         }
4768
4769       if (byte_after_cr >= 0)
4770         c = byte_after_cr, byte_after_cr = -1;
4771       else
4772         ONE_MORE_BYTE (c);
4773
4774       if (c < 0)
4775         goto invalid_code;
4776       if (c < 0x80)
4777         {
4778           if (eol_dos && c == '\r')
4779             ONE_MORE_BYTE (byte_after_cr);
4780           charset = charset_roman;
4781         }
4782       else
4783         {
4784           /* BIG5 -> Big5 */
4785           if (c < 0xA1 || c > 0xFE)
4786             goto invalid_code;
4787           ONE_MORE_BYTE (c1);
4788           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4789             goto invalid_code;
4790           c = c << 8 | c1;
4791           charset = charset_big5;
4792         }
4793       if (charset->id != charset_ascii
4794           && last_id != charset->id)
4795         {
4796           if (last_id != charset_ascii)
4797             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798           last_id = charset->id;
4799           last_offset = char_offset;
4800         }
4801       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4802       *charbuf++ = c;
4803       char_offset++;
4804       continue;
4805
4806     invalid_code:
4807       src = src_base;
4808       consumed_chars = consumed_chars_base;
4809       ONE_MORE_BYTE (c);
4810       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4811       char_offset++;
4812       coding->errors++;
4813     }
4814
4815  no_more_source:
4816   if (last_id != charset_ascii)
4817     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4818   coding->consumed_char += consumed_chars_base;
4819   coding->consumed = src_base - coding->source;
4820   coding->charbuf_used = charbuf - coding->charbuf;
4821 }
4822
4823 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4824    This function can encode charsets `ascii', `katakana-jisx0201',
4825    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4826    are sure that all these charsets are registered as official charset
4827    (i.e. do not have extended leading-codes).  Characters of other
4828    charsets are produced without any encoding.  */
4829
4830 static bool
4831 encode_coding_sjis (struct coding_system *coding)
4832 {
4833   bool multibytep = coding->dst_multibyte;
4834   int *charbuf = coding->charbuf;
4835   int *charbuf_end = charbuf + coding->charbuf_used;
4836   unsigned char *dst = coding->destination + coding->produced;
4837   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4838   int safe_room = 4;
4839   ptrdiff_t produced_chars = 0;
4840   Lisp_Object attrs, charset_list, val;
4841   bool ascii_compatible;
4842   struct charset *charset_kanji, *charset_kana;
4843   struct charset *charset_kanji2;
4844   int c;
4845
4846   CODING_GET_INFO (coding, attrs, charset_list);
4847   val = XCDR (charset_list);
4848   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4849   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4850   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4851
4852   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4853
4854   while (charbuf < charbuf_end)
4855     {
4856       ASSURE_DESTINATION (safe_room);
4857       c = *charbuf++;
4858       /* Now encode the character C.  */
4859       if (ASCII_CHAR_P (c) && ascii_compatible)
4860         EMIT_ONE_ASCII_BYTE (c);
4861       else if (CHAR_BYTE8_P (c))
4862         {
4863           c = CHAR_TO_BYTE8 (c);
4864           EMIT_ONE_BYTE (c);
4865         }
4866       else
4867         {
4868           unsigned code;
4869           struct charset *charset;
4870           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4871                                &code, charset);
4872
4873           if (!charset)
4874             {
4875               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4876                 {
4877                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4878                   charset = CHARSET_FROM_ID (charset_ascii);
4879                 }
4880               else
4881                 {
4882                   c = coding->default_char;
4883                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4884                                        charset_list, &code, charset);
4885                 }
4886             }
4887           if (code == CHARSET_INVALID_CODE (charset))
4888             emacs_abort ();
4889           if (charset == charset_kanji)
4890             {
4891               int c1, c2;
4892               JIS_TO_SJIS (code);
4893               c1 = code >> 8, c2 = code & 0xFF;
4894               EMIT_TWO_BYTES (c1, c2);
4895             }
4896           else if (charset == charset_kana)
4897             EMIT_ONE_BYTE (code | 0x80);
4898           else if (charset_kanji2 && charset == charset_kanji2)
4899             {
4900               int c1, c2;
4901
4902               c1 = code >> 8;
4903               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4904                   || c1 == 0x28
4905                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4906                 {
4907                   JIS_TO_SJIS2 (code);
4908                   c1 = code >> 8, c2 = code & 0xFF;
4909                   EMIT_TWO_BYTES (c1, c2);
4910                 }
4911               else
4912                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4913             }
4914           else
4915             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4916         }
4917     }
4918   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4919   coding->produced_char += produced_chars;
4920   coding->produced = dst - coding->destination;
4921   return 0;
4922 }
4923
4924 static bool
4925 encode_coding_big5 (struct coding_system *coding)
4926 {
4927   bool multibytep = coding->dst_multibyte;
4928   int *charbuf = coding->charbuf;
4929   int *charbuf_end = charbuf + coding->charbuf_used;
4930   unsigned char *dst = coding->destination + coding->produced;
4931   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4932   int safe_room = 4;
4933   ptrdiff_t produced_chars = 0;
4934   Lisp_Object attrs, charset_list, val;
4935   bool ascii_compatible;
4936   struct charset *charset_big5;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4942   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4943
4944   while (charbuf < charbuf_end)
4945     {
4946       ASSURE_DESTINATION (safe_room);
4947       c = *charbuf++;
4948       /* Now encode the character C.  */
4949       if (ASCII_CHAR_P (c) && ascii_compatible)
4950         EMIT_ONE_ASCII_BYTE (c);
4951       else if (CHAR_BYTE8_P (c))
4952         {
4953           c = CHAR_TO_BYTE8 (c);
4954           EMIT_ONE_BYTE (c);
4955         }
4956       else
4957         {
4958           unsigned code;
4959           struct charset *charset;
4960           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4961                                &code, charset);
4962
4963           if (! charset)
4964             {
4965               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4966                 {
4967                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4968                   charset = CHARSET_FROM_ID (charset_ascii);
4969                 }
4970               else
4971                 {
4972                   c = coding->default_char;
4973                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4974                                        charset_list, &code, charset);
4975                 }
4976             }
4977           if (code == CHARSET_INVALID_CODE (charset))
4978             emacs_abort ();
4979           if (charset == charset_big5)
4980             {
4981               int c1, c2;
4982
4983               c1 = code >> 8, c2 = code & 0xFF;
4984               EMIT_TWO_BYTES (c1, c2);
4985             }
4986           else
4987             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4988         }
4989     }
4990   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4991   coding->produced_char += produced_chars;
4992   coding->produced = dst - coding->destination;
4993   return 0;
4994 }
4995
4996 \f
4997 /*** 10. CCL handlers ***/
4998
4999 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5000    Return true if a text is encoded in a coding system of which
5001    encoder/decoder are written in CCL program.  */
5002
5003 static bool
5004 detect_coding_ccl (struct coding_system *coding,
5005                    struct coding_detection_info *detect_info)
5006 {
5007   const unsigned char *src = coding->source, *src_base;
5008   const unsigned char *src_end = coding->source + coding->src_bytes;
5009   bool multibytep = coding->src_multibyte;
5010   ptrdiff_t consumed_chars = 0;
5011   int found = 0;
5012   unsigned char *valids;
5013   ptrdiff_t head_ascii = coding->head_ascii;
5014   Lisp_Object attrs;
5015
5016   detect_info->checked |= CATEGORY_MASK_CCL;
5017
5018   coding = &coding_categories[coding_category_ccl];
5019   valids = CODING_CCL_VALIDS (coding);
5020   attrs = CODING_ID_ATTRS (coding->id);
5021   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5022     src += head_ascii;
5023
5024   while (1)
5025     {
5026       int c;
5027
5028       src_base = src;
5029       ONE_MORE_BYTE (c);
5030       if (c < 0 || ! valids[c])
5031         break;
5032       if ((valids[c] > 1))
5033         found = CATEGORY_MASK_CCL;
5034     }
5035   detect_info->rejected |= CATEGORY_MASK_CCL;
5036   return 0;
5037
5038  no_more_source:
5039   detect_info->found |= found;
5040   return 1;
5041 }
5042
5043 static void
5044 decode_coding_ccl (struct coding_system *coding)
5045 {
5046   const unsigned char *src = coding->source + coding->consumed;
5047   const unsigned char *src_end = coding->source + coding->src_bytes;
5048   int *charbuf = coding->charbuf + coding->charbuf_used;
5049   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5050   ptrdiff_t consumed_chars = 0;
5051   bool multibytep = coding->src_multibyte;
5052   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5053   int source_charbuf[1024];
5054   int source_byteidx[1025];
5055   Lisp_Object attrs, charset_list;
5056
5057   CODING_GET_INFO (coding, attrs, charset_list);
5058
5059   while (1)
5060     {
5061       const unsigned char *p = src;
5062       int i = 0;
5063
5064       if (multibytep)
5065         {
5066           while (i < 1024 && p < src_end)
5067             {
5068               source_byteidx[i] = p - src;
5069               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5070             }
5071           source_byteidx[i] = p - src;
5072         }
5073       else
5074         while (i < 1024 && p < src_end)
5075           source_charbuf[i++] = *p++;
5076
5077       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5078         ccl->last_block = 1;
5079       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5080                   charset_list);
5081       charbuf += ccl->produced;
5082       if (multibytep)
5083         src += source_byteidx[ccl->consumed];
5084       else
5085         src += ccl->consumed;
5086       consumed_chars += ccl->consumed;
5087       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5088         break;
5089     }
5090
5091   switch (ccl->status)
5092     {
5093     case CCL_STAT_SUSPEND_BY_SRC:
5094       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5095       break;
5096     case CCL_STAT_SUSPEND_BY_DST:
5097       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5098       break;
5099     case CCL_STAT_QUIT:
5100     case CCL_STAT_INVALID_CMD:
5101       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5102       break;
5103     default:
5104       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5105       break;
5106     }
5107   coding->consumed_char += consumed_chars;
5108   coding->consumed = src - coding->source;
5109   coding->charbuf_used = charbuf - coding->charbuf;
5110 }
5111
5112 static bool
5113 encode_coding_ccl (struct coding_system *coding)
5114 {
5115   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5116   bool multibytep = coding->dst_multibyte;
5117   int *charbuf = coding->charbuf;
5118   int *charbuf_end = charbuf + coding->charbuf_used;
5119   unsigned char *dst = coding->destination + coding->produced;
5120   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5121   int destination_charbuf[1024];
5122   ptrdiff_t produced_chars = 0;
5123   int i;
5124   Lisp_Object attrs, charset_list;
5125
5126   CODING_GET_INFO (coding, attrs, charset_list);
5127   if (coding->consumed_char == coding->src_chars
5128       && coding->mode & CODING_MODE_LAST_BLOCK)
5129     ccl->last_block = 1;
5130
5131   do
5132     {
5133       ccl_driver (ccl, charbuf, destination_charbuf,
5134                   charbuf_end - charbuf, 1024, charset_list);
5135       if (multibytep)
5136         {
5137           ASSURE_DESTINATION (ccl->produced * 2);
5138           for (i = 0; i < ccl->produced; i++)
5139             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5140         }
5141       else
5142         {
5143           ASSURE_DESTINATION (ccl->produced);
5144           for (i = 0; i < ccl->produced; i++)
5145             *dst++ = destination_charbuf[i] & 0xFF;
5146           produced_chars += ccl->produced;
5147         }
5148       charbuf += ccl->consumed;
5149       if (ccl->status == CCL_STAT_QUIT
5150           || ccl->status == CCL_STAT_INVALID_CMD)
5151         break;
5152     }
5153   while (charbuf < charbuf_end);
5154
5155   switch (ccl->status)
5156     {
5157     case CCL_STAT_SUSPEND_BY_SRC:
5158       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5159       break;
5160     case CCL_STAT_SUSPEND_BY_DST:
5161       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5162       break;
5163     case CCL_STAT_QUIT:
5164     case CCL_STAT_INVALID_CMD:
5165       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5166       break;
5167     default:
5168       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5169       break;
5170     }
5171
5172   coding->produced_char += produced_chars;
5173   coding->produced = dst - coding->destination;
5174   return 0;
5175 }
5176
5177 \f
5178 /*** 10, 11. no-conversion handlers ***/
5179
5180 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5181
5182 static void
5183 decode_coding_raw_text (struct coding_system *coding)
5184 {
5185   bool eol_dos
5186     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5187
5188   coding->chars_at_source = 1;
5189   coding->consumed_char = coding->src_chars;
5190   coding->consumed = coding->src_bytes;
5191   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5192     {
5193       coding->consumed_char--;
5194       coding->consumed--;
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5196     }
5197   else
5198     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5199 }
5200
5201 static bool
5202 encode_coding_raw_text (struct coding_system *coding)
5203 {
5204   bool multibytep = coding->dst_multibyte;
5205   int *charbuf = coding->charbuf;
5206   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5207   unsigned char *dst = coding->destination + coding->produced;
5208   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5209   ptrdiff_t produced_chars = 0;
5210   int c;
5211
5212   if (multibytep)
5213     {
5214       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5215
5216       if (coding->src_multibyte)
5217         while (charbuf < charbuf_end)
5218           {
5219             ASSURE_DESTINATION (safe_room);
5220             c = *charbuf++;
5221             if (ASCII_CHAR_P (c))
5222               EMIT_ONE_ASCII_BYTE (c);
5223             else if (CHAR_BYTE8_P (c))
5224               {
5225                 c = CHAR_TO_BYTE8 (c);
5226                 EMIT_ONE_BYTE (c);
5227               }
5228             else
5229               {
5230                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5231
5232                 CHAR_STRING_ADVANCE (c, p1);
5233                 do
5234                   {
5235                     EMIT_ONE_BYTE (*p0);
5236                     p0++;
5237                   }
5238                 while (p0 < p1);
5239               }
5240           }
5241       else
5242         while (charbuf < charbuf_end)
5243           {
5244             ASSURE_DESTINATION (safe_room);
5245             c = *charbuf++;
5246             EMIT_ONE_BYTE (c);
5247           }
5248     }
5249   else
5250     {
5251       if (coding->src_multibyte)
5252         {
5253           int safe_room = MAX_MULTIBYTE_LENGTH;
5254
5255           while (charbuf < charbuf_end)
5256             {
5257               ASSURE_DESTINATION (safe_room);
5258               c = *charbuf++;
5259               if (ASCII_CHAR_P (c))
5260                 *dst++ = c;
5261               else if (CHAR_BYTE8_P (c))
5262                 *dst++ = CHAR_TO_BYTE8 (c);
5263               else
5264                 CHAR_STRING_ADVANCE (c, dst);
5265             }
5266         }
5267       else
5268         {
5269           ASSURE_DESTINATION (charbuf_end - charbuf);
5270           while (charbuf < charbuf_end && dst < dst_end)
5271             *dst++ = *charbuf++;
5272         }
5273       produced_chars = dst - (coding->destination + coding->produced);
5274     }
5275   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5276   coding->produced_char += produced_chars;
5277   coding->produced = dst - coding->destination;
5278   return 0;
5279 }
5280
5281 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5282    Return true if a text is encoded in a charset-based coding system.  */
5283
5284 static bool
5285 detect_coding_charset (struct coding_system *coding,
5286                        struct coding_detection_info *detect_info)
5287 {
5288   const unsigned char *src = coding->source, *src_base;
5289   const unsigned char *src_end = coding->source + coding->src_bytes;
5290   bool multibytep = coding->src_multibyte;
5291   ptrdiff_t consumed_chars = 0;
5292   Lisp_Object attrs, valids, name;
5293   int found = 0;
5294   ptrdiff_t head_ascii = coding->head_ascii;
5295   bool check_latin_extra = 0;
5296
5297   detect_info->checked |= CATEGORY_MASK_CHARSET;
5298
5299   coding = &coding_categories[coding_category_charset];
5300   attrs = CODING_ID_ATTRS (coding->id);
5301   valids = AREF (attrs, coding_attr_charset_valids);
5302   name = CODING_ID_NAME (coding->id);
5303   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5304                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5305       || strncmp (SSDATA (SYMBOL_NAME (name)),
5306                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5307     check_latin_extra = 1;
5308
5309   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5310     src += head_ascii;
5311
5312   while (1)
5313     {
5314       int c;
5315       Lisp_Object val;
5316       struct charset *charset;
5317       int dim, idx;
5318
5319       src_base = src;
5320       ONE_MORE_BYTE (c);
5321       if (c < 0)
5322         continue;
5323       val = AREF (valids, c);
5324       if (NILP (val))
5325         break;
5326       if (c >= 0x80)
5327         {
5328           if (c < 0xA0
5329               && check_latin_extra
5330               && (!VECTORP (Vlatin_extra_code_table)
5331                   || NILP (AREF (Vlatin_extra_code_table, c))))
5332             break;
5333           found = CATEGORY_MASK_CHARSET;
5334         }
5335       if (INTEGERP (val))
5336         {
5337           charset = CHARSET_FROM_ID (XFASTINT (val));
5338           dim = CHARSET_DIMENSION (charset);
5339           for (idx = 1; idx < dim; idx++)
5340             {
5341               if (src == src_end)
5342                 goto too_short;
5343               ONE_MORE_BYTE (c);
5344               if (c < charset->code_space[(dim - 1 - idx) * 4]
5345                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5346                 break;
5347             }
5348           if (idx < dim)
5349             break;
5350         }
5351       else
5352         {
5353           idx = 1;
5354           for (; CONSP (val); val = XCDR (val))
5355             {
5356               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5357               dim = CHARSET_DIMENSION (charset);
5358               while (idx < dim)
5359                 {
5360                   if (src == src_end)
5361                     goto too_short;
5362                   ONE_MORE_BYTE (c);
5363                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5364                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5365                     break;
5366                   idx++;
5367                 }
5368               if (idx == dim)
5369                 {
5370                   val = Qnil;
5371                   break;
5372                 }
5373             }
5374           if (CONSP (val))
5375             break;
5376         }
5377     }
5378  too_short:
5379   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5380   return 0;
5381
5382  no_more_source:
5383   detect_info->found |= found;
5384   return 1;
5385 }
5386
5387 static void
5388 decode_coding_charset (struct coding_system *coding)
5389 {
5390   const unsigned char *src = coding->source + coding->consumed;
5391   const unsigned char *src_end = coding->source + coding->src_bytes;
5392   const unsigned char *src_base;
5393   int *charbuf = coding->charbuf + coding->charbuf_used;
5394   /* We may produce one charset annotation in one loop and one more at
5395      the end.  */
5396   int *charbuf_end
5397     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5398   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5399   bool multibytep = coding->src_multibyte;
5400   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5401   Lisp_Object valids;
5402   ptrdiff_t char_offset = coding->produced_char;
5403   ptrdiff_t last_offset = char_offset;
5404   int last_id = charset_ascii;
5405   bool eol_dos
5406     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5407   int byte_after_cr = -1;
5408
5409   valids = AREF (attrs, coding_attr_charset_valids);
5410
5411   while (1)
5412     {
5413       int c;
5414       Lisp_Object val;
5415       struct charset *charset;
5416       int dim;
5417       int len = 1;
5418       unsigned code;
5419
5420       src_base = src;
5421       consumed_chars_base = consumed_chars;
5422
5423       if (charbuf >= charbuf_end)
5424         {
5425           if (byte_after_cr >= 0)
5426             src_base--;
5427           break;
5428         }
5429
5430       if (byte_after_cr >= 0)
5431         {
5432           c = byte_after_cr;
5433           byte_after_cr = -1;
5434         }
5435       else
5436         {
5437           ONE_MORE_BYTE (c);
5438           if (eol_dos && c == '\r')
5439             ONE_MORE_BYTE (byte_after_cr);
5440         }
5441       if (c < 0)
5442         goto invalid_code;
5443       code = c;
5444
5445       val = AREF (valids, c);
5446       if (! INTEGERP (val) && ! CONSP (val))
5447         goto invalid_code;
5448       if (INTEGERP (val))
5449         {
5450           charset = CHARSET_FROM_ID (XFASTINT (val));
5451           dim = CHARSET_DIMENSION (charset);
5452           while (len < dim)
5453             {
5454               ONE_MORE_BYTE (c);
5455               code = (code << 8) | c;
5456               len++;
5457             }
5458           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5459                               charset, code, c);
5460         }
5461       else
5462         {
5463           /* VAL is a list of charset IDs.  It is assured that the
5464              list is sorted by charset dimensions (smaller one
5465              comes first).  */
5466           while (CONSP (val))
5467             {
5468               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5469               dim = CHARSET_DIMENSION (charset);
5470               while (len < dim)
5471                 {
5472                   ONE_MORE_BYTE (c);
5473                   code = (code << 8) | c;
5474                   len++;
5475                 }
5476               CODING_DECODE_CHAR (coding, src, src_base,
5477                                   src_end, charset, code, c);
5478               if (c >= 0)
5479                 break;
5480               val = XCDR (val);
5481             }
5482         }
5483       if (c < 0)
5484         goto invalid_code;
5485       if (charset->id != charset_ascii
5486           && last_id != charset->id)
5487         {
5488           if (last_id != charset_ascii)
5489             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5490           last_id = charset->id;
5491           last_offset = char_offset;
5492         }
5493
5494       *charbuf++ = c;
5495       char_offset++;
5496       continue;
5497
5498     invalid_code:
5499       src = src_base;
5500       consumed_chars = consumed_chars_base;
5501       ONE_MORE_BYTE (c);
5502       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5503       char_offset++;
5504       coding->errors++;
5505     }
5506
5507  no_more_source:
5508   if (last_id != charset_ascii)
5509     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5510   coding->consumed_char += consumed_chars_base;
5511   coding->consumed = src_base - coding->source;
5512   coding->charbuf_used = charbuf - coding->charbuf;
5513 }
5514
5515 static bool
5516 encode_coding_charset (struct coding_system *coding)
5517 {
5518   bool multibytep = coding->dst_multibyte;
5519   int *charbuf = coding->charbuf;
5520   int *charbuf_end = charbuf + coding->charbuf_used;
5521   unsigned char *dst = coding->destination + coding->produced;
5522   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5523   int safe_room = MAX_MULTIBYTE_LENGTH;
5524   ptrdiff_t produced_chars = 0;
5525   Lisp_Object attrs, charset_list;
5526   bool ascii_compatible;
5527   int c;
5528
5529   CODING_GET_INFO (coding, attrs, charset_list);
5530   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5531
5532   while (charbuf < charbuf_end)
5533     {
5534       struct charset *charset;
5535       unsigned code;
5536
5537       ASSURE_DESTINATION (safe_room);
5538       c = *charbuf++;
5539       if (ascii_compatible && ASCII_CHAR_P (c))
5540         EMIT_ONE_ASCII_BYTE (c);
5541       else if (CHAR_BYTE8_P (c))
5542         {
5543           c = CHAR_TO_BYTE8 (c);
5544           EMIT_ONE_BYTE (c);
5545         }
5546       else
5547         {
5548           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5549                                &code, charset);
5550
5551           if (charset)
5552             {
5553               if (CHARSET_DIMENSION (charset) == 1)
5554                 EMIT_ONE_BYTE (code);
5555               else if (CHARSET_DIMENSION (charset) == 2)
5556                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5557               else if (CHARSET_DIMENSION (charset) == 3)
5558                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5559               else
5560                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5561                                  (code >> 8) & 0xFF, code & 0xFF);
5562             }
5563           else
5564             {
5565               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5566                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5567               else
5568                 c = coding->default_char;
5569               EMIT_ONE_BYTE (c);
5570             }
5571         }
5572     }
5573
5574   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5575   coding->produced_char += produced_chars;
5576   coding->produced = dst - coding->destination;
5577   return 0;
5578 }
5579
5580 \f
5581 /*** 10. C library functions ***/
5582
5583 /* Setup coding context CODING from information about CODING_SYSTEM.
5584    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5585    CODING_SYSTEM is invalid, signal an error.  */
5586
5587 void
5588 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5589 {
5590   Lisp_Object attrs;
5591   Lisp_Object eol_type;
5592   Lisp_Object coding_type;
5593   Lisp_Object val;
5594
5595   if (NILP (coding_system))
5596     coding_system = Qundecided;
5597
5598   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5599
5600   attrs = CODING_ID_ATTRS (coding->id);
5601   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5602
5603   coding->mode = 0;
5604   coding->head_ascii = -1;
5605   if (VECTORP (eol_type))
5606     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5607                             | CODING_REQUIRE_DETECTION_MASK);
5608   else if (! EQ (eol_type, Qunix))
5609     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5610                             | CODING_REQUIRE_ENCODING_MASK);
5611   else
5612     coding->common_flags = 0;
5613   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5614     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5615   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5616     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5617   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5618     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5619
5620   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5621   coding->max_charset_id = SCHARS (val) - 1;
5622   coding->safe_charsets = SDATA (val);
5623   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5624   coding->carryover_bytes = 0;
5625
5626   coding_type = CODING_ATTR_TYPE (attrs);
5627   if (EQ (coding_type, Qundecided))
5628     {
5629       coding->detector = NULL;
5630       coding->decoder = decode_coding_raw_text;
5631       coding->encoder = encode_coding_raw_text;
5632       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5633     }
5634   else if (EQ (coding_type, Qiso_2022))
5635     {
5636       int i;
5637       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5638
5639       /* Invoke graphic register 0 to plane 0.  */
5640       CODING_ISO_INVOCATION (coding, 0) = 0;
5641       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5642       CODING_ISO_INVOCATION (coding, 1)
5643         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5644       /* Setup the initial status of designation.  */
5645       for (i = 0; i < 4; i++)
5646         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5647       /* Not single shifting initially.  */
5648       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5649       /* Beginning of buffer should also be regarded as bol. */
5650       CODING_ISO_BOL (coding) = 1;
5651       coding->detector = detect_coding_iso_2022;
5652       coding->decoder = decode_coding_iso_2022;
5653       coding->encoder = encode_coding_iso_2022;
5654       if (flags & CODING_ISO_FLAG_SAFE)
5655         coding->mode |= CODING_MODE_SAFE_ENCODING;
5656       coding->common_flags
5657         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5658             | CODING_REQUIRE_FLUSHING_MASK);
5659       if (flags & CODING_ISO_FLAG_COMPOSITION)
5660         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5661       if (flags & CODING_ISO_FLAG_DESIGNATION)
5662         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5663       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5664         {
5665           setup_iso_safe_charsets (attrs);
5666           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5667           coding->max_charset_id = SCHARS (val) - 1;
5668           coding->safe_charsets = SDATA (val);
5669         }
5670       CODING_ISO_FLAGS (coding) = flags;
5671       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5672       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5673       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5674       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5675     }
5676   else if (EQ (coding_type, Qcharset))
5677     {
5678       coding->detector = detect_coding_charset;
5679       coding->decoder = decode_coding_charset;
5680       coding->encoder = encode_coding_charset;
5681       coding->common_flags
5682         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5683     }
5684   else if (EQ (coding_type, Qutf_8))
5685     {
5686       val = AREF (attrs, coding_attr_utf_bom);
5687       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5688                                    : EQ (val, Qt) ? utf_with_bom
5689                                    : utf_without_bom);
5690       coding->detector = detect_coding_utf_8;
5691       coding->decoder = decode_coding_utf_8;
5692       coding->encoder = encode_coding_utf_8;
5693       coding->common_flags
5694         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5695       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5696         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5697     }
5698   else if (EQ (coding_type, Qutf_16))
5699     {
5700       val = AREF (attrs, coding_attr_utf_bom);
5701       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5702                                     : EQ (val, Qt) ? utf_with_bom
5703                                     : utf_without_bom);
5704       val = AREF (attrs, coding_attr_utf_16_endian);
5705       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5706                                        : utf_16_little_endian);
5707       CODING_UTF_16_SURROGATE (coding) = 0;
5708       coding->detector = detect_coding_utf_16;
5709       coding->decoder = decode_coding_utf_16;
5710       coding->encoder = encode_coding_utf_16;
5711       coding->common_flags
5712         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5713       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5714         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5715     }
5716   else if (EQ (coding_type, Qccl))
5717     {
5718       coding->detector = detect_coding_ccl;
5719       coding->decoder = decode_coding_ccl;
5720       coding->encoder = encode_coding_ccl;
5721       coding->common_flags
5722         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5723             | CODING_REQUIRE_FLUSHING_MASK);
5724     }
5725   else if (EQ (coding_type, Qemacs_mule))
5726     {
5727       coding->detector = detect_coding_emacs_mule;
5728       coding->decoder = decode_coding_emacs_mule;
5729       coding->encoder = encode_coding_emacs_mule;
5730       coding->common_flags
5731         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5732       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5733           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5734         {
5735           Lisp_Object tail, safe_charsets;
5736           int max_charset_id = 0;
5737
5738           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5739                tail = XCDR (tail))
5740             if (max_charset_id < XFASTINT (XCAR (tail)))
5741               max_charset_id = XFASTINT (XCAR (tail));
5742           safe_charsets = make_uninit_string (max_charset_id + 1);
5743           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5744           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5745                tail = XCDR (tail))
5746             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5747           coding->max_charset_id = max_charset_id;
5748           coding->safe_charsets = SDATA (safe_charsets);
5749         }
5750       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5751       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5752     }
5753   else if (EQ (coding_type, Qshift_jis))
5754     {
5755       coding->detector = detect_coding_sjis;
5756       coding->decoder = decode_coding_sjis;
5757       coding->encoder = encode_coding_sjis;
5758       coding->common_flags
5759         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5760     }
5761   else if (EQ (coding_type, Qbig5))
5762     {
5763       coding->detector = detect_coding_big5;
5764       coding->decoder = decode_coding_big5;
5765       coding->encoder = encode_coding_big5;
5766       coding->common_flags
5767         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5768     }
5769   else                          /* EQ (coding_type, Qraw_text) */
5770     {
5771       coding->detector = NULL;
5772       coding->decoder = decode_coding_raw_text;
5773       coding->encoder = encode_coding_raw_text;
5774       if (! EQ (eol_type, Qunix))
5775         {
5776           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5777           if (! VECTORP (eol_type))
5778             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5779         }
5780
5781     }
5782
5783   return;
5784 }
5785
5786 /* Return a list of charsets supported by CODING.  */
5787
5788 Lisp_Object
5789 coding_charset_list (struct coding_system *coding)
5790 {
5791   Lisp_Object attrs, charset_list;
5792
5793   CODING_GET_INFO (coding, attrs, charset_list);
5794   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5795     {
5796       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5797
5798       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5799         charset_list = Viso_2022_charset_list;
5800     }
5801   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5802     {
5803       charset_list = Vemacs_mule_charset_list;
5804     }
5805   return charset_list;
5806 }
5807
5808
5809 /* Return a list of charsets supported by CODING-SYSTEM.  */
5810
5811 Lisp_Object
5812 coding_system_charset_list (Lisp_Object coding_system)
5813 {
5814   ptrdiff_t id;
5815   Lisp_Object attrs, charset_list;
5816
5817   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5818   attrs = CODING_ID_ATTRS (id);
5819
5820   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5821     {
5822       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5823
5824       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5825         charset_list = Viso_2022_charset_list;
5826       else
5827         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5828     }
5829   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5830     {
5831       charset_list = Vemacs_mule_charset_list;
5832     }
5833   else
5834     {
5835       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5836     }
5837   return charset_list;
5838 }
5839
5840
5841 /* Return raw-text or one of its subsidiaries that has the same
5842    eol_type as CODING-SYSTEM.  */
5843
5844 Lisp_Object
5845 raw_text_coding_system (Lisp_Object coding_system)
5846 {
5847   Lisp_Object spec, attrs;
5848   Lisp_Object eol_type, raw_text_eol_type;
5849
5850   if (NILP (coding_system))
5851     return Qraw_text;
5852   spec = CODING_SYSTEM_SPEC (coding_system);
5853   attrs = AREF (spec, 0);
5854
5855   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5856     return coding_system;
5857
5858   eol_type = AREF (spec, 2);
5859   if (VECTORP (eol_type))
5860     return Qraw_text;
5861   spec = CODING_SYSTEM_SPEC (Qraw_text);
5862   raw_text_eol_type = AREF (spec, 2);
5863   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5864           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5865           : AREF (raw_text_eol_type, 2));
5866 }
5867
5868
5869 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5870    the subsidiary that has the same eol-spec as PARENT (if it is not
5871    nil and specifies end-of-line format) or the system's setting
5872    (system_eol_type).  */
5873
5874 Lisp_Object
5875 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5876 {
5877   Lisp_Object spec, eol_type;
5878
5879   if (NILP (coding_system))
5880     coding_system = Qraw_text;
5881   spec = CODING_SYSTEM_SPEC (coding_system);
5882   eol_type = AREF (spec, 2);
5883   if (VECTORP (eol_type))
5884     {
5885       Lisp_Object parent_eol_type;
5886
5887       if (! NILP (parent))
5888         {
5889           Lisp_Object parent_spec;
5890
5891           parent_spec = CODING_SYSTEM_SPEC (parent);
5892           parent_eol_type = AREF (parent_spec, 2);
5893           if (VECTORP (parent_eol_type))
5894             parent_eol_type = system_eol_type;
5895         }
5896       else
5897         parent_eol_type = system_eol_type;
5898       if (EQ (parent_eol_type, Qunix))
5899         coding_system = AREF (eol_type, 0);
5900       else if (EQ (parent_eol_type, Qdos))
5901         coding_system = AREF (eol_type, 1);
5902       else if (EQ (parent_eol_type, Qmac))
5903         coding_system = AREF (eol_type, 2);
5904     }
5905   return coding_system;
5906 }
5907
5908
5909 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5910    decided for writing to a process.  If not, complement them, and
5911    return a new coding system.  */
5912
5913 Lisp_Object
5914 complement_process_encoding_system (Lisp_Object coding_system)
5915 {
5916   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5917   Lisp_Object spec, attrs;
5918   int i;
5919
5920   for (i = 0; i < 3; i++)
5921     {
5922       if (i == 1)
5923         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5924       else if (i == 2)
5925         coding_system = preferred_coding_system ();
5926       spec = CODING_SYSTEM_SPEC (coding_system);
5927       if (NILP (spec))
5928         continue;
5929       attrs = AREF (spec, 0);
5930       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5931         coding_base = CODING_ATTR_BASE_NAME (attrs);
5932       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5933         eol_base = coding_system;
5934       if (! NILP (coding_base) && ! NILP (eol_base))
5935         break;
5936     }
5937
5938   if (i > 0)
5939     /* The original CODING_SYSTEM didn't specify text-conversion or
5940        eol-conversion.  Be sure that we return a fully complemented
5941        coding system.  */
5942     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5943   return coding_system;
5944 }
5945
5946
5947 /* Emacs has a mechanism to automatically detect a coding system if it
5948    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5949    it's impossible to distinguish some coding systems accurately
5950    because they use the same range of codes.  So, at first, coding
5951    systems are grouped into the following categories:
5952
5953    o coding-category-emacs-mule
5954
5955         The category for a coding system which has the same code range
5956         as Emacs' internal format.  Assigned the coding-system (Lisp
5957         symbol) `emacs-mule' by default.
5958
5959    o coding-category-sjis
5960
5961         The category for a coding system which has the same code range
5962         as SJIS.  Assigned the coding-system (Lisp
5963         symbol) `japanese-shift-jis' by default.
5964
5965    o coding-category-iso-7
5966
5967         The category for a coding system which has the same code range
5968         as ISO2022 of 7-bit environment.  This doesn't use any locking
5969         shift and single shift functions.  This can encode/decode all
5970         charsets.  Assigned the coding-system (Lisp symbol)
5971         `iso-2022-7bit' by default.
5972
5973    o coding-category-iso-7-tight
5974
5975         Same as coding-category-iso-7 except that this can
5976         encode/decode only the specified charsets.
5977
5978    o coding-category-iso-8-1
5979
5980         The category for a coding system which has the same code range
5981         as ISO2022 of 8-bit environment and graphic plane 1 used only
5982         for DIMENSION1 charset.  This doesn't use any locking shift
5983         and single shift functions.  Assigned the coding-system (Lisp
5984         symbol) `iso-latin-1' by default.
5985
5986    o coding-category-iso-8-2
5987
5988         The category for a coding system which has the same code range
5989         as ISO2022 of 8-bit environment and graphic plane 1 used only
5990         for DIMENSION2 charset.  This doesn't use any locking shift
5991         and single shift functions.  Assigned the coding-system (Lisp
5992         symbol) `japanese-iso-8bit' by default.
5993
5994    o coding-category-iso-7-else
5995
5996         The category for a coding system which has the same code range
5997         as ISO2022 of 7-bit environment but uses locking shift or
5998         single shift functions.  Assigned the coding-system (Lisp
5999         symbol) `iso-2022-7bit-lock' by default.
6000
6001    o coding-category-iso-8-else
6002
6003         The category for a coding system which has the same code range
6004         as ISO2022 of 8-bit environment but uses locking shift or
6005         single shift functions.  Assigned the coding-system (Lisp
6006         symbol) `iso-2022-8bit-ss2' by default.
6007
6008    o coding-category-big5
6009
6010         The category for a coding system which has the same code range
6011         as BIG5.  Assigned the coding-system (Lisp symbol)
6012         `cn-big5' by default.
6013
6014    o coding-category-utf-8
6015
6016         The category for a coding system which has the same code range
6017         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6018         symbol) `utf-8' by default.
6019
6020    o coding-category-utf-16-be
6021
6022         The category for a coding system in which a text has a
6023         Unicode signature (cf. Unicode Standard) in the order of BIG
6024         endian at the head.  Assigned the coding-system (Lisp symbol)
6025         `utf-16-be' by default.
6026
6027    o coding-category-utf-16-le
6028
6029         The category for a coding system in which a text has a
6030         Unicode signature (cf. Unicode Standard) in the order of
6031         LITTLE endian at the head.  Assigned the coding-system (Lisp
6032         symbol) `utf-16-le' by default.
6033
6034    o coding-category-ccl
6035
6036         The category for a coding system of which encoder/decoder is
6037         written in CCL programs.  The default value is nil, i.e., no
6038         coding system is assigned.
6039
6040    o coding-category-binary
6041
6042         The category for a coding system not categorized in any of the
6043         above.  Assigned the coding-system (Lisp symbol)
6044         `no-conversion' by default.
6045
6046    Each of them is a Lisp symbol whose value is an actual `coding-system'
6047    (itself a Lisp symbol) assigned by a user.  What Emacs actually detects
6048    is the category of a coding system; it then uses the `coding-system'
6049    assigned to that category.  If Emacs can't narrow the candidates down
6050    to a single category, it selects the category of the highest
6051    priority.  Priorities of categories are also specified by a user in
6052    a Lisp variable `coding-category-list'.
6053
6054 */
6055
6056 #define EOL_SEEN_NONE   0
6057 #define EOL_SEEN_LF     1
6058 #define EOL_SEEN_CR     2
6059 #define EOL_SEEN_CRLF   4
6060
6061 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6062    SOURCE is encoded.  If CATEGORY is one of
6063    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6064    two-byte, else they are encoded by one-byte.
6065
6066    Return one of EOL_SEEN_XXX.  */
6067
6068 #define MAX_EOL_CHECK_COUNT 3
6069
6070 static int
6071 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6072             enum coding_category category)
6073 {
6074   const unsigned char *src = source, *src_end = src + src_bytes;
6075   unsigned char c;
6076   int total  = 0;
6077   int eol_seen = EOL_SEEN_NONE;
6078
6079   if ((1 << category) & CATEGORY_MASK_UTF_16)
6080     {
6081       bool msb = category == (coding_category_utf_16_le
6082                               | coding_category_utf_16_le_nosig);
6083       bool lsb = !msb;
6084
6085       while (src + 1 < src_end)
6086         {
6087           c = src[lsb];
6088           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6089             {
6090               int this_eol;
6091
6092               if (c == '\n')
6093                 this_eol = EOL_SEEN_LF;
6094               else if (src + 3 >= src_end
6095                        || src[msb + 2] != 0
6096                        || src[lsb + 2] != '\n')
6097                 this_eol = EOL_SEEN_CR;
6098               else
6099                 {
6100                   this_eol = EOL_SEEN_CRLF;
6101                   src += 2;
6102                 }
6103
6104               if (eol_seen == EOL_SEEN_NONE)
6105                 /* This is the first end-of-line.  */
6106                 eol_seen = this_eol;
6107               else if (eol_seen != this_eol)
6108                 {
6109                   /* The found type is different from what found before.
6110                      Allow for stray ^M characters in DOS EOL files.  */
6111                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6112                       || (eol_seen == EOL_SEEN_CRLF
6113                           && this_eol == EOL_SEEN_CR))
6114                     eol_seen = EOL_SEEN_CRLF;
6115                   else
6116                     {
6117                       eol_seen = EOL_SEEN_LF;
6118                       break;
6119                     }
6120                 }
6121               if (++total == MAX_EOL_CHECK_COUNT)
6122                 break;
6123             }
6124           src += 2;
6125         }
6126     }
6127   else
6128     while (src < src_end)
6129       {
6130         c = *src++;
6131         if (c == '\n' || c == '\r')
6132           {
6133             int this_eol;
6134
6135             if (c == '\n')
6136               this_eol = EOL_SEEN_LF;
6137             else if (src >= src_end || *src != '\n')
6138               this_eol = EOL_SEEN_CR;
6139             else
6140               this_eol = EOL_SEEN_CRLF, src++;
6141
6142             if (eol_seen == EOL_SEEN_NONE)
6143               /* This is the first end-of-line.  */
6144               eol_seen = this_eol;
6145             else if (eol_seen != this_eol)
6146               {
6147                 /* The found type is different from what found before.
6148                    Allow for stray ^M characters in DOS EOL files.  */
6149                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6150                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6151                   eol_seen = EOL_SEEN_CRLF;
6152                 else
6153                   {
6154                     eol_seen = EOL_SEEN_LF;
6155                     break;
6156                   }
6157               }
6158             if (++total == MAX_EOL_CHECK_COUNT)
6159               break;
6160           }
6161       }
6162   return eol_seen;
6163 }
6164
6165
6166 static Lisp_Object
6167 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6168 {
6169   Lisp_Object eol_type;
6170
6171   eol_type = CODING_ID_EOL_TYPE (coding->id);
6172   if (eol_seen & EOL_SEEN_LF)
6173     {
6174       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6175       eol_type = Qunix;
6176     }
6177   else if (eol_seen & EOL_SEEN_CRLF)
6178     {
6179       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6180       eol_type = Qdos;
6181     }
6182   else if (eol_seen & EOL_SEEN_CR)
6183     {
6184       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6185       eol_type = Qmac;
6186     }
6187   return eol_type;
6188 }
6189
6190 /* Detect how a text specified in CODING is encoded.  If a coding
6191    system is detected, update fields of CODING by the detected coding
6192    system.  */
6193
6194 static void
6195 detect_coding (struct coding_system *coding)
6196 {
6197   const unsigned char *src, *src_end;
6198   unsigned int saved_mode = coding->mode;
6199
6200   coding->consumed = coding->consumed_char = 0;
6201   coding->produced = coding->produced_char = 0;
6202   coding_set_source (coding);
6203
6204   src_end = coding->source + coding->src_bytes;
6205   coding->head_ascii = 0;
6206
6207   /* If we have not yet decided the text encoding type, detect it
6208      now.  */
6209   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6210     {
6211       int c, i;
6212       struct coding_detection_info detect_info;
6213       bool null_byte_found = 0, eight_bit_found = 0;
6214
6215       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6216       for (src = coding->source; src < src_end; src++)
6217         {
6218           c = *src;
6219           if (c & 0x80)
6220             {
6221               eight_bit_found = 1;
6222               if (null_byte_found)
6223                 break;
6224             }
6225           else if (c < 0x20)
6226             {
6227               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6228                   && ! inhibit_iso_escape_detection
6229                   && ! detect_info.checked)
6230                 {
6231                   if (detect_coding_iso_2022 (coding, &detect_info))
6232                     {
6233                       /* We have scanned the whole data.  */
6234                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6235                         {
6236                           /* We didn't find an 8-bit code.  We may
6237                              have found a null-byte, but it's very
6238                              rare that a binary file conforms to
6239                              ISO-2022.  */
6240                           src = src_end;
6241                           coding->head_ascii = src - coding->source;
6242                         }
6243                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6244                       break;
6245                     }
6246                 }
6247               else if (! c && !inhibit_null_byte_detection)
6248                 {
6249                   null_byte_found = 1;
6250                   if (eight_bit_found)
6251                     break;
6252                 }
6253               if (! eight_bit_found)
6254                 coding->head_ascii++;
6255             }
6256           else if (! eight_bit_found)
6257             coding->head_ascii++;
6258         }
6259
6260       if (null_byte_found || eight_bit_found
6261           || coding->head_ascii < coding->src_bytes
6262           || detect_info.found)
6263         {
6264           enum coding_category category;
6265           struct coding_system *this;
6266
6267           if (coding->head_ascii == coding->src_bytes)
6268             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6269             for (i = 0; i < coding_category_raw_text; i++)
6270               {
6271                 category = coding_priorities[i];
6272                 this = coding_categories + category;
6273                 if (detect_info.found & (1 << category))
6274                   break;
6275               }
6276           else
6277             {
6278               if (null_byte_found)
6279                 {
6280                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6281                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6282                 }
6283               for (i = 0; i < coding_category_raw_text; i++)
6284                 {
6285                   category = coding_priorities[i];
6286                   this = coding_categories + category;
6287                   if (this->id < 0)
6288                     {
6289                       /* No coding system of this category is defined.  */
6290                       detect_info.rejected |= (1 << category);
6291                     }
6292                   else if (category >= coding_category_raw_text)
6293                     continue;
6294                   else if (detect_info.checked & (1 << category))
6295                     {
6296                       if (detect_info.found & (1 << category))
6297                         break;
6298                     }
6299                   else if ((*(this->detector)) (coding, &detect_info)
6300                            && detect_info.found & (1 << category))
6301                     {
6302                       if (category == coding_category_utf_16_auto)
6303                         {
6304                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6305                             category = coding_category_utf_16_le;
6306                           else
6307                             category = coding_category_utf_16_be;
6308                         }
6309                       break;
6310                     }
6311                 }
6312             }
6313
6314           if (i < coding_category_raw_text)
6315             setup_coding_system (CODING_ID_NAME (this->id), coding);
6316           else if (null_byte_found)
6317             setup_coding_system (Qno_conversion, coding);
6318           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6319                    == CATEGORY_MASK_ANY)
6320             setup_coding_system (Qraw_text, coding);
6321           else if (detect_info.rejected)
6322             for (i = 0; i < coding_category_raw_text; i++)
6323               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6324                 {
6325                   this = coding_categories + coding_priorities[i];
6326                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6327                   break;
6328                 }
6329         }
6330     }
6331   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6332            == coding_category_utf_8_auto)
6333     {
6334       Lisp_Object coding_systems;
6335       struct coding_detection_info detect_info;
6336
6337       coding_systems
6338         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6339       detect_info.found = detect_info.rejected = 0;
6340       coding->head_ascii = 0;
6341       if (CONSP (coding_systems)
6342           && detect_coding_utf_8 (coding, &detect_info))
6343         {
6344           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6345             setup_coding_system (XCAR (coding_systems), coding);
6346           else
6347             setup_coding_system (XCDR (coding_systems), coding);
6348         }
6349     }
6350   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6351            == coding_category_utf_16_auto)
6352     {
6353       Lisp_Object coding_systems;
6354       struct coding_detection_info detect_info;
6355
6356       coding_systems
6357         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6358       detect_info.found = detect_info.rejected = 0;
6359       coding->head_ascii = 0;
6360       if (CONSP (coding_systems)
6361           && detect_coding_utf_16 (coding, &detect_info))
6362         {
6363           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6364             setup_coding_system (XCAR (coding_systems), coding);
6365           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6366             setup_coding_system (XCDR (coding_systems), coding);
6367         }
6368     }
6369   coding->mode = saved_mode;
6370 }
6371
6372
6373 static void
6374 decode_eol (struct coding_system *coding)
6375 {
6376   Lisp_Object eol_type;
6377   unsigned char *p, *pbeg, *pend;
6378
6379   eol_type = CODING_ID_EOL_TYPE (coding->id);
6380   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6381     return;
6382
6383   if (NILP (coding->dst_object))
6384     pbeg = coding->destination;
6385   else
6386     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6387   pend = pbeg + coding->produced;
6388
6389   if (VECTORP (eol_type))
6390     {
6391       int eol_seen = EOL_SEEN_NONE;
6392
6393       for (p = pbeg; p < pend; p++)
6394         {
6395           if (*p == '\n')
6396             eol_seen |= EOL_SEEN_LF;
6397           else if (*p == '\r')
6398             {
6399               if (p + 1 < pend && *(p + 1) == '\n')
6400                 {
6401                   eol_seen |= EOL_SEEN_CRLF;
6402                   p++;
6403                 }
6404               else
6405                 eol_seen |= EOL_SEEN_CR;
6406             }
6407         }
6408       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6409       if ((eol_seen & EOL_SEEN_CRLF) != 0
6410           && (eol_seen & EOL_SEEN_CR) != 0
6411           && (eol_seen & EOL_SEEN_LF) == 0)
6412         eol_seen = EOL_SEEN_CRLF;
6413       else if (eol_seen != EOL_SEEN_NONE
6414           && eol_seen != EOL_SEEN_LF
6415           && eol_seen != EOL_SEEN_CRLF
6416           && eol_seen != EOL_SEEN_CR)
6417         eol_seen = EOL_SEEN_LF;
6418       if (eol_seen != EOL_SEEN_NONE)
6419         eol_type = adjust_coding_eol_type (coding, eol_seen);
6420     }
6421
6422   if (EQ (eol_type, Qmac))
6423     {
6424       for (p = pbeg; p < pend; p++)
6425         if (*p == '\r')
6426           *p = '\n';
6427     }
6428   else if (EQ (eol_type, Qdos))
6429     {
6430       ptrdiff_t n = 0;
6431
6432       if (NILP (coding->dst_object))
6433         {
6434           /* Start deleting '\r' from the tail to minimize the memory
6435              movement.  */
6436           for (p = pend - 2; p >= pbeg; p--)
6437             if (*p == '\r')
6438               {
6439                 memmove (p, p + 1, pend-- - p - 1);
6440                 n++;
6441               }
6442         }
6443       else
6444         {
6445           ptrdiff_t pos_byte = coding->dst_pos_byte;
6446           ptrdiff_t pos = coding->dst_pos;
6447           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6448
6449           while (pos < pos_end)
6450             {
6451               p = BYTE_POS_ADDR (pos_byte);
6452               if (*p == '\r' && p[1] == '\n')
6453                 {
6454                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6455                   n++;
6456                   pos_end--;
6457                 }
6458               pos++;
6459               if (coding->dst_multibyte)
6460                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6461               else
6462                 pos_byte++;
6463             }
6464         }
6465       coding->produced -= n;
6466       coding->produced_char -= n;
6467     }
6468 }
6469
6470
6471 /* Return a translation table (or list of them) from coding system
6472    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6473    not ENCODEP). */
6474
6475 static Lisp_Object
6476 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6477 {
6478   Lisp_Object standard, translation_table;
6479   Lisp_Object val;
6480
6481   if (NILP (Venable_character_translation))
6482     {
6483       if (max_lookup)
6484         *max_lookup = 0;
6485       return Qnil;
6486     }
6487   if (encodep)
6488     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6489       standard = Vstandard_translation_table_for_encode;
6490   else
6491     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6492       standard = Vstandard_translation_table_for_decode;
6493   if (NILP (translation_table))
6494     translation_table = standard;
6495   else
6496     {
6497       if (SYMBOLP (translation_table))
6498         translation_table = Fget (translation_table, Qtranslation_table);
6499       else if (CONSP (translation_table))
6500         {
6501           translation_table = Fcopy_sequence (translation_table);
6502           for (val = translation_table; CONSP (val); val = XCDR (val))
6503             if (SYMBOLP (XCAR (val)))
6504               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6505         }
6506       if (CHAR_TABLE_P (standard))
6507         {
6508           if (CONSP (translation_table))
6509             translation_table = nconc2 (translation_table,
6510                                         Fcons (standard, Qnil));
6511           else
6512             translation_table = Fcons (translation_table,
6513                                        Fcons (standard, Qnil));
6514         }
6515     }
6516
6517   if (max_lookup)
6518     {
6519       *max_lookup = 1;
6520       if (CHAR_TABLE_P (translation_table)
6521           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6522         {
6523           val = XCHAR_TABLE (translation_table)->extras[1];
6524           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6525             *max_lookup = XFASTINT (val);
6526         }
6527       else if (CONSP (translation_table))
6528         {
6529           Lisp_Object tail;
6530
6531           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6532             if (CHAR_TABLE_P (XCAR (tail))
6533                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6534               {
6535                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6536                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6537                   *max_lookup = XFASTINT (tailval);
6538               }
6539         }
6540     }
6541   return translation_table;
6542 }
6543
/* Look up character C in TABLE and store the result in C and TRANS.

   TABLE is a char-table, or a list whose char-table elements are
   consulted in order (other elements are ignored).  Whenever the
   value found is a character, C is replaced by it and TRANS is reset
   to Qnil; in a list, the following tables are then consulted with
   the new C.  Any other non-nil value found is left in TRANS, and in
   a list the search stops there.  If TABLE is neither a char-table
   nor a cons, TRANS is Qnil and C is unchanged.

   C and TRANS are assigned to, so both must be lvalues; TABLE and C
   are evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6568
6569
6570 /* Return a translation of character(s) at BUF according to TRANS.
6571    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6572    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6573    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6574    translation is found, and Qnil if not found..
6575    If BUF is too short to lookup characters in FROM, return Qt.  */
6576
6577 static Lisp_Object
6578 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6579 {
6580
6581   if (INTEGERP (trans))
6582     return trans;
6583   for (; CONSP (trans); trans = XCDR (trans))
6584     {
6585       Lisp_Object val = XCAR (trans);
6586       Lisp_Object from = XCAR (val);
6587       ptrdiff_t len = ASIZE (from);
6588       ptrdiff_t i;
6589
6590       for (i = 0; i < len; i++)
6591         {
6592           if (buf + i == buf_end)
6593             return Qt;
6594           if (XINT (AREF (from, i)) != buf[i])
6595             break;
6596         }
6597       if (i == len)
6598         return val;
6599     }
6600   return Qnil;
6601 }
6602
6603
/* Store in the destination of CODING the characters decoded so far,
   starting at CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are the
   CODING->charbuf_used elements of CODING->charbuf.  Each non-negative
   element is a character; it is looked up in TRANSLATION_TABLE and
   the result (one or more characters, possibly standing for several
   source characters) is stored.  A negative element -N starts an
   annotation datum that is N elements long; it is skipped here.

   Otherwise the CODING->consumed bytes at CODING->source are stored:
   copied as they are if CODING->src_multibyte equals
   CODING->dst_multibyte, and otherwise passed one by one through
   ONE_MORE_BYTE or EMIT_ONE_BYTE (defined elsewhere in this file).

   LAST_BLOCK means no more characters will follow, so a translation
   that would need more characters than remain in the charbuf is not
   waited for.

   The destination is enlarged with alloc_destination when needed.
   CODING->produced and CODING->produced_char are advanced, and
   insert_from_gap is called if the destination is a buffer and at
   least one character was produced.

   Return the number of elements at the tail of CODING->charbuf that
   were left unprocessed (always 0 if CODING->chars_at_source is
   nonzero).  */
static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               bool last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the output
             is limited by the end of the source bytes consumed so
             far.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* Number of source characters replaced, and number of
                 characters they are replaced with.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  /* Qt means the charbuf is too short to decide;
                     leave the rest as carryover unless this is the
                     last block.  */
                  else if (EQ (trans, Qt) && ! last_block)
                    break;
                }

              /* Make sure there is room for TO_NCHARS characters of
                 the maximum multibyte length.  */
              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Check that the size requested below does not
                     overflow.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; when
                     TO_NCHARS > 1, TRANS is the vector of target
                     characters.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): MULTIBYTEP, CONSUMED_CHARS and the
                 label no_more_source are not referenced directly
                 here; presumably ONE_MORE_BYTE uses them (and jumps
                 to the label when SRC reaches SRC_END) -- confirm
                 against the macro definition.  */
              bool multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* When converting in place, the room grows as
                         the source is read; recheck before
                         allocating.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* The source may be relocated by the
                             allocation; remember SRC as an offset.  */
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): MULTIBYTEP is presumably read by
                   EMIT_ONE_BYTE -- confirm against the macro.  */
                bool multibytep = 1;
                int c = *src++;

                /* Keep room for two bytes per source byte.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides: plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes stored by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6802
6803 /* Compose text in CODING->object according to the annotation data at
6804    CHARBUF.  CHARBUF is an array:
6805      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6806  */
6807
6808 static inline void
6809 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6810 {
6811   int len;
6812   ptrdiff_t to;
6813   enum composition_method method;
6814   Lisp_Object components;
6815
6816   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6817   to = pos + charbuf[2];
6818   method = (enum composition_method) (charbuf[4]);
6819
6820   if (method == COMPOSITION_RELATIVE)
6821     components = Qnil;
6822   else
6823     {
6824       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6825       int i, j;
6826
6827       if (method == COMPOSITION_WITH_RULE)
6828         len = charbuf[2] * 3 - 2;
6829       charbuf += MAX_ANNOTATION_LENGTH;
6830       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6831       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6832         {
6833           if (charbuf[i] >= 0)
6834             args[j] = make_number (charbuf[i]);
6835           else
6836             {
6837               i++;
6838               args[j] = make_number (charbuf[i] % 0x100);
6839             }
6840         }
6841       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6842     }
6843   compose_text (pos, to, components, Qnil, coding->dst_object);
6844 }
6845
6846
6847 /* Put `charset' property on text in CODING->object according to
6848    the annotation data at CHARBUF.  CHARBUF is an array:
6849      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6850  */
6851
6852 static inline void
6853 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6854 {
6855   ptrdiff_t from = pos - charbuf[2];
6856   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6857
6858   Fput_text_property (make_number (from), make_number (pos),
6859                       Qcharset, CHARSET_NAME (charset),
6860                       coding->dst_object);
6861 }
6862
6863
/* Number of `int' elements first requested for the work area
   CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size (in elements) in CODING->charbuf_size.  CHARBUF_SIZE elements
   are tried first; as long as alloca returns NULL the size is halved,
   and the attempts stop once the size is no longer above 1024.  If
   no attempt succeeds, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the macro executes `return;', so it can only be used in a
   function returning void, and any cleanup placed after it in that
   function is skipped.  Because of alloca, the work area is only
   valid until the calling function returns.

   NOTE(review): alloca usually has no way to report failure, so the
   retry loop and the failure branch are presumably never taken --
   confirm for the supported platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * size);                 \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return;                                                         \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6885
6886
6887 static void
6888 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6889 {
6890   int *charbuf = coding->charbuf;
6891   int *charbuf_end = charbuf + coding->charbuf_used;
6892
6893   if (NILP (coding->dst_object))
6894     return;
6895
6896   while (charbuf < charbuf_end)
6897     {
6898       if (*charbuf >= 0)
6899         pos++, charbuf++;
6900       else
6901         {
6902           int len = -*charbuf;
6903
6904           if (len > 2)
6905             switch (charbuf[1])
6906               {
6907               case CODING_ANNOTATE_COMPOSITION_MASK:
6908                 produce_composition (coding, charbuf, pos);
6909                 break;
6910               case CODING_ANNOTATE_CHARSET_MASK:
6911                 produce_charset (coding, charbuf, pos);
6912                 break;
6913               }
6914           charbuf += len;
6915         }
6916     }
6917 }
6918
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   On return CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char tell how much was read and written, and
   CODING->result tells the outcome.  Source bytes that could not be
   decoded are either flushed as raw bytes (if CODING->mode has
   CODING_MODE_LAST_BLOCK) or saved in CODING->carryover, with their
   count in CODING->carryover_bytes.
*/

static void
decode_coding (struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the source text in a buffer straddles the gap, move the gap
     to the start of the text.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);

      /* We must disable undo_list in order to record the whole insert
         transaction via record_insert at the end.  But doing so also
         disables the recording of the first change to the undo_list.
         Therefore we check for first change here and record it via
         record_first_change if needed.  */
      if (MODIFF <= SAVE_MODIFF)
        record_first_change ();

      undo_list = BVAR (current_buffer, undo_list);
      bset_undo_list (current_buffer, Qt);
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* Note: this macro contains a `return' for the case that the work
     area cannot be allocated; the undo list saved above is not
     restored on that path.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Decode one charbuf-full of source at a time and store it in the
     destination, until the source is used up or the decoder reports a
     result other than success, invalid source or insufficient
     destination.  */
  do
    {
      /* Position of the first character produced in this round.  */
      ptrdiff_t pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      /* The characters carried over from the previous round are
         already at the head of the charbuf.  */
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the characters that produce_chars left unprocessed to
         the head of the charbuf.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  if (carryover > 0)
    {
      /* Nothing more will come, so store the remaining characters
         with LAST_BLOCK set.  */
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      /* Some source bytes at the tail were not decoded.  */
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation is applied to these raw bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Restore the undo list and record the whole insertion as one
         change.  */
      bset_undo_list (current_buffer, undo_list);
      record_insert (coding->dst_pos, coding->produced_char);
    }
}
7075
7076
7077 /* Extract an annotation datum from a composition starting at POS and
7078    ending before LIMIT of CODING->src_object (buffer or string), store
7079    the data in BUF, set *STOP to a starting position of the next
7080    composition (if any) or to LIMIT, and return the address of the
7081    next element of BUF.
7082
7083    If such an annotation is not found, set *STOP to a starting
7084    position of a composition after POS (if any) or to LIMIT, and
7085    return BUF.  */
7086
7087 static inline int *
7088 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7089                                struct coding_system *coding, int *buf,
7090                                ptrdiff_t *stop)
7091 {
7092   ptrdiff_t start, end;
7093   Lisp_Object prop;
7094
7095   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7096       || end > limit)
7097     *stop = limit;
7098   else if (start > pos)
7099     *stop = start;
7100   else
7101     {
7102       if (start == pos)
7103         {
7104           /* We found a composition.  Store the corresponding
7105              annotation data in BUF.  */
7106           int *head = buf;
7107           enum composition_method method = COMPOSITION_METHOD (prop);
7108           int nchars = COMPOSITION_LENGTH (prop);
7109
7110           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7111           if (method != COMPOSITION_RELATIVE)
7112             {
7113               Lisp_Object components;
7114               ptrdiff_t i, len, i_byte;
7115
7116               components = COMPOSITION_COMPONENTS (prop);
7117               if (VECTORP (components))
7118                 {
7119                   len = ASIZE (components);
7120                   for (i = 0; i < len; i++)
7121                     *buf++ = XINT (AREF (components, i));
7122                 }
7123               else if (STRINGP (components))
7124                 {
7125                   len = SCHARS (components);
7126                   i = i_byte = 0;
7127                   while (i < len)
7128                     {
7129                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7130                       buf++;
7131                     }
7132                 }
7133               else if (INTEGERP (components))
7134                 {
7135                   len = 1;
7136                   *buf++ = XINT (components);
7137                 }
7138               else if (CONSP (components))
7139                 {
7140                   for (len = 0; CONSP (components);
7141                        len++, components = XCDR (components))
7142                     *buf++ = XINT (XCAR (components));
7143                 }
7144               else
7145                 emacs_abort ();
7146               *head -= len;
7147             }
7148         }
7149
7150       if (find_composition (end, limit, &start, &end, &prop,
7151                             coding->src_object)
7152           && end <= limit)
7153         *stop = start;
7154       else
7155         *stop = limit;
7156     }
7157   return buf;
7158 }
7159
7160
7161 /* Extract an annotation datum from a text property `charset' at POS of
7162    CODING->src_object (buffer of string), store the data in BUF, set
7163    *STOP to the position where the value of `charset' property changes
7164    (limiting by LIMIT), and return the address of the next element of
7165    BUF.
7166
7167    If the property value is nil, set *STOP to the position where the
7168    property value is non-nil (limiting by LIMIT), and return BUF.  */
7169
7170 static inline int *
7171 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7172                            struct coding_system *coding, int *buf,
7173                            ptrdiff_t *stop)
7174 {
7175   Lisp_Object val, next;
7176   int id;
7177
7178   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7179   if (! NILP (val) && CHARSETP (val))
7180     id = XINT (CHARSET_SYMBOL_ID (val));
7181   else
7182     id = -1;
7183   ADD_CHARSET_DATA (buf, 0, id);
7184   next = Fnext_single_property_change (make_number (pos), Qcharset,
7185                                        coding->src_object,
7186                                        make_number (limit));
7187   *stop = XINT (next);
7188   return buf;
7189 }
7190
7191
/* Read characters from the source text of CODING into
   CODING->charbuf, starting after the CODING->consumed bytes that were
   already handled.

   While reading, convert `\n' according to the eol-type of CODING
   (unless inhibit_eol_conversion is set), store the annotation data
   produced by handle_composition_annotation and
   handle_charset_annotation at the positions where they apply, and,
   if TRANSLATION_TABLE is non-nil, translate characters through it.
   MAX_LOOKUP is the number of elements allocated for the lookup
   buffer that is passed to get_translation.

   Reading stops when the end of the source text is reached, when
   CODING->charbuf is nearly full, or when a translation cannot be
   stored.  On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated accordingly and
   CODING->chars_at_source is reset to 0.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  bool multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the next position at which annotations must be looked
     at; it is the smaller of the two per-annotation stops.  */
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookup buffer is only needed when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector eol-type is handled like `unix', i.e. `\n' is left
     alone.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* Without a source object there are no text properties to
     annotate, so never stop before the end.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C, advancing SRC and POS.  */
      if (! multibytep)
        {
          int bytes;

          /* The raw-text and CCL encoders take each source byte as
             is.  For the others, a byte sequence that
             MULTIBYTE_LENGTH accepts is read as one character, and
             any other byte goes through BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* In selective-display mode `\r' is treated like `\n'.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      /* End-of-line conversion: for `dos' emit `\r' before the `\n';
         for any other non-unix type replace `\n' with `\r'.  */
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             translated ones.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Peek at up to MAX_LOOKUP characters (C included) without
             moving SRC, so that get_translation can match a
             sequence.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* TRANS is (FROM . TO) where FROM is a vector and TO is
                 either a character or a vector of characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* NOTE(review): SRC and POS already point past C
                     here, so on this early exit C is counted as
                     consumed although nothing was stored for it --
                     confirm that this is intended.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters covered by the
             translation.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7326
7327
7328 /* Encode the text at CODING->src_object into CODING->dst_object.
7329    CODING->src_object is a buffer or a string.
7330    CODING->dst_object is a buffer or nil.
7331
7332    If CODING->src_object is a buffer, it must be the current buffer.
7333    In this case, if CODING->src_pos is positive, it is a position of
7334    the source text in the buffer, otherwise. the source text is in the
7335    gap area of the buffer, and coding->src_pos specifies the offset of
7336    the text from GPT (which must be the same as PT).  If this is the
7337    same buffer as CODING->dst_object, CODING->src_pos must be
7338    negative and CODING should not have `pre-write-conversion'.
7339
7340    If CODING->src_object is a string, CODING should not have
7341    `pre-write-conversion'.
7342
7343    If CODING->dst_object is a buffer, the encoded data is inserted at
7344    the current point of that buffer.
7345
7346    If CODING->dst_object is nil, the encoded data is placed at the
7347    memory area specified by CODING->destination.  */
7348
7349 static void
7350 encode_coding (struct coding_system *coding)
7351 {
7352   Lisp_Object attrs;
7353   Lisp_Object translation_table;
7354   int max_lookup;
7355   struct ccl_spec cclspec;
7356
7357   attrs = CODING_ID_ATTRS (coding->id);
7358   if (coding->encoder == encode_coding_raw_text)
7359     translation_table = Qnil, max_lookup = 0;
7360   else
7361     translation_table = get_translation_table (attrs, 1, &max_lookup);
7362
7363   if (BUFFERP (coding->dst_object))
7364     {
7365       set_buffer_internal (XBUFFER (coding->dst_object));
7366       coding->dst_multibyte
7367         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7368     }
7369
7370   coding->consumed = coding->consumed_char = 0;
7371   coding->produced = coding->produced_char = 0;
7372   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7373   coding->errors = 0;
7374
7375   ALLOC_CONVERSION_WORK_AREA (coding);
7376
7377   if (coding->encoder == encode_coding_ccl)
7378     {
7379       coding->spec.ccl = &cclspec;
7380       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7381     }
7382   do {
7383     coding_set_source (coding);
7384     consume_chars (coding, translation_table, max_lookup);
7385     coding_set_destination (coding);
7386     (*(coding->encoder)) (coding);
7387   } while (coding->consumed_char < coding->src_chars);
7388
7389   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7390     insert_from_gap (coding->produced_char, coding->produced);
7391 }
7392
7393
/* Name (or base name) of the work buffer used for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after their use is finished, and their names are
   modified versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  It is
   set by make_conversion_work_buffer when that buffer is handed out
   and cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
7406
7407
7408 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7409    multibyteness of returning buffer.  */
7410
7411 static Lisp_Object
7412 make_conversion_work_buffer (bool multibyte)
7413 {
7414   Lisp_Object name, workbuf;
7415   struct buffer *current;
7416
7417   if (reused_workbuf_in_use)
7418     {
7419       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7420       workbuf = Fget_buffer_create (name);
7421     }
7422   else
7423     {
7424       reused_workbuf_in_use = 1;
7425       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7426         Vcode_conversion_reused_workbuf
7427           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7428       workbuf = Vcode_conversion_reused_workbuf;
7429     }
7430   current = current_buffer;
7431   set_buffer_internal (XBUFFER (workbuf));
7432   /* We can't allow modification hooks to run in the work buffer.  For
7433      instance, directory_files_internal assumes that file decoding
7434      doesn't compile new regexps.  */
7435   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7436   Ferase_buffer ();
7437   bset_undo_list (current_buffer, Qt);
7438   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7439   set_buffer_internal (current);
7440   return workbuf;
7441 }
7442
7443
7444 static Lisp_Object
7445 code_conversion_restore (Lisp_Object arg)
7446 {
7447   Lisp_Object current, workbuf;
7448   struct gcpro gcpro1;
7449
7450   GCPRO1 (arg);
7451   current = XCAR (arg);
7452   workbuf = XCDR (arg);
7453   if (! NILP (workbuf))
7454     {
7455       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7456         reused_workbuf_in_use = 0;
7457       else
7458         Fkill_buffer (workbuf);
7459     }
7460   set_buffer_internal (XBUFFER (current));
7461   UNGCPRO;
7462   return Qnil;
7463 }
7464
7465 Lisp_Object
7466 code_conversion_save (bool with_work_buf, bool multibyte)
7467 {
7468   Lisp_Object workbuf = Qnil;
7469
7470   if (with_work_buf)
7471     workbuf = make_conversion_work_buffer (multibyte);
7472   record_unwind_protect (code_conversion_restore,
7473                          Fcons (Fcurrent_buffer (), workbuf));
7474   return workbuf;
7475 }
7476
7477 void
7478 decode_coding_gap (struct coding_system *coding,
7479                    ptrdiff_t chars, ptrdiff_t bytes)
7480 {
7481   ptrdiff_t count = SPECPDL_INDEX ();
7482   Lisp_Object attrs;
7483
7484   code_conversion_save (0, 0);
7485
7486   coding->src_object = Fcurrent_buffer ();
7487   coding->src_chars = chars;
7488   coding->src_bytes = bytes;
7489   coding->src_pos = -chars;
7490   coding->src_pos_byte = -bytes;
7491   coding->src_multibyte = chars < bytes;
7492   coding->dst_object = coding->src_object;
7493   coding->dst_pos = PT;
7494   coding->dst_pos_byte = PT_BYTE;
7495   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7496
7497   if (CODING_REQUIRE_DETECTION (coding))
7498     detect_coding (coding);
7499
7500   coding->mode |= CODING_MODE_LAST_BLOCK;
7501   current_buffer->text->inhibit_shrinking = 1;
7502   decode_coding (coding);
7503   current_buffer->text->inhibit_shrinking = 0;
7504
7505   attrs = CODING_ID_ATTRS (coding->id);
7506   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7507     {
7508       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7509       Lisp_Object val;
7510
7511       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7512       val = call1 (CODING_ATTR_POST_READ (attrs),
7513                    make_number (coding->produced_char));
7514       CHECK_NATNUM (val);
7515       coding->produced_char += Z - prev_Z;
7516       coding->produced += Z_BYTE - prev_Z_BYTE;
7517     }
7518
7519   unbind_to (count, Qnil);
7520 }
7521
7522
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   If CODING has a post-read-conversion function, it is called on the
   decoded text with the number of decoded characters as argument.

   The value of Vdeactivate_mark is preserved across the call.  */

void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  unsigned char *destination IF_LINT (= NULL);
  ptrdiff_t dst_bytes IF_LINT (= 0);
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the source buffer is also the
     destination, in which case point is moved and must be restored.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the caller's memory area; it is only used when
     DST_OBJECT is nil.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          struct Lisp_Marker *tail;

          /* Flag the markers that sit exactly at the edge of the
             region (FROM for markers of insertion type, TO for the
             others) so that they can be repositioned after the text
             has been replaced.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          current_buffer->text->inhibit_shrinking = 1;
          /* Delete the source text and describe it by negative
             offsets (presumably it is then read from the gap, as in
             the convention documented for encode_coding -- confirm).  */
          del_range_both (from, from_byte, to, to_byte, 1);
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Decode into a work buffer when a string is wanted, or when the
     result goes to memory but a post-read-conversion function has to
     run on it first.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      /* Run the post-read-conversion function with point at the start
         of the decoded text, then account for any change it made to
         the size of the buffer.  */
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
              old_deactivate_mark);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_number (coding->produced_char));
      UNGCPRO;
      CHECK_NATNUM (val);
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a work buffer although the caller
         wants it in memory.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      /* NOTE(review): the text is copied out of the work buffer only
         when the caller's area was too small -- confirm that no
         caller reaches this branch with a large enough area.  */
      if (dst_bytes < coding->produced)
        {
          destination = xrealloc (destination, coding->produced);
          /* NOTE(review): if xrealloc never returns NULL, as its name
             suggests, this branch is dead -- confirm.  */
          if (! destination)
            {
              record_conversion_result (coding,
                                        CODING_RESULT_INSUFFICIENT_MEM);
              unbind_to (count, Qnil);
              return;
            }
          /* Make the decoded text contiguous before copying it.  */
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the region stays where it was; point inside the
         region goes to FROM; point after the region is shifted by the
         change in size (counted in bytes for a unibyte buffer).  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the flagged markers at the start of the new text
             (insertion type) or at its end (others).  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
7740
7741
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer or a string; for any other value the text is
   taken from CODING->source when a pre-write-conversion function has
   to run (otherwise the source fields other than the sizes are left
   as the caller set them).

   If CODING has a pre-write-conversion function, the source text is
   first copied into a work buffer and the function is called there
   with the bounds of that buffer; whatever buffer is current when it
   returns becomes the source of the encoding.

   If DST_OBJECT is a buffer, it becomes the destination.  When it is
   the same buffer as SRC_OBJECT, the source text is deleted, the
   encoded text takes its place at FROM, and point and the markers at
   the edges of the region are readjusted afterwards.

   If DST_OBJECT is Qt, the encoded text is returned as a unibyte
   string in CODING->dst_object.

   Otherwise CODING->dst_object is set to Qnil (presumably the result
   is then left at CODING->destination, as documented for
   encode_coding -- confirm).

   The value of Vdeactivate_mark is preserved across the call.  */

void
encode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the source buffer is also the
     destination, in which case point must be restored at the end.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  /* Set when the pre-write-conversion function left a buffer other
     than the work buffer current; that buffer is killed at the end.  */
  bool kill_src_buffer = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  attrs = CODING_ID_ATTRS (coding->id);

  if (EQ (src_object, dst_object))
    {
      struct Lisp_Marker *tail;

      /* Flag the markers that sit exactly at the edge of the region
         (FROM for markers of insertion type, TO for the others) so
         that they can be repositioned after the text has been
         replaced.  */
      for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
        {
          tail->need_adjustment
            = tail->charpos == (tail->insertion_type ? from : to);
          need_marker_adjustment |= tail->need_adjustment;
        }
    }

  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    {
      /* Copy the source text into a work buffer for the
         pre-write-conversion function to operate on.  */
      coding->src_object = code_conversion_save (1, coding->src_multibyte);
      set_buffer_internal (XBUFFER (coding->src_object));
      if (STRINGP (src_object))
        insert_from_string (src_object, from, from_byte, chars, bytes, 0);
      else if (BUFFERP (src_object))
        insert_from_buffer (XBUFFER (src_object), from, chars, 0);
      else
        insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);

      if (EQ (src_object, dst_object))
        {
          /* The original text is going to be replaced; remove it now
             that a copy exists.  */
          set_buffer_internal (XBUFFER (src_object));
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          del_range_both (from, from_byte, to, to_byte, 1);
          set_buffer_internal (XBUFFER (coding->src_object));
        }

      {
        struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;

        GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
                old_deactivate_mark);
        safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
                    make_number (BEG), make_number (Z));
        UNGCPRO;
      }
      /* The function may have switched to another buffer; encode the
         whole contents of whichever buffer is current now.  */
      if (XBUFFER (coding->src_object) != current_buffer)
        kill_src_buffer = 1;
      coding->src_object = Fcurrent_buffer ();
      if (BEG != GPT)
        move_gap_both (BEG, BEG_BYTE);
      coding->src_chars = Z - BEG;
      coding->src_bytes = Z_BYTE - BEG_BYTE;
      coding->src_pos = BEG;
      coding->src_pos_byte = BEG_BYTE;
      coding->src_multibyte = Z < Z_BYTE;
    }
  else if (STRINGP (src_object))
    {
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      code_conversion_save (0, 0);
      set_buffer_internal (XBUFFER (src_object));
      if (EQ (src_object, dst_object))
        {
          /* Encoding in place: the value returned by del_range_1 for
             the deleted text becomes the source (presumably a string
             holding that text -- confirm).  */
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          coding->src_object = del_range_1 (from, to, 1, 1);
          coding->src_pos = 0;
          coding->src_pos_byte = 0;
        }
      else
        {
          /* Move the gap out of the source region if it lies inside
             it.  */
          if (from < GPT && to >= GPT)
            move_gap_both (from, from_byte);
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }
  else
    code_conversion_save (0, 0);

  if (BUFFERP (dst_object))
    {
      coding->dst_object = dst_object;
      if (EQ (src_object, dst_object))
        {
          coding->dst_pos = from;
          coding->dst_pos_byte = from_byte;
        }
      else
        {
          struct buffer *current = current_buffer;

          /* Output goes to point of the destination buffer; move its
             gap there.  */
          set_buffer_temp (XBUFFER (dst_object));
          coding->dst_pos = PT;
          coding->dst_pos_byte = PT_BYTE;
          move_gap_both (coding->dst_pos, coding->dst_pos_byte);
          set_buffer_temp (current);
        }
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else if (EQ (dst_object, Qt))
    {
      /* Encode into a freshly allocated memory area whose initial
         size is the number of source characters (at least 1); it is
         turned into a string and freed below.  */
      ptrdiff_t dst_bytes = max (1, coding->src_chars);
      coding->dst_object = Qnil;
      coding->destination = xmalloc (dst_bytes);
      coding->dst_bytes = dst_bytes;
      coding->dst_multibyte = 0;
    }
  else
    {
      coding->dst_object = Qnil;
      coding->dst_multibyte = 0;
    }

  encode_coding (coding);

  if (EQ (dst_object, Qt))
    {
      if (BUFFERP (coding->dst_object))
        coding->dst_object = Fbuffer_string ();
      else
        {
          coding->dst_object
            = make_unibyte_string ((char *) coding->destination,
                                   coding->produced);
          xfree (coding->destination);
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      /* Point before the region stays where it was; point inside the
         region goes to FROM; point after the region is shifted by the
         change in size (counted in bytes for a unibyte buffer).  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the flagged markers at the start of the new text
             (insertion type) or at its end (others).  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  if (kill_src_buffer)
    Fkill_buffer (coding->src_object);

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, Qnil);
}
7944
7945
7946 Lisp_Object
7947 preferred_coding_system (void)
7948 {
7949   int id = coding_categories[coding_priorities[0]].id;
7950
7951   return CODING_ID_NAME (id);
7952 }
7953
7954 \f
7955 #ifdef emacs
7956 /*** 8. Emacs Lisp library functions ***/
7957
7958 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7959        doc: /* Return t if OBJECT is nil or a coding-system.
7960 See the documentation of `define-coding-system' for information
7961 about coding-system objects.  */)
7962   (Lisp_Object object)
7963 {
7964   if (NILP (object)
7965       || CODING_SYSTEM_ID (object) >= 0)
7966     return Qt;
7967   if (! SYMBOLP (object)
7968       || NILP (Fget (object, Qcoding_system_define_form)))
7969     return Qnil;
7970   return Qt;
7971 }
7972
7973 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7974        Sread_non_nil_coding_system, 1, 1, 0,
7975        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7976   (Lisp_Object prompt)
7977 {
7978   Lisp_Object val;
7979   do
7980     {
7981       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7982                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7983     }
7984   while (SCHARS (val) == 0);
7985   return (Fintern (val, Qnil));
7986 }
7987
7988 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7989        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7990 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7991 Ignores case when completing coding systems (all Emacs coding systems
7992 are lower-case).  */)
7993   (Lisp_Object prompt, Lisp_Object default_coding_system)
7994 {
7995   Lisp_Object val;
7996   ptrdiff_t count = SPECPDL_INDEX ();
7997
7998   if (SYMBOLP (default_coding_system))
7999     default_coding_system = SYMBOL_NAME (default_coding_system);
8000   specbind (Qcompletion_ignore_case, Qt);
8001   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8002                           Qt, Qnil, Qcoding_system_history,
8003                           default_coding_system, Qnil);
8004   unbind_to (count, Qnil);
8005   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8006 }
8007
8008 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8009        1, 1, 0,
8010        doc: /* Check validity of CODING-SYSTEM.
8011 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8012 It is valid if it is nil or a symbol defined as a coding system by the
8013 function `define-coding-system'.  */)
8014   (Lisp_Object coding_system)
8015 {
8016   Lisp_Object define_form;
8017
8018   define_form = Fget (coding_system, Qcoding_system_define_form);
8019   if (! NILP (define_form))
8020     {
8021       Fput (coding_system, Qcoding_system_define_form, Qnil);
8022       safe_eval (define_form);
8023     }
8024   if (!NILP (Fcoding_system_p (coding_system)))
8025     return coding_system;
8026   xsignal1 (Qcoding_system_error, coding_system);
8027 }
8028
8029 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.
   SRC_CHARS is recorded in the coding structure as the character
   count of the source.  If HIGHEST, return the coding system of the
   highest priority among the detected coding systems.  Otherwise
   return a list of detected coding systems sorted by their
   priorities.  If MULTIBYTEP, it is assumed that the bytes are in
   correct multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING_SYSTEM is treated as `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the source bytes to the detectors.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes scanned before the first
         eight-bit byte.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              /* Stop as soon as both an eight-bit byte and a null
                 byte have been seen.  */
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      /* Reject every category outside
                         CATEGORY_MASK_ISO_ESCAPE.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category outside CATEGORY_MASK_UTF_16
                     as already checked and rejected, so the loop
                     below runs no detector for them.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the remaining categories in priority order.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector; with
                         HIGHEST, the first found category wins.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the auto UTF-16 category, narrow CATEGORY
                         to the endianness that was found.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into VAL, a list of coding system
         ids (converted to names by the eol step below).  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Every category was rejected, or a null byte was found:
             report `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing rejected and nothing found: report the coding
             system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS still hold the values from the last
                 iteration of the detection loop above.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found: take the first category
               in priority order that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops walk the priorities backwards and cons onto
             the front, so each group ends up in priority order.
             First the categories neither rejected nor found...  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* ...then the found categories, placed in front of them.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between UTF-8 with and without signature.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified; use it as is.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that no eol value was computed for that group.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* A vector eol_type triggers detection, done separately for
           non-UTF-16, UTF-16 big-endian and UTF-16 little-endian
           candidates.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is fixed by the requested coding system.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding system name, picking the
       eol variant (elements 0, 1 and 2 of the candidate's eol_type
       vector for LF, CRLF and CR) when one was determined.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, return only the first element (nil if VAL is not a
     cons); otherwise return the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8350
8351
8352 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8353        2, 3, 0,
8354        doc: /* Detect coding system of the text in the region between START and END.
8355 Return a list of possible coding systems ordered by priority.
8356 The coding systems to try and their priorities follows what
8357 the function `coding-system-priority-list' (which see) returns.
8358
8359 If only ASCII characters are found (except for such ISO-2022 control
8360 characters as ESC), it returns a list of single element `undecided'
8361 or its subsidiary coding system according to a detected end-of-line
8362 format.
8363
8364 If optional argument HIGHEST is non-nil, return the coding system of
8365 highest priority.  */)
8366   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8367 {
8368   ptrdiff_t from, to;
8369   ptrdiff_t from_byte, to_byte;
8370
8371   CHECK_NUMBER_COERCE_MARKER (start);
8372   CHECK_NUMBER_COERCE_MARKER (end);
8373
8374   validate_region (&start, &end);
8375   from = XINT (start), to = XINT (end);
8376   from_byte = CHAR_TO_BYTE (from);
8377   to_byte = CHAR_TO_BYTE (to);
8378
8379   if (from < GPT && to >= GPT)
8380     move_gap_both (to, to_byte);
8381
8382   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8383                                to - from, to_byte - from_byte,
8384                                !NILP (highest),
8385                                !NILP (BVAR (current_buffer
8386                                       , enable_multibyte_characters)),
8387                                Qnil);
8388 }
8389
8390 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8391        1, 2, 0,
8392        doc: /* Detect coding system of the text in STRING.
8393 Return a list of possible coding systems ordered by priority.
8394 The coding systems to try and their priorities follows what
8395 the function `coding-system-priority-list' (which see) returns.
8396
8397 If only ASCII characters are found (except for such ISO-2022 control
8398 characters as ESC), it returns a list of single element `undecided'
8399 or its subsidiary coding system according to a detected end-of-line
8400 format.
8401
8402 If optional argument HIGHEST is non-nil, return the coding system of
8403 highest priority.  */)
8404   (Lisp_Object string, Lisp_Object highest)
8405 {
8406   CHECK_STRING (string);
8407
8408   return detect_coding_system (SDATA (string),
8409                                SCHARS (string), SBYTES (string),
8410                                !NILP (highest), STRING_MULTIBYTE (string),
8411                                Qnil);
8412 }
8413
8414
8415 static inline bool
8416 char_encodable_p (int c, Lisp_Object attrs)
8417 {
8418   Lisp_Object tail;
8419   struct charset *charset;
8420   Lisp_Object translation_table;
8421
8422   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8423   if (! NILP (translation_table))
8424     c = translate_char (translation_table, c);
8425   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8426        CONSP (tail); tail = XCDR (tail))
8427     {
8428       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8429       if (CHAR_CHARSET_P (c, charset))
8430         break;
8431     }
8432   return (! NILP (tail));
8433 }
8434
8435
8436 /* Return a list of coding systems that safely encode the text between
8437    START and END.  If EXCLUDE is non-nil, it is a list of coding
8438    systems not to check.  The returned list doesn't contain any such
8439    coding systems.  In any case, if the text contains only ASCII or is
8440    unibyte, return t.  */
8441
8442 DEFUN ("find-coding-systems-region-internal",
8443        Ffind_coding_systems_region_internal,
8444        Sfind_coding_systems_region_internal, 2, 3, 0,
8445        doc: /* Internal use only.  */)
8446   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8447 {
8448   Lisp_Object coding_attrs_list, safe_codings;
8449   ptrdiff_t start_byte, end_byte;
8450   const unsigned char *p, *pbeg, *pend;
8451   int c;
8452   Lisp_Object tail, elt, work_table;
8453
8454   if (STRINGP (start))
8455     {
8456       if (!STRING_MULTIBYTE (start)
8457           || SCHARS (start) == SBYTES (start))
8458         return Qt;
8459       start_byte = 0;
8460       end_byte = SBYTES (start);
8461     }
8462   else
8463     {
8464       CHECK_NUMBER_COERCE_MARKER (start);
8465       CHECK_NUMBER_COERCE_MARKER (end);
8466       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8467         args_out_of_range (start, end);
8468       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8469         return Qt;
8470       start_byte = CHAR_TO_BYTE (XINT (start));
8471       end_byte = CHAR_TO_BYTE (XINT (end));
8472       if (XINT (end) - XINT (start) == end_byte - start_byte)
8473         return Qt;
8474
8475       if (XINT (start) < GPT && XINT (end) > GPT)
8476         {
8477           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8478             move_gap_both (XINT (start), start_byte);
8479           else
8480             move_gap_both (XINT (end), end_byte);
8481         }
8482     }
8483
8484   coding_attrs_list = Qnil;
8485   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8486     if (NILP (exclude)
8487         || NILP (Fmemq (XCAR (tail), exclude)))
8488       {
8489         Lisp_Object attrs;
8490
8491         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8492         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8493             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8494           {
8495             ASET (attrs, coding_attr_trans_tbl,
8496                   get_translation_table (attrs, 1, NULL));
8497             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8498           }
8499       }
8500
8501   if (STRINGP (start))
8502     p = pbeg = SDATA (start);
8503   else
8504     p = pbeg = BYTE_POS_ADDR (start_byte);
8505   pend = p + (end_byte - start_byte);
8506
8507   while (p < pend && ASCII_BYTE_P (*p)) p++;
8508   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8509
8510   work_table = Fmake_char_table (Qnil, Qnil);
8511   while (p < pend)
8512     {
8513       if (ASCII_BYTE_P (*p))
8514         p++;
8515       else
8516         {
8517           c = STRING_CHAR_ADVANCE (p);
8518           if (!NILP (char_table_ref (work_table, c)))
8519             /* This character was already checked.  Ignore it.  */
8520             continue;
8521
8522           charset_map_loaded = 0;
8523           for (tail = coding_attrs_list; CONSP (tail);)
8524             {
8525               elt = XCAR (tail);
8526               if (NILP (elt))
8527                 tail = XCDR (tail);
8528               else if (char_encodable_p (c, elt))
8529                 tail = XCDR (tail);
8530               else if (CONSP (XCDR (tail)))
8531                 {
8532                   XSETCAR (tail, XCAR (XCDR (tail)));
8533                   XSETCDR (tail, XCDR (XCDR (tail)));
8534                 }
8535               else
8536                 {
8537                   XSETCAR (tail, Qnil);
8538                   tail = XCDR (tail);
8539                 }
8540             }
8541           if (charset_map_loaded)
8542             {
8543               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8544
8545               if (STRINGP (start))
8546                 pbeg = SDATA (start);
8547               else
8548                 pbeg = BYTE_POS_ADDR (start_byte);
8549               p = pbeg + p_offset;
8550               pend = pbeg + pend_offset;
8551             }
8552           char_table_set (work_table, c, Qt);
8553         }
8554     }
8555
8556   safe_codings = list2 (Qraw_text, Qno_conversion);
8557   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8558     if (! NILP (XCAR (tail)))
8559       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8560
8561   return safe_codings;
8562 }
8563
8564
8565 DEFUN ("unencodable-char-position", Funencodable_char_position,
8566        Sunencodable_char_position, 3, 5, 0,
8567        doc: /*
8568 Return position of first un-encodable character in a region.
8569 START and END specify the region and CODING-SYSTEM specifies the
8570 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8571
8572 If optional 4th argument COUNT is non-nil, it specifies at most how
8573 many un-encodable characters to search.  In this case, the value is a
8574 list of positions.
8575
8576 If optional 5th argument STRING is non-nil, it is a string to search
8577 for un-encodable characters.  In that case, START and END are indexes
8578 to the string.  */)
8579   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8580 {
8581   EMACS_INT n;
8582   struct coding_system coding;
8583   Lisp_Object attrs, charset_list, translation_table;
8584   Lisp_Object positions;
8585   ptrdiff_t from, to;
8586   const unsigned char *p, *stop, *pend;
8587   bool ascii_compatible;
8588
8589   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8590   attrs = CODING_ID_ATTRS (coding.id);
8591   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8592     return Qnil;
8593   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8594   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8595   translation_table = get_translation_table (attrs, 1, NULL);
8596
8597   if (NILP (string))
8598     {
8599       validate_region (&start, &end);
8600       from = XINT (start);
8601       to = XINT (end);
8602       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8603           || (ascii_compatible
8604               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8605         return Qnil;
8606       p = CHAR_POS_ADDR (from);
8607       pend = CHAR_POS_ADDR (to);
8608       if (from < GPT && to >= GPT)
8609         stop = GPT_ADDR;
8610       else
8611         stop = pend;
8612     }
8613   else
8614     {
8615       CHECK_STRING (string);
8616       CHECK_NATNUM (start);
8617       CHECK_NATNUM (end);
8618       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8619         args_out_of_range_3 (string, start, end);
8620       from = XINT (start);
8621       to = XINT (end);
8622       if (! STRING_MULTIBYTE (string))
8623         return Qnil;
8624       p = SDATA (string) + string_char_to_byte (string, from);
8625       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8626       if (ascii_compatible && (to - from) == (pend - p))
8627         return Qnil;
8628     }
8629
8630   if (NILP (count))
8631     n = 1;
8632   else
8633     {
8634       CHECK_NATNUM (count);
8635       n = XINT (count);
8636     }
8637
8638   positions = Qnil;
8639   charset_map_loaded = 0;
8640   while (1)
8641     {
8642       int c;
8643
8644       if (ascii_compatible)
8645         while (p < stop && ASCII_BYTE_P (*p))
8646           p++, from++;
8647       if (p >= stop)
8648         {
8649           if (p >= pend)
8650             break;
8651           stop = pend;
8652           p = GAP_END_ADDR;
8653         }
8654
8655       c = STRING_CHAR_ADVANCE (p);
8656       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8657           && ! char_charset (translate_char (translation_table, c),
8658                              charset_list, NULL))
8659         {
8660           positions = Fcons (make_number (from), positions);
8661           n--;
8662           if (n == 0)
8663             break;
8664         }
8665
8666       from++;
8667       if (charset_map_loaded && NILP (string))
8668         {
8669           p = CHAR_POS_ADDR (from);
8670           pend = CHAR_POS_ADDR (to);
8671           if (from < GPT && to >= GPT)
8672             stop = GPT_ADDR;
8673           else
8674             stop = pend;
8675           charset_map_loaded = 0;
8676         }
8677     }
8678
8679   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8680 }
8681
8682
8683 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8684        Scheck_coding_systems_region, 3, 3, 0,
8685        doc: /* Check if the region is encodable by coding systems.
8686
8687 START and END are buffer positions specifying the region.
8688 CODING-SYSTEM-LIST is a list of coding systems to check.
8689
8690 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8691 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8692 whole region, POS0, POS1, ... are buffer positions where non-encodable
8693 characters are found.
8694
8695 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8696 value is nil.
8697
8698 START may be a string.  In that case, check if the string is
8699 encodable, and the value contains indices to the string instead of
8700 buffer positions.  END is ignored.
8701
8702 If the current buffer (or START if it is a string) is unibyte, the value
8703 is nil.  */)
8704   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8705 {
8706   Lisp_Object list;
8707   ptrdiff_t start_byte, end_byte;
8708   ptrdiff_t pos;
8709   const unsigned char *p, *pbeg, *pend;
8710   int c;
8711   Lisp_Object tail, elt, attrs;
8712
8713   if (STRINGP (start))
8714     {
8715       if (!STRING_MULTIBYTE (start)
8716           || SCHARS (start) == SBYTES (start))
8717         return Qnil;
8718       start_byte = 0;
8719       end_byte = SBYTES (start);
8720       pos = 0;
8721     }
8722   else
8723     {
8724       CHECK_NUMBER_COERCE_MARKER (start);
8725       CHECK_NUMBER_COERCE_MARKER (end);
8726       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8727         args_out_of_range (start, end);
8728       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8729         return Qnil;
8730       start_byte = CHAR_TO_BYTE (XINT (start));
8731       end_byte = CHAR_TO_BYTE (XINT (end));
8732       if (XINT (end) - XINT (start) == end_byte - start_byte)
8733         return Qnil;
8734
8735       if (XINT (start) < GPT && XINT (end) > GPT)
8736         {
8737           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8738             move_gap_both (XINT (start), start_byte);
8739           else
8740             move_gap_both (XINT (end), end_byte);
8741         }
8742       pos = XINT (start);
8743     }
8744
8745   list = Qnil;
8746   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8747     {
8748       elt = XCAR (tail);
8749       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8750       ASET (attrs, coding_attr_trans_tbl,
8751             get_translation_table (attrs, 1, NULL));
8752       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8753     }
8754
8755   if (STRINGP (start))
8756     p = pbeg = SDATA (start);
8757   else
8758     p = pbeg = BYTE_POS_ADDR (start_byte);
8759   pend = p + (end_byte - start_byte);
8760
8761   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8762   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8763
8764   while (p < pend)
8765     {
8766       if (ASCII_BYTE_P (*p))
8767         p++;
8768       else
8769         {
8770           c = STRING_CHAR_ADVANCE (p);
8771
8772           charset_map_loaded = 0;
8773           for (tail = list; CONSP (tail); tail = XCDR (tail))
8774             {
8775               elt = XCDR (XCAR (tail));
8776               if (! char_encodable_p (c, XCAR (elt)))
8777                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8778             }
8779           if (charset_map_loaded)
8780             {
8781               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8782
8783               if (STRINGP (start))
8784                 pbeg = SDATA (start);
8785               else
8786                 pbeg = BYTE_POS_ADDR (start_byte);
8787               p = pbeg + p_offset;
8788               pend = pbeg + pend_offset;
8789             }
8790         }
8791       pos++;
8792     }
8793
8794   tail = list;
8795   list = Qnil;
8796   for (; CONSP (tail); tail = XCDR (tail))
8797     {
8798       elt = XCAR (tail);
8799       if (CONSP (XCDR (XCDR (elt))))
8800         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8801                       list);
8802     }
8803
8804   return list;
8805 }
8806
8807
8808 static Lisp_Object
8809 code_convert_region (Lisp_Object start, Lisp_Object end,
8810                      Lisp_Object coding_system, Lisp_Object dst_object,
8811                      bool encodep, bool norecord)
8812 {
8813   struct coding_system coding;
8814   ptrdiff_t from, from_byte, to, to_byte;
8815   Lisp_Object src_object;
8816
8817   CHECK_NUMBER_COERCE_MARKER (start);
8818   CHECK_NUMBER_COERCE_MARKER (end);
8819   if (NILP (coding_system))
8820     coding_system = Qno_conversion;
8821   else
8822     CHECK_CODING_SYSTEM (coding_system);
8823   src_object = Fcurrent_buffer ();
8824   if (NILP (dst_object))
8825     dst_object = src_object;
8826   else if (! EQ (dst_object, Qt))
8827     CHECK_BUFFER (dst_object);
8828
8829   validate_region (&start, &end);
8830   from = XFASTINT (start);
8831   from_byte = CHAR_TO_BYTE (from);
8832   to = XFASTINT (end);
8833   to_byte = CHAR_TO_BYTE (to);
8834
8835   setup_coding_system (coding_system, &coding);
8836   coding.mode |= CODING_MODE_LAST_BLOCK;
8837
8838   if (encodep)
8839     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8840                           dst_object);
8841   else
8842     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8843                           dst_object);
8844   if (! norecord)
8845     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8846
8847   return (BUFFERP (dst_object)
8848           ? make_number (coding.produced_char)
8849           : coding.dst_object);
8850 }
8851
8852
8853 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8854        3, 4, "r\nzCoding system: ",
8855        doc: /* Decode the current region from the specified coding system.
8856 When called from a program, takes four arguments:
8857         START, END, CODING-SYSTEM, and DESTINATION.
8858 START and END are buffer positions.
8859
8860 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8861 If nil, the region between START and END is replaced by the decoded text.
8862 If buffer, the decoded text is inserted in that buffer after point (point
8863 does not move).
8864 In those cases, the length of the decoded text is returned.
8865 If DESTINATION is t, the decoded text is returned.
8866
8867 This function sets `last-coding-system-used' to the precise coding system
8868 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8869 not fully specified.)  */)
8870   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8871 {
8872   return code_convert_region (start, end, coding_system, destination, 0, 0);
8873 }
8874
8875 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8876        3, 4, "r\nzCoding system: ",
8877        doc: /* Encode the current region by specified coding system.
8878 When called from a program, takes four arguments:
8879         START, END, CODING-SYSTEM and DESTINATION.
8880 START and END are buffer positions.
8881
8882 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8883 If nil, the region between START and END is replace by the encoded text.
8884 If buffer, the encoded text is inserted in that buffer after point (point
8885 does not move).
8886 In those cases, the length of the encoded text is returned.
8887 If DESTINATION is t, the encoded text is returned.
8888
8889 This function sets `last-coding-system-used' to the precise coding system
8890 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8891 not fully specified.)  */)
8892   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8893 {
8894   return code_convert_region (start, end, coding_system, destination, 1, 0);
8895 }
8896
8897 Lisp_Object
8898 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8899                      Lisp_Object dst_object, bool encodep, bool nocopy,
8900                      bool norecord)
8901 {
8902   struct coding_system coding;
8903   ptrdiff_t chars, bytes;
8904
8905   CHECK_STRING (string);
8906   if (NILP (coding_system))
8907     {
8908       if (! norecord)
8909         Vlast_coding_system_used = Qno_conversion;
8910       if (NILP (dst_object))
8911         return (nocopy ? Fcopy_sequence (string) : string);
8912     }
8913
8914   if (NILP (coding_system))
8915     coding_system = Qno_conversion;
8916   else
8917     CHECK_CODING_SYSTEM (coding_system);
8918   if (NILP (dst_object))
8919     dst_object = Qt;
8920   else if (! EQ (dst_object, Qt))
8921     CHECK_BUFFER (dst_object);
8922
8923   setup_coding_system (coding_system, &coding);
8924   coding.mode |= CODING_MODE_LAST_BLOCK;
8925   chars = SCHARS (string);
8926   bytes = SBYTES (string);
8927   if (encodep)
8928     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8929   else
8930     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8931   if (! norecord)
8932     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8933
8934   return (BUFFERP (dst_object)
8935           ? make_number (coding.produced_char)
8936           : coding.dst_object);
8937 }
8938
8939
8940 /* Encode or decode STRING according to CODING_SYSTEM.
8941    Do not set Vlast_coding_system_used.
8942
8943    This function is called only from macros DECODE_FILE and
8944    ENCODE_FILE, thus we ignore character composition.  */
8945
8946 Lisp_Object
8947 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8948                               bool encodep)
8949 {
8950   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8951 }
8952
8953
8954 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8955        2, 4, 0,
8956        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8957
8958 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8959 if the decoding operation is trivial.
8960
8961 Optional fourth arg BUFFER non-nil means that the decoded text is
8962 inserted in that buffer after point (point does not move).  In this
8963 case, the return value is the length of the decoded text.
8964
8965 This function sets `last-coding-system-used' to the precise coding system
8966 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8967 not fully specified.)  */)
8968   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8969 {
8970   return code_convert_string (string, coding_system, buffer,
8971                               0, ! NILP (nocopy), 0);
8972 }
8973
8974 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8975        2, 4, 0,
8976        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8977
8978 Optional third arg NOCOPY non-nil means it is OK to return STRING
8979 itself if the encoding operation is trivial.
8980
8981 Optional fourth arg BUFFER non-nil means that the encoded text is
8982 inserted in that buffer after point (point does not move).  In this
8983 case, the return value is the length of the encoded text.
8984
8985 This function sets `last-coding-system-used' to the precise coding system
8986 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8987 not fully specified.)  */)
8988   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8989 {
8990   return code_convert_string (string, coding_system, buffer,
8991                               1, ! NILP (nocopy), 0);
8992 }
8993
8994 \f
8995 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8996        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8997 Return the corresponding character.  */)
8998   (Lisp_Object code)
8999 {
9000   Lisp_Object spec, attrs, val;
9001   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9002   EMACS_INT ch;
9003   int c;
9004
9005   CHECK_NATNUM (code);
9006   ch = XFASTINT (code);
9007   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9008   attrs = AREF (spec, 0);
9009
9010   if (ASCII_BYTE_P (ch)
9011       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9012     return code;
9013
9014   val = CODING_ATTR_CHARSET_LIST (attrs);
9015   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9016   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9017   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9018
9019   if (ch <= 0x7F)
9020     {
9021       c = ch;
9022       charset = charset_roman;
9023     }
9024   else if (ch >= 0xA0 && ch < 0xDF)
9025     {
9026       c = ch - 0x80;
9027       charset = charset_kana;
9028     }
9029   else
9030     {
9031       EMACS_INT c1 = ch >> 8;
9032       int c2 = ch & 0xFF;
9033
9034       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9035           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9036         error ("Invalid code: %"pI"d", ch);
9037       c = ch;
9038       SJIS_TO_JIS (c);
9039       charset = charset_kanji;
9040     }
9041   c = DECODE_CHAR (charset, c);
9042   if (c < 0)
9043     error ("Invalid code: %"pI"d", ch);
9044   return make_number (c);
9045 }
9046
9047
9048 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9049        doc: /* Encode a Japanese character CH to shift_jis encoding.
9050 Return the corresponding code in SJIS.  */)
9051   (Lisp_Object ch)
9052 {
9053   Lisp_Object spec, attrs, charset_list;
9054   int c;
9055   struct charset *charset;
9056   unsigned code;
9057
9058   CHECK_CHARACTER (ch);
9059   c = XFASTINT (ch);
9060   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9061   attrs = AREF (spec, 0);
9062
9063   if (ASCII_CHAR_P (c)
9064       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9065     return ch;
9066
9067   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9068   charset = char_charset (c, charset_list, &code);
9069   if (code == CHARSET_INVALID_CODE (charset))
9070     error ("Can't encode by shift_jis encoding: %c", c);
9071   JIS_TO_SJIS (code);
9072
9073   return make_number (code);
9074 }
9075
9076 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9077        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9078 Return the corresponding character.  */)
9079   (Lisp_Object code)
9080 {
9081   Lisp_Object spec, attrs, val;
9082   struct charset *charset_roman, *charset_big5, *charset;
9083   EMACS_INT ch;
9084   int c;
9085
9086   CHECK_NATNUM (code);
9087   ch = XFASTINT (code);
9088   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9089   attrs = AREF (spec, 0);
9090
9091   if (ASCII_BYTE_P (ch)
9092       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9093     return code;
9094
9095   val = CODING_ATTR_CHARSET_LIST (attrs);
9096   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9097   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9098
9099   if (ch <= 0x7F)
9100     {
9101       c = ch;
9102       charset = charset_roman;
9103     }
9104   else
9105     {
9106       EMACS_INT b1 = ch >> 8;
9107       int b2 = ch & 0x7F;
9108       if (b1 < 0xA1 || b1 > 0xFE
9109           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9110         error ("Invalid code: %"pI"d", ch);
9111       c = ch;
9112       charset = charset_big5;
9113     }
9114   c = DECODE_CHAR (charset, c);
9115   if (c < 0)
9116     error ("Invalid code: %"pI"d", ch);
9117   return make_number (c);
9118 }
9119
9120 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9121        doc: /* Encode the Big5 character CH to BIG5 coding system.
9122 Return the corresponding character code in Big5.  */)
9123   (Lisp_Object ch)
9124 {
9125   Lisp_Object spec, attrs, charset_list;
9126   struct charset *charset;
9127   int c;
9128   unsigned code;
9129
9130   CHECK_CHARACTER (ch);
9131   c = XFASTINT (ch);
9132   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9133   attrs = AREF (spec, 0);
9134   if (ASCII_CHAR_P (c)
9135       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9136     return ch;
9137
9138   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9139   charset = char_charset (c, charset_list, &code);
9140   if (code == CHARSET_INVALID_CODE (charset))
9141     error ("Can't encode by Big5 encoding: %c", c);
9142
9143   return make_number (code);
9144 }
9145
9146 \f
9147 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9148        Sset_terminal_coding_system_internal, 1, 2, 0,
9149        doc: /* Internal use only.  */)
9150   (Lisp_Object coding_system, Lisp_Object terminal)
9151 {
9152   struct terminal *term = get_terminal (terminal, 1);
9153   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9154   CHECK_SYMBOL (coding_system);
9155   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9156   /* We had better not send unsafe characters to terminal.  */
9157   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9158   /* Character composition should be disabled.  */
9159   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9160   terminal_coding->src_multibyte = 1;
9161   terminal_coding->dst_multibyte = 0;
9162   tset_charset_list
9163     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9164             ? coding_charset_list (terminal_coding)
9165             : Fcons (make_number (charset_ascii), Qnil)));
9166   return Qnil;
9167 }
9168
9169 DEFUN ("set-safe-terminal-coding-system-internal",
9170        Fset_safe_terminal_coding_system_internal,
9171        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9172        doc: /* Internal use only.  */)
9173   (Lisp_Object coding_system)
9174 {
9175   CHECK_SYMBOL (coding_system);
9176   setup_coding_system (Fcheck_coding_system (coding_system),
9177                        &safe_terminal_coding);
9178   /* Character composition should be disabled.  */
9179   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9180   safe_terminal_coding.src_multibyte = 1;
9181   safe_terminal_coding.dst_multibyte = 0;
9182   return Qnil;
9183 }
9184
9185 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9186        Sterminal_coding_system, 0, 1, 0,
9187        doc: /* Return coding system specified for terminal output on the given terminal.
9188 TERMINAL may be a terminal object, a frame, or nil for the selected
9189 frame's terminal device.  */)
9190   (Lisp_Object terminal)
9191 {
9192   struct coding_system *terminal_coding
9193     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9194   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9195
9196   /* For backward compatibility, return nil if it is `undecided'.  */
9197   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9198 }
9199
9200 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9201        Sset_keyboard_coding_system_internal, 1, 2, 0,
9202        doc: /* Internal use only.  */)
9203   (Lisp_Object coding_system, Lisp_Object terminal)
9204 {
9205   struct terminal *t = get_terminal (terminal, 1);
9206   CHECK_SYMBOL (coding_system);
9207   if (NILP (coding_system))
9208     coding_system = Qno_conversion;
9209   else
9210     Fcheck_coding_system (coding_system);
9211   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9212   /* Character composition should be disabled.  */
9213   TERMINAL_KEYBOARD_CODING (t)->common_flags
9214     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9215   return Qnil;
9216 }
9217
9218 DEFUN ("keyboard-coding-system",
9219        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9220        doc: /* Return coding system specified for decoding keyboard input.  */)
9221   (Lisp_Object terminal)
9222 {
9223   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9224                          (get_terminal (terminal, 1))->id);
9225 }
9226
9227 \f
9228 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9229        Sfind_operation_coding_system,  1, MANY, 0,
9230        doc: /* Choose a coding system for an operation based on the target name.
9231 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9232 DECODING-SYSTEM is the coding system to use for decoding
9233 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9234 for encoding (in case OPERATION does encoding).
9235
9236 The first argument OPERATION specifies an I/O primitive:
9237   For file I/O, `insert-file-contents' or `write-region'.
9238   For process I/O, `call-process', `call-process-region', or `start-process'.
9239   For network I/O, `open-network-stream'.
9240
9241 The remaining arguments should be the same arguments that were passed
9242 to the primitive.  Depending on which primitive, one of those arguments
9243 is selected as the TARGET.  For example, if OPERATION does file I/O,
9244 whichever argument specifies the file name is TARGET.
9245
9246 TARGET has a meaning which depends on OPERATION:
9247   For file I/O, TARGET is a file name (except for the special case below).
9248   For process I/O, TARGET is a process name.
9249   For network I/O, TARGET is a service name or a port number.
9250
9251 This function looks up what is specified for TARGET in
9252 `file-coding-system-alist', `process-coding-system-alist',
9253 or `network-coding-system-alist' depending on OPERATION.
9254 They may specify a coding system, a cons of coding systems,
9255 or a function symbol to call.
9256 In the last case, we call the function with one argument,
9257 which is a list of all the arguments given to this function.
9258 If the function can't decide a coding system, it can return
9259 `undecided' so that the normal code-detection is performed.
9260
9261 If OPERATION is `insert-file-contents', the argument corresponding to
9262 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9263 file name to look up, and BUFFER is a buffer that contains the file's
9264 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9265 function to call for FILENAME, that function should examine the
9266 contents of BUFFER instead of reading the file.
9267
9268 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9269   (ptrdiff_t nargs, Lisp_Object *args)
9270 {
9271   Lisp_Object operation, target_idx, target, val;
9272   register Lisp_Object chain;
9273
9274   if (nargs < 2)
9275     error ("Too few arguments");
9276   operation = args[0];
9277   if (!SYMBOLP (operation)
9278       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9279     error ("Invalid first argument");
9280   if (nargs <= 1 + XFASTINT (target_idx))
9281     error ("Too few arguments for operation `%s'",
9282            SDATA (SYMBOL_NAME (operation)));
9283   target = args[XFASTINT (target_idx) + 1];
9284   if (!(STRINGP (target)
9285         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9286             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9287         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9288     error ("Invalid argument %"pI"d of operation `%s'",
9289            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9290   if (CONSP (target))
9291     target = XCAR (target);
9292
9293   chain = ((EQ (operation, Qinsert_file_contents)
9294             || EQ (operation, Qwrite_region))
9295            ? Vfile_coding_system_alist
9296            : (EQ (operation, Qopen_network_stream)
9297               ? Vnetwork_coding_system_alist
9298               : Vprocess_coding_system_alist));
9299   if (NILP (chain))
9300     return Qnil;
9301
9302   for (; CONSP (chain); chain = XCDR (chain))
9303     {
9304       Lisp_Object elt;
9305
9306       elt = XCAR (chain);
9307       if (CONSP (elt)
9308           && ((STRINGP (target)
9309                && STRINGP (XCAR (elt))
9310                && fast_string_match (XCAR (elt), target) >= 0)
9311               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9312         {
9313           val = XCDR (elt);
9314           /* Here, if VAL is both a valid coding system and a valid
9315              function symbol, we return VAL as a coding system.  */
9316           if (CONSP (val))
9317             return val;
9318           if (! SYMBOLP (val))
9319             return Qnil;
9320           if (! NILP (Fcoding_system_p (val)))
9321             return Fcons (val, val);
9322           if (! NILP (Ffboundp (val)))
9323             {
9324               /* We use call1 rather than safe_call1
9325                  so as to get bug reports about functions called here
9326                  which don't handle the current interface.  */
9327               val = call1 (val, Flist (nargs, args));
9328               if (CONSP (val))
9329                 return val;
9330               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9331                 return Fcons (val, val);
9332             }
9333           return Qnil;
9334         }
9335     }
9336   return Qnil;
9337 }
9338
9339 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9340        Sset_coding_system_priority, 0, MANY, 0,
9341        doc: /* Assign higher priority to the coding systems given as arguments.
9342 If multiple coding systems belong to the same category,
9343 all but the first one are ignored.
9344
9345 usage: (set-coding-system-priority &rest coding-systems)  */)
9346   (ptrdiff_t nargs, Lisp_Object *args)
9347 {
9348   ptrdiff_t i, j;
9349   bool changed[coding_category_max];
9350   enum coding_category priorities[coding_category_max];
9351
9352   memset (changed, 0, sizeof changed);
9353
9354   for (i = j = 0; i < nargs; i++)
9355     {
9356       enum coding_category category;
9357       Lisp_Object spec, attrs;
9358
9359       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9360       attrs = AREF (spec, 0);
9361       category = XINT (CODING_ATTR_CATEGORY (attrs));
9362       if (changed[category])
9363         /* Ignore this coding system because a coding system of the
9364            same category already had a higher priority.  */
9365         continue;
9366       changed[category] = 1;
9367       priorities[j++] = category;
9368       if (coding_categories[category].id >= 0
9369           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9370         setup_coding_system (args[i], &coding_categories[category]);
9371       Fset (AREF (Vcoding_category_table, category), args[i]);
9372     }
9373
9374   /* Now we have decided top J priorities.  Reflect the order of the
9375      original priorities to the remaining priorities.  */
9376
9377   for (i = j, j = 0; i < coding_category_max; i++, j++)
9378     {
9379       while (j < coding_category_max
9380              && changed[coding_priorities[j]])
9381         j++;
9382       if (j == coding_category_max)
9383         emacs_abort ();
9384       priorities[i] = coding_priorities[j];
9385     }
9386
9387   memcpy (coding_priorities, priorities, sizeof priorities);
9388
9389   /* Update `coding-category-list'.  */
9390   Vcoding_category_list = Qnil;
9391   for (i = coding_category_max; i-- > 0; )
9392     Vcoding_category_list
9393       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9394                Vcoding_category_list);
9395
9396   return Qnil;
9397 }
9398
9399 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9400        Scoding_system_priority_list, 0, 1, 0,
9401        doc: /* Return a list of coding systems ordered by their priorities.
9402 The list contains a subset of coding systems; i.e. coding systems
9403 assigned to each coding category (see `coding-category-list').
9404
9405 HIGHESTP non-nil means just return the highest priority one.  */)
9406   (Lisp_Object highestp)
9407 {
9408   int i;
9409   Lisp_Object val;
9410
9411   for (i = 0, val = Qnil; i < coding_category_max; i++)
9412     {
9413       enum coding_category category = coding_priorities[i];
9414       int id = coding_categories[category].id;
9415       Lisp_Object attrs;
9416
9417       if (id < 0)
9418         continue;
9419       attrs = CODING_ID_ATTRS (id);
9420       if (! NILP (highestp))
9421         return CODING_ATTR_BASE_NAME (attrs);
9422       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9423     }
9424   return Fnreverse (val);
9425 }
9426
9427 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9428
9429 static Lisp_Object
9430 make_subsidiaries (Lisp_Object base)
9431 {
9432   Lisp_Object subsidiaries;
9433   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9434   char *buf = alloca (base_name_len + 6);
9435   int i;
9436
9437   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9438   subsidiaries = Fmake_vector (make_number (3), Qnil);
9439   for (i = 0; i < 3; i++)
9440     {
9441       strcpy (buf + base_name_len, suffixes[i]);
9442       ASET (subsidiaries, i, intern (buf));
9443     }
9444   return subsidiaries;
9445 }
9446
9447
9448 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9449        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9450        doc: /* For internal use only.
9451 usage: (define-coding-system-internal ...)  */)
9452   (ptrdiff_t nargs, Lisp_Object *args)
9453 {
9454   Lisp_Object name;
9455   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9456   Lisp_Object attrs;            /* Vector of attributes.  */
9457   Lisp_Object eol_type;
9458   Lisp_Object aliases;
9459   Lisp_Object coding_type, charset_list, safe_charsets;
9460   enum coding_category category;
9461   Lisp_Object tail, val;
9462   int max_charset_id = 0;
9463   int i;
9464
9465   if (nargs < coding_arg_max)
9466     goto short_args;
9467
9468   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9469
9470   name = args[coding_arg_name];
9471   CHECK_SYMBOL (name);
9472   ASET (attrs, coding_attr_base_name, name);
9473
9474   val = args[coding_arg_mnemonic];
9475   if (! STRINGP (val))
9476     CHECK_CHARACTER (val);
9477   ASET (attrs, coding_attr_mnemonic, val);
9478
9479   coding_type = args[coding_arg_coding_type];
9480   CHECK_SYMBOL (coding_type);
9481   ASET (attrs, coding_attr_type, coding_type);
9482
9483   charset_list = args[coding_arg_charset_list];
9484   if (SYMBOLP (charset_list))
9485     {
9486       if (EQ (charset_list, Qiso_2022))
9487         {
9488           if (! EQ (coding_type, Qiso_2022))
9489             error ("Invalid charset-list");
9490           charset_list = Viso_2022_charset_list;
9491         }
9492       else if (EQ (charset_list, Qemacs_mule))
9493         {
9494           if (! EQ (coding_type, Qemacs_mule))
9495             error ("Invalid charset-list");
9496           charset_list = Vemacs_mule_charset_list;
9497         }
9498       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9499         {
9500           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9501             error ("Invalid charset-list");
9502           if (max_charset_id < XFASTINT (XCAR (tail)))
9503             max_charset_id = XFASTINT (XCAR (tail));
9504         }
9505     }
9506   else
9507     {
9508       charset_list = Fcopy_sequence (charset_list);
9509       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9510         {
9511           struct charset *charset;
9512
9513           val = XCAR (tail);
9514           CHECK_CHARSET_GET_CHARSET (val, charset);
9515           if (EQ (coding_type, Qiso_2022)
9516               ? CHARSET_ISO_FINAL (charset) < 0
9517               : EQ (coding_type, Qemacs_mule)
9518               ? CHARSET_EMACS_MULE_ID (charset) < 0
9519               : 0)
9520             error ("Can't handle charset `%s'",
9521                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9522
9523           XSETCAR (tail, make_number (charset->id));
9524           if (max_charset_id < charset->id)
9525             max_charset_id = charset->id;
9526         }
9527     }
9528   ASET (attrs, coding_attr_charset_list, charset_list);
9529
9530   safe_charsets = make_uninit_string (max_charset_id + 1);
9531   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9532   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9533     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9534   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9535
9536   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9537
9538   val = args[coding_arg_decode_translation_table];
9539   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9540     CHECK_SYMBOL (val);
9541   ASET (attrs, coding_attr_decode_tbl, val);
9542
9543   val = args[coding_arg_encode_translation_table];
9544   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9545     CHECK_SYMBOL (val);
9546   ASET (attrs, coding_attr_encode_tbl, val);
9547
9548   val = args[coding_arg_post_read_conversion];
9549   CHECK_SYMBOL (val);
9550   ASET (attrs, coding_attr_post_read, val);
9551
9552   val = args[coding_arg_pre_write_conversion];
9553   CHECK_SYMBOL (val);
9554   ASET (attrs, coding_attr_pre_write, val);
9555
9556   val = args[coding_arg_default_char];
9557   if (NILP (val))
9558     ASET (attrs, coding_attr_default_char, make_number (' '));
9559   else
9560     {
9561       CHECK_CHARACTER (val);
9562       ASET (attrs, coding_attr_default_char, val);
9563     }
9564
9565   val = args[coding_arg_for_unibyte];
9566   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9567
9568   val = args[coding_arg_plist];
9569   CHECK_LIST (val);
9570   ASET (attrs, coding_attr_plist, val);
9571
9572   if (EQ (coding_type, Qcharset))
9573     {
9574       /* Generate a lisp vector of 256 elements.  Each element is nil,
9575          integer, or a list of charset IDs.
9576
9577          If Nth element is nil, the byte code N is invalid in this
9578          coding system.
9579
9580          If Nth element is a number NUM, N is the first byte of a
9581          charset whose ID is NUM.
9582
9583          If Nth element is a list of charset IDs, N is the first byte
9584          of one of them.  The list is sorted by dimensions of the
9585          charsets.  A charset of smaller dimension comes first. */
9586       val = Fmake_vector (make_number (256), Qnil);
9587
9588       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9589         {
9590           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9591           int dim = CHARSET_DIMENSION (charset);
9592           int idx = (dim - 1) * 4;
9593
9594           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9595             ASET (attrs, coding_attr_ascii_compat, Qt);
9596
9597           for (i = charset->code_space[idx];
9598                i <= charset->code_space[idx + 1]; i++)
9599             {
9600               Lisp_Object tmp, tmp2;
9601               int dim2;
9602
9603               tmp = AREF (val, i);
9604               if (NILP (tmp))
9605                 tmp = XCAR (tail);
9606               else if (NUMBERP (tmp))
9607                 {
9608                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9609                   if (dim < dim2)
9610                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9611                   else
9612                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9613                 }
9614               else
9615                 {
9616                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9617                     {
9618                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9619                       if (dim < dim2)
9620                         break;
9621                     }
9622                   if (NILP (tmp2))
9623                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9624                   else
9625                     {
9626                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9627                       XSETCAR (tmp2, XCAR (tail));
9628                     }
9629                 }
9630               ASET (val, i, tmp);
9631             }
9632         }
9633       ASET (attrs, coding_attr_charset_valids, val);
9634       category = coding_category_charset;
9635     }
9636   else if (EQ (coding_type, Qccl))
9637     {
9638       Lisp_Object valids;
9639
9640       if (nargs < coding_arg_ccl_max)
9641         goto short_args;
9642
9643       val = args[coding_arg_ccl_decoder];
9644       CHECK_CCL_PROGRAM (val);
9645       if (VECTORP (val))
9646         val = Fcopy_sequence (val);
9647       ASET (attrs, coding_attr_ccl_decoder, val);
9648
9649       val = args[coding_arg_ccl_encoder];
9650       CHECK_CCL_PROGRAM (val);
9651       if (VECTORP (val))
9652         val = Fcopy_sequence (val);
9653       ASET (attrs, coding_attr_ccl_encoder, val);
9654
9655       val = args[coding_arg_ccl_valids];
9656       valids = Fmake_string (make_number (256), make_number (0));
9657       for (tail = val; CONSP (tail); tail = XCDR (tail))
9658         {
9659           int from, to;
9660
9661           val = XCAR (tail);
9662           if (INTEGERP (val))
9663             {
9664               if (! (0 <= XINT (val) && XINT (val) <= 255))
9665                 args_out_of_range_3 (val, make_number (0), make_number (255));
9666               from = to = XINT (val);
9667             }
9668           else
9669             {
9670               CHECK_CONS (val);
9671               CHECK_NATNUM_CAR (val);
9672               CHECK_NUMBER_CDR (val);
9673               if (XINT (XCAR (val)) > 255)
9674                 args_out_of_range_3 (XCAR (val),
9675                                      make_number (0), make_number (255));
9676               from = XINT (XCAR (val));
9677               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9678                 args_out_of_range_3 (XCDR (val),
9679                                      XCAR (val), make_number (255));
9680               to = XINT (XCDR (val));
9681             }
9682           for (i = from; i <= to; i++)
9683             SSET (valids, i, 1);
9684         }
9685       ASET (attrs, coding_attr_ccl_valids, valids);
9686
9687       category = coding_category_ccl;
9688     }
9689   else if (EQ (coding_type, Qutf_16))
9690     {
9691       Lisp_Object bom, endian;
9692
9693       ASET (attrs, coding_attr_ascii_compat, Qnil);
9694
9695       if (nargs < coding_arg_utf16_max)
9696         goto short_args;
9697
9698       bom = args[coding_arg_utf16_bom];
9699       if (! NILP (bom) && ! EQ (bom, Qt))
9700         {
9701           CHECK_CONS (bom);
9702           val = XCAR (bom);
9703           CHECK_CODING_SYSTEM (val);
9704           val = XCDR (bom);
9705           CHECK_CODING_SYSTEM (val);
9706         }
9707       ASET (attrs, coding_attr_utf_bom, bom);
9708
9709       endian = args[coding_arg_utf16_endian];
9710       CHECK_SYMBOL (endian);
9711       if (NILP (endian))
9712         endian = Qbig;
9713       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9714         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9715       ASET (attrs, coding_attr_utf_16_endian, endian);
9716
9717       category = (CONSP (bom)
9718                   ? coding_category_utf_16_auto
9719                   : NILP (bom)
9720                   ? (EQ (endian, Qbig)
9721                      ? coding_category_utf_16_be_nosig
9722                      : coding_category_utf_16_le_nosig)
9723                   : (EQ (endian, Qbig)
9724                      ? coding_category_utf_16_be
9725                      : coding_category_utf_16_le));
9726     }
9727   else if (EQ (coding_type, Qiso_2022))
9728     {
9729       Lisp_Object initial, reg_usage, request, flags;
9730
9731       if (nargs < coding_arg_iso2022_max)
9732         goto short_args;
9733
9734       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9735       CHECK_VECTOR (initial);
9736       for (i = 0; i < 4; i++)
9737         {
9738           val = Faref (initial, make_number (i));
9739           if (! NILP (val))
9740             {
9741               struct charset *charset;
9742
9743               CHECK_CHARSET_GET_CHARSET (val, charset);
9744               ASET (initial, i, make_number (CHARSET_ID (charset)));
9745               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9746                 ASET (attrs, coding_attr_ascii_compat, Qt);
9747             }
9748           else
9749             ASET (initial, i, make_number (-1));
9750         }
9751
9752       reg_usage = args[coding_arg_iso2022_reg_usage];
9753       CHECK_CONS (reg_usage);
9754       CHECK_NUMBER_CAR (reg_usage);
9755       CHECK_NUMBER_CDR (reg_usage);
9756
9757       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9758       for (tail = request; CONSP (tail); tail = XCDR (tail))
9759         {
9760           int id;
9761           Lisp_Object tmp1;
9762
9763           val = XCAR (tail);
9764           CHECK_CONS (val);
9765           tmp1 = XCAR (val);
9766           CHECK_CHARSET_GET_ID (tmp1, id);
9767           CHECK_NATNUM_CDR (val);
9768           if (XINT (XCDR (val)) >= 4)
9769             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9770           XSETCAR (val, make_number (id));
9771         }
9772
9773       flags = args[coding_arg_iso2022_flags];
9774       CHECK_NATNUM (flags);
9775       i = XINT (flags) & INT_MAX;
9776       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9777         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9778       flags = make_number (i);
9779
9780       ASET (attrs, coding_attr_iso_initial, initial);
9781       ASET (attrs, coding_attr_iso_usage, reg_usage);
9782       ASET (attrs, coding_attr_iso_request, request);
9783       ASET (attrs, coding_attr_iso_flags, flags);
9784       setup_iso_safe_charsets (attrs);
9785
9786       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9787         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9788                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9789                     ? coding_category_iso_7_else
9790                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9791                     ? coding_category_iso_7
9792                     : coding_category_iso_7_tight);
9793       else
9794         {
9795           int id = XINT (AREF (initial, 1));
9796
9797           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9798                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9799                        || id < 0)
9800                       ? coding_category_iso_8_else
9801                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9802                       ? coding_category_iso_8_1
9803                       : coding_category_iso_8_2);
9804         }
9805       if (category != coding_category_iso_8_1
9806           && category != coding_category_iso_8_2)
9807         ASET (attrs, coding_attr_ascii_compat, Qnil);
9808     }
9809   else if (EQ (coding_type, Qemacs_mule))
9810     {
9811       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9812         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9813       ASET (attrs, coding_attr_ascii_compat, Qt);
9814       category = coding_category_emacs_mule;
9815     }
9816   else if (EQ (coding_type, Qshift_jis))
9817     {
9818
9819       struct charset *charset;
9820
9821       if (XINT (Flength (charset_list)) != 3
9822           && XINT (Flength (charset_list)) != 4)
9823         error ("There should be three or four charsets");
9824
9825       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9826       if (CHARSET_DIMENSION (charset) != 1)
9827         error ("Dimension of charset %s is not one",
9828                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9829       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9830         ASET (attrs, coding_attr_ascii_compat, Qt);
9831
9832       charset_list = XCDR (charset_list);
9833       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9834       if (CHARSET_DIMENSION (charset) != 1)
9835         error ("Dimension of charset %s is not one",
9836                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9837
9838       charset_list = XCDR (charset_list);
9839       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9840       if (CHARSET_DIMENSION (charset) != 2)
9841         error ("Dimension of charset %s is not two",
9842                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9843
9844       charset_list = XCDR (charset_list);
9845       if (! NILP (charset_list))
9846         {
9847           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9848           if (CHARSET_DIMENSION (charset) != 2)
9849             error ("Dimension of charset %s is not two",
9850                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9851         }
9852
9853       category = coding_category_sjis;
9854       Vsjis_coding_system = name;
9855     }
9856   else if (EQ (coding_type, Qbig5))
9857     {
9858       struct charset *charset;
9859
9860       if (XINT (Flength (charset_list)) != 2)
9861         error ("There should be just two charsets");
9862
9863       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9864       if (CHARSET_DIMENSION (charset) != 1)
9865         error ("Dimension of charset %s is not one",
9866                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9867       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9868         ASET (attrs, coding_attr_ascii_compat, Qt);
9869
9870       charset_list = XCDR (charset_list);
9871       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9872       if (CHARSET_DIMENSION (charset) != 2)
9873         error ("Dimension of charset %s is not two",
9874                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9875
9876       category = coding_category_big5;
9877       Vbig5_coding_system = name;
9878     }
9879   else if (EQ (coding_type, Qraw_text))
9880     {
9881       category = coding_category_raw_text;
9882       ASET (attrs, coding_attr_ascii_compat, Qt);
9883     }
9884   else if (EQ (coding_type, Qutf_8))
9885     {
9886       Lisp_Object bom;
9887
9888       if (nargs < coding_arg_utf8_max)
9889         goto short_args;
9890
9891       bom = args[coding_arg_utf8_bom];
9892       if (! NILP (bom) && ! EQ (bom, Qt))
9893         {
9894           CHECK_CONS (bom);
9895           val = XCAR (bom);
9896           CHECK_CODING_SYSTEM (val);
9897           val = XCDR (bom);
9898           CHECK_CODING_SYSTEM (val);
9899         }
9900       ASET (attrs, coding_attr_utf_bom, bom);
9901       if (NILP (bom))
9902         ASET (attrs, coding_attr_ascii_compat, Qt);
9903
9904       category = (CONSP (bom) ? coding_category_utf_8_auto
9905                   : NILP (bom) ? coding_category_utf_8_nosig
9906                   : coding_category_utf_8_sig);
9907     }
9908   else if (EQ (coding_type, Qundecided))
9909     category = coding_category_undecided;
9910   else
9911     error ("Invalid coding system type: %s",
9912            SDATA (SYMBOL_NAME (coding_type)));
9913
9914   ASET (attrs, coding_attr_category, make_number (category));
9915   ASET (attrs, coding_attr_plist,
9916         Fcons (QCcategory,
9917                Fcons (AREF (Vcoding_category_table, category),
9918                       CODING_ATTR_PLIST (attrs))));
9919   ASET (attrs, coding_attr_plist,
9920         Fcons (QCascii_compatible_p,
9921                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9922                       CODING_ATTR_PLIST (attrs))));
9923
9924   eol_type = args[coding_arg_eol_type];
9925   if (! NILP (eol_type)
9926       && ! EQ (eol_type, Qunix)
9927       && ! EQ (eol_type, Qdos)
9928       && ! EQ (eol_type, Qmac))
9929     error ("Invalid eol-type");
9930
9931   aliases = Fcons (name, Qnil);
9932
9933   if (NILP (eol_type))
9934     {
9935       eol_type = make_subsidiaries (name);
9936       for (i = 0; i < 3; i++)
9937         {
9938           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9939
9940           this_name = AREF (eol_type, i);
9941           this_aliases = Fcons (this_name, Qnil);
9942           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9943           this_spec = Fmake_vector (make_number (3), attrs);
9944           ASET (this_spec, 1, this_aliases);
9945           ASET (this_spec, 2, this_eol_type);
9946           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9947           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9948           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9949           if (NILP (val))
9950             Vcoding_system_alist
9951               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9952                        Vcoding_system_alist);
9953         }
9954     }
9955
9956   spec_vec = Fmake_vector (make_number (3), attrs);
9957   ASET (spec_vec, 1, aliases);
9958   ASET (spec_vec, 2, eol_type);
9959
9960   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9961   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9962   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9963   if (NILP (val))
9964     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9965                                   Vcoding_system_alist);
9966
9967   {
9968     int id = coding_categories[category].id;
9969
9970     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9971       setup_coding_system (name, &coding_categories[category]);
9972   }
9973
9974   return Qnil;
9975
9976  short_args:
9977   return Fsignal (Qwrong_number_of_arguments,
9978                   Fcons (intern ("define-coding-system-internal"),
9979                          make_number (nargs)));
9980 }
9981
9982
9983 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9984        3, 3, 0,
9985        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9986   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
9987 {
9988   Lisp_Object spec, attrs;
9989
9990   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9991   attrs = AREF (spec, 0);
9992   if (EQ (prop, QCmnemonic))
9993     {
9994       if (! STRINGP (val))
9995         CHECK_CHARACTER (val);
9996       ASET (attrs, coding_attr_mnemonic, val);
9997     }
9998   else if (EQ (prop, QCdefault_char))
9999     {
10000       if (NILP (val))
10001         val = make_number (' ');
10002       else
10003         CHECK_CHARACTER (val);
10004       ASET (attrs, coding_attr_default_char, val);
10005     }
10006   else if (EQ (prop, QCdecode_translation_table))
10007     {
10008       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10009         CHECK_SYMBOL (val);
10010       ASET (attrs, coding_attr_decode_tbl, val);
10011     }
10012   else if (EQ (prop, QCencode_translation_table))
10013     {
10014       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10015         CHECK_SYMBOL (val);
10016       ASET (attrs, coding_attr_encode_tbl, val);
10017     }
10018   else if (EQ (prop, QCpost_read_conversion))
10019     {
10020       CHECK_SYMBOL (val);
10021       ASET (attrs, coding_attr_post_read, val);
10022     }
10023   else if (EQ (prop, QCpre_write_conversion))
10024     {
10025       CHECK_SYMBOL (val);
10026       ASET (attrs, coding_attr_pre_write, val);
10027     }
10028   else if (EQ (prop, QCascii_compatible_p))
10029     {
10030       ASET (attrs, coding_attr_ascii_compat, val);
10031     }
10032
10033   ASET (attrs, coding_attr_plist,
10034         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10035   return val;
10036 }
10037
10038
10039 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10040        Sdefine_coding_system_alias, 2, 2, 0,
10041        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10042   (Lisp_Object alias, Lisp_Object coding_system)
10043 {
10044   Lisp_Object spec, aliases, eol_type, val;
10045
10046   CHECK_SYMBOL (alias);
10047   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10048   aliases = AREF (spec, 1);
10049   /* ALIASES should be a list of length more than zero, and the first
10050      element is a base coding system.  Append ALIAS at the tail of the
10051      list.  */
10052   while (!NILP (XCDR (aliases)))
10053     aliases = XCDR (aliases);
10054   XSETCDR (aliases, Fcons (alias, Qnil));
10055
10056   eol_type = AREF (spec, 2);
10057   if (VECTORP (eol_type))
10058     {
10059       Lisp_Object subsidiaries;
10060       int i;
10061
10062       subsidiaries = make_subsidiaries (alias);
10063       for (i = 0; i < 3; i++)
10064         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10065                                      AREF (eol_type, i));
10066     }
10067
10068   Fputhash (alias, spec, Vcoding_system_hash_table);
10069   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10070   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10071   if (NILP (val))
10072     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10073                                   Vcoding_system_alist);
10074
10075   return Qnil;
10076 }
10077
10078 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10079        1, 1, 0,
10080        doc: /* Return the base of CODING-SYSTEM.
10081 Any alias or subsidiary coding system is not a base coding system.  */)
10082   (Lisp_Object coding_system)
10083 {
10084   Lisp_Object spec, attrs;
10085
10086   if (NILP (coding_system))
10087     return (Qno_conversion);
10088   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10089   attrs = AREF (spec, 0);
10090   return CODING_ATTR_BASE_NAME (attrs);
10091 }
10092
10093 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10094        1, 1, 0,
10095        doc: "Return the property list of CODING-SYSTEM.")
10096   (Lisp_Object coding_system)
10097 {
10098   Lisp_Object spec, attrs;
10099
10100   if (NILP (coding_system))
10101     coding_system = Qno_conversion;
10102   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10103   attrs = AREF (spec, 0);
10104   return CODING_ATTR_PLIST (attrs);
10105 }
10106
10107
10108 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10109        1, 1, 0,
10110        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10111   (Lisp_Object coding_system)
10112 {
10113   Lisp_Object spec;
10114
10115   if (NILP (coding_system))
10116     coding_system = Qno_conversion;
10117   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10118   return AREF (spec, 1);
10119 }
10120
10121 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10122        Scoding_system_eol_type, 1, 1, 0,
10123        doc: /* Return eol-type of CODING-SYSTEM.
10124 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10125
10126 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10127 and CR respectively.
10128
10129 A vector value indicates that a format of end-of-line should be
10130 detected automatically.  Nth element of the vector is the subsidiary
10131 coding system whose eol-type is N.  */)
10132   (Lisp_Object coding_system)
10133 {
10134   Lisp_Object spec, eol_type;
10135   int n;
10136
10137   if (NILP (coding_system))
10138     coding_system = Qno_conversion;
10139   if (! CODING_SYSTEM_P (coding_system))
10140     return Qnil;
10141   spec = CODING_SYSTEM_SPEC (coding_system);
10142   eol_type = AREF (spec, 2);
10143   if (VECTORP (eol_type))
10144     return Fcopy_sequence (eol_type);
10145   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10146   return make_number (n);
10147 }
10148
10149 #endif /* emacs */
10150
10151 \f
/*** 12. Postamble ***/
10153
10154 void
10155 init_coding_once (void)
10156 {
10157   int i;
10158
10159   for (i = 0; i < coding_category_max; i++)
10160     {
10161       coding_categories[i].id = -1;
10162       coding_priorities[i] = i;
10163     }
10164
10165   /* ISO2022 specific initialize routine.  */
10166   for (i = 0; i < 0x20; i++)
10167     iso_code_class[i] = ISO_control_0;
10168   for (i = 0x21; i < 0x7F; i++)
10169     iso_code_class[i] = ISO_graphic_plane_0;
10170   for (i = 0x80; i < 0xA0; i++)
10171     iso_code_class[i] = ISO_control_1;
10172   for (i = 0xA1; i < 0xFF; i++)
10173     iso_code_class[i] = ISO_graphic_plane_1;
10174   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10175   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10176   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10177   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10178   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10179   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10180   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10181   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10182   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10183
10184   for (i = 0; i < 256; i++)
10185     {
10186       emacs_mule_bytes[i] = 1;
10187     }
10188   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10189   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10190   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10191   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10192 }
10193
10194 #ifdef emacs
10195
10196 void
10197 syms_of_coding (void)
10198 {
10199   staticpro (&Vcoding_system_hash_table);
10200   {
10201     Lisp_Object args[2];
10202     args[0] = QCtest;
10203     args[1] = Qeq;
10204     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10205   }
10206
10207   staticpro (&Vsjis_coding_system);
10208   Vsjis_coding_system = Qnil;
10209
10210   staticpro (&Vbig5_coding_system);
10211   Vbig5_coding_system = Qnil;
10212
10213   staticpro (&Vcode_conversion_reused_workbuf);
10214   Vcode_conversion_reused_workbuf = Qnil;
10215
10216   staticpro (&Vcode_conversion_workbuf_name);
10217   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10218
10219   reused_workbuf_in_use = 0;
10220
10221   DEFSYM (Qcharset, "charset");
10222   DEFSYM (Qtarget_idx, "target-idx");
10223   DEFSYM (Qcoding_system_history, "coding-system-history");
10224   Fset (Qcoding_system_history, Qnil);
10225
10226   /* Target FILENAME is the first argument.  */
10227   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10228   /* Target FILENAME is the third argument.  */
10229   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10230
10231   DEFSYM (Qcall_process, "call-process");
10232   /* Target PROGRAM is the first argument.  */
10233   Fput (Qcall_process, Qtarget_idx, make_number (0));
10234
10235   DEFSYM (Qcall_process_region, "call-process-region");
10236   /* Target PROGRAM is the third argument.  */
10237   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10238
10239   DEFSYM (Qstart_process, "start-process");
10240   /* Target PROGRAM is the third argument.  */
10241   Fput (Qstart_process, Qtarget_idx, make_number (2));
10242
10243   DEFSYM (Qopen_network_stream, "open-network-stream");
10244   /* Target SERVICE is the fourth argument.  */
10245   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10246
10247   DEFSYM (Qcoding_system, "coding-system");
10248   DEFSYM (Qcoding_aliases, "coding-aliases");
10249
10250   DEFSYM (Qeol_type, "eol-type");
10251   DEFSYM (Qunix, "unix");
10252   DEFSYM (Qdos, "dos");
10253
10254   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10255   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10256   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10257   DEFSYM (Qdefault_char, "default-char");
10258   DEFSYM (Qundecided, "undecided");
10259   DEFSYM (Qno_conversion, "no-conversion");
10260   DEFSYM (Qraw_text, "raw-text");
10261
10262   DEFSYM (Qiso_2022, "iso-2022");
10263
10264   DEFSYM (Qutf_8, "utf-8");
10265   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10266
10267   DEFSYM (Qutf_16, "utf-16");
10268   DEFSYM (Qbig, "big");
10269   DEFSYM (Qlittle, "little");
10270
10271   DEFSYM (Qshift_jis, "shift-jis");
10272   DEFSYM (Qbig5, "big5");
10273
10274   DEFSYM (Qcoding_system_p, "coding-system-p");
10275
10276   DEFSYM (Qcoding_system_error, "coding-system-error");
10277   Fput (Qcoding_system_error, Qerror_conditions,
10278         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10279   Fput (Qcoding_system_error, Qerror_message,
10280         build_pure_c_string ("Invalid coding system"));
10281
10282   /* Intern this now in case it isn't already done.
10283      Setting this variable twice is harmless.
10284      But don't staticpro it here--that is done in alloc.c.  */
10285   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10286
10287   DEFSYM (Qtranslation_table, "translation-table");
10288   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10289   DEFSYM (Qtranslation_table_id, "translation-table-id");
10290   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10291   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10292
10293   DEFSYM (Qvalid_codes, "valid-codes");
10294
10295   DEFSYM (Qemacs_mule, "emacs-mule");
10296
10297   DEFSYM (QCcategory, ":category");
10298   DEFSYM (QCmnemonic, ":mnemonic");
10299   DEFSYM (QCdefault_char, ":default-char");
10300   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10301   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10302   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10303   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10304   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10305
10306   Vcoding_category_table
10307     = Fmake_vector (make_number (coding_category_max), Qnil);
10308   staticpro (&Vcoding_category_table);
10309   /* Followings are target of code detection.  */
10310   ASET (Vcoding_category_table, coding_category_iso_7,
10311         intern_c_string ("coding-category-iso-7"));
10312   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10313         intern_c_string ("coding-category-iso-7-tight"));
10314   ASET (Vcoding_category_table, coding_category_iso_8_1,
10315         intern_c_string ("coding-category-iso-8-1"));
10316   ASET (Vcoding_category_table, coding_category_iso_8_2,
10317         intern_c_string ("coding-category-iso-8-2"));
10318   ASET (Vcoding_category_table, coding_category_iso_7_else,
10319         intern_c_string ("coding-category-iso-7-else"));
10320   ASET (Vcoding_category_table, coding_category_iso_8_else,
10321         intern_c_string ("coding-category-iso-8-else"));
10322   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10323         intern_c_string ("coding-category-utf-8-auto"));
10324   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10325         intern_c_string ("coding-category-utf-8"));
10326   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10327         intern_c_string ("coding-category-utf-8-sig"));
10328   ASET (Vcoding_category_table, coding_category_utf_16_be,
10329         intern_c_string ("coding-category-utf-16-be"));
10330   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10331         intern_c_string ("coding-category-utf-16-auto"));
10332   ASET (Vcoding_category_table, coding_category_utf_16_le,
10333         intern_c_string ("coding-category-utf-16-le"));
10334   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10335         intern_c_string ("coding-category-utf-16-be-nosig"));
10336   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10337         intern_c_string ("coding-category-utf-16-le-nosig"));
10338   ASET (Vcoding_category_table, coding_category_charset,
10339         intern_c_string ("coding-category-charset"));
10340   ASET (Vcoding_category_table, coding_category_sjis,
10341         intern_c_string ("coding-category-sjis"));
10342   ASET (Vcoding_category_table, coding_category_big5,
10343         intern_c_string ("coding-category-big5"));
10344   ASET (Vcoding_category_table, coding_category_ccl,
10345         intern_c_string ("coding-category-ccl"));
10346   ASET (Vcoding_category_table, coding_category_emacs_mule,
10347         intern_c_string ("coding-category-emacs-mule"));
10348   /* Followings are NOT target of code detection.  */
10349   ASET (Vcoding_category_table, coding_category_raw_text,
10350         intern_c_string ("coding-category-raw-text"));
10351   ASET (Vcoding_category_table, coding_category_undecided,
10352         intern_c_string ("coding-category-undecided"));
10353
10354   DEFSYM (Qinsufficient_source, "insufficient-source");
10355   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10356   DEFSYM (Qinvalid_source, "invalid-source");
10357   DEFSYM (Qinterrupted, "interrupted");
10358   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10359   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10360
10361   defsubr (&Scoding_system_p);
10362   defsubr (&Sread_coding_system);
10363   defsubr (&Sread_non_nil_coding_system);
10364   defsubr (&Scheck_coding_system);
10365   defsubr (&Sdetect_coding_region);
10366   defsubr (&Sdetect_coding_string);
10367   defsubr (&Sfind_coding_systems_region_internal);
10368   defsubr (&Sunencodable_char_position);
10369   defsubr (&Scheck_coding_systems_region);
10370   defsubr (&Sdecode_coding_region);
10371   defsubr (&Sencode_coding_region);
10372   defsubr (&Sdecode_coding_string);
10373   defsubr (&Sencode_coding_string);
10374   defsubr (&Sdecode_sjis_char);
10375   defsubr (&Sencode_sjis_char);
10376   defsubr (&Sdecode_big5_char);
10377   defsubr (&Sencode_big5_char);
10378   defsubr (&Sset_terminal_coding_system_internal);
10379   defsubr (&Sset_safe_terminal_coding_system_internal);
10380   defsubr (&Sterminal_coding_system);
10381   defsubr (&Sset_keyboard_coding_system_internal);
10382   defsubr (&Skeyboard_coding_system);
10383   defsubr (&Sfind_operation_coding_system);
10384   defsubr (&Sset_coding_system_priority);
10385   defsubr (&Sdefine_coding_system_internal);
10386   defsubr (&Sdefine_coding_system_alias);
10387   defsubr (&Scoding_system_put);
10388   defsubr (&Scoding_system_base);
10389   defsubr (&Scoding_system_plist);
10390   defsubr (&Scoding_system_aliases);
10391   defsubr (&Scoding_system_eol_type);
10392   defsubr (&Scoding_system_priority_list);
10393
10394   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10395                doc: /* List of coding systems.
10396
10397 Do not alter the value of this variable manually.  This variable should be
10398 updated by the functions `define-coding-system' and
10399 `define-coding-system-alias'.  */);
10400   Vcoding_system_list = Qnil;
10401
10402   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10403                doc: /* Alist of coding system names.
10404 Each element is one element list of coding system name.
10405 This variable is given to `completing-read' as COLLECTION argument.
10406
10407 Do not alter the value of this variable manually.  This variable should be
10408 updated by the functions `make-coding-system' and
10409 `define-coding-system-alias'.  */);
10410   Vcoding_system_alist = Qnil;
10411
10412   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10413                doc: /* List of coding-categories (symbols) ordered by priority.
10414
10415 On detecting a coding system, Emacs tries code detection algorithms
10416 associated with each coding-category one by one in this order.  When
10417 one algorithm agrees with a byte sequence of source text, the coding
10418 system bound to the corresponding coding-category is selected.
10419
10420 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10421   {
10422     int i;
10423
10424     Vcoding_category_list = Qnil;
10425     for (i = coding_category_max - 1; i >= 0; i--)
10426       Vcoding_category_list
10427         = Fcons (AREF (Vcoding_category_table, i),
10428                  Vcoding_category_list);
10429   }
10430
10431   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10432                doc: /* Specify the coding system for read operations.
10433 It is useful to bind this variable with `let', but do not set it globally.
10434 If the value is a coding system, it is used for decoding on read operation.
10435 If not, an appropriate element is used from one of the coding system alists.
10436 There are three such tables: `file-coding-system-alist',
10437 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10438   Vcoding_system_for_read = Qnil;
10439
10440   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10441                doc: /* Specify the coding system for write operations.
10442 Programs bind this variable with `let', but you should not set it globally.
10443 If the value is a coding system, it is used for encoding of output,
10444 when writing it to a file and when sending it to a file or subprocess.
10445
10446 If this does not specify a coding system, an appropriate element
10447 is used from one of the coding system alists.
10448 There are three such tables: `file-coding-system-alist',
10449 `process-coding-system-alist', and `network-coding-system-alist'.
10450 For output to files, if the above procedure does not specify a coding system,
10451 the value of `buffer-file-coding-system' is used.  */);
10452   Vcoding_system_for_write = Qnil;
10453
10454   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10455                doc: /*
10456 Coding system used in the latest file or process I/O.  */);
10457   Vlast_coding_system_used = Qnil;
10458
10459   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10460                doc: /*
10461 Error status of the last code conversion.
10462
10463 When an error was detected in the last code conversion, this variable
10464 is set to one of the following symbols.
10465   `insufficient-source'
10466   `inconsistent-eol'
10467   `invalid-source'
10468   `interrupted'
10469   `insufficient-memory'
10470 When no error was detected, the value doesn't change.  So, to check
10471 the error status of a code conversion by this variable, you must
10472 explicitly set this variable to nil before performing code
10473 conversion.  */);
10474   Vlast_code_conversion_error = Qnil;
10475
10476   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10477                doc: /*
10478 *Non-nil means always inhibit code conversion of end-of-line format.
10479 See info node `Coding Systems' and info node `Text and Binary' concerning
10480 such conversion.  */);
10481   inhibit_eol_conversion = 0;
10482
10483   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10484                doc: /*
10485 Non-nil means process buffer inherits coding system of process output.
10486 Bind it to t if the process output is to be treated as if it were a file
10487 read from some filesystem.  */);
10488   inherit_process_coding_system = 0;
10489
10490   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10491                doc: /*
10492 Alist to decide a coding system to use for a file I/O operation.
10493 The format is ((PATTERN . VAL) ...),
10494 where PATTERN is a regular expression matching a file name,
10495 VAL is a coding system, a cons of coding systems, or a function symbol.
10496 If VAL is a coding system, it is used for both decoding and encoding
10497 the file contents.
10498 If VAL is a cons of coding systems, the car part is used for decoding,
10499 and the cdr part is used for encoding.
10500 If VAL is a function symbol, the function must return a coding system
10501 or a cons of coding systems which are used as above.  The function is
10502 called with an argument that is a list of the arguments with which
10503 `find-operation-coding-system' was called.  If the function can't decide
10504 a coding system, it can return `undecided' so that the normal
10505 code-detection is performed.
10506
10507 See also the function `find-operation-coding-system'
10508 and the variable `auto-coding-alist'.  */);
10509   Vfile_coding_system_alist = Qnil;
10510
10511   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10512                doc: /*
10513 Alist to decide a coding system to use for a process I/O operation.
10514 The format is ((PATTERN . VAL) ...),
10515 where PATTERN is a regular expression matching a program name,
10516 VAL is a coding system, a cons of coding systems, or a function symbol.
10517 If VAL is a coding system, it is used for both decoding what received
10518 from the program and encoding what sent to the program.
10519 If VAL is a cons of coding systems, the car part is used for decoding,
10520 and the cdr part is used for encoding.
10521 If VAL is a function symbol, the function must return a coding system
10522 or a cons of coding systems which are used as above.
10523
10524 See also the function `find-operation-coding-system'.  */);
10525   Vprocess_coding_system_alist = Qnil;
10526
10527   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10528                doc: /*
10529 Alist to decide a coding system to use for a network I/O operation.
10530 The format is ((PATTERN . VAL) ...),
10531 where PATTERN is a regular expression matching a network service name
10532 or is a port number to connect to,
10533 VAL is a coding system, a cons of coding systems, or a function symbol.
10534 If VAL is a coding system, it is used for both decoding what received
10535 from the network stream and encoding what sent to the network stream.
10536 If VAL is a cons of coding systems, the car part is used for decoding,
10537 and the cdr part is used for encoding.
10538 If VAL is a function symbol, the function must return a coding system
10539 or a cons of coding systems which are used as above.
10540
10541 See also the function `find-operation-coding-system'.  */);
10542   Vnetwork_coding_system_alist = Qnil;
10543
10544   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10545                doc: /* Coding system to use with system messages.
10546 Also used for decoding keyboard input on X Window system.  */);
10547   Vlocale_coding_system = Qnil;
10548
10549   /* The eol mnemonics are reset in startup.el system-dependently.  */
10550   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10551                doc: /*
10552 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10553   eol_mnemonic_unix = build_pure_c_string (":");
10554
10555   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10556                doc: /*
10557 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10558   eol_mnemonic_dos = build_pure_c_string ("\\");
10559
10560   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10561                doc: /*
10562 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10563   eol_mnemonic_mac = build_pure_c_string ("/");
10564
10565   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10566                doc: /*
10567 *String displayed in mode line when end-of-line format is not yet determined.  */);
10568   eol_mnemonic_undecided = build_pure_c_string (":");
10569
10570   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10571                doc: /*
10572 *Non-nil enables character translation while encoding and decoding.  */);
10573   Venable_character_translation = Qt;
10574
10575   DEFVAR_LISP ("standard-translation-table-for-decode",
10576                Vstandard_translation_table_for_decode,
10577                doc: /* Table for translating characters while decoding.  */);
10578   Vstandard_translation_table_for_decode = Qnil;
10579
10580   DEFVAR_LISP ("standard-translation-table-for-encode",
10581                Vstandard_translation_table_for_encode,
10582                doc: /* Table for translating characters while encoding.  */);
10583   Vstandard_translation_table_for_encode = Qnil;
10584
10585   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10586                doc: /* Alist of charsets vs revision numbers.
10587 While encoding, if a charset (car part of an element) is found,
10588 designate it with the escape sequence identifying revision (cdr part
10589 of the element).  */);
10590   Vcharset_revision_table = Qnil;
10591
10592   DEFVAR_LISP ("default-process-coding-system",
10593                Vdefault_process_coding_system,
10594                doc: /* Cons of coding systems used for process I/O by default.
10595 The car part is used for decoding a process output,
10596 the cdr part is used for encoding a text to be sent to a process.  */);
10597   Vdefault_process_coding_system = Qnil;
10598
10599   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10600                doc: /*
10601 Table of extra Latin codes in the range 128..159 (inclusive).
10602 This is a vector of length 256.
10603 If Nth element is non-nil, the existence of code N in a file
10604 \(or output of subprocess) doesn't prevent it to be detected as
10605 a coding system of ISO 2022 variant which has a flag
10606 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10607 or reading output of a subprocess.
10608 Only 128th through 159th elements have a meaning.  */);
10609   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10610
10611   DEFVAR_LISP ("select-safe-coding-system-function",
10612                Vselect_safe_coding_system_function,
10613                doc: /*
10614 Function to call to select safe coding system for encoding a text.
10615
10616 If set, this function is called to force a user to select a proper
10617 coding system which can encode the text in the case that a default
10618 coding system used in each operation can't encode the text.  The
10619 function should take care that the buffer is not modified while
10620 the coding system is being selected.
10621
10622 The default value is `select-safe-coding-system' (which see).  */);
10623   Vselect_safe_coding_system_function = Qnil;
10624
10625   DEFVAR_BOOL ("coding-system-require-warning",
10626                coding_system_require_warning,
10627                doc: /* Internal use only.
10628 If non-nil, on writing a file, `select-safe-coding-system-function' is
10629 called even if `coding-system-for-write' is non-nil.  The command
10630 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10631   coding_system_require_warning = 0;
10632
10633
10634   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10635                inhibit_iso_escape_detection,
10636                doc: /*
10637 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10638
10639 When Emacs reads text, it tries to detect how the text is encoded.
10640 This code detection is sensitive to escape sequences.  If Emacs sees
10641 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10642 of the ISO2022 encodings, and decodes text by the corresponding coding
10643 system (e.g. `iso-2022-7bit').
10644
10645 However, there may be a case that you want to read escape sequences in
10646 a file as is.  In such a case, you can set this variable to non-nil.
10647 Then the code detection will ignore any escape sequences, and no text is
10648 detected as encoded in some ISO-2022 encoding.  The result is that all
10649 escape sequences become visible in a buffer.
10650
10651 The default value is nil, and it is strongly recommended not to change
10652 it.  That is because many Emacs Lisp source files that contain
10653 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10654 in Emacs's distribution, and they won't be decoded correctly on
10655 reading if you suppress escape sequence detection.
10656
10657 The other way to read escape sequences in a file without decoding is
10658 to explicitly specify some coding system that doesn't use ISO-2022
10659 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10660   inhibit_iso_escape_detection = 0;
10661
10662   DEFVAR_BOOL ("inhibit-null-byte-detection",
10663                inhibit_null_byte_detection,
10664                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10665 By default, Emacs treats it as binary data, and does not attempt to
10666 decode it.  The effect is as if you specified `no-conversion' for
10667 reading that text.
10668
10669 Set this to non-nil when a regular text happens to include null bytes.
10670 Examples are Index nodes of Info files and null-byte delimited output
10671 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10672 decode text as usual.  */);
10673   inhibit_null_byte_detection = 0;
10674
10675   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10676                doc: /* Char table for translating self-inserting characters.
10677 This is applied to the result of input methods, not their input.
10678 See also `keyboard-translate-table'.
10679
10680 Use of this variable for character code unification was rendered
10681 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10682 internal character representation.  */);
10683     Vtranslation_table_for_input = Qnil;
10684
10685   {
10686     Lisp_Object args[coding_arg_max];
10687     Lisp_Object plist[16];
10688     int i;
10689
10690     for (i = 0; i < coding_arg_max; i++)
10691       args[i] = Qnil;
10692
10693     plist[0] = intern_c_string (":name");
10694     plist[1] = args[coding_arg_name] = Qno_conversion;
10695     plist[2] = intern_c_string (":mnemonic");
10696     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10697     plist[4] = intern_c_string (":coding-type");
10698     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10699     plist[6] = intern_c_string (":ascii-compatible-p");
10700     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10701     plist[8] = intern_c_string (":default-char");
10702     plist[9] = args[coding_arg_default_char] = make_number (0);
10703     plist[10] = intern_c_string (":for-unibyte");
10704     plist[11] = args[coding_arg_for_unibyte] = Qt;
10705     plist[12] = intern_c_string (":docstring");
10706     plist[13] = build_pure_c_string ("Do no conversion.\n\
10707 \n\
10708 When you visit a file with this coding, the file is read into a\n\
10709 unibyte buffer as is, thus each byte of a file is treated as a\n\
10710 character.");
10711     plist[14] = intern_c_string (":eol-type");
10712     plist[15] = args[coding_arg_eol_type] = Qunix;
10713     args[coding_arg_plist] = Flist (16, plist);
10714     Fdefine_coding_system_internal (coding_arg_max, args);
10715
10716     plist[1] = args[coding_arg_name] = Qundecided;
10717     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10718     plist[5] = args[coding_arg_coding_type] = Qundecided;
10719     /* This is already set.
10720        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10721     plist[8] = intern_c_string (":charset-list");
10722     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10723     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10724     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10725     plist[15] = args[coding_arg_eol_type] = Qnil;
10726     args[coding_arg_plist] = Flist (16, plist);
10727     Fdefine_coding_system_internal (coding_arg_max, args);
10728   }
10729
10730   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10731
10732   {
10733     int i;
10734
10735     for (i = 0; i < coding_category_max; i++)
10736       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10737   }
10738 #if defined (DOS_NT)
10739   system_eol_type = Qdos;
10740 #else
10741   system_eol_type = Qunix;
10742 #endif
10743   staticpro (&system_eol_type);
10744 }
10745
10746 char *
10747 emacs_strerror (int error_number)
10748 {
10749   char *str;
10750
10751   synchronize_system_messages_locale ();
10752   str = strerror (error_number);
10753
10754   if (! NILP (Vlocale_coding_system))
10755     {
10756       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10757                                                       Vlocale_coding_system,
10758                                                       0);
10759       str = SSDATA (dec);
10760     }
10761
10762   return str;
10763 }
10764
10765 #endif /* emacs */