src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* A non-conforming byte rejects XXX; a strongly suggestive byte is remembered in FOUND.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source text, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216   /* Decode until the source runs out or CHARBUF is full.  */
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends with a partial byte sequence of a character,
 231        treat those bytes as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* Stop at ADJUSTED_DST_END so that one iteration cannot write past DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "character.h"
 292 #include "buffer.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302 /* Lisp_Object variables with Q/QC prefixes; presumably Lisp symbols -- confirm where they are initialized (not in this chunk).  */
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* End-of-line format chosen by the system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding systems emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346 /* NOTE(review): the next comment says `three variables', but only safe_terminal_coding follows in this section -- confirm.  */
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365 /* Integer value of element REG of CODING's coding_attr_iso_initial attribute.  */
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370 /* The entry for CHARSET_ID in CODING's safe_charsets table, or -1 when
 371    CHARSET_ID exceeds max_charset_id or the entry is 255.  */
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378 /* Accessors for the ISO2022 state stored in CODING->spec.iso_2022.  */
 379
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overrides
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480 /* NOTE(review): the flags below carry no description here; check their uses before relying on a meaning.  */
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section: accessor for the utf_8_bom member of CODING->spec.  */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section: each macro reads one element of the attribute vector CODING_ID_ATTRS yields for CODING->id; CODING_CCL_VALIDS applies SDATA to it.  */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories'.  The values
 519    are also used as bit positions by the CATEGORY_MASK_XXX flags.  */
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is the bit numbered by the matching `enum coding_category' value.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  It is the union of all the masks above
 570    except CATEGORY_MASK_RAW_TEXT.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592 /* Unions of related category masks.  */
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding categories ordered by priority; it has coding_category_max elements.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637 /* min and max are defined only when not already provided; each expands one of its arguments twice, so avoid side effects in A and B.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of ATTRS.  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed to by SRC, which
 653    ends at SRC_END, set C to it, and increment consumed_chars.  If the
 654    source is exhausted, jump to `no_more_source' (first recording
 655    CODING_RESULT_INSUFFICIENT_SRC if src_base < src).  If multibytep is
 656    nonzero and the byte has bit 0x80 set: a 0xC0/0xC1 lead byte is merged
 657    with the next byte into C; otherwise C becomes the negated character
 658    code from string_char and CODING_RESULT_INVALID_SRC is recorded.
 659    Callers provide: src, src_end, src_base, multibytep, coding, consumed_chars.  */
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed to by SRC, which
 686    ends at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, jump to `no_more_source'.  If multibytep is nonzero and a
 689    multibyte character is found for C2, set C2 to -1 (the character
 690    code itself is not computed).  The caller should declare and set
 691    these variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed to by DST, increment DST to the
 726    next free point, and increment produced_chars.  The caller should
 727    ensure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 then C2, and add 2 to produced_chars.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed to by DST, increment DST to the
 749    next free point, and increment produced_chars.  If multibytep is
 750    nonzero, store C in multibyte form: a value >= 0x80 is first mapped
 751    with BYTE8_TO_CHAR, then written by CHAR_STRING_ADVANCE.  The caller
 752    should declare and set `dst', `multibytep' and `produced_chars'.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
 809 /* Prototypes for static functions.  */
 810 static void record_conversion_result (struct coding_system *coding,
 811                                       enum coding_result_code result);
 812 static int detect_coding_utf_8 (struct coding_system *,
 813                                 struct coding_detection_info *info);
 814 static void decode_coding_utf_8 (struct coding_system *);
 815 static int encode_coding_utf_8 (struct coding_system *);
 816
 817 static int detect_coding_utf_16 (struct coding_system *,
 818                                  struct coding_detection_info *info);
 819 static void decode_coding_utf_16 (struct coding_system *);
 820 static int encode_coding_utf_16 (struct coding_system *);
 821
 822 static int detect_coding_iso_2022 (struct coding_system *,
 823                                    struct coding_detection_info *info);
 824 static void decode_coding_iso_2022 (struct coding_system *);
 825 static int encode_coding_iso_2022 (struct coding_system *);
 826
 827 static int detect_coding_emacs_mule (struct coding_system *,
 828                                      struct coding_detection_info *info);
 829 static void decode_coding_emacs_mule (struct coding_system *);
 830 static int encode_coding_emacs_mule (struct coding_system *);
 831
 832 static int detect_coding_sjis (struct coding_system *,
 833                                struct coding_detection_info *info);
 834 static void decode_coding_sjis (struct coding_system *);
 835 static int encode_coding_sjis (struct coding_system *);
 836
 837 static int detect_coding_big5 (struct coding_system *,
 838                                struct coding_detection_info *info);
 839 static void decode_coding_big5 (struct coding_system *);
 840 static int encode_coding_big5 (struct coding_system *);
 841
 842 static int detect_coding_ccl (struct coding_system *,
 843                               struct coding_detection_info *info);
 844 static void decode_coding_ccl (struct coding_system *);
 845 static int encode_coding_ccl (struct coding_system *);
 846
 847 static void decode_coding_raw_text (struct coding_system *);
 848 static int encode_coding_raw_text (struct coding_system *);
 849
 850 static void coding_set_source (struct coding_system *);
 851 static ptrdiff_t coding_change_source (struct coding_system *);
 852 static void coding_set_destination (struct coding_system *);
 853 static ptrdiff_t coding_change_destination (struct coding_system *);
 854 static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
 855 static void coding_alloc_by_making_gap (struct coding_system *,
 856                                         ptrdiff_t, ptrdiff_t);
 857 static unsigned char *alloc_destination (struct coding_system *,
 858                                          ptrdiff_t, unsigned char *);
 859 static void setup_iso_safe_charsets (Lisp_Object);
 860 static ptrdiff_t encode_designation_at_bol (struct coding_system *,
 861                                       int *, int *, unsigned char *);
 862 static int detect_eol (const unsigned char *,
 863                        ptrdiff_t, enum coding_category);
 864 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 865 static void decode_eol (struct coding_system *);
 866 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 867 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 868 static int produce_chars (struct coding_system *, Lisp_Object, int);
 869 static inline void produce_charset (struct coding_system *, int *,
 870                                     ptrdiff_t);
 871 static void produce_annotation (struct coding_system *, ptrdiff_t);
 872 static int decode_coding (struct coding_system *);
 873 static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
 874                                                   struct coding_system *,
 875                                                   int *, ptrdiff_t *);
 876 static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
 877                                               struct coding_system *,
 878                                               int *, ptrdiff_t *);
 879 static void consume_chars (struct coding_system *, Lisp_Object, int);
 880 static int encode_coding (struct coding_system *);
 881 static Lisp_Object make_conversion_work_buffer (int);
 882 static Lisp_Object code_conversion_restore (Lisp_Object);
 883 static inline int char_encodable_p (int, Lisp_Object);
 884 static Lisp_Object make_subsidiaries (Lisp_Object);
 885
 886 static void
 887 record_conversion_result (struct coding_system *coding,
 888                           enum coding_result_code result)
 889 {
 890   coding->result = result;
 891   switch (result)
 892     {
 893     case CODING_RESULT_INSUFFICIENT_SRC:
 894       Vlast_code_conversion_error = Qinsufficient_source;
 895       break;
 896     case CODING_RESULT_INCONSISTENT_EOL:
 897       Vlast_code_conversion_error = Qinconsistent_eol;
 898       break;
 899     case CODING_RESULT_INVALID_SRC:
 900       Vlast_code_conversion_error = Qinvalid_source;
 901       break;
 902     case CODING_RESULT_INTERRUPT:
 903       Vlast_code_conversion_error = Qinterrupted;
 904       break;
 905     case CODING_RESULT_INSUFFICIENT_MEM:
 906       Vlast_code_conversion_error = Qinsufficient_memory;
 907       break;
 908     case CODING_RESULT_INSUFFICIENT_DST:
 909       /* Don't record this error in Vlast_code_conversion_error
 910          because it happens just temporarily and is resolved when the
 911          whole conversion is finished.  */
 912       break;
 913     case CODING_RESULT_SUCCESS:
 914       break;
 915     default:
 916       Vlast_code_conversion_error = intern ("Unknown error");
 917     }
 918 }
 919
 920 /* These wrapper macros are used to preserve validity of pointers into
 921    buffer text across calls to decode_char, encode_char, etc, which
 922    could cause relocation of buffers if it loads a charset map,
 923    because loading a charset map allocates large structures.  */
 924
 925 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 926   do {                                                                       \
 927     ptrdiff_t offset;                                                        \
 928                                                                              \
 929     charset_map_loaded = 0;                                                  \
 930     c = DECODE_CHAR (charset, code);                                         \
 931     if (charset_map_loaded                                                   \
 932         && (offset = coding_change_source (coding)))                         \
 933       {                                                                      \
 934         src += offset;                                                       \
 935         src_base += offset;                                                  \
 936         src_end += offset;                                                   \
 937       }                                                                      \
 938   } while (0)
 939
 940 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 941   do {                                                                  \
 942     ptrdiff_t offset;                                                   \
 943                                                                         \
 944     charset_map_loaded = 0;                                             \
 945     code = ENCODE_CHAR (charset, c);                                    \
 946     if (charset_map_loaded                                              \
 947         && (offset = coding_change_destination (coding)))               \
 948       {                                                                 \
 949         dst += offset;                                                  \
 950         dst_end += offset;                                              \
 951       }                                                                 \
 952   } while (0)
 953
 954 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 955   do {                                                                  \
 956     ptrdiff_t offset;                                                   \
 957                                                                         \
 958     charset_map_loaded = 0;                                             \
 959     charset = char_charset (c, charset_list, code_return);              \
 960     if (charset_map_loaded                                              \
 961         && (offset = coding_change_destination (coding)))               \
 962       {                                                                 \
 963         dst += offset;                                                  \
 964         dst_end += offset;                                              \
 965       }                                                                 \
 966   } while (0)
 967
 968 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 969   do {                                                                  \
 970     ptrdiff_t offset;                                                   \
 971                                                                         \
 972     charset_map_loaded = 0;                                             \
 973     result = CHAR_CHARSET_P (c, charset);                               \
 974     if (charset_map_loaded                                              \
 975         && (offset = coding_change_destination (coding)))               \
 976       {                                                                 \
 977         dst += offset;                                                  \
 978         dst_end += offset;                                              \
 979       }                                                                 \
 980   } while (0)
 981
 982
 983 /* If there are at least BYTES length of room at dst, allocate memory
 984    for coding->destination and update dst and dst_end.  We don't have
 985    to take care of coding->source which will be relocated.  It is
 986    handled by calling coding_set_source in encode_coding.  */
 987
 988 #define ASSURE_DESTINATION(bytes)                               \
 989   do {                                                          \
 990     if (dst + (bytes) >= dst_end)                               \
 991       {                                                         \
 992         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 993                                                                 \
 994         dst = alloc_destination (coding, more_bytes, dst);      \
 995         dst_end = coding->destination + coding->dst_bytes;      \
 996       }                                                         \
 997   } while (0)
 998
 999
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The length is chosen by comparing C with MAX_1_BYTE_CHAR ...
   MAX_5_BYTE_CHAR; a value above MAX_5_BYTE_CHAR is written by
   BYTE8_STRING after subtracting 0x3FFF80.

   Both C and P are evaluated several times, so neither may have side
   effects.  Every use of C is parenthesized, so an argument such as
   `a | b' is encoded as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1029
1030
/* Yield the character code of the multibyte form that starts at P,
   and leave P just past that form.  This is like STRING_CHAR_ADVANCE,
   but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is taken from the leading byte alone: bit
   0x80 clear means one byte, then bits 0x20, 0x10 and 0x08 clear mean
   two, three and four bytes respectively; anything else is read as a
   five-byte form.  A two-byte form whose leading byte is below 0xC2
   has 0x3FFF80 or'ed into its value.

   P is evaluated several times, so it must not have side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  (((p)[0] & 0x80) == 0                                                 \
   ? *(p)++                                                             \
   : ((p)[0] & 0x20) == 0                                               \
   ? ((p) += 2,                                                         \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)                \
       | (((p)[-2] & 0x1F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x10) == 0                                               \
   ? ((p) += 3,                                                         \
      ((((p)[-3] & 0x0F) << 12)                                         \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x08) == 0                                               \
   ? ((p) += 4,                                                         \
      ((((p)[-4] & 0xF) << 18)                                          \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p) += 5,                                                         \
      ((((p)[-4] & 0x3F) << 18)                                         \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F))))
1059
1060
1061 /* Set coding->source from coding->src_object.  */
1062
1063 static void
1064 coding_set_source (struct coding_system *coding)
1065 {
1066   if (BUFFERP (coding->src_object))
1067     {
1068       struct buffer *buf = XBUFFER (coding->src_object);
1069
1070       if (coding->src_pos < 0)
1071         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1072       else
1073         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1074     }
1075   else if (STRINGP (coding->src_object))
1076     {
1077       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1078     }
1079   else
1080     {
1081       /* Otherwise, the source is C string and is never relocated
1082          automatically.  Thus we don't have to update anything.  */
1083     }
1084 }
1085
1086
1087 /* Set coding->source from coding->src_object, and return how many
1088    bytes coding->source was changed.  */
1089
1090 static ptrdiff_t
1091 coding_change_source (struct coding_system *coding)
1092 {
1093   const unsigned char *orig = coding->source;
1094   coding_set_source (coding);
1095   return coding->source - orig;
1096 }
1097
1098
1099 /* Set coding->destination from coding->dst_object.  */
1100
1101 static void
1102 coding_set_destination (struct coding_system *coding)
1103 {
1104   if (BUFFERP (coding->dst_object))
1105     {
1106       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1107         {
1108           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1109           coding->dst_bytes = (GAP_END_ADDR
1110                                - (coding->src_bytes - coding->consumed)
1111                                - coding->destination);
1112         }
1113       else
1114         {
1115           /* We are sure that coding->dst_pos_byte is before the gap
1116              of the buffer. */
1117           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1118                                  + coding->dst_pos_byte - BEG_BYTE);
1119           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1120                                - coding->destination);
1121         }
1122     }
1123   else
1124     {
1125       /* Otherwise, the destination is C string and is never relocated
1126          automatically.  Thus we don't have to update anything.  */
1127     }
1128 }
1129
1130
1131 /* Set coding->destination from coding->dst_object, and return how
1132    many bytes coding->destination was changed.  */
1133
1134 static ptrdiff_t
1135 coding_change_destination (struct coding_system *coding)
1136 {
1137   const unsigned char *orig = coding->destination;
1138   coding_set_destination (coding);
1139   return coding->destination - orig;
1140 }
1141
1142
1143 static void
1144 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1145 {
1146   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1147     string_overflow ();
1148   coding->destination = xrealloc (coding->destination,
1149                                   coding->dst_bytes + bytes);
1150   coding->dst_bytes += bytes;
1151 }
1152
1153 static void
1154 coding_alloc_by_making_gap (struct coding_system *coding,
1155                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1156 {
1157   if (EQ (coding->src_object, coding->dst_object))
1158     {
1159       /* The gap may contain the produced data at the head and not-yet
1160          consumed data at the tail.  To preserve those data, we at
1161          first make the gap size to zero, then increase the gap
1162          size.  */
1163       ptrdiff_t add = GAP_SIZE;
1164
1165       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1166       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1167       make_gap (bytes);
1168       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1169       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1170     }
1171   else
1172     {
1173       Lisp_Object this_buffer;
1174
1175       this_buffer = Fcurrent_buffer ();
1176       set_buffer_internal (XBUFFER (coding->dst_object));
1177       make_gap (bytes);
1178       set_buffer_internal (XBUFFER (this_buffer));
1179     }
1180 }
1181
1182
1183 static unsigned char *
1184 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1185                    unsigned char *dst)
1186 {
1187   ptrdiff_t offset = dst - coding->destination;
1188
1189   if (BUFFERP (coding->dst_object))
1190     {
1191       struct buffer *buf = XBUFFER (coding->dst_object);
1192
1193       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1194     }
1195   else
1196     coding_alloc_by_realloc (coding, nbytes);
1197   coding_set_destination (coding);
1198   dst = coding->destination + offset;
1199   return dst;
1200 }
1201
1202 /** Macros for annotations.  */
1203
1204 /* An annotation data is stored in the array coding->charbuf in this
1205    format:
1206      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1207    LENGTH is the number of elements in the annotation.
1208    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1209    NCHARS is the number of characters in the text annotated.
1210
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1215      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1216
1217    NBYTES is the number of bytes specified in the header part of
1218    old-style emacs-mule encoding, or 0 for the other kind of
1219    composition.
1220
1221    METHOD is one of enum composition_method.
1222
1223    Optional COMPOSITION-COMPONENTS are characters and composition
1224    rules.
1225
1226    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1227    follows.
1228
1229    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1230    recover from an invalid annotation, and should be skipped by
1231    produce_annotation.  */
1232
/* Maximum length of the header of annotation data.  The longest
   header is the one written by ADD_COMPOSITION_DATA, which has five
   elements: -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, namely
   -LEN, MASK and NCHARS, advance BUF past them, and set
   coding->annotated.  The caller must have the variable `coding' in
   scope.  BUF is evaluated three times and must be a modifiable
   pointer lvalue.

   The expansion deliberately does not end in a semicolon, so that
   `if (x) ADD_ANNOTATION_DATA (...); else ...' parses as intended.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1243
/* Store a composition annotation at BUF and advance BUF past it.  The
   annotation has five elements: the common header written by
   ADD_ANNOTATION_DATA (length 5, CODING_ANNOTATE_COMPOSITION_MASK and
   NCHARS), followed by NBYTES and METHOD.  Every argument is
   parenthesized where it is used directly, so BUF may be any
   modifiable pointer lvalue expression such as `*handle'.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1250
1251
/* Store a charset annotation at BUF and advance BUF past it.  The
   annotation has four elements: the common header written by
   ADD_ANNOTATION_DATA (length 4, CODING_ANNOTATE_CHARSET_MASK and
   NCHARS), followed by the charset ID.  Every argument is
   parenthesized where it is used directly, so BUF may be any
   modifiable pointer lvalue expression such as `*handle'.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1257
1258 \f
1259 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1260
1261
1262
1263 \f
1264 /*** 3. UTF-8 ***/
1265
1266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1267    Check if a text is encoded in UTF-8.  If it is, return 1, else
1268    return 0.  */
1269
/* Classification of a byte C by its high bits.  These predicates only
   mask and compare; none of them checks that C is within 0..255.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  Note that a
   negative C also satisfies this test.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the bit pattern 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the bit pattern 110xxxxx (leads a two-octet sequence).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the bit pattern 1110xxxx (leads a three-octet sequence).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the bit pattern 11110xxx (leads a four-octet sequence).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the bit pattern 111110xx (leads a five-octet sequence).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (the encoding of U+FEFF),
   in the order in which they appear.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1280
/* Check whether the source text of CODING is well-formed as a
   sequence of UTF-8 style multi-octet forms, and record the verdict
   in DETECT_INFO.

   Scanning starts after the first coding->head_ascii bytes.  Each
   sequence must consist of a leading octet followed by the matching
   number (one to four) of trailing octets; only the bit patterns are
   checked, not the decoded values.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is seen, or if the source ends in the
   middle of a sequence while CODING_MODE_LAST_BLOCK is set.
   Otherwise return 1.  In that case, if the very first sequence of
   the source is the signature EF BB BF, both the with-signature and
   the without-signature categories are marked as found; if not, the
   with-signature category is rejected and the without-signature
   category is marked as found only when at least one multi-octet
   sequence was seen.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  /* NOTE(review): src_end, multibytep and consumed_chars are not
     referenced directly below; presumably ONE_MORE_BYTE (defined
     earlier in this file) uses them -- confirm.  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  /* Nonzero once the signature has been seen at the start.  */
  int bom_found = 0;
  /* Nonzero once a complete multi-octet sequence has been seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Each iteration examines one sequence.  `continue' accepts it,
     `break' rejects the whole text.  The code after the loop label
     no_more_source is not reached by falling through; presumably
     ONE_MORE_BYTE jumps there when the source is exhausted.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very beginning of the source can
             be the signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a leading octet of any supported length.  */
      break;
    }
  /* The loop was left by `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means that the source ended inside a sequence.
     That is an error only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (the octets EF BB BF) doesn't necessarily
         mean a signature, so the category without signature stays a
         candidate too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1363
1364
/* Decode the not yet consumed part of the source of CODING (from
   coding->source + coding->consumed up to coding->src_bytes) as
   UTF-8, appending one character code per decoded sequence to
   coding->charbuf.

   Decoding stops when the charbuf is full or when the source is
   exhausted.  On return coding->consumed, coding->consumed_char and
   coding->charbuf_used describe the progress made; a sequence that
   was only partly available is left unconsumed.

   A byte that does not start a valid sequence is stored alone (as is
   if ASCII, otherwise mapped with BYTE8_TO_CHAR) and counted in
   coding->errors.  Overlong forms, surrogates (0xD800..0xDFFF) and
   values above MAX_CHAR are treated as invalid.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence being decoded; the source is consumed only
     up to here when decoding stops.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  /* NOTE(review): consumed_chars is only copied and restored below;
     presumably ONE_MORE_BYTE (defined earlier in this file)
     increments it, and also uses src_end and multibytep -- confirm.  */
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte fetched right after a CR when EOL_DOS is set and not yet
     processed, or -1 if there is none.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to have no signature, skip the
     octets EF BB BF if the text starts with them.  Whenever the first
     three bytes are anything else, SRC is rewound to SRC_BASE so that
     they are decoded normally.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* The signature check is done at most once per coding system: later
     calls see utf_without_bom and skip the block above.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before this point has been decoded completely.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  A byte already fetched after a CR has not
             been stored yet; step SRC_BASE back so that it counts as
             not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* NOTE(review): presumably ONE_MORE_BYTE yields the negated
             code of a character found in a multibyte source;
             confirm against its definition.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-lines a CR is stored only once the byte
             after it is available -- presumably so that a CR LF pair
             is never split between two calls; TODO confirm.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* A five-octet form is accepted only for
                             values from 0x200000 up to MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the sequence and store just one byte
         of it; decoding resumes with the byte that follows.
         NOTE(review): when c1 was taken from byte_after_cr, SRC_BASE
         was set after that byte had been fetched, so this rewind
         re-reads the byte following c1 rather than c1 itself --
         confirm whether an invalid byte right after a CR can be
         dropped here.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress up to the start of the last sequence that was not
     decoded completely.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1514
1515
1516 static int
1517 encode_coding_utf_8 (struct coding_system *coding)
1518 {
1519   int multibytep = coding->dst_multibyte;
1520   int *charbuf = coding->charbuf;
1521   int *charbuf_end = charbuf + coding->charbuf_used;
1522   unsigned char *dst = coding->destination + coding->produced;
1523   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1524   ptrdiff_t produced_chars = 0;
1525   int c;
1526
1527   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1528     {
1529       ASSURE_DESTINATION (3);
1530       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1531       CODING_UTF_8_BOM (coding) = utf_without_bom;
1532     }
1533
1534   if (multibytep)
1535     {
1536       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1537
1538       while (charbuf < charbuf_end)
1539         {
1540           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1541
1542           ASSURE_DESTINATION (safe_room);
1543           c = *charbuf++;
1544           if (CHAR_BYTE8_P (c))
1545             {
1546               c = CHAR_TO_BYTE8 (c);
1547               EMIT_ONE_BYTE (c);
1548             }
1549           else
1550             {
1551               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1552               for (p = str; p < pend; p++)
1553                 EMIT_ONE_BYTE (*p);
1554             }
1555         }
1556     }
1557   else
1558     {
1559       int safe_room = MAX_MULTIBYTE_LENGTH;
1560
1561       while (charbuf < charbuf_end)
1562         {
1563           ASSURE_DESTINATION (safe_room);
1564           c = *charbuf++;
1565           if (CHAR_BYTE8_P (c))
1566             *dst++ = CHAR_TO_BYTE8 (c);
1567           else
1568             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1569           produced_chars++;
1570         }
1571     }
1572   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1573   coding->produced_char += produced_chars;
1574   coding->produced = dst - coding->destination;
1575   return 0;
1576 }
1577
1578
1579 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1580    Check if a text is encoded in one of UTF-16 based coding systems.
1581    If it is, return 1, else return 0.  */
1582
/* Nonzero if bits 10..15 of VAL are those of 0xD800, i.e. VAL is a
   UTF-16 high (leading) surrogate: 0xD800..0xDBFF for a 16-bit VAL.
   VAL is evaluated once.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (0xD800 == (0xFC00 & (val)))

/* Nonzero if bits 10..15 of VAL are those of 0xDC00, i.e. VAL is a
   UTF-16 low (trailing) surrogate: 0xDC00..0xDFFF for a 16-bit VAL.
   VAL is evaluated once.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (0xDC00 == (0xFC00 & (val)))
1588
1589
1590 static int
1591 detect_coding_utf_16 (struct coding_system *coding,
1592                       struct coding_detection_info *detect_info)
1593 {
1594   const unsigned char *src = coding->source;
1595   const unsigned char *src_end = coding->source + coding->src_bytes;
1596   int multibytep = coding->src_multibyte;
1597   int c1, c2;
1598
1599   detect_info->checked |= CATEGORY_MASK_UTF_16;
1600   if (coding->mode & CODING_MODE_LAST_BLOCK
1601       && (coding->src_chars & 1))
1602     {
1603       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1604       return 0;
1605     }
1606
1607   TWO_MORE_BYTES (c1, c2);
1608   if ((c1 == 0xFF) && (c2 == 0xFE))
1609     {
1610       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1611                              | CATEGORY_MASK_UTF_16_AUTO);
1612       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1613                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1614                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1615     }
1616   else if ((c1 == 0xFE) && (c2 == 0xFF))
1617     {
1618       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1619                              | CATEGORY_MASK_UTF_16_AUTO);
1620       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1621                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1622                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1623     }
1624   else if (c2 < 0)
1625     {
1626       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1627       return 0;
1628     }
1629   else
1630     {
1631       /* We check the dispersion of Eth and Oth bytes where E is even and
1632          O is odd.  If both are high, we assume binary data.*/
1633       unsigned char e[256], o[256];
1634       unsigned e_num = 1, o_num = 1;
1635
1636       memset (e, 0, 256);
1637       memset (o, 0, 256);
1638       e[c1] = 1;
1639       o[c2] = 1;
1640
1641       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1642                                 |CATEGORY_MASK_UTF_16_BE
1643                                 | CATEGORY_MASK_UTF_16_LE);
1644
1645       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1646              != CATEGORY_MASK_UTF_16)
1647         {
1648           TWO_MORE_BYTES (c1, c2);
1649           if (c2 < 0)
1650             break;
1651           if (! e[c1])
1652             {
1653               e[c1] = 1;
1654               e_num++;
1655               if (e_num >= 128)
1656                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1657             }
1658           if (! o[c2])
1659             {
1660               o[c2] = 1;
1661               o_num++;
1662               if (o_num >= 128)
1663                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1664             }
1665         }
1666       return 0;
1667     }
1668
1669  no_more_source:
1670   return 1;
1671 }
1672
1673 static void
1674 decode_coding_utf_16 (struct coding_system *coding)
1675 {
1676   const unsigned char *src = coding->source + coding->consumed;
1677   const unsigned char *src_end = coding->source + coding->src_bytes;
1678   const unsigned char *src_base;
1679   int *charbuf = coding->charbuf + coding->charbuf_used;
1680   /* We may produces at most 3 chars in one loop.  */
1681   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1682   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1683   int multibytep = coding->src_multibyte;
1684   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1686   int surrogate = CODING_UTF_16_SURROGATE (coding);
1687   int eol_dos =
1688     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1689   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1690
1691   if (bom == utf_with_bom)
1692     {
1693       int c, c1, c2;
1694
1695       src_base = src;
1696       ONE_MORE_BYTE (c1);
1697       ONE_MORE_BYTE (c2);
1698       c = (c1 << 8) | c2;
1699
1700       if (endian == utf_16_big_endian
1701           ? c != 0xFEFF : c != 0xFFFE)
1702         {
1703           /* The first two bytes are not BOM.  Treat them as bytes
1704              for a normal character.  */
1705           src = src_base;
1706           coding->errors++;
1707         }
1708       CODING_UTF_16_BOM (coding) = utf_without_bom;
1709     }
1710   else if (bom == utf_detect_bom)
1711     {
1712       /* We have already tried to detect BOM and failed in
1713          detect_coding.  */
1714       CODING_UTF_16_BOM (coding) = utf_without_bom;
1715     }
1716
1717   while (1)
1718     {
1719       int c, c1, c2;
1720
1721       src_base = src;
1722       consumed_chars_base = consumed_chars;
1723
1724       if (charbuf >= charbuf_end)
1725         {
1726           if (byte_after_cr1 >= 0)
1727             src_base -= 2;
1728           break;
1729         }
1730
1731       if (byte_after_cr1 >= 0)
1732         c1 = byte_after_cr1, byte_after_cr1 = -1;
1733       else
1734         ONE_MORE_BYTE (c1);
1735       if (c1 < 0)
1736         {
1737           *charbuf++ = -c1;
1738           continue;
1739         }
1740       if (byte_after_cr2 >= 0)
1741         c2 = byte_after_cr2, byte_after_cr2 = -1;
1742       else
1743         ONE_MORE_BYTE (c2);
1744       if (c2 < 0)
1745         {
1746           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1747           *charbuf++ = -c2;
1748           continue;
1749         }
1750       c = (endian == utf_16_big_endian
1751            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1752
1753       if (surrogate)
1754         {
1755           if (! UTF_16_LOW_SURROGATE_P (c))
1756             {
1757               if (endian == utf_16_big_endian)
1758                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1759               else
1760                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1761               *charbuf++ = c1;
1762               *charbuf++ = c2;
1763               coding->errors++;
1764               if (UTF_16_HIGH_SURROGATE_P (c))
1765                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1766               else
1767                 *charbuf++ = c;
1768             }
1769           else
1770             {
1771               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1772               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1773               *charbuf++ = 0x10000 + c;
1774             }
1775         }
1776       else
1777         {
1778           if (UTF_16_HIGH_SURROGATE_P (c))
1779             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1780           else
1781             {
1782               if (eol_dos && c == '\r')
1783                 {
1784                   ONE_MORE_BYTE (byte_after_cr1);
1785                   ONE_MORE_BYTE (byte_after_cr2);
1786                 }
1787               *charbuf++ = c;
1788             }
1789         }
1790     }
1791
1792  no_more_source:
1793   coding->consumed_char += consumed_chars_base;
1794   coding->consumed = src_base - coding->source;
1795   coding->charbuf_used = charbuf - coding->charbuf;
1796 }
1797
1798 static int
1799 encode_coding_utf_16 (struct coding_system *coding)
1800 {
1801   int multibytep = coding->dst_multibyte;
1802   int *charbuf = coding->charbuf;
1803   int *charbuf_end = charbuf + coding->charbuf_used;
1804   unsigned char *dst = coding->destination + coding->produced;
1805   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1806   int safe_room = 8;
1807   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1808   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1809   ptrdiff_t produced_chars = 0;
1810   int c;
1811
1812   if (bom != utf_without_bom)
1813     {
1814       ASSURE_DESTINATION (safe_room);
1815       if (big_endian)
1816         EMIT_TWO_BYTES (0xFE, 0xFF);
1817       else
1818         EMIT_TWO_BYTES (0xFF, 0xFE);
1819       CODING_UTF_16_BOM (coding) = utf_without_bom;
1820     }
1821
1822   while (charbuf < charbuf_end)
1823     {
1824       ASSURE_DESTINATION (safe_room);
1825       c = *charbuf++;
1826       if (c > MAX_UNICODE_CHAR)
1827         c = coding->default_char;
1828
1829       if (c < 0x10000)
1830         {
1831           if (big_endian)
1832             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1833           else
1834             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1835         }
1836       else
1837         {
1838           int c1, c2;
1839
1840           c -= 0x10000;
1841           c1 = (c >> 10) + 0xD800;
1842           c2 = (c & 0x3FF) + 0xDC00;
1843           if (big_endian)
1844             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1845           else
1846             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1847         }
1848     }
1849   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1850   coding->produced = dst - coding->destination;
1851   coding->produced_char += produced_chars;
1852   return 0;
1853 }
1854
1855 \f
1856 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1857
1858 /* Emacs' internal format for representation of multiple character
1859    sets is a kind of multi-byte encoding, i.e. characters are
1860    represented by variable-length sequences of one-byte codes.
1861
1862    ASCII characters and control characters (e.g. `tab', `newline') are
1863    represented by one-byte sequences which are their ASCII codes, in
1864    the range 0x00 through 0x7F.
1865
1866    8-bit characters of the range 0x80..0x9F are represented by
1867    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1868    code + 0x20).
1869
1870    8-bit characters of the range 0xA0..0xFF are represented by
1871    one-byte sequences which are their 8-bit code.
1872
1873    The other characters are represented by a sequence of `base
1874    leading-code', optional `extended leading-code', and one or two
1875    `position-code's.  The length of the sequence is determined by the
1876    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1877    whereas extended leading-code and position-code take the range 0xA0
1878    through 0xFF.  See `charset.h' for more details about leading-code
1879    and position-code.
1880
1881    --- CODE RANGE of Emacs' internal format ---
1882    character set        range
1883    -------------        -----
1884    ascii                0x00..0x7F
1885    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1886    eight-bit-graphic    0xA0..0xFF
1887    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1888    ---------------------------------------------
1889
1890    As this is the internal character representation, the format is
1891    usually not used externally (i.e. in a file or in a data sent to a
1892    process).  But, it is possible to have a text externally in this
1893    format (i.e. by encoding by the coding system `emacs-mule').
1894
1895    In that case, a sequence of one-byte codes has a slightly different
1896    form.
1897
1898    At first, all characters in eight-bit-control are represented by
1899    one-byte sequences which are their 8-bit code.
1900
1901    Next, character composition data are represented by the byte
1902    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1903    where,
1904         METHOD is 0xF2 plus one of composition method (enum
1905         composition_method),
1906
1907         BYTES is 0xA0 plus a byte length of this composition data,
1908
1909         CHARS is 0xA0 plus a number of characters composed by this
1910         data,
1911
1912         COMPONENTs are characters of multibyte form or composition
1913         rules encoded by two-byte of ASCII codes.
1914
1915    In addition, for backward compatibility, the following formats are
1916    also recognized as composition data on decoding.
1917
1918    0x80 MSEQ ...
1919    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1920
1921    Here,
1922         MSEQ is a multibyte form but in these special format:
1923           ASCII: 0xA0 ASCII_CODE+0x80,
1924           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1925         RULE is a one byte code of the range 0xA0..0xF0 that
1926         represents a composition rule.
1927   */
1928
/* Table indexed by a byte value.  The emacs-mule detector and parser
   in this file treat emacs_mule_bytes[C] as the total length in bytes
   (1 to 4) of the emacs-mule sequence whose first byte is C.  The
   table is filled in outside this part of the file.  */
1929 char emacs_mule_bytes[256];
1930
1931
1932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1933    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1934    else return 0.  */
1935
1936 static int
1937 detect_coding_emacs_mule (struct coding_system *coding,
1938                           struct coding_detection_info *detect_info)
1939 {
1940   const unsigned char *src = coding->source, *src_base;
1941   const unsigned char *src_end = coding->source + coding->src_bytes;
1942   int multibytep = coding->src_multibyte;
1943   ptrdiff_t consumed_chars = 0;
1944   int c;
1945   int found = 0;
1946
1947   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1948   /* A coding system of this category is always ASCII compatible.  */
1949   src += coding->head_ascii;
1950
1951   while (1)
1952     {
1953       src_base = src;
1954       ONE_MORE_BYTE (c);
1955       if (c < 0)
1956         continue;
1957       if (c == 0x80)
1958         {
1959           /* Perhaps the start of composite character.  We simply skip
1960              it because analyzing it is too heavy for detecting.  But,
1961              at least, we check that the composite character
1962              constitutes of more than 4 bytes.  */
1963           const unsigned char *src_start;
1964
1965         repeat:
1966           src_start = src;
1967           do
1968             {
1969               ONE_MORE_BYTE (c);
1970             }
1971           while (c >= 0xA0);
1972
1973           if (src - src_start <= 4)
1974             break;
1975           found = CATEGORY_MASK_EMACS_MULE;
1976           if (c == 0x80)
1977             goto repeat;
1978         }
1979
1980       if (c < 0x80)
1981         {
1982           if (c < 0x20
1983               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1984             break;
1985         }
1986       else
1987         {
1988           int more_bytes = emacs_mule_bytes[c] - 1;
1989
1990           while (more_bytes > 0)
1991             {
1992               ONE_MORE_BYTE (c);
1993               if (c < 0xA0)
1994                 {
1995                   src--;        /* Unread the last byte.  */
1996                   break;
1997                 }
1998               more_bytes--;
1999             }
2000           if (more_bytes != 0)
2001             break;
2002           found = CATEGORY_MASK_EMACS_MULE;
2003         }
2004     }
2005   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2006   return 0;
2007
2008  no_more_source:
2009   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2010     {
2011       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2012       return 0;
2013     }
2014   detect_info->found |= found;
2015   return 1;
2016 }
2017
2018
2019 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2020    character.  If CMP_STATUS indicates that we must expect MSEQ or
2021    RULE described above, decode it and return the negative value of
2022    the decoded character or rule.  If an invalid byte is found, return
2023    -1.  If SRC is too short, return -2.  */
2024
2025 static int
2026 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2027                  int *nbytes, int *nchars, int *id,
2028                  struct composition_status *cmp_status)
2029 {
2030   const unsigned char *src_end = coding->source + coding->src_bytes;
2031   const unsigned char *src_base = src;
2032   int multibytep = coding->src_multibyte;
2033   int charset_ID;
2034   unsigned code;
2035   int c;
2036   int consumed_chars = 0;
2037   int mseq_found = 0;
2038
2039   ONE_MORE_BYTE (c);
2040   if (c < 0)
2041     {
2042       c = -c;
2043       charset_ID = emacs_mule_charset[0];
2044     }
2045   else
2046     {
2047       if (c >= 0xA0)
2048         {
2049           if (cmp_status->state != COMPOSING_NO
2050               && cmp_status->old_form)
2051             {
2052               if (cmp_status->state == COMPOSING_CHAR)
2053                 {
2054                   if (c == 0xA0)
2055                     {
2056                       ONE_MORE_BYTE (c);
2057                       c -= 0x80;
2058                       if (c < 0)
2059                         goto invalid_code;
2060                     }
2061                   else
2062                     c -= 0x20;
2063                   mseq_found = 1;
2064                 }
2065               else
2066                 {
2067                   *nbytes = src - src_base;
2068                   *nchars = consumed_chars;
2069                   return -c;
2070                 }
2071             }
2072           else
2073             goto invalid_code;
2074         }
2075
2076       switch (emacs_mule_bytes[c])
2077         {
2078         case 2:
2079           if ((charset_ID = emacs_mule_charset[c]) < 0)
2080             goto invalid_code;
2081           ONE_MORE_BYTE (c);
2082           if (c < 0xA0)
2083             goto invalid_code;
2084           code = c & 0x7F;
2085           break;
2086
2087         case 3:
2088           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2089               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2090             {
2091               ONE_MORE_BYTE (c);
2092               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2093                 goto invalid_code;
2094               ONE_MORE_BYTE (c);
2095               if (c < 0xA0)
2096                 goto invalid_code;
2097               code = c & 0x7F;
2098             }
2099           else
2100             {
2101               if ((charset_ID = emacs_mule_charset[c]) < 0)
2102                 goto invalid_code;
2103               ONE_MORE_BYTE (c);
2104               if (c < 0xA0)
2105                 goto invalid_code;
2106               code = (c & 0x7F) << 8;
2107               ONE_MORE_BYTE (c);
2108               if (c < 0xA0)
2109                 goto invalid_code;
2110               code |= c & 0x7F;
2111             }
2112           break;
2113
2114         case 4:
2115           ONE_MORE_BYTE (c);
2116           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2117             goto invalid_code;
2118           ONE_MORE_BYTE (c);
2119           if (c < 0xA0)
2120             goto invalid_code;
2121           code = (c & 0x7F) << 8;
2122           ONE_MORE_BYTE (c);
2123           if (c < 0xA0)
2124             goto invalid_code;
2125           code |= c & 0x7F;
2126           break;
2127
2128         case 1:
2129           code = c;
2130           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2131           break;
2132
2133         default:
2134           abort ();
2135         }
2136       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2137                           CHARSET_FROM_ID (charset_ID), code, c);
2138       if (c < 0)
2139         goto invalid_code;
2140     }
2141   *nbytes = src - src_base;
2142   *nchars = consumed_chars;
2143   if (id)
2144     *id = charset_ID;
2145   return (mseq_found ? -c : c);
2146
2147  no_more_source:
2148   return -2;
2149
2150  invalid_code:
2151   return -1;
2152 }
2153
2154
2155 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2156
2157 /* Handle these composition sequences ('|': the end of header elements,
2158    BYTES and CHARS >= 0xA0):
2159
2160    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2161    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2162    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2163
2164    and these old forms:
2165
2166    (4) relative composition: 0x80 | MSEQ ... MSEQ
2167    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2168
2169    When the starter 0x80 and the following header elements are found,
2170    this annotation header is produced.
2171
2172         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2173
2174    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2175    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2176
2177    Then, upon reading the following elements, these codes are produced
2178    until the composition end is found:
2179
2180    (1) CHAR ... CHAR
2181    (2) ALT ... ALT CHAR ... CHAR
2182    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2183    (4) CHAR ... CHAR
2184    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2185
2186    When the composition end is found, LENGTH and NCHARS in the
2187    annotation header is updated as below:
2188
2189    (1) LENGTH: unchanged, NCHARS: unchanged
2190    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2191    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2192    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2193    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2194
2195    If an error is found while composing, the annotation header is
2196    changed to the original composition header (plus filler -1s) as
2197    below:
2198
2199    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2200    (5)          [ 0x80 0xFF -1 -1- -1 ]
2201
2202    and the sequence [ -2 DECODED-RULE ] is changed to the original
2203    byte sequence as below:
2204         o the original byte sequence is B: [ B -1 ]
2205         o the original byte sequence is B1 B2: [ B1 B2 ]
2206
2207    Most of the routines are implemented by macros because many
2208    variables and labels in the caller decode_coding_emacs_mule must be
2209    accessible, and they are usually called just once (thus doesn't
2210    increase the size of compiled object).  */
2211
/* Decode the composition rule held in C, a component of an Emacs 20
   style composition sequence, and set RULE to the decoded rule.

   C is modified: 0xA0 is subtracted from it, and the result must lie
   in 0..80, otherwise control jumps to the caller's `invalid_code'
   label.  The two indices C / 9 and C % 9 are passed to
   COMPOSITION_ENCODE_RULE, with an index of 4 replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = (c / 9 == 4) ? 10 : c / 9;                   \
    nref = (c % 9 == 4) ? 10 : c % 9;                   \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2228
2229
/* Decode the composition rule made of C and the byte that follows it
   at SRC, a component of an Emacs 21 style composition sequence, and
   set RULE to the decoded rule.

   Each of the two bytes, less 0x20, must lie in 0..80, otherwise
   control jumps to the caller's `invalid_code' label.  The second
   byte is fetched with ONE_MORE_BYTE, so C is overwritten and the
   caller's source position advances.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2247
2248
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   0xA0 plus BYTES, the byte length of this composition information,
   and 0xA0 plus CHARS, the number of characters composed.

   BYTES must be at least 3, and exactly 4 for a relative
   composition; CHARS must be positive and below
   MAX_COMPOSITION_COMPONENTS.  Otherwise control jumps to the
   caller's `invalid_code' label.  On success CMP_STATUS is set up
   for the new composition and the annotation header is added to
   CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2280
2281
/* Start of Emacs 20 style format for relative composition.  Mark
   CMP_STATUS as an old-form relative composition that expects a
   character next, with both counters at zero, and add an annotation
   header with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2293
2294
/* Start of Emacs 20 style format for rule-base composition.  Mark
   CMP_STATUS as an old-form composition with rules that expects a
   character next, with both counters at zero, and add an annotation
   header with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2306
2307
/* Handle the byte that follows a composition starter.  Read it into C
   and dispatch on its value:

     0xF2 + a composition method  Emacs 21 style header
     0xA0..0xBF                   Emacs 20 style relative composition;
                                  the byte is a component, so SRC is
                                  moved back to read it again
     0xFF                         Emacs 20 style rule-base composition

   Any other value, including a negative one, makes control jump to
   the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      {                                                 \
        DECODE_EMACS_MULE_21_COMPOSITION ();            \
      }                                                 \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      {                                                 \
        DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();   \
      }                                                 \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2331
/* Finish the composition described by CMP_STATUS.  The annotation
   header sits CMP_STATUS->length entries before CHARBUF.  For an old
   form, store the number of characters in the header's NCHARS slot.
   Otherwise, for any method beyond COMPOSITION_RELATIVE, set the
   header's first slot to its NCHARS slot minus CMP_STATUS->length.
   In all cases leave the composing state.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int start = - cmp_status->length;                           \
                                                                \
    if (cmp_status->old_form)                                   \
      charbuf[start + 2] = cmp_status->nchars;                  \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      charbuf[start] = (charbuf[start + 2]                      \
                        - cmp_status->length);                  \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2342
2343
2344 static int
2345 emacs_mule_finish_composition (int *charbuf,
2346                                struct composition_status *cmp_status)
2347 {
2348   int idx = - cmp_status->length;
2349   int new_chars;
2350
2351   if (cmp_status->old_form && cmp_status->nchars > 0)
2352     {
2353       charbuf[idx + 2] = cmp_status->nchars;
2354       new_chars = 0;
2355       if (cmp_status->method == COMPOSITION_WITH_RULE
2356           && cmp_status->state == COMPOSING_CHAR)
2357         {
2358           /* The last rule was invalid.  */
2359           int rule = charbuf[-1] + 0xA0;
2360
2361           charbuf[-2] = BYTE8_TO_CHAR (rule);
2362           charbuf[-1] = -1;
2363           new_chars = 1;
2364         }
2365     }
2366   else
2367     {
2368       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2369
2370       if (cmp_status->method == COMPOSITION_WITH_RULE)
2371         {
2372           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2373           charbuf[idx++] = -3;
2374           charbuf[idx++] = 0;
2375           new_chars = 1;
2376         }
2377       else
2378         {
2379           int nchars = charbuf[idx + 1] + 0xA0;
2380           int nbytes = charbuf[idx + 2] + 0xA0;
2381
2382           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2383           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2384           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2385           charbuf[idx++] = -1;
2386           new_chars = 4;
2387         }
2388     }
2389   cmp_status->state = COMPOSING_NO;
2390   return new_chars;
2391 }
2392
/* If a composition is in progress (CMP_STATUS->state is not
   COMPOSING_NO), finish it with emacs_mule_finish_composition and add
   the value that function returns to CHAR_OFFSET.  The macro relies
   on the variables CHARBUF, CMP_STATUS and CHAR_OFFSET of the function
   that uses it.  */
2393 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2394   do {                                                                    \
2395     if (cmp_status->state != COMPOSING_NO)                                \
2396       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2397   } while (0)
2398
2399
/* Decode the emacs-mule bytes of CODING->source, starting at offset
   CODING->consumed, and append the result to CODING->charbuf after
   its first CODING->charbuf_used elements.

   A 0x80 byte starts a composition; the elements that belong to it
   are collected according to the state kept in
   CODING->spec.emacs_mule.cmp_status.  If the source runs out while a
   composition is still open, the composition is finished when
   CODING_MODE_LAST_BLOCK is set in CODING->mode; otherwise its
   elements are saved in the `carryover' member of that status and put
   back at the start of the next call.  Outside compositions, a
   charset annotation is added each time a run of characters of one
   non-ASCII charset ends.  A byte sequence that cannot be decoded is
   emitted one byte at a time and counted in CODING->errors.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated to the last position that was
   completely processed.  */
2400 static void
2401 decode_coding_emacs_mule (struct coding_system *coding)
2402 {
2403   const unsigned char *src = coding->source + coding->consumed;
2404   const unsigned char *src_end = coding->source + coding->src_bytes;
2405   const unsigned char *src_base;
2406   int *charbuf = coding->charbuf + coding->charbuf_used;
2407   /* We may produce two annotations (charset and composition) in one
2408      loop and one more charset annotation at the end.  */
2409   int *charbuf_end
2410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2411       /* We can produce up to 2 characters in a loop.  */
2412       - 1;
2413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2414   int multibytep = coding->src_multibyte;
2415   ptrdiff_t char_offset = coding->produced_char;
2416   ptrdiff_t last_offset = char_offset;
2417   int last_id = charset_ascii;
2418   int eol_dos =
2419     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2420   int byte_after_cr = -1;
2421   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2422
       /* A composition was left open by the previous call: put the
          elements saved in the carryover area back into CHARBUF.  */
2423   if (cmp_status->state != COMPOSING_NO)
2424     {
2425       int i;
2426
2427       if (charbuf_end - charbuf < cmp_status->length)
2428         abort ();
2429       for (i = 0; i < cmp_status->length; i++)
2430         *charbuf++ = cmp_status->carryover[i];
2431       coding->annotated = 1;
2432     }
2433
2434   while (1)
2435     {
2436       int c, id IF_LINT (= 0);
2437
           /* Remember where this iteration starts; the invalid_code
              path and the final bookkeeping go back to this point.  */
2438       src_base = src;
2439       consumed_chars_base = consumed_chars;
2440
2441       if (charbuf >= charbuf_end)
2442         {
               /* Step back over the byte that was read ahead after a
                  CR, so that it is not included in CODING->consumed.  */
2443           if (byte_after_cr >= 0)
2444             src_base--;
2445           break;
2446         }
2447
2448       if (byte_after_cr >= 0)
2449         c = byte_after_cr, byte_after_cr = -1;
2450       else
2451         ONE_MORE_BYTE (c);
2452
           /* A negative C is stored negated as one character; 0x80
              starts a new composition.  Either way, a composition
              that is still open is finished first.  */
2453       if (c < 0 || c == 0x80)
2454         {
2455           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456           if (c < 0)
2457             {
2458               *charbuf++ = -c;
2459               char_offset++;
2460             }
2461           else
2462             DECODE_EMACS_MULE_COMPOSITION_START ();
2463           continue;
2464         }
2465
2466       if (c < 0x80)
2467         {
               /* With DOS end-of-line conversion, read the byte that
                  follows a CR ahead; it is handled by the next
                  iteration.  */
2468           if (eol_dos && c == '\r')
2469             ONE_MORE_BYTE (byte_after_cr);
2470           id = charset_ascii;
2471           if (cmp_status->state != COMPOSING_NO)
2472             {
2473               if (cmp_status->old_form)
2474                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2476                 cmp_status->ncomps--;
2477             }
2478         }
2479       else
2480         {
2481           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2482           /* emacs_mule_char can load a charset map from a file, which
2483              allocates a large structure and might cause buffer text
2484              to be relocated as result.  Thus, we need to remember the
2485              original pointer to buffer text, and fix up all related
2486              pointers after the call.  */
2487           const unsigned char *orig = coding->source;
2488           ptrdiff_t offset;
2489
2490           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2491                                cmp_status);
2492           offset = coding->source - orig;
2493           if (offset)
2494             {
2495               src += offset;
2496               src_base += offset;
2497               src_end += offset;
2498             }
               /* -1 is treated as an invalid sequence, -2 stops
                  decoding; any other negative value falls through to
                  the composition handling below.  */
2499           if (c < 0)
2500             {
2501               if (c == -1)
2502                 goto invalid_code;
2503               if (c == -2)
2504                 break;
2505             }
2506           src = src_base + nbytes;
2507           consumed_chars = consumed_chars_base + nchars;
2508           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2509             cmp_status->ncomps -= nchars;
2510         }
2511
2512       /* Now if C >= 0, we found a normally encoded character, if C <
2513          0, we found an old-style composition component character or
2514          rule.  */
2515
2516       if (cmp_status->state == COMPOSING_NO)
2517         {
               /* The charset changed: close the run of the previous
                  charset with an annotation unless it was ASCII.  */
2518           if (last_id != id)
2519             {
2520               if (last_id != charset_ascii)
2521                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2522                                   last_id);
2523               last_id = id;
2524               last_offset = char_offset;
2525             }
2526           *charbuf++ = c;
2527           char_offset++;
2528         }
2529       else if (cmp_status->state == COMPOSING_CHAR)
2530         {
2531           if (cmp_status->old_form)
2532             {
                   /* In the old form a normally encoded character
                      ends the composition; a negative C is one more
                      component and NCHARS counts up.  */
2533               if (c >= 0)
2534                 {
2535                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2536                   *charbuf++ = c;
2537                   char_offset++;
2538                 }
2539               else
2540                 {
2541                   *charbuf++ = -c;
2542                   cmp_status->nchars++;
2543                   cmp_status->length++;
2544                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2545                     EMACS_MULE_COMPOSITION_END ();
2546                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2547                     cmp_status->state = COMPOSING_RULE;
2548                 }
2549             }
2550           else
2551             {
                   /* In the new form NCHARS counts down the characters
                      still expected; the composition ends at zero.  */
2552               *charbuf++ = c;
2553               cmp_status->length++;
2554               cmp_status->nchars--;
2555               if (cmp_status->nchars == 0)
2556                 EMACS_MULE_COMPOSITION_END ();
2557             }
2558         }
2559       else if (cmp_status->state == COMPOSING_RULE)
2560         {
2561           int rule;
2562
2563           if (c >= 0)
2564             {
2565               EMACS_MULE_COMPOSITION_END ();
2566               *charbuf++ = c;
2567               char_offset++;
2568             }
2569           else
2570             {
2571               c = -c;
2572               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2573               if (rule < 0)
2574                 goto invalid_code;
                   /* A rule is stored as the marker -2 followed by
                      the rule value.  */
2575               *charbuf++ = -2;
2576               *charbuf++ = rule;
2577               cmp_status->length += 2;
2578               cmp_status->state = COMPOSING_CHAR;
2579             }
2580         }
2581       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2582         {
2583           *charbuf++ = c;
2584           cmp_status->length++;
               /* NCOMPS was already decremented above for this
                  element: zero means all components were read, a
                  negative value means there were too many.  */
2585           if (cmp_status->ncomps == 0)
2586             cmp_status->state = COMPOSING_CHAR;
2587           else if (cmp_status->ncomps > 0)
2588             {
2589               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2590                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2591             }
2592           else
2593             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2594         }
2595       else                      /* COMPOSING_COMPONENT_RULE */
2596         {
2597           int rule;
2598
2599           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2600           if (rule < 0)
2601             goto invalid_code;
2602           *charbuf++ = -2;
2603           *charbuf++ = rule;
2604           cmp_status->length += 2;
2605           cmp_status->ncomps--;
2606           if (cmp_status->ncomps > 0)
2607             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2608           else
2609             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610         }
2611       continue;
2612
           /* Recovery path: finish any open composition, go back to
              where this iteration started, and emit just the first
              byte (unchanged if ASCII_BYTE_P, otherwise through
              BYTE8_TO_CHAR), counting one error.  Decoding resumes
              with the byte after it.  */
2613     invalid_code:
2614       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615       src = src_base;
2616       consumed_chars = consumed_chars_base;
2617       ONE_MORE_BYTE (c);
2618       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2619       char_offset++;
2620       coding->errors++;
2621     }
2622
2623  no_more_source:
       /* A composition is still open.  On the last block finish it
          now; otherwise remove its elements from CHARBUF and save them
          in the carryover area for the next call.  */
2624   if (cmp_status->state != COMPOSING_NO)
2625     {
2626       if (coding->mode & CODING_MODE_LAST_BLOCK)
2627         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2628       else
2629         {
2630           int i;
2631
2632           charbuf -= cmp_status->length;
2633           for (i = 0; i < cmp_status->length; i++)
2634             cmp_status->carryover[i] = charbuf[i];
2635         }
2636     }
2637   if (last_id != charset_ascii)
2638     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report progress up to the start of the last iteration.  */
2639   coding->consumed_char += consumed_chars_base;
2640   coding->consumed = src_base - coding->source;
2641   coding->charbuf_used = charbuf - coding->charbuf;
2642 }
2643
2644
/* Store in CODES, an array of at least two elements, the leading
   codes that go with the emacs-mule charset ID.

   If ID is less than 0xA0, CODES[0] is ID itself and CODES[1] is 0,
   meaning that there is no second leading code.  Otherwise CODES[1]
   is ID and CODES[0] is chosen by the range ID falls in:

     0xA0..0xDF -> 0x9A
     0xE0..0xEF -> 0x9B
     0xF0..0xF4 -> 0x9C
     0xF5..     -> 0x9D

   The expansion is a single statement without a trailing semicolon,
   so a use followed by `;' may be the body of an `if' that has an
   `else'.  ID and CODES are evaluated more than once; do not pass
   expressions with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2658
2659
/* Encode the CODING->charbuf_used elements of CODING->charbuf in
   emacs-mule and append the result to CODING->destination after its
   first CODING->produced bytes.

   A negative element starts an annotation, which is skipped; a
   charset annotation additionally selects the charset to be tried
   first for the characters that follow, provided it is a member of
   the charset list in use.  ASCII characters and eight-bit characters
   are emitted directly.  Every other character is emitted as the
   leading code(s) of its charset followed by its code in that charset
   with the high bit of each byte set (one byte for a charset of
   dimension 1, two bytes otherwise).  A character for which no
   charset is found is replaced by CODING->default_char.

   The charset list of the coding system is forced to
   Vemacs_mule_charset_list.  On return CODING->produced_char and
   CODING->produced are updated and the conversion result is recorded
   as CODING_RESULT_SUCCESS.  The return value is always 0.  */
2660 static int
2661 encode_coding_emacs_mule (struct coding_system *coding)
2662 {
2663   int multibytep = coding->dst_multibyte;
2664   int *charbuf = coding->charbuf;
2665   int *charbuf_end = charbuf + coding->charbuf_used;
2666   unsigned char *dst = coding->destination + coding->produced;
2667   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2668   int safe_room = 8;
2669   ptrdiff_t produced_chars = 0;
2670   Lisp_Object attrs, charset_list;
2671   int c;
2672   int preferred_charset_id = -1;
2673
2674   CODING_GET_INFO (coding, attrs, charset_list);
2675   if (! EQ (charset_list, Vemacs_mule_charset_list))
2676     {
2677       charset_list = Vemacs_mule_charset_list;
2678       ASET (attrs, coding_attr_charset_list, charset_list);
2679     }
2680
2681   while (charbuf < charbuf_end)
2682     {
2683       ASSURE_DESTINATION (safe_room);
2684       c = *charbuf++;
2685
2686       if (c < 0)
2687         {
2688           /* Handle an annotation.  */
               /* C is the negated length of the annotation and CHARBUF
                  now points at its second element, the kind mask.  */
2689           switch (*charbuf)
2690             {
2691             case CODING_ANNOTATE_COMPOSITION_MASK:
2692               /* Not yet implemented.  */
2693               break;
2694             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the index 3 is counted from the mask
                      element.  If a charset annotation is four elements
                      long including the length element, the charset ID
                      would be at index 2 and index 3 would be the
                      element after the annotation -- confirm against
                      the layout written by ADD_CHARSET_DATA.  */
2695               preferred_charset_id = charbuf[3];
2696               if (preferred_charset_id >= 0
2697                   && NILP (Fmemq (make_number (preferred_charset_id),
2698                                   charset_list)))
2699                 preferred_charset_id = -1;
2700               break;
2701             default:
2702               abort ();
2703             }
               /* Skip the rest of the annotation.  */
2704           charbuf += -c - 1;
2705           continue;
2706         }
2707
2708       if (ASCII_CHAR_P (c))
2709         EMIT_ONE_ASCII_BYTE (c);
2710       else if (CHAR_BYTE8_P (c))
2711         {
2712           c = CHAR_TO_BYTE8 (c);
2713           EMIT_ONE_BYTE (c);
2714         }
2715       else
2716         {
2717           struct charset *charset;
2718           unsigned code;
2719           int dimension;
2720           int emacs_mule_id;
2721           unsigned char leading_codes[2];
2722
               /* Try the charset requested by the last charset
                  annotation first; fall back to searching the whole
                  charset list.  */
2723           if (preferred_charset_id >= 0)
2724             {
2725               int result;
2726
2727               charset = CHARSET_FROM_ID (preferred_charset_id);
2728               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2729               if (result)
2730                 code = ENCODE_CHAR (charset, c);
2731               else
2732                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2733                                      &code, charset);
2734             }
2735           else
2736             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2737                                  &code, charset);
2738           if (! charset)
2739             {
                   /* No charset can encode C: substitute the default
                      character of the coding system.  */
2740               c = coding->default_char;
2741               if (ASCII_CHAR_P (c))
2742                 {
2743                   EMIT_ONE_ASCII_BYTE (c);
2744                   continue;
2745                 }
                   /* NOTE(review): CHARSET is dereferenced below
                      without a second check -- presumably the default
                      character is always encodable; confirm.  */
2746               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2747                                    &code, charset);
2748             }
2749           dimension = CHARSET_DIMENSION (charset);
2750           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2751           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2752           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second code means the charset has only one
                  leading code.  */
2753           if (leading_codes[1])
2754             EMIT_ONE_BYTE (leading_codes[1]);
2755           if (dimension == 1)
2756             EMIT_ONE_BYTE (code | 0x80);
2757           else
2758             {
2759               code |= 0x8080;
2760               EMIT_ONE_BYTE (code >> 8);
2761               EMIT_ONE_BYTE (code & 0xFF);
2762             }
2763         }
2764     }
2765   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2766   coding->produced_char += produced_chars;
2767   coding->produced = dst - coding->destination;
2768   return 0;
2769 }
2770
2771 \f
2772 /*** 7. ISO2022 handlers ***/
2773
2774 /* The following note describes the coding system ISO2022 briefly.
2775    Since the intention of this note is to help understand the
2776    functions in this file, some parts are NOT ACCURATE or are OVERLY
2777    SIMPLIFIED.  For thorough understanding, please refer to the
2778    original document of ISO2022.  This is equivalent to the standard
2779    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2780
2781    ISO2022 provides many mechanisms to encode several character sets
2782    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2783    is encoded using bytes less than 128.  This may make the encoded
2784    text a little bit longer, but the text passes more easily through
2785    several types of gateway, some of which strip off the MSB (Most
2786    Significant Bit).
2787
2788    There are two kinds of character sets: control character sets and
2789    graphic character sets.  The former contain control characters such
2790    as `newline' and `escape' to provide control functions (control
2791    functions are also provided by escape sequences).  The latter
2792    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2793    two control character sets and many graphic character sets.
2794
2795    Graphic character sets are classified into one of the following
2796    four classes, according to the number of bytes (DIMENSION) and
2797    number of characters in one dimension (CHARS) of the set:
2798    - DIMENSION1_CHARS94
2799    - DIMENSION1_CHARS96
2800    - DIMENSION2_CHARS94
2801    - DIMENSION2_CHARS96
2802
2803    In addition, each character set is assigned an identification tag,
2804    unique for each set, called the "final character" (denoted as <F>
2805    hereafter).  The <F> of each character set is decided by ECMA(*)
2806    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2807    (0x30..0x3F are for private use only).
2808
2809    Note (*): ECMA = European Computer Manufacturers Association
2810
2811    Here are examples of graphic character sets [NAME(<F>)]:
2812         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2813         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2814         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2815         o DIMENSION2_CHARS96 -- none for the moment
2816
2817    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2818         C0 [0x00..0x1F] -- control character plane 0
2819         GL [0x20..0x7F] -- graphic character plane 0
2820         C1 [0x80..0x9F] -- control character plane 1
2821         GR [0xA0..0xFF] -- graphic character plane 1
2822
2823    A control character set is directly designated and invoked to C0 or
2824    C1 by an escape sequence.  The most common case is that:
2825    - ISO646's  control character set is designated/invoked to C0, and
2826    - ISO6429's control character set is designated/invoked to C1,
2827    and usually these designations/invocations are omitted in encoded
2828    text.  In a 7-bit environment, only C0 can be used, and a control
2829    character for C1 is encoded by an appropriate escape sequence to
2830    fit into the environment.  All control characters for C1 are
2831    defined to have corresponding escape sequences.
2832
2833    A graphic character set is at first designated to one of four
2834    graphic registers (G0 through G3), then these graphic registers are
2835    invoked to GL or GR.  These designations and invocations can be
2836    done independently.  The most common case is that G0 is invoked to
2837    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2838    these invocations and designations are omitted in encoded text.
2839    In a 7-bit environment, only GL can be used.
2840
2841    When a graphic character set of CHARS94 is invoked to GL, codes
2842    0x20 and 0x7F of the GL area work as control characters SPACE and
2843    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2844    be used.
2845
2846    There are two ways of invocation: locking-shift and single-shift.
2847    With locking-shift, the invocation lasts until the next different
2848    invocation, whereas with single-shift, the invocation affects the
2849    following character only and doesn't affect the locking-shift
2850    state.  Invocations are done by the following control characters or
2851    escape sequences:
2852
2853    ----------------------------------------------------------------------
2854    abbrev  function                  cntrl escape seq   description
2855    ----------------------------------------------------------------------
2856    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2857    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2858    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2859    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2860    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2861    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2862    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2863    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2864    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2865    ----------------------------------------------------------------------
2866    (*) These are not used by any known coding system.
2867
2868    Control characters for these functions are defined by macros
2869    ISO_CODE_XXX in `coding.h'.
2870
2871    Designations are done by the following escape sequences:
2872    ----------------------------------------------------------------------
2873    escape sequence      description
2874    ----------------------------------------------------------------------
2875    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2876    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2877    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2878    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2879    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2880    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2881    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2882    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2883    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2884    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2885    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2886    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2887    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2888    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2889    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2890    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2891    ----------------------------------------------------------------------
2892
2893    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2894    of dimension 1, chars 94, and final character <F>, etc...
2895
2896    Note (*): Although these designations are not allowed in ISO2022,
2897    Emacs accepts them on decoding, and produces them on encoding
2898    CHARS96 character sets in a coding system which is characterized as
2899    7-bit environment, non-locking-shift, and non-single-shift.
2900
2901    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2902    '(' must be omitted.  We refer to this as "short-form" hereafter.
2903
2904    Now you may notice that there are a lot of ways of encoding the
2905    same multilingual text in ISO2022.  Actually, there exist many
2906    coding systems such as Compound Text (used in X11's inter client
2907    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2908    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2909    localized platforms), and all of these are variants of ISO2022.
2910
2911    In addition to the above, Emacs handles two more kinds of escape
2912    sequences: ISO6429's direction specification and Emacs' private
2913    sequence for specifying character composition.
2914
2915    ISO6429's direction specification takes the following form:
2916         o CSI ']'      -- end of the current direction
2917         o CSI '0' ']'  -- end of the current direction
2918         o CSI '1' ']'  -- start of left-to-right text
2919         o CSI '2' ']'  -- start of right-to-left text
2920    The control character CSI (0x9B: control sequence introducer) is
2921    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2922
2923    Character composition specification takes the following form:
2924         o ESC '0' -- start relative composition
2925         o ESC '1' -- end composition
2926         o ESC '2' -- start rule-base composition (*)
2927         o ESC '3' -- start relative composition with alternate chars  (**)
2928         o ESC '4' -- start rule-base composition with alternate chars  (**)
2929   Since these are not standard escape sequences of any ISO standard,
2930   the use of them with these meanings is restricted to Emacs only.
2931
2932   (*) This form is used only in Emacs 20.7 and older versions,
2933   but newer versions can safely decode it.
2934   (**) This form is used only in Emacs 21.1 and newer versions,
2935   and older versions can't decode it.
2936
2937   Here's a list of example usages of these composition escape
2938   sequences (categorized by `enum composition_method').
2939
2940   COMPOSITION_RELATIVE:
2941         ESC 0 CHAR [ CHAR ] ESC 1
2942   COMPOSITION_WITH_RULE:
2943         ESC 2 CHAR [ RULE CHAR ] ESC 1
2944   COMPOSITION_WITH_ALTCHARS:
2945         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2946   COMPOSITION_WITH_RULE_ALTCHARS:
2947         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2948
/* Table of 256 ISO code classes.  NOTE(review): it is neither filled
   in nor read in this part of the file -- presumably it holds one
   entry per byte value; confirm where it is initialized.  */
2949 static enum iso_code_class_type iso_code_class[256];
2950
/* Nonzero if the charset whose ID is ID is marked safe for CODING,
   that is, if ID does not exceed CODING->max_charset_id and the entry
   for ID in CODING->safe_charsets is not 255.  The table is not
   consulted when ID is beyond CODING->max_charset_id.  */
#define SAFE_CHARSET_P(coding, id)                      \
  (! ((id) > (coding)->max_charset_id                   \
      || (coding)->safe_charsets[id] == 255))
2954
2955 static void
2956 setup_iso_safe_charsets (Lisp_Object attrs)
2957 {
2958   Lisp_Object charset_list, safe_charsets;
2959   Lisp_Object request;
2960   Lisp_Object reg_usage;
2961   Lisp_Object tail;
2962   EMACS_INT reg94, reg96;
2963   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2964   int max_charset_id;
2965
2966   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2967   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2968       && ! EQ (charset_list, Viso_2022_charset_list))
2969     {
2970       charset_list = Viso_2022_charset_list;
2971       ASET (attrs, coding_attr_charset_list, charset_list);
2972       ASET (attrs, coding_attr_safe_charsets, Qnil);
2973     }
2974
2975   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2976     return;
2977
2978   max_charset_id = 0;
2979   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2980     {
2981       int id = XINT (XCAR (tail));
2982       if (max_charset_id < id)
2983         max_charset_id = id;
2984     }
2985
2986   safe_charsets = make_uninit_string (max_charset_id + 1);
2987   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2988   request = AREF (attrs, coding_attr_iso_request);
2989   reg_usage = AREF (attrs, coding_attr_iso_usage);
2990   reg94 = XINT (XCAR (reg_usage));
2991   reg96 = XINT (XCDR (reg_usage));
2992
2993   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2994     {
2995       Lisp_Object id;
2996       Lisp_Object reg;
2997       struct charset *charset;
2998
2999       id = XCAR (tail);
3000       charset = CHARSET_FROM_ID (XINT (id));
3001       reg = Fcdr (Fassq (id, request));
3002       if (! NILP (reg))
3003         SSET (safe_charsets, XINT (id), XINT (reg));
3004       else if (charset->iso_chars_96)
3005         {
3006           if (reg96 < 4)
3007             SSET (safe_charsets, XINT (id), reg96);
3008         }
3009       else
3010         {
3011           if (reg94 < 4)
3012             SSET (safe_charsets, XINT (id), reg94);
3013         }
3014     }
3015   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3016 }
3017
3018
3019 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3020    Check if a text is encoded in one of ISO-2022 based coding systems.
3021    If it is, return 1, else return 0.  */
3022
3023 static int
3024 detect_coding_iso_2022 (struct coding_system *coding,
3025                         struct coding_detection_info *detect_info)
3026 {
3027   const unsigned char *src = coding->source, *src_base = src;
3028   const unsigned char *src_end = coding->source + coding->src_bytes;
3029   int multibytep = coding->src_multibyte;
3030   int single_shifting = 0;
3031   int id;
3032   int c, c1;
3033   ptrdiff_t consumed_chars = 0;
3034   int i;
3035   int rejected = 0;
3036   int found = 0;
3037   int composition_count = -1;
3038
3039   detect_info->checked |= CATEGORY_MASK_ISO;
3040
3041   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3042     {
3043       struct coding_system *this = &(coding_categories[i]);
3044       Lisp_Object attrs, val;
3045
3046       if (this->id < 0)
3047         continue;
3048       attrs = CODING_ID_ATTRS (this->id);
3049       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3050           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3051         setup_iso_safe_charsets (attrs);
3052       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3053       this->max_charset_id = SCHARS (val) - 1;
3054       this->safe_charsets = SDATA (val);
3055     }
3056
3057   /* A coding system of this category is always ASCII compatible.  */
3058   src += coding->head_ascii;
3059
3060   while (rejected != CATEGORY_MASK_ISO)
3061     {
3062       src_base = src;
3063       ONE_MORE_BYTE (c);
3064       switch (c)
3065         {
3066         case ISO_CODE_ESC:
3067           if (inhibit_iso_escape_detection)
3068             break;
3069           single_shifting = 0;
3070           ONE_MORE_BYTE (c);
3071           if (c == 'N' || c == 'O')
3072             {
3073               /* ESC <Fe> for SS2 or SS3.  */
3074               single_shifting = 1;
3075               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3076             }
3077           else if (c == '1')
3078             {
3079               /* End of composition.  */
3080               if (composition_count < 0
3081                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3082                 /* Invalid */
3083                 break;
3084               composition_count = -1;
3085               found |= CATEGORY_MASK_ISO;
3086             }
3087           else if (c >= '0' && c <= '4')
3088             {
3089               /* ESC <Fp> for start/end composition.  */
3090               composition_count = 0;
3091             }
3092           else
3093             {
3094               if (c >= '(' && c <= '/')
3095                 {
3096                   /* Designation sequence for a charset of dimension 1.  */
3097                   ONE_MORE_BYTE (c1);
3098                   if (c1 < ' ' || c1 >= 0x80
3099                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3100                     /* Invalid designation sequence.  Just ignore.  */
3101                     break;
3102                 }
3103               else if (c == '$')
3104                 {
3105                   /* Designation sequence for a charset of dimension 2.  */
3106                   ONE_MORE_BYTE (c);
3107                   if (c >= '@' && c <= 'B')
3108                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3109                     id = iso_charset_table[1][0][c];
3110                   else if (c >= '(' && c <= '/')
3111                     {
3112                       ONE_MORE_BYTE (c1);
3113                       if (c1 < ' ' || c1 >= 0x80
3114                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3115                         /* Invalid designation sequence.  Just ignore.  */
3116                         break;
3117                     }
3118                   else
3119                     /* Invalid designation sequence.  Just ignore it.  */
3120                     break;
3121                 }
3122               else
3123                 {
3124                   /* Invalid escape sequence.  Just ignore it.  */
3125                   break;
3126                 }
3127
3128               /* We found a valid designation sequence for CHARSET.  */
3129               rejected |= CATEGORY_MASK_ISO_8BIT;
3130               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131                                   id))
3132                 found |= CATEGORY_MASK_ISO_7;
3133               else
3134                 rejected |= CATEGORY_MASK_ISO_7;
3135               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136                                   id))
3137                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138               else
3139                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141                                   id))
3142                 found |= CATEGORY_MASK_ISO_7_ELSE;
3143               else
3144                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146                                   id))
3147                 found |= CATEGORY_MASK_ISO_8_ELSE;
3148               else
3149                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150             }
3151           break;
3152
3153         case ISO_CODE_SO:
3154         case ISO_CODE_SI:
3155           /* Locking shift out/in.  */
3156           if (inhibit_iso_escape_detection)
3157             break;
3158           single_shifting = 0;
3159           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160           break;
3161
3162         case ISO_CODE_CSI:
3163           /* Control sequence introducer.  */
3164           single_shifting = 0;
3165           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166           found |= CATEGORY_MASK_ISO_8_ELSE;
3167           goto check_extra_latin;
3168
3169         case ISO_CODE_SS2:
3170         case ISO_CODE_SS3:
3171           /* Single shift.   */
3172           if (inhibit_iso_escape_detection)
3173             break;
3174           single_shifting = 0;
3175           rejected |= CATEGORY_MASK_ISO_7BIT;
3176           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177               & CODING_ISO_FLAG_SINGLE_SHIFT)
3178             {
3179               found |= CATEGORY_MASK_ISO_8_1;
3180               single_shifting = 1;
3181             }
3182           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183               & CODING_ISO_FLAG_SINGLE_SHIFT)
3184             {
3185               found |= CATEGORY_MASK_ISO_8_2;
3186               single_shifting = 1;
3187             }
3188           if (single_shifting)
3189             break;
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (AREF (Vlatin_extra_code_table, c)))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204
3205         default:
3206           if (c < 0)
3207             continue;
3208           if (c < 0x80)
3209             {
3210               if (composition_count >= 0)
3211                 composition_count++;
3212               single_shifting = 0;
3213               break;
3214             }
3215           if (c >= 0xA0)
3216             {
3217               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3218               found |= CATEGORY_MASK_ISO_8_1;
3219               /* Check the length of succeeding codes of the range
3220                  0xA0..0xFF.  If the byte length is even, we include
3221                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3222                  only when we are not single shifting.  */
3223               if (! single_shifting
3224                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3225                 {
3226                   int len = 1;
3227                   while (src < src_end)
3228                     {
3229                       src_base = src;
3230                       ONE_MORE_BYTE (c);
3231                       if (c < 0xA0)
3232                         {
3233                           src = src_base;
3234                           break;
3235                         }
3236                       len++;
3237                     }
3238
3239                   if (len & 1 && src < src_end)
3240                     {
3241                       rejected |= CATEGORY_MASK_ISO_8_2;
3242                       if (composition_count >= 0)
3243                         composition_count += len;
3244                     }
3245                   else
3246                     {
3247                       found |= CATEGORY_MASK_ISO_8_2;
3248                       if (composition_count >= 0)
3249                         composition_count += len / 2;
3250                     }
3251                 }
3252               break;
3253             }
3254         }
3255     }
3256   detect_info->rejected |= CATEGORY_MASK_ISO;
3257   return 0;
3258
3259  no_more_source:
3260   detect_info->rejected |= rejected;
3261   detect_info->found |= (found & ~rejected);
3262   return 1;
3263 }
3264
3265
3266 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3267    escape sequence should be kept.

        REG selects which designation slot of CODING is updated; DIM,
        CHARS_96 and FINAL are handed to ISO_CHARSET_TABLE to look up
        the charset id.  The macro reads `coding' from the expansion
        site, and CHARS_96 must be an lvalue because it is assigned to.

        If FINAL is below '0' or not below 128, the table lookup yields
        a negative id, or the id fails SAFE_CHARSET_P for CODING, the
        designation of REG is set to -2, CHARS_96 is set to -1, and the
        `break' leaves the do/while so nothing else is recorded.

        Otherwise the id is stored as the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3268 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3269   do {                                                                  \
3270     int id, prev;                                                       \
3271                                                                         \
3272     if (final < '0' || final >= 128                                     \
3273         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3274         || !SAFE_CHARSET_P (coding, id))                                \
3275       {                                                                 \
3276         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3277         chars_96 = -1;                                                  \
3278         break;                                                          \
3279       }                                                                 \
3280     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3281     if (id == charset_jisx0201_roman)                                   \
3282       {                                                                 \
3283         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3284           id = charset_ascii;                                           \
3285       }                                                                 \
3286     else if (id == charset_jisx0208_1978)                               \
3287       {                                                                 \
3288         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3289           id = charset_jisx0208;                                        \
3290       }                                                                 \
3291     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3292     /* If there was an invalid designation to REG previously, and this  \
3293        designation is ASCII to REG, we should keep this designation     \
3294        sequence.  */                                                    \
3295     if (prev == -2 && id == charset_ascii)                              \
3296       chars_96 = -1;                                                    \
3297   } while (0)
3298
3299
3300 /* Handle these composition sequences (ALT: alternate char):
3301
3302    (1) relative composition: ESC 0 CHAR ... ESC 1
3303    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3306
3307    When the start sequence (ESC 0/2/3/4) is found, this annotation
3308    header is produced.
3309
3310         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3311
3312    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313    produced until the end sequence (ESC 1) is found:
3314
3315    (1) CHAR ... CHAR
3316    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3319
3320    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321    annotation header are updated as below:
3322
3323    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3324    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3325    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3326    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3327
3328    If an error is found while composing, the annotation header is
3329    changed to:
3330
3331         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3332
3333    and the sequence [ -2 DECODED-RULE ] is changed to the original
3334    byte sequence as below:
3335         o the original byte sequence is B: [ B -1 ]
3336         o the original byte sequence is B1 B2: [ B1 B2 ]
3337    and the sequence [ -1 -1 ] is changed to the original byte
3338    sequence:
3339         [ ESC '0' ]
3340 */
3341
3342 /* Decode a composition rule C1 and maybe one more byte from the
3343    source, and set RULE to the encoded composition rule.  If the rule
3344    is invalid, goto invalid_code.

        C1 is read from the expansion site (the variable `c1'), and
        RULE must be an int lvalue; the label invalid_code must exist
        where the macro is expanded.

        C1 - 32 must not be negative.  A value below 81 is the old
        one-byte format: it is split into GREF = value / 9 and
        NREF = value % 9, a part equal to 4 is replaced by 10, and the
        pair is packed with COMPOSITION_ENCODE_RULE.  Any other value
        is the new two-byte format: a second byte B is fetched with
        ONE_MORE_BYTE, the pair (value - 81, B - 32) is checked with
        COMPOSITION_ENCODE_RULE_VALID and packed, and 0x100 is added so
        the result can be told apart from an old-format rule.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        jumps to no_more_source when the input is exhausted -- confirm.  */
3345
3346 #define DECODE_COMPOSITION_RULE(rule)                                   \
3347   do {                                                                  \
3348     rule = c1 - 32;                                                     \
3349     if (rule < 0)                                                       \
3350       goto invalid_code;                                                \
3351     if (rule < 81)              /* old format (before ver.21) */        \
3352       {                                                                 \
3353         int gref = (rule) / 9;                                          \
3354         int nref = (rule) % 9;                                          \
3355         if (gref == 4) gref = 10;                                       \
3356         if (nref == 4) nref = 10;                                       \
3357         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3358       }                                                                 \
3359     else                        /* new format (after ver.21) */         \
3360       {                                                                 \
3361         int b;                                                          \
3362                                                                         \
3363         ONE_MORE_BYTE (b);                                              \
3364         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3365           goto invalid_code;                                            \
3366         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3367         rule += 0x100;   /* Distinguish it from the old format.  */     \
3368       }                                                                 \
3369   } while (0)
3370
     /* Turn the decoded composition rule RULE back into source bytes,
        storing them in charbuf[idx] and charbuf[idx + 1] and adding
        the number of bytes produced to new_chars.  `charbuf', `idx'
        and `new_chars' are read from the expansion site.

        RULE below 0x100 is an old-format rule: its two parts are
        recovered with / 12 and % 12, a part equal to 10 is mapped back
        to 4, and the single byte 32 + gref * 9 + nref is stored,
        followed by -1 in the second slot; new_chars grows by one.
        Otherwise the low part of RULE (RULE % 0x100) is split the same
        way and the two bytes 32 + 81 + gref and 32 + nref are stored;
        new_chars grows by two.

        RULE is expanded more than once, so it should have no side
        effects.  It may be charbuf[idx + 1] itself: every read of RULE
        happens before that slot is overwritten.

        NOTE(review): the divisor 12 presumably mirrors how
        COMPOSITION_ENCODE_RULE packs its two arguments -- confirm.  */
3371 #define ENCODE_COMPOSITION_RULE(rule)                           \
3372   do {                                                          \
3373     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3374                                                                 \
3375     if (rule < 0x100)           /* old format */                \
3376       {                                                         \
3377         if (gref == 10) gref = 4;                               \
3378         if (nref == 10) nref = 4;                               \
3379         charbuf[idx] = 32 + gref * 9 + nref;                    \
3380         charbuf[idx + 1] = -1;                                  \
3381         new_chars++;                                            \
3382       }                                                         \
3383     else                                /* new format */        \
3384       {                                                         \
3385         charbuf[idx] = 32 + 81 + gref;                          \
3386         charbuf[idx + 1] = 32 + nref;                           \
3387         new_chars += 2;                                         \
3388       }                                                         \
3389   } while (0)
3390
3391 /* Finish the current composition as invalid.

        CHARBUF must point just past the data stored for the current
        composition, so that CHARBUF[-CMP_STATUS->length] is the first
        slot of its annotation header.  The five header slots are
        overwritten with ESC, the digit that selected CMP_STATUS->method
        ('0', '2', '3' or '4'), -2, 0 and -1.  For the methods that
        compare >= COMPOSITION_WITH_RULE, the data after the header is
        then scanned up to CHARBUF[-1]: each [ -2 RULE ] pair is turned
        back into its byte form by ENCODE_COMPOSITION_RULE, and a -1
        element together with the slot after it is replaced by
        [ ESC '0' ].

        CMP_STATUS->state is reset to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus the count added for every element
        restored in the scan; MAYBE_FINISH_COMPOSITION adds it to
        char_offset.  */
3392
3393 static int finish_composition (int *, struct composition_status *);
3394
3395 static int
3396 finish_composition (int *charbuf, struct composition_status *cmp_status)
3397 {
3398   int idx = - cmp_status->length;
3399   int new_chars;
3400
3401   /* Recover the original ESC sequence */
3402   charbuf[idx++] = ISO_CODE_ESC;
3403   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3404                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3405                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3406                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3407                     : '4');
       /* Fill the three remaining header slots.  */
3408   charbuf[idx++] = -2;
3409   charbuf[idx++] = 0;
3410   charbuf[idx++] = -1;
3411   new_chars = cmp_status->nchars;
       /* Only these methods can have stored -2 (rule) or -1 markers
          after the header; IDX now indexes the first element after it.  */
3412   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3413     for (; idx < 0; idx++)
3414       {
3415         int elt = charbuf[idx];
3416
3417         if (elt == -2)
3418           {
                 /* Overwrites charbuf[idx] and charbuf[idx + 1] and
                    updates new_chars; then step over the second slot.  */
3419             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3420             idx++;
3421           }
3422         else if (elt == -1)
3423           {
                 /* Restore the ESC 0 sequence in this slot and the next.  */
3424             charbuf[idx++] = ISO_CODE_ESC;
3425             charbuf[idx] = '0';
3426             new_chars += 2;
3427           }
3428       }
3429   cmp_status->state = COMPOSING_NO;
3430   return new_chars;
3431 }
3432
3433 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        `cmp_status', `charbuf' and `char_offset' are read from the
        expansion site.  */
3434 #define MAYBE_FINISH_COMPOSITION()                              \
3435   do {                                                          \
3436     if (cmp_status->state != COMPOSING_NO)                      \
3437       char_offset += finish_composition (charbuf, cmp_status);  \
3438   } while (0)
3439
3440 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3441
3442    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3443    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3444    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3445    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3446
3447    Produce this annotation sequence now:
3448
3449    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        C1 is the byte that followed ESC.  `cmp_status', `charbuf',
        `char_offset' and `coding' are read from the expansion site.

        ESC 0 is treated as the separator between the alternate
        components and the composed chars when it arrives in state
        COMPOSING_COMPONENT_CHAR of an altchar composition or in state
        COMPOSING_COMPONENT_RULE of an alt&rule composition: two -1
        elements are stored, the state becomes COMPOSING_CHAR and the
        recorded length grows by 2.

        In every other case any composition in progress is first
        finished as invalid, then a new one is started: the method is
        chosen from C1, the state is COMPOSING_CHAR for '0' and '2' and
        COMPOSING_COMPONENT_CHAR for '3' and '4', the header is emitted
        with ADD_COMPOSITION_DATA (NCHARS 0, a second 0, and METHOD),
        the length is reset to MAX_ANNOTATION_LENGTH, both counters are
        cleared, and coding->annotated is set.
3450 */
3451
3452 #define DECODE_COMPOSITION_START(c1)                                       \
3453   do {                                                                     \
3454     if (c1 == '0'                                                          \
3455         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3456              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3457             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3458                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3459       {                                                                    \
3460         *charbuf++ = -1;                                                   \
3461         *charbuf++= -1;                                                    \
3462         cmp_status->state = COMPOSING_CHAR;                                \
3463         cmp_status->length += 2;                                           \
3464       }                                                                    \
3465     else                                                                   \
3466       {                                                                    \
3467         MAYBE_FINISH_COMPOSITION ();                                       \
3468         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3469                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3470                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3471                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3472         cmp_status->state                                                  \
3473           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3474         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3475         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3476         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3477         coding->annotated = 1;                                             \
3478       }                                                                    \
3479   } while (0)
3480
3481
3482 /* Handle composition end sequence ESC 1.

        `cmp_status', `charbuf' and `char_offset' are read from the
        expansion site, which must also provide the label invalid_code.

        The end sequence is rejected when no composed char has been
        stored (nchars == 0), when a rulebase composition is in state
        COMPOSING_CHAR, or when any other method is not in state
        COMPOSING_CHAR.  In that case the composition is finished as
        invalid and control jumps to invalid_code.

        Otherwise the annotation header, which starts at
        charbuf[-cmp_status->length], is completed: its first slot
        (the negated length) is decreased by ncomps + 2 for an altchar
        composition or by ncomps * 3 for an alt&rule composition, and
        its third slot receives nchars.  char_offset advances by nchars
        and the state returns to COMPOSING_NO.  */
3483
3484 #define DECODE_COMPOSITION_END()                                        \
3485   do {                                                                  \
3486     if (cmp_status->nchars == 0                                         \
3487         || ((cmp_status->state == COMPOSING_CHAR)                       \
3488             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3489       {                                                                 \
3490         MAYBE_FINISH_COMPOSITION ();                                    \
3491         goto invalid_code;                                              \
3492       }                                                                 \
3493     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3494       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3495     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3496       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3497     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3498     char_offset += cmp_status->nchars;                                  \
3499     cmp_status->state = COMPOSING_NO;                                   \
3500   } while (0)
3501
3502 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is written as the pair [ -2 RULE ], so `charbuf'
        advances by two and cmp_status->length grows by 2.  The state
        is then decremented.  `charbuf' and `cmp_status' are read from
        the expansion site.

        NOTE(review): the decrement presumably steps from a *_RULE
        state back to the matching *_CHAR state, which relies on the
        order of the state enumeration -- confirm against its
        definition.  */
3503
3504 #define STORE_COMPOSITION_RULE(rule)    \
3505   do {                                  \
3506     *charbuf++ = -2;                    \
3507     *charbuf++ = rule;                  \
3508     cmp_status->length += 2;            \
3509     cmp_status->state--;                \
3510   } while (0)
3511
3512 /* Store a composed char or a component char C in charbuf, and update
3513    cmp_status.

        C is appended to `charbuf' and cmp_status->length grows by one.
        In state COMPOSING_CHAR the char is counted in nchars, in any
        other state it is counted in ncomps.  The state is then
        incremented when a rule is expected next, i.e. for a rulebase
        composition, or for an alt&rule composition that is in state
        COMPOSING_COMPONENT_CHAR.  `charbuf' and `cmp_status' are read
        from the expansion site.

        NOTE(review): the increment presumably moves from a *_CHAR
        state to the matching *_RULE state, which relies on the order
        of the state enumeration -- confirm against its definition.  */
3514
3515 #define STORE_COMPOSITION_CHAR(c)                                       \
3516   do {                                                                  \
3517     *charbuf++ = (c);                                                   \
3518     cmp_status->length++;                                               \
3519     if (cmp_status->state == COMPOSING_CHAR)                            \
3520       cmp_status->nchars++;                                             \
3521     else                                                                \
3522       cmp_status->ncomps++;                                             \
3523     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3524         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3525             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3526       cmp_status->state++;                                              \
3527   } while (0)
3528
3529
3530 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3531
3532 static void
3533 decode_coding_iso_2022 (struct coding_system *coding)
3534 {
3535   const unsigned char *src = coding->source + coding->consumed;
3536   const unsigned char *src_end = coding->source + coding->src_bytes;
3537   const unsigned char *src_base;
3538   int *charbuf = coding->charbuf + coding->charbuf_used;
3539   /* We may produce two annotations (charset and composition) in one
3540      loop and one more charset annotation at the end.  */
3541   int *charbuf_end
3542     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3543   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3544   int multibytep = coding->src_multibyte;
3545   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3546   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3547   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3548   int charset_id_2, charset_id_3;
3549   struct charset *charset;
3550   int c;
3551   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3552   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3553   ptrdiff_t char_offset = coding->produced_char;
3554   ptrdiff_t last_offset = char_offset;
3555   int last_id = charset_ascii;
3556   int eol_dos =
3557     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3558   int byte_after_cr = -1;
3559   int i;
3560
3561   setup_iso_safe_charsets (attrs);
3562   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3563
3564   if (cmp_status->state != COMPOSING_NO)
3565     {
3566       if (charbuf_end - charbuf < cmp_status->length)
3567         abort ();
3568       for (i = 0; i < cmp_status->length; i++)
3569         *charbuf++ = cmp_status->carryover[i];
3570       coding->annotated = 1;
3571     }
3572
3573   while (1)
3574     {
3575       int c1, c2, c3;
3576
3577       src_base = src;
3578       consumed_chars_base = consumed_chars;
3579
3580       if (charbuf >= charbuf_end)
3581         {
3582           if (byte_after_cr >= 0)
3583             src_base--;
3584           break;
3585         }
3586
3587       if (byte_after_cr >= 0)
3588         c1 = byte_after_cr, byte_after_cr = -1;
3589       else
3590         ONE_MORE_BYTE (c1);
3591       if (c1 < 0)
3592         goto invalid_code;
3593
3594       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3595         {
3596           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3597           char_offset++;
3598           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3599           continue;
3600         }
3601
3602       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3603         {
3604           if (c1 == ISO_CODE_ESC)
3605             {
3606               if (src + 1 >= src_end)
3607                 goto no_more_source;
3608               *charbuf++ = ISO_CODE_ESC;
3609               char_offset++;
3610               if (src[0] == '%' && src[1] == '@')
3611                 {
3612                   src += 2;
3613                   consumed_chars += 2;
3614                   char_offset += 2;
3615                   /* We are sure charbuf can contain two more chars. */
3616                   *charbuf++ = '%';
3617                   *charbuf++ = '@';
3618                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3619                 }
3620             }
3621           else
3622             {
3623               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3624               char_offset++;
3625             }
3626           continue;
3627         }
3628
3629       if ((cmp_status->state == COMPOSING_RULE
3630            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3631           && c1 != ISO_CODE_ESC)
3632         {
3633           int rule;
3634
3635           DECODE_COMPOSITION_RULE (rule);
3636           STORE_COMPOSITION_RULE (rule);
3637           continue;
3638         }
3639
3640       /* We produce at most one character.  */
3641       switch (iso_code_class [c1])
3642         {
3643         case ISO_0x20_or_0x7F:
3644           if (charset_id_0 < 0
3645               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3646             /* This is SPACE or DEL.  */
3647             charset = CHARSET_FROM_ID (charset_ascii);
3648           else
3649             charset = CHARSET_FROM_ID (charset_id_0);
3650           break;
3651
3652         case ISO_graphic_plane_0:
3653           if (charset_id_0 < 0)
3654             charset = CHARSET_FROM_ID (charset_ascii);
3655           else
3656             charset = CHARSET_FROM_ID (charset_id_0);
3657           break;
3658
3659         case ISO_0xA0_or_0xFF:
3660           if (charset_id_1 < 0
3661               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3662               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3663             goto invalid_code;
3664           /* This is a graphic character, we fall down ... */
3665
3666         case ISO_graphic_plane_1:
3667           if (charset_id_1 < 0)
3668             goto invalid_code;
3669           charset = CHARSET_FROM_ID (charset_id_1);
3670           break;
3671
3672         case ISO_control_0:
3673           if (eol_dos && c1 == '\r')
3674             ONE_MORE_BYTE (byte_after_cr);
3675           MAYBE_FINISH_COMPOSITION ();
3676           charset = CHARSET_FROM_ID (charset_ascii);
3677           break;
3678
3679         case ISO_control_1:
3680           goto invalid_code;
3681
3682         case ISO_shift_out:
3683           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3684               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3685             goto invalid_code;
3686           CODING_ISO_INVOCATION (coding, 0) = 1;
3687           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3688           continue;
3689
3690         case ISO_shift_in:
3691           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3692             goto invalid_code;
3693           CODING_ISO_INVOCATION (coding, 0) = 0;
3694           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695           continue;
3696
3697         case ISO_single_shift_2_7:
3698           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3699             goto invalid_code;
3700         case ISO_single_shift_2:
3701           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702             goto invalid_code;
3703           /* SS2 is handled as an escape sequence of ESC 'N' */
3704           c1 = 'N';
3705           goto label_escape_sequence;
3706
3707         case ISO_single_shift_3:
3708           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3709             goto invalid_code;
3710           /* SS3 is handled as an escape sequence of ESC 'O' */
3711           c1 = 'O';
3712           goto label_escape_sequence;
3713
3714         case ISO_control_sequence_introducer:
3715           /* CSI is handled as an escape sequence of ESC '[' ...  */
3716           c1 = '[';
3717           goto label_escape_sequence;
3718
3719         case ISO_escape:
3720           ONE_MORE_BYTE (c1);
3721         label_escape_sequence:
3722           /* Escape sequences handled here are invocation,
3723              designation, direction specification, and character
3724              composition specification.  */
3725           switch (c1)
3726             {
3727             case '&':           /* revision of following character set */
3728               ONE_MORE_BYTE (c1);
3729               if (!(c1 >= '@' && c1 <= '~'))
3730                 goto invalid_code;
3731               ONE_MORE_BYTE (c1);
3732               if (c1 != ISO_CODE_ESC)
3733                 goto invalid_code;
3734               ONE_MORE_BYTE (c1);
3735               goto label_escape_sequence;
3736
3737             case '$':           /* designation of 2-byte character set */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3739                 goto invalid_code;
3740               {
3741                 int reg, chars96;
3742
3743                 ONE_MORE_BYTE (c1);
3744                 if (c1 >= '@' && c1 <= 'B')
3745                   {     /* designation of JISX0208.1978, GB2312.1980,
3746                            or JISX0208.1980 */
3747                     reg = 0, chars96 = 0;
3748                   }
3749                 else if (c1 >= 0x28 && c1 <= 0x2B)
3750                   { /* designation of DIMENSION2_CHARS94 character set */
3751                     reg = c1 - 0x28, chars96 = 0;
3752                     ONE_MORE_BYTE (c1);
3753                   }
3754                 else if (c1 >= 0x2C && c1 <= 0x2F)
3755                   { /* designation of DIMENSION2_CHARS96 character set */
3756                     reg = c1 - 0x2C, chars96 = 1;
3757                     ONE_MORE_BYTE (c1);
3758                   }
3759                 else
3760                   goto invalid_code;
3761                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3762                 /* We must update these variables now.  */
3763                 if (reg == 0)
3764                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765                 else if (reg == 1)
3766                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3767                 if (chars96 < 0)
3768                   goto invalid_code;
3769               }
3770               continue;
3771
3772             case 'n':           /* invocation of locking-shift-2 */
3773               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3774                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3775                 goto invalid_code;
3776               CODING_ISO_INVOCATION (coding, 0) = 2;
3777               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3778               continue;
3779
3780             case 'o':           /* invocation of locking-shift-3 */
3781               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3782                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3783                 goto invalid_code;
3784               CODING_ISO_INVOCATION (coding, 0) = 3;
3785               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3786               continue;
3787
3788             case 'N':           /* invocation of single-shift-2 */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3790                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3791                 goto invalid_code;
3792               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3793               if (charset_id_2 < 0)
3794                 charset = CHARSET_FROM_ID (charset_ascii);
3795               else
3796                 charset = CHARSET_FROM_ID (charset_id_2);
3797               ONE_MORE_BYTE (c1);
3798               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3799                 goto invalid_code;
3800               break;
3801
3802             case 'O':           /* invocation of single-shift-3 */
3803               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3804                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3805                 goto invalid_code;
3806               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3807               if (charset_id_3 < 0)
3808                 charset = CHARSET_FROM_ID (charset_ascii);
3809               else
3810                 charset = CHARSET_FROM_ID (charset_id_3);
3811               ONE_MORE_BYTE (c1);
3812               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3813                 goto invalid_code;
3814               break;
3815
3816             case '0': case '2': case '3': case '4': /* start composition */
3817               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3818                 goto invalid_code;
3819               if (last_id != charset_ascii)
3820                 {
3821                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3822                   last_id = charset_ascii;
3823                   last_offset = char_offset;
3824                 }
3825               DECODE_COMPOSITION_START (c1);
3826               continue;
3827
3828             case '1':           /* end composition */
3829               if (cmp_status->state == COMPOSING_NO)
3830                 goto invalid_code;
3831               DECODE_COMPOSITION_END ();
3832               continue;
3833
3834             case '[':           /* specification of direction */
3835               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3836                 goto invalid_code;
3837               /* For the moment, nested direction is not supported.
3838                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3839                  left-to-right, and nonzero means right-to-left.  */
3840               ONE_MORE_BYTE (c1);
3841               switch (c1)
3842                 {
3843                 case ']':       /* end of the current direction */
3844                   coding->mode &= ~CODING_MODE_DIRECTION;
3845
3846                 case '0':       /* end of the current direction */
3847                 case '1':       /* start of left-to-right direction */
3848                   ONE_MORE_BYTE (c1);
3849                   if (c1 == ']')
3850                     coding->mode &= ~CODING_MODE_DIRECTION;
3851                   else
3852                     goto invalid_code;
3853                   break;
3854
3855                 case '2':       /* start of right-to-left direction */
3856                   ONE_MORE_BYTE (c1);
3857                   if (c1 == ']')
3858                     coding->mode |= CODING_MODE_DIRECTION;
3859                   else
3860                     goto invalid_code;
3861                   break;
3862
3863                 default:
3864                   goto invalid_code;
3865                 }
3866               continue;
3867
3868             case '%':
3869               ONE_MORE_BYTE (c1);
3870               if (c1 == '/')
3871                 {
3872                   /* CTEXT extended segment:
3873                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3874                      We keep these bytes as is for the moment.
3875                      They may be decoded by post-read-conversion.  */
3876                   int dim, M, L;
3877                   int size;
3878
3879                   ONE_MORE_BYTE (dim);
3880                   if (dim < '0' || dim > '4')
3881                     goto invalid_code;
3882                   ONE_MORE_BYTE (M);
3883                   if (M < 128)
3884                     goto invalid_code;
3885                   ONE_MORE_BYTE (L);
3886                   if (L < 128)
3887                     goto invalid_code;
3888                   size = ((M - 128) * 128) + (L - 128);
3889                   if (charbuf + 6 > charbuf_end)
3890                     goto break_loop;
3891                   *charbuf++ = ISO_CODE_ESC;
3892                   *charbuf++ = '%';
3893                   *charbuf++ = '/';
3894                   *charbuf++ = dim;
3895                   *charbuf++ = BYTE8_TO_CHAR (M);
3896                   *charbuf++ = BYTE8_TO_CHAR (L);
3897                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3898                 }
3899               else if (c1 == 'G')
3900                 {
3901                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3902                      ESC % G --UTF-8-BYTES-- ESC % @
3903                      We keep these bytes as is for the moment.
3904                      They may be decoded by post-read-conversion.  */
3905                   if (charbuf + 3 > charbuf_end)
3906                     goto break_loop;
3907                   *charbuf++ = ISO_CODE_ESC;
3908                   *charbuf++ = '%';
3909                   *charbuf++ = 'G';
3910                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3911                 }
3912               else
3913                 goto invalid_code;
3914               continue;
3915               break;
3916
3917             default:
3918               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3919                 goto invalid_code;
3920               {
3921                 int reg, chars96;
3922
3923                 if (c1 >= 0x28 && c1 <= 0x2B)
3924                   { /* designation of DIMENSION1_CHARS94 character set */
3925                     reg = c1 - 0x28, chars96 = 0;
3926                     ONE_MORE_BYTE (c1);
3927                   }
3928                 else if (c1 >= 0x2C && c1 <= 0x2F)
3929                   { /* designation of DIMENSION1_CHARS96 character set */
3930                     reg = c1 - 0x2C, chars96 = 1;
3931                     ONE_MORE_BYTE (c1);
3932                   }
3933                 else
3934                   goto invalid_code;
3935                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3936                 /* We must update these variables now.  */
3937                 if (reg == 0)
3938                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3939                 else if (reg == 1)
3940                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3941                 if (chars96 < 0)
3942                   goto invalid_code;
3943               }
3944               continue;
3945             }
3946           break;
3947
3948         default:
3949           abort ();
3950         }
3951
3952       if (cmp_status->state == COMPOSING_NO
3953           && charset->id != charset_ascii
3954           && last_id != charset->id)
3955         {
3956           if (last_id != charset_ascii)
3957             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3958           last_id = charset->id;
3959           last_offset = char_offset;
3960         }
3961
3962       /* Now we know CHARSET and 1st position code C1 of a character.
3963          Produce a decoded character while getting 2nd and 3rd
3964          position codes C2, C3 if necessary.  */
3965       if (CHARSET_DIMENSION (charset) > 1)
3966         {
3967           ONE_MORE_BYTE (c2);
3968           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3969               || ((c1 & 0x80) != (c2 & 0x80)))
3970             /* C2 is not in a valid range.  */
3971             goto invalid_code;
3972           if (CHARSET_DIMENSION (charset) == 2)
3973             c1 = (c1 << 8) | c2;
3974           else
3975             {
3976               ONE_MORE_BYTE (c3);
3977               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3978                   || ((c1 & 0x80) != (c3 & 0x80)))
3979                 /* C3 is not in a valid range.  */
3980                 goto invalid_code;
3981               c1 = (c1 << 16) | (c2 << 8) | c2;
3982             }
3983         }
3984       c1 &= 0x7F7F7F;
3985       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3986       if (c < 0)
3987         {
3988           MAYBE_FINISH_COMPOSITION ();
3989           for (; src_base < src; src_base++, char_offset++)
3990             {
3991               if (ASCII_BYTE_P (*src_base))
3992                 *charbuf++ = *src_base;
3993               else
3994                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3995             }
3996         }
3997       else if (cmp_status->state == COMPOSING_NO)
3998         {
3999           *charbuf++ = c;
4000           char_offset++;
4001         }
4002       else if ((cmp_status->state == COMPOSING_CHAR
4003                 ? cmp_status->nchars
4004                 : cmp_status->ncomps)
4005                >= MAX_COMPOSITION_COMPONENTS)
4006         {
4007           /* Too long composition.  */
4008           MAYBE_FINISH_COMPOSITION ();
4009           *charbuf++ = c;
4010           char_offset++;
4011         }
4012       else
4013         STORE_COMPOSITION_CHAR (c);
4014       continue;
4015
4016     invalid_code:
4017       MAYBE_FINISH_COMPOSITION ();
4018       src = src_base;
4019       consumed_chars = consumed_chars_base;
4020       ONE_MORE_BYTE (c);
4021       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4022       char_offset++;
4023       coding->errors++;
4024       continue;
4025
4026     break_loop:
4027       break;
4028     }
4029
4030  no_more_source:
4031   if (cmp_status->state != COMPOSING_NO)
4032     {
4033       if (coding->mode & CODING_MODE_LAST_BLOCK)
4034         MAYBE_FINISH_COMPOSITION ();
4035       else
4036         {
4037           charbuf -= cmp_status->length;
4038           for (i = 0; i < cmp_status->length; i++)
4039             cmp_status->carryover[i] = charbuf[i];
4040         }
4041     }
4042   else if (last_id != charset_ascii)
4043     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4044   coding->consumed_char += consumed_chars_base;
4045   coding->consumed = src_base - coding->source;
4046   coding->charbuf_used = charbuf - coding->charbuf;
4047 }
4048
4049
4050 /* ISO2022 encoding stuff.  */
4051
/*
   Saying just "ISO2022" is not enough to specify an encoding; we have
   to specify more details.  In Emacs, each coding system of the
   ISO2022 variant has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JIS0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag
   bits defined by the macros CODING_ISO_FLAG_XXX.  See `coding.h' for
   more details.
*/
4070
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, the sequence is preceded by
   `ESC & <'@' + revision>'.

   For a charset whose dimension is not 1, the intermediate character
   is omitted (short form) only when CHARSET is a 94-charset, REG is 0,
   <final-char> is '@', 'A', or 'B', and CODING_ISO_FLAG_LONG_FORM is
   not set.

   The EMIT_* macros used here are defined elsewhere in this file.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    /* Intermediate characters indexed by graphic register; the table   \
       depends on whether CHARSET is a 96- or a 94-charset.  */         \
    const char *iso_intermediates                                       \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* The intermediate character may be dropped only in the        \
           short-form case described above.  */                         \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || iso_final < '@' || iso_final > 'B')                      \
          EMIT_ONE_ASCII_BYTE (iso_intermediates[reg]);                 \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (iso_intermediates[reg]);                     \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4118
4119
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3).  With CODING_ISO_FLAG_SEVEN_BITS
   set the function is written as an escape sequence, otherwise as a
   single control byte.  Both macros also set the single-shifting state
   of `coding', which the character-producing macros test and clear.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
  } while (0)
4132
4133
/* Emit single-shift-3: `ESC O' when CODING_ISO_FLAG_SEVEN_BITS is set,
   the ISO_CODE_SS3 byte otherwise; then mark `coding' as
   single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
  } while (0)
4142
4143
/* The following four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in `coding'
   which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    /* Graphic register 0 goes to plane 0.  */          \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4153
4154
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4160
4161
/* Emit locking-shift-2 (`ESC n') and record that graphic register 2
   is invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4167
4168
/* Emit locking-shift-3 (`ESC o') and record that graphic register 3
   is invoked to graphic plane 0.  The final byte must be 'o': `ESC n'
   is locking-shift-2, and the ISO2022 decoder in this file invokes
   register 2 for 'n' and register 3 for 'o'.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4174
4175
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set and
   CHARSET is ASCII, it is replaced by JISX0201-Roman.  The macro uses
   the variables `coding', `dst' and `produced_chars' of the expansion
   site.

   The body is a loop: if CHARSET is not currently invoked (and no
   single-shift is pending), encode_invocation_designation emits what
   is needed and the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift has just been emitted; this character         \
           consumes it.  */                                             \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4218
4219
/* Emit a DIMENSION2 character of CHARSET whose position-codes are C1
   and C2, first emitting any designation and invocation codes that
   are needed.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set
   and CHARSET is JISX0208, it is replaced by JISX0208-1978.  The macro
   uses the variables `coding', `dst' and `produced_chars' of the
   expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int dim2_id = CHARSET_ID (charset);                                 \
                                                                        \
    if (dim2_id == charset_jisx0208                                     \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        dim2_id = charset_jisx0208_1978;                                \
        charset = CHARSET_FROM_ID (dim2_id);                            \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift is pending; this character consumes it.  */   \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        break;                                                          \
      }                                                                 \
    if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to plane 0: high bits cleared.  */                   \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to plane 1: high bits set.  */                       \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Designate   \
       and/or invoke it, then go around the loop again to emit the      \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4262
4263
/* Encode character C in CHARSET.  The code point is obtained with
   CODING_ENCODE_CHAR; a charset of dimension 1 is emitted through
   ENCODE_ISO_CHARACTER_DIMENSION1, any other through
   ENCODE_ISO_CHARACTER_DIMENSION2 with the code split into the bits
   above the low byte and the low byte.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4274
4275
4276 /* Produce designation and invocation codes at a place pointed by DST
4277    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4278    Return new DST.  */
4279
4280 static unsigned char *
4281 encode_invocation_designation (struct charset *charset,
4282                                struct coding_system *coding,
4283                                unsigned char *dst, ptrdiff_t *p_nchars)
4284 {
4285   int multibytep = coding->dst_multibyte;
4286   ptrdiff_t produced_chars = *p_nchars;
4287   int reg;                      /* graphic register number */
4288   int id = CHARSET_ID (charset);
4289
4290   /* At first, check designations.  */
4291   for (reg = 0; reg < 4; reg++)
4292     if (id == CODING_ISO_DESIGNATION (coding, reg))
4293       break;
4294
4295   if (reg >= 4)
4296     {
4297       /* CHARSET is not yet designated to any graphic registers.  */
4298       /* At first check the requested designation.  */
4299       reg = CODING_ISO_REQUEST (coding, id);
4300       if (reg < 0)
4301         /* Since CHARSET requests no special designation, designate it
4302            to graphic register 0.  */
4303         reg = 0;
4304
4305       ENCODE_DESIGNATION (charset, reg, coding);
4306     }
4307
4308   if (CODING_ISO_INVOCATION (coding, 0) != reg
4309       && CODING_ISO_INVOCATION (coding, 1) != reg)
4310     {
4311       /* Since the graphic register REG is not invoked to any graphic
4312          planes, invoke it to graphic plane 0.  */
4313       switch (reg)
4314         {
4315         case 0:                 /* graphic register 0 */
4316           ENCODE_SHIFT_IN;
4317           break;
4318
4319         case 1:                 /* graphic register 1 */
4320           ENCODE_SHIFT_OUT;
4321           break;
4322
4323         case 2:                 /* graphic register 2 */
4324           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4325             ENCODE_SINGLE_SHIFT_2;
4326           else
4327             ENCODE_LOCKING_SHIFT_2;
4328           break;
4329
4330         case 3:                 /* graphic register 3 */
4331           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4332             ENCODE_SINGLE_SHIFT_3;
4333           else
4334             ENCODE_LOCKING_SHIFT_3;
4335           break;
4336         }
4337     }
4338
4339   *p_nchars = produced_chars;
4340   return dst;
4341 }
4342
4343
/* Emit the codes that bring the graphic planes and registers back to
   their initial state: shift-in if graphic plane 0 does not hold
   register 0, then a designation for every register that has an
   initial charset different from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        struct charset *initial_charset;                                \
                                                                        \
        /* Skip registers with no initial charset and registers that    \
           already hold their initial charset.  */                      \
        if (CODING_ISO_INITIAL (coding, reset_reg) < 0                  \
            || (CODING_ISO_DESIGNATION (coding, reset_reg)              \
                == CODING_ISO_INITIAL (coding, reset_reg)))             \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reset_reg));   \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);        \
      }                                                                 \
  } while (0)
4362
4363
4364 /* Produce designation sequences of charsets in the line started from
4365    CHARBUF to a place pointed by DST, and return the number of
4366    produced bytes.  DST should not directly point a buffer text area
4367    which may be relocated by char_charset call.
4368
4369    If the current block ends before any end-of-line, we may fail to
4370    find all the necessary designations.  */
4371
4372 static ptrdiff_t
4373 encode_designation_at_bol (struct coding_system *coding,
4374                            int *charbuf, int *charbuf_end,
4375                            unsigned char *dst)
4376 {
4377   unsigned char *orig = dst;
4378   struct charset *charset;
4379   /* Table of charsets to be designated to each graphic register.  */
4380   int r[4];
4381   int c, found = 0, reg;
4382   ptrdiff_t produced_chars = 0;
4383   int multibytep = coding->dst_multibyte;
4384   Lisp_Object attrs;
4385   Lisp_Object charset_list;
4386
4387   attrs = CODING_ID_ATTRS (coding->id);
4388   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4389   if (EQ (charset_list, Qiso_2022))
4390     charset_list = Viso_2022_charset_list;
4391
4392   for (reg = 0; reg < 4; reg++)
4393     r[reg] = -1;
4394
4395   while (charbuf < charbuf_end && found < 4)
4396     {
4397       int id;
4398
4399       c = *charbuf++;
4400       if (c == '\n')
4401         break;
4402       charset = char_charset (c, charset_list, NULL);
4403       id = CHARSET_ID (charset);
4404       reg = CODING_ISO_REQUEST (coding, id);
4405       if (reg >= 0 && r[reg] < 0)
4406         {
4407           found++;
4408           r[reg] = id;
4409         }
4410     }
4411
4412   if (found)
4413     {
4414       for (reg = 0; reg < 4; reg++)
4415         if (r[reg] >= 0
4416             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4417           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4418     }
4419
4420   return dst - orig;
4421 }
4422
4423 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the elements of CODING->charbuf (CODING->charbuf_used of
        them) with the ISO-2022 state kept in CODING, store the result
        from CODING->destination + CODING->produced on, and update
        CODING->produced, CODING->produced_char and CODING_ISO_BOL.
        The conversion result is recorded as CODING_RESULT_SUCCESS and
        the function always returns 0.  */
4424
4425 static int
4426 encode_coding_iso_2022 (struct coding_system *coding)
4427 {
4428   int multibytep = coding->dst_multibyte;
4429   int *charbuf = coding->charbuf;
4430   int *charbuf_end = charbuf + coding->charbuf_used;
4431   unsigned char *dst = coding->destination + coding->produced;
4432   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4433   int safe_room = 16;
       /* Nonzero while designation sequences still have to be produced
          before the next character; see the top of the loop below.  */
4434   int bol_designation
4435     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4436        && CODING_ISO_BOL (coding));
4437   ptrdiff_t produced_chars = 0;
4438   Lisp_Object attrs, eol_type, charset_list;
4439   int ascii_compatible;
4440   int c;
       /* Charset requested by the most recent charset annotation, or -1
          if there is none or it is not in CHARSET_LIST.  */
4441   int preferred_charset_id = -1;
4442
4443   CODING_GET_INFO (coding, attrs, charset_list);
       /* A vector-valued eol type is handled like Qunix here.  */
4444   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4445   if (VECTORP (eol_type))
4446     eol_type = Qunix;
4447
4448   setup_iso_safe_charsets (attrs);
4449   /* Charset list may have been changed.  */
4450   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4451   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4452
       /* ASCII may be emitted directly only if the coding system is
          marked ASCII compatible and uses neither designation nor
          locking shift.  */
4453   ascii_compatible
4454     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4455        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4456                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4457
4458   while (charbuf < charbuf_end)
4459     {
4460       ASSURE_DESTINATION (safe_room);
4461
4462       if (bol_designation)
4463         {
4464           /* We have to produce designation sequences if any now.  */
4465           unsigned char desig_buf[16];
4466           int nbytes;
4467           ptrdiff_t offset;
4468
               /* If a charset map was loaded while computing the
                  designations, ask coding_change_destination for the
                  offset to apply to DST and DST_END.
                  NOTE(review): presumably the destination may have been
                  relocated in that case -- confirm.  */
4469           charset_map_loaded = 0;
4470           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4471                                               desig_buf);
4472           if (charset_map_loaded
4473               && (offset = coding_change_destination (coding)))
4474             {
4475               dst += offset;
4476               dst_end += offset;
4477             }
4478           memcpy (dst, desig_buf, nbytes);
4479           dst += nbytes;
4480           /* We are sure that designation sequences are all ASCII bytes.  */
4481           produced_chars += nbytes;
4482           bol_designation = 0;
4483           ASSURE_DESTINATION (safe_room);
4484         }
4485
4486       c = *charbuf++;
4487
4488       if (c < 0)
4489         {
4490           /* Handle an annotation.  */
               /* C has already been consumed, so *CHARBUF is the element
                  that follows it and selects the annotation type.  */
4491           switch (*charbuf)
4492             {
4493             case CODING_ANNOTATE_COMPOSITION_MASK:
4494               /* Not yet implemented.  */
4495               break;
4496             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the requested charset only if it belongs to
                      this coding system's charset list.  */
4497               preferred_charset_id = charbuf[2];
4498               if (preferred_charset_id >= 0
4499                   && NILP (Fmemq (make_number (preferred_charset_id),
4500                                   charset_list)))
4501                 preferred_charset_id = -1;
4502               break;
4503             default:
4504               abort ();
4505             }
               /* Skip the remaining -C - 1 elements of the annotation.  */
4506           charbuf += -c - 1;
4507           continue;
4508         }
4509
4510       /* Now encode the character C.  */
4511       if (c < 0x20 || c == 0x7F)
4512         {
               /* Control character.  At end of line (LF, or CR when the
                  eol type is Qmac) reset the state as requested by the
                  flags and arrange for designation at the next line.  */
4513           if (c == '\n'
4514               || (c == '\r' && EQ (eol_type, Qmac)))
4515             {
4516               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4517                 ENCODE_RESET_PLANE_AND_REGISTER ();
4518               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4519                 {
4520                   int i;
4521
4522                   for (i = 0; i < 4; i++)
4523                     CODING_ISO_DESIGNATION (coding, i)
4524                       = CODING_ISO_INITIAL (coding, i);
4525                 }
4526               bol_designation
4527                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4528             }
4529           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4530             ENCODE_RESET_PLANE_AND_REGISTER ();
4531           EMIT_ONE_ASCII_BYTE (c);
4532         }
4533       else if (ASCII_CHAR_P (c))
4534         {
4535           if (ascii_compatible)
4536             EMIT_ONE_ASCII_BYTE (c);
4537           else
4538             {
4539               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4540               ENCODE_ISO_CHARACTER (charset, c);
4541             }
4542         }
4543       else if (CHAR_BYTE8_P (c))
4544         {
               /* A raw eight-bit byte is emitted as that byte.  */
4545           c = CHAR_TO_BYTE8 (c);
4546           EMIT_ONE_BYTE (c);
4547         }
4548       else
4549         {
4550           struct charset *charset;
4551
               /* Try the charset named by the last charset annotation
                  first; otherwise, or if that fails, look one up in
                  CHARSET_LIST.  */
4552           if (preferred_charset_id >= 0)
4553             {
4554               int result;
4555
4556               charset = CHARSET_FROM_ID (preferred_charset_id);
4557               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4558               if (! result)
4559                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4560                                      NULL, charset);
4561             }
4562           else
4563             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4564                                  NULL, charset);
4565           if (!charset)
4566             {
                   /* No charset was found for C: substitute either the
                      safe-encoding placeholder or the coding system's
                      default character.  */
4567               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4568                 {
4569                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4570                   charset = CHARSET_FROM_ID (charset_ascii);
4571                 }
4572               else
4573                 {
4574                   c = coding->default_char;
4575                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4576                                        charset_list, NULL, charset);
4577                 }
4578             }
4579           ENCODE_ISO_CHARACTER (charset, c);
4580         }
4581     }
4582
       /* At the end of the last block, reset the state once more if the
          coding system resets at end of line.  */
4583   if (coding->mode & CODING_MODE_LAST_BLOCK
4584       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4585     {
4586       ASSURE_DESTINATION (safe_room);
4587       ENCODE_RESET_PLANE_AND_REGISTER ();
4588     }
4589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Save the pending-designation state; it is read back into
          BOL_DESIGNATION at the top of this function on the next call.  */
4590   CODING_ISO_BOL (coding) = bol_designation;
4591   coding->produced_char += produced_chars;
4592   coding->produced = dst - coding->destination;
4593   return 0;
4594 }
4595
4596 \f
4597 /*** 8,9. SJIS and BIG5 handlers ***/
4598
4599 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4600    quite widely.  So, for the moment, Emacs supports them in the bare
4601    C code.  But, in the future, they may be supported only by CCL.  */
4602
4603 /* SJIS is a coding system encoding three character sets: ASCII, right
4604    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4605    as is.  A character of charset katakana-jisx0201 is encoded by
4606    "position-code + 0x80".  A character of charset japanese-jisx0208
4607    is encoded in two bytes, but its two position-codes are divided and
4608    shifted so that they fit in the ranges below.
4609
4610    --- CODE RANGE of SJIS ---
4611    (character set)      (range)
4612    ASCII                0x00 .. 0x7F
4613    KATAKANA-JISX0201    0xA0 .. 0xDF
4614    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4615             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4616    -------------------------------
4617
4618 */
4619
4620 /* BIG5 is a coding system encoding two character sets: ASCII and
4621    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4622    character set and is encoded in two bytes.
4623
4624    --- CODE RANGE of BIG5 ---
4625    (character set)      (range)
4626    ASCII                0x00 .. 0x7F
4627    Big5 (1st byte)      0xA1 .. 0xFE
4628         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4629    --------------------------
4630
4631   */
4632
4633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634    Check if a text is encoded in SJIS.  CATEGORY_MASK_SJIS is always
4635    added to DETECT_INFO->checked.  If the source is scanned to its end
        without meeting an invalid byte, add CATEGORY_MASK_SJIS to
        DETECT_INFO->found provided at least one non-ASCII SJIS sequence
        was seen, and return 1.  Otherwise add CATEGORY_MASK_SJIS to
        DETECT_INFO->rejected and return 0.  */
4636
4637 static int
4638 detect_coding_sjis (struct coding_system *coding,
4639                     struct coding_detection_info *detect_info)
4640 {
4641   const unsigned char *src = coding->source, *src_base;
4642   const unsigned char *src_end = coding->source + coding->src_bytes;
4643   int multibytep = coding->src_multibyte;
4644   ptrdiff_t consumed_chars = 0;
4645   int found = 0;
4646   int c;
4647   Lisp_Object attrs, charset_list;
4648   int max_first_byte_of_2_byte_code;
4649
4650   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, lead bytes up to
          0xFC are accepted; otherwise only up to 0xEF.  */
4651   max_first_byte_of_2_byte_code
4652     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4653
4654   detect_info->checked |= CATEGORY_MASK_SJIS;
4655   /* A coding system of this category is always ASCII compatible.  */
4656   src += coding->head_ascii;
4657
4658   while (1)
4659     {
           /* SRC_BASE marks the start of the sequence being examined.  */
4660       src_base = src;
4661       ONE_MORE_BYTE (c);
4662       if (c < 0x80)
4663         continue;
4664       if ((c >= 0x81 && c <= 0x9F)
4665           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4666         {
               /* Lead byte of a two-byte code; validate the trail byte.  */
4667           ONE_MORE_BYTE (c);
4668           if (c < 0x40 || c == 0x7F || c > 0xFC)
4669             break;
4670           found = CATEGORY_MASK_SJIS;
4671         }
4672       else if (c >= 0xA0 && c < 0xE0)
4673         found = CATEGORY_MASK_SJIS;
4674       else
4675         break;
4676     }
       /* An invalid byte was met.  */
4677   detect_info->rejected |= CATEGORY_MASK_SJIS;
4678   return 0;
4679
       /* NOTE(review): this label is not referenced explicitly in this
          function; presumably ONE_MORE_BYTE jumps here when the source
          is exhausted -- confirm against the macro.  */
4680  no_more_source:
       /* SRC_BASE < SRC means a sequence was started but not finished;
          in the last block that is treated as a rejection.  */
4681   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4682     {
4683       detect_info->rejected |= CATEGORY_MASK_SJIS;
4684       return 0;
4685     }
4686   detect_info->found |= found;
4687   return 1;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4691    Check if a text is encoded in BIG5.  CATEGORY_MASK_BIG5 is always
4692    added to DETECT_INFO->checked.  If the source is scanned to its end
        without meeting an invalid byte, add CATEGORY_MASK_BIG5 to
        DETECT_INFO->found provided at least one two-byte sequence was
        seen, and return 1.  Otherwise return 0; see the notes in the
        body for when DETECT_INFO->rejected is updated.  */
4693
4694 static int
4695 detect_coding_big5 (struct coding_system *coding,
4696                     struct coding_detection_info *detect_info)
4697 {
4698   const unsigned char *src = coding->source, *src_base;
4699   const unsigned char *src_end = coding->source + coding->src_bytes;
4700   int multibytep = coding->src_multibyte;
4701   ptrdiff_t consumed_chars = 0;
4702   int found = 0;
4703   int c;
4704
4705   detect_info->checked |= CATEGORY_MASK_BIG5;
4706   /* A coding system of this category is always ASCII compatible.  */
4707   src += coding->head_ascii;
4708
4709   while (1)
4710     {
           /* SRC_BASE marks the start of the sequence being examined.  */
4711       src_base = src;
4712       ONE_MORE_BYTE (c);
4713       if (c < 0x80)
4714         continue;
4715       if (c >= 0xA1)
4716         {
               /* Lead byte of a two-byte code; validate the trail byte.
                  NOTE(review): unlike the other failure paths, this
                  early return does not add CATEGORY_MASK_BIG5 to
                  DETECT_INFO->rejected (detect_coding_sjis uses `break'
                  in the corresponding place) -- confirm this is
                  intended.  */
4717           ONE_MORE_BYTE (c);
4718           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4719             return 0;
4720           found = CATEGORY_MASK_BIG5;
4721         }
4722       else
4723         break;
4724     }
       /* A byte in the range 0x80..0xA0 was met as a lead byte.  */
4725   detect_info->rejected |= CATEGORY_MASK_BIG5;
4726   return 0;
4727
       /* NOTE(review): this label is not referenced explicitly in this
          function; presumably ONE_MORE_BYTE jumps here when the source
          is exhausted -- confirm against the macro.  */
4728  no_more_source:
       /* SRC_BASE < SRC means a sequence was started but not finished;
          in the last block that is treated as a rejection.  */
4729   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4730     {
4731       detect_info->rejected |= CATEGORY_MASK_BIG5;
4732       return 0;
4733     }
4734   detect_info->found |= found;
4735   return 1;
4736 }
4737
4738 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4739    Decode SJIS text from CODING->source, starting at CODING->consumed,
        into CODING->charbuf, and update CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used.  Each byte that
        does not start a valid sequence is stored individually and counted
        in CODING->errors.  */
4740
4741 static void
4742 decode_coding_sjis (struct coding_system *coding)
4743 {
4744   const unsigned char *src = coding->source + coding->consumed;
4745   const unsigned char *src_end = coding->source + coding->src_bytes;
4746   const unsigned char *src_base;
4747   int *charbuf = coding->charbuf + coding->charbuf_used;
4748   /* We may produce one charset annotation in one loop and one more at
4749      the end.  */
4750   int *charbuf_end
4751     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4752   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4753   int multibytep = coding->src_multibyte;
4754   struct charset *charset_roman, *charset_kanji, *charset_kana;
4755   struct charset *charset_kanji2;
4756   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_ID and LAST_OFFSET
          describe the current run of characters of one non-ASCII
          charset, reported through ADD_CHARSET_DATA when the run ends.  */
4757   ptrdiff_t char_offset = coding->produced_char;
4758   ptrdiff_t last_offset = char_offset;
4759   int last_id = charset_ascii;
4760   int eol_dos =
4761     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_DOS, the byte read ahead after a CR, or -1 if none.  */
4762   int byte_after_cr = -1;
4763
4764   CODING_GET_INFO (coding, attrs, charset_list);
4765
       /* The charset list is taken in the order roman, kana, kanji and,
          optionally, a second kanji charset.  */
4766   val = charset_list;
4767   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4768   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4769   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4770   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4771
4772   while (1)
4773     {
4774       int c, c1;
4775       struct charset *charset;
4776
           /* Remember where this sequence starts so that the final
              bookkeeping and the invalid_code path can go back to it.  */
4777       src_base = src;
4778       consumed_chars_base = consumed_chars;
4779
4780       if (charbuf >= charbuf_end)
4781         {
               /* Out of room.  Count a pending read-ahead byte as not
                  consumed.  */
4782           if (byte_after_cr >= 0)
4783             src_base--;
4784           break;
4785         }
4786
4787       if (byte_after_cr >= 0)
4788         c = byte_after_cr, byte_after_cr = -1;
4789       else
4790         ONE_MORE_BYTE (c);
           /* NOTE(review): a negative value presumably marks something
              other than a plain byte in multibyte source -- confirm
              against ONE_MORE_BYTE.  */
4791       if (c < 0)
4792         goto invalid_code;
4793       if (c < 0x80)
4794         {
4795           if (eol_dos && c == '\r')
4796             ONE_MORE_BYTE (byte_after_cr);
4797           charset = charset_roman;
4798         }
4799       else if (c == 0x80 || c == 0xA0)
4800         goto invalid_code;
4801       else if (c >= 0xA1 && c <= 0xDF)
4802         {
4803           /* SJIS -> JISX0201-Kana */
4804           c &= 0x7F;
4805           charset = charset_kana;
4806         }
4807       else if (c <= 0xEF)
4808         {
4809           /* SJIS -> JISX0208 */
4810           ONE_MORE_BYTE (c1);
4811           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4812             goto invalid_code;
4813           c = (c << 8) | c1;
4814           SJIS_TO_JIS (c);
4815           charset = charset_kanji;
4816         }
4817       else if (c <= 0xFC && charset_kanji2)
4818         {
4819           /* SJIS -> JISX0213-2 */
4820           ONE_MORE_BYTE (c1);
4821           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4822             goto invalid_code;
4823           c = (c << 8) | c1;
4824           SJIS_TO_JIS2 (c);
4825           charset = charset_kanji2;
4826         }
4827       else
4828         goto invalid_code;
           /* On switching to a different non-ASCII charset, close the
              previous run (if any) and start a new one.  */
4829       if (charset->id != charset_ascii
4830           && last_id != charset->id)
4831         {
4832           if (last_id != charset_ascii)
4833             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4834           last_id = charset->id;
4835           last_offset = char_offset;
4836         }
4837       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4838       *charbuf++ = c;
4839       char_offset++;
4840       continue;
4841
4842     invalid_code:
           /* Go back to the start of the offending sequence, consume
              only its first byte, and store that byte as a raw-byte
              character (or the negated value if it was negative).  */
4843       src = src_base;
4844       consumed_chars = consumed_chars_base;
4845       ONE_MORE_BYTE (c);
4846       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4847       char_offset++;
4848       coding->errors++;
4849     }
4850
       /* NOTE(review): besides falling out of the loop, this label is
          presumably the target of ONE_MORE_BYTE when the source is
          exhausted -- confirm against the macro.  Only input up to
          SRC_BASE is reported as consumed.  */
4851  no_more_source:
4852   if (last_id != charset_ascii)
4853     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4854   coding->consumed_char += consumed_chars_base;
4855   coding->consumed = src_base - coding->source;
4856   coding->charbuf_used = charbuf - coding->charbuf;
4857 }
4858
/* Decode BIG5 text from CODING->source, starting at CODING->consumed,
   into CODING->charbuf, and update CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used.  Each byte that does
   not start a valid sequence is stored individually and counted in
   CODING->errors.  */
4859 static void
4860 decode_coding_big5 (struct coding_system *coding)
4861 {
4862   const unsigned char *src = coding->source + coding->consumed;
4863   const unsigned char *src_end = coding->source + coding->src_bytes;
4864   const unsigned char *src_base;
4865   int *charbuf = coding->charbuf + coding->charbuf_used;
4866   /* We may produce one charset annotation in one loop and one more at
4867      the end.  */
4868   int *charbuf_end
4869     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4870   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4871   int multibytep = coding->src_multibyte;
4872   struct charset *charset_roman, *charset_big5;
4873   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_ID and LAST_OFFSET
          describe the current run of characters of one non-ASCII
          charset, reported through ADD_CHARSET_DATA when the run ends.  */
4874   ptrdiff_t char_offset = coding->produced_char;
4875   ptrdiff_t last_offset = char_offset;
4876   int last_id = charset_ascii;
4877   int eol_dos =
4878     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* When EOL_DOS, the byte read ahead after a CR, or -1 if none.  */
4879   int byte_after_cr = -1;
4880
4881   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list is taken in the order roman, Big5.  */
4882   val = charset_list;
4883   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4884   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4885
4886   while (1)
4887     {
4888       int c, c1;
4889       struct charset *charset;
4890
           /* Remember where this sequence starts so that the final
              bookkeeping and the invalid_code path can go back to it.  */
4891       src_base = src;
4892       consumed_chars_base = consumed_chars;
4893
4894       if (charbuf >= charbuf_end)
4895         {
               /* Out of room.  Count a pending read-ahead byte as not
                  consumed.  */
4896           if (byte_after_cr >= 0)
4897             src_base--;
4898           break;
4899         }
4900
4901       if (byte_after_cr >= 0)
4902         c = byte_after_cr, byte_after_cr = -1;
4903       else
4904         ONE_MORE_BYTE (c);
4905
           /* NOTE(review): a negative value presumably marks something
              other than a plain byte in multibyte source -- confirm
              against ONE_MORE_BYTE.  */
4906       if (c < 0)
4907         goto invalid_code;
4908       if (c < 0x80)
4909         {
4910           if (eol_dos && c == '\r')
4911             ONE_MORE_BYTE (byte_after_cr);
4912           charset = charset_roman;
4913         }
4914       else
4915         {
4916           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4917           if (c < 0xA1 || c > 0xFE)
4918             goto invalid_code;
4919           ONE_MORE_BYTE (c1);
4920           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4921             goto invalid_code;
4922           c = c << 8 | c1;
4923           charset = charset_big5;
4924         }
           /* On switching to a different non-ASCII charset, close the
              previous run (if any) and start a new one.  */
4925       if (charset->id != charset_ascii
4926           && last_id != charset->id)
4927         {
4928           if (last_id != charset_ascii)
4929             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4930           last_id = charset->id;
4931           last_offset = char_offset;
4932         }
4933       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4934       *charbuf++ = c;
4935       char_offset++;
4936       continue;
4937
4938     invalid_code:
           /* Go back to the start of the offending sequence, consume
              only its first byte, and store that byte as a raw-byte
              character (or the negated value if it was negative).  */
4939       src = src_base;
4940       consumed_chars = consumed_chars_base;
4941       ONE_MORE_BYTE (c);
4942       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4943       char_offset++;
4944       coding->errors++;
4945     }
4946
       /* NOTE(review): besides falling out of the loop, this label is
          presumably the target of ONE_MORE_BYTE when the source is
          exhausted -- confirm against the macro.  Only input up to
          SRC_BASE is reported as consumed.  */
4947  no_more_source:
4948   if (last_id != charset_ascii)
4949     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4950   coding->consumed_char += consumed_chars_base;
4951   coding->consumed = src_base - coding->source;
4952   coding->charbuf_used = charbuf - coding->charbuf;
4953 }
4954
4955 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4956    Encode the elements of CODING->charbuf as SJIS text and store the
4957    result from CODING->destination + CODING->produced on.  ASCII
4958    characters are emitted as is when the coding system is ASCII
4959    compatible, and raw eight-bit bytes are emitted as bytes.  Other
4960    characters are looked up in the coding system's charset list:
4961    codes of the kana, kanji and (optional) second kanji charsets are
        emitted in SJIS form, and for any other charset the low 7 bits
        of the code are emitted.  Always returns 0.  */
4962
4963 static int
4964 encode_coding_sjis (struct coding_system *coding)
4965 {
4966   int multibytep = coding->dst_multibyte;
4967   int *charbuf = coding->charbuf;
4968   int *charbuf_end = charbuf + coding->charbuf_used;
4969   unsigned char *dst = coding->destination + coding->produced;
4970   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4971   int safe_room = 4;
4972   ptrdiff_t produced_chars = 0;
4973   Lisp_Object attrs, charset_list, val;
4974   int ascii_compatible;
4975   struct charset *charset_kanji, *charset_kana;
4976   struct charset *charset_kanji2;
4977   int c;
4978
4979   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the charset list (the one that
          decode_coding_sjis uses as its roman charset); the following
          ones are kana, kanji and, optionally, a second kanji charset.  */
4980   val = XCDR (charset_list);
4981   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4982   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4984
4985   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4986
4987   while (charbuf < charbuf_end)
4988     {
4989       ASSURE_DESTINATION (safe_room);
4990       c = *charbuf++;
4991       /* Now encode the character C.  */
4992       if (ASCII_CHAR_P (c) && ascii_compatible)
4993         EMIT_ONE_ASCII_BYTE (c);
4994       else if (CHAR_BYTE8_P (c))
4995         {
4996           c = CHAR_TO_BYTE8 (c);
4997           EMIT_ONE_BYTE (c);
4998         }
4999       else
5000         {
5001           unsigned code;
5002           struct charset *charset;
5003           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004                                &code, charset);
5005
5006           if (!charset)
5007             {
                   /* No charset was found for C: substitute either the
                      safe-encoding placeholder or the coding system's
                      default character.  */
5008               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5009                 {
5010                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011                   charset = CHARSET_FROM_ID (charset_ascii);
5012                 }
5013               else
5014                 {
5015                   c = coding->default_char;
5016                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017                                        charset_list, &code, charset);
5018                 }
5019             }
5020           if (code == CHARSET_INVALID_CODE (charset))
5021             abort ();
5022           if (charset == charset_kanji)
5023             {
5024               int c1, c2;
5025               JIS_TO_SJIS (code);
5026               c1 = code >> 8, c2 = code & 0xFF;
5027               EMIT_TWO_BYTES (c1, c2);
5028             }
5029           else if (charset == charset_kana)
5030             EMIT_ONE_BYTE (code | 0x80);
5031           else if (charset_kanji2 && charset == charset_kanji2)
5032             {
5033               int c1, c2;
5034
                   /* Only codes whose first byte is one of those listed
                      below are converted with JIS_TO_SJIS2; for the
                      others the low 7 bits of the code are emitted.  */
5035               c1 = code >> 8;
5036               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5037                   || c1 == 0x28
5038                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5039                 {
5040                   JIS_TO_SJIS2 (code);
5041                   c1 = code >> 8, c2 = code & 0xFF;
5042                   EMIT_TWO_BYTES (c1, c2);
5043                 }
5044               else
5045                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5046             }
5047           else
5048             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5049         }
5050     }
5051   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5052   coding->produced_char += produced_chars;
5053   coding->produced = dst - coding->destination;
5054   return 0;
5055 }
5056
/* Encode the elements of CODING->charbuf as BIG5 text and store the
   result from CODING->destination + CODING->produced on.  ASCII
   characters are emitted as is when the coding system is ASCII
   compatible, and raw eight-bit bytes are emitted as bytes.  Other
   characters are looked up in the coding system's charset list: codes
   of the Big5 charset are emitted as two bytes, and for any other
   charset the low 7 bits of the code are emitted.  Always returns 0.  */
5057 static int
5058 encode_coding_big5 (struct coding_system *coding)
5059 {
5060   int multibytep = coding->dst_multibyte;
5061   int *charbuf = coding->charbuf;
5062   int *charbuf_end = charbuf + coding->charbuf_used;
5063   unsigned char *dst = coding->destination + coding->produced;
5064   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5065   int safe_room = 4;
5066   ptrdiff_t produced_chars = 0;
5067   Lisp_Object attrs, charset_list, val;
5068   int ascii_compatible;
5069   struct charset *charset_big5;
5070   int c;
5071
5072   CODING_GET_INFO (coding, attrs, charset_list);
       /* The Big5 charset is the second element of the charset list.  */
5073   val = XCDR (charset_list);
5074   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5075   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5076
5077   while (charbuf < charbuf_end)
5078     {
5079       ASSURE_DESTINATION (safe_room);
5080       c = *charbuf++;
5081       /* Now encode the character C.  */
5082       if (ASCII_CHAR_P (c) && ascii_compatible)
5083         EMIT_ONE_ASCII_BYTE (c);
5084       else if (CHAR_BYTE8_P (c))
5085         {
5086           c = CHAR_TO_BYTE8 (c);
5087           EMIT_ONE_BYTE (c);
5088         }
5089       else
5090         {
5091           unsigned code;
5092           struct charset *charset;
5093           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5094                                &code, charset);
5095
5096           if (! charset)
5097             {
                   /* No charset was found for C: substitute either the
                      safe-encoding placeholder or the coding system's
                      default character.  */
5098               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5099                 {
5100                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5101                   charset = CHARSET_FROM_ID (charset_ascii);
5102                 }
5103               else
5104                 {
5105                   c = coding->default_char;
5106                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5107                                        charset_list, &code, charset);
5108                 }
5109             }
5110           if (code == CHARSET_INVALID_CODE (charset))
5111             abort ();
5112           if (charset == charset_big5)
5113             {
5114               int c1, c2;
5115
                   /* High byte first, then low byte.  */
5116               c1 = code >> 8, c2 = code & 0xFF;
5117               EMIT_TWO_BYTES (c1, c2);
5118             }
5119           else
5120             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5121         }
5122     }
5123   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5124   coding->produced_char += produced_chars;
5125   coding->produced = dst - coding->destination;
5126   return 0;
5127 }
5128
5129 \f
5130 /*** 10. CCL handlers ***/
5131
5132 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5133    Check if a text is encoded in a coding system of which
5134    encoder/decoder are written in CCL program.  CATEGORY_MASK_CCL is
5135    always added to DETECT_INFO->checked.  If every source byte is
        marked valid in the CCL category's table of valid bytes, add
        CATEGORY_MASK_CCL to DETECT_INFO->found provided at least one
        byte had a table value greater than 1, and return 1.  Otherwise
        add CATEGORY_MASK_CCL to DETECT_INFO->rejected and return 0.  */
5136
5137 static int
5138 detect_coding_ccl (struct coding_system *coding,
5139                    struct coding_detection_info *detect_info)
5140 {
5141   const unsigned char *src = coding->source, *src_base;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   int multibytep = coding->src_multibyte;
5144   ptrdiff_t consumed_chars = 0;
5145   int found = 0;
5146   unsigned char *valids;
       /* Taken from the CODING passed in, before CODING is redirected
          below.  */
5147   ptrdiff_t head_ascii = coding->head_ascii;
5148   Lisp_Object attrs;
5149
5150   detect_info->checked |= CATEGORY_MASK_CCL;
5151
       /* From here on CODING refers to the coding system registered for
          the CCL category, not to the argument.  */
5152   coding = &coding_categories[coding_category_ccl];
5153   valids = CODING_CCL_VALIDS (coding);
5154   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII part is skipped only for an ASCII compatible
          coding system.  */
5155   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156     src += head_ascii;
5157
5158   while (1)
5159     {
5160       int c;
5161
5162       src_base = src;
5163       ONE_MORE_BYTE (c);
           /* A negative value or a byte whose table entry is 0 rejects
              the category.  */
5164       if (c < 0 || ! valids[c])
5165         break;
5166       if ((valids[c] > 1))
5167         found = CATEGORY_MASK_CCL;
5168     }
5169   detect_info->rejected |= CATEGORY_MASK_CCL;
5170   return 0;
5171
       /* NOTE(review): this label is not referenced explicitly in this
          function; presumably ONE_MORE_BYTE jumps here when the source
          is exhausted -- confirm against the macro.  */
5172  no_more_source:
5173   detect_info->found |= found;
5174   return 1;
5175 }
5176
/* Decode the source of CODING, starting at CODING->consumed, by running
   the CCL program in CODING->spec.ccl on it in chunks of at most 1024
   elements.  The output goes to CODING->charbuf.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used are
   updated and the conversion result is recorded according to the final
   status of the CCL program.  */
5177 static void
5178 decode_coding_ccl (struct coding_system *coding)
5179 {
5180   const unsigned char *src = coding->source + coding->consumed;
5181   const unsigned char *src_end = coding->source + coding->src_bytes;
5182   int *charbuf = coding->charbuf + coding->charbuf_used;
5183   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5184   ptrdiff_t consumed_chars = 0;
5185   int multibytep = coding->src_multibyte;
5186   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for the CCL program and, for multibyte
          source, the byte offset from SRC of each of its elements (plus
          one final entry for the end of the chunk).  */
5187   int source_charbuf[1024];
5188   int source_byteidx[1025];
5189   Lisp_Object attrs, charset_list;
5190
5191   CODING_GET_INFO (coding, attrs, charset_list);
5192
5193   while (1)
5194     {
5195       const unsigned char *p = src;
5196       int i = 0;
5197
           /* Fill SOURCE_CHARBUF with up to 1024 characters (multibyte
              source) or bytes (unibyte source).  */
5198       if (multibytep)
5199         {
5200           while (i < 1024 && p < src_end)
5201             {
5202               source_byteidx[i] = p - src;
5203               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5204             }
5205           source_byteidx[i] = p - src;
5206         }
5207       else
5208         while (i < 1024 && p < src_end)
5209           source_charbuf[i++] = *p++;
5210
           /* Tell the CCL program when this chunk reaches the end of
              the last block.  */
5211       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5212         ccl->last_block = 1;
5213       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5214                   charset_list);
5215       charbuf += ccl->produced;
           /* Advance SRC by what the program consumed; for multibyte
              source the element count is translated to bytes through
              SOURCE_BYTEIDX.  */
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
           /* Continue only if there is more source and the program
              stopped for lack of input.  */
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
       /* Map the final CCL status to a conversion result.  */
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
/* Encode the elements of CODING->charbuf by running the CCL program in
   CODING->spec.ccl on them, at most 1024 output elements per run, and
   store the low 8 bits of each output element from
   CODING->destination + CODING->produced on.  The conversion result is
   recorded according to the final status of the CCL program.  Always
   returns 0.  */
5246 static int
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one run of the CCL program.  */
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters of the last
          block have been consumed.  */
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = 1;
5264
5265   do
5266     {
5267       ccl_driver (ccl, charbuf, destination_charbuf,
5268                   charbuf_end - charbuf, 1024, charset_list);
5269       if (multibytep)
5270         {
               /* NOTE(review): twice the produced count is reserved,
                  presumably because EMIT_ONE_BYTE may store up to two
                  bytes per element in a multibyte destination --
                  confirm against the macro.  */
5271           ASSURE_DESTINATION (ccl->produced * 2);
5272           for (i = 0; i < ccl->produced; i++)
5273             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5274         }
5275       else
5276         {
5277           ASSURE_DESTINATION (ccl->produced);
5278           for (i = 0; i < ccl->produced; i++)
5279             *dst++ = destination_charbuf[i] & 0xFF;
5280           produced_chars += ccl->produced;
5281         }
5282       charbuf += ccl->consumed;
           /* Stop early if the program quit or hit an invalid command.  */
5283       if (ccl->status == CCL_STAT_QUIT
5284           || ccl->status == CCL_STAT_INVALID_CMD)
5285         break;
5286     }
5287   while (charbuf < charbuf_end);
5288
       /* Map the final CCL status to a conversion result.  */
5289   switch (ccl->status)
5290     {
5291     case CCL_STAT_SUSPEND_BY_SRC:
5292       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5293       break;
5294     case CCL_STAT_SUSPEND_BY_DST:
5295       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5296       break;
5297     case CCL_STAT_QUIT:
5298     case CCL_STAT_INVALID_CMD:
5299       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5300       break;
5301     default:
5302       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5303       break;
5304     }
5305
5306   coding->produced_char += produced_chars;
5307   coding->produced = dst - coding->destination;
5308   return 0;
5309 }
5310
5311
5312 \f
5313 /*** 10, 11. no-conversion handlers ***/
5314
5315 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Raw text needs no conversion: set CODING->chars_at_source and
        report the whole source as consumed.  When DOS-style EOL
        conversion is in effect and a non-empty source ends with a CR,
        leave that CR unconsumed and record
        CODING_RESULT_INSUFFICIENT_SRC; otherwise record
        CODING_RESULT_SUCCESS.  */
5316
5317 static void
5318 decode_coding_raw_text (struct coding_system *coding)
5319 {
5320   int eol_dos =
5321     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5322
5323   coding->chars_at_source = 1;
5324   coding->consumed_char = coding->src_chars;
5325   coding->consumed = coding->src_bytes;
       /* Check for an empty source first: with SRC_BYTES == 0 the index
          below would be -1, i.e. a read before the start of the source.  */
5326   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5327     {
5328       coding->consumed_char--;
5329       coding->consumed--;
5330       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5331     }
5332   else
5333     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5334 }
5335
/* Store the elements of CODING->charbuf from
   CODING->destination + CODING->produced on without charset conversion.
   The four combinations of multibyte/unibyte source and destination
   are handled separately below.  The conversion result is recorded as
   CODING_RESULT_SUCCESS and the function always returns 0.  */
5336 static int
5337 encode_coding_raw_text (struct coding_system *coding)
5338 {
5339   int multibytep = coding->dst_multibyte;
5340   int *charbuf = coding->charbuf;
5341   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5342   unsigned char *dst = coding->destination + coding->produced;
5343   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5344   ptrdiff_t produced_chars = 0;
5345   int c;
5346
5347   if (multibytep)
5348     {
5349       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5350
           /* Multibyte source to multibyte destination: ASCII and raw
              bytes are emitted as single bytes; any other character is
              expanded to its multibyte form, whose bytes are then
              emitted one by one.  */
5351       if (coding->src_multibyte)
5352         while (charbuf < charbuf_end)
5353           {
5354             ASSURE_DESTINATION (safe_room);
5355             c = *charbuf++;
5356             if (ASCII_CHAR_P (c))
5357               EMIT_ONE_ASCII_BYTE (c);
5358             else if (CHAR_BYTE8_P (c))
5359               {
5360                 c = CHAR_TO_BYTE8 (c);
5361                 EMIT_ONE_BYTE (c);
5362               }
5363             else
5364               {
5365                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5366
5367                 CHAR_STRING_ADVANCE (c, p1);
5368                 do
5369                   {
5370                     EMIT_ONE_BYTE (*p0);
5371                     p0++;
5372                   }
5373                 while (p0 < p1);
5374               }
5375           }
5376       else
             /* Unibyte source to multibyte destination: every element
                is emitted as one byte.  */
5377         while (charbuf < charbuf_end)
5378           {
5379             ASSURE_DESTINATION (safe_room);
5380             c = *charbuf++;
5381             EMIT_ONE_BYTE (c);
5382           }
5383     }
5384   else
5385     {
5386       if (coding->src_multibyte)
5387         {
5388           int safe_room = MAX_MULTIBYTE_LENGTH;
5389
               /* Multibyte source to unibyte destination: ASCII and raw
                  bytes are stored as single bytes; any other character
                  is stored in its multibyte form.  */
5390           while (charbuf < charbuf_end)
5391             {
5392               ASSURE_DESTINATION (safe_room);
5393               c = *charbuf++;
5394               if (ASCII_CHAR_P (c))
5395                 *dst++ = c;
5396               else if (CHAR_BYTE8_P (c))
5397                 *dst++ = CHAR_TO_BYTE8 (c);
5398               else
5399                 CHAR_STRING_ADVANCE (c, dst);
5400             }
5401         }
5402       else
5403         {
               /* Unibyte source to unibyte destination: reserve room
                  for everything at once, then copy element by element,
                  stopping at DST_END.  */
5404           ASSURE_DESTINATION (charbuf_end - charbuf);
5405           while (charbuf < charbuf_end && dst < dst_end)
5406             *dst++ = *charbuf++;
5407         }
           /* For a unibyte destination the number of characters
              produced is the number of bytes stored in this call.  */
5408       produced_chars = dst - (coding->destination + coding->produced);
5409     }
5410   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5411   coding->produced_char += produced_chars;
5412   coding->produced = dst - coding->destination;
5413   return 0;
5414 }
5415
5416 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5417    Check if a text is encoded in a charset-based coding system.  If it
5418    is, return 1, else return 0.  */
     /* The source bytes and CODING->head_ascii are taken from the CODING
        argument, but the attributes, the table of valid bytes and the
        name are taken from the coding system registered for
        coding_category_charset (CODING is re-pointed to it below).

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.  On
        failure it is added to DETECT_INFO->rejected; on success
        DETECT_INFO->found receives CATEGORY_MASK_CHARSET only if at least
        one accepted byte >= 0x80 was seen.

        NOTE(review): ONE_MORE_BYTE is defined outside this chunk.  The
        label `no_more_source' is not referenced explicitly here, so the
        macro presumably jumps to it when the source is exhausted, and it
        presumably uses `src', `src_end', `multibytep' and
        `consumed_chars' implicitly -- confirm.  */
5419
5420 static int
5421 detect_coding_charset (struct coding_system *coding,
5422                        struct coding_detection_info *detect_info)
5423 {
5424   const unsigned char *src = coding->source, *src_base;
5425   const unsigned char *src_end = coding->source + coding->src_bytes;
5426   int multibytep = coding->src_multibyte;
5427   ptrdiff_t consumed_chars = 0;
5428   Lisp_Object attrs, valids, name;
5429   int found = 0;
5430   ptrdiff_t head_ascii = coding->head_ascii;
5431   int check_latin_extra = 0;
5432
5433   detect_info->checked |= CATEGORY_MASK_CHARSET;
5434
       /* From here on CODING refers to the category's coding system, not
          to the argument.  */
5435   coding = &coding_categories[coding_category_charset];
5436   attrs = CODING_ID_ATTRS (coding->id);
5437   valids = AREF (attrs, coding_attr_charset_valids);
5438   name = CODING_ID_NAME (coding->id);
       /* Coding systems named iso-8859-* or iso-latin-* get the extra
          check of bytes 0x80..0x9F against Vlatin_extra_code_table.  */
5439   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5440                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5441       || strncmp (SSDATA (SYMBOL_NAME (name)),
5442                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5443     check_latin_extra = 1;
5444
       /* For an ASCII compatible coding system, skip the first
          HEAD_ASCII bytes.  */
5445   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5446     src += head_ascii;
5447
5448   while (1)
5449     {
5450       int c;
5451       Lisp_Object val;
5452       struct charset *charset;
5453       int dim, idx;
5454
5455       src_base = src;
5456       ONE_MORE_BYTE (c);
5457       if (c < 0)
5458         continue;
           /* VAL is nil when C cannot start a character, otherwise a
              charset ID or a list of charset IDs.  */
5459       val = AREF (valids, c);
5460       if (NILP (val))
5461         break;
5462       if (c >= 0x80)
5463         {
5464           if (c < 0xA0
5465               && check_latin_extra
5466               && (!VECTORP (Vlatin_extra_code_table)
5467                   || NILP (AREF (Vlatin_extra_code_table, c))))
5468             break;
5469           found = CATEGORY_MASK_CHARSET;
5470         }
5471       if (INTEGERP (val))
5472         {
               /* A single candidate charset: every following byte of the
                  character must lie within that charset's code space.  */
5473           charset = CHARSET_FROM_ID (XFASTINT (val));
5474           dim = CHARSET_DIMENSION (charset);
5475           for (idx = 1; idx < dim; idx++)
5476             {
5477               if (src == src_end)
5478                 goto too_short;
5479               ONE_MORE_BYTE (c);
5480               if (c < charset->code_space[(dim - 1 - idx) * 4]
5481                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5482                 break;
5483             }
5484           if (idx < dim)
5485             break;
5486         }
5487       else
5488         {
               /* Several candidate charsets: try them in list order.  IDX
                  is not reset between candidates, so bytes already
                  accepted are not read again.  VAL is set to nil as soon
                  as one candidate accepts the whole character.  */
5489           idx = 1;
5490           for (; CONSP (val); val = XCDR (val))
5491             {
5492               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5493               dim = CHARSET_DIMENSION (charset);
5494               while (idx < dim)
5495                 {
5496                   if (src == src_end)
5497                     goto too_short;
5498                   ONE_MORE_BYTE (c);
5499                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5500                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5501                     break;
5502                   idx++;
5503                 }
5504               if (idx == dim)
5505                 {
5506                   val = Qnil;
5507                   break;
5508                 }
5509             }
5510           if (CONSP (val))
5511             break;
5512         }
5513     }
       /* Reached by `goto too_short' and also by every `break' out of the
          loop above: the text is rejected for this category.  */
5514  too_short:
5515   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5516   return 0;
5517
5518  no_more_source:
5519   detect_info->found |= found;
5520   return 1;
5521 }
5522
     /* Decode the source bytes of CODING, starting at CODING->consumed,
        with the charsets recorded in the coding system's
        `coding_attr_charset_valids' table, and append the resulting
        characters to CODING->charbuf.

        A byte sequence that cannot be decoded is handled at
        `invalid_code': one byte is re-read from the start of the
        sequence and stored as it is if ASCII or through BYTE8_TO_CHAR
        otherwise, and CODING->errors is incremented.

        On exit CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated from the state saved at the
        start of the last loop iteration (SRC_BASE and
        CONSUMED_CHARS_BASE).

        NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and
        ADD_CHARSET_DATA are defined outside this chunk.  The label
        `no_more_source' is only reached by falling out of the loop or,
        presumably, by a jump inside ONE_MORE_BYTE when the source is
        exhausted -- confirm.  */
5523 static void
5524 decode_coding_charset (struct coding_system *coding)
5525 {
5526   const unsigned char *src = coding->source + coding->consumed;
5527   const unsigned char *src_end = coding->source + coding->src_bytes;
5528   const unsigned char *src_base;
5529   int *charbuf = coding->charbuf + coding->charbuf_used;
5530   /* We may produce one charset annotation in one loop and one more at
5531      the end.  */
5532   int *charbuf_end
5533     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5534   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5535   int multibytep = coding->src_multibyte;
5536   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5537   Lisp_Object valids;
5538   ptrdiff_t char_offset = coding->produced_char;
5539   ptrdiff_t last_offset = char_offset;
5540   int last_id = charset_ascii;
5541   int eol_dos =
5542     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR under DOS eol conversion, or -1
          if there is none pending.  */
5543   int byte_after_cr = -1;
5544
5545   valids = AREF (attrs, coding_attr_charset_valids);
5546
5547   while (1)
5548     {
5549       int c;
5550       Lisp_Object val;
5551       struct charset *charset;
5552       int dim;
5553       int len = 1;
5554       unsigned code;
5555
           /* Remember where this character starts, for `invalid_code'
              and for the final bookkeeping.  */
5556       src_base = src;
5557       consumed_chars_base = consumed_chars;
5558
5559       if (charbuf >= charbuf_end)
5560         {
               /* A byte read ahead after a CR has not been processed yet;
                  step back so that it is not counted as consumed.  */
5561           if (byte_after_cr >= 0)
5562             src_base--;
5563           break;
5564         }
5565
5566       if (byte_after_cr >= 0)
5567         {
5568           c = byte_after_cr;
5569           byte_after_cr = -1;
5570         }
5571       else
5572         {
5573           ONE_MORE_BYTE (c);
5574           if (eol_dos && c == '\r')
5575             ONE_MORE_BYTE (byte_after_cr);
5576         }
5577       if (c < 0)
5578         goto invalid_code;
5579       code = c;
5580
           /* VAL is a charset ID, a list of charset IDs, or something
              else when C cannot start a character.  */
5581       val = AREF (valids, c);
5582       if (! INTEGERP (val) && ! CONSP (val))
5583         goto invalid_code;
5584       if (INTEGERP (val))
5585         {
5586           charset = CHARSET_FROM_ID (XFASTINT (val));
5587           dim = CHARSET_DIMENSION (charset);
               /* Accumulate the remaining bytes of the code point, most
                  significant byte first.  */
5588           while (len < dim)
5589             {
5590               ONE_MORE_BYTE (c);
5591               code = (code << 8) | c;
5592               len++;
5593             }
5594           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5595                               charset, code, c);
5596         }
5597       else
5598         {
5599           /* VAL is a list of charset IDs.  It is assured that the
5600              list is sorted by charset dimensions (smaller one
5601              comes first).  */
5602           while (CONSP (val))
5603             {
5604               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5605               dim = CHARSET_DIMENSION (charset);
5606               while (len < dim)
5607                 {
5608                   ONE_MORE_BYTE (c);
5609                   code = (code << 8) | c;
5610                   len++;
5611                 }
5612               CODING_DECODE_CHAR (coding, src, src_base,
5613                                   src_end, charset, code, c);
                   /* The first charset that yields a non-negative
                      character wins.  */
5614               if (c >= 0)
5615                 break;
5616               val = XCDR (val);
5617             }
5618         }
5619       if (c < 0)
5620         goto invalid_code;
           /* When the charset changes to a non-ASCII one, close the run
              of the previous non-ASCII charset with ADD_CHARSET_DATA and
              start a new run at the current offset.  */
5621       if (charset->id != charset_ascii
5622           && last_id != charset->id)
5623         {
5624           if (last_id != charset_ascii)
5625             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5626           last_id = charset->id;
5627           last_offset = char_offset;
5628         }
5629
5630       *charbuf++ = c;
5631       char_offset++;
5632       continue;
5633
5634     invalid_code:
           /* Rewind to the start of the failed sequence and pass exactly
              one byte through.  */
5635       src = src_base;
5636       consumed_chars = consumed_chars_base;
5637       ONE_MORE_BYTE (c);
5638       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5639       char_offset++;
5640       coding->errors++;
5641     }
5642
5643  no_more_source:
5644   if (last_id != charset_ascii)
5645     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5646   coding->consumed_char += consumed_chars_base;
5647   coding->consumed = src_base - coding->source;
5648   coding->charbuf_used = charbuf - coding->charbuf;
5649 }
5650
     /* Encode the characters in CODING->charbuf with the charsets of
        CODING's charset list, writing the output at
        CODING->destination + CODING->produced.

        An ASCII character is emitted as it is when the coding system is
        ASCII compatible, and an eight-bit character (CHAR_BYTE8_P) is
        emitted as its raw byte.  For any other character
        CODING_CHAR_CHARSET looks up a charset and code; the code is
        emitted as one to four bytes according to the charset dimension.
        If no charset is found, one substitute byte is emitted instead.

        Always records CODING_RESULT_SUCCESS, adds to
        CODING->produced_char, sets CODING->produced, and returns 0.

        NOTE(review): ASSURE_DESTINATION, the EMIT_* macros and
        CODING_CHAR_CHARSET are defined outside this chunk.  They
        presumably use the locals `dst', `dst_end', `charbuf',
        `charbuf_end', `multibytep' and `produced_chars' implicitly --
        confirm before renaming any of them.  */
5651 static int
5652 encode_coding_charset (struct coding_system *coding)
5653 {
5654   int multibytep = coding->dst_multibyte;
5655   int *charbuf = coding->charbuf;
5656   int *charbuf_end = charbuf + coding->charbuf_used;
5657   unsigned char *dst = coding->destination + coding->produced;
5658   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5659   int safe_room = MAX_MULTIBYTE_LENGTH;
5660   ptrdiff_t produced_chars = 0;
5661   Lisp_Object attrs, charset_list;
5662   int ascii_compatible;
5663   int c;
5664
5665   CODING_GET_INFO (coding, attrs, charset_list);
5666   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5667
5668   while (charbuf < charbuf_end)
5669     {
5670       struct charset *charset;
5671       unsigned code;
5672
5673       ASSURE_DESTINATION (safe_room);
5674       c = *charbuf++;
5675       if (ascii_compatible && ASCII_CHAR_P (c))
5676         EMIT_ONE_ASCII_BYTE (c);
5677       else if (CHAR_BYTE8_P (c))
5678         {
5679           c = CHAR_TO_BYTE8 (c);
5680           EMIT_ONE_BYTE (c);
5681         }
5682       else
5683         {
5684           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5685                                &code, charset);
5686
5687           if (charset)
5688             {
                   /* Emit CODE most significant byte first, using as many
                      bytes as the charset has dimensions.  */
5689               if (CHARSET_DIMENSION (charset) == 1)
5690                 EMIT_ONE_BYTE (code);
5691               else if (CHARSET_DIMENSION (charset) == 2)
5692                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5693               else if (CHARSET_DIMENSION (charset) == 3)
5694                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5695               else
5696                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5697                                  (code >> 8) & 0xFF, code & 0xFF);
5698             }
5699           else
5700             {
                   /* No charset can encode C: emit a substitute byte,
                      chosen by whether CODING_MODE_SAFE_ENCODING is set.  */
5701               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5702                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5703               else
5704                 c = coding->default_char;
5705               EMIT_ONE_BYTE (c);
5706             }
5707         }
5708     }
5709
5710   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5711   coding->produced_char += produced_chars;
5712   coding->produced = dst - coding->destination;
5713   return 0;
5714 }
5715
5716 \f
5717 /*** 10. C library functions ***/
5718
5719 /* Setup coding context CODING from information about CODING_SYSTEM.
5720    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5721    CODING_SYSTEM is invalid, signal an error.  */
5722
5723 void
5724 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5725 {
5726   Lisp_Object attrs;
5727   Lisp_Object eol_type;
5728   Lisp_Object coding_type;
5729   Lisp_Object val;
5730
5731   if (NILP (coding_system))
5732     coding_system = Qundecided;
5733
5734   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5735
5736   attrs = CODING_ID_ATTRS (coding->id);
5737   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5738
5739   coding->mode = 0;
5740   coding->head_ascii = -1;
5741   if (VECTORP (eol_type))
5742     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5743                             | CODING_REQUIRE_DETECTION_MASK);
5744   else if (! EQ (eol_type, Qunix))
5745     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5746                             | CODING_REQUIRE_ENCODING_MASK);
5747   else
5748     coding->common_flags = 0;
5749   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5750     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5751   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5752     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5753   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5754     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5755
5756   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5757   coding->max_charset_id = SCHARS (val) - 1;
5758   coding->safe_charsets = SDATA (val);
5759   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5760   coding->carryover_bytes = 0;
5761
5762   coding_type = CODING_ATTR_TYPE (attrs);
5763   if (EQ (coding_type, Qundecided))
5764     {
5765       coding->detector = NULL;
5766       coding->decoder = decode_coding_raw_text;
5767       coding->encoder = encode_coding_raw_text;
5768       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5769     }
5770   else if (EQ (coding_type, Qiso_2022))
5771     {
5772       int i;
5773       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5774
5775       /* Invoke graphic register 0 to plane 0.  */
5776       CODING_ISO_INVOCATION (coding, 0) = 0;
5777       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5778       CODING_ISO_INVOCATION (coding, 1)
5779         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5780       /* Setup the initial status of designation.  */
5781       for (i = 0; i < 4; i++)
5782         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5783       /* Not single shifting initially.  */
5784       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5785       /* Beginning of buffer should also be regarded as bol. */
5786       CODING_ISO_BOL (coding) = 1;
5787       coding->detector = detect_coding_iso_2022;
5788       coding->decoder = decode_coding_iso_2022;
5789       coding->encoder = encode_coding_iso_2022;
5790       if (flags & CODING_ISO_FLAG_SAFE)
5791         coding->mode |= CODING_MODE_SAFE_ENCODING;
5792       coding->common_flags
5793         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5794             | CODING_REQUIRE_FLUSHING_MASK);
5795       if (flags & CODING_ISO_FLAG_COMPOSITION)
5796         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5797       if (flags & CODING_ISO_FLAG_DESIGNATION)
5798         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5799       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5800         {
5801           setup_iso_safe_charsets (attrs);
5802           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5803           coding->max_charset_id = SCHARS (val) - 1;
5804           coding->safe_charsets = SDATA (val);
5805         }
5806       CODING_ISO_FLAGS (coding) = flags;
5807       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5808       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5809       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5810       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5811     }
5812   else if (EQ (coding_type, Qcharset))
5813     {
5814       coding->detector = detect_coding_charset;
5815       coding->decoder = decode_coding_charset;
5816       coding->encoder = encode_coding_charset;
5817       coding->common_flags
5818         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5819     }
5820   else if (EQ (coding_type, Qutf_8))
5821     {
5822       val = AREF (attrs, coding_attr_utf_bom);
5823       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5824                                    : EQ (val, Qt) ? utf_with_bom
5825                                    : utf_without_bom);
5826       coding->detector = detect_coding_utf_8;
5827       coding->decoder = decode_coding_utf_8;
5828       coding->encoder = encode_coding_utf_8;
5829       coding->common_flags
5830         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5831       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5832         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5833     }
5834   else if (EQ (coding_type, Qutf_16))
5835     {
5836       val = AREF (attrs, coding_attr_utf_bom);
5837       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5838                                     : EQ (val, Qt) ? utf_with_bom
5839                                     : utf_without_bom);
5840       val = AREF (attrs, coding_attr_utf_16_endian);
5841       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5842                                        : utf_16_little_endian);
5843       CODING_UTF_16_SURROGATE (coding) = 0;
5844       coding->detector = detect_coding_utf_16;
5845       coding->decoder = decode_coding_utf_16;
5846       coding->encoder = encode_coding_utf_16;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5850         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5851     }
5852   else if (EQ (coding_type, Qccl))
5853     {
5854       coding->detector = detect_coding_ccl;
5855       coding->decoder = decode_coding_ccl;
5856       coding->encoder = encode_coding_ccl;
5857       coding->common_flags
5858         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5859             | CODING_REQUIRE_FLUSHING_MASK);
5860     }
5861   else if (EQ (coding_type, Qemacs_mule))
5862     {
5863       coding->detector = detect_coding_emacs_mule;
5864       coding->decoder = decode_coding_emacs_mule;
5865       coding->encoder = encode_coding_emacs_mule;
5866       coding->common_flags
5867         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5868       coding->spec.emacs_mule.full_support = 1;
5869       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5870           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5871         {
5872           Lisp_Object tail, safe_charsets;
5873           int max_charset_id = 0;
5874
5875           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5876                tail = XCDR (tail))
5877             if (max_charset_id < XFASTINT (XCAR (tail)))
5878               max_charset_id = XFASTINT (XCAR (tail));
5879           safe_charsets = make_uninit_string (max_charset_id + 1);
5880           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5881           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5882                tail = XCDR (tail))
5883             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5884           coding->max_charset_id = max_charset_id;
5885           coding->safe_charsets = SDATA (safe_charsets);
5886           coding->spec.emacs_mule.full_support = 1;
5887         }
5888       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5889       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5890     }
5891   else if (EQ (coding_type, Qshift_jis))
5892     {
5893       coding->detector = detect_coding_sjis;
5894       coding->decoder = decode_coding_sjis;
5895       coding->encoder = encode_coding_sjis;
5896       coding->common_flags
5897         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5898     }
5899   else if (EQ (coding_type, Qbig5))
5900     {
5901       coding->detector = detect_coding_big5;
5902       coding->decoder = decode_coding_big5;
5903       coding->encoder = encode_coding_big5;
5904       coding->common_flags
5905         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5906     }
5907   else                          /* EQ (coding_type, Qraw_text) */
5908     {
5909       coding->detector = NULL;
5910       coding->decoder = decode_coding_raw_text;
5911       coding->encoder = encode_coding_raw_text;
5912       if (! EQ (eol_type, Qunix))
5913         {
5914           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5915           if (! VECTORP (eol_type))
5916             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5917         }
5918
5919     }
5920
5921   return;
5922 }
5923
5924 /* Return a list of charsets supported by CODING.  */
5925
5926 Lisp_Object
5927 coding_charset_list (struct coding_system *coding)
5928 {
5929   Lisp_Object attrs, charset_list;
5930
5931   CODING_GET_INFO (coding, attrs, charset_list);
5932   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5933     {
5934       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5935
5936       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5937         charset_list = Viso_2022_charset_list;
5938     }
5939   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5940     {
5941       charset_list = Vemacs_mule_charset_list;
5942     }
5943   return charset_list;
5944 }
5945
5946
5947 /* Return a list of charsets supported by CODING-SYSTEM.  */
5948
5949 Lisp_Object
5950 coding_system_charset_list (Lisp_Object coding_system)
5951 {
5952   ptrdiff_t id;
5953   Lisp_Object attrs, charset_list;
5954
5955   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5956   attrs = CODING_ID_ATTRS (id);
5957
5958   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5959     {
5960       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5961
5962       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5963         charset_list = Viso_2022_charset_list;
5964       else
5965         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5966     }
5967   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5968     {
5969       charset_list = Vemacs_mule_charset_list;
5970     }
5971   else
5972     {
5973       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5974     }
5975   return charset_list;
5976 }
5977
5978
5979 /* Return raw-text or one of its subsidiaries that has the same
5980    eol_type as CODING-SYSTEM.  */
5981
5982 Lisp_Object
5983 raw_text_coding_system (Lisp_Object coding_system)
5984 {
5985   Lisp_Object spec, attrs;
5986   Lisp_Object eol_type, raw_text_eol_type;
5987
5988   if (NILP (coding_system))
5989     return Qraw_text;
5990   spec = CODING_SYSTEM_SPEC (coding_system);
5991   attrs = AREF (spec, 0);
5992
5993   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5994     return coding_system;
5995
5996   eol_type = AREF (spec, 2);
5997   if (VECTORP (eol_type))
5998     return Qraw_text;
5999   spec = CODING_SYSTEM_SPEC (Qraw_text);
6000   raw_text_eol_type = AREF (spec, 2);
6001   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6002           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6003           : AREF (raw_text_eol_type, 2));
6004 }
6005
6006
6007 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6008    the subsidiary that has the same eol-spec as PARENT (if it is not
6009    nil and specifies end-of-line format) or the system's setting
6010    (system_eol_type).  */
6011
6012 Lisp_Object
6013 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6014 {
6015   Lisp_Object spec, eol_type;
6016
6017   if (NILP (coding_system))
6018     coding_system = Qraw_text;
6019   spec = CODING_SYSTEM_SPEC (coding_system);
6020   eol_type = AREF (spec, 2);
6021   if (VECTORP (eol_type))
6022     {
6023       Lisp_Object parent_eol_type;
6024
6025       if (! NILP (parent))
6026         {
6027           Lisp_Object parent_spec;
6028
6029           parent_spec = CODING_SYSTEM_SPEC (parent);
6030           parent_eol_type = AREF (parent_spec, 2);
6031           if (VECTORP (parent_eol_type))
6032             parent_eol_type = system_eol_type;
6033         }
6034       else
6035         parent_eol_type = system_eol_type;
6036       if (EQ (parent_eol_type, Qunix))
6037         coding_system = AREF (eol_type, 0);
6038       else if (EQ (parent_eol_type, Qdos))
6039         coding_system = AREF (eol_type, 1);
6040       else if (EQ (parent_eol_type, Qmac))
6041         coding_system = AREF (eol_type, 2);
6042     }
6043   return coding_system;
6044 }
6045
6046
6047 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6048    decided for writing to a process.  If not, complement them, and
6049    return a new coding system.  */
6050
6051 Lisp_Object
6052 complement_process_encoding_system (Lisp_Object coding_system)
6053 {
6054   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6055   Lisp_Object spec, attrs;
6056   int i;
6057
6058   for (i = 0; i < 3; i++)
6059     {
6060       if (i == 1)
6061         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6062       else if (i == 2)
6063         coding_system = preferred_coding_system ();
6064       spec = CODING_SYSTEM_SPEC (coding_system);
6065       if (NILP (spec))
6066         continue;
6067       attrs = AREF (spec, 0);
6068       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6069         coding_base = CODING_ATTR_BASE_NAME (attrs);
6070       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6071         eol_base = coding_system;
6072       if (! NILP (coding_base) && ! NILP (eol_base))
6073         break;
6074     }
6075
6076   if (i > 0)
6077     /* The original CODING_SYSTEM didn't specify text-conversion or
6078        eol-conversion.  Be sure that we return a fully complemented
6079        coding system.  */
6080     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6081   return coding_system;
6082 }
6083
6084
6085 /* Emacs has a mechanism to automatically detect a coding system if it
6086    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6087    it's impossible to distinguish some coding systems accurately
6088    because they use the same range of codes.  So, coding systems are
6089    first grouped into the following categories:
6090
6091    o coding-category-emacs-mule
6092
6093         The category for a coding system which has the same code range
6094         as Emacs' internal format.  Assigned the coding-system (Lisp
6095         symbol) `emacs-mule' by default.
6096
6097    o coding-category-sjis
6098
6099         The category for a coding system which has the same code range
6100         as SJIS.  Assigned the coding-system (Lisp
6101         symbol) `japanese-shift-jis' by default.
6102
6103    o coding-category-iso-7
6104
6105         The category for a coding system which has the same code range
6106         as ISO2022 of 7-bit environment.  This doesn't use any locking
6107         shift and single shift functions.  This can encode/decode all
6108         charsets.  Assigned the coding-system (Lisp symbol)
6109         `iso-2022-7bit' by default.
6110
6111    o coding-category-iso-7-tight
6112
6113         Same as coding-category-iso-7 except that this can
6114         encode/decode only the specified charsets.
6115
6116    o coding-category-iso-8-1
6117
6118         The category for a coding system which has the same code range
6119         as ISO2022 of 8-bit environment and graphic plane 1 used only
6120         for DIMENSION1 charset.  This doesn't use any locking shift
6121         and single shift functions.  Assigned the coding-system (Lisp
6122         symbol) `iso-latin-1' by default.
6123
6124    o coding-category-iso-8-2
6125
6126         The category for a coding system which has the same code range
6127         as ISO2022 of 8-bit environment and graphic plane 1 used only
6128         for DIMENSION2 charset.  This doesn't use any locking shift
6129         and single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `japanese-iso-8bit' by default.
6131
6132    o coding-category-iso-7-else
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 7-bit environment but uses locking shift or
6136         single shift functions.  Assigned the coding-system (Lisp
6137         symbol) `iso-2022-7bit-lock' by default.
6138
6139    o coding-category-iso-8-else
6140
6141         The category for a coding system which has the same code range
6142         as ISO2022 of 8-bit environment but uses locking shift or
6143         single shift functions.  Assigned the coding-system (Lisp
6144         symbol) `iso-2022-8bit-ss2' by default.
6145
6146    o coding-category-big5
6147
6148         The category for a coding system which has the same code range
6149         as BIG5.  Assigned the coding-system (Lisp symbol)
6150         `cn-big5' by default.
6151
6152    o coding-category-utf-8
6153
6154         The category for a coding system which has the same code range
6155         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6156         symbol) `utf-8' by default.
6157
6158    o coding-category-utf-16-be
6159
6160         The category for a coding system in which a text has a
6161         Unicode signature (cf. Unicode Standard) in the order of BIG
6162         endian at the head.  Assigned the coding-system (Lisp symbol)
6163         `utf-16-be' by default.
6164
6165    o coding-category-utf-16-le
6166
6167         The category for a coding system in which a text has a
6168         Unicode signature (cf. Unicode Standard) in the order of
6169         LITTLE endian at the head.  Assigned the coding-system (Lisp
6170         symbol) `utf-16-le' by default.
6171
6172    o coding-category-ccl
6173
6174         The category for a coding system of which encoder/decoder is
6175         written in CCL programs.  The default value is nil, i.e., no
6176         coding system is assigned.
6177
6178    o coding-category-binary
6179
6180         The category for a coding system not categorized in any of the
6181         above.  Assigned the coding-system (Lisp symbol)
6182         `no-conversion' by default.
6183
6184    Each of them is a Lisp symbol whose value is an actual
6185    `coding-system' (also a Lisp symbol) assigned by a user.
6186    What Emacs actually does is detect the category of a coding system
6187    and then use the `coding-system' assigned to that category.  If Emacs
6188    can't narrow the choice down to a single category, it selects the
6189    category of the highest priority.  Priorities of categories are also
6190    specified by a user in the Lisp variable `coding-category-list'.
6191
6192 */
6193
     /* Values returned by detect_eol, telling which end-of-line form was
        found: none at all, a bare LF, a bare CR, or a CR LF pair.  */
6194 #define EOL_SEEN_NONE   0
6195 #define EOL_SEEN_LF     1
6196 #define EOL_SEEN_CR     2
6197 #define EOL_SEEN_CRLF   4
6198
6199 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6200    SOURCE is encoded.  If CATEGORY is one of
6201    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6202    two-byte, else they are encoded by one-byte.
6203
6204    Return one of EOL_SEEN_XXX.  */
6205
     /* detect_eol stops scanning once it has examined this many
        end-of-lines.  */
6206 #define MAX_EOL_CHECK_COUNT 3
6207
6208 static int
6209 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6210             enum coding_category category)
6211 {
6212   const unsigned char *src = source, *src_end = src + src_bytes;
6213   unsigned char c;
6214   int total  = 0;
6215   int eol_seen = EOL_SEEN_NONE;
6216
6217   if ((1 << category) & CATEGORY_MASK_UTF_16)
6218     {
6219       int msb, lsb;
6220
6221       msb = category == (coding_category_utf_16_le
6222                          | coding_category_utf_16_le_nosig);
6223       lsb = 1 - msb;
6224
6225       while (src + 1 < src_end)
6226         {
6227           c = src[lsb];
6228           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6229             {
6230               int this_eol;
6231
6232               if (c == '\n')
6233                 this_eol = EOL_SEEN_LF;
6234               else if (src + 3 >= src_end
6235                        || src[msb + 2] != 0
6236                        || src[lsb + 2] != '\n')
6237                 this_eol = EOL_SEEN_CR;
6238               else
6239                 {
6240                   this_eol = EOL_SEEN_CRLF;
6241                   src += 2;
6242                 }
6243
6244               if (eol_seen == EOL_SEEN_NONE)
6245                 /* This is the first end-of-line.  */
6246                 eol_seen = this_eol;
6247               else if (eol_seen != this_eol)
6248                 {
6249                   /* The found type is different from what found before.
6250                      Allow for stray ^M characters in DOS EOL files.  */
6251                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6252                       || (eol_seen == EOL_SEEN_CRLF
6253                           && this_eol == EOL_SEEN_CR))
6254                     eol_seen = EOL_SEEN_CRLF;
6255                   else
6256                     {
6257                       eol_seen = EOL_SEEN_LF;
6258                       break;
6259                     }
6260                 }
6261               if (++total == MAX_EOL_CHECK_COUNT)
6262                 break;
6263             }
6264           src += 2;
6265         }
6266     }
6267   else
6268     while (src < src_end)
6269       {
6270         c = *src++;
6271         if (c == '\n' || c == '\r')
6272           {
6273             int this_eol;
6274
6275             if (c == '\n')
6276               this_eol = EOL_SEEN_LF;
6277             else if (src >= src_end || *src != '\n')
6278               this_eol = EOL_SEEN_CR;
6279             else
6280               this_eol = EOL_SEEN_CRLF, src++;
6281
6282             if (eol_seen == EOL_SEEN_NONE)
6283               /* This is the first end-of-line.  */
6284               eol_seen = this_eol;
6285             else if (eol_seen != this_eol)
6286               {
6287                 /* The found type is different from what found before.
6288                    Allow for stray ^M characters in DOS EOL files.  */
6289                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6290                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6291                   eol_seen = EOL_SEEN_CRLF;
6292                 else
6293                   {
6294                     eol_seen = EOL_SEEN_LF;
6295                     break;
6296                   }
6297               }
6298             if (++total == MAX_EOL_CHECK_COUNT)
6299               break;
6300           }
6301       }
6302   return eol_seen;
6303 }
6304
6305
6306 static Lisp_Object
6307 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6308 {
6309   Lisp_Object eol_type;
6310
6311   eol_type = CODING_ID_EOL_TYPE (coding->id);
6312   if (eol_seen & EOL_SEEN_LF)
6313     {
6314       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315       eol_type = Qunix;
6316     }
6317   else if (eol_seen & EOL_SEEN_CRLF)
6318     {
6319       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320       eol_type = Qdos;
6321     }
6322   else if (eol_seen & EOL_SEEN_CR)
6323     {
6324       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325       eol_type = Qmac;
6326     }
6327   return eol_type;
6328 }
6329
6330 /* Detect how a text specified in CODING is encoded.  If a coding
6331    system is detected, update fields of CODING by the detected coding
6332    system.  */
6333
6334 static void
6335 detect_coding (struct coding_system *coding)
6336 {
6337   const unsigned char *src, *src_end;
6338   int saved_mode = coding->mode;
6339
6340   coding->consumed = coding->consumed_char = 0;
6341   coding->produced = coding->produced_char = 0;
6342   coding_set_source (coding);
6343
6344   src_end = coding->source + coding->src_bytes;
6345   coding->head_ascii = 0;
6346
6347   /* If we have not yet decided the text encoding type, detect it
6348      now.  */
6349   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6350     {
6351       int c, i;
6352       struct coding_detection_info detect_info;
6353       int null_byte_found = 0, eight_bit_found = 0;
6354
6355       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6356       for (src = coding->source; src < src_end; src++)
6357         {
6358           c = *src;
6359           if (c & 0x80)
6360             {
6361               eight_bit_found = 1;
6362               if (null_byte_found)
6363                 break;
6364             }
6365           else if (c < 0x20)
6366             {
6367               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6368                   && ! inhibit_iso_escape_detection
6369                   && ! detect_info.checked)
6370                 {
6371                   if (detect_coding_iso_2022 (coding, &detect_info))
6372                     {
6373                       /* We have scanned the whole data.  */
6374                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6375                         {
6376                           /* We didn't find an 8-bit code.  We may
6377                              have found a null-byte, but it's very
6378                              rare that a binary file conforms to
6379                              ISO-2022.  */
6380                           src = src_end;
6381                           coding->head_ascii = src - coding->source;
6382                         }
6383                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6384                       break;
6385                     }
6386                 }
6387               else if (! c && !inhibit_null_byte_detection)
6388                 {
6389                   null_byte_found = 1;
6390                   if (eight_bit_found)
6391                     break;
6392                 }
6393               if (! eight_bit_found)
6394                 coding->head_ascii++;
6395             }
6396           else if (! eight_bit_found)
6397             coding->head_ascii++;
6398         }
6399
6400       if (null_byte_found || eight_bit_found
6401           || coding->head_ascii < coding->src_bytes
6402           || detect_info.found)
6403         {
6404           enum coding_category category;
6405           struct coding_system *this;
6406
6407           if (coding->head_ascii == coding->src_bytes)
6408             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6409             for (i = 0; i < coding_category_raw_text; i++)
6410               {
6411                 category = coding_priorities[i];
6412                 this = coding_categories + category;
6413                 if (detect_info.found & (1 << category))
6414                   break;
6415               }
6416           else
6417             {
6418               if (null_byte_found)
6419                 {
6420                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6421                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6422                 }
6423               for (i = 0; i < coding_category_raw_text; i++)
6424                 {
6425                   category = coding_priorities[i];
6426                   this = coding_categories + category;
6427                   if (this->id < 0)
6428                     {
6429                       /* No coding system of this category is defined.  */
6430                       detect_info.rejected |= (1 << category);
6431                     }
6432                   else if (category >= coding_category_raw_text)
6433                     continue;
6434                   else if (detect_info.checked & (1 << category))
6435                     {
6436                       if (detect_info.found & (1 << category))
6437                         break;
6438                     }
6439                   else if ((*(this->detector)) (coding, &detect_info)
6440                            && detect_info.found & (1 << category))
6441                     {
6442                       if (category == coding_category_utf_16_auto)
6443                         {
6444                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6445                             category = coding_category_utf_16_le;
6446                           else
6447                             category = coding_category_utf_16_be;
6448                         }
6449                       break;
6450                     }
6451                 }
6452             }
6453
6454           if (i < coding_category_raw_text)
6455             setup_coding_system (CODING_ID_NAME (this->id), coding);
6456           else if (null_byte_found)
6457             setup_coding_system (Qno_conversion, coding);
6458           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6459                    == CATEGORY_MASK_ANY)
6460             setup_coding_system (Qraw_text, coding);
6461           else if (detect_info.rejected)
6462             for (i = 0; i < coding_category_raw_text; i++)
6463               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6464                 {
6465                   this = coding_categories + coding_priorities[i];
6466                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6467                   break;
6468                 }
6469         }
6470     }
6471   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6472            == coding_category_utf_8_auto)
6473     {
6474       Lisp_Object coding_systems;
6475       struct coding_detection_info detect_info;
6476
6477       coding_systems
6478         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6479       detect_info.found = detect_info.rejected = 0;
6480       coding->head_ascii = 0;
6481       if (CONSP (coding_systems)
6482           && detect_coding_utf_8 (coding, &detect_info))
6483         {
6484           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6485             setup_coding_system (XCAR (coding_systems), coding);
6486           else
6487             setup_coding_system (XCDR (coding_systems), coding);
6488         }
6489     }
6490   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6491            == coding_category_utf_16_auto)
6492     {
6493       Lisp_Object coding_systems;
6494       struct coding_detection_info detect_info;
6495
6496       coding_systems
6497         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6498       detect_info.found = detect_info.rejected = 0;
6499       coding->head_ascii = 0;
6500       if (CONSP (coding_systems)
6501           && detect_coding_utf_16 (coding, &detect_info))
6502         {
6503           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6504             setup_coding_system (XCAR (coding_systems), coding);
6505           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6506             setup_coding_system (XCDR (coding_systems), coding);
6507         }
6508     }
6509   coding->mode = saved_mode;
6510 }
6511
6512
6513 static void
6514 decode_eol (struct coding_system *coding)
6515 {
6516   Lisp_Object eol_type;
6517   unsigned char *p, *pbeg, *pend;
6518
6519   eol_type = CODING_ID_EOL_TYPE (coding->id);
6520   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6521     return;
6522
6523   if (NILP (coding->dst_object))
6524     pbeg = coding->destination;
6525   else
6526     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6527   pend = pbeg + coding->produced;
6528
6529   if (VECTORP (eol_type))
6530     {
6531       int eol_seen = EOL_SEEN_NONE;
6532
6533       for (p = pbeg; p < pend; p++)
6534         {
6535           if (*p == '\n')
6536             eol_seen |= EOL_SEEN_LF;
6537           else if (*p == '\r')
6538             {
6539               if (p + 1 < pend && *(p + 1) == '\n')
6540                 {
6541                   eol_seen |= EOL_SEEN_CRLF;
6542                   p++;
6543                 }
6544               else
6545                 eol_seen |= EOL_SEEN_CR;
6546             }
6547         }
6548       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6549       if ((eol_seen & EOL_SEEN_CRLF) != 0
6550           && (eol_seen & EOL_SEEN_CR) != 0
6551           && (eol_seen & EOL_SEEN_LF) == 0)
6552         eol_seen = EOL_SEEN_CRLF;
6553       else if (eol_seen != EOL_SEEN_NONE
6554           && eol_seen != EOL_SEEN_LF
6555           && eol_seen != EOL_SEEN_CRLF
6556           && eol_seen != EOL_SEEN_CR)
6557         eol_seen = EOL_SEEN_LF;
6558       if (eol_seen != EOL_SEEN_NONE)
6559         eol_type = adjust_coding_eol_type (coding, eol_seen);
6560     }
6561
6562   if (EQ (eol_type, Qmac))
6563     {
6564       for (p = pbeg; p < pend; p++)
6565         if (*p == '\r')
6566           *p = '\n';
6567     }
6568   else if (EQ (eol_type, Qdos))
6569     {
6570       ptrdiff_t n = 0;
6571
6572       if (NILP (coding->dst_object))
6573         {
6574           /* Start deleting '\r' from the tail to minimize the memory
6575              movement.  */
6576           for (p = pend - 2; p >= pbeg; p--)
6577             if (*p == '\r')
6578               {
6579                 memmove (p, p + 1, pend-- - p - 1);
6580                 n++;
6581               }
6582         }
6583       else
6584         {
6585           ptrdiff_t pos_byte = coding->dst_pos_byte;
6586           ptrdiff_t pos = coding->dst_pos;
6587           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6588
6589           while (pos < pos_end)
6590             {
6591               p = BYTE_POS_ADDR (pos_byte);
6592               if (*p == '\r' && p[1] == '\n')
6593                 {
6594                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6595                   n++;
6596                   pos_end--;
6597                 }
6598               pos++;
6599               if (coding->dst_multibyte)
6600                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6601               else
6602                 pos_byte++;
6603             }
6604         }
6605       coding->produced -= n;
6606       coding->produced_char -= n;
6607     }
6608 }
6609
6610
6611 /* Return a translation table (or list of them) from coding system
6612    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6613    decoding (ENCODEP is zero). */
6614
6615 static Lisp_Object
6616 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6617 {
6618   Lisp_Object standard, translation_table;
6619   Lisp_Object val;
6620
6621   if (NILP (Venable_character_translation))
6622     {
6623       if (max_lookup)
6624         *max_lookup = 0;
6625       return Qnil;
6626     }
6627   if (encodep)
6628     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6629       standard = Vstandard_translation_table_for_encode;
6630   else
6631     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6632       standard = Vstandard_translation_table_for_decode;
6633   if (NILP (translation_table))
6634     translation_table = standard;
6635   else
6636     {
6637       if (SYMBOLP (translation_table))
6638         translation_table = Fget (translation_table, Qtranslation_table);
6639       else if (CONSP (translation_table))
6640         {
6641           translation_table = Fcopy_sequence (translation_table);
6642           for (val = translation_table; CONSP (val); val = XCDR (val))
6643             if (SYMBOLP (XCAR (val)))
6644               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6645         }
6646       if (CHAR_TABLE_P (standard))
6647         {
6648           if (CONSP (translation_table))
6649             translation_table = nconc2 (translation_table,
6650                                         Fcons (standard, Qnil));
6651           else
6652             translation_table = Fcons (translation_table,
6653                                        Fcons (standard, Qnil));
6654         }
6655     }
6656
6657   if (max_lookup)
6658     {
6659       *max_lookup = 1;
6660       if (CHAR_TABLE_P (translation_table)
6661           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6662         {
6663           val = XCHAR_TABLE (translation_table)->extras[1];
6664           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6665             *max_lookup = XFASTINT (val);
6666         }
6667       else if (CONSP (translation_table))
6668         {
6669           Lisp_Object tail;
6670
6671           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6672             if (CHAR_TABLE_P (XCAR (tail))
6673                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6674               {
6675                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6676                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6677                   *max_lookup = XFASTINT (tailval);
6678               }
6679         }
6680     }
6681   return translation_table;
6682 }
6683
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set TRANS to the result.

   If the entry found for C is a character, C is replaced by that
   character and TRANS is set to Qnil.  Otherwise TRANS is set to the
   entry itself, which is Qnil when there is none.  When TABLE is a
   list, its char-tables are consulted in order, each with the
   current (possibly already replaced) value of C, and the lookup
   stops at the first entry that is neither nil nor a character;
   list elements that are not char-tables are skipped.  If TABLE is
   neither a char-table nor a cons, TRANS is set to Qnil and C is
   unchanged.

   C and TRANS must be lvalues.  All arguments are evaluated more
   than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6708
6709
6710 /* Return a translation of character(s) at BUF according to TRANS.
6711    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6712    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6713    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6714    translation is found, and Qnil if not found..
6715    If BUF is too short to lookup characters in FROM, return Qt.  */
6716
6717 static Lisp_Object
6718 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6719 {
6720
6721   if (INTEGERP (trans))
6722     return trans;
6723   for (; CONSP (trans); trans = XCDR (trans))
6724     {
6725       Lisp_Object val = XCAR (trans);
6726       Lisp_Object from = XCAR (val);
6727       ptrdiff_t len = ASIZE (from);
6728       ptrdiff_t i;
6729
6730       for (i = 0; i < len; i++)
6731         {
6732           if (buf + i == buf_end)
6733             return Qt;
6734           if (XINT (AREF (from, i)) != buf[i])
6735             break;
6736         }
6737       if (i == len)
6738         return val;
6739     }
6740   return Qnil;
6741 }
6742
6743
/* Write decoded text to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf: each non-negative element is a character, which
   is translated through TRANSLATION_TABLE, and a negative element
   -N starts N elements of annotation data, which are skipped here.
   Otherwise the CODING->consumed bytes at CODING->source are
   transferred, through ONE_MORE_BYTE or EMIT_ONE_BYTE when
   CODING->src_multibyte and CODING->dst_multibyte differ, and by a
   plain byte copy when they agree.

   The destination is enlarged with alloc_destination as needed.
   CODING->produced and CODING->produced_char are advanced, and if the
   destination is a buffer and at least one character was produced,
   the new text is registered with insert_from_gap.

   LAST_BLOCK nonzero means that no more source will follow, so a
   multi-character translation that would need characters beyond the
   end of the buffer is not waited for.

   Return the number of elements at the end of CODING->charbuf that
   were left unprocessed (always 0 when CODING->chars_at_source is
   nonzero).  */
static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  ptrdiff_t produced;
  ptrdiff_t produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: the output
             is limited to the part of the source already consumed.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf;
          ptrdiff_t i;

          if (c >= 0)
            {
              /* FROM_NCHARS source characters are replaced by
                 TO_NCHARS translated characters.  */
              ptrdiff_t from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* The buffer ends in the middle of a possible
                       match; leave the rest as carryover.  */
                    break;
                }

              if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
                {
                  /* Not enough room: check that the size to request
                     does not overflow, then enlarge the destination.  */
                  if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
                       / MAX_MULTIBYTE_LENGTH)
                      < to_nchars)
                    memory_full (SIZE_MAX);
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; the others
                     come from the vector TRANS.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* NOTE(review): MULTIBYTEP, CONSUMED_CHARS, SRC_BASE
                 and the label no_more_source are not referenced
                 directly here; presumably ONE_MORE_BYTE uses them
                 and jumps to the label when the source is
                 exhausted -- confirm against its definition.  */
              int multibytep = 1;
              ptrdiff_t consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* Remember the source position as an
                             offset so that SRC can be recomputed
                             after the reallocation.  */
                          ptrdiff_t offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->consumed;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            while (src < src_end)
              {
                /* NOTE(review): MULTIBYTEP is presumably read by
                   EMIT_ONE_BYTE -- confirm against its definition.  */
                int multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        ptrdiff_t offset = src - coding->source;
                        ptrdiff_t more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->consumed;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides: copy the bytes.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              ptrdiff_t require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  ptrdiff_t offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->consumed;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6942
6943 /* Compose text in CODING->object according to the annotation data at
6944    CHARBUF.  CHARBUF is an array:
6945      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6946  */
6947
6948 static inline void
6949 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6950 {
6951   int len;
6952   ptrdiff_t to;
6953   enum composition_method method;
6954   Lisp_Object components;
6955
6956   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6957   to = pos + charbuf[2];
6958   method = (enum composition_method) (charbuf[4]);
6959
6960   if (method == COMPOSITION_RELATIVE)
6961     components = Qnil;
6962   else
6963     {
6964       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6965       int i, j;
6966
6967       if (method == COMPOSITION_WITH_RULE)
6968         len = charbuf[2] * 3 - 2;
6969       charbuf += MAX_ANNOTATION_LENGTH;
6970       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6971       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6972         {
6973           if (charbuf[i] >= 0)
6974             args[j] = make_number (charbuf[i]);
6975           else
6976             {
6977               i++;
6978               args[j] = make_number (charbuf[i] % 0x100);
6979             }
6980         }
6981       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6982     }
6983   compose_text (pos, to, components, Qnil, coding->dst_object);
6984 }
6985
6986
6987 /* Put `charset' property on text in CODING->object according to
6988    the annotation data at CHARBUF.  CHARBUF is an array:
6989      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6990  */
6991
6992 static inline void
6993 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6994 {
6995   ptrdiff_t from = pos - charbuf[2];
6996   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6997
6998   Fput_text_property (make_number (from), make_number (pos),
6999                       Qcharset, CHARSET_NAME (charset),
7000                       coding->dst_object);
7001 }
7002
7003
/* Number of `int' elements first requested for the work buffer
   allocated by ALLOC_CONVERSION_WORK_AREA.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca, i.e. in the stack frame of
   the function that uses this macro, and store its capacity (in
   elements) in CODING->charbuf_size.

   The first request is for CHARBUF_SIZE elements; the size is halved
   as long as alloca yields a null pointer and the size is above
   1024.  If no buffer was obtained, CODING_RESULT_INSUFFICIENT_MEM
   is recorded and the macro executes `return CODING->result', so it
   can only be used in a function that returns such a value.

   NOTE(review): alloca usually does not report failure by returning
   a null pointer, so the fallback path is probably never taken --
   confirm.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * size);                 \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7025
7026
7027 static void
7028 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7029 {
7030   int *charbuf = coding->charbuf;
7031   int *charbuf_end = charbuf + coding->charbuf_used;
7032
7033   if (NILP (coding->dst_object))
7034     return;
7035
7036   while (charbuf < charbuf_end)
7037     {
7038       if (*charbuf >= 0)
7039         pos++, charbuf++;
7040       else
7041         {
7042           int len = -*charbuf;
7043
7044           if (len > 2)
7045             switch (charbuf[1])
7046               {
7047               case CODING_ANNOTATE_COMPOSITION_MASK:
7048                 produce_composition (coding, charbuf, pos);
7049                 break;
7050               case CODING_ANNOTATE_CHARSET_MASK:
7051                 produce_charset (coding, charbuf, pos);
7052                 break;
7053               }
7054           charbuf += len;
7055         }
7056     }
7057 }
7058
7059 /* Decode the data at CODING->src_object into CODING->dst_object.
7060    CODING->src_object is a buffer, a string, or nil.
7061    CODING->dst_object is a buffer.
7062
7063    If CODING->src_object is a buffer, it must be the current buffer.
7064    In this case, if CODING->src_pos is positive, it is a position of
7065    the source text in the buffer, otherwise, the source text is in the
7066    gap area of the buffer, and CODING->src_pos specifies the offset of
7067    the text from GPT (which must be the same as PT).  If this is the
7068    same buffer as CODING->dst_object, CODING->src_pos must be
7069    negative.
7070
7071    If CODING->src_object is a string, CODING->src_pos is an index to
7072    that string.
7073
7074    If CODING->src_object is nil, CODING->source must already point to
7075    the non-relocatable memory area.  In this case, CODING->src_pos is
7076    an offset from CODING->source.
7077
7078    The decoded data is inserted at the current point of the buffer
7079    CODING->dst_object.
7080 */
7081
7082 static int
7083 decode_coding (struct coding_system *coding)
7084 {
7085   Lisp_Object attrs;
7086   Lisp_Object undo_list;
7087   Lisp_Object translation_table;
7088   struct ccl_spec cclspec;
7089   int carryover;
7090   int i;
7091
7092   if (BUFFERP (coding->src_object)
7093       && coding->src_pos > 0
7094       && coding->src_pos < GPT
7095       && coding->src_pos + coding->src_chars > GPT)
7096     move_gap_both (coding->src_pos, coding->src_pos_byte);
7097
7098   undo_list = Qt;
7099   if (BUFFERP (coding->dst_object))
7100     {
7101       if (current_buffer != XBUFFER (coding->dst_object))
7102         set_buffer_internal (XBUFFER (coding->dst_object));
7103       if (GPT != PT)
7104         move_gap_both (PT, PT_BYTE);
7105
7106       /* We must disable undo_list in order to record the whole insert
7107          transaction via record_insert at the end.  But doing so also
7108          disables the recording of the first change to the undo_list.
7109          Therefore we check for first change here and record it via
7110          record_first_change if needed.  */
7111       if (MODIFF <= SAVE_MODIFF)
7112         record_first_change ();
7113
7114       undo_list = BVAR (current_buffer, undo_list);
7115       BSET (current_buffer, undo_list, Qt);
7116     }
7117
7118   coding->consumed = coding->consumed_char = 0;
7119   coding->produced = coding->produced_char = 0;
7120   coding->chars_at_source = 0;
7121   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7122   coding->errors = 0;
7123
7124   ALLOC_CONVERSION_WORK_AREA (coding);
7125
7126   attrs = CODING_ID_ATTRS (coding->id);
7127   translation_table = get_translation_table (attrs, 0, NULL);
7128
7129   carryover = 0;
7130   if (coding->decoder == decode_coding_ccl)
7131     {
7132       coding->spec.ccl = &cclspec;
7133       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7134     }
7135   do
7136     {
7137       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7138
7139       coding_set_source (coding);
7140       coding->annotated = 0;
7141       coding->charbuf_used = carryover;
7142       (*(coding->decoder)) (coding);
7143       coding_set_destination (coding);
7144       carryover = produce_chars (coding, translation_table, 0);
7145       if (coding->annotated)
7146         produce_annotation (coding, pos);
7147       for (i = 0; i < carryover; i++)
7148         coding->charbuf[i]
7149           = coding->charbuf[coding->charbuf_used - carryover + i];
7150     }
7151   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7152          || (coding->consumed < coding->src_bytes
7153              && (coding->result == CODING_RESULT_SUCCESS
7154                  || coding->result == CODING_RESULT_INVALID_SRC)));
7155
7156   if (carryover > 0)
7157     {
7158       coding_set_destination (coding);
7159       coding->charbuf_used = carryover;
7160       produce_chars (coding, translation_table, 1);
7161     }
7162
7163   coding->carryover_bytes = 0;
7164   if (coding->consumed < coding->src_bytes)
7165     {
7166       int nbytes = coding->src_bytes - coding->consumed;
7167       const unsigned char *src;
7168
7169       coding_set_source (coding);
7170       coding_set_destination (coding);
7171       src = coding->source + coding->consumed;
7172
7173       if (coding->mode & CODING_MODE_LAST_BLOCK)
7174         {
7175           /* Flush out unprocessed data as binary chars.  We are sure
7176              that the number of data is less than the size of
7177              coding->charbuf.  */
7178           coding->charbuf_used = 0;
7179           coding->chars_at_source = 0;
7180
7181           while (nbytes-- > 0)
7182             {
7183               int c = *src++;
7184
7185               if (c & 0x80)
7186                 c = BYTE8_TO_CHAR (c);
7187               coding->charbuf[coding->charbuf_used++] = c;
7188             }
7189           produce_chars (coding, Qnil, 1);
7190         }
7191       else
7192         {
7193           /* Record unprocessed bytes in coding->carryover.  We are
7194              sure that the number of data is less than the size of
7195              coding->carryover.  */
7196           unsigned char *p = coding->carryover;
7197
7198           if (nbytes > sizeof coding->carryover)
7199             nbytes = sizeof coding->carryover;
7200           coding->carryover_bytes = nbytes;
7201           while (nbytes-- > 0)
7202             *p++ = *src++;
7203         }
7204       coding->consumed = coding->src_bytes;
7205     }
7206
7207   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7208       && !inhibit_eol_conversion)
7209     decode_eol (coding);
7210   if (BUFFERP (coding->dst_object))
7211     {
7212       BSET (current_buffer, undo_list, undo_list);
7213       record_insert (coding->dst_pos, coding->produced_char);
7214     }
7215   return coding->result;
7216 }
7217
7218
7219 /* Extract an annotation datum from a composition starting at POS and
7220    ending before LIMIT of CODING->src_object (buffer or string), store
7221    the data in BUF, set *STOP to a starting position of the next
7222    composition (if any) or to LIMIT, and return the address of the
7223    next element of BUF.
7224
7225    If such an annotation is not found, set *STOP to a starting
7226    position of a composition after POS (if any) or to LIMIT, and
7227    return BUF.  */
7228
7229 static inline int *
7230 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7231                                struct coding_system *coding, int *buf,
7232                                ptrdiff_t *stop)
7233 {
7234   ptrdiff_t start, end;
7235   Lisp_Object prop;
7236
7237   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7238       || end > limit)
7239     *stop = limit;
7240   else if (start > pos)
7241     *stop = start;
7242   else
7243     {
7244       if (start == pos)
7245         {
7246           /* We found a composition.  Store the corresponding
7247              annotation data in BUF.  */
7248           int *head = buf;
7249           enum composition_method method = COMPOSITION_METHOD (prop);
7250           int nchars = COMPOSITION_LENGTH (prop);
7251
7252           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7253           if (method != COMPOSITION_RELATIVE)
7254             {
7255               Lisp_Object components;
7256               ptrdiff_t i, len, i_byte;
7257
7258               components = COMPOSITION_COMPONENTS (prop);
7259               if (VECTORP (components))
7260                 {
7261                   len = ASIZE (components);
7262                   for (i = 0; i < len; i++)
7263                     *buf++ = XINT (AREF (components, i));
7264                 }
7265               else if (STRINGP (components))
7266                 {
7267                   len = SCHARS (components);
7268                   i = i_byte = 0;
7269                   while (i < len)
7270                     {
7271                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7272                       buf++;
7273                     }
7274                 }
7275               else if (INTEGERP (components))
7276                 {
7277                   len = 1;
7278                   *buf++ = XINT (components);
7279                 }
7280               else if (CONSP (components))
7281                 {
7282                   for (len = 0; CONSP (components);
7283                        len++, components = XCDR (components))
7284                     *buf++ = XINT (XCAR (components));
7285                 }
7286               else
7287                 abort ();
7288               *head -= len;
7289             }
7290         }
7291
7292       if (find_composition (end, limit, &start, &end, &prop,
7293                             coding->src_object)
7294           && end <= limit)
7295         *stop = start;
7296       else
7297         *stop = limit;
7298     }
7299   return buf;
7300 }
7301
7302
7303 /* Extract an annotation datum from a text property `charset' at POS of
7304    CODING->src_object (buffer of string), store the data in BUF, set
7305    *STOP to the position where the value of `charset' property changes
7306    (limiting by LIMIT), and return the address of the next element of
7307    BUF.
7308
7309    If the property value is nil, set *STOP to the position where the
7310    property value is non-nil (limiting by LIMIT), and return BUF.  */
7311
7312 static inline int *
7313 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7314                            struct coding_system *coding, int *buf,
7315                            ptrdiff_t *stop)
7316 {
7317   Lisp_Object val, next;
7318   int id;
7319
7320   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7321   if (! NILP (val) && CHARSETP (val))
7322     id = XINT (CHARSET_SYMBOL_ID (val));
7323   else
7324     id = -1;
7325   ADD_CHARSET_DATA (buf, 0, id);
7326   next = Fnext_single_property_change (make_number (pos), Qcharset,
7327                                        coding->src_object,
7328                                        make_number (limit));
7329   *stop = XINT (next);
7330   return buf;
7331 }
7332
7333
/* Fill CODING->charbuf with characters read from CODING->source,
   starting CODING->consumed bytes into it and stopping at
   CODING->src_bytes or when the work area is nearly full.

   On the way, map CR to LF when CODING->mode has
   CODING_MODE_SELECTIVE_DISPLAY, apply the end-of-line conversion of
   CODING (unless inhibit_eol_conversion), translate characters
   through TRANSLATION_TABLE if it is non-nil, and insert charset
   annotation data where CODING->common_flags asks for it.

   MAX_LOOKUP is the number of characters, the current one included,
   that may be handed to get_translation for one lookup; it is only
   used when TRANSLATION_TABLE is non-nil.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read and stored, and
   CODING->chars_at_source is 0.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the next position at which an annotation handler must
     run; it is the smaller of the two per-kind stop positions.  */
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector eol-type is treated like Qunix: no EOL conversion.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      /* A kind of annotation that is requested must be looked for
         right at POS; one that is not requested never stops us
         before END_POS.  */
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character into C, advancing SRC and POS.  For
         a unibyte source POS advances by the number of bytes read.  */
      if (! multibytep)
        {
          int bytes;

          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For Qdos emit CR in front of the LF; for any other
                 eol-type that reaches here replace the LF by CR.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             characters; both stay 1 for a plain integer result.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters,
             without consuming them, as the lookup key.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Leave the loop if the replacement does not fit
                     in what remains of the work area.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the rest of the source characters that the
             translation replaced.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7468
7469
7470 /* Encode the text at CODING->src_object into CODING->dst_object.
7471    CODING->src_object is a buffer or a string.
7472    CODING->dst_object is a buffer or nil.
7473
7474    If CODING->src_object is a buffer, it must be the current buffer.
7475    In this case, if CODING->src_pos is positive, it is a position of
7476    the source text in the buffer, otherwise. the source text is in the
7477    gap area of the buffer, and coding->src_pos specifies the offset of
7478    the text from GPT (which must be the same as PT).  If this is the
7479    same buffer as CODING->dst_object, CODING->src_pos must be
7480    negative and CODING should not have `pre-write-conversion'.
7481
7482    If CODING->src_object is a string, CODING should not have
7483    `pre-write-conversion'.
7484
7485    If CODING->dst_object is a buffer, the encoded data is inserted at
7486    the current point of that buffer.
7487
7488    If CODING->dst_object is nil, the encoded data is placed at the
7489    memory area specified by CODING->destination.  */
7490
7491 static int
7492 encode_coding (struct coding_system *coding)
7493 {
7494   Lisp_Object attrs;
7495   Lisp_Object translation_table;
7496   int max_lookup;
7497   struct ccl_spec cclspec;
7498
7499   attrs = CODING_ID_ATTRS (coding->id);
7500   if (coding->encoder == encode_coding_raw_text)
7501     translation_table = Qnil, max_lookup = 0;
7502   else
7503     translation_table = get_translation_table (attrs, 1, &max_lookup);
7504
7505   if (BUFFERP (coding->dst_object))
7506     {
7507       set_buffer_internal (XBUFFER (coding->dst_object));
7508       coding->dst_multibyte
7509         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7510     }
7511
7512   coding->consumed = coding->consumed_char = 0;
7513   coding->produced = coding->produced_char = 0;
7514   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7515   coding->errors = 0;
7516
7517   ALLOC_CONVERSION_WORK_AREA (coding);
7518
7519   if (coding->encoder == encode_coding_ccl)
7520     {
7521       coding->spec.ccl = &cclspec;
7522       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7523     }
7524   do {
7525     coding_set_source (coding);
7526     consume_chars (coding, translation_table, max_lookup);
7527     coding_set_destination (coding);
7528     (*(coding->encoder)) (coding);
7529   } while (coding->consumed_char < coding->src_chars);
7530
7531   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7532     insert_from_gap (coding->produced_char, coding->produced);
7533
7534   return (coding->result);
7535 }
7536
7537
/* Name (or base name) of work buffer for code conversion.
   make_conversion_work_buffer passes it to Fget_buffer_create for
   the reused buffer and to Fgenerate_new_buffer_name for the other
   work buffers.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer post-increments it on every call, so
   it may be greater than 1; code_conversion_restore resets it to 0
   when it releases the reused buffer.  */
static int reused_workbuf_in_use;
7550
7551
7552 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7553    multibyteness of returning buffer.  */
7554
7555 static Lisp_Object
7556 make_conversion_work_buffer (int multibyte)
7557 {
7558   Lisp_Object name, workbuf;
7559   struct buffer *current;
7560
7561   if (reused_workbuf_in_use++)
7562     {
7563       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7564       workbuf = Fget_buffer_create (name);
7565     }
7566   else
7567     {
7568       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7569         Vcode_conversion_reused_workbuf
7570           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7571       workbuf = Vcode_conversion_reused_workbuf;
7572     }
7573   current = current_buffer;
7574   set_buffer_internal (XBUFFER (workbuf));
7575   /* We can't allow modification hooks to run in the work buffer.  For
7576      instance, directory_files_internal assumes that file decoding
7577      doesn't compile new regexps.  */
7578   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7579   Ferase_buffer ();
7580   BSET (current_buffer, undo_list, Qt);
7581   BSET (current_buffer, enable_multibyte_characters, multibyte ? Qt : Qnil);
7582   set_buffer_internal (current);
7583   return workbuf;
7584 }
7585
7586
7587 static Lisp_Object
7588 code_conversion_restore (Lisp_Object arg)
7589 {
7590   Lisp_Object current, workbuf;
7591   struct gcpro gcpro1;
7592
7593   GCPRO1 (arg);
7594   current = XCAR (arg);
7595   workbuf = XCDR (arg);
7596   if (! NILP (workbuf))
7597     {
7598       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7599         reused_workbuf_in_use = 0;
7600       else
7601         Fkill_buffer (workbuf);
7602     }
7603   set_buffer_internal (XBUFFER (current));
7604   UNGCPRO;
7605   return Qnil;
7606 }
7607
7608 Lisp_Object
7609 code_conversion_save (int with_work_buf, int multibyte)
7610 {
7611   Lisp_Object workbuf = Qnil;
7612
7613   if (with_work_buf)
7614     workbuf = make_conversion_work_buffer (multibyte);
7615   record_unwind_protect (code_conversion_restore,
7616                          Fcons (Fcurrent_buffer (), workbuf));
7617   return workbuf;
7618 }
7619
7620 int
7621 decode_coding_gap (struct coding_system *coding,
7622                    ptrdiff_t chars, ptrdiff_t bytes)
7623 {
7624   ptrdiff_t count = SPECPDL_INDEX ();
7625   Lisp_Object attrs;
7626
7627   code_conversion_save (0, 0);
7628
7629   coding->src_object = Fcurrent_buffer ();
7630   coding->src_chars = chars;
7631   coding->src_bytes = bytes;
7632   coding->src_pos = -chars;
7633   coding->src_pos_byte = -bytes;
7634   coding->src_multibyte = chars < bytes;
7635   coding->dst_object = coding->src_object;
7636   coding->dst_pos = PT;
7637   coding->dst_pos_byte = PT_BYTE;
7638   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7639
7640   if (CODING_REQUIRE_DETECTION (coding))
7641     detect_coding (coding);
7642
7643   coding->mode |= CODING_MODE_LAST_BLOCK;
7644   current_buffer->text->inhibit_shrinking = 1;
7645   decode_coding (coding);
7646   current_buffer->text->inhibit_shrinking = 0;
7647
7648   attrs = CODING_ID_ATTRS (coding->id);
7649   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7650     {
7651       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7652       Lisp_Object val;
7653
7654       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7655       val = call1 (CODING_ATTR_POST_READ (attrs),
7656                    make_number (coding->produced_char));
7657       CHECK_NATNUM (val);
7658       coding->produced_char += Z - prev_Z;
7659       coding->produced += Z_BYTE - prev_Z_BYTE;
7660     }
7661
7662   unbind_to (count, Qnil);
7663   return coding->result;
7664 }
7665
7666
7667 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7668    SRC_OBJECT into DST_OBJECT by coding context CODING.
7669
7670    SRC_OBJECT is a buffer, a string, or Qnil.
7671
7672    If it is a buffer, the text is at point of the buffer.  FROM and TO
7673    are positions in the buffer.
7674
7675    If it is a string, the text is at the beginning of the string.
7676    FROM and TO are indices to the string.
7677
7678    If it is nil, the text is at coding->source.  FROM and TO are
7679    indices to coding->source.
7680
7681    DST_OBJECT is a buffer, Qt, or Qnil.
7682
7683    If it is a buffer, the decoded text is inserted at point of the
7684    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7685    is deleted.
7686
7687    If it is Qt, a string is made from the decoded text, and
7688    set in CODING->dst_object.
7689
7690    If it is Qnil, the decoded text is stored at CODING->destination.
7691    The caller must allocate CODING->dst_bytes bytes at
7692    CODING->destination by xmalloc.  If the decoded text is longer than
7693    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7694  */
7695
7696 void
7697 decode_coding_object (struct coding_system *coding,
7698                       Lisp_Object src_object,
7699                       ptrdiff_t from, ptrdiff_t from_byte,
7700                       ptrdiff_t to, ptrdiff_t to_byte,
7701                       Lisp_Object dst_object)
7702 {
7703   ptrdiff_t count = SPECPDL_INDEX ();
7704   unsigned char *destination IF_LINT (= NULL);
7705   ptrdiff_t dst_bytes IF_LINT (= 0);
7706   ptrdiff_t chars = to - from;
7707   ptrdiff_t bytes = to_byte - from_byte;
7708   Lisp_Object attrs;
7709   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7710   int need_marker_adjustment = 0;
7711   Lisp_Object old_deactivate_mark;
7712
7713   old_deactivate_mark = Vdeactivate_mark;
7714
7715   if (NILP (dst_object))
7716     {
7717       destination = coding->destination;
7718       dst_bytes = coding->dst_bytes;
7719     }
7720
7721   coding->src_object = src_object;
7722   coding->src_chars = chars;
7723   coding->src_bytes = bytes;
7724   coding->src_multibyte = chars < bytes;
7725
7726   if (STRINGP (src_object))
7727     {
7728       coding->src_pos = from;
7729       coding->src_pos_byte = from_byte;
7730     }
7731   else if (BUFFERP (src_object))
7732     {
7733       set_buffer_internal (XBUFFER (src_object));
7734       if (from != GPT)
7735         move_gap_both (from, from_byte);
7736       if (EQ (src_object, dst_object))
7737         {
7738           struct Lisp_Marker *tail;
7739
7740           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7741             {
7742               tail->need_adjustment
7743                 = tail->charpos == (tail->insertion_type ? from : to);
7744               need_marker_adjustment |= tail->need_adjustment;
7745             }
7746           saved_pt = PT, saved_pt_byte = PT_BYTE;
7747           TEMP_SET_PT_BOTH (from, from_byte);
7748           current_buffer->text->inhibit_shrinking = 1;
7749           del_range_both (from, from_byte, to, to_byte, 1);
7750           coding->src_pos = -chars;
7751           coding->src_pos_byte = -bytes;
7752         }
7753       else
7754         {
7755           coding->src_pos = from;
7756           coding->src_pos_byte = from_byte;
7757         }
7758     }
7759
7760   if (CODING_REQUIRE_DETECTION (coding))
7761     detect_coding (coding);
7762   attrs = CODING_ID_ATTRS (coding->id);
7763
7764   if (EQ (dst_object, Qt)
7765       || (! NILP (CODING_ATTR_POST_READ (attrs))
7766           && NILP (dst_object)))
7767     {
7768       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7769       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7770       coding->dst_pos = BEG;
7771       coding->dst_pos_byte = BEG_BYTE;
7772     }
7773   else if (BUFFERP (dst_object))
7774     {
7775       code_conversion_save (0, 0);
7776       coding->dst_object = dst_object;
7777       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7778       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7779       coding->dst_multibyte
7780         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7781     }
7782   else
7783     {
7784       code_conversion_save (0, 0);
7785       coding->dst_object = Qnil;
7786       /* Most callers presume this will return a multibyte result, and they
7787          won't use `binary' or `raw-text' anyway, so let's not worry about
7788          CODING_FOR_UNIBYTE.  */
7789       coding->dst_multibyte = 1;
7790     }
7791
7792   decode_coding (coding);
7793
7794   if (BUFFERP (coding->dst_object))
7795     set_buffer_internal (XBUFFER (coding->dst_object));
7796
7797   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7798     {
7799       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7800       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7801       Lisp_Object val;
7802
7803       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7804       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7805               old_deactivate_mark);
7806       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7807                         make_number (coding->produced_char));
7808       UNGCPRO;
7809       CHECK_NATNUM (val);
7810       coding->produced_char += Z - prev_Z;
7811       coding->produced += Z_BYTE - prev_Z_BYTE;
7812     }
7813
7814   if (EQ (dst_object, Qt))
7815     {
7816       coding->dst_object = Fbuffer_string ();
7817     }
7818   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7819     {
7820       set_buffer_internal (XBUFFER (coding->dst_object));
7821       if (dst_bytes < coding->produced)
7822         {
7823           destination = xrealloc (destination, coding->produced);
7824           if (! destination)
7825             {
7826               record_conversion_result (coding,
7827                                         CODING_RESULT_INSUFFICIENT_MEM);
7828               unbind_to (count, Qnil);
7829               return;
7830             }
7831           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7832             move_gap_both (BEGV, BEGV_BYTE);
7833           memcpy (destination, BEGV_ADDR, coding->produced);
7834           coding->destination = destination;
7835         }
7836     }
7837
7838   if (saved_pt >= 0)
7839     {
7840       /* This is the case of:
7841          (BUFFERP (src_object) && EQ (src_object, dst_object))
7842          As we have moved PT while replacing the original buffer
7843          contents, we must recover it now.  */
7844       set_buffer_internal (XBUFFER (src_object));
7845       current_buffer->text->inhibit_shrinking = 0;
7846       if (saved_pt < from)
7847         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7848       else if (saved_pt < from + chars)
7849         TEMP_SET_PT_BOTH (from, from_byte);
7850       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7851         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7852                           saved_pt_byte + (coding->produced - bytes));
7853       else
7854         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7855                           saved_pt_byte + (coding->produced - bytes));
7856
7857       if (need_marker_adjustment)
7858         {
7859           struct Lisp_Marker *tail;
7860
7861           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7862             if (tail->need_adjustment)
7863               {
7864                 tail->need_adjustment = 0;
7865                 if (tail->insertion_type)
7866                   {
7867                     tail->bytepos = from_byte;
7868                     tail->charpos = from;
7869                   }
7870                 else
7871                   {
7872                     tail->bytepos = from_byte + coding->produced;
7873                     tail->charpos
7874                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7875                          ? tail->bytepos : from + coding->produced_char);
7876                   }
7877               }
7878         }
7879     }
7880
7881   Vdeactivate_mark = old_deactivate_mark;
7882   unbind_to (count, coding->dst_object);
7883 }
7884
7885
7886 /* Encode the text of SRC_OBJECT from FROM to TO (FROM_BYTE and
        TO_BYTE are used as the matching byte positions) with CODING.

        SRC_OBJECT is treated as a string, as a buffer, or as neither; in
        the last case the pre-write path below reads the text from
        CODING->source.

        If the attributes of CODING name a pre-write function, the text
        is first copied into a work buffer obtained from
        code_conversion_save, the function is called there with BEG and
        Z, and whichever buffer is current afterwards becomes the source.

        DST_OBJECT selects the destination:
          - a buffer: it becomes CODING->dst_object, and the destination
            position is FROM when it is also the source, else that
            buffer's point;
          - Qt: a scratch area of max (1, src_chars) bytes is allocated
            here and, after encoding, CODING->dst_object holds the result
            as a string;
          - anything else: CODING->dst_object is Qnil and no destination
            is prepared here.

        When source and destination are the same buffer the original
        text is deleted before encoding; afterwards point, and the
        markers that sat exactly at FROM (insertion-type markers) or at
        TO (the others), are repositioned from CODING->produced and
        CODING->produced_char.  Vdeactivate_mark is preserved across the
        call.  */
     void
7887 encode_coding_object (struct coding_system *coding,
7888                       Lisp_Object src_object,
7889                       ptrdiff_t from, ptrdiff_t from_byte,
7890                       ptrdiff_t to, ptrdiff_t to_byte,
7891                       Lisp_Object dst_object)
7892 {
7893   ptrdiff_t count = SPECPDL_INDEX ();
7894   ptrdiff_t chars = to - from;
7895   ptrdiff_t bytes = to_byte - from_byte;
7896   Lisp_Object attrs;
       /* These hold PT and PT_BYTE and are later combined with ptrdiff_t
          offsets, so they must be as wide as a buffer position; a plain
          int would truncate point in very large buffers.  saved_pt stays
          negative unless point was actually saved.  */
7897   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7898   int need_marker_adjustment = 0;
7899   int kill_src_buffer = 0;
7900   Lisp_Object old_deactivate_mark;
7901
7902   old_deactivate_mark = Vdeactivate_mark;
7903
7904   coding->src_object = src_object;
7905   coding->src_chars = chars;
7906   coding->src_bytes = bytes;
       /* More bytes than characters means some character takes several
          bytes.  */
7907   coding->src_multibyte = chars < bytes;
7908
7909   attrs = CODING_ID_ATTRS (coding->id);
7910
7911   if (EQ (src_object, dst_object))
7912     {
7913       struct Lisp_Marker *tail;
7914
           /* Flag the markers sitting exactly on the region boundary
              that matters for their insertion type; they are put back
              explicitly once the size of the encoded text is known.  */
7915       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7916         {
7917           tail->need_adjustment
7918             = tail->charpos == (tail->insertion_type ? from : to);
7919           need_marker_adjustment |= tail->need_adjustment;
7920         }
7921     }
7922
7923   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7924     {
           /* Copy the source text into a work buffer and let the
              pre-write function transform it there.  */
7925       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7926       set_buffer_internal (XBUFFER (coding->src_object));
7927       if (STRINGP (src_object))
7928         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7929       else if (BUFFERP (src_object))
7930         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7931       else
7932         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7933
7934       if (EQ (src_object, dst_object))
7935         {
               /* In-place encoding: remember point, then remove the
                  original text from the source buffer.  */
7936           set_buffer_internal (XBUFFER (src_object));
7937           saved_pt = PT, saved_pt_byte = PT_BYTE;
7938           del_range_both (from, from_byte, to, to_byte, 1);
7939           set_buffer_internal (XBUFFER (coding->src_object));
7940         }
7941
7942       {
7943         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7944
7945         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7946                 old_deactivate_mark);
7947         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7948                     make_number (BEG), make_number (Z));
7949         UNGCPRO;
7950       }
           /* If the current buffer is no longer the work buffer, encode
              from the buffer that is current now and kill it at the
              end.  */
7951       if (XBUFFER (coding->src_object) != current_buffer)
7952         kill_src_buffer = 1;
7953       coding->src_object = Fcurrent_buffer ();
7954       if (BEG != GPT)
7955         move_gap_both (BEG, BEG_BYTE);
7956       coding->src_chars = Z - BEG;
7957       coding->src_bytes = Z_BYTE - BEG_BYTE;
7958       coding->src_pos = BEG;
7959       coding->src_pos_byte = BEG_BYTE;
7960       coding->src_multibyte = Z < Z_BYTE;
7961     }
7962   else if (STRINGP (src_object))
7963     {
7964       code_conversion_save (0, 0);
7965       coding->src_pos = from;
7966       coding->src_pos_byte = from_byte;
7967     }
7968   else if (BUFFERP (src_object))
7969     {
7970       code_conversion_save (0, 0);
7971       set_buffer_internal (XBUFFER (src_object));
7972       if (EQ (src_object, dst_object))
7973         {
               /* In-place encoding without a pre-write function: the
                  value returned by del_range_1 for the deleted text
                  becomes the source, read from position 0.  */
7974           saved_pt = PT, saved_pt_byte = PT_BYTE;
7975           coding->src_object = del_range_1 (from, to, 1, 1);
7976           coding->src_pos = 0;
7977           coding->src_pos_byte = 0;
7978         }
7979       else
7980         {
7981           if (from < GPT && to >= GPT)
7982             move_gap_both (from, from_byte);
7983           coding->src_pos = from;
7984           coding->src_pos_byte = from_byte;
7985         }
7986     }
7987   else
7988     code_conversion_save (0, 0);
7989
7990   if (BUFFERP (dst_object))
7991     {
7992       coding->dst_object = dst_object;
7993       if (EQ (src_object, dst_object))
7994         {
7995           coding->dst_pos = from;
7996           coding->dst_pos_byte = from_byte;
7997         }
7998       else
7999         {
8000           struct buffer *current = current_buffer;
8001
               /* Use the destination buffer's point as the destination
                  position and bring its gap there.  */
8002           set_buffer_temp (XBUFFER (dst_object));
8003           coding->dst_pos = PT;
8004           coding->dst_pos_byte = PT_BYTE;
8005           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8006           set_buffer_temp (current);
8007         }
8008       coding->dst_multibyte
8009         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8010     }
8011   else if (EQ (dst_object, Qt))
8012     {
           /* Initial scratch area: at least one byte, otherwise one byte
              per source character.  */
8013       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8014       coding->dst_object = Qnil;
8015       coding->destination = xmalloc (dst_bytes);
8016       coding->dst_bytes = dst_bytes;
8017       coding->dst_multibyte = 0;
8018     }
8019   else
8020     {
8021       coding->dst_object = Qnil;
8022       coding->dst_multibyte = 0;
8023     }
8024
8025   encode_coding (coding);
8026
8027   if (EQ (dst_object, Qt))
8028     {
           /* NOTE(review): dst_object was set to Qnil above, so it can
              only be a buffer here if encode_coding changed it --
              confirm against encode_coding.  */
8029       if (BUFFERP (coding->dst_object))
8030         coding->dst_object = Fbuffer_string ();
8031       else
8032         {
8033           coding->dst_object
8034             = make_unibyte_string ((char *) coding->destination,
8035                                    coding->produced);
8036           xfree (coding->destination);
8037         }
8038     }
8039
8040   if (saved_pt >= 0)
8041     {
8042       /* This is the case of:
8043          (BUFFERP (src_object) && EQ (src_object, dst_object))
8044          As we have moved PT while replacing the original buffer
8045          contents, we must recover it now.  */
8046       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unaffected; point inside it goes
              to FROM; point after it is shifted by the change in size
              (counted in bytes only when the buffer is unibyte).  */
8047       if (saved_pt < from)
8048         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8049       else if (saved_pt < from + chars)
8050         TEMP_SET_PT_BOTH (from, from_byte);
8051       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8052         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8053                           saved_pt_byte + (coding->produced - bytes));
8054       else
8055         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8056                           saved_pt_byte + (coding->produced - bytes));
8057
8058       if (need_marker_adjustment)
8059         {
8060           struct Lisp_Marker *tail;
8061
               /* Put the flagged markers back: insertion-type markers at
                  the start of the encoded text, the others at its
                  end.  */
8062           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8063             if (tail->need_adjustment)
8064               {
8065                 tail->need_adjustment = 0;
8066                 if (tail->insertion_type)
8067                   {
8068                     tail->bytepos = from_byte;
8069                     tail->charpos = from;
8070                   }
8071                 else
8072                   {
8073                     tail->bytepos = from_byte + coding->produced;
8074                     tail->charpos
8075                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8076                          ? tail->bytepos : from + coding->produced_char);
8077                   }
8078               }
8079         }
8080     }
8081
8082   if (kill_src_buffer)
8083     Fkill_buffer (coding->src_object);
8084
8085   Vdeactivate_mark = old_deactivate_mark;
8086   unbind_to (count, Qnil);
8087 }
8088
8089
8090 /* Return the name of the coding system recorded for the category
        that comes first in coding_priorities.  */
     Lisp_Object
8091 preferred_coding_system (void)
8092 {
8093   int top_category = coding_priorities[0];
8094   int top_id = coding_categories[top_category].id;
8095   return CODING_ID_NAME (top_id);
8096 }
8097
8098 \f
8099 #ifdef emacs
8100 /*** 11. Emacs Lisp library functions ***/
8101
8102 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8103        doc: /* Return t if OBJECT is nil or a coding-system.
8104 See the documentation of `define-coding-system' for information
8105 about coding-system objects.  */)
8106   (Lisp_Object object)
8107 {
       /* nil, and anything that already has a non-negative coding
          system id, is accepted outright.  */
8108   if (NILP (object))
8109     return Qt;
8110   if (CODING_SYSTEM_ID (object) >= 0)
8111     return Qt;
       /* Otherwise only a symbol whose Qcoding_system_define_form
          property is non-nil is accepted.  */
8112   return ((SYMBOLP (object)
8113            && ! NILP (Fget (object, Qcoding_system_define_form)))
8114           ? Qt : Qnil);
8115 }
8116
8117 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8118        Sread_non_nil_coding_system, 1, 1, 0,
8119        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8120   (Lisp_Object prompt)
8121 {
8122   Lisp_Object answer;
       /* Ask again for as long as the reply is the empty string.  */
8123   for (;;)
8124     {
8125       answer = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
8126                                  Qnil, Qcoding_system_history, Qnil, Qnil);
8127       if (SCHARS (answer) != 0)
8128         return Fintern (answer, Qnil);
8129     }
8130 }
8131
8132 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8133        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8134 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8135 Ignores case when completing coding systems (all Emacs coding systems
8136 are lower-case).  */)
8137   (Lisp_Object prompt, Lisp_Object default_coding_system)
8138 {
8139   ptrdiff_t specpdl_mark = SPECPDL_INDEX ();
8140   Lisp_Object fallback = default_coding_system;
8141   Lisp_Object answer;
8142
       /* A symbol default is handed to the reader as its name.  */
8143   if (SYMBOLP (fallback))
8144     fallback = SYMBOL_NAME (fallback);
       /* Qcompletion_ignore_case is bound to t only while reading.  */
8145   specbind (Qcompletion_ignore_case, Qt);
8146   answer = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8147                              Qcoding_system_history, fallback, Qnil);
8148   unbind_to (specpdl_mark, Qnil);
8149   return SCHARS (answer) != 0 ? Fintern (answer, Qnil) : Qnil;
8150 }
8151
8152 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8153        1, 1, 0,
8154        doc: /* Check validity of CODING-SYSTEM.
8155 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8156 It is valid if it is nil or a symbol defined as a coding system by the
8157 function `define-coding-system'.  */)
8158   (Lisp_Object coding_system)
8159 {
8160   Lisp_Object pending_form
8161     = Fget (coding_system, Qcoding_system_define_form);
8162
8163   if (! NILP (pending_form))
8164     {
           /* Clear the property first, then evaluate the stored form.  */
8165       Fput (coding_system, Qcoding_system_define_form, Qnil);
8166       safe_eval (pending_form);
8167     }
8168   if (NILP (Fcoding_system_p (coding_system)))
8169     xsignal1 (Qcoding_system_error, coding_system);
8170   return coding_system;
8171 }
8172
8173 \f
8174 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8175    HIGHEST is nonzero, return the coding system of the highest
8176    priority among the detected coding systems.  Otherwise return a
8177    list of detected coding systems sorted by their priorities.  If
8178    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8179    multibyte form but contain only ASCII and eight-bit chars.
8180    Otherwise, the bytes are raw bytes.
8181
8182    CODING-SYSTEM controls the detection as below:
8183
8184    If it is nil, detect both text-format and eol-format.  If the
8185    text-format part of CODING-SYSTEM is already specified
8186    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8187    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8188    detect only text-format.  */
8189
8190 Lisp_Object
8191 detect_coding_system (const unsigned char *src,
8192                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8193                       int highest, int multibytep,
8194                       Lisp_Object coding_system)
8195 {
8196   const unsigned char *src_end = src + src_bytes;
8197   Lisp_Object attrs, eol_type;
       /* VAL first collects coding system ids (as Lisp integers); the
          eol pass at the end replaces each id by a coding system name.  */
8198   Lisp_Object val = Qnil;
8199   struct coding_system coding;
8200   ptrdiff_t id;
8201   struct coding_detection_info detect_info;
8202   enum coding_category base_category;
8203   int null_byte_found = 0, eight_bit_found = 0;
8204
8205   if (NILP (coding_system))
8206     coding_system = Qundecided;
8207   setup_coding_system (coding_system, &coding);
8208   attrs = CODING_ID_ATTRS (coding.id);
8209   eol_type = CODING_ID_EOL_TYPE (coding.id);
       /* NOTE(review): coding_system is not read again below, so this
          assignment looks like a dead store.  */
8210   coding_system = CODING_ATTR_BASE_NAME (attrs);
8211
8212   coding.source = src;
8213   coding.src_chars = src_chars;
8214   coding.src_bytes = src_bytes;
8215   coding.src_multibyte = multibytep;
8216   coding.consumed = 0;
8217   coding.mode |= CODING_MODE_LAST_BLOCK;
8218   coding.head_ascii = 0;
8219
       /* checked, found and rejected are bit sets indexed by coding
          category (bit 1 << category).  */
8220   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8221
8222   /* At first, detect text-format if necessary.  */
8223   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8224   if (base_category == coding_category_undecided)
8225     {
           /* category and this are left at the values the priority loops
              below stopped on and are read again in the `highest' case.  */
8226       enum coding_category category IF_LINT (= 0);
8227       struct coding_system *this IF_LINT (= NULL);
8228       int c, i;
8229
8230       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii counts the 7-bit bytes seen before the
              first 8-bit byte.  The scan stops early once both a null
              byte and an 8-bit byte have been seen, or when the ISO-2022
              detector reports a match.  */
8231       for (; src < src_end; src++)
8232         {
8233           c = *src;
8234           if (c & 0x80)
8235             {
8236               eight_bit_found = 1;
8237               if (null_byte_found)
8238                 break;
8239             }
8240           else if (c < 0x20)
8241             {
                   /* The ISO-2022 detector is run at most once: only
                      while no category has been marked as checked.  */
8242               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8243                   && ! inhibit_iso_escape_detection
8244                   && ! detect_info.checked)
8245                 {
8246                   if (detect_coding_iso_2022 (&coding, &detect_info))
8247                     {
8248                       /* We have scanned the whole data.  */
8249                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8250                         {
8251                           /* We didn't find an 8-bit code.  We may
8252                              have found a null-byte, but it's very
8253                              rare that a binary file conforms to
8254                              ISO-2022.  */
8255                           src = src_end;
8256                           coding.head_ascii = src - coding.source;
8257                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
8258                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8259                       break;
8260                     }
8261                 }
8262               else if (! c && !inhibit_null_byte_detection)
8263                 {
8264                   null_byte_found = 1;
8265                   if (eight_bit_found)
8266                     break;
8267                 }
8268               if (! eight_bit_found)
8269                 coding.head_ascii++;
8270             }
8271           else if (! eight_bit_found)
8272             coding.head_ascii++;
8273         }
8274
           /* Run the per-category detectors unless the data was plain
              7-bit text with nothing found: no null byte, no 8-bit byte,
              every byte counted in head_ascii, and no category found.  */
8275       if (null_byte_found || eight_bit_found
8276           || coding.head_ascii < coding.src_bytes
8277           || detect_info.found)
8278         {
8279           if (coding.head_ascii == coding.src_bytes)
8280             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8281             for (i = 0; i < coding_category_raw_text; i++)
8282               {
8283                 category = coding_priorities[i];
8284                 this = coding_categories + category;
8285                 if (detect_info.found & (1 << category))
8286                   break;
8287               }
8288           else
8289             {
8290               if (null_byte_found)
8291                 {
                       /* With a null byte, every category outside
                          CATEGORY_MASK_UTF_16 is treated as already
                          checked and rejected.  */
8292                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8293                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8294                 }
                   /* Try the categories in priority order.  When HIGHEST
                      is nonzero the loop stops at the first category
                      that is found.  */
8295               for (i = 0; i < coding_category_raw_text; i++)
8296                 {
8297                   category = coding_priorities[i];
8298                   this = coding_categories + category;
8299
8300                   if (this->id < 0)
8301                     {
8302                       /* No coding system of this category is defined.  */
8303                       detect_info.rejected |= (1 << category);
8304                     }
8305                   else if (category >= coding_category_raw_text)
8306                     continue;
8307                   else if (detect_info.checked & (1 << category))
8308                     {
8309                       if (highest
8310                           && (detect_info.found & (1 << category)))
8311                         break;
8312                     }
8313                   else if ((*(this->detector)) (&coding, &detect_info)
8314                            && highest
8315                            && (detect_info.found & (1 << category)))
8316                     {
                           /* Narrow utf-16-auto to the LE or BE
                              category.  Only category changes here;
                              this still points at the auto entry.  */
8317                       if (category == coding_category_utf_16_auto)
8318                         {
8319                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8320                             category = coding_category_utf_16_le;
8321                           else
8322                             category = coding_category_utf_16_be;
8323                         }
8324                       break;
8325                     }
8326                 }
8327             }
8328         }
8329
           /* Turn the detection masks into VAL.  */
8330       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8331           || null_byte_found)
8332         {
               /* Everything was rejected, or a null byte was seen:
                  answer with the id of Qno_conversion.  */
8333           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8334           id = CODING_SYSTEM_ID (Qno_conversion);
8335           val = Fcons (make_number (id), Qnil);
8336         }
8337       else if (! detect_info.rejected && ! detect_info.found)
8338         {
               /* Nothing found and nothing rejected: answer with the
                  coding system of the undecided category.  */
8339           detect_info.found = CATEGORY_MASK_ANY;
8340           id = coding_categories[coding_category_undecided].id;
8341           val = Fcons (make_number (id), Qnil);
8342         }
8343       else if (highest)
8344         {
8345           if (detect_info.found)
8346             {
8347               detect_info.found = 1 << category;
8348               val = Fcons (make_number (this->id), Qnil);
8349             }
8350           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8351             for (i = 0; i < coding_category_raw_text; i++)
8352               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8353                 {
8354                   detect_info.found = 1 << coding_priorities[i];
8355                   id = coding_categories[coding_priorities[i]].id;
8356                   val = Fcons (make_number (id), Qnil);
8357                   break;
8358                 }
8359         }
8360       else
8361         {
8362           int mask = detect_info.rejected | detect_info.found;
8363           int found = 0;
8364
               /* Both loops walk from lowest to highest priority and
                  cons onto the front, so each group ends up in priority
                  order.  The first collects the categories that were
                  neither rejected nor found; the second then puts the
                  found categories ahead of them.  */
8365           for (i = coding_category_raw_text - 1; i >= 0; i--)
8366             {
8367               category = coding_priorities[i];
8368               if (! (mask & (1 << category)))
8369                 {
8370                   found |= 1 << category;
8371                   id = coding_categories[category].id;
8372                   if (id >= 0)
8373                     val = Fcons (make_number (id), val);
8374                 }
8375             }
8376           for (i = coding_category_raw_text - 1; i >= 0; i--)
8377             {
8378               category = coding_priorities[i];
8379               if (detect_info.found & (1 << category))
8380                 {
8381                   id = coding_categories[category].id;
8382                   val = Fcons (make_number (id), val);
8383                 }
8384             }
8385           detect_info.found |= found;
8386         }
8387     }
8388   else if (base_category == coding_category_utf_8_auto)
8389     {
           /* Only decide between the with-signature and the
              without-signature UTF-8 categories.  VAL stays nil if the
              detector fails.  */
8390       if (detect_coding_utf_8 (&coding, &detect_info))
8391         {
8392           struct coding_system *this;
8393
8394           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8395             this = coding_categories + coding_category_utf_8_sig;
8396           else
8397             this = coding_categories + coding_category_utf_8_nosig;
8398           val = Fcons (make_number (this->id), Qnil);
8399         }
8400     }
8401   else if (base_category == coding_category_utf_16_auto)
8402     {
           /* Only decide among the four UTF-16 categories.  VAL stays
              nil if the detector fails.  */
8403       if (detect_coding_utf_16 (&coding, &detect_info))
8404         {
8405           struct coding_system *this;
8406
8407           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8408             this = coding_categories + coding_category_utf_16_le;
8409           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8410             this = coding_categories + coding_category_utf_16_be;
8411           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8412             this = coding_categories + coding_category_utf_16_be_nosig;
8413           else
8414             this = coding_categories + coding_category_utf_16_le_nosig;
8415           val = Fcons (make_number (this->id), Qnil);
8416         }
8417     }
8418   else
8419     {
           /* The text-format is already specified: keep it as is.  */
8420       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8421       val = Fcons (make_number (coding.id), Qnil);
8422     }
8423
8424   /* Then, detect eol-format if necessary.  */
8425   {
         /* -1 means "not determined" for that family of categories.  */
8426     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8427     Lisp_Object tail;
8428
         /* A vector eol_type means the eol-format is still open, so it
            is detected separately for the non-UTF-16, the UTF-16-BE and
            the UTF-16-LE readings of the bytes, as far as each was
            found.  */
8429     if (VECTORP (eol_type))
8430       {
8431         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8432           {
8433             if (null_byte_found)
8434               normal_eol = EOL_SEEN_LF;
8435             else
8436               normal_eol = detect_eol (coding.source, src_bytes,
8437                                        coding_category_raw_text);
8438           }
8439         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8440                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8441           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8442                                       coding_category_utf_16_be);
8443         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8444                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8445           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8446                                       coding_category_utf_16_le);
8447       }
8448     else
8449       {
             /* The eol-format is already fixed: Qunix gives LF, Qdos
                gives CRLF, anything else gives CR.  */
8450         if (EQ (eol_type, Qunix))
8451           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8452         else if (EQ (eol_type, Qdos))
8453           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8454         else
8455           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8456       }
8457
         /* Replace each id in VAL by a name: element 0, 1 or 2 of the
            candidate's own eol_type vector for LF, CRLF or CR, or the
            candidate's plain name when no eol-format applies.  */
8458     for (tail = val; CONSP (tail); tail = XCDR (tail))
8459       {
8460         enum coding_category category;
8461         int this_eol;
8462
8463         id = XINT (XCAR (tail));
8464         attrs = CODING_ID_ATTRS (id);
8465         category = XINT (CODING_ATTR_CATEGORY (attrs));
8466         eol_type = CODING_ID_EOL_TYPE (id);
8467         if (VECTORP (eol_type))
8468           {
8469             if (category == coding_category_utf_16_be
8470                 || category == coding_category_utf_16_be_nosig)
8471               this_eol = utf_16_be_eol;
8472             else if (category == coding_category_utf_16_le
8473                      || category == coding_category_utf_16_le_nosig)
8474               this_eol = utf_16_le_eol;
8475             else
8476               this_eol = normal_eol;
8477
8478             if (this_eol == EOL_SEEN_LF)
8479               XSETCAR (tail, AREF (eol_type, 0));
8480             else if (this_eol == EOL_SEEN_CRLF)
8481               XSETCAR (tail, AREF (eol_type, 1));
8482             else if (this_eol == EOL_SEEN_CR)
8483               XSETCAR (tail, AREF (eol_type, 2));
8484             else
8485               XSETCAR (tail, CODING_ID_NAME (id));
8486           }
8487         else
8488           XSETCAR (tail, CODING_ID_NAME (id));
8489       }
8490   }
8491
       /* With HIGHEST, return just the first element, or nil if VAL is
          empty.  */
8492   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8493 }
8494
8495
8496 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8497        2, 3, 0,
8498        doc: /* Detect coding system of the text in the region between START and END.
8499 Return a list of possible coding systems ordered by priority.
8500 The coding systems to try and their priorities follows what
8501 the function `coding-system-priority-list' (which see) returns.
8502
8503 If only ASCII characters are found (except for such ISO-2022 control
8504 characters as ESC), it returns a list of single element `undecided'
8505 or its subsidiary coding system according to a detected end-of-line
8506 format.
8507
8508 If optional argument HIGHEST is non-nil, return the coding system of
8509 highest priority.  */)
8510   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8511 {
8512   ptrdiff_t from, from_byte, to, to_byte;
8513   int want_highest = ! NILP (highest);
8514   int multibyte;
8515
8516   CHECK_NUMBER_COERCE_MARKER (start);
8517   CHECK_NUMBER_COERCE_MARKER (end);
8518   validate_region (&start, &end);
8519
8520   from = XINT (start);
8521   from_byte = CHAR_TO_BYTE (from);
8522   to = XINT (end);
8523   to_byte = CHAR_TO_BYTE (to);
8524
       /* If the gap lies inside the region, move it to the end of the
          region -- presumably so that the bytes handed to the detector
          are contiguous.  */
8525   if (GPT > from && GPT <= to)
8526     move_gap_both (to, to_byte);
8527
8528   multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8529   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8530                                to - from, to_byte - from_byte,
8531                                want_highest, multibyte, Qnil);
8532 }
8533
8534 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8535        1, 2, 0,
8536        doc: /* Detect coding system of the text in STRING.
8537 Return a list of possible coding systems ordered by priority.
8538 The coding systems to try and their priorities follows what
8539 the function `coding-system-priority-list' (which see) returns.
8540
8541 If only ASCII characters are found (except for such ISO-2022 control
8542 characters as ESC), it returns a list of single element `undecided'
8543 or its subsidiary coding system according to a detected end-of-line
8544 format.
8545
8546 If optional argument HIGHEST is non-nil, return the coding system of
8547 highest priority.  */)
8548   (Lisp_Object string, Lisp_Object highest)
8549 {
8550   int multibyte;
8551
8552   CHECK_STRING (string);
       /* The whole string is examined, with no coding system preset.  */
8553   multibyte = STRING_MULTIBYTE (string);
8554   return detect_coding_system (SDATA (string), SCHARS (string),
8555                                SBYTES (string), ! NILP (highest),
                                   multibyte, Qnil);
8556 }
8557
8558
8559 /* Return nonzero if character C, after being passed through the
        translation table of ATTRS (when there is one), belongs to one of
        the charsets in the charset list of ATTRS.  */
     static inline int
8560 char_encodable_p (int c, Lisp_Object attrs)
8561 {
8562   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8563   Lisp_Object rest;
8564
8565   if (! NILP (table))
8566     c = translate_char (table, c);
8567
8568   rest = CODING_ATTR_CHARSET_LIST (attrs);
8569   while (CONSP (rest))
8570     {
8571       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
8572       if (CHAR_CHARSET_P (c, cs))
8573         return 1;
8574       rest = XCDR (rest);
8575     }
       /* The list ran out: the answer is whatever its final tail says.  */
8576   return ! NILP (rest);
8577 }
8578
8579
8580 /* Return a list of coding systems that safely encode the text between
8581    START and END.  If EXCLUDE is non-nil, it is a list of coding
8582    systems not to check.  The returned list doesn't contain any such
8583    coding systems.  In any case, if the text contains only ASCII or is
8584    unibyte, return t.  */
8585
8586 DEFUN ("find-coding-systems-region-internal",
8587        Ffind_coding_systems_region_internal,
8588        Sfind_coding_systems_region_internal, 2, 3, 0,
8589        doc: /* Internal use only.  */)
8590   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8591 {
8592   Lisp_Object coding_attrs_list, safe_codings;
8593   ptrdiff_t start_byte, end_byte;
8594   const unsigned char *p, *pbeg, *pend;
8595   int c;
8596   Lisp_Object tail, elt, work_table;
8597
       /* START may itself be a string to examine; END is not consulted
          in that case.  */
8598   if (STRINGP (start))
8599     {
           /* A unibyte string, or a multibyte one with as many bytes as
              characters, needs no checking.  */
8600       if (!STRING_MULTIBYTE (start)
8601           || SCHARS (start) == SBYTES (start))
8602         return Qt;
8603       start_byte = 0;
8604       end_byte = SBYTES (start);
8605     }
8606   else
8607     {
8608       CHECK_NUMBER_COERCE_MARKER (start);
8609       CHECK_NUMBER_COERCE_MARKER (end);
8610       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8611         args_out_of_range (start, end);
8612       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8613         return Qt;
8614       start_byte = CHAR_TO_BYTE (XINT (start));
8615       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: nothing to check.  */
8616       if (XINT (end) - XINT (start) == end_byte - start_byte)
8617         return Qt;
8618
           /* If the gap lies strictly inside the region, move it to
              whichever end of the region is nearer.  */
8619       if (XINT (start) < GPT && XINT (end) > GPT)
8620         {
8621           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8622             move_gap_both (XINT (start), start_byte);
8623           else
8624             move_gap_both (XINT (end), end_byte);
8625         }
8626     }
8627
       /* Collect the attribute vectors of the candidates: every member
          of Vcoding_system_list that is not in EXCLUDE, is its own base
          name and is not of type `undecided'.  Note that this stores a
          translation table into the coding_attr_trans_tbl slot of each
          collected attribute vector.  */
8628   coding_attrs_list = Qnil;
8629   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8630     if (NILP (exclude)
8631         || NILP (Fmemq (XCAR (tail), exclude)))
8632       {
8633         Lisp_Object attrs;
8634
8635         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8636         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8637             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8638           {
8639             ASET (attrs, coding_attr_trans_tbl,
8640                   get_translation_table (attrs, 1, NULL));
8641             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8642           }
8643       }
8644
8645   if (STRINGP (start))
8646     p = pbeg = SDATA (start);
8647   else
8648     p = pbeg = BYTE_POS_ADDR (start_byte);
8649   pend = p + (end_byte - start_byte);
8650
       /* Trim the ASCII bytes at both ends of the text.  */
8651   while (p < pend && ASCII_BYTE_P (*p)) p++;
8652   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8653
       /* work_table records the characters already examined, so each
          distinct character is tested only once.  */
8654   work_table = Fmake_char_table (Qnil, Qnil);
8655   while (p < pend)
8656     {
8657       if (ASCII_BYTE_P (*p))
8658         p++;
8659       else
8660         {
8661           c = STRING_CHAR_ADVANCE (p);
8662           if (!NILP (char_table_ref (work_table, c)))
8663             /* This character was already checked.  Ignore it.  */
8664             continue;
8665
8666           charset_map_loaded = 0;
               /* Drop every candidate that cannot encode C.  A cell is
                  removed in place by copying the next cell over it (TAIL
                  is not advanced, so the copied element is tested next);
                  the last cell cannot be removed that way and has its
                  car set to nil instead.  */
8667           for (tail = coding_attrs_list; CONSP (tail);)
8668             {
8669               elt = XCAR (tail);
8670               if (NILP (elt))
8671                 tail = XCDR (tail);
8672               else if (char_encodable_p (c, elt))
8673                 tail = XCDR (tail);
8674               else if (CONSP (XCDR (tail)))
8675                 {
8676                   XSETCAR (tail, XCAR (XCDR (tail)));
8677                   XSETCDR (tail, XCDR (XCDR (tail)));
8678                 }
8679               else
8680                 {
8681                   XSETCAR (tail, Qnil);
8682                   tail = XCDR (tail);
8683                 }
8684             }
               /* If charset_map_loaded was set during the checks above,
                  re-derive P and PEND from a freshly fetched base
                  address -- presumably because loading a charset map can
                  relocate the string or buffer text.  */
8685           if (charset_map_loaded)
8686             {
8687               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8688
8689               if (STRINGP (start))
8690                 pbeg = SDATA (start);
8691               else
8692                 pbeg = BYTE_POS_ADDR (start_byte);
8693               p = pbeg + p_offset;
8694               pend = pbeg + pend_offset;
8695             }
8696           char_table_set (work_table, c, Qt);
8697         }
8698     }
8699
       /* Qraw_text and Qno_conversion are always part of the answer; the
          base names of the surviving candidates are consed in front.  */
8700   safe_codings = list2 (Qraw_text, Qno_conversion);
8701   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8702     if (! NILP (XCAR (tail)))
8703       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8704
8705   return safe_codings;
8706 }
8707
8708
8709 DEFUN ("unencodable-char-position", Funencodable_char_position,
8710        Sunencodable_char_position, 3, 5, 0,
8711        doc: /*
8712 Return position of first un-encodable character in a region.
8713 START and END specify the region and CODING-SYSTEM specifies the
8714 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8715
8716 If optional 4th argument COUNT is non-nil, it specifies at most how
8717 many un-encodable characters to search.  In this case, the value is a
8718 list of positions.
8719
8720 If optional 5th argument STRING is non-nil, it is a string to search
8721 for un-encodable characters.  In that case, START and END are indexes
8722 to the string.  */)
8723   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8724 {
8725   EMACS_INT n;
8726   struct coding_system coding;
8727   Lisp_Object attrs, charset_list, translation_table;
8728   Lisp_Object positions;
8729   ptrdiff_t from, to;
8730   const unsigned char *p, *stop, *pend;
8731   int ascii_compatible;
8732
8733   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8734   attrs = CODING_ID_ATTRS (coding.id);
8735   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8736     return Qnil;
8737   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8738   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8739   translation_table = get_translation_table (attrs, 1, NULL);
8740
8741   if (NILP (string))
8742     {
8743       validate_region (&start, &end);
8744       from = XINT (start);
8745       to = XINT (end);
8746       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8747           || (ascii_compatible
8748               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8749         return Qnil;
8750       p = CHAR_POS_ADDR (from);
8751       pend = CHAR_POS_ADDR (to);
8752       if (from < GPT && to >= GPT)
8753         stop = GPT_ADDR;
8754       else
8755         stop = pend;
8756     }
8757   else
8758     {
8759       CHECK_STRING (string);
8760       CHECK_NATNUM (start);
8761       CHECK_NATNUM (end);
8762       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8763         args_out_of_range_3 (string, start, end);
8764       from = XINT (start);
8765       to = XINT (end);
8766       if (! STRING_MULTIBYTE (string))
8767         return Qnil;
8768       p = SDATA (string) + string_char_to_byte (string, from);
8769       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8770       if (ascii_compatible && (to - from) == (pend - p))
8771         return Qnil;
8772     }
8773
8774   if (NILP (count))
8775     n = 1;
8776   else
8777     {
8778       CHECK_NATNUM (count);
8779       n = XINT (count);
8780     }
8781
8782   positions = Qnil;
8783   charset_map_loaded = 0;
8784   while (1)
8785     {
8786       int c;
8787
8788       if (ascii_compatible)
8789         while (p < stop && ASCII_BYTE_P (*p))
8790           p++, from++;
8791       if (p >= stop)
8792         {
8793           if (p >= pend)
8794             break;
8795           stop = pend;
8796           p = GAP_END_ADDR;
8797         }
8798
8799       c = STRING_CHAR_ADVANCE (p);
8800       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8801           && ! char_charset (translate_char (translation_table, c),
8802                              charset_list, NULL))
8803         {
8804           positions = Fcons (make_number (from), positions);
8805           n--;
8806           if (n == 0)
8807             break;
8808         }
8809
8810       from++;
8811       if (charset_map_loaded && NILP (string))
8812         {
8813           p = CHAR_POS_ADDR (from);
8814           pend = CHAR_POS_ADDR (to);
8815           if (from < GPT && to >= GPT)
8816             stop = GPT_ADDR;
8817           else
8818             stop = pend;
8819           charset_map_loaded = 0;
8820         }
8821     }
8822
8823   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8824 }
8825
8826
8827 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8828        Scheck_coding_systems_region, 3, 3, 0,
8829        doc: /* Check if the region is encodable by coding systems.
8830
8831 START and END are buffer positions specifying the region.
8832 CODING-SYSTEM-LIST is a list of coding systems to check.
8833
8834 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8835 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8836 whole region, POS0, POS1, ... are buffer positions where non-encodable
8837 characters are found.
8838
8839 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8840 value is nil.
8841
8842 START may be a string.  In that case, check if the string is
8843 encodable, and the value contains indices to the string instead of
8844 buffer positions.  END is ignored.
8845
8846 If the current buffer (or START if it is a string) is unibyte, the value
8847 is nil.  */)
8848   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8849 {
8850   Lisp_Object list;
8851   ptrdiff_t start_byte, end_byte;
8852   ptrdiff_t pos;
8853   const unsigned char *p, *pbeg, *pend;
8854   int c;
8855   Lisp_Object tail, elt, attrs;
8856
8857   if (STRINGP (start))
8858     {
8859       if (!STRING_MULTIBYTE (start)
8860           || SCHARS (start) == SBYTES (start))
8861         return Qnil;
8862       start_byte = 0;
8863       end_byte = SBYTES (start);
8864       pos = 0;
8865     }
8866   else
8867     {
8868       CHECK_NUMBER_COERCE_MARKER (start);
8869       CHECK_NUMBER_COERCE_MARKER (end);
8870       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8871         args_out_of_range (start, end);
8872       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8873         return Qnil;
8874       start_byte = CHAR_TO_BYTE (XINT (start));
8875       end_byte = CHAR_TO_BYTE (XINT (end));
8876       if (XINT (end) - XINT (start) == end_byte - start_byte)
8877         return Qnil;
8878
8879       if (XINT (start) < GPT && XINT (end) > GPT)
8880         {
8881           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8882             move_gap_both (XINT (start), start_byte);
8883           else
8884             move_gap_both (XINT (end), end_byte);
8885         }
8886       pos = XINT (start);
8887     }
8888
8889   list = Qnil;
8890   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8891     {
8892       elt = XCAR (tail);
8893       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8894       ASET (attrs, coding_attr_trans_tbl,
8895             get_translation_table (attrs, 1, NULL));
8896       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8897     }
8898
8899   if (STRINGP (start))
8900     p = pbeg = SDATA (start);
8901   else
8902     p = pbeg = BYTE_POS_ADDR (start_byte);
8903   pend = p + (end_byte - start_byte);
8904
8905   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8906   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8907
8908   while (p < pend)
8909     {
8910       if (ASCII_BYTE_P (*p))
8911         p++;
8912       else
8913         {
8914           c = STRING_CHAR_ADVANCE (p);
8915
8916           charset_map_loaded = 0;
8917           for (tail = list; CONSP (tail); tail = XCDR (tail))
8918             {
8919               elt = XCDR (XCAR (tail));
8920               if (! char_encodable_p (c, XCAR (elt)))
8921                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8922             }
8923           if (charset_map_loaded)
8924             {
8925               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8926
8927               if (STRINGP (start))
8928                 pbeg = SDATA (start);
8929               else
8930                 pbeg = BYTE_POS_ADDR (start_byte);
8931               p = pbeg + p_offset;
8932               pend = pbeg + pend_offset;
8933             }
8934         }
8935       pos++;
8936     }
8937
8938   tail = list;
8939   list = Qnil;
8940   for (; CONSP (tail); tail = XCDR (tail))
8941     {
8942       elt = XCAR (tail);
8943       if (CONSP (XCDR (XCDR (elt))))
8944         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8945                       list);
8946     }
8947
8948   return list;
8949 }
8950
8951
8952 static Lisp_Object
8953 code_convert_region (Lisp_Object start, Lisp_Object end,
8954                      Lisp_Object coding_system, Lisp_Object dst_object,
8955                      int encodep, int norecord)
8956 {
8957   struct coding_system coding;
8958   ptrdiff_t from, from_byte, to, to_byte;
8959   Lisp_Object src_object;
8960
8961   CHECK_NUMBER_COERCE_MARKER (start);
8962   CHECK_NUMBER_COERCE_MARKER (end);
8963   if (NILP (coding_system))
8964     coding_system = Qno_conversion;
8965   else
8966     CHECK_CODING_SYSTEM (coding_system);
8967   src_object = Fcurrent_buffer ();
8968   if (NILP (dst_object))
8969     dst_object = src_object;
8970   else if (! EQ (dst_object, Qt))
8971     CHECK_BUFFER (dst_object);
8972
8973   validate_region (&start, &end);
8974   from = XFASTINT (start);
8975   from_byte = CHAR_TO_BYTE (from);
8976   to = XFASTINT (end);
8977   to_byte = CHAR_TO_BYTE (to);
8978
8979   setup_coding_system (coding_system, &coding);
8980   coding.mode |= CODING_MODE_LAST_BLOCK;
8981
8982   if (encodep)
8983     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8984                           dst_object);
8985   else
8986     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8987                           dst_object);
8988   if (! norecord)
8989     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8990
8991   return (BUFFERP (dst_object)
8992           ? make_number (coding.produced_char)
8993           : coding.dst_object);
8994 }
8995
8996
8997 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8998        3, 4, "r\nzCoding system: ",
8999        doc: /* Decode the current region from the specified coding system.
9000 When called from a program, takes four arguments:
9001         START, END, CODING-SYSTEM, and DESTINATION.
9002 START and END are buffer positions.
9003
9004 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9005 If nil, the region between START and END is replaced by the decoded text.
9006 If buffer, the decoded text is inserted in that buffer after point (point
9007 does not move).
9008 In those cases, the length of the decoded text is returned.
9009 If DESTINATION is t, the decoded text is returned.
9010
9011 This function sets `last-coding-system-used' to the precise coding system
9012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9013 not fully specified.)  */)
9014   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9015 {
9016   return code_convert_region (start, end, coding_system, destination, 0, 0);
9017 }
9018
9019 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9020        3, 4, "r\nzCoding system: ",
9021        doc: /* Encode the current region by specified coding system.
9022 When called from a program, takes four arguments:
9023         START, END, CODING-SYSTEM and DESTINATION.
9024 START and END are buffer positions.
9025
9026 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9027 If nil, the region between START and END is replace by the encoded text.
9028 If buffer, the encoded text is inserted in that buffer after point (point
9029 does not move).
9030 In those cases, the length of the encoded text is returned.
9031 If DESTINATION is t, the encoded text is returned.
9032
9033 This function sets `last-coding-system-used' to the precise coding system
9034 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9035 not fully specified.)  */)
9036   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9037 {
9038   return code_convert_region (start, end, coding_system, destination, 1, 0);
9039 }
9040
9041 Lisp_Object
9042 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9043                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9044 {
9045   struct coding_system coding;
9046   ptrdiff_t chars, bytes;
9047
9048   CHECK_STRING (string);
9049   if (NILP (coding_system))
9050     {
9051       if (! norecord)
9052         Vlast_coding_system_used = Qno_conversion;
9053       if (NILP (dst_object))
9054         return (nocopy ? Fcopy_sequence (string) : string);
9055     }
9056
9057   if (NILP (coding_system))
9058     coding_system = Qno_conversion;
9059   else
9060     CHECK_CODING_SYSTEM (coding_system);
9061   if (NILP (dst_object))
9062     dst_object = Qt;
9063   else if (! EQ (dst_object, Qt))
9064     CHECK_BUFFER (dst_object);
9065
9066   setup_coding_system (coding_system, &coding);
9067   coding.mode |= CODING_MODE_LAST_BLOCK;
9068   chars = SCHARS (string);
9069   bytes = SBYTES (string);
9070   if (encodep)
9071     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9072   else
9073     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9074   if (! norecord)
9075     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9076
9077   return (BUFFERP (dst_object)
9078           ? make_number (coding.produced_char)
9079           : coding.dst_object);
9080 }
9081
9082
9083 /* Encode or decode STRING according to CODING_SYSTEM.
9084    Do not set Vlast_coding_system_used.
9085
9086    This function is called only from macros DECODE_FILE and
9087    ENCODE_FILE, thus we ignore character composition.  */
9088
9089 Lisp_Object
9090 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9091                               int encodep)
9092 {
9093   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9094 }
9095
9096
9097 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9098        2, 4, 0,
9099        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9100
9101 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9102 if the decoding operation is trivial.
9103
9104 Optional fourth arg BUFFER non-nil means that the decoded text is
9105 inserted in that buffer after point (point does not move).  In this
9106 case, the return value is the length of the decoded text.
9107
9108 This function sets `last-coding-system-used' to the precise coding system
9109 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9110 not fully specified.)  */)
9111   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9112 {
9113   return code_convert_string (string, coding_system, buffer,
9114                               0, ! NILP (nocopy), 0);
9115 }
9116
9117 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9118        2, 4, 0,
9119        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9120
9121 Optional third arg NOCOPY non-nil means it is OK to return STRING
9122 itself if the encoding operation is trivial.
9123
9124 Optional fourth arg BUFFER non-nil means that the encoded text is
9125 inserted in that buffer after point (point does not move).  In this
9126 case, the return value is the length of the encoded text.
9127
9128 This function sets `last-coding-system-used' to the precise coding system
9129 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9130 not fully specified.)  */)
9131   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9132 {
9133   return code_convert_string (string, coding_system, buffer,
9134                               1, ! NILP (nocopy), 0);
9135 }
9136
9137 \f
9138 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9139        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9140 Return the corresponding character.  */)
9141   (Lisp_Object code)
9142 {
9143   Lisp_Object spec, attrs, val;
9144   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9145   EMACS_INT ch;
9146   int c;
9147
9148   CHECK_NATNUM (code);
9149   ch = XFASTINT (code);
9150   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9151   attrs = AREF (spec, 0);
9152
9153   if (ASCII_BYTE_P (ch)
9154       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9155     return code;
9156
9157   val = CODING_ATTR_CHARSET_LIST (attrs);
9158   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9159   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9160   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9161
9162   if (ch <= 0x7F)
9163     {
9164       c = ch;
9165       charset = charset_roman;
9166     }
9167   else if (ch >= 0xA0 && ch < 0xDF)
9168     {
9169       c = ch - 0x80;
9170       charset = charset_kana;
9171     }
9172   else
9173     {
9174       EMACS_INT c1 = ch >> 8;
9175       int c2 = ch & 0xFF;
9176
9177       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9178           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9179         error ("Invalid code: %"pI"d", ch);
9180       c = ch;
9181       SJIS_TO_JIS (c);
9182       charset = charset_kanji;
9183     }
9184   c = DECODE_CHAR (charset, c);
9185   if (c < 0)
9186     error ("Invalid code: %"pI"d", ch);
9187   return make_number (c);
9188 }
9189
9190
9191 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9192        doc: /* Encode a Japanese character CH to shift_jis encoding.
9193 Return the corresponding code in SJIS.  */)
9194   (Lisp_Object ch)
9195 {
9196   Lisp_Object spec, attrs, charset_list;
9197   int c;
9198   struct charset *charset;
9199   unsigned code;
9200
9201   CHECK_CHARACTER (ch);
9202   c = XFASTINT (ch);
9203   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9204   attrs = AREF (spec, 0);
9205
9206   if (ASCII_CHAR_P (c)
9207       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9208     return ch;
9209
9210   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9211   charset = char_charset (c, charset_list, &code);
9212   if (code == CHARSET_INVALID_CODE (charset))
9213     error ("Can't encode by shift_jis encoding: %c", c);
9214   JIS_TO_SJIS (code);
9215
9216   return make_number (code);
9217 }
9218
9219 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9220        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9221 Return the corresponding character.  */)
9222   (Lisp_Object code)
9223 {
9224   Lisp_Object spec, attrs, val;
9225   struct charset *charset_roman, *charset_big5, *charset;
9226   EMACS_INT ch;
9227   int c;
9228
9229   CHECK_NATNUM (code);
9230   ch = XFASTINT (code);
9231   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9232   attrs = AREF (spec, 0);
9233
9234   if (ASCII_BYTE_P (ch)
9235       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9236     return code;
9237
9238   val = CODING_ATTR_CHARSET_LIST (attrs);
9239   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9240   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9241
9242   if (ch <= 0x7F)
9243     {
9244       c = ch;
9245       charset = charset_roman;
9246     }
9247   else
9248     {
9249       EMACS_INT b1 = ch >> 8;
9250       int b2 = ch & 0x7F;
9251       if (b1 < 0xA1 || b1 > 0xFE
9252           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9253         error ("Invalid code: %"pI"d", ch);
9254       c = ch;
9255       charset = charset_big5;
9256     }
9257   c = DECODE_CHAR (charset, c);
9258   if (c < 0)
9259     error ("Invalid code: %"pI"d", ch);
9260   return make_number (c);
9261 }
9262
9263 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9264        doc: /* Encode the Big5 character CH to BIG5 coding system.
9265 Return the corresponding character code in Big5.  */)
9266   (Lisp_Object ch)
9267 {
9268   Lisp_Object spec, attrs, charset_list;
9269   struct charset *charset;
9270   int c;
9271   unsigned code;
9272
9273   CHECK_CHARACTER (ch);
9274   c = XFASTINT (ch);
9275   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9276   attrs = AREF (spec, 0);
9277   if (ASCII_CHAR_P (c)
9278       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9279     return ch;
9280
9281   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9282   charset = char_charset (c, charset_list, &code);
9283   if (code == CHARSET_INVALID_CODE (charset))
9284     error ("Can't encode by Big5 encoding: %c", c);
9285
9286   return make_number (code);
9287 }
9288
9289 \f
9290 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9291        Sset_terminal_coding_system_internal, 1, 2, 0,
9292        doc: /* Internal use only.  */)
9293   (Lisp_Object coding_system, Lisp_Object terminal)
9294 {
9295   struct terminal *term = get_terminal (terminal, 1);
9296   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9297   CHECK_SYMBOL (coding_system);
9298   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9299   /* We had better not send unsafe characters to terminal.  */
9300   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9301   /* Character composition should be disabled.  */
9302   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9303   terminal_coding->src_multibyte = 1;
9304   terminal_coding->dst_multibyte = 0;
9305   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9306     TSET (term, charset_list, coding_charset_list (terminal_coding));
9307   else
9308     TSET (term, charset_list, Fcons (make_number (charset_ascii), Qnil));
9309   return Qnil;
9310 }
9311
9312 DEFUN ("set-safe-terminal-coding-system-internal",
9313        Fset_safe_terminal_coding_system_internal,
9314        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9315        doc: /* Internal use only.  */)
9316   (Lisp_Object coding_system)
9317 {
9318   CHECK_SYMBOL (coding_system);
9319   setup_coding_system (Fcheck_coding_system (coding_system),
9320                        &safe_terminal_coding);
9321   /* Character composition should be disabled.  */
9322   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9323   safe_terminal_coding.src_multibyte = 1;
9324   safe_terminal_coding.dst_multibyte = 0;
9325   return Qnil;
9326 }
9327
9328 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9329        Sterminal_coding_system, 0, 1, 0,
9330        doc: /* Return coding system specified for terminal output on the given terminal.
9331 TERMINAL may be a terminal object, a frame, or nil for the selected
9332 frame's terminal device.  */)
9333   (Lisp_Object terminal)
9334 {
9335   struct coding_system *terminal_coding
9336     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9337   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9338
9339   /* For backward compatibility, return nil if it is `undecided'.  */
9340   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9341 }
9342
9343 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9344        Sset_keyboard_coding_system_internal, 1, 2, 0,
9345        doc: /* Internal use only.  */)
9346   (Lisp_Object coding_system, Lisp_Object terminal)
9347 {
9348   struct terminal *t = get_terminal (terminal, 1);
9349   CHECK_SYMBOL (coding_system);
9350   if (NILP (coding_system))
9351     coding_system = Qno_conversion;
9352   else
9353     Fcheck_coding_system (coding_system);
9354   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9355   /* Character composition should be disabled.  */
9356   TERMINAL_KEYBOARD_CODING (t)->common_flags
9357     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9358   return Qnil;
9359 }
9360
9361 DEFUN ("keyboard-coding-system",
9362        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9363        doc: /* Return coding system specified for decoding keyboard input.  */)
9364   (Lisp_Object terminal)
9365 {
9366   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9367                          (get_terminal (terminal, 1))->id);
9368 }
9369
9370 \f
9371 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9372        Sfind_operation_coding_system,  1, MANY, 0,
9373        doc: /* Choose a coding system for an operation based on the target name.
9374 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9375 DECODING-SYSTEM is the coding system to use for decoding
9376 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9377 for encoding (in case OPERATION does encoding).
9378
9379 The first argument OPERATION specifies an I/O primitive:
9380   For file I/O, `insert-file-contents' or `write-region'.
9381   For process I/O, `call-process', `call-process-region', or `start-process'.
9382   For network I/O, `open-network-stream'.
9383
9384 The remaining arguments should be the same arguments that were passed
9385 to the primitive.  Depending on which primitive, one of those arguments
9386 is selected as the TARGET.  For example, if OPERATION does file I/O,
9387 whichever argument specifies the file name is TARGET.
9388
9389 TARGET has a meaning which depends on OPERATION:
9390   For file I/O, TARGET is a file name (except for the special case below).
9391   For process I/O, TARGET is a process name.
9392   For network I/O, TARGET is a service name or a port number.
9393
9394 This function looks up what is specified for TARGET in
9395 `file-coding-system-alist', `process-coding-system-alist',
9396 or `network-coding-system-alist' depending on OPERATION.
9397 They may specify a coding system, a cons of coding systems,
9398 or a function symbol to call.
9399 In the last case, we call the function with one argument,
9400 which is a list of all the arguments given to this function.
9401 If the function can't decide a coding system, it can return
9402 `undecided' so that the normal code-detection is performed.
9403
9404 If OPERATION is `insert-file-contents', the argument corresponding to
9405 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9406 file name to look up, and BUFFER is a buffer that contains the file's
9407 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9408 function to call for FILENAME, that function should examine the
9409 contents of BUFFER instead of reading the file.
9410
9411 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9412   (ptrdiff_t nargs, Lisp_Object *args)
9413 {
9414   Lisp_Object operation, target_idx, target, val;
9415   register Lisp_Object chain;
9416
9417   if (nargs < 2)
9418     error ("Too few arguments");
9419   operation = args[0];
9420   if (!SYMBOLP (operation)
9421       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9422     error ("Invalid first argument");
9423   if (nargs <= 1 + XFASTINT (target_idx))
9424     error ("Too few arguments for operation `%s'",
9425            SDATA (SYMBOL_NAME (operation)));
9426   target = args[XFASTINT (target_idx) + 1];
9427   if (!(STRINGP (target)
9428         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9429             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9430         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9431     error ("Invalid argument %"pI"d of operation `%s'",
9432            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9433   if (CONSP (target))
9434     target = XCAR (target);
9435
9436   chain = ((EQ (operation, Qinsert_file_contents)
9437             || EQ (operation, Qwrite_region))
9438            ? Vfile_coding_system_alist
9439            : (EQ (operation, Qopen_network_stream)
9440               ? Vnetwork_coding_system_alist
9441               : Vprocess_coding_system_alist));
9442   if (NILP (chain))
9443     return Qnil;
9444
9445   for (; CONSP (chain); chain = XCDR (chain))
9446     {
9447       Lisp_Object elt;
9448
9449       elt = XCAR (chain);
9450       if (CONSP (elt)
9451           && ((STRINGP (target)
9452                && STRINGP (XCAR (elt))
9453                && fast_string_match (XCAR (elt), target) >= 0)
9454               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9455         {
9456           val = XCDR (elt);
9457           /* Here, if VAL is both a valid coding system and a valid
9458              function symbol, we return VAL as a coding system.  */
9459           if (CONSP (val))
9460             return val;
9461           if (! SYMBOLP (val))
9462             return Qnil;
9463           if (! NILP (Fcoding_system_p (val)))
9464             return Fcons (val, val);
9465           if (! NILP (Ffboundp (val)))
9466             {
9467               /* We use call1 rather than safe_call1
9468                  so as to get bug reports about functions called here
9469                  which don't handle the current interface.  */
9470               val = call1 (val, Flist (nargs, args));
9471               if (CONSP (val))
9472                 return val;
9473               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9474                 return Fcons (val, val);
9475             }
9476           return Qnil;
9477         }
9478     }
9479   return Qnil;
9480 }
9481
9482 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9483        Sset_coding_system_priority, 0, MANY, 0,
9484        doc: /* Assign higher priority to the coding systems given as arguments.
9485 If multiple coding systems belong to the same category,
9486 all but the first one are ignored.
9487
9488 usage: (set-coding-system-priority &rest coding-systems)  */)
9489   (ptrdiff_t nargs, Lisp_Object *args)
9490 {
9491   ptrdiff_t i, j;
9492   int changed[coding_category_max];
9493   enum coding_category priorities[coding_category_max];
9494
9495   memset (changed, 0, sizeof changed);
9496
9497   for (i = j = 0; i < nargs; i++)
9498     {
9499       enum coding_category category;
9500       Lisp_Object spec, attrs;
9501
9502       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9503       attrs = AREF (spec, 0);
9504       category = XINT (CODING_ATTR_CATEGORY (attrs));
9505       if (changed[category])
9506         /* Ignore this coding system because a coding system of the
9507            same category already had a higher priority.  */
9508         continue;
9509       changed[category] = 1;
9510       priorities[j++] = category;
9511       if (coding_categories[category].id >= 0
9512           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9513         setup_coding_system (args[i], &coding_categories[category]);
9514       Fset (AREF (Vcoding_category_table, category), args[i]);
9515     }
9516
9517   /* Now we have decided top J priorities.  Reflect the order of the
9518      original priorities to the remaining priorities.  */
9519
9520   for (i = j, j = 0; i < coding_category_max; i++, j++)
9521     {
9522       while (j < coding_category_max
9523              && changed[coding_priorities[j]])
9524         j++;
9525       if (j == coding_category_max)
9526         abort ();
9527       priorities[i] = coding_priorities[j];
9528     }
9529
9530   memcpy (coding_priorities, priorities, sizeof priorities);
9531
9532   /* Update `coding-category-list'.  */
9533   Vcoding_category_list = Qnil;
9534   for (i = coding_category_max; i-- > 0; )
9535     Vcoding_category_list
9536       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9537                Vcoding_category_list);
9538
9539   return Qnil;
9540 }
9541
9542 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9543        Scoding_system_priority_list, 0, 1, 0,
9544        doc: /* Return a list of coding systems ordered by their priorities.
9545 The list contains a subset of coding systems; i.e. coding systems
9546 assigned to each coding category (see `coding-category-list').
9547
9548 HIGHESTP non-nil means just return the highest priority one.  */)
9549   (Lisp_Object highestp)
9550 {
9551   int i;
9552   Lisp_Object val;
9553
9554   for (i = 0, val = Qnil; i < coding_category_max; i++)
9555     {
9556       enum coding_category category = coding_priorities[i];
9557       int id = coding_categories[category].id;
9558       Lisp_Object attrs;
9559
9560       if (id < 0)
9561         continue;
9562       attrs = CODING_ID_ATTRS (id);
9563       if (! NILP (highestp))
9564         return CODING_ATTR_BASE_NAME (attrs);
9565       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9566     }
9567   return Fnreverse (val);
9568 }
9569
9570 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9571
9572 static Lisp_Object
9573 make_subsidiaries (Lisp_Object base)
9574 {
9575   Lisp_Object subsidiaries;
9576   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9577   char *buf = alloca (base_name_len + 6);
9578   int i;
9579
9580   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9581   subsidiaries = Fmake_vector (make_number (3), Qnil);
9582   for (i = 0; i < 3; i++)
9583     {
9584       strcpy (buf + base_name_len, suffixes[i]);
9585       ASET (subsidiaries, i, intern (buf));
9586     }
9587   return subsidiaries;
9588 }
9589
9590
9591 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9592        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9593        doc: /* For internal use only.
9594 usage: (define-coding-system-internal ...)  */)
9595   (ptrdiff_t nargs, Lisp_Object *args)
9596 {
9597   Lisp_Object name;
9598   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9599   Lisp_Object attrs;            /* Vector of attributes.  */
9600   Lisp_Object eol_type;
9601   Lisp_Object aliases;
9602   Lisp_Object coding_type, charset_list, safe_charsets;
9603   enum coding_category category;
9604   Lisp_Object tail, val;
9605   int max_charset_id = 0;
9606   int i;
9607
9608   if (nargs < coding_arg_max)
9609     goto short_args;
9610
9611   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9612
9613   name = args[coding_arg_name];
9614   CHECK_SYMBOL (name);
9615   ASET (attrs, coding_attr_base_name, name);
9616
9617   val = args[coding_arg_mnemonic];
9618   if (! STRINGP (val))
9619     CHECK_CHARACTER (val);
9620   ASET (attrs, coding_attr_mnemonic, val);
9621
9622   coding_type = args[coding_arg_coding_type];
9623   CHECK_SYMBOL (coding_type);
9624   ASET (attrs, coding_attr_type, coding_type);
9625
9626   charset_list = args[coding_arg_charset_list];
9627   if (SYMBOLP (charset_list))
9628     {
9629       if (EQ (charset_list, Qiso_2022))
9630         {
9631           if (! EQ (coding_type, Qiso_2022))
9632             error ("Invalid charset-list");
9633           charset_list = Viso_2022_charset_list;
9634         }
9635       else if (EQ (charset_list, Qemacs_mule))
9636         {
9637           if (! EQ (coding_type, Qemacs_mule))
9638             error ("Invalid charset-list");
9639           charset_list = Vemacs_mule_charset_list;
9640         }
9641       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9642         {
9643           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9644             error ("Invalid charset-list");
9645           if (max_charset_id < XFASTINT (XCAR (tail)))
9646             max_charset_id = XFASTINT (XCAR (tail));
9647         }
9648     }
9649   else
9650     {
9651       charset_list = Fcopy_sequence (charset_list);
9652       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9653         {
9654           struct charset *charset;
9655
9656           val = XCAR (tail);
9657           CHECK_CHARSET_GET_CHARSET (val, charset);
9658           if (EQ (coding_type, Qiso_2022)
9659               ? CHARSET_ISO_FINAL (charset) < 0
9660               : EQ (coding_type, Qemacs_mule)
9661               ? CHARSET_EMACS_MULE_ID (charset) < 0
9662               : 0)
9663             error ("Can't handle charset `%s'",
9664                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9665
9666           XSETCAR (tail, make_number (charset->id));
9667           if (max_charset_id < charset->id)
9668             max_charset_id = charset->id;
9669         }
9670     }
9671   ASET (attrs, coding_attr_charset_list, charset_list);
9672
9673   safe_charsets = make_uninit_string (max_charset_id + 1);
9674   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9675   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9676     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9677   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9678
9679   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9680
9681   val = args[coding_arg_decode_translation_table];
9682   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9683     CHECK_SYMBOL (val);
9684   ASET (attrs, coding_attr_decode_tbl, val);
9685
9686   val = args[coding_arg_encode_translation_table];
9687   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9688     CHECK_SYMBOL (val);
9689   ASET (attrs, coding_attr_encode_tbl, val);
9690
9691   val = args[coding_arg_post_read_conversion];
9692   CHECK_SYMBOL (val);
9693   ASET (attrs, coding_attr_post_read, val);
9694
9695   val = args[coding_arg_pre_write_conversion];
9696   CHECK_SYMBOL (val);
9697   ASET (attrs, coding_attr_pre_write, val);
9698
9699   val = args[coding_arg_default_char];
9700   if (NILP (val))
9701     ASET (attrs, coding_attr_default_char, make_number (' '));
9702   else
9703     {
9704       CHECK_CHARACTER (val);
9705       ASET (attrs, coding_attr_default_char, val);
9706     }
9707
9708   val = args[coding_arg_for_unibyte];
9709   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9710
9711   val = args[coding_arg_plist];
9712   CHECK_LIST (val);
9713   ASET (attrs, coding_attr_plist, val);
9714
9715   if (EQ (coding_type, Qcharset))
9716     {
9717       /* Generate a lisp vector of 256 elements.  Each element is nil,
9718          integer, or a list of charset IDs.
9719
9720          If Nth element is nil, the byte code N is invalid in this
9721          coding system.
9722
9723          If Nth element is a number NUM, N is the first byte of a
9724          charset whose ID is NUM.
9725
9726          If Nth element is a list of charset IDs, N is the first byte
9727          of one of them.  The list is sorted by dimensions of the
9728          charsets.  A charset of smaller dimension comes first. */
9729       val = Fmake_vector (make_number (256), Qnil);
9730
9731       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9732         {
9733           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9734           int dim = CHARSET_DIMENSION (charset);
9735           int idx = (dim - 1) * 4;
9736
9737           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9738             ASET (attrs, coding_attr_ascii_compat, Qt);
9739
9740           for (i = charset->code_space[idx];
9741                i <= charset->code_space[idx + 1]; i++)
9742             {
9743               Lisp_Object tmp, tmp2;
9744               int dim2;
9745
9746               tmp = AREF (val, i);
9747               if (NILP (tmp))
9748                 tmp = XCAR (tail);
9749               else if (NUMBERP (tmp))
9750                 {
9751                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9752                   if (dim < dim2)
9753                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9754                   else
9755                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9756                 }
9757               else
9758                 {
9759                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9760                     {
9761                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9762                       if (dim < dim2)
9763                         break;
9764                     }
9765                   if (NILP (tmp2))
9766                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9767                   else
9768                     {
9769                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9770                       XSETCAR (tmp2, XCAR (tail));
9771                     }
9772                 }
9773               ASET (val, i, tmp);
9774             }
9775         }
9776       ASET (attrs, coding_attr_charset_valids, val);
9777       category = coding_category_charset;
9778     }
9779   else if (EQ (coding_type, Qccl))
9780     {
9781       Lisp_Object valids;
9782
9783       if (nargs < coding_arg_ccl_max)
9784         goto short_args;
9785
9786       val = args[coding_arg_ccl_decoder];
9787       CHECK_CCL_PROGRAM (val);
9788       if (VECTORP (val))
9789         val = Fcopy_sequence (val);
9790       ASET (attrs, coding_attr_ccl_decoder, val);
9791
9792       val = args[coding_arg_ccl_encoder];
9793       CHECK_CCL_PROGRAM (val);
9794       if (VECTORP (val))
9795         val = Fcopy_sequence (val);
9796       ASET (attrs, coding_attr_ccl_encoder, val);
9797
9798       val = args[coding_arg_ccl_valids];
9799       valids = Fmake_string (make_number (256), make_number (0));
9800       for (tail = val; CONSP (tail); tail = XCDR (tail))
9801         {
9802           int from, to;
9803
9804           val = XCAR (tail);
9805           if (INTEGERP (val))
9806             {
9807               if (! (0 <= XINT (val) && XINT (val) <= 255))
9808                 args_out_of_range_3 (val, make_number (0), make_number (255));
9809               from = to = XINT (val);
9810             }
9811           else
9812             {
9813               CHECK_CONS (val);
9814               CHECK_NATNUM_CAR (val);
9815               CHECK_NUMBER_CDR (val);
9816               if (XINT (XCAR (val)) > 255)
9817                 args_out_of_range_3 (XCAR (val),
9818                                      make_number (0), make_number (255));
9819               from = XINT (XCAR (val));
9820               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9821                 args_out_of_range_3 (XCDR (val),
9822                                      XCAR (val), make_number (255));
9823               to = XINT (XCDR (val));
9824             }
9825           for (i = from; i <= to; i++)
9826             SSET (valids, i, 1);
9827         }
9828       ASET (attrs, coding_attr_ccl_valids, valids);
9829
9830       category = coding_category_ccl;
9831     }
9832   else if (EQ (coding_type, Qutf_16))
9833     {
9834       Lisp_Object bom, endian;
9835
9836       ASET (attrs, coding_attr_ascii_compat, Qnil);
9837
9838       if (nargs < coding_arg_utf16_max)
9839         goto short_args;
9840
9841       bom = args[coding_arg_utf16_bom];
9842       if (! NILP (bom) && ! EQ (bom, Qt))
9843         {
9844           CHECK_CONS (bom);
9845           val = XCAR (bom);
9846           CHECK_CODING_SYSTEM (val);
9847           val = XCDR (bom);
9848           CHECK_CODING_SYSTEM (val);
9849         }
9850       ASET (attrs, coding_attr_utf_bom, bom);
9851
9852       endian = args[coding_arg_utf16_endian];
9853       CHECK_SYMBOL (endian);
9854       if (NILP (endian))
9855         endian = Qbig;
9856       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9857         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9858       ASET (attrs, coding_attr_utf_16_endian, endian);
9859
9860       category = (CONSP (bom)
9861                   ? coding_category_utf_16_auto
9862                   : NILP (bom)
9863                   ? (EQ (endian, Qbig)
9864                      ? coding_category_utf_16_be_nosig
9865                      : coding_category_utf_16_le_nosig)
9866                   : (EQ (endian, Qbig)
9867                      ? coding_category_utf_16_be
9868                      : coding_category_utf_16_le));
9869     }
9870   else if (EQ (coding_type, Qiso_2022))
9871     {
9872       Lisp_Object initial, reg_usage, request, flags;
9873
9874       if (nargs < coding_arg_iso2022_max)
9875         goto short_args;
9876
9877       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9878       CHECK_VECTOR (initial);
9879       for (i = 0; i < 4; i++)
9880         {
9881           val = Faref (initial, make_number (i));
9882           if (! NILP (val))
9883             {
9884               struct charset *charset;
9885
9886               CHECK_CHARSET_GET_CHARSET (val, charset);
9887               ASET (initial, i, make_number (CHARSET_ID (charset)));
9888               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9889                 ASET (attrs, coding_attr_ascii_compat, Qt);
9890             }
9891           else
9892             ASET (initial, i, make_number (-1));
9893         }
9894
9895       reg_usage = args[coding_arg_iso2022_reg_usage];
9896       CHECK_CONS (reg_usage);
9897       CHECK_NUMBER_CAR (reg_usage);
9898       CHECK_NUMBER_CDR (reg_usage);
9899
9900       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9901       for (tail = request; CONSP (tail); tail = XCDR (tail))
9902         {
9903           int id;
9904           Lisp_Object tmp1;
9905
9906           val = XCAR (tail);
9907           CHECK_CONS (val);
9908           tmp1 = XCAR (val);
9909           CHECK_CHARSET_GET_ID (tmp1, id);
9910           CHECK_NATNUM_CDR (val);
9911           if (XINT (XCDR (val)) >= 4)
9912             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9913           XSETCAR (val, make_number (id));
9914         }
9915
9916       flags = args[coding_arg_iso2022_flags];
9917       CHECK_NATNUM (flags);
9918       i = XINT (flags) & INT_MAX;
9919       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9920         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9921       flags = make_number (i);
9922
9923       ASET (attrs, coding_attr_iso_initial, initial);
9924       ASET (attrs, coding_attr_iso_usage, reg_usage);
9925       ASET (attrs, coding_attr_iso_request, request);
9926       ASET (attrs, coding_attr_iso_flags, flags);
9927       setup_iso_safe_charsets (attrs);
9928
9929       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9930         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9931                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9932                     ? coding_category_iso_7_else
9933                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9934                     ? coding_category_iso_7
9935                     : coding_category_iso_7_tight);
9936       else
9937         {
9938           int id = XINT (AREF (initial, 1));
9939
9940           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9941                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9942                        || id < 0)
9943                       ? coding_category_iso_8_else
9944                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9945                       ? coding_category_iso_8_1
9946                       : coding_category_iso_8_2);
9947         }
9948       if (category != coding_category_iso_8_1
9949           && category != coding_category_iso_8_2)
9950         ASET (attrs, coding_attr_ascii_compat, Qnil);
9951     }
9952   else if (EQ (coding_type, Qemacs_mule))
9953     {
9954       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9955         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9956       ASET (attrs, coding_attr_ascii_compat, Qt);
9957       category = coding_category_emacs_mule;
9958     }
9959   else if (EQ (coding_type, Qshift_jis))
9960     {
9961
9962       struct charset *charset;
9963
9964       if (XINT (Flength (charset_list)) != 3
9965           && XINT (Flength (charset_list)) != 4)
9966         error ("There should be three or four charsets");
9967
9968       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9969       if (CHARSET_DIMENSION (charset) != 1)
9970         error ("Dimension of charset %s is not one",
9971                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9972       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9973         ASET (attrs, coding_attr_ascii_compat, Qt);
9974
9975       charset_list = XCDR (charset_list);
9976       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9977       if (CHARSET_DIMENSION (charset) != 1)
9978         error ("Dimension of charset %s is not one",
9979                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9980
9981       charset_list = XCDR (charset_list);
9982       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9983       if (CHARSET_DIMENSION (charset) != 2)
9984         error ("Dimension of charset %s is not two",
9985                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9986
9987       charset_list = XCDR (charset_list);
9988       if (! NILP (charset_list))
9989         {
9990           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9991           if (CHARSET_DIMENSION (charset) != 2)
9992             error ("Dimension of charset %s is not two",
9993                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9994         }
9995
9996       category = coding_category_sjis;
9997       Vsjis_coding_system = name;
9998     }
9999   else if (EQ (coding_type, Qbig5))
10000     {
10001       struct charset *charset;
10002
10003       if (XINT (Flength (charset_list)) != 2)
10004         error ("There should be just two charsets");
10005
10006       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10007       if (CHARSET_DIMENSION (charset) != 1)
10008         error ("Dimension of charset %s is not one",
10009                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10010       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10011         ASET (attrs, coding_attr_ascii_compat, Qt);
10012
10013       charset_list = XCDR (charset_list);
10014       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10015       if (CHARSET_DIMENSION (charset) != 2)
10016         error ("Dimension of charset %s is not two",
10017                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10018
10019       category = coding_category_big5;
10020       Vbig5_coding_system = name;
10021     }
10022   else if (EQ (coding_type, Qraw_text))
10023     {
10024       category = coding_category_raw_text;
10025       ASET (attrs, coding_attr_ascii_compat, Qt);
10026     }
10027   else if (EQ (coding_type, Qutf_8))
10028     {
10029       Lisp_Object bom;
10030
10031       if (nargs < coding_arg_utf8_max)
10032         goto short_args;
10033
10034       bom = args[coding_arg_utf8_bom];
10035       if (! NILP (bom) && ! EQ (bom, Qt))
10036         {
10037           CHECK_CONS (bom);
10038           val = XCAR (bom);
10039           CHECK_CODING_SYSTEM (val);
10040           val = XCDR (bom);
10041           CHECK_CODING_SYSTEM (val);
10042         }
10043       ASET (attrs, coding_attr_utf_bom, bom);
10044       if (NILP (bom))
10045         ASET (attrs, coding_attr_ascii_compat, Qt);
10046
10047       category = (CONSP (bom) ? coding_category_utf_8_auto
10048                   : NILP (bom) ? coding_category_utf_8_nosig
10049                   : coding_category_utf_8_sig);
10050     }
10051   else if (EQ (coding_type, Qundecided))
10052     category = coding_category_undecided;
10053   else
10054     error ("Invalid coding system type: %s",
10055            SDATA (SYMBOL_NAME (coding_type)));
10056
10057   ASET (attrs, coding_attr_category, make_number (category));
10058   ASET (attrs, coding_attr_plist,
10059         Fcons (QCcategory,
10060                Fcons (AREF (Vcoding_category_table, category),
10061                       CODING_ATTR_PLIST (attrs))));
10062   ASET (attrs, coding_attr_plist,
10063         Fcons (QCascii_compatible_p,
10064                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10065                       CODING_ATTR_PLIST (attrs))));
10066
10067   eol_type = args[coding_arg_eol_type];
10068   if (! NILP (eol_type)
10069       && ! EQ (eol_type, Qunix)
10070       && ! EQ (eol_type, Qdos)
10071       && ! EQ (eol_type, Qmac))
10072     error ("Invalid eol-type");
10073
10074   aliases = Fcons (name, Qnil);
10075
10076   if (NILP (eol_type))
10077     {
10078       eol_type = make_subsidiaries (name);
10079       for (i = 0; i < 3; i++)
10080         {
10081           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10082
10083           this_name = AREF (eol_type, i);
10084           this_aliases = Fcons (this_name, Qnil);
10085           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10086           this_spec = Fmake_vector (make_number (3), attrs);
10087           ASET (this_spec, 1, this_aliases);
10088           ASET (this_spec, 2, this_eol_type);
10089           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10090           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10091           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10092           if (NILP (val))
10093             Vcoding_system_alist
10094               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10095                        Vcoding_system_alist);
10096         }
10097     }
10098
10099   spec_vec = Fmake_vector (make_number (3), attrs);
10100   ASET (spec_vec, 1, aliases);
10101   ASET (spec_vec, 2, eol_type);
10102
10103   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10104   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10105   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10106   if (NILP (val))
10107     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10108                                   Vcoding_system_alist);
10109
10110   {
10111     int id = coding_categories[category].id;
10112
10113     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10114       setup_coding_system (name, &coding_categories[category]);
10115   }
10116
10117   return Qnil;
10118
10119  short_args:
10120   return Fsignal (Qwrong_number_of_arguments,
10121                   Fcons (intern ("define-coding-system-internal"),
10122                          make_number (nargs)));
10123 }
10124
10125
10126 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10127        3, 3, 0,
10128        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10129   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10130 {
10131   Lisp_Object spec, attrs;
10132
10133   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10134   attrs = AREF (spec, 0);
10135   if (EQ (prop, QCmnemonic))
10136     {
10137       if (! STRINGP (val))
10138         CHECK_CHARACTER (val);
10139       ASET (attrs, coding_attr_mnemonic, val);
10140     }
10141   else if (EQ (prop, QCdefault_char))
10142     {
10143       if (NILP (val))
10144         val = make_number (' ');
10145       else
10146         CHECK_CHARACTER (val);
10147       ASET (attrs, coding_attr_default_char, val);
10148     }
10149   else if (EQ (prop, QCdecode_translation_table))
10150     {
10151       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10152         CHECK_SYMBOL (val);
10153       ASET (attrs, coding_attr_decode_tbl, val);
10154     }
10155   else if (EQ (prop, QCencode_translation_table))
10156     {
10157       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10158         CHECK_SYMBOL (val);
10159       ASET (attrs, coding_attr_encode_tbl, val);
10160     }
10161   else if (EQ (prop, QCpost_read_conversion))
10162     {
10163       CHECK_SYMBOL (val);
10164       ASET (attrs, coding_attr_post_read, val);
10165     }
10166   else if (EQ (prop, QCpre_write_conversion))
10167     {
10168       CHECK_SYMBOL (val);
10169       ASET (attrs, coding_attr_pre_write, val);
10170     }
10171   else if (EQ (prop, QCascii_compatible_p))
10172     {
10173       ASET (attrs, coding_attr_ascii_compat, val);
10174     }
10175
10176   ASET (attrs, coding_attr_plist,
10177         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10178   return val;
10179 }
10180
10181
10182 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10183        Sdefine_coding_system_alias, 2, 2, 0,
10184        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10185   (Lisp_Object alias, Lisp_Object coding_system)
10186 {
10187   Lisp_Object spec, aliases, eol_type, val;
10188
10189   CHECK_SYMBOL (alias);
10190   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10191   aliases = AREF (spec, 1);
10192   /* ALIASES should be a list of length more than zero, and the first
10193      element is a base coding system.  Append ALIAS at the tail of the
10194      list.  */
10195   while (!NILP (XCDR (aliases)))
10196     aliases = XCDR (aliases);
10197   XSETCDR (aliases, Fcons (alias, Qnil));
10198
10199   eol_type = AREF (spec, 2);
10200   if (VECTORP (eol_type))
10201     {
10202       Lisp_Object subsidiaries;
10203       int i;
10204
10205       subsidiaries = make_subsidiaries (alias);
10206       for (i = 0; i < 3; i++)
10207         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10208                                      AREF (eol_type, i));
10209     }
10210
10211   Fputhash (alias, spec, Vcoding_system_hash_table);
10212   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10213   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10214   if (NILP (val))
10215     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10216                                   Vcoding_system_alist);
10217
10218   return Qnil;
10219 }
10220
10221 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10222        1, 1, 0,
10223        doc: /* Return the base of CODING-SYSTEM.
10224 Any alias or subsidiary coding system is not a base coding system.  */)
10225   (Lisp_Object coding_system)
10226 {
10227   Lisp_Object spec, attrs;
10228
10229   if (NILP (coding_system))
10230     return (Qno_conversion);
10231   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10232   attrs = AREF (spec, 0);
10233   return CODING_ATTR_BASE_NAME (attrs);
10234 }
10235
10236 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10237        1, 1, 0,
10238        doc: "Return the property list of CODING-SYSTEM.")
10239   (Lisp_Object coding_system)
10240 {
10241   Lisp_Object spec, attrs;
10242
10243   if (NILP (coding_system))
10244     coding_system = Qno_conversion;
10245   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10246   attrs = AREF (spec, 0);
10247   return CODING_ATTR_PLIST (attrs);
10248 }
10249
10250
10251 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10252        1, 1, 0,
10253        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10254   (Lisp_Object coding_system)
10255 {
10256   Lisp_Object spec;
10257
10258   if (NILP (coding_system))
10259     coding_system = Qno_conversion;
10260   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10261   return AREF (spec, 1);
10262 }
10263
10264 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10265        Scoding_system_eol_type, 1, 1, 0,
10266        doc: /* Return eol-type of CODING-SYSTEM.
10267 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10268
10269 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10270 and CR respectively.
10271
10272 A vector value indicates that a format of end-of-line should be
10273 detected automatically.  Nth element of the vector is the subsidiary
10274 coding system whose eol-type is N.  */)
10275   (Lisp_Object coding_system)
10276 {
10277   Lisp_Object spec, eol_type;
10278   int n;
10279
10280   if (NILP (coding_system))
10281     coding_system = Qno_conversion;
10282   if (! CODING_SYSTEM_P (coding_system))
10283     return Qnil;
10284   spec = CODING_SYSTEM_SPEC (coding_system);
10285   eol_type = AREF (spec, 2);
10286   if (VECTORP (eol_type))
10287     return Fcopy_sequence (eol_type);
10288   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10289   return make_number (n);
10290 }
10291
10292 #endif /* emacs */
10293
10294 \f
10295 /*** 12. Post-amble ***/
10296
10297 void
10298 init_coding_once (void)
10299 {
10300   int i;
10301
10302   for (i = 0; i < coding_category_max; i++)
10303     {
10304       coding_categories[i].id = -1;
10305       coding_priorities[i] = i;
10306     }
10307
10308   /* ISO2022 specific initialize routine.  */
10309   for (i = 0; i < 0x20; i++)
10310     iso_code_class[i] = ISO_control_0;
10311   for (i = 0x21; i < 0x7F; i++)
10312     iso_code_class[i] = ISO_graphic_plane_0;
10313   for (i = 0x80; i < 0xA0; i++)
10314     iso_code_class[i] = ISO_control_1;
10315   for (i = 0xA1; i < 0xFF; i++)
10316     iso_code_class[i] = ISO_graphic_plane_1;
10317   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10318   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10319   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10320   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10321   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10322   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10323   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10324   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10325   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10326
10327   for (i = 0; i < 256; i++)
10328     {
10329       emacs_mule_bytes[i] = 1;
10330     }
10331   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10332   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10333   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10334   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10335 }
10336
10337 #ifdef emacs
10338
10339 void
10340 syms_of_coding (void)
10341 {
10342   staticpro (&Vcoding_system_hash_table);
10343   {
10344     Lisp_Object args[2];
10345     args[0] = QCtest;
10346     args[1] = Qeq;
10347     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10348   }
10349
10350   staticpro (&Vsjis_coding_system);
10351   Vsjis_coding_system = Qnil;
10352
10353   staticpro (&Vbig5_coding_system);
10354   Vbig5_coding_system = Qnil;
10355
10356   staticpro (&Vcode_conversion_reused_workbuf);
10357   Vcode_conversion_reused_workbuf = Qnil;
10358
10359   staticpro (&Vcode_conversion_workbuf_name);
10360   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10361
10362   reused_workbuf_in_use = 0;
10363
10364   DEFSYM (Qcharset, "charset");
10365   DEFSYM (Qtarget_idx, "target-idx");
10366   DEFSYM (Qcoding_system_history, "coding-system-history");
10367   Fset (Qcoding_system_history, Qnil);
10368
10369   /* Target FILENAME is the first argument.  */
10370   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10371   /* Target FILENAME is the third argument.  */
10372   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10373
10374   DEFSYM (Qcall_process, "call-process");
10375   /* Target PROGRAM is the first argument.  */
10376   Fput (Qcall_process, Qtarget_idx, make_number (0));
10377
10378   DEFSYM (Qcall_process_region, "call-process-region");
10379   /* Target PROGRAM is the third argument.  */
10380   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10381
10382   DEFSYM (Qstart_process, "start-process");
10383   /* Target PROGRAM is the third argument.  */
10384   Fput (Qstart_process, Qtarget_idx, make_number (2));
10385
10386   DEFSYM (Qopen_network_stream, "open-network-stream");
10387   /* Target SERVICE is the fourth argument.  */
10388   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10389
10390   DEFSYM (Qcoding_system, "coding-system");
10391   DEFSYM (Qcoding_aliases, "coding-aliases");
10392
10393   DEFSYM (Qeol_type, "eol-type");
10394   DEFSYM (Qunix, "unix");
10395   DEFSYM (Qdos, "dos");
10396
10397   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10398   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10399   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10400   DEFSYM (Qdefault_char, "default-char");
10401   DEFSYM (Qundecided, "undecided");
10402   DEFSYM (Qno_conversion, "no-conversion");
10403   DEFSYM (Qraw_text, "raw-text");
10404
10405   DEFSYM (Qiso_2022, "iso-2022");
10406
10407   DEFSYM (Qutf_8, "utf-8");
10408   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10409
10410   DEFSYM (Qutf_16, "utf-16");
10411   DEFSYM (Qbig, "big");
10412   DEFSYM (Qlittle, "little");
10413
10414   DEFSYM (Qshift_jis, "shift-jis");
10415   DEFSYM (Qbig5, "big5");
10416
10417   DEFSYM (Qcoding_system_p, "coding-system-p");
10418
10419   DEFSYM (Qcoding_system_error, "coding-system-error");
10420   Fput (Qcoding_system_error, Qerror_conditions,
10421         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10422   Fput (Qcoding_system_error, Qerror_message,
10423         build_pure_c_string ("Invalid coding system"));
10424
10425   /* Intern this now in case it isn't already done.
10426      Setting this variable twice is harmless.
10427      But don't staticpro it here--that is done in alloc.c.  */
10428   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10429
10430   DEFSYM (Qtranslation_table, "translation-table");
10431   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10432   DEFSYM (Qtranslation_table_id, "translation-table-id");
10433   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10434   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10435
10436   DEFSYM (Qvalid_codes, "valid-codes");
10437
10438   DEFSYM (Qemacs_mule, "emacs-mule");
10439
10440   DEFSYM (QCcategory, ":category");
10441   DEFSYM (QCmnemonic, ":mnemonic");
10442   DEFSYM (QCdefault_char, ":default-char");
10443   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10444   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10445   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10446   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10447   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10448
10449   Vcoding_category_table
10450     = Fmake_vector (make_number (coding_category_max), Qnil);
10451   staticpro (&Vcoding_category_table);
10452   /* Followings are target of code detection.  */
10453   ASET (Vcoding_category_table, coding_category_iso_7,
10454         intern_c_string ("coding-category-iso-7"));
10455   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10456         intern_c_string ("coding-category-iso-7-tight"));
10457   ASET (Vcoding_category_table, coding_category_iso_8_1,
10458         intern_c_string ("coding-category-iso-8-1"));
10459   ASET (Vcoding_category_table, coding_category_iso_8_2,
10460         intern_c_string ("coding-category-iso-8-2"));
10461   ASET (Vcoding_category_table, coding_category_iso_7_else,
10462         intern_c_string ("coding-category-iso-7-else"));
10463   ASET (Vcoding_category_table, coding_category_iso_8_else,
10464         intern_c_string ("coding-category-iso-8-else"));
10465   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10466         intern_c_string ("coding-category-utf-8-auto"));
10467   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10468         intern_c_string ("coding-category-utf-8"));
10469   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10470         intern_c_string ("coding-category-utf-8-sig"));
10471   ASET (Vcoding_category_table, coding_category_utf_16_be,
10472         intern_c_string ("coding-category-utf-16-be"));
10473   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10474         intern_c_string ("coding-category-utf-16-auto"));
10475   ASET (Vcoding_category_table, coding_category_utf_16_le,
10476         intern_c_string ("coding-category-utf-16-le"));
10477   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10478         intern_c_string ("coding-category-utf-16-be-nosig"));
10479   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10480         intern_c_string ("coding-category-utf-16-le-nosig"));
10481   ASET (Vcoding_category_table, coding_category_charset,
10482         intern_c_string ("coding-category-charset"));
10483   ASET (Vcoding_category_table, coding_category_sjis,
10484         intern_c_string ("coding-category-sjis"));
10485   ASET (Vcoding_category_table, coding_category_big5,
10486         intern_c_string ("coding-category-big5"));
10487   ASET (Vcoding_category_table, coding_category_ccl,
10488         intern_c_string ("coding-category-ccl"));
10489   ASET (Vcoding_category_table, coding_category_emacs_mule,
10490         intern_c_string ("coding-category-emacs-mule"));
10491   /* Followings are NOT target of code detection.  */
10492   ASET (Vcoding_category_table, coding_category_raw_text,
10493         intern_c_string ("coding-category-raw-text"));
10494   ASET (Vcoding_category_table, coding_category_undecided,
10495         intern_c_string ("coding-category-undecided"));
10496
10497   DEFSYM (Qinsufficient_source, "insufficient-source");
10498   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10499   DEFSYM (Qinvalid_source, "invalid-source");
10500   DEFSYM (Qinterrupted, "interrupted");
10501   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10502   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10503
10504   defsubr (&Scoding_system_p);
10505   defsubr (&Sread_coding_system);
10506   defsubr (&Sread_non_nil_coding_system);
10507   defsubr (&Scheck_coding_system);
10508   defsubr (&Sdetect_coding_region);
10509   defsubr (&Sdetect_coding_string);
10510   defsubr (&Sfind_coding_systems_region_internal);
10511   defsubr (&Sunencodable_char_position);
10512   defsubr (&Scheck_coding_systems_region);
10513   defsubr (&Sdecode_coding_region);
10514   defsubr (&Sencode_coding_region);
10515   defsubr (&Sdecode_coding_string);
10516   defsubr (&Sencode_coding_string);
10517   defsubr (&Sdecode_sjis_char);
10518   defsubr (&Sencode_sjis_char);
10519   defsubr (&Sdecode_big5_char);
10520   defsubr (&Sencode_big5_char);
10521   defsubr (&Sset_terminal_coding_system_internal);
10522   defsubr (&Sset_safe_terminal_coding_system_internal);
10523   defsubr (&Sterminal_coding_system);
10524   defsubr (&Sset_keyboard_coding_system_internal);
10525   defsubr (&Skeyboard_coding_system);
10526   defsubr (&Sfind_operation_coding_system);
10527   defsubr (&Sset_coding_system_priority);
10528   defsubr (&Sdefine_coding_system_internal);
10529   defsubr (&Sdefine_coding_system_alias);
10530   defsubr (&Scoding_system_put);
10531   defsubr (&Scoding_system_base);
10532   defsubr (&Scoding_system_plist);
10533   defsubr (&Scoding_system_aliases);
10534   defsubr (&Scoding_system_eol_type);
10535   defsubr (&Scoding_system_priority_list);
10536
10537   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10538                doc: /* List of coding systems.
10539
10540 Do not alter the value of this variable manually.  This variable should be
10541 updated by the functions `define-coding-system' and
10542 `define-coding-system-alias'.  */);
10543   Vcoding_system_list = Qnil;
10544
10545   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10546                doc: /* Alist of coding system names.
10547 Each element is one element list of coding system name.
10548 This variable is given to `completing-read' as COLLECTION argument.
10549
10550 Do not alter the value of this variable manually.  This variable should be
10551 updated by the functions `make-coding-system' and
10552 `define-coding-system-alias'.  */);
10553   Vcoding_system_alist = Qnil;
10554
10555   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10556                doc: /* List of coding-categories (symbols) ordered by priority.
10557
10558 On detecting a coding system, Emacs tries code detection algorithms
10559 associated with each coding-category one by one in this order.  When
10560 one algorithm agrees with a byte sequence of source text, the coding
10561 system bound to the corresponding coding-category is selected.
10562
10563 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10564   {
10565     int i;
10566
10567     Vcoding_category_list = Qnil;
10568     for (i = coding_category_max - 1; i >= 0; i--)
10569       Vcoding_category_list
10570         = Fcons (AREF (Vcoding_category_table, i),
10571                  Vcoding_category_list);
10572   }
10573
10574   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10575                doc: /* Specify the coding system for read operations.
10576 It is useful to bind this variable with `let', but do not set it globally.
10577 If the value is a coding system, it is used for decoding on read operation.
10578 If not, an appropriate element is used from one of the coding system alists.
10579 There are three such tables: `file-coding-system-alist',
10580 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10581   Vcoding_system_for_read = Qnil;
10582
10583   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10584                doc: /* Specify the coding system for write operations.
10585 Programs bind this variable with `let', but you should not set it globally.
10586 If the value is a coding system, it is used for encoding of output,
10587 when writing it to a file and when sending it to a file or subprocess.
10588
10589 If this does not specify a coding system, an appropriate element
10590 is used from one of the coding system alists.
10591 There are three such tables: `file-coding-system-alist',
10592 `process-coding-system-alist', and `network-coding-system-alist'.
10593 For output to files, if the above procedure does not specify a coding system,
10594 the value of `buffer-file-coding-system' is used.  */);
10595   Vcoding_system_for_write = Qnil;
10596
10597   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10598                doc: /*
10599 Coding system used in the latest file or process I/O.  */);
10600   Vlast_coding_system_used = Qnil;
10601
10602   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10603                doc: /*
10604 Error status of the last code conversion.
10605
10606 When an error was detected in the last code conversion, this variable
10607 is set to one of the following symbols.
10608   `insufficient-source'
10609   `inconsistent-eol'
10610   `invalid-source'
10611   `interrupted'
10612   `insufficient-memory'
10613 When no error was detected, the value doesn't change.  So, to check
10614 the error status of a code conversion by this variable, you must
10615 explicitly set this variable to nil before performing code
10616 conversion.  */);
10617   Vlast_code_conversion_error = Qnil;
10618
10619   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10620                doc: /*
10621 *Non-nil means always inhibit code conversion of end-of-line format.
10622 See info node `Coding Systems' and info node `Text and Binary' concerning
10623 such conversion.  */);
10624   inhibit_eol_conversion = 0;
10625
10626   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10627                doc: /*
10628 Non-nil means process buffer inherits coding system of process output.
10629 Bind it to t if the process output is to be treated as if it were a file
10630 read from some filesystem.  */);
10631   inherit_process_coding_system = 0;
10632
10633   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10634                doc: /*
10635 Alist to decide a coding system to use for a file I/O operation.
10636 The format is ((PATTERN . VAL) ...),
10637 where PATTERN is a regular expression matching a file name,
10638 VAL is a coding system, a cons of coding systems, or a function symbol.
10639 If VAL is a coding system, it is used for both decoding and encoding
10640 the file contents.
10641 If VAL is a cons of coding systems, the car part is used for decoding,
10642 and the cdr part is used for encoding.
10643 If VAL is a function symbol, the function must return a coding system
10644 or a cons of coding systems which are used as above.  The function is
10645 called with an argument that is a list of the arguments with which
10646 `find-operation-coding-system' was called.  If the function can't decide
10647 a coding system, it can return `undecided' so that the normal
10648 code-detection is performed.
10649
10650 See also the function `find-operation-coding-system'
10651 and the variable `auto-coding-alist'.  */);
10652   Vfile_coding_system_alist = Qnil;
10653
10654   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10655                doc: /*
10656 Alist to decide a coding system to use for a process I/O operation.
10657 The format is ((PATTERN . VAL) ...),
10658 where PATTERN is a regular expression matching a program name,
10659 VAL is a coding system, a cons of coding systems, or a function symbol.
10660 If VAL is a coding system, it is used for both decoding what received
10661 from the program and encoding what sent to the program.
10662 If VAL is a cons of coding systems, the car part is used for decoding,
10663 and the cdr part is used for encoding.
10664 If VAL is a function symbol, the function must return a coding system
10665 or a cons of coding systems which are used as above.
10666
10667 See also the function `find-operation-coding-system'.  */);
10668   Vprocess_coding_system_alist = Qnil;
10669
10670   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10671                doc: /*
10672 Alist to decide a coding system to use for a network I/O operation.
10673 The format is ((PATTERN . VAL) ...),
10674 where PATTERN is a regular expression matching a network service name
10675 or is a port number to connect to,
10676 VAL is a coding system, a cons of coding systems, or a function symbol.
10677 If VAL is a coding system, it is used for both decoding what received
10678 from the network stream and encoding what sent to the network stream.
10679 If VAL is a cons of coding systems, the car part is used for decoding,
10680 and the cdr part is used for encoding.
10681 If VAL is a function symbol, the function must return a coding system
10682 or a cons of coding systems which are used as above.
10683
10684 See also the function `find-operation-coding-system'.  */);
10685   Vnetwork_coding_system_alist = Qnil;
10686
10687   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10688                doc: /* Coding system to use with system messages.
10689 Also used for decoding keyboard input on X Window system.  */);
10690   Vlocale_coding_system = Qnil;
10691
10692   /* The eol mnemonics are reset in startup.el system-dependently.  */
10693   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10694                doc: /*
10695 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10696   eol_mnemonic_unix = build_pure_c_string (":");
10697
10698   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10699                doc: /*
10700 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10701   eol_mnemonic_dos = build_pure_c_string ("\\");
10702
10703   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10704                doc: /*
10705 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10706   eol_mnemonic_mac = build_pure_c_string ("/");
10707
10708   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10709                doc: /*
10710 *String displayed in mode line when end-of-line format is not yet determined.  */);
10711   eol_mnemonic_undecided = build_pure_c_string (":");
10712
10713   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10714                doc: /*
10715 *Non-nil enables character translation while encoding and decoding.  */);
10716   Venable_character_translation = Qt;
10717
10718   DEFVAR_LISP ("standard-translation-table-for-decode",
10719                Vstandard_translation_table_for_decode,
10720                doc: /* Table for translating characters while decoding.  */);
10721   Vstandard_translation_table_for_decode = Qnil;
10722
10723   DEFVAR_LISP ("standard-translation-table-for-encode",
10724                Vstandard_translation_table_for_encode,
10725                doc: /* Table for translating characters while encoding.  */);
10726   Vstandard_translation_table_for_encode = Qnil;
10727
10728   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10729                doc: /* Alist of charsets vs revision numbers.
10730 While encoding, if a charset (car part of an element) is found,
10731 designate it with the escape sequence identifying revision (cdr part
10732 of the element).  */);
10733   Vcharset_revision_table = Qnil;
10734
10735   DEFVAR_LISP ("default-process-coding-system",
10736                Vdefault_process_coding_system,
10737                doc: /* Cons of coding systems used for process I/O by default.
10738 The car part is used for decoding a process output,
10739 the cdr part is used for encoding a text to be sent to a process.  */);
10740   Vdefault_process_coding_system = Qnil;
10741
10742   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10743                doc: /*
10744 Table of extra Latin codes in the range 128..159 (inclusive).
10745 This is a vector of length 256.
10746 If Nth element is non-nil, the existence of code N in a file
10747 \(or output of subprocess) doesn't prevent it to be detected as
10748 a coding system of ISO 2022 variant which has a flag
10749 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10750 or reading output of a subprocess.
10751 Only 128th through 159th elements have a meaning.  */);
10752   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10753
10754   DEFVAR_LISP ("select-safe-coding-system-function",
10755                Vselect_safe_coding_system_function,
10756                doc: /*
10757 Function to call to select safe coding system for encoding a text.
10758
10759 If set, this function is called to force a user to select a proper
10760 coding system which can encode the text in the case that a default
10761 coding system used in each operation can't encode the text.  The
10762 function should take care that the buffer is not modified while
10763 the coding system is being selected.
10764
10765 The default value is `select-safe-coding-system' (which see).  */);
10766   Vselect_safe_coding_system_function = Qnil;
10767
10768   DEFVAR_BOOL ("coding-system-require-warning",
10769                coding_system_require_warning,
10770                doc: /* Internal use only.
10771 If non-nil, on writing a file, `select-safe-coding-system-function' is
10772 called even if `coding-system-for-write' is non-nil.  The command
10773 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10774   coding_system_require_warning = 0;
10775
10776
10777   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10778                inhibit_iso_escape_detection,
10779                doc: /*
10780 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10781
10782 When Emacs reads text, it tries to detect how the text is encoded.
10783 This code detection is sensitive to escape sequences.  If Emacs sees
10784 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10785 of the ISO2022 encodings, and decodes text by the corresponding coding
10786 system (e.g. `iso-2022-7bit').
10787
10788 However, there may be a case that you want to read escape sequences in
10789 a file as is.  In such a case, you can set this variable to non-nil.
10790 Then the code detection will ignore any escape sequences, and no text is
10791 detected as encoded in some ISO-2022 encoding.  The result is that all
10792 escape sequences become visible in a buffer.
10793
10794 The default value is nil, and it is strongly recommended not to change
10795 it.  That is because many Emacs Lisp source files that contain
10796 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10797 in Emacs's distribution, and they won't be decoded correctly on
10798 reading if you suppress escape sequence detection.
10799
10800 The other way to read escape sequences in a file without decoding is
10801 to explicitly specify some coding system that doesn't use ISO-2022
10802 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10803   inhibit_iso_escape_detection = 0;
10804
10805   DEFVAR_BOOL ("inhibit-null-byte-detection",
10806                inhibit_null_byte_detection,
10807                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10808 By default, Emacs treats it as binary data, and does not attempt to
10809 decode it.  The effect is as if you specified `no-conversion' for
10810 reading that text.
10811
10812 Set this to non-nil when a regular text happens to include null bytes.
10813 Examples are Index nodes of Info files and null-byte delimited output
10814 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10815 decode text as usual.  */);
10816   inhibit_null_byte_detection = 0;
10817
10818   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10819                doc: /* Char table for translating self-inserting characters.
10820 This is applied to the result of input methods, not their input.
10821 See also `keyboard-translate-table'.
10822
10823 Use of this variable for character code unification was rendered
10824 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10825 internal character representation.  */);
10826     Vtranslation_table_for_input = Qnil;
10827
10828   {
10829     Lisp_Object args[coding_arg_max];
10830     Lisp_Object plist[16];
10831     int i;
10832
10833     for (i = 0; i < coding_arg_max; i++)
10834       args[i] = Qnil;
10835
10836     plist[0] = intern_c_string (":name");
10837     plist[1] = args[coding_arg_name] = Qno_conversion;
10838     plist[2] = intern_c_string (":mnemonic");
10839     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10840     plist[4] = intern_c_string (":coding-type");
10841     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10842     plist[6] = intern_c_string (":ascii-compatible-p");
10843     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10844     plist[8] = intern_c_string (":default-char");
10845     plist[9] = args[coding_arg_default_char] = make_number (0);
10846     plist[10] = intern_c_string (":for-unibyte");
10847     plist[11] = args[coding_arg_for_unibyte] = Qt;
10848     plist[12] = intern_c_string (":docstring");
10849     plist[13] = build_pure_c_string ("Do no conversion.\n\
10850 \n\
10851 When you visit a file with this coding, the file is read into a\n\
10852 unibyte buffer as is, thus each byte of a file is treated as a\n\
10853 character.");
10854     plist[14] = intern_c_string (":eol-type");
10855     plist[15] = args[coding_arg_eol_type] = Qunix;
10856     args[coding_arg_plist] = Flist (16, plist);
10857     Fdefine_coding_system_internal (coding_arg_max, args);
10858
10859     plist[1] = args[coding_arg_name] = Qundecided;
10860     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10861     plist[5] = args[coding_arg_coding_type] = Qundecided;
10862     /* This is already set.
10863        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10864     plist[8] = intern_c_string (":charset-list");
10865     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10866     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10867     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10868     plist[15] = args[coding_arg_eol_type] = Qnil;
10869     args[coding_arg_plist] = Flist (16, plist);
10870     Fdefine_coding_system_internal (coding_arg_max, args);
10871   }
10872
10873   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10874
10875   {
10876     int i;
10877
10878     for (i = 0; i < coding_category_max; i++)
10879       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10880   }
10881 #if defined (DOS_NT)
10882   system_eol_type = Qdos;
10883 #else
10884   system_eol_type = Qunix;
10885 #endif
10886   staticpro (&system_eol_type);
10887 }
10888
10889 /* Return the message strerror gives for ERROR_NUMBER, after
10890    synchronizing the system-messages locale.  If Vlocale_coding_system
10891    is non-nil, return instead the data pointer of the Lisp string that
10892    code_convert_string_norecord produces from that message.
10893    NOTE(review): the pointer presumably lives only as long as that
10894    Lisp string; confirm callers use it promptly.  */
10895 char *
10896 emacs_strerror (int error_number)
10897 {
10898   char *message;
10899
10900   synchronize_system_messages_locale ();
10901   message = strerror (error_number);
10902   if (NILP (Vlocale_coding_system))
10903     return message;
10904   return SSDATA (code_convert_string_norecord (build_string (message),
10905                                                Vlocale_coding_system, 0));
10906 }
10907
10908 #endif /* emacs */