src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165   /* ONE_MORE_BYTE below also expects `c' and `src_base' to be declared.  */
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Reject on a non-conforming byte; remember strong evidence for XXX.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216   /* ONE_MORE_BYTE below also expects `c' and `consumed_chars' to be declared.  */
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* Stop early enough that one more iteration cannot overrun DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302 /* Lisp symbols (Qfoo) and keywords (QCfoo) by Emacs naming convention; their initialization is not visible here.  */
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324 /* NOTE(review): these look like symbols for the CODING_RESULT_XXX conversion results -- confirm.  */
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following variable (only one is defined here).  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems: Shift-JIS and BIG5 (see section 8).  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365 /* Integer element REG of the coding_attr_iso_initial attribute of CODING.  */
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371 /* Entry for CHARSET_ID in CODING->safe_charsets, or -1 if CHARSET_ID exceeds max_charset_id or the entry is 255.  */
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379 /* Accessors for the members of CODING->spec.iso_2022.  */
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022; each has its own class in enum iso_code_class_type.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overwrite
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480 /* NOTE(review): the flags below carry no description here; their meaning is not established in this part of the file.  */
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488 /* Bits 0x20000 through 0x80000 are not assigned by the definitions here.  */
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section: accessors for the coding_attr_ccl_XXX attributes of CODING (CODING_CCL_VALIDS yields the SDATA of its attribute).  */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories' */
 519 /* The enumerator values are also used as bit positions by the CATEGORY_MASK_XXX macros.  */
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX: one bit per category, at the position of its enum coding_category value.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* This value is returned if detect_coding_mask () finds nothing other
 570    than ASCII characters.  It does not include CATEGORY_MASK_RAW_TEXT.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592 /* Unions of the per-category CATEGORY_MASK_XXX bits, grouped by family.  */
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601 /* Every ISO category except iso-8-1 and iso-8-2.  */
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607 /* All six ISO categories.  */
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority; it has coding_category_max slots.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category (N is an enum coding_category value).  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637 /* min and max evaluate one of their arguments twice; do not pass expressions with side effects.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to CODING_ATTR_CHARSET_LIST (ATTRS).  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed by SRC which ends
 653    at SRC_END, and set C to that byte.  If the source is exhausted, jump
 654    to `no_more_source'.  If multibytep is nonzero, a two-byte sequence
 655    led by 0xC0 or 0xC1 yields one byte; any other multibyte character
 656    sets C to the negated character code and records INVALID_SRC.  The
 657    caller should declare and set these variables in advance:
 658         src, src_end, src_base, multibytep, consumed_chars, coding */
 659
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 689    a multibyte character is found for C2, set C2 to -1 (SRC is then
 690    left just after that character's leading byte).  The caller should
 691    declare and set these variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724
 725 /* Store a byte C in the place pointed by DST and increment DST to the
 726    next free point, and increment PRODUCED_CHARS.  The caller should
 727    assure that C is 0..127, and declare and set the variables `dst'
 728    and `produced_chars' appropriately in advance.
 729 */
 730
 731
 732 #define EMIT_ONE_ASCII_BYTE(c)  \
 733   do {                          \
 734     produced_chars++;           \
 735     *dst++ = (c);               \
 736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add 2 to PRODUCED_CHARS.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
 748 /* Store a byte C in the place pointed by DST and increment DST to the
 749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 750    nonzero, store in an appropriate multibyte form.  The caller should
 751    declare and set the variables `dst', `multibytep' and
 752    `produced_chars' appropriately in advance.  */
 753
 754 #define EMIT_ONE_BYTE(c)                \
 755   do {                                  \
 756     produced_chars++;                   \
 757     if (multibytep)                     \
 758       {                                 \
 759         unsigned ch = (c);              \
 760         if (ch >= 0x80)                 \
 761           ch = BYTE8_TO_CHAR (ch);      \
 762         CHAR_STRING_ADVANCE (ch, dst);  \
 763       }                                 \
 764     else                                \
 765       *dst++ = (c);                     \
 766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
 809 /* Prototypes for static functions.  */
 810 static void record_conversion_result (struct coding_system *coding,
 811                                       enum coding_result_code result);
 812 static int detect_coding_utf_8 (struct coding_system *,
 813                                 struct coding_detection_info *info);
 814 static void decode_coding_utf_8 (struct coding_system *);
 815 static int encode_coding_utf_8 (struct coding_system *);
 816
 817 static int detect_coding_utf_16 (struct coding_system *,
 818                                  struct coding_detection_info *info);
 819 static void decode_coding_utf_16 (struct coding_system *);
 820 static int encode_coding_utf_16 (struct coding_system *);
 821
 822 static int detect_coding_iso_2022 (struct coding_system *,
 823                                    struct coding_detection_info *info);
 824 static void decode_coding_iso_2022 (struct coding_system *);
 825 static int encode_coding_iso_2022 (struct coding_system *);
 826
 827 static int detect_coding_emacs_mule (struct coding_system *,
 828                                      struct coding_detection_info *info);
 829 static void decode_coding_emacs_mule (struct coding_system *);
 830 static int encode_coding_emacs_mule (struct coding_system *);
 831
 832 static int detect_coding_sjis (struct coding_system *,
 833                                struct coding_detection_info *info);
 834 static void decode_coding_sjis (struct coding_system *);
 835 static int encode_coding_sjis (struct coding_system *);
 836
 837 static int detect_coding_big5 (struct coding_system *,
 838                                struct coding_detection_info *info);
 839 static void decode_coding_big5 (struct coding_system *);
 840 static int encode_coding_big5 (struct coding_system *);
 841
 842 static int detect_coding_ccl (struct coding_system *,
 843                               struct coding_detection_info *info);
 844 static void decode_coding_ccl (struct coding_system *);
 845 static int encode_coding_ccl (struct coding_system *);
 846
 847 static void decode_coding_raw_text (struct coding_system *);
 848 static int encode_coding_raw_text (struct coding_system *);
 849
 850 static void coding_set_source (struct coding_system *);
 851 static ptrdiff_t coding_change_source (struct coding_system *);
 852 static void coding_set_destination (struct coding_system *);
 853 static ptrdiff_t coding_change_destination (struct coding_system *);
 854 static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
 855 static void coding_alloc_by_making_gap (struct coding_system *,
 856                                         ptrdiff_t, ptrdiff_t);
 857 static unsigned char *alloc_destination (struct coding_system *,
 858                                          ptrdiff_t, unsigned char *);
 859 static void setup_iso_safe_charsets (Lisp_Object);
 860 static ptrdiff_t encode_designation_at_bol (struct coding_system *,
 861                                       int *, int *, unsigned char *);
 862 static int detect_eol (const unsigned char *,
 863                        ptrdiff_t, enum coding_category);
 864 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 865 static void decode_eol (struct coding_system *);
 866 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 867 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 868 static int produce_chars (struct coding_system *, Lisp_Object, int);
 869 static inline void produce_charset (struct coding_system *, int *,
 870                                     ptrdiff_t);
 871 static void produce_annotation (struct coding_system *, ptrdiff_t);
 872 static int decode_coding (struct coding_system *);
 873 static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
 874                                                   struct coding_system *,
 875                                                   int *, ptrdiff_t *);
 876 static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
 877                                               struct coding_system *,
 878                                               int *, ptrdiff_t *);
 879 static void consume_chars (struct coding_system *, Lisp_Object, int);
 880 static int encode_coding (struct coding_system *);
 881 static Lisp_Object make_conversion_work_buffer (int);
 882 static Lisp_Object code_conversion_restore (Lisp_Object);
 883 static inline int char_encodable_p (int, Lisp_Object);
 884 static Lisp_Object make_subsidiaries (Lisp_Object);
 885
 886 static void
 887 record_conversion_result (struct coding_system *coding,
 888                           enum coding_result_code result)
 889 {
 890   coding->result = result;
 891   switch (result)
 892     {
 893     case CODING_RESULT_INSUFFICIENT_SRC:
 894       Vlast_code_conversion_error = Qinsufficient_source;
 895       break;
 896     case CODING_RESULT_INCONSISTENT_EOL:
 897       Vlast_code_conversion_error = Qinconsistent_eol;
 898       break;
 899     case CODING_RESULT_INVALID_SRC:
 900       Vlast_code_conversion_error = Qinvalid_source;
 901       break;
 902     case CODING_RESULT_INTERRUPT:
 903       Vlast_code_conversion_error = Qinterrupted;
 904       break;
 905     case CODING_RESULT_INSUFFICIENT_MEM:
 906       Vlast_code_conversion_error = Qinsufficient_memory;
 907       break;
 908     case CODING_RESULT_INSUFFICIENT_DST:
 909       /* Don't record this error in Vlast_code_conversion_error
 910          because it happens just temporarily and is resolved when the
 911          whole conversion is finished.  */
 912       break;
 913     case CODING_RESULT_SUCCESS:
 914       break;
 915     default:
 916       Vlast_code_conversion_error = intern ("Unknown error");
 917     }
 918 }
 919
 920 /* These wrapper macros are used to preserve validity of pointers into
 921    buffer text across calls to decode_char, encode_char, etc, which
 922    could cause relocation of buffers if it loads a charset map,
 923    because loading a charset map allocates large structures.  */
 924
 925 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 926   do {                                                                       \
 927     ptrdiff_t offset;                                                        \
 928                                                                              \
 929     charset_map_loaded = 0;                                                  \
 930     c = DECODE_CHAR (charset, code);                                         \
 931     if (charset_map_loaded                                                   \
 932         && (offset = coding_change_source (coding)))                         \
 933       {                                                                      \
 934         src += offset;                                                       \
 935         src_base += offset;                                                  \
 936         src_end += offset;                                                   \
 937       }                                                                      \
 938   } while (0)
 939
 940 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 941   do {                                                                  \
 942     ptrdiff_t offset;                                                   \
 943                                                                         \
 944     charset_map_loaded = 0;                                             \
 945     code = ENCODE_CHAR (charset, c);                                    \
 946     if (charset_map_loaded                                              \
 947         && (offset = coding_change_destination (coding)))               \
 948       {                                                                 \
 949         dst += offset;                                                  \
 950         dst_end += offset;                                              \
 951       }                                                                 \
 952   } while (0)
 953
 954 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 955   do {                                                                  \
 956     ptrdiff_t offset;                                                   \
 957                                                                         \
 958     charset_map_loaded = 0;                                             \
 959     charset = char_charset (c, charset_list, code_return);              \
 960     if (charset_map_loaded                                              \
 961         && (offset = coding_change_destination (coding)))               \
 962       {                                                                 \
 963         dst += offset;                                                  \
 964         dst_end += offset;                                              \
 965       }                                                                 \
 966   } while (0)
 967
 968 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 969   do {                                                                  \
 970     ptrdiff_t offset;                                                   \
 971                                                                         \
 972     charset_map_loaded = 0;                                             \
 973     result = CHAR_CHARSET_P (c, charset);                               \
 974     if (charset_map_loaded                                              \
 975         && (offset = coding_change_destination (coding)))               \
 976       {                                                                 \
 977         dst += offset;                                                  \
 978         dst_end += offset;                                              \
 979       }                                                                 \
 980   } while (0)
 981
 982
 983 /* If there are at least BYTES length of room at dst, allocate memory
 984    for coding->destination and update dst and dst_end.  We don't have
 985    to take care of coding->source which will be relocated.  It is
 986    handled by calling coding_set_source in encode_coding.  */
 987
 988 #define ASSURE_DESTINATION(bytes)                               \
 989   do {                                                          \
 990     if (dst + (bytes) >= dst_end)                               \
 991       {                                                         \
 992         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 993                                                                 \
 994         dst = alloc_destination (coding, more_bytes, dst);      \
 995         dst_end = coding->destination + coding->dst_bytes;      \
 996       }                                                         \
 997   } while (0)
 998
 999
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The length of the form is chosen by comparing C against
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; any larger C is handed to
   BYTE8_STRING as C - 0x3FFF80.

   Both arguments are evaluated more than once, so they must not have
   side effects.  Every use of C is parenthesized, so that an
   expression argument such as `hi | lo' is encoded correctly in all
   branches.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1029
1030
1031 /* Return the character code of character whose multibyte form is at
1032    P, and advance P to the end of the multibyte form.  This is like
1033    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
1034
1035 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1036   (!((p)[0] & 0x80)                                             \
1037    ? *(p)++                                                     \
1038    : ! ((p)[0] & 0x20)                                          \
1039    ? ((p) += 2,                                                 \
1040       ((((p)[-2] & 0x1F) << 6)                                  \
1041        | ((p)[-1] & 0x3F)                                       \
1042        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1043    : ! ((p)[0] & 0x10)                                          \
1044    ? ((p) += 3,                                                 \
1045       ((((p)[-3] & 0x0F) << 12)                                 \
1046        | (((p)[-2] & 0x3F) << 6)                                \
1047        | ((p)[-1] & 0x3F)))                                     \
1048    : ! ((p)[0] & 0x08)                                          \
1049    ? ((p) += 4,                                                 \
1050       ((((p)[-4] & 0xF) << 18)                                  \
1051        | (((p)[-3] & 0x3F) << 12)                               \
1052        | (((p)[-2] & 0x3F) << 6)                                \
1053        | ((p)[-1] & 0x3F)))                                     \
1054    : ((p) += 5,                                                 \
1055       ((((p)[-4] & 0x3F) << 18)                                 \
1056        | (((p)[-3] & 0x3F) << 12)                               \
1057        | (((p)[-2] & 0x3F) << 6)                                \
1058        | ((p)[-1] & 0x3F))))
1059
1060
1061 /* Set coding->source from coding->src_object.  */
1062
1063 static void
1064 coding_set_source (struct coding_system *coding)
1065 {
1066   if (BUFFERP (coding->src_object))
1067     {
1068       struct buffer *buf = XBUFFER (coding->src_object);
1069
1070       if (coding->src_pos < 0)
1071         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1072       else
1073         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1074     }
1075   else if (STRINGP (coding->src_object))
1076     {
1077       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1078     }
1079   else
1080     {
1081       /* Otherwise, the source is C string and is never relocated
1082          automatically.  Thus we don't have to update anything.  */
1083     }
1084 }
1085
1086
1087 /* Set coding->source from coding->src_object, and return how many
1088    bytes coding->source was changed.  */
1089
1090 static ptrdiff_t
1091 coding_change_source (struct coding_system *coding)
1092 {
1093   const unsigned char *orig = coding->source;
1094   coding_set_source (coding);
1095   return coding->source - orig;
1096 }
1097
1098
1099 /* Set coding->destination from coding->dst_object.  */
1100
1101 static void
1102 coding_set_destination (struct coding_system *coding)
1103 {
1104   if (BUFFERP (coding->dst_object))
1105     {
1106       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1107         {
1108           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1109           coding->dst_bytes = (GAP_END_ADDR
1110                                - (coding->src_bytes - coding->consumed)
1111                                - coding->destination);
1112         }
1113       else
1114         {
1115           /* We are sure that coding->dst_pos_byte is before the gap
1116              of the buffer. */
1117           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1118                                  + coding->dst_pos_byte - BEG_BYTE);
1119           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1120                                - coding->destination);
1121         }
1122     }
1123   else
1124     {
1125       /* Otherwise, the destination is C string and is never relocated
1126          automatically.  Thus we don't have to update anything.  */
1127     }
1128 }
1129
1130
1131 /* Set coding->destination from coding->dst_object, and return how
1132    many bytes coding->destination was changed.  */
1133
1134 static ptrdiff_t
1135 coding_change_destination (struct coding_system *coding)
1136 {
1137   const unsigned char *orig = coding->destination;
1138   coding_set_destination (coding);
1139   return coding->destination - orig;
1140 }
1141
1142
1143 static void
1144 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1145 {
1146   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1147     string_overflow ();
1148   coding->destination = (unsigned char *) xrealloc (coding->destination,
1149                                                     coding->dst_bytes + bytes);
1150   coding->dst_bytes += bytes;
1151 }
1152
1153 static void
1154 coding_alloc_by_making_gap (struct coding_system *coding,
1155                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1156 {
1157   if (EQ (coding->src_object, coding->dst_object))
1158     {
1159       /* The gap may contain the produced data at the head and not-yet
1160          consumed data at the tail.  To preserve those data, we at
1161          first make the gap size to zero, then increase the gap
1162          size.  */
1163       ptrdiff_t add = GAP_SIZE;
1164
1165       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1166       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1167       make_gap (bytes);
1168       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1169       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1170     }
1171   else
1172     {
1173       Lisp_Object this_buffer;
1174
1175       this_buffer = Fcurrent_buffer ();
1176       set_buffer_internal (XBUFFER (coding->dst_object));
1177       make_gap (bytes);
1178       set_buffer_internal (XBUFFER (this_buffer));
1179     }
1180 }
1181
1182
1183 static unsigned char *
1184 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1185                    unsigned char *dst)
1186 {
1187   ptrdiff_t offset = dst - coding->destination;
1188
1189   if (BUFFERP (coding->dst_object))
1190     {
1191       struct buffer *buf = XBUFFER (coding->dst_object);
1192
1193       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1194     }
1195   else
1196     coding_alloc_by_realloc (coding, nbytes);
1197   coding_set_destination (coding);
1198   dst = coding->destination + offset;
1199   return dst;
1200 }
1201
1202 /** Macros for annotations.  */
1203
1204 /* An annotation data is stored in the array coding->charbuf in this
1205    format:
1206      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1207    LENGTH is the number of elements in the annotation.
1208    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1209    NCHARS is the number of characters in the text annotated.
1210
1211    The format of the following elements depends on ANNOTATION_MASK.
1212
1213    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1214    follow:
1215      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1216
1217    NBYTES is the number of bytes specified in the header part of
1218    old-style emacs-mule encoding, or 0 for the other kind of
1219    composition.
1220
1221    METHOD is one of enum composition_method.
1222
1223    Optional COMPOSITION-COMPONENTS are characters and composition
1224    rules.
1225
1226    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1227    follows.
1228
1229    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1230    recover from an invalid annotation, and should be skipped by
1231    produce_annotation.  */
1232
1233 /* Maximum length of the header of annotation data.  */
1234 #define MAX_ANNOTATION_LENGTH 5
1235
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated to 1.  The
   caller must have a variable `coding' in scope.

   The expansion ends without a semicolon, so that an invocation
   followed by `;' forms exactly one statement and can be used as the
   body of an unbraced `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1243
/* Store a composition annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA with length 5 and mask
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   Every argument is parenthesized, so BUF may be any lvalue
   expression, e.g. `*slot'.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1250
1251
/* Store a charset annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA with length 4 and mask
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  Every
   argument is parenthesized, so BUF may be any lvalue expression,
   e.g. `*slot'.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1257
1258 \f
1259 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1260
1261
1262
1263 \f
1264 /*** 3. UTF-8 ***/
1265
1266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1267    Check if a text is encoded in UTF-8.  If it is, return 1, else
1268    return 0.  */
1269
     /* Predicates classifying a single octet C by its high bits: a
        one-octet (ASCII) character, an extra (non-leading) octet
        10xxxxxx, or the leading octet of a 2-, 3-, 4- or 5-octet
        sequence.  */
1270 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1271 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1272 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1273 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1274 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1275 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1276
     /* The three octets of the UTF-8 signature (BOM); as a 3-octet
        sequence they decode to U+FEFF.  */
1277 #define UTF_8_BOM_1 0xEF
1278 #define UTF_8_BOM_2 0xBB
1279 #define UTF_8_BOM_3 0xBF
1280
     /* Check whether the source bytes of CODING are well-formed as
        UTF-8, recording the outcome in DETECT_INFO.

        CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.
        Scanning starts after the first CODING->head_ascii bytes.  Each
        sequence must consist of a leading octet followed by the right
        number of extra octets (up to 5 octets in total); anything else
        adds CATEGORY_MASK_UTF_8 to DETECT_INFO->rejected and returns 0.

        The `no_more_source' label is not jumped to from the code visible
        here; presumably ONE_MORE_BYTE (defined earlier in the file) goes
        there when the source is exhausted -- confirm against the macro.
        At that point the text is rejected if a sequence was left
        incomplete in the last block; otherwise 1 is returned and the
        signature / no-signature categories are marked as found or
        rejected depending on whether the text started with the BOM and
        whether any multi-octet sequence was seen.  */
1281 static int
1282 detect_coding_utf_8 (struct coding_system *coding,
1283                      struct coding_detection_info *detect_info)
1284 {
1285   const unsigned char *src = coding->source, *src_base;
1286   const unsigned char *src_end = coding->source + coding->src_bytes;
1287   int multibytep = coding->src_multibyte;
1288   ptrdiff_t consumed_chars = 0;
1289   int bom_found = 0;
1290   int found = 0;
1291
1292   detect_info->checked |= CATEGORY_MASK_UTF_8;
1293   /* A coding system of this category is always ASCII compatible.  */
1294   src += coding->head_ascii;
1295
1296   while (1)
1297     {
1298       int c, c1, c2, c3, c4;
1299
           /* Remember where this sequence starts, so that an incomplete
              sequence and a BOM at the very beginning can be recognized.  */
1300       src_base = src;
1301       ONE_MORE_BYTE (c);
1302       if (c < 0 || UTF_8_1_OCTET_P (c))
1303         continue;
1304       ONE_MORE_BYTE (c1);
1305       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1306         break;
1307       if (UTF_8_2_OCTET_LEADING_P (c))
1308         {
1309           found = 1;
1310           continue;
1311         }
1312       ONE_MORE_BYTE (c2);
1313       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1314         break;
1315       if (UTF_8_3_OCTET_LEADING_P (c))
1316         {
1317           found = 1;
               /* Only a signature at the very start of the source counts.  */
1318           if (src_base == coding->source
1319               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1320             bom_found = 1;
1321           continue;
1322         }
1323       ONE_MORE_BYTE (c3);
1324       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1325         break;
1326       if (UTF_8_4_OCTET_LEADING_P (c))
1327         {
1328           found = 1;
1329           continue;
1330         }
1331       ONE_MORE_BYTE (c4);
1332       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1333         break;
1334       if (UTF_8_5_OCTET_LEADING_P (c))
1335         {
1336           found = 1;
1337           continue;
1338         }
1339       break;
1340     }
       /* The loop is left by `break' only on a malformed sequence.  */
1341   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1342   return 0;
1343
1344  no_more_source:
       /* Bytes read past SRC_BASE belong to a sequence that was cut
          short; that is an error only in the last block.  */
1345   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1346     {
1347       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1348       return 0;
1349     }
1350   if (bom_found)
1351     {
1352       /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean
              a BOM, so the category without signature is reported as
              found too.  */
1353       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1354     }
1355   else
1356     {
1357       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1358       if (found)
1359         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1360     }
1361   return 1;
1362 }
1363
1364
     /* Decode the UTF-8 source of CODING, starting CODING->consumed bytes
        into it, and append the resulting character codes to
        CODING->charbuf until the charbuf is full or the source runs out.

        A leading signature (EF BB BF) is skipped unless the coding system
        is `utf_without_bom'.  Overlong sequences, surrogates and any
        other malformed sequence are not decoded as a unit: the first byte
        is stored alone (as a raw byte via BYTE8_TO_CHAR unless it is
        ASCII), CODING->errors is incremented, and decoding resumes at the
        next byte.

        On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  ONE_MORE_BYTE is defined
        earlier in the file; presumably it jumps to `no_more_source' when
        the source is exhausted and maintains `consumed_chars' -- confirm
        against the macro.  */
1365 static void
1366 decode_coding_utf_8 (struct coding_system *coding)
1367 {
1368   const unsigned char *src = coding->source + coding->consumed;
1369   const unsigned char *src_end = coding->source + coding->src_bytes;
1370   const unsigned char *src_base;
1371   int *charbuf = coding->charbuf + coding->charbuf_used;
1372   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1373   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1374   int multibytep = coding->src_multibyte;
1375   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1376   int eol_dos =
1377     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR when EOL_DOS is set, or -1 if
          there is none pending.  */
1378   int byte_after_cr = -1;
1379
       /* Skip a signature at the start; on any mismatch rewind SRC to
          SRC_BASE so that the bytes are decoded as ordinary text.  */
1380   if (bom != utf_without_bom)
1381     {
1382       int c1, c2, c3;
1383
1384       src_base = src;
1385       ONE_MORE_BYTE (c1);
1386       if (! UTF_8_3_OCTET_LEADING_P (c1))
1387         src = src_base;
1388       else
1389         {
1390           ONE_MORE_BYTE (c2);
1391           if (! UTF_8_EXTRA_OCTET_P (c2))
1392             src = src_base;
1393           else
1394             {
1395               ONE_MORE_BYTE (c3);
1396               if (! UTF_8_EXTRA_OCTET_P (c3))
1397                 src = src_base;
1398               else
1399                 {
1400                   if ((c1 != UTF_8_BOM_1)
1401                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1402                     src = src_base;
1403                   else
                         /* NOTE(review): redundant with the unconditional
                            assignment that follows this block.  */
1404                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1405                 }
1406             }
1407         }
1408     }
       /* The signature check is done only once per coding system.  */
1409   CODING_UTF_8_BOM (coding) = utf_without_bom;
1410
1411   while (1)
1412     {
1413       int c, c1, c2, c3, c4, c5;
1414
           /* Everything before SRC_BASE has been fully decoded; the
              values saved here are what gets reported at the end.  */
1415       src_base = src;
1416       consumed_chars_base = consumed_chars;
1417
1418       if (charbuf >= charbuf_end)
1419         {
               /* Give back the byte read ahead after a CR, so that it is
                  read again by the next call.  */
1420           if (byte_after_cr >= 0)
1421             src_base--;
1422           break;
1423         }
1424
1425       if (byte_after_cr >= 0)
1426         c1 = byte_after_cr, byte_after_cr = -1;
1427       else
1428         ONE_MORE_BYTE (c1);
1429       if (c1 < 0)
1430         {
               /* NOTE(review): a negative value presumably is the negated
                  code that ONE_MORE_BYTE produced for a multibyte source
                  -- confirm against the macro.  */
1431           c = - c1;
1432         }
1433       else if (UTF_8_1_OCTET_P (c1))
1434         {
1435           if (eol_dos && c1 == '\r')
1436             ONE_MORE_BYTE (byte_after_cr);
1437           c = c1;
1438         }
1439       else
1440         {
1441           ONE_MORE_BYTE (c2);
1442           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1443             goto invalid_code;
1444           if (UTF_8_2_OCTET_LEADING_P (c1))
1445             {
1446               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1447               /* Reject overlong sequences here and below.  Encoders
1448                  producing them are incorrect, they can be misleading,
1449                  and they mess up read/write invariance.  */
1450               if (c < 128)
1451                 goto invalid_code;
1452             }
1453           else
1454             {
1455               ONE_MORE_BYTE (c3);
1456               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1457                 goto invalid_code;
1458               if (UTF_8_3_OCTET_LEADING_P (c1))
1459                 {
1460                   c = (((c1 & 0xF) << 12)
1461                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1462                   if (c < 0x800
1463                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1464                     goto invalid_code;
1465                 }
1466               else
1467                 {
1468                   ONE_MORE_BYTE (c4);
1469                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1470                     goto invalid_code;
1471                   if (UTF_8_4_OCTET_LEADING_P (c1))
1472                     {
1473                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1474                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1475                     if (c < 0x10000)
1476                       goto invalid_code;
1477                     }
1478                   else
1479                     {
1480                       ONE_MORE_BYTE (c5);
1481                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1482                         goto invalid_code;
1483                       if (UTF_8_5_OCTET_LEADING_P (c1))
1484                         {
1485                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1486                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1487                                | (c5 & 0x3F));
                               /* Besides overlong forms, reject codes above
                                  MAX_CHAR.  */
1488                           if ((c > MAX_CHAR) || (c < 0x200000))
1489                             goto invalid_code;
1490                         }
1491                       else
1492                         goto invalid_code;
1493                     }
1494                 }
1495             }
1496         }
1497
1498       *charbuf++ = c;
1499       continue;
1500
1501     invalid_code:
           /* Go back to the start of the bad sequence and store just its
              first byte; the following bytes are examined again by the
              next iteration.  */
1502       src = src_base;
1503       consumed_chars = consumed_chars_base;
1504       ONE_MORE_BYTE (c);
1505       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1506       coding->errors++;
1507     }
1508
1509  no_more_source:
       /* Report only what was consumed up to the start of the last,
          possibly incomplete, sequence.  */
1510   coding->consumed_char += consumed_chars_base;
1511   coding->consumed = src_base - coding->source;
1512   coding->charbuf_used = charbuf - coding->charbuf;
1513 }
1514
1515
1516 static int
1517 encode_coding_utf_8 (struct coding_system *coding)
1518 {
1519   int multibytep = coding->dst_multibyte;
1520   int *charbuf = coding->charbuf;
1521   int *charbuf_end = charbuf + coding->charbuf_used;
1522   unsigned char *dst = coding->destination + coding->produced;
1523   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1524   ptrdiff_t produced_chars = 0;
1525   int c;
1526
1527   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1528     {
1529       ASSURE_DESTINATION (3);
1530       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1531       CODING_UTF_8_BOM (coding) = utf_without_bom;
1532     }
1533
1534   if (multibytep)
1535     {
1536       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1537
1538       while (charbuf < charbuf_end)
1539         {
1540           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1541
1542           ASSURE_DESTINATION (safe_room);
1543           c = *charbuf++;
1544           if (CHAR_BYTE8_P (c))
1545             {
1546               c = CHAR_TO_BYTE8 (c);
1547               EMIT_ONE_BYTE (c);
1548             }
1549           else
1550             {
1551               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1552               for (p = str; p < pend; p++)
1553                 EMIT_ONE_BYTE (*p);
1554             }
1555         }
1556     }
1557   else
1558     {
1559       int safe_room = MAX_MULTIBYTE_LENGTH;
1560
1561       while (charbuf < charbuf_end)
1562         {
1563           ASSURE_DESTINATION (safe_room);
1564           c = *charbuf++;
1565           if (CHAR_BYTE8_P (c))
1566             *dst++ = CHAR_TO_BYTE8 (c);
1567           else
1568             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1569           produced_chars++;
1570         }
1571     }
1572   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1573   coding->produced_char += produced_chars;
1574   coding->produced = dst - coding->destination;
1575   return 0;
1576 }
1577
1578
1579 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1580    Check if a text is encoded in one of UTF-16 based coding systems.
1581    If it is, return 1, else return 0.  */
1582
/* Nonzero if the UTF-16 code unit VAL is a high surrogate
   (0xD800..0xDBFF).  Only the bits selected by the mask 0xFC00 are
   examined.  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* Nonzero if the UTF-16 code unit VAL is a low surrogate
   (0xDC00..0xDFFF).  Only the bits selected by the mask 0xFC00 are
   examined.  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
1588
1589
1590 static int
1591 detect_coding_utf_16 (struct coding_system *coding,
1592                       struct coding_detection_info *detect_info)
1593 {
1594   const unsigned char *src = coding->source;
1595   const unsigned char *src_end = coding->source + coding->src_bytes;
1596   int multibytep = coding->src_multibyte;
1597   int c1, c2;
1598
1599   detect_info->checked |= CATEGORY_MASK_UTF_16;
1600   if (coding->mode & CODING_MODE_LAST_BLOCK
1601       && (coding->src_chars & 1))
1602     {
1603       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1604       return 0;
1605     }
1606
1607   TWO_MORE_BYTES (c1, c2);
1608   if ((c1 == 0xFF) && (c2 == 0xFE))
1609     {
1610       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1611                              | CATEGORY_MASK_UTF_16_AUTO);
1612       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1613                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1614                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1615     }
1616   else if ((c1 == 0xFE) && (c2 == 0xFF))
1617     {
1618       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1619                              | CATEGORY_MASK_UTF_16_AUTO);
1620       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1621                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1622                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1623     }
1624   else if (c2 < 0)
1625     {
1626       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1627       return 0;
1628     }
1629   else
1630     {
1631       /* We check the dispersion of Eth and Oth bytes where E is even and
1632          O is odd.  If both are high, we assume binary data.*/
1633       unsigned char e[256], o[256];
1634       unsigned e_num = 1, o_num = 1;
1635
1636       memset (e, 0, 256);
1637       memset (o, 0, 256);
1638       e[c1] = 1;
1639       o[c2] = 1;
1640
1641       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1642                                 |CATEGORY_MASK_UTF_16_BE
1643                                 | CATEGORY_MASK_UTF_16_LE);
1644
1645       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1646              != CATEGORY_MASK_UTF_16)
1647         {
1648           TWO_MORE_BYTES (c1, c2);
1649           if (c2 < 0)
1650             break;
1651           if (! e[c1])
1652             {
1653               e[c1] = 1;
1654               e_num++;
1655               if (e_num >= 128)
1656                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1657             }
1658           if (! o[c2])
1659             {
1660               o[c2] = 1;
1661               o_num++;
1662               if (o_num >= 128)
1663                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1664             }
1665         }
1666       return 0;
1667     }
1668
1669  no_more_source:
1670   return 1;
1671 }
1672
1673 static void
1674 decode_coding_utf_16 (struct coding_system *coding)
1675 {
1676   const unsigned char *src = coding->source + coding->consumed;
1677   const unsigned char *src_end = coding->source + coding->src_bytes;
1678   const unsigned char *src_base;
1679   int *charbuf = coding->charbuf + coding->charbuf_used;
1680   /* We may produces at most 3 chars in one loop.  */
1681   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1682   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1683   int multibytep = coding->src_multibyte;
1684   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1686   int surrogate = CODING_UTF_16_SURROGATE (coding);
1687   int eol_dos =
1688     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1689   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1690
1691   if (bom == utf_with_bom)
1692     {
1693       int c, c1, c2;
1694
1695       src_base = src;
1696       ONE_MORE_BYTE (c1);
1697       ONE_MORE_BYTE (c2);
1698       c = (c1 << 8) | c2;
1699
1700       if (endian == utf_16_big_endian
1701           ? c != 0xFEFF : c != 0xFFFE)
1702         {
1703           /* The first two bytes are not BOM.  Treat them as bytes
1704              for a normal character.  */
1705           src = src_base;
1706           coding->errors++;
1707         }
1708       CODING_UTF_16_BOM (coding) = utf_without_bom;
1709     }
1710   else if (bom == utf_detect_bom)
1711     {
1712       /* We have already tried to detect BOM and failed in
1713          detect_coding.  */
1714       CODING_UTF_16_BOM (coding) = utf_without_bom;
1715     }
1716
1717   while (1)
1718     {
1719       int c, c1, c2;
1720
1721       src_base = src;
1722       consumed_chars_base = consumed_chars;
1723
1724       if (charbuf >= charbuf_end)
1725         {
1726           if (byte_after_cr1 >= 0)
1727             src_base -= 2;
1728           break;
1729         }
1730
1731       if (byte_after_cr1 >= 0)
1732         c1 = byte_after_cr1, byte_after_cr1 = -1;
1733       else
1734         ONE_MORE_BYTE (c1);
1735       if (c1 < 0)
1736         {
1737           *charbuf++ = -c1;
1738           continue;
1739         }
1740       if (byte_after_cr2 >= 0)
1741         c2 = byte_after_cr2, byte_after_cr2 = -1;
1742       else
1743         ONE_MORE_BYTE (c2);
1744       if (c2 < 0)
1745         {
1746           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1747           *charbuf++ = -c2;
1748           continue;
1749         }
1750       c = (endian == utf_16_big_endian
1751            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1752
1753       if (surrogate)
1754         {
1755           if (! UTF_16_LOW_SURROGATE_P (c))
1756             {
1757               if (endian == utf_16_big_endian)
1758                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1759               else
1760                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1761               *charbuf++ = c1;
1762               *charbuf++ = c2;
1763               coding->errors++;
1764               if (UTF_16_HIGH_SURROGATE_P (c))
1765                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1766               else
1767                 *charbuf++ = c;
1768             }
1769           else
1770             {
1771               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1772               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1773               *charbuf++ = 0x10000 + c;
1774             }
1775         }
1776       else
1777         {
1778           if (UTF_16_HIGH_SURROGATE_P (c))
1779             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1780           else
1781             {
1782               if (eol_dos && c == '\r')
1783                 {
1784                   ONE_MORE_BYTE (byte_after_cr1);
1785                   ONE_MORE_BYTE (byte_after_cr2);
1786                 }
1787               *charbuf++ = c;
1788             }
1789         }
1790     }
1791
1792  no_more_source:
1793   coding->consumed_char += consumed_chars_base;
1794   coding->consumed = src_base - coding->source;
1795   coding->charbuf_used = charbuf - coding->charbuf;
1796 }
1797
1798 static int
1799 encode_coding_utf_16 (struct coding_system *coding)
1800 {
1801   int multibytep = coding->dst_multibyte;
1802   int *charbuf = coding->charbuf;
1803   int *charbuf_end = charbuf + coding->charbuf_used;
1804   unsigned char *dst = coding->destination + coding->produced;
1805   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1806   int safe_room = 8;
1807   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1808   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1809   ptrdiff_t produced_chars = 0;
1810   int c;
1811
1812   if (bom != utf_without_bom)
1813     {
1814       ASSURE_DESTINATION (safe_room);
1815       if (big_endian)
1816         EMIT_TWO_BYTES (0xFE, 0xFF);
1817       else
1818         EMIT_TWO_BYTES (0xFF, 0xFE);
1819       CODING_UTF_16_BOM (coding) = utf_without_bom;
1820     }
1821
1822   while (charbuf < charbuf_end)
1823     {
1824       ASSURE_DESTINATION (safe_room);
1825       c = *charbuf++;
1826       if (c > MAX_UNICODE_CHAR)
1827         c = coding->default_char;
1828
1829       if (c < 0x10000)
1830         {
1831           if (big_endian)
1832             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1833           else
1834             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1835         }
1836       else
1837         {
1838           int c1, c2;
1839
1840           c -= 0x10000;
1841           c1 = (c >> 10) + 0xD800;
1842           c2 = (c & 0x3FF) + 0xDC00;
1843           if (big_endian)
1844             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1845           else
1846             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1847         }
1848     }
1849   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1850   coding->produced = dst - coding->destination;
1851   coding->produced_char += produced_chars;
1852   return 0;
1853 }
1854
1855 \f
1856 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1857
1858 /* Emacs' internal format for representation of multiple character
1859    sets is a kind of multi-byte encoding, i.e. characters are
1860    represented by variable-length sequences of one-byte codes.
1861
1862    ASCII characters and control characters (e.g. `tab', `newline') are
1863    represented by one-byte sequences which are their ASCII codes, in
1864    the range 0x00 through 0x7F.
1865
1866    8-bit characters of the range 0x80..0x9F are represented by
1867    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1868    code + 0x20).
1869
1870    8-bit characters of the range 0xA0..0xFF are represented by
1871    one-byte sequences which are their 8-bit code.
1872
1873    The other characters are represented by a sequence of `base
1874    leading-code', optional `extended leading-code', and one or two
1875    `position-code's.  The length of the sequence is determined by the
1876    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1877    whereas extended leading-code and position-code take the range 0xA0
1878    through 0xFF.  See `charset.h' for more details about leading-code
1879    and position-code.
1880
1881    --- CODE RANGE of Emacs' internal format ---
1882    character set        range
1883    -------------        -----
1884    ascii                0x00..0x7F
1885    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1886    eight-bit-graphic    0xA0..0xBF
1887    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1888    ---------------------------------------------
1889
1890    As this is the internal character representation, the format is
1891    usually not used externally (i.e. in a file or in data sent to a
1892    process).  But, it is possible to have a text externally in this
1893    format (i.e. by encoding by the coding system `emacs-mule').
1894
1895    In that case, a sequence of one-byte codes has a slightly different
1896    form.
1897
1898    At first, all characters in eight-bit-control are represented by
1899    one-byte sequences which are their 8-bit code.
1900
1901    Next, character composition data are represented by the byte
1902    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1903    where,
1904         METHOD is 0xF2 plus one of composition method (enum
1905         composition_method),
1906
1907         BYTES is 0xA0 plus a byte length of this composition data,
1908
1909         CHARS is 0xA0 plus a number of characters composed by this
1910         data,
1911
1912         COMPONENTs are characters of multibyte form or composition
1913         rules encoded by two-byte of ASCII codes.
1914
1915    In addition, for backward compatibility, the following formats are
1916    also recognized as composition data on decoding.
1917
1918    0x80 MSEQ ...
1919    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1920
1921    Here,
1922         MSEQ is a multibyte form but in one of these special formats:
1923           ASCII: 0xA0 ASCII_CODE+0x80,
1924           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1925         RULE is a one byte code of the range 0xA0..0xF0 that
1926         represents a composition rule.
1927   */
1928
/* Table indexed by a byte value (0x00..0xFF).  The code in this file
   reads the entry for a leading byte as the total byte length of the
   emacs-mule sequence that the byte starts.  The table is not filled
   in here.  */
char emacs_mule_bytes[0x100];
1930
1931
1932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1933    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1934    else return 0.  */
1935
1936 static int
1937 detect_coding_emacs_mule (struct coding_system *coding,
1938                           struct coding_detection_info *detect_info)
1939 {
1940   const unsigned char *src = coding->source, *src_base;
1941   const unsigned char *src_end = coding->source + coding->src_bytes;
1942   int multibytep = coding->src_multibyte;
1943   ptrdiff_t consumed_chars = 0;
1944   int c;
1945   int found = 0;
1946
1947   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1948   /* A coding system of this category is always ASCII compatible.  */
1949   src += coding->head_ascii;
1950
1951   while (1)
1952     {
1953       src_base = src;
1954       ONE_MORE_BYTE (c);
1955       if (c < 0)
1956         continue;
1957       if (c == 0x80)
1958         {
1959           /* Perhaps the start of composite character.  We simply skip
1960              it because analyzing it is too heavy for detecting.  But,
1961              at least, we check that the composite character
1962              constitutes of more than 4 bytes.  */
1963           const unsigned char *src_start;
1964
1965         repeat:
1966           src_start = src;
1967           do
1968             {
1969               ONE_MORE_BYTE (c);
1970             }
1971           while (c >= 0xA0);
1972
1973           if (src - src_start <= 4)
1974             break;
1975           found = CATEGORY_MASK_EMACS_MULE;
1976           if (c == 0x80)
1977             goto repeat;
1978         }
1979
1980       if (c < 0x80)
1981         {
1982           if (c < 0x20
1983               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1984             break;
1985         }
1986       else
1987         {
1988           int more_bytes = emacs_mule_bytes[c] - 1;
1989
1990           while (more_bytes > 0)
1991             {
1992               ONE_MORE_BYTE (c);
1993               if (c < 0xA0)
1994                 {
1995                   src--;        /* Unread the last byte.  */
1996                   break;
1997                 }
1998               more_bytes--;
1999             }
2000           if (more_bytes != 0)
2001             break;
2002           found = CATEGORY_MASK_EMACS_MULE;
2003         }
2004     }
2005   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2006   return 0;
2007
2008  no_more_source:
2009   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2010     {
2011       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2012       return 0;
2013     }
2014   detect_info->found |= found;
2015   return 1;
2016 }
2017
2018
2019 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2020    character.  If CMP_STATUS indicates that we must expect MSEQ or
2021    RULE described above, decode it and return the negative value of
2022    the decoded character or rule.  If an invalid byte is found, return
2023    -1.  If SRC is too short, return -2.  */
2024
2025 static int
2026 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2027                  int *nbytes, int *nchars, int *id,
2028                  struct composition_status *cmp_status)
2029 {
2030   const unsigned char *src_end = coding->source + coding->src_bytes;
2031   const unsigned char *src_base = src;
2032   int multibytep = coding->src_multibyte;
2033   int charset_ID;
2034   unsigned code;
2035   int c;
2036   int consumed_chars = 0;
2037   int mseq_found = 0;
2038
2039   ONE_MORE_BYTE (c);
2040   if (c < 0)
2041     {
2042       c = -c;
2043       charset_ID = emacs_mule_charset[0];
2044     }
2045   else
2046     {
2047       if (c >= 0xA0)
2048         {
2049           if (cmp_status->state != COMPOSING_NO
2050               && cmp_status->old_form)
2051             {
2052               if (cmp_status->state == COMPOSING_CHAR)
2053                 {
2054                   if (c == 0xA0)
2055                     {
2056                       ONE_MORE_BYTE (c);
2057                       c -= 0x80;
2058                       if (c < 0)
2059                         goto invalid_code;
2060                     }
2061                   else
2062                     c -= 0x20;
2063                   mseq_found = 1;
2064                 }
2065               else
2066                 {
2067                   *nbytes = src - src_base;
2068                   *nchars = consumed_chars;
2069                   return -c;
2070                 }
2071             }
2072           else
2073             goto invalid_code;
2074         }
2075
2076       switch (emacs_mule_bytes[c])
2077         {
2078         case 2:
2079           if ((charset_ID = emacs_mule_charset[c]) < 0)
2080             goto invalid_code;
2081           ONE_MORE_BYTE (c);
2082           if (c < 0xA0)
2083             goto invalid_code;
2084           code = c & 0x7F;
2085           break;
2086
2087         case 3:
2088           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2089               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2090             {
2091               ONE_MORE_BYTE (c);
2092               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2093                 goto invalid_code;
2094               ONE_MORE_BYTE (c);
2095               if (c < 0xA0)
2096                 goto invalid_code;
2097               code = c & 0x7F;
2098             }
2099           else
2100             {
2101               if ((charset_ID = emacs_mule_charset[c]) < 0)
2102                 goto invalid_code;
2103               ONE_MORE_BYTE (c);
2104               if (c < 0xA0)
2105                 goto invalid_code;
2106               code = (c & 0x7F) << 8;
2107               ONE_MORE_BYTE (c);
2108               if (c < 0xA0)
2109                 goto invalid_code;
2110               code |= c & 0x7F;
2111             }
2112           break;
2113
2114         case 4:
2115           ONE_MORE_BYTE (c);
2116           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2117             goto invalid_code;
2118           ONE_MORE_BYTE (c);
2119           if (c < 0xA0)
2120             goto invalid_code;
2121           code = (c & 0x7F) << 8;
2122           ONE_MORE_BYTE (c);
2123           if (c < 0xA0)
2124             goto invalid_code;
2125           code |= c & 0x7F;
2126           break;
2127
2128         case 1:
2129           code = c;
2130           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2131           break;
2132
2133         default:
2134           abort ();
2135         }
2136       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2137                           CHARSET_FROM_ID (charset_ID), code, c);
2138       if (c < 0)
2139         goto invalid_code;
2140     }
2141   *nbytes = src - src_base;
2142   *nchars = consumed_chars;
2143   if (id)
2144     *id = charset_ID;
2145   return (mseq_found ? -c : c);
2146
2147  no_more_source:
2148   return -2;
2149
2150  invalid_code:
2151   return -1;
2152 }
2153
2154
2155 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2156
2157 /* Handle these composition sequences ('|': the end of header elements,
2158    BYTES and CHARS >= 0xA0):
2159
2160    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2161    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2162    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2163
2164    and these old forms:
2165
2166    (4) relative composition: 0x80 | MSEQ ... MSEQ
2167    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2168
2169    When the starter 0x80 and the following header elements are found,
2170    this annotation header is produced.
2171
2172         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2173
2174    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2175    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2176
2177    Then, upon reading the following elements, these codes are produced
2178    until the composition end is found:
2179
2180    (1) CHAR ... CHAR
2181    (2) ALT ... ALT CHAR ... CHAR
2182    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2183    (4) CHAR ... CHAR
2184    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2185
2186    When the composition end is found, LENGTH and NCHARS in the
2187    annotation header are updated as below:
2188
2189    (1) LENGTH: unchanged, NCHARS: unchanged
2190    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2191    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2192    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2193    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2194
2195    If an error is found while composing, the annotation header is
2196    changed to the original composition header (plus filler -1s) as
2197    below:
2198
2199    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2200    (5)          [ 0x80 0xFF -1 -1 -1 ]
2201
2202    and the sequence [ -2 DECODED-RULE ] is changed to the original
2203    byte sequence as below:
2204         o the original byte sequence is B: [ B -1 ]
2205         o the original byte sequence is B1 B2: [ B1 B2 ]
2206
2207    Most of the routines are implemented by macros because many
2208    variables and labels in the caller decode_coding_emacs_mule must be
2209    accessible, and they are usually called just once (thus doesn't
2210    increase the size of compiled object).  */
2211
/* Decode the composition rule byte C of an Emacs 20 style composition
   sequence and set RULE to the decoded rule.

   C is first reduced by 0xA0 (C itself is modified) and must then be
   in the range 0..80; otherwise control jumps to `invalid_code'.
   The quotient and the remainder of that value by 9 give the two
   reference codes, where a code of 4 is replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    (c) -= 0xA0;                                                \
    if (! (0 <= (c) && (c) < 81))                               \
      goto invalid_code;                                        \
    ref_g = ((c) / 9 == 4) ? 10 : (c) / 9;                      \
    ref_n = ((c) % 9 == 4) ? 10 : (c) % 9;                      \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2228
2229
/* Decode a composition rule of an Emacs 21 style composition
   sequence, made of the byte C and the byte that follows it in the
   source, and set RULE to the decoded rule.

   Each of the two bytes, reduced by 0x20, must be in the range
   0..80; otherwise control jumps to `invalid_code'.  The second byte
   is fetched with ONE_MORE_BYTE, so C holds it afterwards.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int ref_g = (c) - 0x20;                                     \
    int ref_n;                                                  \
                                                                \
    if (! (0 <= ref_g && ref_g < 81))                           \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    ref_n = (c) - 0x20;                                         \
    if (! (0 <= ref_n && ref_n < 81))                           \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2247
2248
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus the composition method).  The next two source bytes are
   BYTES and CHARS, each 0xA0 plus a value: the byte length of this
   composition information and the number of characters composed by
   this composition.

   BYTES - 0xA0 must be at least 3, and exactly 4 for a relative
   composition; CHARS - 0xA0 must be positive and less than
   MAX_COMPOSITION_COMPONENTS.  Otherwise control jumps to
   `invalid_code'.  On success CMP_STATUS is set up for the new
   composition and the annotation header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int cmp_nbytes, cmp_nchars;                                         \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    cmp_nbytes = c - 0xA0;                                              \
    if (cmp_nbytes < 3)                                                 \
      goto invalid_code;                                                \
    if (cmp_method == COMPOSITION_RELATIVE && cmp_nbytes != 4)          \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    cmp_nchars = c - 0xA0;                                              \
    if (cmp_nchars <= 0 || cmp_nchars >= MAX_COMPOSITION_COMPONENTS)    \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = cmp_nchars;                                    \
    cmp_status->ncomps = cmp_nbytes - 4;                                \
    ADD_COMPOSITION_DATA (charbuf, cmp_nchars, cmp_nbytes, cmp_method); \
  } while (0)
2280
2281
/* Common start of an Emacs 20 style (old form) composition whose
   method is COMPOSITION_KIND: mark CMP_STATUS as composing in the old
   form with no characters or components yet, and add an annotation
   header with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_COMPOSITION(composition_kind)      \
  do {                                                          \
    cmp_status->method = (composition_kind);                    \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->nchars = cmp_status->ncomps = 0;                \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)


/* Start of Emacs 20 style format for relative composition.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  DECODE_EMACS_MULE_20_COMPOSITION (COMPOSITION_RELATIVE)


/* Start of Emacs 20 style format for rule-base composition.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  DECODE_EMACS_MULE_20_COMPOSITION (COMPOSITION_WITH_RULE)
2306
2307
/* Read the byte that follows the composition starter and begin the
   composition it announces:

     0xF2 + a composition method  Emacs 21 style header
     0xFF                         Emacs 20 style rule-base composition
     0xA0..0xBF                   Emacs 20 style relative composition;
                                  the byte is itself the first
                                  component, so SRC is moved back to
                                  read it again

   Any other value makes control jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *rewind_to = src;                       \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (COMPOSITION_RELATIVE <= c - 0xF2                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      {                                                         \
        DECODE_EMACS_MULE_21_COMPOSITION ();                    \
      }                                                         \
    else if (c == 0xFF)                                         \
      {                                                         \
        DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();           \
      }                                                         \
    else if (0xA0 <= c && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = rewind_to;                                        \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2331
/* Finish the composition described by CMP_STATUS, whose annotation
   header starts CMP_STATUS->length elements before CHARBUF.  For the
   old form, store the number of characters in the header's third
   element.  Otherwise, for any method beyond relative composition,
   set the header's first element to its third element minus
   CMP_STATUS->length.  Finally mark CMP_STATUS as not composing.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *cmp_header = charbuf - cmp_status->length;             \
                                                                \
    if (cmp_status->old_form)                                   \
      cmp_header[2] = cmp_status->nchars;                       \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      cmp_header[0] = cmp_header[2] - cmp_status->length;       \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2342
2343
/* Terminate the composition described by CMP_STATUS before it was
   completed normally.  CHARBUF points just past the last element
   written so far; the composition's data is addressed at negative
   offsets from it, starting at -cmp_status->length.

   An old-form composition that already has at least one character is
   kept and its character count is stored.  Otherwise the head of the
   composition data is overwritten with byte8 characters followed by
   negative values (presumably read later as annotation lengths to
   skip -- confirm).

   Sets cmp_status->state to COMPOSING_NO and returns the number of
   characters (0, 1 or 4) that the caller adds to its character
   offset.  */
2344 static int
2345 emacs_mule_finish_composition (int *charbuf,
2346                                struct composition_status *cmp_status)
2347 {
2348   int idx = - cmp_status->length;
2349   int new_chars;
2350
2351   if (cmp_status->old_form && cmp_status->nchars > 0)
2352     {
           /* Keep the composition; record how many characters it has.  */
2353       charbuf[idx + 2] = cmp_status->nchars;
2354       new_chars = 0;
2355       if (cmp_status->method == COMPOSITION_WITH_RULE
2356           && cmp_status->state == COMPOSING_CHAR)
2357         {
2358           /* The last rule was invalid.  */
2359           int rule = charbuf[-1] + 0xA0;
2360
               /* Replace the last two elements by one byte8 character
                  for the rule byte and a -1 entry.  */
2361           charbuf[-2] = BYTE8_TO_CHAR (rule);
2362           charbuf[-1] = -1;
2363           new_chars = 1;
2364         }
2365     }
2366   else
2367     {
           /* Overwrite the composition data, starting with a byte8
              character for the 0x80 byte.  */
2368       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2369
2370       if (cmp_status->method == COMPOSITION_WITH_RULE)
2371         {
2372           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2373           charbuf[idx++] = -3;
2374           charbuf[idx++] = 0;
2375           new_chars = 1;
2376         }
2377       else
2378         {
               /* Read both values before the stores below overwrite
                  the slots they come from.  */
2379           int nchars = charbuf[idx + 1] + 0xA0;
2380           int nbytes = charbuf[idx + 2] + 0xA0;
2381
2382           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2383           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2384           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2385           charbuf[idx++] = -1;
2386           new_chars = 4;
2387         }
2388     }
2389   cmp_status->state = COMPOSING_NO;
2390   return new_chars;
2391 }
2392
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the count it returns to
   `char_offset'.  Expects `cmp_status', `charbuf' and `char_offset'
   in the expanding scope.  */
2393 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2394   do {                                                                    \
2395     if (cmp_status->state != COMPOSING_NO)                                \
2396       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2397   } while (0)
2398
2399
/* Decode the emacs-mule encoded source bytes of CODING, starting at
   coding->source + coding->consumed, into coding->charbuf (appending
   at index coding->charbuf_used).

   The loop stops when CHARBUF reaches CHARBUF_END, when
   emacs_mule_char returns -2, or when control reaches the
   `no_more_source' label (nothing in this function jumps there
   directly; presumably ONE_MORE_BYTE does when the source is
   exhausted -- confirm).  A composition still open at that point is
   finished if this is the last block, and otherwise saved in
   cmp_status->carryover to be replayed by the next call.

   A byte sequence that cannot be decoded is emitted one byte at a
   time (ASCII as is, other bytes through BYTE8_TO_CHAR) and counted
   in coding->errors.  On return coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated.  */
2400 static void
2401 decode_coding_emacs_mule (struct coding_system *coding)
2402 {
2403   const unsigned char *src = coding->source + coding->consumed;
2404   const unsigned char *src_end = coding->source + coding->src_bytes;
2405   const unsigned char *src_base;
2406   int *charbuf = coding->charbuf + coding->charbuf_used;
2407   /* We may produce two annotations (charset and composition) in one
2408      loop and one more charset annotation at the end.  */
2409   int *charbuf_end
2410     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2411       /* We can produce up to 2 characters in a loop.  */
2412       - 1;
2413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2414   int multibytep = coding->src_multibyte;
2415   ptrdiff_t char_offset = coding->produced_char;
       /* LAST_ID is the charset of the run of characters currently being
          produced and LAST_OFFSET the character offset at which that run
          began; a charset annotation is added when the charset changes.  */
2416   ptrdiff_t last_offset = char_offset;
2417   int last_id = charset_ascii;
2418   int eol_dos =
2419     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR, or -1 if none is pending.  */
2420   int byte_after_cr = -1;
2421   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2422
       /* Resume a composition left unfinished by the previous call:
          replay the elements saved in the carryover area.  */
2423   if (cmp_status->state != COMPOSING_NO)
2424     {
2425       int i;
2426
2427       if (charbuf_end - charbuf < cmp_status->length)
2428         abort ();
2429       for (i = 0; i < cmp_status->length; i++)
2430         *charbuf++ = cmp_status->carryover[i];
2431       coding->annotated = 1;
2432     }
2433
2434   while (1)
2435     {
2436       int c, id IF_LINT (= 0);
2437
           /* Remember where this iteration started, so that an invalid
              sequence can be rewound and so that only completely
              processed input is reported as consumed.  */
2438       src_base = src;
2439       consumed_chars_base = consumed_chars;
2440
2441       if (charbuf >= charbuf_end)
2442         {
               /* No room left.  A pending look-ahead byte was already
                  read from SRC, so step SRC_BASE back over it.  */
2443           if (byte_after_cr >= 0)
2444             src_base--;
2445           break;
2446         }
2447
2448       if (byte_after_cr >= 0)
2449         c = byte_after_cr, byte_after_cr = -1;
2450       else
2451         ONE_MORE_BYTE (c);
2452
           /* A negative C or the byte 0x80 ends any composition in
              progress; 0x80 then starts a new one.  */
2453       if (c < 0 || c == 0x80)
2454         {
2455           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456           if (c < 0)
2457             {
                   /* Store -C as a character (what a negative value
                      from ONE_MORE_BYTE stands for is defined outside
                      this function -- confirm).  */
2458               *charbuf++ = -c;
2459               char_offset++;
2460             }
2461           else
2462             DECODE_EMACS_MULE_COMPOSITION_START ();
2463           continue;
2464         }
2465
2466       if (c < 0x80)
2467         {
               /* With DOS-style EOL, read the byte after a CR now and
                  keep it for the next iteration.  */
2468           if (eol_dos && c == '\r')
2469             ONE_MORE_BYTE (byte_after_cr);
2470           id = charset_ascii;
2471           if (cmp_status->state != COMPOSING_NO)
2472             {
2473               if (cmp_status->old_form)
2474                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2476                 cmp_status->ncomps--;
2477             }
2478         }
2479       else
2480         {
2481           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2482           /* emacs_mule_char can load a charset map from a file, which
2483              allocates a large structure and might cause buffer text
2484              to be relocated as result.  Thus, we need to remember the
2485              original pointer to buffer text, and fix up all related
2486              pointers after the call.  */
2487           const unsigned char *orig = coding->source;
2488           ptrdiff_t offset;
2489
2490           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2491                                cmp_status);
2492           offset = coding->source - orig;
2493           if (offset)
2494             {
2495               src += offset;
2496               src_base += offset;
2497               src_end += offset;
2498             }
               /* -1 is treated as an invalid sequence, -2 stops the
                  loop; other negative values fall through and are
                  handled by the composition states below.  */
2499           if (c < 0)
2500             {
2501               if (c == -1)
2502                 goto invalid_code;
2503               if (c == -2)
2504                 break;
2505             }
2506           src = src_base + nbytes;
2507           consumed_chars = consumed_chars_base + nchars;
2508           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2509             cmp_status->ncomps -= nchars;
2510         }
2511
2512       /* Now if C >= 0, we found a normally encoded character, if C <
2513          0, we found an old-style composition component character or
2514          rule.  */
2515
2516       if (cmp_status->state == COMPOSING_NO)
2517         {
               /* Plain character: close the previous charset run if the
                  charset changed, then store the character.  */
2518           if (last_id != id)
2519             {
2520               if (last_id != charset_ascii)
2521                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2522                                   last_id);
2523               last_id = id;
2524               last_offset = char_offset;
2525             }
2526           *charbuf++ = c;
2527           char_offset++;
2528         }
2529       else if (cmp_status->state == COMPOSING_CHAR)
2530         {
2531           if (cmp_status->old_form)
2532             {
2533               if (c >= 0)
2534                 {
                       /* A normal character ends an old-form
                          composition.  */
2535                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2536                   *charbuf++ = c;
2537                   char_offset++;
2538                 }
2539               else
2540                 {
                       /* Old-form component: characters are counted up
                          until the component limit is reached.  */
2541                   *charbuf++ = -c;
2542                   cmp_status->nchars++;
2543                   cmp_status->length++;
2544                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2545                     EMACS_MULE_COMPOSITION_END ();
2546                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2547                     cmp_status->state = COMPOSING_RULE;
2548                 }
2549             }
2550           else
2551             {
                   /* New form: NCHARS counts down the characters that
                      remain in the composition.  */
2552               *charbuf++ = c;
2553               cmp_status->length++;
2554               cmp_status->nchars--;
2555               if (cmp_status->nchars == 0)
2556                 EMACS_MULE_COMPOSITION_END ();
2557             }
2558         }
2559       else if (cmp_status->state == COMPOSING_RULE)
2560         {
2561           int rule;
2562
2563           if (c >= 0)
2564             {
2565               EMACS_MULE_COMPOSITION_END ();
2566               *charbuf++ = c;
2567               char_offset++;
2568             }
2569           else
2570             {
2571               c = -c;
2572               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2573               if (rule < 0)
2574                 goto invalid_code;
                   /* A rule is stored as the pair -2, RULE.  */
2575               *charbuf++ = -2;
2576               *charbuf++ = rule;
2577               cmp_status->length += 2;
2578               cmp_status->state = COMPOSING_CHAR;
2579             }
2580         }
2581       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2582         {
2583           *charbuf++ = c;
2584           cmp_status->length++;
               /* NCOMPS was already decremented above for this
                  component; a negative value finishes the composition
                  early.  */
2585           if (cmp_status->ncomps == 0)
2586             cmp_status->state = COMPOSING_CHAR;
2587           else if (cmp_status->ncomps > 0)
2588             {
2589               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2590                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2591             }
2592           else
2593             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2594         }
2595       else                      /* COMPOSING_COMPONENT_RULE */
2596         {
2597           int rule;
2598
2599           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2600           if (rule < 0)
2601             goto invalid_code;
2602           *charbuf++ = -2;
2603           *charbuf++ = rule;
2604           cmp_status->length += 2;
2605           cmp_status->ncomps--;
2606           if (cmp_status->ncomps > 0)
2607             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2608           else
2609             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610         }
2611       continue;
2612
2613     invalid_code:
           /* Close any composition, rewind to the start of the offending
              sequence and emit only its first byte, counting one
              error.  */
2614       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615       src = src_base;
2616       consumed_chars = consumed_chars_base;
2617       ONE_MORE_BYTE (c);
2618       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2619       char_offset++;
2620       coding->errors++;
2621     }
2622
2623  no_more_source:
       /* A composition is still open: finish it if no more input will
          follow, otherwise remove its elements from CHARBUF and save
          them for the next call.  */
2624   if (cmp_status->state != COMPOSING_NO)
2625     {
2626       if (coding->mode & CODING_MODE_LAST_BLOCK)
2627         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2628       else
2629         {
2630           int i;
2631
2632           charbuf -= cmp_status->length;
2633           for (i = 0; i < cmp_status->length; i++)
2634             cmp_status->carryover[i] = charbuf[i];
2635         }
2636     }
       /* Close the last charset run.  */
2637   if (last_id != charset_ascii)
2638     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report only input up to the start of the last, unfinished
          iteration as consumed.  */
2639   coding->consumed_char += consumed_chars_base;
2640   coding->consumed = src_base - coding->source;
2641   coding->charbuf_used = charbuf - coding->charbuf;
2642 }
2643
2644
/* Store in CODES (an array of at least two bytes) the leading code
   sequence for the emacs-mule charset id ID.

   An ID below 0xA0 is its own one-byte leading code, and CODES[1] is
   set to 0 to mark that there is no second byte.  A larger ID is
   preceded by a prefix byte chosen by range:
     0xA0..0xDF -> 0x9A,  0xE0..0xEF -> 0x9B,
     0xF0..0xF4 -> 0x9C,  0xF5 and up -> 0x9D
   with the prefix in CODES[0] and ID itself in CODES[1].

   The macro expands to a single statement without a trailing
   semicolon, so it can be used anywhere a statement is allowed
   (including before `else').  ID and CODES are evaluated more than
   once; do not pass expressions with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2658
2659
/* Encode the characters and annotations held in coding->charbuf
   (coding->charbuf_used elements) as emacs-mule bytes, writing them
   at coding->destination + coding->produced.

   The coding system's charset list is forced to
   Vemacs_mule_charset_list.  When the end of the function is reached
   it records CODING_RESULT_SUCCESS, updates coding->produced_char and
   coding->produced, and returns 0.  (ASSURE_DESTINATION and the
   EMIT_* macros are defined elsewhere and may leave the function
   early -- confirm.)  */
2660 static int
2661 encode_coding_emacs_mule (struct coding_system *coding)
2662 {
2663   int multibytep = coding->dst_multibyte;
2664   int *charbuf = coding->charbuf;
2665   int *charbuf_end = charbuf + coding->charbuf_used;
2666   unsigned char *dst = coding->destination + coding->produced;
2667   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2668   int safe_room = 8;
2669   ptrdiff_t produced_chars = 0;
2670   Lisp_Object attrs, charset_list;
2671   int c;
       /* Charset requested by the most recent charset annotation, or -1
          if there is none or it is not in CHARSET_LIST.  */
2672   int preferred_charset_id = -1;
2673
2674   CODING_GET_INFO (coding, attrs, charset_list);
2675   if (! EQ (charset_list, Vemacs_mule_charset_list))
2676     {
2677       CODING_ATTR_CHARSET_LIST (attrs)
2678         = charset_list = Vemacs_mule_charset_list;
2679     }
2680
2681   while (charbuf < charbuf_end)
2682     {
2683       ASSURE_DESTINATION (safe_room);
2684       c = *charbuf++;
2685
2686       if (c < 0)
2687         {
2688           /* Handle an annotation.  */
2689           switch (*charbuf)
2690             {
2691             case CODING_ANNOTATE_COMPOSITION_MASK:
2692               /* Not yet implemented.  */
2693               break;
2694             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points past the
                      length element here; confirm that index 3 (and
                      not 2) is where the producer of this annotation
                      stores the charset id.  */
2695               preferred_charset_id = charbuf[3];
2696               if (preferred_charset_id >= 0
2697                   && NILP (Fmemq (make_number (preferred_charset_id),
2698                                   charset_list)))
2699                 preferred_charset_id = -1;
2700               break;
2701             default:
2702               abort ();
2703             }
               /* -C is the total length of the annotation, of which one
                  element was consumed above; skip the rest.  */
2704           charbuf += -c - 1;
2705           continue;
2706         }
2707
2708       if (ASCII_CHAR_P (c))
2709         EMIT_ONE_ASCII_BYTE (c);
2710       else if (CHAR_BYTE8_P (c))
2711         {
2712           c = CHAR_TO_BYTE8 (c);
2713           EMIT_ONE_BYTE (c);
2714         }
2715       else
2716         {
2717           struct charset *charset;
2718           unsigned code;
2719           int dimension;
2720           int emacs_mule_id;
2721           unsigned char leading_codes[2];
2722
               /* Try the preferred charset first, and fall back to a
                  search of CHARSET_LIST if it cannot encode C.  */
2723           if (preferred_charset_id >= 0)
2724             {
2725               int result;
2726
2727               charset = CHARSET_FROM_ID (preferred_charset_id);
2728               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2729               if (result)
2730                 code = ENCODE_CHAR (charset, c);
2731               else
2732                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2733                                      &code, charset);
2734             }
2735           else
2736             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2737                                  &code, charset);
2738           if (! charset)
2739             {
                   /* No charset was found for C: encode the coding
                      system's default character instead.  */
2740               c = coding->default_char;
2741               if (ASCII_CHAR_P (c))
2742                 {
2743                   EMIT_ONE_ASCII_BYTE (c);
2744                   continue;
2745                 }
2746               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2747                                    &code, charset);
2748             }
               /* Emit one or two leading codes, then one or two code
                  bytes (by charset dimension) with the high bit set.  */
2749           dimension = CHARSET_DIMENSION (charset);
2750           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2751           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2752           EMIT_ONE_BYTE (leading_codes[0]);
2753           if (leading_codes[1])
2754             EMIT_ONE_BYTE (leading_codes[1]);
2755           if (dimension == 1)
2756             EMIT_ONE_BYTE (code | 0x80);
2757           else
2758             {
2759               code |= 0x8080;
2760               EMIT_ONE_BYTE (code >> 8);
2761               EMIT_ONE_BYTE (code & 0xFF);
2762             }
2763         }
2764     }
2765   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2766   coding->produced_char += produced_chars;
2767   coding->produced = dst - coding->destination;
2768   return 0;
2769 }
2770
2771 \f
2772 /*** 7. ISO2022 handlers ***/
2773
2774 /* The following note describes the coding system ISO2022 briefly.
2775    Since the intention of this note is to help understand the
2776    functions in this file, some parts are NOT ACCURATE or are OVERLY
2777    SIMPLIFIED.  For thorough understanding, please refer to the
2778    original document of ISO2022.  This is equivalent to the standard
2779    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2780
2781    ISO2022 provides many mechanisms to encode several character sets
2782    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2783    is encoded using bytes less than 128.  This may make the encoded
2784    text a little bit longer, but the text passes more easily through
2785    several types of gateway, some of which strip off the MSB (Most
2786    Significant Bit).
2787
2788    There are two kinds of character sets: control character sets and
2789    graphic character sets.  The former contain control characters such
2790    as `newline' and `escape' to provide control functions (control
2791    functions are also provided by escape sequences).  The latter
2792    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2793    two control character sets and many graphic character sets.
2794
2795    Graphic character sets are classified into one of the following
2796    four classes, according to the number of bytes (DIMENSION) and
2797    number of characters in one dimension (CHARS) of the set:
2798    - DIMENSION1_CHARS94
2799    - DIMENSION1_CHARS96
2800    - DIMENSION2_CHARS94
2801    - DIMENSION2_CHARS96
2802
2803    In addition, each character set is assigned an identification tag,
2804    unique for each set, called the "final character" (denoted as <F>
2805    hereafter).  The <F> of each character set is decided by ECMA(*)
2806    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2807    (0x30..0x3F are for private use only).
2808
2809    Note (*): ECMA = European Computer Manufacturers Association
2810
2811    Here are examples of graphic character sets [NAME(<F>)]:
2812         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2813         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2814         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2815         o DIMENSION2_CHARS96 -- none for the moment
2816
2817    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2818         C0 [0x00..0x1F] -- control character plane 0
2819         GL [0x20..0x7F] -- graphic character plane 0
2820         C1 [0x80..0x9F] -- control character plane 1
2821         GR [0xA0..0xFF] -- graphic character plane 1
2822
2823    A control character set is directly designated and invoked to C0 or
2824    C1 by an escape sequence.  The most common case is that:
2825    - ISO646's  control character set is designated/invoked to C0, and
2826    - ISO6429's control character set is designated/invoked to C1,
2827    and usually these designations/invocations are omitted in encoded
2828    text.  In a 7-bit environment, only C0 can be used, and a control
2829    character for C1 is encoded by an appropriate escape sequence to
2830    fit into the environment.  All control characters for C1 are
2831    defined to have corresponding escape sequences.
2832
2833    A graphic character set is at first designated to one of four
2834    graphic registers (G0 through G3), then these graphic registers are
2835    invoked to GL or GR.  These designations and invocations can be
2836    done independently.  The most common case is that G0 is invoked to
2837    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2838    these invocations and designations are omitted in encoded text.
2839    In a 7-bit environment, only GL can be used.
2840
2841    When a graphic character set of CHARS94 is invoked to GL, codes
2842    0x20 and 0x7F of the GL area work as control characters SPACE and
2843    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2844    be used.
2845
2846    There are two ways of invocation: locking-shift and single-shift.
2847    With locking-shift, the invocation lasts until the next different
2848    invocation, whereas with single-shift, the invocation affects the
2849    following character only and doesn't affect the locking-shift
2850    state.  Invocations are done by the following control characters or
2851    escape sequences:
2852
2853    ----------------------------------------------------------------------
2854    abbrev  function                  cntrl escape seq   description
2855    ----------------------------------------------------------------------
2856    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2857    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2858    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2859    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2860    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2861    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2862    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2863    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2864    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2865    ----------------------------------------------------------------------
2866    (*) These are not used by any known coding system.
2867
2868    Control characters for these functions are defined by macros
2869    ISO_CODE_XXX in `coding.h'.
2870
2871    Designations are done by the following escape sequences:
2872    ----------------------------------------------------------------------
2873    escape sequence      description
2874    ----------------------------------------------------------------------
2875    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2876    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2877    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2878    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2879    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2880    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2881    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2882    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2883    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2884    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2885    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2886    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2887    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2888    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2889    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2890    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2891    ----------------------------------------------------------------------
2892
2893    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2894    of dimension 1, chars 94, and final character <F>, etc...
2895
2896    Note (*): Although these designations are not allowed in ISO2022,
2897    Emacs accepts them on decoding, and produces them on encoding
2898    CHARS96 character sets in a coding system which is characterized as
2899    7-bit environment, non-locking-shift, and non-single-shift.
2900
2901    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2902    '(' must be omitted.  We refer to this as "short-form" hereafter.
2903
2904    Now you may notice that there are a lot of ways of encoding the
2905    same multilingual text in ISO2022.  Actually, there exist many
2906    coding systems such as Compound Text (used in X11's inter-client
2907    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2908    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2909    localized platforms), and all of these are variants of ISO2022.
2910
2911    In addition to the above, Emacs handles two more kinds of escape
2912    sequences: ISO6429's direction specification and Emacs' private
2913    sequence for specifying character composition.
2914
2915    ISO6429's direction specification takes the following form:
2916         o CSI ']'      -- end of the current direction
2917         o CSI '0' ']'  -- end of the current direction
2918         o CSI '1' ']'  -- start of left-to-right text
2919         o CSI '2' ']'  -- start of right-to-left text
2920    The control character CSI (0x9B: control sequence introducer) is
2921    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2922
2923    Character composition specification takes the following form:
2924         o ESC '0' -- start relative composition
2925         o ESC '1' -- end composition
2926         o ESC '2' -- start rule-base composition (*)
2927         o ESC '3' -- start relative composition with alternate chars  (**)
2928         o ESC '4' -- start rule-base composition with alternate chars  (**)
2929   Since these are not standard escape sequences of any ISO standard,
2930   the use of them with these meanings is restricted to Emacs only.
2931
2932   (*) This form is used only in Emacs 20.7 and older versions,
2933   but newer versions can safely decode it.
2934   (**) This form is used only in Emacs 21.1 and newer versions,
2935   and older versions can't decode it.
2936
2937   Here's a list of example usages of these composition escape
2938   sequences (categorized by `enum composition_method').
2939
2940   COMPOSITION_RELATIVE:
2941         ESC 0 CHAR [ CHAR ] ESC 1
2942   COMPOSITION_WITH_RULE:
2943         ESC 2 CHAR [ RULE CHAR ] ESC 1
2944   COMPOSITION_WITH_ALTCHARS:
2945         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2946   COMPOSITION_WITH_RULE_ALTCHARS:
2947         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2948
/* Table with one entry for each of the 256 possible byte values;
   presumably the ISO 2022 code class of each byte, filled in outside
   this part of the file -- confirm.  */
2949 static enum iso_code_class_type iso_code_class[256];
2950
/* Return nonzero if the charset whose id is ID is safe for CODING:
   ID must lie in the range 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets must not be 255, the value used to mark a
   charset as not safe.

   A negative ID (the value by which an unknown charset is reported)
   yields zero rather than indexing in front of the table.  ID is
   evaluated more than once; do not pass an expression with side
   effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2954
/* Compute the safe-charsets string of the ISO-2022 coding system whose
   attribute vector is ATTRS and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string has one byte for each charset id from 0 up to the
   largest id in the charset list.  The byte is the value registered
   for that charset (taken from the coding_attr_iso_request alist, or
   else from the 94- or 96-character default in coding_attr_iso_usage
   when that default is below 4), or 255 when nothing was registered.
   The values below 4 presumably name the graphic registers G0..G3 --
   confirm.

   For a coding system with CODING_ISO_FLAG_FULL_SUPPORT whose charset
   list is not Viso_2022_charset_list, the list is replaced by it and
   the string is recomputed.  Otherwise, if the slot already holds a
   string, nothing is done.  */
2955 static void
2956 setup_iso_safe_charsets (Lisp_Object attrs)
2957 {
2958   Lisp_Object charset_list, safe_charsets;
2959   Lisp_Object request;
2960   Lisp_Object reg_usage;
2961   Lisp_Object tail;
2962   EMACS_INT reg94, reg96;
2963   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2964   int max_charset_id;
2965
2966   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2967   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2968       && ! EQ (charset_list, Viso_2022_charset_list))
2969     {
           /* Switch to the full charset list and discard the old
              string so that it is rebuilt below.  */
2970       CODING_ATTR_CHARSET_LIST (attrs)
2971         = charset_list = Viso_2022_charset_list;
2972       ASET (attrs, coding_attr_safe_charsets, Qnil);
2973     }
2974
       /* Already computed.  */
2975   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2976     return;
2977
       /* The string is indexed by charset id, so size it by the largest
          id in the list.  */
2978   max_charset_id = 0;
2979   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2980     {
2981       int id = XINT (XCAR (tail));
2982       if (max_charset_id < id)
2983         max_charset_id = id;
2984     }
2985
       /* Start with every charset marked 255 (not safe).  */
2986   safe_charsets = make_uninit_string (max_charset_id + 1);
2987   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2988   request = AREF (attrs, coding_attr_iso_request);
2989   reg_usage = AREF (attrs, coding_attr_iso_usage);
2990   reg94 = XINT (XCAR (reg_usage));
2991   reg96 = XINT (XCDR (reg_usage));
2992
2993   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2994     {
2995       Lisp_Object id;
2996       Lisp_Object reg;
2997       struct charset *charset;
2998
2999       id = XCAR (tail);
3000       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit request for this charset takes precedence over
              the defaults for 94- and 96-character sets.  */
3001       reg = Fcdr (Fassq (id, request));
3002       if (! NILP (reg))
3003         SSET (safe_charsets, XINT (id), XINT (reg));
3004       else if (charset->iso_chars_96)
3005         {
3006           if (reg96 < 4)
3007             SSET (safe_charsets, XINT (id), reg96);
3008         }
3009       else
3010         {
3011           if (reg94 < 4)
3012             SSET (safe_charsets, XINT (id), reg94);
3013         }
3014     }
3015   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3016 }
3017
3018
3019 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3020    Check if a text is encoded in one of ISO-2022 based coding systems.
3021    If it is, return 1, else return 0.  */
3022
3023 static int
3024 detect_coding_iso_2022 (struct coding_system *coding,
3025                         struct coding_detection_info *detect_info)
3026 {
3027   const unsigned char *src = coding->source, *src_base = src;
3028   const unsigned char *src_end = coding->source + coding->src_bytes;
3029   int multibytep = coding->src_multibyte;
3030   int single_shifting = 0;
3031   int id;
3032   int c, c1;
3033   ptrdiff_t consumed_chars = 0;
3034   int i;
3035   int rejected = 0;
3036   int found = 0;
3037   int composition_count = -1;
3038
3039   detect_info->checked |= CATEGORY_MASK_ISO;
3040
3041   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3042     {
3043       struct coding_system *this = &(coding_categories[i]);
3044       Lisp_Object attrs, val;
3045
3046       if (this->id < 0)
3047         continue;
3048       attrs = CODING_ID_ATTRS (this->id);
3049       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3050           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3051         setup_iso_safe_charsets (attrs);
3052       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3053       this->max_charset_id = SCHARS (val) - 1;
3054       this->safe_charsets = SDATA (val);
3055     }
3056
3057   /* A coding system of this category is always ASCII compatible.  */
3058   src += coding->head_ascii;
3059
3060   while (rejected != CATEGORY_MASK_ISO)
3061     {
3062       src_base = src;
3063       ONE_MORE_BYTE (c);
3064       switch (c)
3065         {
3066         case ISO_CODE_ESC:
3067           if (inhibit_iso_escape_detection)
3068             break;
3069           single_shifting = 0;
3070           ONE_MORE_BYTE (c);
3071           if (c == 'N' || c == 'O')
3072             {
3073               /* ESC <Fe> for SS2 or SS3.  */
3074               single_shifting = 1;
3075               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3076             }
3077           else if (c == '1')
3078             {
3079               /* End of composition.  */
3080               if (composition_count < 0
3081                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3082                 /* Invalid */
3083                 break;
3084               composition_count = -1;
3085               found |= CATEGORY_MASK_ISO;
3086             }
3087           else if (c >= '0' && c <= '4')
3088             {
3089               /* ESC <Fp> for start/end composition.  */
3090               composition_count = 0;
3091             }
3092           else
3093             {
3094               if (c >= '(' && c <= '/')
3095                 {
3096                   /* Designation sequence for a charset of dimension 1.  */
3097                   ONE_MORE_BYTE (c1);
3098                   if (c1 < ' ' || c1 >= 0x80
3099                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3100                     /* Invalid designation sequence.  Just ignore.  */
3101                     break;
3102                 }
3103               else if (c == '$')
3104                 {
3105                   /* Designation sequence for a charset of dimension 2.  */
3106                   ONE_MORE_BYTE (c);
3107                   if (c >= '@' && c <= 'B')
3108                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3109                     id = iso_charset_table[1][0][c];
3110                   else if (c >= '(' && c <= '/')
3111                     {
3112                       ONE_MORE_BYTE (c1);
3113                       if (c1 < ' ' || c1 >= 0x80
3114                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3115                         /* Invalid designation sequence.  Just ignore.  */
3116                         break;
3117                     }
3118                   else
3119                     /* Invalid designation sequence.  Just ignore it.  */
3120                     break;
3121                 }
3122               else
3123                 {
3124                   /* Invalid escape sequence.  Just ignore it.  */
3125                   break;
3126                 }
3127
3128               /* We found a valid designation sequence for CHARSET.  */
3129               rejected |= CATEGORY_MASK_ISO_8BIT;
3130               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131                                   id))
3132                 found |= CATEGORY_MASK_ISO_7;
3133               else
3134                 rejected |= CATEGORY_MASK_ISO_7;
3135               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136                                   id))
3137                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138               else
3139                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141                                   id))
3142                 found |= CATEGORY_MASK_ISO_7_ELSE;
3143               else
3144                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146                                   id))
3147                 found |= CATEGORY_MASK_ISO_8_ELSE;
3148               else
3149                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150             }
3151           break;
3152
3153         case ISO_CODE_SO:
3154         case ISO_CODE_SI:
3155           /* Locking shift out/in.  */
3156           if (inhibit_iso_escape_detection)
3157             break;
3158           single_shifting = 0;
3159           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160           break;
3161
3162         case ISO_CODE_CSI:
3163           /* Control sequence introducer.  */
3164           single_shifting = 0;
3165           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166           found |= CATEGORY_MASK_ISO_8_ELSE;
3167           goto check_extra_latin;
3168
3169         case ISO_CODE_SS2:
3170         case ISO_CODE_SS3:
3171           /* Single shift.   */
3172           if (inhibit_iso_escape_detection)
3173             break;
3174           single_shifting = 0;
3175           rejected |= CATEGORY_MASK_ISO_7BIT;
3176           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177               & CODING_ISO_FLAG_SINGLE_SHIFT)
3178             {
3179               found |= CATEGORY_MASK_ISO_8_1;
3180               single_shifting = 1;
3181             }
3182           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183               & CODING_ISO_FLAG_SINGLE_SHIFT)
3184             {
3185               found |= CATEGORY_MASK_ISO_8_2;
3186               single_shifting = 1;
3187             }
3188           if (single_shifting)
3189             break;
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204
3205         default:
3206           if (c < 0)
3207             continue;
3208           if (c < 0x80)
3209             {
3210               if (composition_count >= 0)
3211                 composition_count++;
3212               single_shifting = 0;
3213               break;
3214             }
3215           if (c >= 0xA0)
3216             {
3217               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3218               found |= CATEGORY_MASK_ISO_8_1;
3219               /* Check the length of succeeding codes of the range
3220                  0xA0..0FF.  If the byte length is even, we include
3221                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3222                  only when we are not single shifting.  */
3223               if (! single_shifting
3224                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3225                 {
3226                   int len = 1;
3227                   while (src < src_end)
3228                     {
3229                       src_base = src;
3230                       ONE_MORE_BYTE (c);
3231                       if (c < 0xA0)
3232                         {
3233                           src = src_base;
3234                           break;
3235                         }
3236                       len++;
3237                     }
3238
3239                   if (len & 1 && src < src_end)
3240                     {
3241                       rejected |= CATEGORY_MASK_ISO_8_2;
3242                       if (composition_count >= 0)
3243                         composition_count += len;
3244                     }
3245                   else
3246                     {
3247                       found |= CATEGORY_MASK_ISO_8_2;
3248                       if (composition_count >= 0)
3249                         composition_count += len / 2;
3250                     }
3251                 }
3252               break;
3253             }
3254         }
3255     }
3256   detect_info->rejected |= CATEGORY_MASK_ISO;
3257   return 0;
3258
3259  no_more_source:
3260   detect_info->rejected |= rejected;
3261   detect_info->found |= (found & ~rejected);
3262   return 1;
3263 }
3264
3265
3266 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3267    escape sequence should be kept.

        REG selects the slot accessed through CODING_ISO_DESIGNATION;
        DIM, CHARS_96 and FINAL are the keys handed to ISO_CHARSET_TABLE.
        CHARS_96 must be an lvalue because this macro assigns to it, and
        the variable `coding' is taken from the enclosing scope.

        If FINAL is below '0' or not below 128, if the table lookup
        yields a negative id, or if the id fails SAFE_CHARSET_P, the slot
        is set to -2, CHARS_96 is set to -1, and the macro stops there
        (the `break' leaves the do-while).  Otherwise the id is stored in
        the slot, after replacing charset_jisx0201_roman by charset_ascii
        when CODING_ISO_FLAG_USE_ROMAN is set, or charset_jisx0208_1978
        by charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3268 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3269   do {                                                                  \
3270     int id, prev;                                                       \
3271                                                                         \
3272     if (final < '0' || final >= 128                                     \
3273         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3274         || !SAFE_CHARSET_P (coding, id))                                \
3275       {                                                                 \
3276         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3277         chars_96 = -1;                                                  \
3278         break;                                                          \
3279       }                                                                 \
3280     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3281     if (id == charset_jisx0201_roman)                                   \
3282       {                                                                 \
3283         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3284           id = charset_ascii;                                           \
3285       }                                                                 \
3286     else if (id == charset_jisx0208_1978)                               \
3287       {                                                                 \
3288         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3289           id = charset_jisx0208;                                        \
3290       }                                                                 \
3291     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3292     /* If there was an invalid designation to REG previously, and this  \
3293        designation is ASCII to REG, we should keep this designation     \
3294        sequence.  */                                                    \
3295     if (prev == -2 && id == charset_ascii)                              \
3296       chars_96 = -1;                                                    \
3297   } while (0)
3298
3299
3300 /* Handle these composition sequences (ALT: alternate char):
3301
3302    (1) relative composition: ESC 0 CHAR ... ESC 1
3303    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3306
3307    When the start sequence (ESC 0/2/3/4) is found, this annotation
3308    header is produced.
3309
3310         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3311
3312    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313    produced until the end sequence (ESC 1) is found:
3314
3315    (1) CHAR ... CHAR
3316    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3319
3320    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321    annotation header are updated as below:
3322
3323    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3324    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3325    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3326    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3327
3328    If an error is found while composing, the annotation header is
3329    changed to:
3330
3331         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3332
3333    and the sequence [ -2 DECODED-RULE ] is changed to the original
3334    byte sequence as below:
3335         o the original byte sequence is B: [ B -1 ]
3336         o the original byte sequence is B1 B2: [ B1 B2 ]
3337    and the sequence [ -1 -1 ] is changed to the original byte
3338    sequence:
3339         [ ESC '0' ]
3340 */
3341
3342 /* Decode a composition rule C1 and maybe one more byte from the
3343    source, and set RULE to the encoded composition rule.  If the rule
3344    is invalid, goto invalid_code.

        C1 is not a parameter: it is the variable `c1' of the enclosing
        scope, which must also provide the label `invalid_code' and
        whatever ONE_MORE_BYTE relies on.  RULE must be an assignable
        signed integer variable.

        With R = C1 - 32: R < 0 is invalid.  R < 81 is the old one-byte
        format, where R / 9 and R % 9 give GREF and NREF (a value of 4
        is replaced by 10).  R >= 81 is the new two-byte format: a second
        byte B is read, the pair (R - 81, B - 32) is checked with
        COMPOSITION_ENCODE_RULE_VALID, and 0x100 is added to the encoded
        rule so that it can be told apart from the old format.  */
3345
3346 #define DECODE_COMPOSITION_RULE(rule)                                   \
3347   do {                                                                  \
3348     rule = c1 - 32;                                                     \
3349     if (rule < 0)                                                       \
3350       goto invalid_code;                                                \
3351     if (rule < 81)              /* old format (before ver.21) */        \
3352       {                                                                 \
3353         int gref = (rule) / 9;                                          \
3354         int nref = (rule) % 9;                                          \
3355         if (gref == 4) gref = 10;                                       \
3356         if (nref == 4) nref = 10;                                       \
3357         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3358       }                                                                 \
3359     else                        /* new format (after ver.21) */         \
3360       {                                                                 \
3361         int b;                                                          \
3362                                                                         \
3363         ONE_MORE_BYTE (b);                                              \
3364         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3365           goto invalid_code;                                            \
3366         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3367         rule += 0x100;   /* Distinguish it from the old format.  */     \
3368       }                                                                 \
3369   } while (0)
3370
     /* Turn the decoded composition rule RULE back into its source byte
        form, writing it to charbuf[idx] and charbuf[idx + 1].  The
        variables `charbuf', `idx' and `new_chars' are taken from the
        enclosing scope.

        GREF and NREF are RULE % 0x100 divided by 12 and taken modulo 12.
        If RULE < 0x100 (old format), one byte 32 + GREF * 9 + NREF is
        stored (with 10 mapped back to 4), followed by -1 in the second
        slot, and new_chars is incremented by 1.  Otherwise (new format)
        the two bytes 32 + 81 + GREF and 32 + NREF are stored and
        new_chars is incremented by 2.

        Every read of RULE happens before the first store, so RULE may
        be one of the two slots being overwritten (finish_composition
        passes charbuf[idx + 1]).  */
3371 #define ENCODE_COMPOSITION_RULE(rule)                           \
3372   do {                                                          \
3373     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3374                                                                 \
3375     if (rule < 0x100)           /* old format */                \
3376       {                                                         \
3377         if (gref == 10) gref = 4;                               \
3378         if (nref == 10) nref = 4;                               \
3379         charbuf[idx] = 32 + gref * 9 + nref;                    \
3380         charbuf[idx + 1] = -1;                                  \
3381         new_chars++;                                            \
3382       }                                                         \
3383     else                                /* new format */        \
3384       {                                                         \
3385         charbuf[idx] = 32 + 81 + gref;                          \
3386         charbuf[idx + 1] = 32 + nref;                           \
3387         new_chars += 2;                                         \
3388       }                                                         \
3389   } while (0)
3390
3391 /* Finish the current composition as invalid.

        CHARBUF points just past the data stored for the composition so
        far; rewriting starts at CHARBUF[-CMP_STATUS->length].  The first
        five slots there are overwritten with ESC, the start character of
        the composition method ('0', '2', '3' or '4'), -2, 0 and -1.
        When the method is >= COMPOSITION_WITH_RULE, the remaining slots
        up to CHARBUF are scanned: a -2 marker and the rule after it are
        turned back into source bytes by ENCODE_COMPOSITION_RULE, and a
        -1 marker and the slot after it are replaced by ESC '0'.

        CMP_STATUS->state is set to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus what was added for the restored rule
        bytes and ESC '0' sequences; MAYBE_FINISH_COMPOSITION adds it to
        char_offset.  */
3392
3393 static int finish_composition (int *, struct composition_status *);
3394
3395 static int
3396 finish_composition (int *charbuf, struct composition_status *cmp_status)
3397 {
3398   int idx = - cmp_status->length;   /* negative index relative to CHARBUF */
3399   int new_chars;
3400
3401   /* Recover the original ESC sequence; the -2, 0, -1 that follow fill
            the rest of the five leading slots.  */
3402   charbuf[idx++] = ISO_CODE_ESC;
3403   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3404                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3405                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3406                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3407                     : '4');
3408   charbuf[idx++] = -2;
3409   charbuf[idx++] = 0;
3410   charbuf[idx++] = -1;
3411   new_chars = cmp_status->nchars;
3412   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3413     for (; idx < 0; idx++)
3414       {
3415         int elt = charbuf[idx];
3416
3417         if (elt == -2)
3418           {
                 /* The macro reads the rule from charbuf[idx + 1], then
                    overwrites charbuf[idx] and charbuf[idx + 1] and
                    updates new_chars; it uses these locals by name.  */
3419             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3420             idx++;   /* skip the second slot the macro just wrote */
3421           }
3422         else if (elt == -1)
3423           {
                 /* Replace this marker and the slot after it by ESC '0'.  */
3424             charbuf[idx++] = ISO_CODE_ESC;
3425             charbuf[idx] = '0';
3426             new_chars += 2;
3427           }
3428       }
3429   cmp_status->state = COMPOSING_NO;
3430   return new_chars;
3431 }
3432
3433 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which ends the composition as invalid) and
        add its return value to char_offset.  Uses `cmp_status',
        `charbuf' and `char_offset' from the enclosing scope.  */
3434 #define MAYBE_FINISH_COMPOSITION()                              \
3435   do {                                                          \
3436     if (cmp_status->state != COMPOSING_NO)                      \
3437       char_offset += finish_composition (charbuf, cmp_status);  \
3438   } while (0)
3439
3440 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3441
3442    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3443    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3444    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3445    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3446
        C1 is the byte following ESC.  Uses `cmp_status', `charbuf',
        `coding' and (through MAYBE_FINISH_COMPOSITION) `char_offset'
        from the enclosing scope.

        If C1 is '0' while the state is COMPOSING_COMPONENT_CHAR with
        method COMPOSITION_WITH_ALTCHARS, or COMPOSING_COMPONENT_RULE
        with method COMPOSITION_WITH_RULE_ALTCHARS, this ESC 0 is the one
        inside an ESC 3 / ESC 4 composition: [ -1 -1 ] is stored, the
        state becomes COMPOSING_CHAR and the length grows by 2.

        Otherwise any composition in progress is finished as invalid and
        a new one is started: method is chosen from C1, state becomes
        COMPOSING_CHAR for '0' and '2' and COMPOSING_COMPONENT_CHAR
        otherwise, length is reset to MAX_ANNOTATION_LENGTH, nchars and
        ncomps to 0, and coding->annotated is set.

3447    Produce this annotation sequence now (through ADD_COMPOSITION_DATA,
        which is defined elsewhere):
3448
3449    [ -LENGTH CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): this comment used to read "-LENGTH(==-4) ...
        NCHARS(==0) METHOD".  ADD_COMPOSITION_DATA is called with
        (0, 0, method) and finish_composition rewrites five leading
        slots, so the five-field form is presumably the right one --
        confirm against ADD_COMPOSITION_DATA and MAX_ANNOTATION_LENGTH.
3450 */
3451
3452 #define DECODE_COMPOSITION_START(c1)                                       \
3453   do {                                                                     \
3454     if (c1 == '0'                                                          \
3455         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3456              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3457             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3458                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3459       {                                                                    \
3460         *charbuf++ = -1;                                                   \
3461         *charbuf++= -1;                                                    \
3462         cmp_status->state = COMPOSING_CHAR;                                \
3463         cmp_status->length += 2;                                           \
3464       }                                                                    \
3465     else                                                                   \
3466       {                                                                    \
3467         MAYBE_FINISH_COMPOSITION ();                                       \
3468         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3469                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3470                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3471                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3472         cmp_status->state                                                  \
3473           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3474         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3475         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3476         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3477         coding->annotated = 1;                                             \
3478       }                                                                    \
3479   } while (0)
3480
3481
3482 /* Handle composition end sequence ESC 1.

        Uses `cmp_status', `charbuf' and `char_offset' from the enclosing
        scope, which must also provide the label `invalid_code'.

        The end sequence is rejected when nchars is 0, or when
        (state == COMPOSING_CHAR) and (method == COMPOSITION_WITH_RULE)
        are both true or both false.  In that case
        MAYBE_FINISH_COMPOSITION is invoked and control jumps to
        invalid_code.  (STORE_COMPOSITION_CHAR does state++ after each
        char of a COMPOSITION_WITH_RULE composition, so presumably a
        well-formed one is not in COMPOSING_CHAR at this point.)

        Otherwise the annotation header at charbuf[-length] is completed:
        its first slot (the negated length) is decreased by ncomps + 2
        for COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, its third slot is set to nchars,
        char_offset is advanced by nchars, and the state is set to
        COMPOSING_NO.  */
3483
3484 #define DECODE_COMPOSITION_END()                                        \
3485   do {                                                                  \
3486     if (cmp_status->nchars == 0                                         \
3487         || ((cmp_status->state == COMPOSING_CHAR)                       \
3488             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3489       {                                                                 \
3490         MAYBE_FINISH_COMPOSITION ();                                    \
3491         goto invalid_code;                                              \
3492       }                                                                 \
3493     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3494       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3495     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3496       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3497     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3498     char_offset += cmp_status->nchars;                                  \
3499     cmp_status->state = COMPOSING_NO;                                   \
3500   } while (0)
3501
3502 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The marker -2 and then RULE are appended at `charbuf' (which is
        advanced by two), cmp_status->length grows by 2, and
        cmp_status->state is decremented.  The decrement relies on the
        order of the state enumeration, which is defined elsewhere;
        presumably it moves from a *_RULE state back to the matching
        *_CHAR state -- confirm against the enum.  Uses `charbuf' and
        `cmp_status' from the enclosing scope.  */
3503
3504 #define STORE_COMPOSITION_RULE(rule)    \
3505   do {                                  \
3506     *charbuf++ = -2;                    \
3507     *charbuf++ = rule;                  \
3508     cmp_status->length += 2;            \
3509     cmp_status->state--;                \
3510   } while (0)
3511
3512 /* Store a composed char or a component char C in charbuf, and update
3513    cmp_status.

        C is appended at `charbuf' and cmp_status->length grows by 1.
        The char is counted in nchars when the state is COMPOSING_CHAR
        and in ncomps otherwise.  Finally the state is incremented when
        the method is COMPOSITION_WITH_RULE, or when it is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  The increment relies on the order of
        the state enumeration, which is defined elsewhere; presumably it
        moves to the state in which a rule is expected next -- confirm
        against the enum.  Uses `charbuf' and `cmp_status' from the
        enclosing scope.  */
3514
3515 #define STORE_COMPOSITION_CHAR(c)                                       \
3516   do {                                                                  \
3517     *charbuf++ = (c);                                                   \
3518     cmp_status->length++;                                               \
3519     if (cmp_status->state == COMPOSING_CHAR)                            \
3520       cmp_status->nchars++;                                             \
3521     else                                                                \
3522       cmp_status->ncomps++;                                             \
3523     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3524         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3525             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3526       cmp_status->state++;                                              \
3527   } while (0)
3528
3529
3530 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3531
3532 static void
3533 decode_coding_iso_2022 (struct coding_system *coding)
3534 {
3535   const unsigned char *src = coding->source + coding->consumed;
3536   const unsigned char *src_end = coding->source + coding->src_bytes;
3537   const unsigned char *src_base;
3538   int *charbuf = coding->charbuf + coding->charbuf_used;
3539   /* We may produce two annotations (charset and composition) in one
3540      loop and one more charset annotation at the end.  */
3541   int *charbuf_end
3542     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3543   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3544   int multibytep = coding->src_multibyte;
3545   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3546   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3547   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3548   int charset_id_2, charset_id_3;
3549   struct charset *charset;
3550   int c;
3551   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3552   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3553   ptrdiff_t char_offset = coding->produced_char;
3554   ptrdiff_t last_offset = char_offset;
3555   int last_id = charset_ascii;
3556   int eol_dos =
3557     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3558   int byte_after_cr = -1;
3559   int i;
3560
3561   setup_iso_safe_charsets (attrs);
3562   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3563
3564   if (cmp_status->state != COMPOSING_NO)
3565     {
3566       if (charbuf_end - charbuf < cmp_status->length)
3567         abort ();
3568       for (i = 0; i < cmp_status->length; i++)
3569         *charbuf++ = cmp_status->carryover[i];
3570       coding->annotated = 1;
3571     }
3572
3573   while (1)
3574     {
3575       int c1, c2, c3;
3576
3577       src_base = src;
3578       consumed_chars_base = consumed_chars;
3579
3580       if (charbuf >= charbuf_end)
3581         {
3582           if (byte_after_cr >= 0)
3583             src_base--;
3584           break;
3585         }
3586
3587       if (byte_after_cr >= 0)
3588         c1 = byte_after_cr, byte_after_cr = -1;
3589       else
3590         ONE_MORE_BYTE (c1);
3591       if (c1 < 0)
3592         goto invalid_code;
3593
3594       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3595         {
3596           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3597           char_offset++;
3598           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3599           continue;
3600         }
3601
3602       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3603         {
3604           if (c1 == ISO_CODE_ESC)
3605             {
3606               if (src + 1 >= src_end)
3607                 goto no_more_source;
3608               *charbuf++ = ISO_CODE_ESC;
3609               char_offset++;
3610               if (src[0] == '%' && src[1] == '@')
3611                 {
3612                   src += 2;
3613                   consumed_chars += 2;
3614                   char_offset += 2;
3615                   /* We are sure charbuf can contain two more chars. */
3616                   *charbuf++ = '%';
3617                   *charbuf++ = '@';
3618                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3619                 }
3620             }
3621           else
3622             {
3623               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3624               char_offset++;
3625             }
3626           continue;
3627         }
3628
3629       if ((cmp_status->state == COMPOSING_RULE
3630            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3631           && c1 != ISO_CODE_ESC)
3632         {
3633           int rule;
3634
3635           DECODE_COMPOSITION_RULE (rule);
3636           STORE_COMPOSITION_RULE (rule);
3637           continue;
3638         }
3639
3640       /* We produce at most one character.  */
3641       switch (iso_code_class [c1])
3642         {
3643         case ISO_0x20_or_0x7F:
3644           if (charset_id_0 < 0
3645               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3646             /* This is SPACE or DEL.  */
3647             charset = CHARSET_FROM_ID (charset_ascii);
3648           else
3649             charset = CHARSET_FROM_ID (charset_id_0);
3650           break;
3651
3652         case ISO_graphic_plane_0:
3653           if (charset_id_0 < 0)
3654             charset = CHARSET_FROM_ID (charset_ascii);
3655           else
3656             charset = CHARSET_FROM_ID (charset_id_0);
3657           break;
3658
3659         case ISO_0xA0_or_0xFF:
3660           if (charset_id_1 < 0
3661               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3662               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3663             goto invalid_code;
3664           /* This is a graphic character, we fall down ... */
3665
3666         case ISO_graphic_plane_1:
3667           if (charset_id_1 < 0)
3668             goto invalid_code;
3669           charset = CHARSET_FROM_ID (charset_id_1);
3670           break;
3671
3672         case ISO_control_0:
3673           if (eol_dos && c1 == '\r')
3674             ONE_MORE_BYTE (byte_after_cr);
3675           MAYBE_FINISH_COMPOSITION ();
3676           charset = CHARSET_FROM_ID (charset_ascii);
3677           break;
3678
3679         case ISO_control_1:
3680           goto invalid_code;
3681
3682         case ISO_shift_out:
3683           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3684               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3685             goto invalid_code;
3686           CODING_ISO_INVOCATION (coding, 0) = 1;
3687           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3688           continue;
3689
3690         case ISO_shift_in:
3691           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3692             goto invalid_code;
3693           CODING_ISO_INVOCATION (coding, 0) = 0;
3694           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695           continue;
3696
3697         case ISO_single_shift_2_7:
3698           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3699             goto invalid_code;
3700         case ISO_single_shift_2:
3701           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702             goto invalid_code;
3703           /* SS2 is handled as an escape sequence of ESC 'N' */
3704           c1 = 'N';
3705           goto label_escape_sequence;
3706
3707         case ISO_single_shift_3:
3708           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3709             goto invalid_code;
3710           /* SS2 is handled as an escape sequence of ESC 'O' */
3711           c1 = 'O';
3712           goto label_escape_sequence;
3713
3714         case ISO_control_sequence_introducer:
3715           /* CSI is handled as an escape sequence of ESC '[' ...  */
3716           c1 = '[';
3717           goto label_escape_sequence;
3718
3719         case ISO_escape:
3720           ONE_MORE_BYTE (c1);
3721         label_escape_sequence:
3722           /* Escape sequences handled here are invocation,
3723              designation, direction specification, and character
3724              composition specification.  */
3725           switch (c1)
3726             {
3727             case '&':           /* revision of following character set */
3728               ONE_MORE_BYTE (c1);
3729               if (!(c1 >= '@' && c1 <= '~'))
3730                 goto invalid_code;
3731               ONE_MORE_BYTE (c1);
3732               if (c1 != ISO_CODE_ESC)
3733                 goto invalid_code;
3734               ONE_MORE_BYTE (c1);
3735               goto label_escape_sequence;
3736
3737             case '$':           /* designation of 2-byte character set */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3739                 goto invalid_code;
3740               {
3741                 int reg, chars96;
3742
3743                 ONE_MORE_BYTE (c1);
3744                 if (c1 >= '@' && c1 <= 'B')
3745                   {     /* designation of JISX0208.1978, GB2312.1980,
3746                            or JISX0208.1980 */
3747                     reg = 0, chars96 = 0;
3748                   }
3749                 else if (c1 >= 0x28 && c1 <= 0x2B)
3750                   { /* designation of DIMENSION2_CHARS94 character set */
3751                     reg = c1 - 0x28, chars96 = 0;
3752                     ONE_MORE_BYTE (c1);
3753                   }
3754                 else if (c1 >= 0x2C && c1 <= 0x2F)
3755                   { /* designation of DIMENSION2_CHARS96 character set */
3756                     reg = c1 - 0x2C, chars96 = 1;
3757                     ONE_MORE_BYTE (c1);
3758                   }
3759                 else
3760                   goto invalid_code;
3761                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3762                 /* We must update these variables now.  */
3763                 if (reg == 0)
3764                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765                 else if (reg == 1)
3766                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3767                 if (chars96 < 0)
3768                   goto invalid_code;
3769               }
3770               continue;
3771
3772             case 'n':           /* invocation of locking-shift-2 */
3773               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3774                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3775                 goto invalid_code;
3776               CODING_ISO_INVOCATION (coding, 0) = 2;
3777               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3778               continue;
3779
3780             case 'o':           /* invocation of locking-shift-3 */
3781               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3782                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3783                 goto invalid_code;
3784               CODING_ISO_INVOCATION (coding, 0) = 3;
3785               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3786               continue;
3787
3788             case 'N':           /* invocation of single-shift-2 */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3790                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3791                 goto invalid_code;
3792               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3793               if (charset_id_2 < 0)
3794                 charset = CHARSET_FROM_ID (charset_ascii);
3795               else
3796                 charset = CHARSET_FROM_ID (charset_id_2);
3797               ONE_MORE_BYTE (c1);
3798               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3799                 goto invalid_code;
3800               break;
3801
3802             case 'O':           /* invocation of single-shift-3 */
3803               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3804                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3805                 goto invalid_code;
3806               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3807               if (charset_id_3 < 0)
3808                 charset = CHARSET_FROM_ID (charset_ascii);
3809               else
3810                 charset = CHARSET_FROM_ID (charset_id_3);
3811               ONE_MORE_BYTE (c1);
3812               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3813                 goto invalid_code;
3814               break;
3815
3816             case '0': case '2': case '3': case '4': /* start composition */
3817               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3818                 goto invalid_code;
3819               if (last_id != charset_ascii)
3820                 {
3821                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3822                   last_id = charset_ascii;
3823                   last_offset = char_offset;
3824                 }
3825               DECODE_COMPOSITION_START (c1);
3826               continue;
3827
3828             case '1':           /* end composition */
3829               if (cmp_status->state == COMPOSING_NO)
3830                 goto invalid_code;
3831               DECODE_COMPOSITION_END ();
3832               continue;
3833
3834             case '[':           /* specification of direction */
3835               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3836                 goto invalid_code;
3837               /* For the moment, nested direction is not supported.
3838                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3839                  left-to-right, and nonzero means right-to-left.  */
3840               ONE_MORE_BYTE (c1);
3841               switch (c1)
3842                 {
3843                 case ']':       /* end of the current direction */
3844                   coding->mode &= ~CODING_MODE_DIRECTION;
3845
3846                 case '0':       /* end of the current direction */
3847                 case '1':       /* start of left-to-right direction */
3848                   ONE_MORE_BYTE (c1);
3849                   if (c1 == ']')
3850                     coding->mode &= ~CODING_MODE_DIRECTION;
3851                   else
3852                     goto invalid_code;
3853                   break;
3854
3855                 case '2':       /* start of right-to-left direction */
3856                   ONE_MORE_BYTE (c1);
3857                   if (c1 == ']')
3858                     coding->mode |= CODING_MODE_DIRECTION;
3859                   else
3860                     goto invalid_code;
3861                   break;
3862
3863                 default:
3864                   goto invalid_code;
3865                 }
3866               continue;
3867
3868             case '%':
3869               ONE_MORE_BYTE (c1);
3870               if (c1 == '/')
3871                 {
3872                   /* CTEXT extended segment:
3873                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3874                      We keep these bytes as is for the moment.
3875                      They may be decoded by post-read-conversion.  */
3876                   int dim, M, L;
3877                   int size;
3878
3879                   ONE_MORE_BYTE (dim);
3880                   if (dim < '0' || dim > '4')
3881                     goto invalid_code;
3882                   ONE_MORE_BYTE (M);
3883                   if (M < 128)
3884                     goto invalid_code;
3885                   ONE_MORE_BYTE (L);
3886                   if (L < 128)
3887                     goto invalid_code;
3888                   size = ((M - 128) * 128) + (L - 128);
3889                   if (charbuf + 6 > charbuf_end)
3890                     goto break_loop;
3891                   *charbuf++ = ISO_CODE_ESC;
3892                   *charbuf++ = '%';
3893                   *charbuf++ = '/';
3894                   *charbuf++ = dim;
3895                   *charbuf++ = BYTE8_TO_CHAR (M);
3896                   *charbuf++ = BYTE8_TO_CHAR (L);
3897                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3898                 }
3899               else if (c1 == 'G')
3900                 {
3901                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3902                      ESC % G --UTF-8-BYTES-- ESC % @
3903                      We keep these bytes as is for the moment.
3904                      They may be decoded by post-read-conversion.  */
3905                   if (charbuf + 3 > charbuf_end)
3906                     goto break_loop;
3907                   *charbuf++ = ISO_CODE_ESC;
3908                   *charbuf++ = '%';
3909                   *charbuf++ = 'G';
3910                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3911                 }
3912               else
3913                 goto invalid_code;
3914               continue;
3915               break;
3916
3917             default:
3918               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3919                 goto invalid_code;
3920               {
3921                 int reg, chars96;
3922
3923                 if (c1 >= 0x28 && c1 <= 0x2B)
3924                   { /* designation of DIMENSION1_CHARS94 character set */
3925                     reg = c1 - 0x28, chars96 = 0;
3926                     ONE_MORE_BYTE (c1);
3927                   }
3928                 else if (c1 >= 0x2C && c1 <= 0x2F)
3929                   { /* designation of DIMENSION1_CHARS96 character set */
3930                     reg = c1 - 0x2C, chars96 = 1;
3931                     ONE_MORE_BYTE (c1);
3932                   }
3933                 else
3934                   goto invalid_code;
3935                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3936                 /* We must update these variables now.  */
3937                 if (reg == 0)
3938                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3939                 else if (reg == 1)
3940                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3941                 if (chars96 < 0)
3942                   goto invalid_code;
3943               }
3944               continue;
3945             }
3946           break;
3947
3948         default:
3949           abort ();
3950         }
3951
3952       if (cmp_status->state == COMPOSING_NO
3953           && charset->id != charset_ascii
3954           && last_id != charset->id)
3955         {
3956           if (last_id != charset_ascii)
3957             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3958           last_id = charset->id;
3959           last_offset = char_offset;
3960         }
3961
3962       /* Now we know CHARSET and 1st position code C1 of a character.
3963          Produce a decoded character while getting 2nd and 3rd
3964          position codes C2, C3 if necessary.  */
3965       if (CHARSET_DIMENSION (charset) > 1)
3966         {
3967           ONE_MORE_BYTE (c2);
3968           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3969               || ((c1 & 0x80) != (c2 & 0x80)))
3970             /* C2 is not in a valid range.  */
3971             goto invalid_code;
3972           if (CHARSET_DIMENSION (charset) == 2)
3973             c1 = (c1 << 8) | c2;
3974           else
3975             {
3976               ONE_MORE_BYTE (c3);
3977               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3978                   || ((c1 & 0x80) != (c3 & 0x80)))
3979                 /* C3 is not in a valid range.  */
3980                 goto invalid_code;
3981               c1 = (c1 << 16) | (c2 << 8) | c2;
3982             }
3983         }
3984       c1 &= 0x7F7F7F;
3985       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3986       if (c < 0)
3987         {
3988           MAYBE_FINISH_COMPOSITION ();
3989           for (; src_base < src; src_base++, char_offset++)
3990             {
3991               if (ASCII_BYTE_P (*src_base))
3992                 *charbuf++ = *src_base;
3993               else
3994                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3995             }
3996         }
3997       else if (cmp_status->state == COMPOSING_NO)
3998         {
3999           *charbuf++ = c;
4000           char_offset++;
4001         }
4002       else if ((cmp_status->state == COMPOSING_CHAR
4003                 ? cmp_status->nchars
4004                 : cmp_status->ncomps)
4005                >= MAX_COMPOSITION_COMPONENTS)
4006         {
4007           /* Too long composition.  */
4008           MAYBE_FINISH_COMPOSITION ();
4009           *charbuf++ = c;
4010           char_offset++;
4011         }
4012       else
4013         STORE_COMPOSITION_CHAR (c);
4014       continue;
4015
4016     invalid_code:
4017       MAYBE_FINISH_COMPOSITION ();
4018       src = src_base;
4019       consumed_chars = consumed_chars_base;
4020       ONE_MORE_BYTE (c);
4021       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4022       char_offset++;
4023       coding->errors++;
4024       continue;
4025
4026     break_loop:
4027       break;
4028     }
4029
4030  no_more_source:
4031   if (cmp_status->state != COMPOSING_NO)
4032     {
4033       if (coding->mode & CODING_MODE_LAST_BLOCK)
4034         MAYBE_FINISH_COMPOSITION ();
4035       else
4036         {
4037           charbuf -= cmp_status->length;
4038           for (i = 0; i < cmp_status->length; i++)
4039             cmp_status->carryover[i] = charbuf[i];
4040         }
4041     }
4042   else if (last_id != charset_ascii)
4043     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4044   coding->consumed_char += consumed_chars_base;
4045   coding->consumed = src_base - coding->source;
4046   coding->charbuf_used = charbuf - coding->charbuf;
4047 }
4048
4049
4050 /* ISO2022 encoding stuff.  */
4051
4052 /*
4053    Just saying "ISO2022" is not enough for encoding; we have to
4054    specify more details.  In Emacs, each coding system of ISO2022
4055    variant has the following specifications:
4056         1. Initial designation to G0 thru G3.
4057         2. Allows short-form designation?
4058         3. ASCII should be designated to G0 before control characters?
4059         4. ASCII should be designated to G0 at end of line?
4060         5. 7-bit environment or 8-bit environment?
4061         6. Use locking-shift?
4062         7. Use single-shift?
4063    And the following two are only for Japanese:
4064         8. Use ASCII in place of JIS0201-1976-Roman?
4065         9. Use JISX0208-1983 in place of JISX0208-1978?
4066    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4067    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4068    details.
4069 */
4070
4071 /* Produce codes (an escape sequence) for designating CHARSET to graphic register REG
4072    at DST, increment DST, and record the designation in CODING.  The short form (no
4073    intermediate byte) is produced only when REG is 0, CHARSET is a 94-char set not of
4074    dimension 1, its <final-char> is '@', 'A', or 'B', and CODING_ISO_FLAG_LONG_FORM is off.  */
4075 /* REG indexes a 4-character string below, so it must be 0..3.  */
4076 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4077   do {                                                                  \
4078     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4079     const char *intermediate_char_94 = "()*+"; /* indexed by REG */     \
4080     const char *intermediate_char_96 = ",-./"; /* indexed by REG */     \
4081     int revision = -1; /* negative: no revision sequence */             \
4082                                                                         \
4083     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4084       revision = CHARSET_ISO_REVISION (charset);                        \
4085     /* Announce the revision as ESC & <'@' + revision>.  */             \
4086     if (revision >= 0)                                                  \
4087       {                                                                 \
4088         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4089         EMIT_ONE_BYTE ('@' + revision);                                 \
4090       }                                                                 \
4091     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4092     if (CHARSET_DIMENSION (charset) == 1)                               \
4093       {                                                                 \
4094         int b;                                                          \
4095         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4096           b = intermediate_char_94[reg];                                \
4097         else                                                            \
4098           b = intermediate_char_96[reg];                                \
4099         EMIT_ONE_ASCII_BYTE (b);                                        \
4100       }                                                                 \
4101     else                                                                \
4102       {                                                                 \
4103         EMIT_ONE_ASCII_BYTE ('$'); /* dimension is not 1 */             \
4104         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4105           {                                                             \
4106             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4107                 || reg != 0                                             \
4108                 || final_char < '@' || final_char > 'B')                \
4109               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4110           }                                                             \
4111         else                                                            \
4112           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4113       }                                                                 \
4114     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4115     /* Remember that REG now holds CHARSET.  */                         \
4116     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4117   } while (0)
4118
4119
4120 /* The following two macros produce codes (control character or escape
4121    sequence) for ISO2022 single-shift functions (single-shift-2 and
4122    single-shift-3).  The escape sequence is used when CODING is 7-bit.  */
4123 /* Both set CODING_ISO_SINGLE_SHIFTING, which the macros emitting the next character clear.  */
4124 #define ENCODE_SINGLE_SHIFT_2                                           \
4125   do {                                                                  \
4126     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4127       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4128     else                                                                \
4129       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4130     CODING_ISO_SINGLE_SHIFTING (coding) = 1; /* shift is pending */     \
4131   } while (0)
4132
4133
4134 #define ENCODE_SINGLE_SHIFT_3                                           \
4135   do {                                                                  \
4136     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4137       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4138     else                                                                \
4139       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4140     CODING_ISO_SINGLE_SHIFTING (coding) = 1; /* shift is pending */     \
4141   } while (0)
4142
4143
4144 /* The following four macros produce codes (control character or
4145    escape sequence) for ISO2022 locking-shift functions (shift-in,
4146    shift-out, locking-shift-2, and locking-shift-3).  */
4147 /* Each records in CODING the graphic register (0..3) it invokes to graphic plane 0.  */
4148 #define ENCODE_SHIFT_IN                                 \
4149   do {                                                  \
4150     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4151     CODING_ISO_INVOCATION (coding, 0) = 0; /* G0 */     \
4152   } while (0)
4153
4154
4155 #define ENCODE_SHIFT_OUT                                \
4156   do {                                                  \
4157     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4158     CODING_ISO_INVOCATION (coding, 0) = 1; /* G1 */     \
4159   } while (0)
4160
4161
4162 #define ENCODE_LOCKING_SHIFT_2                          \
4163   do {                                                  \
4164     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4165     CODING_ISO_INVOCATION (coding, 0) = 2; /* G2 */     \
4166   } while (0)
4167
4168 /* Locking-shift-3 is ESC 'o' (ESC 'n' is locking-shift-2); G3 becomes invoked to graphic plane 0.  */
4169 #define ENCODE_LOCKING_SHIFT_3                          \
4170   do {                                                  \
4171     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4172     CODING_ISO_INVOCATION (coding, 0) = 3; /* G3 */     \
4173   } while (0)
4174
4175
4176 /* Produce codes for a DIMENSION1 character whose character set is
4177    CHARSET and whose position-code is C1.  Designation and invocation
4178    sequences are also produced in advance if necessary.  */
4179 /* CHARSET must be an lvalue (it may be reassigned); needs `coding', `dst', `produced_chars' in scope.  */
4180 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4181   do {                                                                  \
4182     int id = CHARSET_ID (charset);                                      \
4183     /* If CODING so requests, encode ASCII as JISX0201-Roman.  */       \
4184     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4185         && id == charset_ascii)                                         \
4186       {                                                                 \
4187         id = charset_jisx0201_roman;                                    \
4188         charset = CHARSET_FROM_ID (id);                                 \
4189       }                                                                 \
4190     /* Each BREAK leaves the loop: the character has been produced.  */ \
4191     if (CODING_ISO_SINGLE_SHIFTING (coding)) /* after SS2 or SS3 */     \
4192       {                                                                 \
4193         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4194           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4195         else                                                            \
4196           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4197         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4198         break;                                                          \
4199       }                                                                 \
4200     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4201       {                                                                 \
4202         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4203         break;                                                          \
4204       }                                                                 \
4205     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4206       {                                                                 \
4207         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4208         break;                                                          \
4209       }                                                                 \
4210     else                                                                \
4211       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4212          must invoke it, or, at first, designate it to some graphic     \
4213          register.  Then repeat the loop to actually produce the        \
4214          character.  */                                                 \
4215       dst = encode_invocation_designation (charset, coding, dst,        \
4216                                            &produced_chars);            \
4217   } while (1)
4218
4219
4220 /* Produce codes for a DIMENSION2 character whose character set is
4221    CHARSET and whose position-codes are C1 and C2.  Designation and
4222    invocation codes are also produced in advance if necessary.  */
4223 /* CHARSET must be an lvalue (it may be reassigned); needs `coding', `dst', `produced_chars' in scope.  */
4224 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4225   do {                                                                  \
4226     int id = CHARSET_ID (charset);                                      \
4227     /* If CODING so requests, encode JISX0208 as JISX0208-1978.  */     \
4228     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4229         && id == charset_jisx0208)                                      \
4230       {                                                                 \
4231         id = charset_jisx0208_1978;                                     \
4232         charset = CHARSET_FROM_ID (id);                                 \
4233       }                                                                 \
4234     /* Each BREAK leaves the loop: the character has been produced.  */ \
4235     if (CODING_ISO_SINGLE_SHIFTING (coding)) /* after SS2 or SS3 */     \
4236       {                                                                 \
4237         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4238           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4239         else                                                            \
4240           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4241         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4242         break;                                                          \
4243       }                                                                 \
4244     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4245       {                                                                 \
4246         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4247         break;                                                          \
4248       }                                                                 \
4249     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4250       {                                                                 \
4251         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4252         break;                                                          \
4253       }                                                                 \
4254     else                                                                \
4255       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4256          must invoke it, or, at first, designate it to some graphic     \
4257          register.  Then repeat the loop to actually produce the        \
4258          character.  */                                                 \
4259       dst = encode_invocation_designation (charset, coding, dst,        \
4260                                            &produced_chars);            \
4261   } while (1)
4262
4263 /* Encode character C of CHARSET: obtain its code via CODING_ENCODE_CHAR, then hand it to the DIMENSION1 or DIMENSION2 macro.  */
4264 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4265   do {                                                                     \
4266     unsigned code;                                                         \
4267     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4268     /* Every dimension other than 1 takes the two-byte path.  */           \
4269     if (CHARSET_DIMENSION (charset) == 1)                                  \
4270       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4271     else                                                                   \
4272       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4273   } while (0)
4274
4275
4276 /* Make CHARSET usable by the codes that follow.  At a place pointed by
4277    DST, produce its designation sequence if no graphic register holds
4278    it, then a shift code if that register is on neither graphic plane.
4279    The element `spec.iso_2022' of *CODING is updated.  Return new DST.  */
4280
4281 static unsigned char *
4282 encode_invocation_designation (struct charset *charset,
4283                                struct coding_system *coding,
4284                                unsigned char *dst, ptrdiff_t *p_nchars)
4285 {
4286   /* MULTIBYTEP and PRODUCED_CHARS are not referenced directly below;
4287      presumably the emitting macros use them by name.  */
4288   int multibytep = coding->dst_multibyte;
4289   ptrdiff_t produced_chars = *p_nchars;
4290   int cs_id = CHARSET_ID (charset);
4291   int reg = 0;                  /* graphic register number */
4292
4293   /* Search the graphic registers for one that CHARSET is designated
4294      to; REG is 4 afterwards if there is none.  */
4295   while (reg < 4 && CODING_ISO_DESIGNATION (coding, reg) != cs_id)
4296     reg++;
4297
4298   if (reg == 4)
4299     {
4300       /* No register holds CHARSET.  Designate it to the register it
4301          requests or, when it requests none, to graphic register 0.  */
4302       int requested = CODING_ISO_REQUEST (coding, cs_id);
4303
4304       reg = requested < 0 ? 0 : requested;
4305       ENCODE_DESIGNATION (charset, reg, coding);
4306     }
4307
4308   if (CODING_ISO_INVOCATION (coding, 0) != reg
4309       && CODING_ISO_INVOCATION (coding, 1) != reg)
4310     {
4311       /* REG is on neither graphic plane.  Registers 0 and 1 are
4312          invoked with shift-in and shift-out; registers 2 and 3 with a
4313          single shift if CODING permits one, else with a locking shift.  */
4314       int single_shift
4315         = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) != 0;
4316
4317       if (reg == 0)
4318         ENCODE_SHIFT_IN;
4319       else if (reg == 1)
4320         ENCODE_SHIFT_OUT;
4321       else if (reg == 2)
4322         {
4323           if (single_shift)
4324             ENCODE_SINGLE_SHIFT_2;
4325           else
4326             ENCODE_LOCKING_SHIFT_2;
4327         }
4328       else if (reg == 3)
4329         {
4330           if (single_shift)
4331             ENCODE_SINGLE_SHIFT_3;
4332           else
4333             ENCODE_LOCKING_SHIFT_3;
4334         }
4335       /* Any other register number produces no code.  */
4336     }
4337
4338   /* Pass the character count back to the caller.  */
4339   *p_nchars = produced_chars;
4340   return dst;
4341 }
4342
4343
4344 /* Produce codes for designation and invocation to reset the graphic planes and registers
4345    to initial state.  Only plane 0 is reset, and only registers with an initial charset (>= 0).  */
4346 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4347   do {                                                                  \
4348     int reg;                                                            \
4349     struct charset *charset;                                            \
4350     /* Put register 0 back on graphic plane 0 if it is not there.  */   \
4351     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4352       ENCODE_SHIFT_IN;                                                  \
4353     for (reg = 0; reg < 4; reg++)                                       \
4354       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4355           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4356               != CODING_ISO_INITIAL (coding, reg)))                     \
4357         {                                                               \
4358           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4359           ENCODE_DESIGNATION (charset, reg, coding);                    \
4360         }                                                               \
4361   } while (0)
4362
4363
4364 /* Scan the line starting at CHARBUF (up to a newline or CHARBUF_END)
4365    and produce, at a place pointed by DST, designation sequences for
4366    the charsets on it that request a graphic register.  Return the
4367    number of produced bytes.  DST should not directly point into a
4368    buffer text area, which may be relocated by the char_charset call.
4369    If the current block ends before any end-of-line, we may fail to
4370    find all the necessary designations.  */
4371
4372 static ptrdiff_t
4373 encode_designation_at_bol (struct coding_system *coding,
4374                            int *charbuf, int *charbuf_end,
4375                            unsigned char *dst)
4376 {
4377   unsigned char *dst_start = dst;
4378   /* WANTED[REG] is the id of the charset to designate to graphic
4379      register REG, or -1 while nothing has been found for REG.  */
4380   int wanted[4] = { -1, -1, -1, -1 };
4381   int nfound = 0;
4382   int reg;
4383   ptrdiff_t produced_chars = 0;
4384   int multibytep = coding->dst_multibyte;
4385   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
4386   Lisp_Object charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4387
4388   if (EQ (charset_list, Qiso_2022))
4389     charset_list = Viso_2022_charset_list;
4390
4391   /* Collect, for each register, the first charset on the line that
4392      requests it.  Stop once all four registers are taken.  */
4393   while (nfound < 4 && charbuf < charbuf_end)
4394     {
4395       int chr = *charbuf++;
4396       struct charset *cs;
4397       int cs_id;
4398
4399       if (chr == '\n')
4400         break;
4401       cs = char_charset (chr, charset_list, NULL);
4402       cs_id = CHARSET_ID (cs);
4403       reg = CODING_ISO_REQUEST (coding, cs_id);
4404       if (reg < 0 || wanted[reg] >= 0)
4405         continue;
4406       wanted[reg] = cs_id;
4407       nfound++;
4408     }
4409
4410   /* Designate what was collected, except where a register already
4411      holds the wanted charset.  */
4412   for (reg = 0; reg < 4; reg++)
4413     {
4414       if (wanted[reg] < 0
4415           || CODING_ISO_DESIGNATION (coding, reg) == wanted[reg])
4416         continue;
4417       ENCODE_DESIGNATION (CHARSET_FROM_ID (wanted[reg]), reg, coding);
4418     }
4419
4420   return dst - dst_start;
4421 }
4422
4423 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4424
4425 static int
4426 encode_coding_iso_2022 (struct coding_system *coding)
4427 {
4428   int multibytep = coding->dst_multibyte;
4429   int *charbuf = coding->charbuf;
4430   int *charbuf_end = charbuf + coding->charbuf_used;
4431   unsigned char *dst = coding->destination + coding->produced;
4432   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4433   int safe_room = 16;
4434   int bol_designation
4435     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4436        && CODING_ISO_BOL (coding));
4437   ptrdiff_t produced_chars = 0;
4438   Lisp_Object attrs, eol_type, charset_list;
4439   int ascii_compatible;
4440   int c;
4441   int preferred_charset_id = -1;
4442
4443   CODING_GET_INFO (coding, attrs, charset_list);
4444   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4445   if (VECTORP (eol_type))
4446     eol_type = Qunix;
4447
4448   setup_iso_safe_charsets (attrs);
4449   /* Charset list may have been changed.  */
4450   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4451   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4452
4453   ascii_compatible
4454     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4455        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4456                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4457
4458   while (charbuf < charbuf_end)
4459     {
4460       ASSURE_DESTINATION (safe_room);
4461
4462       if (bol_designation)
4463         {
4464           /* We have to produce designation sequences if any now.  */
4465           unsigned char desig_buf[16];
4466           int nbytes;
4467           ptrdiff_t offset;
4468
4469           charset_map_loaded = 0;
4470           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4471                                               desig_buf);
4472           if (charset_map_loaded
4473               && (offset = coding_change_destination (coding)))
4474             {
4475               dst += offset;
4476               dst_end += offset;
4477             }
4478           memcpy (dst, desig_buf, nbytes);
4479           dst += nbytes;
4480           /* We are sure that designation sequences are all ASCII bytes.  */
4481           produced_chars += nbytes;
4482           bol_designation = 0;
4483           ASSURE_DESTINATION (safe_room);
4484         }
4485
4486       c = *charbuf++;
4487
4488       if (c < 0)
4489         {
4490           /* Handle an annotation.  */
4491           switch (*charbuf)
4492             {
4493             case CODING_ANNOTATE_COMPOSITION_MASK:
4494               /* Not yet implemented.  */
4495               break;
4496             case CODING_ANNOTATE_CHARSET_MASK:
4497               preferred_charset_id = charbuf[2];
4498               if (preferred_charset_id >= 0
4499                   && NILP (Fmemq (make_number (preferred_charset_id),
4500                                   charset_list)))
4501                 preferred_charset_id = -1;
4502               break;
4503             default:
4504               abort ();
4505             }
4506           charbuf += -c - 1;
4507           continue;
4508         }
4509
4510       /* Now encode the character C.  */
4511       if (c < 0x20 || c == 0x7F)
4512         {
4513           if (c == '\n'
4514               || (c == '\r' && EQ (eol_type, Qmac)))
4515             {
4516               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4517                 ENCODE_RESET_PLANE_AND_REGISTER ();
4518               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4519                 {
4520                   int i;
4521
4522                   for (i = 0; i < 4; i++)
4523                     CODING_ISO_DESIGNATION (coding, i)
4524                       = CODING_ISO_INITIAL (coding, i);
4525                 }
4526               bol_designation
4527                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4528             }
4529           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4530             ENCODE_RESET_PLANE_AND_REGISTER ();
4531           EMIT_ONE_ASCII_BYTE (c);
4532         }
4533       else if (ASCII_CHAR_P (c))
4534         {
4535           if (ascii_compatible)
4536             EMIT_ONE_ASCII_BYTE (c);
4537           else
4538             {
4539               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4540               ENCODE_ISO_CHARACTER (charset, c);
4541             }
4542         }
4543       else if (CHAR_BYTE8_P (c))
4544         {
4545           c = CHAR_TO_BYTE8 (c);
4546           EMIT_ONE_BYTE (c);
4547         }
4548       else
4549         {
4550           struct charset *charset;
4551
4552           if (preferred_charset_id >= 0)
4553             {
4554               int result;
4555
4556               charset = CHARSET_FROM_ID (preferred_charset_id);
4557               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4558               if (! result)
4559                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4560                                      NULL, charset);
4561             }
4562           else
4563             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4564                                  NULL, charset);
4565           if (!charset)
4566             {
4567               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4568                 {
4569                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4570                   charset = CHARSET_FROM_ID (charset_ascii);
4571                 }
4572               else
4573                 {
4574                   c = coding->default_char;
4575                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4576                                        charset_list, NULL, charset);
4577                 }
4578             }
4579           ENCODE_ISO_CHARACTER (charset, c);
4580         }
4581     }
4582
4583   if (coding->mode & CODING_MODE_LAST_BLOCK
4584       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4585     {
4586       ASSURE_DESTINATION (safe_room);
4587       ENCODE_RESET_PLANE_AND_REGISTER ();
4588     }
4589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4590   CODING_ISO_BOL (coding) = bol_designation;
4591   coding->produced_char += produced_chars;
4592   coding->produced = dst - coding->destination;
4593   return 0;
4594 }
4595
4596 \f
/*** 8. Shift-JIS and BIG5 handlers ***/
4598
4599 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4600    quite widely.  So, for the moment, Emacs supports them in the bare
4601    C code.  But, in the future, they may be supported only by CCL.  */
4602
4603 /* SJIS is a coding system encoding three character sets: ASCII, right
4604    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4605    as is.  A character of charset katakana-jisx0201 is encoded by
4606    "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4609
4610    --- CODE RANGE of SJIS ---
4611    (character set)      (range)
4612    ASCII                0x00 .. 0x7F
4613    KATAKANA-JISX0201    0xA0 .. 0xDF
4614    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4615             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4616    -------------------------------
4617
4618 */
4619
4620 /* BIG5 is a coding system encoding two character sets: ASCII and
4621    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4622    character set and is encoded in two-byte.
4623
4624    --- CODE RANGE of BIG5 ---
4625    (character set)      (range)
4626    ASCII                0x00 .. 0x7F
4627    Big5 (1st byte)      0xA1 .. 0xFE
4628         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4629    --------------------------
4630
4631   */
4632
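/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.  If the source is exhausted
   without finding an invalid byte sequence, merge CATEGORY_MASK_SJIS
   into DETECT_INFO->found when a non-ASCII SJIS byte was seen and
   return 1.  Otherwise return 0.  */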
4636
4637 static int
4638 detect_coding_sjis (struct coding_system *coding,
4639                     struct coding_detection_info *detect_info)
4640 {
4641   const unsigned char *src = coding->source, *src_base;
4642   const unsigned char *src_end = coding->source + coding->src_bytes;
4643   int multibytep = coding->src_multibyte;
4644   ptrdiff_t consumed_chars = 0;
4645   int found = 0;
4646   int c;
4647   Lisp_Object attrs, charset_list;
4648   int max_first_byte_of_2_byte_code;
4649
4650   CODING_GET_INFO (coding, attrs, charset_list);
4651   max_first_byte_of_2_byte_code
4652     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4653
4654   detect_info->checked |= CATEGORY_MASK_SJIS;
4655   /* A coding system of this category is always ASCII compatible.  */
4656   src += coding->head_ascii;
4657
4658   while (1)
4659     {
4660       src_base = src;
4661       ONE_MORE_BYTE (c);
4662       if (c < 0x80)
4663         continue;
4664       if ((c >= 0x81 && c <= 0x9F)
4665           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4666         {
4667           ONE_MORE_BYTE (c);
4668           if (c < 0x40 || c == 0x7F || c > 0xFC)
4669             break;
4670           found = CATEGORY_MASK_SJIS;
4671         }
4672       else if (c >= 0xA0 && c < 0xE0)
4673         found = CATEGORY_MASK_SJIS;
4674       else
4675         break;
4676     }
4677   detect_info->rejected |= CATEGORY_MASK_SJIS;
4678   return 0;
4679
4680  no_more_source:
4681   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4682     {
4683       detect_info->rejected |= CATEGORY_MASK_SJIS;
4684       return 0;
4685     }
4686   detect_info->found |= found;
4687   return 1;
4688 }
4689
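/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in BIG5.  If the source is exhausted
   without finding an invalid byte sequence, merge CATEGORY_MASK_BIG5
   into DETECT_INFO->found when a Big5 two-byte code was seen and
   return 1.  Otherwise return 0.  */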
4693
4694 static int
4695 detect_coding_big5 (struct coding_system *coding,
4696                     struct coding_detection_info *detect_info)
4697 {
4698   const unsigned char *src = coding->source, *src_base;
4699   const unsigned char *src_end = coding->source + coding->src_bytes;
4700   int multibytep = coding->src_multibyte;
4701   ptrdiff_t consumed_chars = 0;
4702   int found = 0;
4703   int c;
4704
4705   detect_info->checked |= CATEGORY_MASK_BIG5;
4706   /* A coding system of this category is always ASCII compatible.  */
4707   src += coding->head_ascii;
4708
4709   while (1)
4710     {
4711       src_base = src;
4712       ONE_MORE_BYTE (c);
4713       if (c < 0x80)
4714         continue;
4715       if (c >= 0xA1)
4716         {
4717           ONE_MORE_BYTE (c);
4718           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4719             return 0;
4720           found = CATEGORY_MASK_BIG5;
4721         }
4722       else
4723         break;
4724     }
4725   detect_info->rejected |= CATEGORY_MASK_BIG5;
4726   return 0;
4727
4728  no_more_source:
4729   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4730     {
4731       detect_info->rejected |= CATEGORY_MASK_BIG5;
4732       return 0;
4733     }
4734   detect_info->found |= found;
4735   return 1;
4736 }
4737
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   decode_coding_sjis decodes SJIS text and decode_coding_big5 decodes
   BIG5 text.  */
4740
4741 static void
4742 decode_coding_sjis (struct coding_system *coding)
4743 {
4744   const unsigned char *src = coding->source + coding->consumed;
4745   const unsigned char *src_end = coding->source + coding->src_bytes;
4746   const unsigned char *src_base;
4747   int *charbuf = coding->charbuf + coding->charbuf_used;
4748   /* We may produce one charset annotation in one loop and one more at
4749      the end.  */
4750   int *charbuf_end
4751     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4752   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4753   int multibytep = coding->src_multibyte;
4754   struct charset *charset_roman, *charset_kanji, *charset_kana;
4755   struct charset *charset_kanji2;
4756   Lisp_Object attrs, charset_list, val;
4757   ptrdiff_t char_offset = coding->produced_char;
4758   ptrdiff_t last_offset = char_offset;
4759   int last_id = charset_ascii;
4760   int eol_dos =
4761     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4762   int byte_after_cr = -1;
4763
4764   CODING_GET_INFO (coding, attrs, charset_list);
4765
4766   val = charset_list;
4767   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4768   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4769   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4770   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4771
4772   while (1)
4773     {
4774       int c, c1;
4775       struct charset *charset;
4776
4777       src_base = src;
4778       consumed_chars_base = consumed_chars;
4779
4780       if (charbuf >= charbuf_end)
4781         {
4782           if (byte_after_cr >= 0)
4783             src_base--;
4784           break;
4785         }
4786
4787       if (byte_after_cr >= 0)
4788         c = byte_after_cr, byte_after_cr = -1;
4789       else
4790         ONE_MORE_BYTE (c);
4791       if (c < 0)
4792         goto invalid_code;
4793       if (c < 0x80)
4794         {
4795           if (eol_dos && c == '\r')
4796             ONE_MORE_BYTE (byte_after_cr);
4797           charset = charset_roman;
4798         }
4799       else if (c == 0x80 || c == 0xA0)
4800         goto invalid_code;
4801       else if (c >= 0xA1 && c <= 0xDF)
4802         {
4803           /* SJIS -> JISX0201-Kana */
4804           c &= 0x7F;
4805           charset = charset_kana;
4806         }
4807       else if (c <= 0xEF)
4808         {
4809           /* SJIS -> JISX0208 */
4810           ONE_MORE_BYTE (c1);
4811           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4812             goto invalid_code;
4813           c = (c << 8) | c1;
4814           SJIS_TO_JIS (c);
4815           charset = charset_kanji;
4816         }
4817       else if (c <= 0xFC && charset_kanji2)
4818         {
4819           /* SJIS -> JISX0213-2 */
4820           ONE_MORE_BYTE (c1);
4821           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4822             goto invalid_code;
4823           c = (c << 8) | c1;
4824           SJIS_TO_JIS2 (c);
4825           charset = charset_kanji2;
4826         }
4827       else
4828         goto invalid_code;
4829       if (charset->id != charset_ascii
4830           && last_id != charset->id)
4831         {
4832           if (last_id != charset_ascii)
4833             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4834           last_id = charset->id;
4835           last_offset = char_offset;
4836         }
4837       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4838       *charbuf++ = c;
4839       char_offset++;
4840       continue;
4841
4842     invalid_code:
4843       src = src_base;
4844       consumed_chars = consumed_chars_base;
4845       ONE_MORE_BYTE (c);
4846       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4847       char_offset++;
4848       coding->errors++;
4849     }
4850
4851  no_more_source:
4852   if (last_id != charset_ascii)
4853     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4854   coding->consumed_char += consumed_chars_base;
4855   coding->consumed = src_base - coding->source;
4856   coding->charbuf_used = charbuf - coding->charbuf;
4857 }
4858
4859 static void
4860 decode_coding_big5 (struct coding_system *coding)
4861 {
4862   const unsigned char *src = coding->source + coding->consumed;
4863   const unsigned char *src_end = coding->source + coding->src_bytes;
4864   const unsigned char *src_base;
4865   int *charbuf = coding->charbuf + coding->charbuf_used;
4866   /* We may produce one charset annotation in one loop and one more at
4867      the end.  */
4868   int *charbuf_end
4869     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4870   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4871   int multibytep = coding->src_multibyte;
4872   struct charset *charset_roman, *charset_big5;
4873   Lisp_Object attrs, charset_list, val;
4874   ptrdiff_t char_offset = coding->produced_char;
4875   ptrdiff_t last_offset = char_offset;
4876   int last_id = charset_ascii;
4877   int eol_dos =
4878     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4879   int byte_after_cr = -1;
4880
4881   CODING_GET_INFO (coding, attrs, charset_list);
4882   val = charset_list;
4883   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4884   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4885
4886   while (1)
4887     {
4888       int c, c1;
4889       struct charset *charset;
4890
4891       src_base = src;
4892       consumed_chars_base = consumed_chars;
4893
4894       if (charbuf >= charbuf_end)
4895         {
4896           if (byte_after_cr >= 0)
4897             src_base--;
4898           break;
4899         }
4900
4901       if (byte_after_cr >= 0)
4902         c = byte_after_cr, byte_after_cr = -1;
4903       else
4904         ONE_MORE_BYTE (c);
4905
4906       if (c < 0)
4907         goto invalid_code;
4908       if (c < 0x80)
4909         {
4910           if (eol_dos && c == '\r')
4911             ONE_MORE_BYTE (byte_after_cr);
4912           charset = charset_roman;
4913         }
4914       else
4915         {
4916           /* BIG5 -> Big5 */
4917           if (c < 0xA1 || c > 0xFE)
4918             goto invalid_code;
4919           ONE_MORE_BYTE (c1);
4920           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4921             goto invalid_code;
4922           c = c << 8 | c1;
4923           charset = charset_big5;
4924         }
4925       if (charset->id != charset_ascii
4926           && last_id != charset->id)
4927         {
4928           if (last_id != charset_ascii)
4929             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4930           last_id = charset->id;
4931           last_offset = char_offset;
4932         }
4933       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4934       *charbuf++ = c;
4935       char_offset++;
4936       continue;
4937
4938     invalid_code:
4939       src = src_base;
4940       consumed_chars = consumed_chars_base;
4941       ONE_MORE_BYTE (c);
4942       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4943       char_offset++;
4944       coding->errors++;
4945     }
4946
4947  no_more_source:
4948   if (last_id != charset_ascii)
4949     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4950   coding->consumed_char += consumed_chars_base;
4951   coding->consumed = src_base - coding->source;
4952   coding->charbuf_used = charbuf - coding->charbuf;
4953 }
4954
4955 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4956    This function can encode charsets `ascii', `katakana-jisx0201',
4957    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4958    are sure that all these charsets are registered as official charset
4959    (i.e. do not have extended leading-codes).  Characters of other
   charsets are produced without any encoding.  encode_coding_sjis
   encodes SJIS text and encode_coding_big5 encodes BIG5 text.  */
4962
4963 static int
4964 encode_coding_sjis (struct coding_system *coding)
4965 {
4966   int multibytep = coding->dst_multibyte;
4967   int *charbuf = coding->charbuf;
4968   int *charbuf_end = charbuf + coding->charbuf_used;
4969   unsigned char *dst = coding->destination + coding->produced;
4970   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4971   int safe_room = 4;
4972   ptrdiff_t produced_chars = 0;
4973   Lisp_Object attrs, charset_list, val;
4974   int ascii_compatible;
4975   struct charset *charset_kanji, *charset_kana;
4976   struct charset *charset_kanji2;
4977   int c;
4978
4979   CODING_GET_INFO (coding, attrs, charset_list);
4980   val = XCDR (charset_list);
4981   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4982   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4984
4985   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4986
4987   while (charbuf < charbuf_end)
4988     {
4989       ASSURE_DESTINATION (safe_room);
4990       c = *charbuf++;
4991       /* Now encode the character C.  */
4992       if (ASCII_CHAR_P (c) && ascii_compatible)
4993         EMIT_ONE_ASCII_BYTE (c);
4994       else if (CHAR_BYTE8_P (c))
4995         {
4996           c = CHAR_TO_BYTE8 (c);
4997           EMIT_ONE_BYTE (c);
4998         }
4999       else
5000         {
5001           unsigned code;
5002           struct charset *charset;
5003           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004                                &code, charset);
5005
5006           if (!charset)
5007             {
5008               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5009                 {
5010                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011                   charset = CHARSET_FROM_ID (charset_ascii);
5012                 }
5013               else
5014                 {
5015                   c = coding->default_char;
5016                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017                                        charset_list, &code, charset);
5018                 }
5019             }
5020           if (code == CHARSET_INVALID_CODE (charset))
5021             abort ();
5022           if (charset == charset_kanji)
5023             {
5024               int c1, c2;
5025               JIS_TO_SJIS (code);
5026               c1 = code >> 8, c2 = code & 0xFF;
5027               EMIT_TWO_BYTES (c1, c2);
5028             }
5029           else if (charset == charset_kana)
5030             EMIT_ONE_BYTE (code | 0x80);
5031           else if (charset_kanji2 && charset == charset_kanji2)
5032             {
5033               int c1, c2;
5034
5035               c1 = code >> 8;
5036               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5037                   || c1 == 0x28
5038                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5039                 {
5040                   JIS_TO_SJIS2 (code);
5041                   c1 = code >> 8, c2 = code & 0xFF;
5042                   EMIT_TWO_BYTES (c1, c2);
5043                 }
5044               else
5045                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5046             }
5047           else
5048             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5049         }
5050     }
5051   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5052   coding->produced_char += produced_chars;
5053   coding->produced = dst - coding->destination;
5054   return 0;
5055 }
5056
5057 static int
5058 encode_coding_big5 (struct coding_system *coding)
5059 {
5060   int multibytep = coding->dst_multibyte;
5061   int *charbuf = coding->charbuf;
5062   int *charbuf_end = charbuf + coding->charbuf_used;
5063   unsigned char *dst = coding->destination + coding->produced;
5064   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5065   int safe_room = 4;
5066   ptrdiff_t produced_chars = 0;
5067   Lisp_Object attrs, charset_list, val;
5068   int ascii_compatible;
5069   struct charset *charset_big5;
5070   int c;
5071
5072   CODING_GET_INFO (coding, attrs, charset_list);
5073   val = XCDR (charset_list);
5074   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5075   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5076
5077   while (charbuf < charbuf_end)
5078     {
5079       ASSURE_DESTINATION (safe_room);
5080       c = *charbuf++;
5081       /* Now encode the character C.  */
5082       if (ASCII_CHAR_P (c) && ascii_compatible)
5083         EMIT_ONE_ASCII_BYTE (c);
5084       else if (CHAR_BYTE8_P (c))
5085         {
5086           c = CHAR_TO_BYTE8 (c);
5087           EMIT_ONE_BYTE (c);
5088         }
5089       else
5090         {
5091           unsigned code;
5092           struct charset *charset;
5093           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5094                                &code, charset);
5095
5096           if (! charset)
5097             {
5098               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5099                 {
5100                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5101                   charset = CHARSET_FROM_ID (charset_ascii);
5102                 }
5103               else
5104                 {
5105                   c = coding->default_char;
5106                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5107                                        charset_list, &code, charset);
5108                 }
5109             }
5110           if (code == CHARSET_INVALID_CODE (charset))
5111             abort ();
5112           if (charset == charset_big5)
5113             {
5114               int c1, c2;
5115
5116               c1 = code >> 8, c2 = code & 0xFF;
5117               EMIT_TWO_BYTES (c1, c2);
5118             }
5119           else
5120             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5121         }
5122     }
5123   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5124   coding->produced_char += produced_chars;
5125   coding->produced = dst - coding->destination;
5126   return 0;
5127 }
5128
5129 \f
/*** 9. CCL handlers ***/
5131
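/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a coding system of which
   encoder/decoder are written in CCL program.  If the source is
   exhausted without finding an invalid byte, merge CATEGORY_MASK_CCL
   into DETECT_INFO->found when a byte whose validity value is greater
   than 1 was seen and return 1.  Otherwise mark the category as
   rejected and return 0.  */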
5136
5137 static int
5138 detect_coding_ccl (struct coding_system *coding,
5139                    struct coding_detection_info *detect_info)
5140 {
5141   const unsigned char *src = coding->source, *src_base;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   int multibytep = coding->src_multibyte;
5144   ptrdiff_t consumed_chars = 0;
5145   int found = 0;
5146   unsigned char *valids;
5147   ptrdiff_t head_ascii = coding->head_ascii;
5148   Lisp_Object attrs;
5149
5150   detect_info->checked |= CATEGORY_MASK_CCL;
5151
5152   coding = &coding_categories[coding_category_ccl];
5153   valids = CODING_CCL_VALIDS (coding);
5154   attrs = CODING_ID_ATTRS (coding->id);
5155   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156     src += head_ascii;
5157
5158   while (1)
5159     {
5160       int c;
5161
5162       src_base = src;
5163       ONE_MORE_BYTE (c);
5164       if (c < 0 || ! valids[c])
5165         break;
5166       if ((valids[c] > 1))
5167         found = CATEGORY_MASK_CCL;
5168     }
5169   detect_info->rejected |= CATEGORY_MASK_CCL;
5170   return 0;
5171
5172  no_more_source:
5173   detect_info->found |= found;
5174   return 1;
5175 }
5176
5177 static void
5178 decode_coding_ccl (struct coding_system *coding)
5179 {
5180   const unsigned char *src = coding->source + coding->consumed;
5181   const unsigned char *src_end = coding->source + coding->src_bytes;
5182   int *charbuf = coding->charbuf + coding->charbuf_used;
5183   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5184   ptrdiff_t consumed_chars = 0;
5185   int multibytep = coding->src_multibyte;
5186   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5187   int source_charbuf[1024];
5188   int source_byteidx[1025];
5189   Lisp_Object attrs, charset_list;
5190
5191   CODING_GET_INFO (coding, attrs, charset_list);
5192
5193   while (1)
5194     {
5195       const unsigned char *p = src;
5196       int i = 0;
5197
5198       if (multibytep)
5199         {
5200           while (i < 1024 && p < src_end)
5201             {
5202               source_byteidx[i] = p - src;
5203               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5204             }
5205           source_byteidx[i] = p - src;
5206         }
5207       else
5208         while (i < 1024 && p < src_end)
5209           source_charbuf[i++] = *p++;
5210
5211       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5212         ccl->last_block = 1;
5213       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5214                   charset_list);
5215       charbuf += ccl->produced;
5216       if (multibytep)
5217         src += source_byteidx[ccl->consumed];
5218       else
5219         src += ccl->consumed;
5220       consumed_chars += ccl->consumed;
5221       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5222         break;
5223     }
5224
5225   switch (ccl->status)
5226     {
5227     case CCL_STAT_SUSPEND_BY_SRC:
5228       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5229       break;
5230     case CCL_STAT_SUSPEND_BY_DST:
5231       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5232       break;
5233     case CCL_STAT_QUIT:
5234     case CCL_STAT_INVALID_CMD:
5235       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5236       break;
5237     default:
5238       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5239       break;
5240     }
5241   coding->consumed_char += consumed_chars;
5242   coding->consumed = src - coding->source;
5243   coding->charbuf_used = charbuf - coding->charbuf;
5244 }
5245
5246 static int
5247 encode_coding_ccl (struct coding_system *coding)
5248 {
5249   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5250   int multibytep = coding->dst_multibyte;
5251   int *charbuf = coding->charbuf;
5252   int *charbuf_end = charbuf + coding->charbuf_used;
5253   unsigned char *dst = coding->destination + coding->produced;
5254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5255   int destination_charbuf[1024];
5256   ptrdiff_t produced_chars = 0;
5257   int i;
5258   Lisp_Object attrs, charset_list;
5259
5260   CODING_GET_INFO (coding, attrs, charset_list);
5261   if (coding->consumed_char == coding->src_chars
5262       && coding->mode & CODING_MODE_LAST_BLOCK)
5263     ccl->last_block = 1;
5264
5265   do
5266     {
5267       ccl_driver (ccl, charbuf, destination_charbuf,
5268                   charbuf_end - charbuf, 1024, charset_list);
5269       if (multibytep)
5270         {
5271           ASSURE_DESTINATION (ccl->produced * 2);
5272           for (i = 0; i < ccl->produced; i++)
5273             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5274         }
5275       else
5276         {
5277           ASSURE_DESTINATION (ccl->produced);
5278           for (i = 0; i < ccl->produced; i++)
5279             *dst++ = destination_charbuf[i] & 0xFF;
5280           produced_chars += ccl->produced;
5281         }
5282       charbuf += ccl->consumed;
5283       if (ccl->status == CCL_STAT_QUIT
5284           || ccl->status == CCL_STAT_INVALID_CMD)
5285         break;
5286     }
5287   while (charbuf < charbuf_end);
5288
5289   switch (ccl->status)
5290     {
5291     case CCL_STAT_SUSPEND_BY_SRC:
5292       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5293       break;
5294     case CCL_STAT_SUSPEND_BY_DST:
5295       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5296       break;
5297     case CCL_STAT_QUIT:
5298     case CCL_STAT_INVALID_CMD:
5299       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5300       break;
5301     default:
5302       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5303       break;
5304     }
5305
5306   coding->produced_char += produced_chars;
5307   coding->produced = dst - coding->destination;
5308   return 0;
5309 }
5310
5311
5312 \f
5313 /*** 10, 11. no-conversion handlers ***/
5314
5315 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5316
5317 static void
5318 decode_coding_raw_text (struct coding_system *coding)
5319 {
5320   int eol_dos =
5321     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5322
5323   coding->chars_at_source = 1;
5324   coding->consumed_char = coding->src_chars;
5325   coding->consumed = coding->src_bytes;
5326   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5327     {
5328       coding->consumed_char--;
5329       coding->consumed--;
5330       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5331     }
5332   else
5333     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5334 }
5335
5336 static int
5337 encode_coding_raw_text (struct coding_system *coding)
5338 {
5339   int multibytep = coding->dst_multibyte;
5340   int *charbuf = coding->charbuf;
5341   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5342   unsigned char *dst = coding->destination + coding->produced;
5343   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5344   ptrdiff_t produced_chars = 0;
5345   int c;
5346
5347   if (multibytep)
5348     {
5349       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5350
5351       if (coding->src_multibyte)
5352         while (charbuf < charbuf_end)
5353           {
5354             ASSURE_DESTINATION (safe_room);
5355             c = *charbuf++;
5356             if (ASCII_CHAR_P (c))
5357               EMIT_ONE_ASCII_BYTE (c);
5358             else if (CHAR_BYTE8_P (c))
5359               {
5360                 c = CHAR_TO_BYTE8 (c);
5361                 EMIT_ONE_BYTE (c);
5362               }
5363             else
5364               {
5365                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5366
5367                 CHAR_STRING_ADVANCE (c, p1);
5368                 do
5369                   {
5370                     EMIT_ONE_BYTE (*p0);
5371                     p0++;
5372                   }
5373                 while (p0 < p1);
5374               }
5375           }
5376       else
5377         while (charbuf < charbuf_end)
5378           {
5379             ASSURE_DESTINATION (safe_room);
5380             c = *charbuf++;
5381             EMIT_ONE_BYTE (c);
5382           }
5383     }
5384   else
5385     {
5386       if (coding->src_multibyte)
5387         {
5388           int safe_room = MAX_MULTIBYTE_LENGTH;
5389
5390           while (charbuf < charbuf_end)
5391             {
5392               ASSURE_DESTINATION (safe_room);
5393               c = *charbuf++;
5394               if (ASCII_CHAR_P (c))
5395                 *dst++ = c;
5396               else if (CHAR_BYTE8_P (c))
5397                 *dst++ = CHAR_TO_BYTE8 (c);
5398               else
5399                 CHAR_STRING_ADVANCE (c, dst);
5400             }
5401         }
5402       else
5403         {
5404           ASSURE_DESTINATION (charbuf_end - charbuf);
5405           while (charbuf < charbuf_end && dst < dst_end)
5406             *dst++ = *charbuf++;
5407         }
5408       produced_chars = dst - (coding->destination + coding->produced);
5409     }
5410   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5411   coding->produced_char += produced_chars;
5412   coding->produced = dst - coding->destination;
5413   return 0;
5414 }
5415
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a charset-based coding system.  If it
   is, return 1, else return 0.

   The source text is taken from the CODING passed in, but the charset
   tables come from coding_categories[coding_category_charset].
   DETECT_INFO->checked always gets CATEGORY_MASK_CHARSET.  On
   rejection DETECT_INFO->rejected gets it too.  On acceptance
   DETECT_INFO->found gets it only if at least one byte >= 0x80 was
   seen.  */

static int
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  /* Read before CODING is redirected to the category's coding system
     below.  */
  ptrdiff_t head_ascii = coding->head_ascii;
  int check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING refers to the coding system assigned to the
     charset category, not to the caller's coding context.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* For coding systems named iso-8859-* or iso-latin-*, bytes in the
     range 0x80..0x9F are additionally checked against
     Vlatin_extra_code_table (see the loop below).  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  /* Skip the leading bytes counted by HEAD_ASCII when the coding
     system is ASCII compatible.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  /* The loop is left either by `break' (falling into the rejection
     code at `too_short') or, presumably, by ONE_MORE_BYTE jumping to
     `no_more_source' when the source is exhausted -- no explicit goto
     to that label is visible in this function.  */
  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative C is skipped without being checked.  */
      if (c < 0)
        continue;
      /* VALIDS is indexed by the first byte of a sequence: nil
         rejects the byte, an integer is a charset ID, and otherwise
         the entry is handled as a list of charset IDs.  */
      val = AREF (valids, c);
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* A single candidate charset: each following byte must lie
             within the charset's code space for its position.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          /* The inner loop ended early: a byte was out of range.  */
          if (idx < dim)
            break;
        }
      else
        {
          /* Several candidate charsets.  IDX is shared across the
             candidates, so bytes already accepted are not re-read
             when moving on to the next charset.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  val = Qnil;
                  break;
                }
            }
          /* NOTE(review): the `for' loop above is left only when VAL
             is no longer a cons or right after VAL is set to Qnil, so
             this test looks like it can never be true and a sequence
             matching no candidate charset is not rejected here --
             confirm whether that is intended.  */
          if (CONSP (val))
            break;
        }
    }
  /* Reached by `goto too_short' and by every `break' out of the loop
     above.  */
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5522
/* Decode the source bytes of CODING, starting at CODING->consumed,
   according to a charset-based coding system, appending the decoded
   characters to CODING->charbuf.  See the above "GENERAL NOTES on
   `decode_coding_XXX ()' functions".

   A byte sequence that cannot be decoded is not dropped: its first
   byte is stored by itself and CODING->errors is incremented.  On
   return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  */

static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of decoded characters
     and LAST_OFFSET the character offset at which that run started.
     They are used to emit charset annotations.  */
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When decoding DOS end-of-lines, the byte read ahead after a '\r';
     -1 when there is none pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this sequence starts, so that the position can
         be restored at `invalid_code' and reported at the end.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room in CHARBUF.  The look-ahead byte was already
             read from the source but not yet decoded, so step back
             over it.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the pending look-ahead byte instead of reading.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VALIDS is indexed by the first byte; a usable entry is either
         a charset ID or a list of charset IDs.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          /* Collect the remaining bytes of the code point, most
             significant byte first.  */
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between candidates, so only
                 the additional bytes a larger charset needs are
                 read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* The first charset that yields a character wins.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the previous non-ASCII run
             with an annotation and start a new run here.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the sequence and emit just its first
         byte: an ASCII byte as is, any other byte as the
         corresponding BYTE8 character.  A negative value read here is
         stored negated (presumably it denotes a character found in a
         multibyte source -- confirm against ONE_MORE_BYTE).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by the `break' above and, presumably, from ONE_MORE_BYTE
     when the source runs out.  Only what precedes SRC_BASE is
     reported as consumed.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5650
5651 static int
5652 encode_coding_charset (struct coding_system *coding)
5653 {
5654   int multibytep = coding->dst_multibyte;
5655   int *charbuf = coding->charbuf;
5656   int *charbuf_end = charbuf + coding->charbuf_used;
5657   unsigned char *dst = coding->destination + coding->produced;
5658   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5659   int safe_room = MAX_MULTIBYTE_LENGTH;
5660   ptrdiff_t produced_chars = 0;
5661   Lisp_Object attrs, charset_list;
5662   int ascii_compatible;
5663   int c;
5664
5665   CODING_GET_INFO (coding, attrs, charset_list);
5666   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5667
5668   while (charbuf < charbuf_end)
5669     {
5670       struct charset *charset;
5671       unsigned code;
5672
5673       ASSURE_DESTINATION (safe_room);
5674       c = *charbuf++;
5675       if (ascii_compatible && ASCII_CHAR_P (c))
5676         EMIT_ONE_ASCII_BYTE (c);
5677       else if (CHAR_BYTE8_P (c))
5678         {
5679           c = CHAR_TO_BYTE8 (c);
5680           EMIT_ONE_BYTE (c);
5681         }
5682       else
5683         {
5684           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5685                                &code, charset);
5686
5687           if (charset)
5688             {
5689               if (CHARSET_DIMENSION (charset) == 1)
5690                 EMIT_ONE_BYTE (code);
5691               else if (CHARSET_DIMENSION (charset) == 2)
5692                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5693               else if (CHARSET_DIMENSION (charset) == 3)
5694                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5695               else
5696                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5697                                  (code >> 8) & 0xFF, code & 0xFF);
5698             }
5699           else
5700             {
5701               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5702                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5703               else
5704                 c = coding->default_char;
5705               EMIT_ONE_BYTE (c);
5706             }
5707         }
5708     }
5709
5710   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5711   coding->produced_char += produced_chars;
5712   coding->produced = dst - coding->destination;
5713   return 0;
5714 }
5715
5716 \f
/*** 10. C library functions ***/
5718
5719 /* Setup coding context CODING from information about CODING_SYSTEM.
5720    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5721    CODING_SYSTEM is invalid, signal an error.  */
5722
5723 void
5724 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5725 {
5726   Lisp_Object attrs;
5727   Lisp_Object eol_type;
5728   Lisp_Object coding_type;
5729   Lisp_Object val;
5730
5731   if (NILP (coding_system))
5732     coding_system = Qundecided;
5733
5734   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5735
5736   attrs = CODING_ID_ATTRS (coding->id);
5737   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5738
5739   coding->mode = 0;
5740   coding->head_ascii = -1;
5741   if (VECTORP (eol_type))
5742     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5743                             | CODING_REQUIRE_DETECTION_MASK);
5744   else if (! EQ (eol_type, Qunix))
5745     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5746                             | CODING_REQUIRE_ENCODING_MASK);
5747   else
5748     coding->common_flags = 0;
5749   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5750     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5751   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5752     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5753   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5754     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5755
5756   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5757   coding->max_charset_id = SCHARS (val) - 1;
5758   coding->safe_charsets = SDATA (val);
5759   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5760   coding->carryover_bytes = 0;
5761
5762   coding_type = CODING_ATTR_TYPE (attrs);
5763   if (EQ (coding_type, Qundecided))
5764     {
5765       coding->detector = NULL;
5766       coding->decoder = decode_coding_raw_text;
5767       coding->encoder = encode_coding_raw_text;
5768       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5769     }
5770   else if (EQ (coding_type, Qiso_2022))
5771     {
5772       int i;
5773       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5774
5775       /* Invoke graphic register 0 to plane 0.  */
5776       CODING_ISO_INVOCATION (coding, 0) = 0;
5777       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5778       CODING_ISO_INVOCATION (coding, 1)
5779         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5780       /* Setup the initial status of designation.  */
5781       for (i = 0; i < 4; i++)
5782         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5783       /* Not single shifting initially.  */
5784       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5785       /* Beginning of buffer should also be regarded as bol. */
5786       CODING_ISO_BOL (coding) = 1;
5787       coding->detector = detect_coding_iso_2022;
5788       coding->decoder = decode_coding_iso_2022;
5789       coding->encoder = encode_coding_iso_2022;
5790       if (flags & CODING_ISO_FLAG_SAFE)
5791         coding->mode |= CODING_MODE_SAFE_ENCODING;
5792       coding->common_flags
5793         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5794             | CODING_REQUIRE_FLUSHING_MASK);
5795       if (flags & CODING_ISO_FLAG_COMPOSITION)
5796         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5797       if (flags & CODING_ISO_FLAG_DESIGNATION)
5798         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5799       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5800         {
5801           setup_iso_safe_charsets (attrs);
5802           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5803           coding->max_charset_id = SCHARS (val) - 1;
5804           coding->safe_charsets = SDATA (val);
5805         }
5806       CODING_ISO_FLAGS (coding) = flags;
5807       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5808       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5809       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5810       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5811     }
5812   else if (EQ (coding_type, Qcharset))
5813     {
5814       coding->detector = detect_coding_charset;
5815       coding->decoder = decode_coding_charset;
5816       coding->encoder = encode_coding_charset;
5817       coding->common_flags
5818         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5819     }
5820   else if (EQ (coding_type, Qutf_8))
5821     {
5822       val = AREF (attrs, coding_attr_utf_bom);
5823       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5824                                    : EQ (val, Qt) ? utf_with_bom
5825                                    : utf_without_bom);
5826       coding->detector = detect_coding_utf_8;
5827       coding->decoder = decode_coding_utf_8;
5828       coding->encoder = encode_coding_utf_8;
5829       coding->common_flags
5830         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5831       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5832         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5833     }
5834   else if (EQ (coding_type, Qutf_16))
5835     {
5836       val = AREF (attrs, coding_attr_utf_bom);
5837       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5838                                     : EQ (val, Qt) ? utf_with_bom
5839                                     : utf_without_bom);
5840       val = AREF (attrs, coding_attr_utf_16_endian);
5841       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5842                                        : utf_16_little_endian);
5843       CODING_UTF_16_SURROGATE (coding) = 0;
5844       coding->detector = detect_coding_utf_16;
5845       coding->decoder = decode_coding_utf_16;
5846       coding->encoder = encode_coding_utf_16;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5850         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5851     }
5852   else if (EQ (coding_type, Qccl))
5853     {
5854       coding->detector = detect_coding_ccl;
5855       coding->decoder = decode_coding_ccl;
5856       coding->encoder = encode_coding_ccl;
5857       coding->common_flags
5858         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5859             | CODING_REQUIRE_FLUSHING_MASK);
5860     }
5861   else if (EQ (coding_type, Qemacs_mule))
5862     {
5863       coding->detector = detect_coding_emacs_mule;
5864       coding->decoder = decode_coding_emacs_mule;
5865       coding->encoder = encode_coding_emacs_mule;
5866       coding->common_flags
5867         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5868       coding->spec.emacs_mule.full_support = 1;
5869       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5870           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5871         {
5872           Lisp_Object tail, safe_charsets;
5873           int max_charset_id = 0;
5874
5875           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5876                tail = XCDR (tail))
5877             if (max_charset_id < XFASTINT (XCAR (tail)))
5878               max_charset_id = XFASTINT (XCAR (tail));
5879           safe_charsets = make_uninit_string (max_charset_id + 1);
5880           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5881           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5882                tail = XCDR (tail))
5883             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5884           coding->max_charset_id = max_charset_id;
5885           coding->safe_charsets = SDATA (safe_charsets);
5886           coding->spec.emacs_mule.full_support = 1;
5887         }
5888       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5889       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5890     }
5891   else if (EQ (coding_type, Qshift_jis))
5892     {
5893       coding->detector = detect_coding_sjis;
5894       coding->decoder = decode_coding_sjis;
5895       coding->encoder = encode_coding_sjis;
5896       coding->common_flags
5897         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5898     }
5899   else if (EQ (coding_type, Qbig5))
5900     {
5901       coding->detector = detect_coding_big5;
5902       coding->decoder = decode_coding_big5;
5903       coding->encoder = encode_coding_big5;
5904       coding->common_flags
5905         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5906     }
5907   else                          /* EQ (coding_type, Qraw_text) */
5908     {
5909       coding->detector = NULL;
5910       coding->decoder = decode_coding_raw_text;
5911       coding->encoder = encode_coding_raw_text;
5912       if (! EQ (eol_type, Qunix))
5913         {
5914           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5915           if (! VECTORP (eol_type))
5916             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5917         }
5918
5919     }
5920
5921   return;
5922 }
5923
5924 /* Return a list of charsets supported by CODING.  */
5925
5926 Lisp_Object
5927 coding_charset_list (struct coding_system *coding)
5928 {
5929   Lisp_Object attrs, charset_list;
5930
5931   CODING_GET_INFO (coding, attrs, charset_list);
5932   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5933     {
5934       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5935
5936       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5937         charset_list = Viso_2022_charset_list;
5938     }
5939   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5940     {
5941       charset_list = Vemacs_mule_charset_list;
5942     }
5943   return charset_list;
5944 }
5945
5946
5947 /* Return a list of charsets supported by CODING-SYSTEM.  */
5948
5949 Lisp_Object
5950 coding_system_charset_list (Lisp_Object coding_system)
5951 {
5952   ptrdiff_t id;
5953   Lisp_Object attrs, charset_list;
5954
5955   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5956   attrs = CODING_ID_ATTRS (id);
5957
5958   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5959     {
5960       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5961
5962       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5963         charset_list = Viso_2022_charset_list;
5964       else
5965         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5966     }
5967   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5968     {
5969       charset_list = Vemacs_mule_charset_list;
5970     }
5971   else
5972     {
5973       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5974     }
5975   return charset_list;
5976 }
5977
5978
5979 /* Return raw-text or one of its subsidiaries that has the same
5980    eol_type as CODING-SYSTEM.  */
5981
5982 Lisp_Object
5983 raw_text_coding_system (Lisp_Object coding_system)
5984 {
5985   Lisp_Object spec, attrs;
5986   Lisp_Object eol_type, raw_text_eol_type;
5987
5988   if (NILP (coding_system))
5989     return Qraw_text;
5990   spec = CODING_SYSTEM_SPEC (coding_system);
5991   attrs = AREF (spec, 0);
5992
5993   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5994     return coding_system;
5995
5996   eol_type = AREF (spec, 2);
5997   if (VECTORP (eol_type))
5998     return Qraw_text;
5999   spec = CODING_SYSTEM_SPEC (Qraw_text);
6000   raw_text_eol_type = AREF (spec, 2);
6001   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6002           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6003           : AREF (raw_text_eol_type, 2));
6004 }
6005
6006
6007 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6008    the subsidiary that has the same eol-spec as PARENT (if it is not
6009    nil and specifies end-of-line format) or the system's setting
6010    (system_eol_type).  */
6011
6012 Lisp_Object
6013 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6014 {
6015   Lisp_Object spec, eol_type;
6016
6017   if (NILP (coding_system))
6018     coding_system = Qraw_text;
6019   spec = CODING_SYSTEM_SPEC (coding_system);
6020   eol_type = AREF (spec, 2);
6021   if (VECTORP (eol_type))
6022     {
6023       Lisp_Object parent_eol_type;
6024
6025       if (! NILP (parent))
6026         {
6027           Lisp_Object parent_spec;
6028
6029           parent_spec = CODING_SYSTEM_SPEC (parent);
6030           parent_eol_type = AREF (parent_spec, 2);
6031           if (VECTORP (parent_eol_type))
6032             parent_eol_type = system_eol_type;
6033         }
6034       else
6035         parent_eol_type = system_eol_type;
6036       if (EQ (parent_eol_type, Qunix))
6037         coding_system = AREF (eol_type, 0);
6038       else if (EQ (parent_eol_type, Qdos))
6039         coding_system = AREF (eol_type, 1);
6040       else if (EQ (parent_eol_type, Qmac))
6041         coding_system = AREF (eol_type, 2);
6042     }
6043   return coding_system;
6044 }
6045
6046
6047 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6048    decided for writing to a process.  If not, complement them, and
6049    return a new coding system.  */
6050
6051 Lisp_Object
6052 complement_process_encoding_system (Lisp_Object coding_system)
6053 {
6054   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6055   Lisp_Object spec, attrs;
6056   int i;
6057
6058   for (i = 0; i < 3; i++)
6059     {
6060       if (i == 1)
6061         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6062       else if (i == 2)
6063         coding_system = preferred_coding_system ();
6064       spec = CODING_SYSTEM_SPEC (coding_system);
6065       if (NILP (spec))
6066         continue;
6067       attrs = AREF (spec, 0);
6068       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6069         coding_base = CODING_ATTR_BASE_NAME (attrs);
6070       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6071         eol_base = coding_system;
6072       if (! NILP (coding_base) && ! NILP (eol_base))
6073         break;
6074     }
6075
6076   if (i > 0)
6077     /* The original CODING_SYSTEM didn't specify text-conversion or
6078        eol-conversion.  Be sure that we return a fully complemented
6079        coding system.  */
6080     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6081   return coding_system;
6082 }
6083
6084
6085 /* Emacs has a mechanism to automatically detect a coding system if it
6086    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6087    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
6090
6091    o coding-category-emacs-mule
6092
6093         The category for a coding system which has the same code range
6094         as Emacs' internal format.  Assigned the coding-system (Lisp
6095         symbol) `emacs-mule' by default.
6096
6097    o coding-category-sjis
6098
6099         The category for a coding system which has the same code range
6100         as SJIS.  Assigned the coding-system (Lisp
6101         symbol) `japanese-shift-jis' by default.
6102
6103    o coding-category-iso-7
6104
6105         The category for a coding system which has the same code range
6106         as ISO2022 of 7-bit environment.  This doesn't use any locking
6107         shift and single shift functions.  This can encode/decode all
6108         charsets.  Assigned the coding-system (Lisp symbol)
6109         `iso-2022-7bit' by default.
6110
6111    o coding-category-iso-7-tight
6112
6113         Same as coding-category-iso-7 except that this can
6114         encode/decode only the specified charsets.
6115
6116    o coding-category-iso-8-1
6117
6118         The category for a coding system which has the same code range
6119         as ISO2022 of 8-bit environment and graphic plane 1 used only
6120         for DIMENSION1 charset.  This doesn't use any locking shift
6121         and single shift functions.  Assigned the coding-system (Lisp
6122         symbol) `iso-latin-1' by default.
6123
6124    o coding-category-iso-8-2
6125
6126         The category for a coding system which has the same code range
6127         as ISO2022 of 8-bit environment and graphic plane 1 used only
6128         for DIMENSION2 charset.  This doesn't use any locking shift
6129         and single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `japanese-iso-8bit' by default.
6131
6132    o coding-category-iso-7-else
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 7-bit environment but uses locking shift or
6136         single shift functions.  Assigned the coding-system (Lisp
6137         symbol) `iso-2022-7bit-lock' by default.
6138
6139    o coding-category-iso-8-else
6140
6141         The category for a coding system which has the same code range
6142         as ISO2022 of 8-bit environment but uses locking shift or
6143         single shift functions.  Assigned the coding-system (Lisp
6144         symbol) `iso-2022-8bit-ss2' by default.
6145
6146    o coding-category-big5
6147
6148         The category for a coding system which has the same code range
6149         as BIG5.  Assigned the coding-system (Lisp symbol)
6150         `cn-big5' by default.
6151
6152    o coding-category-utf-8
6153
6154         The category for a coding system which has the same code range
6155         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6156         symbol) `utf-8' by default.
6157
6158    o coding-category-utf-16-be
6159
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of BIG
6162         endian at the head.  Assigned the coding-system (Lisp symbol)
6163         `utf-16-be' by default.
6164
6165    o coding-category-utf-16-le
6166
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of
6169         LITTLE endian at the head.  Assigned the coding-system (Lisp
6170         symbol) `utf-16-le' by default.
6171
6172    o coding-category-ccl
6173
6174         The category for a coding system of which encoder/decoder is
6175         written in CCL programs.  The default value is nil, i.e., no
6176         coding system is assigned.
6177
6178    o coding-category-binary
6179
6180         The category for a coding system not categorized in any of the
6181         above.  Assigned the coding-system (Lisp symbol)
6182         `no-conversion' by default.
6183
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by the user.
6186    What Emacs does actually is to detect a category of coding system.
6187    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6188    decide only one possible category, it selects a category of the
6189    highest priority.  Priorities of categories are also specified by a
6190    user in a Lisp variable `coding-category-list'.
6191
6192 */
6193
/* End-of-line formats reported by detect_eol.  Apart from
   EOL_SEEN_NONE, each value is a distinct bit.  */
#define EOL_SEEN_NONE   0       /* No end-of-line was seen.  */
#define EOL_SEEN_LF     1       /* LF alone.  */
#define EOL_SEEN_CR     2       /* CR alone.  */
#define EOL_SEEN_CRLF   4       /* CR followed by LF.  */
6198
6199 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6200    SOURCE is encoded.  If CATEGORY is one of
6201    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6202    two-byte, else they are encoded by one-byte.
6203
6204    Return one of EOL_SEEN_XXX.  */
6205
6206 #define MAX_EOL_CHECK_COUNT 3
6207
6208 static int
6209 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6210             enum coding_category category)
6211 {
6212   const unsigned char *src = source, *src_end = src + src_bytes;
6213   unsigned char c;
6214   int total  = 0;
6215   int eol_seen = EOL_SEEN_NONE;
6216
6217   if ((1 << category) & CATEGORY_MASK_UTF_16)
6218     {
6219       int msb, lsb;
6220
6221       msb = category == (coding_category_utf_16_le
6222                          | coding_category_utf_16_le_nosig);
6223       lsb = 1 - msb;
6224
6225       while (src + 1 < src_end)
6226         {
6227           c = src[lsb];
6228           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6229             {
6230               int this_eol;
6231
6232               if (c == '\n')
6233                 this_eol = EOL_SEEN_LF;
6234               else if (src + 3 >= src_end
6235                        || src[msb + 2] != 0
6236                        || src[lsb + 2] != '\n')
6237                 this_eol = EOL_SEEN_CR;
6238               else
6239                 {
6240                   this_eol = EOL_SEEN_CRLF;
6241                   src += 2;
6242                 }
6243
6244               if (eol_seen == EOL_SEEN_NONE)
6245                 /* This is the first end-of-line.  */
6246                 eol_seen = this_eol;
6247               else if (eol_seen != this_eol)
6248                 {
6249                   /* The found type is different from what found before.
6250                      Allow for stray ^M characters in DOS EOL files.  */
6251                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6252                       || (eol_seen == EOL_SEEN_CRLF
6253                           && this_eol == EOL_SEEN_CR))
6254                     eol_seen = EOL_SEEN_CRLF;
6255                   else
6256                     {
6257                       eol_seen = EOL_SEEN_LF;
6258                       break;
6259                     }
6260                 }
6261               if (++total == MAX_EOL_CHECK_COUNT)
6262                 break;
6263             }
6264           src += 2;
6265         }
6266     }
6267   else
6268     while (src < src_end)
6269       {
6270         c = *src++;
6271         if (c == '\n' || c == '\r')
6272           {
6273             int this_eol;
6274
6275             if (c == '\n')
6276               this_eol = EOL_SEEN_LF;
6277             else if (src >= src_end || *src != '\n')
6278               this_eol = EOL_SEEN_CR;
6279             else
6280               this_eol = EOL_SEEN_CRLF, src++;
6281
6282             if (eol_seen == EOL_SEEN_NONE)
6283               /* This is the first end-of-line.  */
6284               eol_seen = this_eol;
6285             else if (eol_seen != this_eol)
6286               {
6287                 /* The found type is different from what found before.
6288                    Allow for stray ^M characters in DOS EOL files.  */
6289                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6290                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6291                   eol_seen = EOL_SEEN_CRLF;
6292                 else
6293                   {
6294                     eol_seen = EOL_SEEN_LF;
6295                     break;
6296                   }
6297               }
6298             if (++total == MAX_EOL_CHECK_COUNT)
6299               break;
6300           }
6301       }
6302   return eol_seen;
6303 }
6304
6305
6306 static Lisp_Object
6307 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6308 {
6309   Lisp_Object eol_type;
6310
6311   eol_type = CODING_ID_EOL_TYPE (coding->id);
6312   if (eol_seen & EOL_SEEN_LF)
6313     {
6314       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315       eol_type = Qunix;
6316     }
6317   else if (eol_seen & EOL_SEEN_CRLF)
6318     {
6319       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320       eol_type = Qdos;
6321     }
6322   else if (eol_seen & EOL_SEEN_CR)
6323     {
6324       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325       eol_type = Qmac;
6326     }
6327   return eol_type;
6328 }
6329
6330 /* Detect how the text specified in CODING (CODING->src_bytes bytes
6331    at CODING->source) is encoded.  If a coding system is detected,
6332    update the fields of CODING for it with setup_coding_system.
        Three cases are distinguished by CODING->id: the `undecided' type,
        where the bytes are scanned and the detectors of the categories
        are tried in coding_priorities order; the utf-8-auto category;
        and the utf-16-auto category.  In the latter two, one of the two
        coding systems in the `coding_attr_utf_bom' attribute is chosen.
        CODING->mode is restored before returning.  */
6333
6334 static void
6335 detect_coding (struct coding_system *coding)
6336 {
6337   const unsigned char *src, *src_end;
6338   int saved_mode = coding->mode;
6339
6340   coding->consumed = coding->consumed_char = 0;
6341   coding->produced = coding->produced_char = 0;
6342   coding_set_source (coding);
6343
6344   src_end = coding->source + coding->src_bytes;
6345   coding->head_ascii = 0;
6346
6347   /* If we have not yet decided the text encoding type, detect it
6348      now.  */
6349   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6350     {
6351       int c, i;
6352       struct coding_detection_info detect_info;
6353       int null_byte_found = 0, eight_bit_found = 0;
6354
6355       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the source.  CODING->head_ascii counts the bytes seen
              before the first 8-bit byte.  The scan stops early once both
              a null byte and an 8-bit byte have been seen, or when
              detect_coding_iso_2022 returns nonzero.  */
6356       for (src = coding->source; src < src_end; src++)
6357         {
6358           c = *src;
6359           if (c & 0x80)
6360             {
6361               eight_bit_found = 1;
6362               if (null_byte_found)
6363                 break;
6364             }
6365           else if (c < 0x20)
6366             {
6367               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6368                   && ! inhibit_iso_escape_detection
6369                   && ! detect_info.checked)
6370                 {
6371                   if (detect_coding_iso_2022 (coding, &detect_info))
6372                     {
6373                       /* We have scanned the whole data.  */
6374                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6375                         {
6376                           /* We didn't find an 8-bit code.  We may
6377                              have found a null-byte, but it's very
6378                              rare that a binary file conforms to
6379                              ISO-2022.  */
6380                           src = src_end;
6381                           coding->head_ascii = src - coding->source;
6382                         }
6383                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6384                       break;
6385                     }
6386                 }
6387               else if (! c && !inhibit_null_byte_detection)
6388                 {
6389                   null_byte_found = 1;
6390                   if (eight_bit_found)
6391                     break;
6392                 }
6393               if (! eight_bit_found)
6394                 coding->head_ascii++;
6395             }
6396           else if (! eight_bit_found)
6397             coding->head_ascii++;
6398         }
6399
6400       if (null_byte_found || eight_bit_found
6401           || coding->head_ascii < coding->src_bytes
6402           || detect_info.found)
6403         {
6404           enum coding_category category;
6405           struct coding_system *this;
6406
6407           if (coding->head_ascii == coding->src_bytes)
6408             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6409             for (i = 0; i < coding_category_raw_text; i++)
6410               {
6411                 category = coding_priorities[i];
6412                 this = coding_categories + category;
6413                 if (detect_info.found & (1 << category))
6414                   break;
6415               }
6416           else
6417             {
                   /* A null byte was seen: mark every category except the
                      UTF-16 ones as already checked and rejected.  */
6418               if (null_byte_found)
6419                 {
6420                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6421                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6422                 }
                   /* Try the categories in priority order, running the
                      detector of each category not checked yet, and stop
                      at the first category that is found.  */
6423               for (i = 0; i < coding_category_raw_text; i++)
6424                 {
6425                   category = coding_priorities[i];
6426                   this = coding_categories + category;
6427                   if (this->id < 0)
6428                     {
6429                       /* No coding system of this category is defined.  */
6430                       detect_info.rejected |= (1 << category);
6431                     }
6432                   else if (category >= coding_category_raw_text)
6433                     continue;
6434                   else if (detect_info.checked & (1 << category))
6435                     {
6436                       if (detect_info.found & (1 << category))
6437                         break;
6438                     }
6439                   else if ((*(this->detector)) (coding, &detect_info)
6440                            && detect_info.found & (1 << category))
6441                     {
                           /* NOTE(review): CATEGORY is not read again in
                              this function after the assignments below;
                              the coding system set up afterwards is taken
                              from THIS, which is left unchanged here.  */
6442                       if (category == coding_category_utf_16_auto)
6443                         {
6444                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6445                             category = coding_category_utf_16_le;
6446                           else
6447                             category = coding_category_utf_16_be;
6448                         }
6449                       break;
6450                     }
6451                 }
6452             }
6453
               /* Pick the result: the category found above; else
                  no-conversion if a null byte was seen; else raw-text if
                  every category was rejected; else, if any category was
                  rejected, the highest-priority category that was not.  */
6454           if (i < coding_category_raw_text)
6455             setup_coding_system (CODING_ID_NAME (this->id), coding);
6456           else if (null_byte_found)
6457             setup_coding_system (Qno_conversion, coding);
6458           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6459                    == CATEGORY_MASK_ANY)
6460             setup_coding_system (Qraw_text, coding);
6461           else if (detect_info.rejected)
6462             for (i = 0; i < coding_category_raw_text; i++)
6463               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6464                 {
6465                   this = coding_categories + coding_priorities[i];
6466                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6467                   break;
6468                 }
6469         }
6470     }
6471   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6472            == coding_category_utf_8_auto)
6473     {
6474       Lisp_Object coding_systems;
6475       struct coding_detection_info detect_info;
6476
           /* CODING_SYSTEMS is expected to be a cons: its car is used when
              the UTF-8 signature category is found, its cdr otherwise.  */
6477       coding_systems
6478         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6479       detect_info.found = detect_info.rejected = 0;
6480       coding->head_ascii = 0;
6481       if (CONSP (coding_systems)
6482           && detect_coding_utf_8 (coding, &detect_info))
6483         {
6484           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6485             setup_coding_system (XCAR (coding_systems), coding);
6486           else
6487             setup_coding_system (XCDR (coding_systems), coding);
6488         }
6489     }
6490   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6491            == coding_category_utf_16_auto)
6492     {
6493       Lisp_Object coding_systems;
6494       struct coding_detection_info detect_info;
6495
           /* CODING_SYSTEMS is expected to be a cons: its car is used for
              little endian, its cdr for big endian.  If neither is found,
              CODING is left as it is.  */
6496       coding_systems
6497         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6498       detect_info.found = detect_info.rejected = 0;
6499       coding->head_ascii = 0;
6500       if (CONSP (coding_systems)
6501           && detect_coding_utf_16 (coding, &detect_info))
6502         {
6503           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6504             setup_coding_system (XCAR (coding_systems), coding);
6505           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6506             setup_coding_system (XCDR (coding_systems), coding);
6507         }
6508     }
6509   coding->mode = saved_mode;
6510 }
6511
6512
6513 static void
6514 decode_eol (struct coding_system *coding)
6515 {
6516   Lisp_Object eol_type;
6517   unsigned char *p, *pbeg, *pend;
6518
6519   eol_type = CODING_ID_EOL_TYPE (coding->id);
6520   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6521     return;
6522
6523   if (NILP (coding->dst_object))
6524     pbeg = coding->destination;
6525   else
6526     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6527   pend = pbeg + coding->produced;
6528
6529   if (VECTORP (eol_type))
6530     {
6531       int eol_seen = EOL_SEEN_NONE;
6532
6533       for (p = pbeg; p < pend; p++)
6534         {
6535           if (*p == '\n')
6536             eol_seen |= EOL_SEEN_LF;
6537           else if (*p == '\r')
6538             {
6539               if (p + 1 < pend && *(p + 1) == '\n')
6540                 {
6541                   eol_seen |= EOL_SEEN_CRLF;
6542                   p++;
6543                 }
6544               else
6545                 eol_seen |= EOL_SEEN_CR;
6546             }
6547         }
6548       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6549       if ((eol_seen & EOL_SEEN_CRLF) != 0
6550           && (eol_seen & EOL_SEEN_CR) != 0
6551           && (eol_seen & EOL_SEEN_LF) == 0)
6552         eol_seen = EOL_SEEN_CRLF;
6553       else if (eol_seen != EOL_SEEN_NONE
6554           && eol_seen != EOL_SEEN_LF
6555           && eol_seen != EOL_SEEN_CRLF
6556           && eol_seen != EOL_SEEN_CR)
6557         eol_seen = EOL_SEEN_LF;
6558       if (eol_seen != EOL_SEEN_NONE)
6559         eol_type = adjust_coding_eol_type (coding, eol_seen);
6560     }
6561
6562   if (EQ (eol_type, Qmac))
6563     {
6564       for (p = pbeg; p < pend; p++)
6565         if (*p == '\r')
6566           *p = '\n';
6567     }
6568   else if (EQ (eol_type, Qdos))
6569     {
6570       ptrdiff_t n = 0;
6571
6572       if (NILP (coding->dst_object))
6573         {
6574           /* Start deleting '\r' from the tail to minimize the memory
6575              movement.  */
6576           for (p = pend - 2; p >= pbeg; p--)
6577             if (*p == '\r')
6578               {
6579                 memmove (p, p + 1, pend-- - p - 1);
6580                 n++;
6581               }
6582         }
6583       else
6584         {
6585           ptrdiff_t pos_byte = coding->dst_pos_byte;
6586           ptrdiff_t pos = coding->dst_pos;
6587           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6588
6589           while (pos < pos_end)
6590             {
6591               p = BYTE_POS_ADDR (pos_byte);
6592               if (*p == '\r' && p[1] == '\n')
6593                 {
6594                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6595                   n++;
6596                   pos_end--;
6597                 }
6598               pos++;
6599               if (coding->dst_multibyte)
6600                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6601               else
6602                 pos_byte++;
6603             }
6604         }
6605       coding->produced -= n;
6606       coding->produced_char -= n;
6607     }
6608 }
6609
6610
6611 /* Return a translation table (or list of them) from coding system
6612    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6613    decoding (ENCODEP is zero). */
6614
6615 static Lisp_Object
6616 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6617 {
6618   Lisp_Object standard, translation_table;
6619   Lisp_Object val;
6620
6621   if (NILP (Venable_character_translation))
6622     {
6623       if (max_lookup)
6624         *max_lookup = 0;
6625       return Qnil;
6626     }
6627   if (encodep)
6628     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6629       standard = Vstandard_translation_table_for_encode;
6630   else
6631     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6632       standard = Vstandard_translation_table_for_decode;
6633   if (NILP (translation_table))
6634     translation_table = standard;
6635   else
6636     {
6637       if (SYMBOLP (translation_table))
6638         translation_table = Fget (translation_table, Qtranslation_table);
6639       else if (CONSP (translation_table))
6640         {
6641           translation_table = Fcopy_sequence (translation_table);
6642           for (val = translation_table; CONSP (val); val = XCDR (val))
6643             if (SYMBOLP (XCAR (val)))
6644               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6645         }
6646       if (CHAR_TABLE_P (standard))
6647         {
6648           if (CONSP (translation_table))
6649             translation_table = nconc2 (translation_table,
6650                                         Fcons (standard, Qnil));
6651           else
6652             translation_table = Fcons (translation_table,
6653                                        Fcons (standard, Qnil));
6654         }
6655     }
6656
6657   if (max_lookup)
6658     {
6659       *max_lookup = 1;
6660       if (CHAR_TABLE_P (translation_table)
6661           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6662         {
6663           val = XCHAR_TABLE (translation_table)->extras[1];
6664           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6665             *max_lookup = XFASTINT (val);
6666         }
6667       else if (CONSP (translation_table))
6668         {
6669           Lisp_Object tail;
6670
6671           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6672             if (CHAR_TABLE_P (XCAR (tail))
6673                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6674               {
6675                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6676                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6677                   *max_lookup = XFASTINT (tailval);
6678               }
6679         }
6680     }
6681   return translation_table;
6682 }
6683
     /* Look up character C in TABLE, which is a char-table or a list
        whose char-table elements are consulted in order.  Whenever a
        lookup yields a character, C is replaced by it and TRANS is reset
        to Qnil.  Any other lookup result is left in TRANS; for a list,
        the search stops at the first such result that is non-nil.  If
        TABLE is neither a char-table nor a cons, TRANS is Qnil and C is
        unchanged.  C and TRANS must be lvalues; TABLE and C are evaluated
        more than once.  The expansion declares a local variable named
        `tail'.  */
6684 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6685   do {                                                          \
6686     trans = Qnil;                                               \
6687     if (CHAR_TABLE_P (table))                                   \
6688       {                                                         \
6689         trans = CHAR_TABLE_REF (table, c);                      \
6690         if (CHARACTERP (trans))                                 \
6691           c = XFASTINT (trans), trans = Qnil;                   \
6692       }                                                         \
6693     else if (CONSP (table))                                     \
6694       {                                                         \
6695         Lisp_Object tail;                                       \
6696                                                                 \
6697         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6698           if (CHAR_TABLE_P (XCAR (tail)))                       \
6699             {                                                   \
6700               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6701               if (CHARACTERP (trans))                           \
6702                 c = XFASTINT (trans), trans = Qnil;             \
6703               else if (! NILP (trans))                          \
6704                 break;                                          \
6705             }                                                   \
6706       }                                                         \
6707   } while (0)
6708
6709
6710 /* Return a translation of character(s) at BUF according to TRANS.
6711    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6712    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6713    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6714    translation is found, and Qnil if not found..
6715    If BUF is too short to lookup characters in FROM, return Qt.  */
6716
6717 static Lisp_Object
6718 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6719 {
6720
6721   if (INTEGERP (trans))
6722     return trans;
6723   for (; CONSP (trans); trans = XCDR (trans))
6724     {
6725       Lisp_Object val = XCAR (trans);
6726       Lisp_Object from = XCAR (val);
6727       ptrdiff_t len = ASIZE (from);
6728       ptrdiff_t i;
6729
6730       for (i = 0; i < len; i++)
6731         {
6732           if (buf + i == buf_end)
6733             return Qt;
6734           if (XINT (AREF (from, i)) != buf[i])
6735             break;
6736         }
6737       if (i == len)
6738         return val;
6739     }
6740   return Qnil;
6741 }
6742
6743
     /* Write decoded text to the destination of CODING, starting at
        CODING->destination + CODING->produced.

        If CODING->chars_at_source is zero, the characters are taken from
        CODING->charbuf and translated through TRANSLATION_TABLE; negative
        elements of the charbuf are annotation data and are skipped.
        Otherwise the CODING->consumed bytes at CODING->source are copied,
        going through ONE_MORE_BYTE or EMIT_ONE_BYTE (defined elsewhere;
        presumably converting the representation) when
        CODING->src_multibyte and CODING->dst_multibyte differ.

        The destination is enlarged with alloc_destination when needed.
        CODING->produced and CODING->produced_char are updated, and
        insert_from_gap is called when the destination is a buffer.

        Return the number of elements of CODING->charbuf that were not
        processed.  The loop stops early when a multi-character
        translation would need characters beyond the end of the charbuf
        and LAST_BLOCK is zero.  */
6744 static int
6745 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6746                int last_block)
6747 {
6748   unsigned char *dst = coding->destination + coding->produced;
6749   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6750   ptrdiff_t produced;
6751   ptrdiff_t produced_chars = 0;
6752   int carryover = 0;
6753
6754   if (! coding->chars_at_source)
6755     {
6756       /* Source characters are in coding->charbuf.  */
6757       int *buf = coding->charbuf;
6758       int *buf_end = buf + coding->charbuf_used;
6759
           /* Source and destination are the same object: limit the
              destination to the end of the consumed source bytes.  */
6760       if (EQ (coding->src_object, coding->dst_object))
6761         {
6762           coding_set_source (coding);
6763           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6764         }
6765
6766       while (buf < buf_end)
6767         {
6768           int c = *buf, i;
6769
6770           if (c >= 0)
6771             {
6772               ptrdiff_t from_nchars = 1, to_nchars = 1;
6773               Lisp_Object trans = Qnil;
6774
6775               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6776               if (! NILP (trans))
6777                 {
6778                   trans = get_translation (trans, buf, buf_end);
6779                   if (INTEGERP (trans))
6780                     c = XINT (trans);
6781                   else if (CONSP (trans))
6782                     {
6783                       from_nchars = ASIZE (XCAR (trans));
6784                       trans = XCDR (trans);
6785                       if (INTEGERP (trans))
6786                         c = XINT (trans);
6787                       else
6788                         {
6789                           to_nchars = ASIZE (trans);
6790                           c = XINT (AREF (trans, 0));
6791                         }
6792                     }
6793                   else if (EQ (trans, Qt) && ! last_block)
6794                     break;
6795                 }
6796
                   /* Make sure there is room for TO_NCHARS characters of up
                      to MAX_MULTIBYTE_LENGTH bytes each.  */
6797               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6798                 {
                       /* Guard the size computed below against overflow.  */
6799                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6800                        / MAX_MULTIBYTE_LENGTH)
6801                       < to_nchars)
6802                     memory_full (SIZE_MAX);
6803                   dst = alloc_destination (coding,
6804                                            buf_end - buf
6805                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6806                                            dst);
6807                   if (EQ (coding->src_object, coding->dst_object))
6808                     {
6809                       coding_set_source (coding);
6810                       dst_end = (((unsigned char *) coding->source)
6811                                  + coding->consumed);
6812                     }
6813                   else
6814                     dst_end = coding->destination + coding->dst_bytes;
6815                 }
6816
                   /* Store the translated character(s).  C already holds the
                      first one; the others are fetched from TRANS.  */
6817               for (i = 0; i < to_nchars; i++)
6818                 {
6819                   if (i > 0)
6820                     c = XINT (AREF (trans, i));
6821                   if (coding->dst_multibyte
6822                       || ! CHAR_BYTE8_P (c))
6823                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6824                   else
6825                     *dst++ = CHAR_TO_BYTE8 (c);
6826                 }
6827               produced_chars += to_nchars;
6828               buf += from_nchars;
6829             }
6830           else
6831             /* This is an annotation datum.  (-C) is the length.  */
6832             buf += -c;
6833         }
6834       carryover = buf_end - buf;
6835     }
6836   else
6837     {
6838       /* Source characters are at coding->source.  */
6839       const unsigned char *src = coding->source;
6840       const unsigned char *src_end = src + coding->consumed;
6841
6842       if (EQ (coding->dst_object, coding->src_object))
6843         dst_end = (unsigned char *) src;
6844       if (coding->src_multibyte != coding->dst_multibyte)
6845         {
6846           if (coding->src_multibyte)
6847             {
6848               int multibytep = 1;
6849               ptrdiff_t consumed_chars = 0;
6850
6851               while (1)
6852                 {
6853                   const unsigned char *src_base = src;
6854                   int c;
6855
6856                   ONE_MORE_BYTE (c);
6857                   if (dst == dst_end)
6858                     {
6859                       if (EQ (coding->src_object, coding->dst_object))
6860                         dst_end = (unsigned char *) src;
6861                       if (dst == dst_end)
6862                         {
6863                           ptrdiff_t offset = src - coding->source;
6864
6865                           dst = alloc_destination (coding, src_end - src + 1,
6866                                                    dst);
6867                           dst_end = coding->destination + coding->dst_bytes;
6868                           coding_set_source (coding);
6869                           src = coding->source + offset;
6870                           src_end = coding->source + coding->consumed;
6871                           if (EQ (coding->src_object, coding->dst_object))
6872                             dst_end = (unsigned char *) src;
6873                         }
6874                     }
6875                   *dst++ = c;
6876                   produced_chars++;
6877                 }
                   /* NOTE(review): nothing in this function jumps to this
                      label; presumably ONE_MORE_BYTE does so when the source
                      is exhausted, which would be the only exit from the
                      loop above -- confirm against the macro.  */
6878             no_more_source:
6879               ;
6880             }
6881           else
6882             while (src < src_end)
6883               {
6884                 int multibytep = 1;
6885                 int c = *src++;
6886
6887                 if (dst >= dst_end - 1)
6888                   {
6889                     if (EQ (coding->src_object, coding->dst_object))
6890                       dst_end = (unsigned char *) src;
6891                     if (dst >= dst_end - 1)
6892                       {
6893                         ptrdiff_t offset = src - coding->source;
6894                         ptrdiff_t more_bytes;
6895
6896                         if (EQ (coding->src_object, coding->dst_object))
6897                           more_bytes = ((src_end - src) / 2) + 2;
6898                         else
6899                           more_bytes = src_end - src + 2;
6900                         dst = alloc_destination (coding, more_bytes, dst);
6901                         dst_end = coding->destination + coding->dst_bytes;
6902                         coding_set_source (coding);
6903                         src = coding->source + offset;
6904                         src_end = coding->source + coding->consumed;
6905                         if (EQ (coding->src_object, coding->dst_object))
6906                           dst_end = (unsigned char *) src;
6907                       }
6908                   }
6909                 EMIT_ONE_BYTE (c);
6910               }
6911         }
6912       else
6913         {
               /* Source and destination use the same representation: copy
                  the bytes verbatim, enlarging the destination first when
                  it is a different object and is too small.  */
6914           if (!EQ (coding->src_object, coding->dst_object))
6915             {
6916               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6917
6918               if (require > 0)
6919                 {
6920                   ptrdiff_t offset = src - coding->source;
6921
6922                   dst = alloc_destination (coding, require, dst);
6923                   coding_set_source (coding);
6924                   src = coding->source + offset;
6925                   src_end = coding->source + coding->consumed;
6926                 }
6927             }
6928           produced_chars = coding->consumed_char;
6929           while (src < src_end)
6930             *dst++ = *src++;
6931         }
6932     }
6933
6934   produced = dst - (coding->destination + coding->produced);
6935   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6936     insert_from_gap (produced_chars, produced);
6937   coding->produced += produced;
6938   coding->produced_char += produced_chars;
6939   return carryover;
6940 }
6941
6942 /* Compose text in CODING->object according to the annotation data at
6943    CHARBUF.  CHARBUF is an array:
6944      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6945  */
6946
6947 static inline void
6948 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6949 {
6950   int len;
6951   ptrdiff_t to;
6952   enum composition_method method;
6953   Lisp_Object components;
6954
6955   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6956   to = pos + charbuf[2];
6957   method = (enum composition_method) (charbuf[4]);
6958
6959   if (method == COMPOSITION_RELATIVE)
6960     components = Qnil;
6961   else
6962     {
6963       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6964       int i, j;
6965
6966       if (method == COMPOSITION_WITH_RULE)
6967         len = charbuf[2] * 3 - 2;
6968       charbuf += MAX_ANNOTATION_LENGTH;
6969       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6970       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6971         {
6972           if (charbuf[i] >= 0)
6973             args[j] = make_number (charbuf[i]);
6974           else
6975             {
6976               i++;
6977               args[j] = make_number (charbuf[i] % 0x100);
6978             }
6979         }
6980       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6981     }
6982   compose_text (pos, to, components, Qnil, coding->dst_object);
6983 }
6984
6985
6986 /* Put `charset' property on text in CODING->object according to
6987    the annotation data at CHARBUF.  CHARBUF is an array:
6988      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6989  */
6990
6991 static inline void
6992 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6993 {
6994   ptrdiff_t from = pos - charbuf[2];
6995   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6996
6997   Fput_text_property (make_number (from), make_number (pos),
6998                       Qcharset, CHARSET_NAME (charset),
6999                       coding->dst_object);
7000 }
7001
7002
7003 #define CHARBUF_SIZE 0x4000  /* First work area size tried, in ints.  */
7004
     /* Allocate CODING->charbuf with alloca, starting at CHARBUF_SIZE
        elements and halving the request, while it is still above 1024
        elements, whenever alloca yields a null pointer.  The size
        obtained is recorded in CODING->charbuf_size.  If no buffer is
        obtained, record CODING_RESULT_INSUFFICIENT_MEM and return
        CODING->result from the function that uses this macro.  As the
        memory comes from alloca, it belongs to the stack frame of that
        function.
        NOTE(review): alloca is not specified to return a null pointer on
        failure, so the retry loop and the failure branch look
        unreachable in practice -- confirm.  */
7005 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7006   do {                                                                  \
7007     int size = CHARBUF_SIZE;                                            \
7008                                                                         \
7009     coding->charbuf = NULL;                                             \
7010     while (size > 1024)                                                 \
7011       {                                                                 \
7012         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
7013         if (coding->charbuf)                                            \
7014           break;                                                        \
7015         size >>= 1;                                                     \
7016       }                                                                 \
7017     if (! coding->charbuf)                                              \
7018       {                                                                 \
7019         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
7020         return coding->result;                                          \
7021       }                                                                 \
7022     coding->charbuf_size = size;                                        \
7023   } while (0)
7024
7025
7026 static void
7027 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7028 {
7029   int *charbuf = coding->charbuf;
7030   int *charbuf_end = charbuf + coding->charbuf_used;
7031
7032   if (NILP (coding->dst_object))
7033     return;
7034
7035   while (charbuf < charbuf_end)
7036     {
7037       if (*charbuf >= 0)
7038         pos++, charbuf++;
7039       else
7040         {
7041           int len = -*charbuf;
7042
7043           if (len > 2)
7044             switch (charbuf[1])
7045               {
7046               case CODING_ANNOTATE_COMPOSITION_MASK:
7047                 produce_composition (coding, charbuf, pos);
7048                 break;
7049               case CODING_ANNOTATE_CHARSET_MASK:
7050                 produce_charset (coding, charbuf, pos);
7051                 break;
7052               }
7053           charbuf += len;
7054         }
7055     }
7056 }
7057
7058 /* Decode the data at CODING->src_object into CODING->dst_object.
7059    CODING->src_object is a buffer, a string, or nil.
7060    CODING->dst_object is a buffer.
7061
7062    If CODING->src_object is a buffer, it must be the current buffer.
7063    In this case, if CODING->src_pos is positive, it is a position of
7064    the source text in the buffer, otherwise, the source text is in the
7065    gap area of the buffer, and CODING->src_pos specifies the offset of
7066    the text from GPT (which must be the same as PT).  If this is the
7067    same buffer as CODING->dst_object, CODING->src_pos must be
7068    negative.
7069
7070    If CODING->src_object is a string, CODING->src_pos is an index to
7071    that string.
7072
7073    If CODING->src_object is nil, CODING->source must already point to
7074    the non-relocatable memory area.  In this case, CODING->src_pos is
7075    an offset from CODING->source.
7076
7077    The decoded data is inserted at the current point of the buffer
7078    CODING->dst_object.
7079 */
7080
7081 static int
7082 decode_coding (struct coding_system *coding)
7083 {
7084   Lisp_Object attrs;
7085   Lisp_Object undo_list;
7086   Lisp_Object translation_table;
7087   struct ccl_spec cclspec;
7088   int carryover;
7089   int i;
7090
7091   if (BUFFERP (coding->src_object)
7092       && coding->src_pos > 0
7093       && coding->src_pos < GPT
7094       && coding->src_pos + coding->src_chars > GPT)
7095     move_gap_both (coding->src_pos, coding->src_pos_byte);
7096
7097   undo_list = Qt;
7098   if (BUFFERP (coding->dst_object))
7099     {
7100       if (current_buffer != XBUFFER (coding->dst_object))
7101         set_buffer_internal (XBUFFER (coding->dst_object));
7102       if (GPT != PT)
7103         move_gap_both (PT, PT_BYTE);
7104       undo_list = BVAR (current_buffer, undo_list);
7105       BVAR (current_buffer, undo_list) = Qt;
7106     }
7107
7108   coding->consumed = coding->consumed_char = 0;
7109   coding->produced = coding->produced_char = 0;
7110   coding->chars_at_source = 0;
7111   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7112   coding->errors = 0;
7113
7114   ALLOC_CONVERSION_WORK_AREA (coding);
7115
7116   attrs = CODING_ID_ATTRS (coding->id);
7117   translation_table = get_translation_table (attrs, 0, NULL);
7118
7119   carryover = 0;
7120   if (coding->decoder == decode_coding_ccl)
7121     {
7122       coding->spec.ccl = &cclspec;
7123       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7124     }
7125   do
7126     {
7127       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7128
7129       coding_set_source (coding);
7130       coding->annotated = 0;
7131       coding->charbuf_used = carryover;
7132       (*(coding->decoder)) (coding);
7133       coding_set_destination (coding);
7134       carryover = produce_chars (coding, translation_table, 0);
7135       if (coding->annotated)
7136         produce_annotation (coding, pos);
7137       for (i = 0; i < carryover; i++)
7138         coding->charbuf[i]
7139           = coding->charbuf[coding->charbuf_used - carryover + i];
7140     }
7141   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7142          || (coding->consumed < coding->src_bytes
7143              && (coding->result == CODING_RESULT_SUCCESS
7144                  || coding->result == CODING_RESULT_INVALID_SRC)));
7145
7146   if (carryover > 0)
7147     {
7148       coding_set_destination (coding);
7149       coding->charbuf_used = carryover;
7150       produce_chars (coding, translation_table, 1);
7151     }
7152
7153   coding->carryover_bytes = 0;
7154   if (coding->consumed < coding->src_bytes)
7155     {
7156       int nbytes = coding->src_bytes - coding->consumed;
7157       const unsigned char *src;
7158
7159       coding_set_source (coding);
7160       coding_set_destination (coding);
7161       src = coding->source + coding->consumed;
7162
7163       if (coding->mode & CODING_MODE_LAST_BLOCK)
7164         {
7165           /* Flush out unprocessed data as binary chars.  We are sure
7166              that the number of data is less than the size of
7167              coding->charbuf.  */
7168           coding->charbuf_used = 0;
7169           coding->chars_at_source = 0;
7170
7171           while (nbytes-- > 0)
7172             {
7173               int c = *src++;
7174
7175               if (c & 0x80)
7176                 c = BYTE8_TO_CHAR (c);
7177               coding->charbuf[coding->charbuf_used++] = c;
7178             }
7179           produce_chars (coding, Qnil, 1);
7180         }
7181       else
7182         {
7183           /* Record unprocessed bytes in coding->carryover.  We are
7184              sure that the number of data is less than the size of
7185              coding->carryover.  */
7186           unsigned char *p = coding->carryover;
7187
7188           if (nbytes > sizeof coding->carryover)
7189             nbytes = sizeof coding->carryover;
7190           coding->carryover_bytes = nbytes;
7191           while (nbytes-- > 0)
7192             *p++ = *src++;
7193         }
7194       coding->consumed = coding->src_bytes;
7195     }
7196
7197   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7198       && !inhibit_eol_conversion)
7199     decode_eol (coding);
7200   if (BUFFERP (coding->dst_object))
7201     {
7202       BVAR (current_buffer, undo_list) = undo_list;
7203       record_insert (coding->dst_pos, coding->produced_char);
7204     }
7205   return coding->result;
7206 }
7207
7208
7209 /* Extract an annotation datum from a composition starting at POS and
7210    ending before LIMIT of CODING->src_object (buffer or string), store
7211    the data in BUF, set *STOP to a starting position of the next
7212    composition (if any) or to LIMIT, and return the address of the
7213    next element of BUF.
7214
7215    If such an annotation is not found, set *STOP to a starting
7216    position of a composition after POS (if any) or to LIMIT, and
7217    return BUF.  */
7218
7219 static inline int *
7220 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7221                                struct coding_system *coding, int *buf,
7222                                ptrdiff_t *stop)
7223 {
7224   ptrdiff_t start, end;
7225   Lisp_Object prop;
7226
7227   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7228       || end > limit)
7229     *stop = limit;
7230   else if (start > pos)
7231     *stop = start;
7232   else
7233     {
7234       if (start == pos)
7235         {
7236           /* We found a composition.  Store the corresponding
7237              annotation data in BUF.  */
7238           int *head = buf;
7239           enum composition_method method = COMPOSITION_METHOD (prop);
7240           int nchars = COMPOSITION_LENGTH (prop);
7241
7242           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7243           if (method != COMPOSITION_RELATIVE)
7244             {
7245               Lisp_Object components;
7246               ptrdiff_t i, len, i_byte;
7247
7248               components = COMPOSITION_COMPONENTS (prop);
7249               if (VECTORP (components))
7250                 {
7251                   len = ASIZE (components);
7252                   for (i = 0; i < len; i++)
7253                     *buf++ = XINT (AREF (components, i));
7254                 }
7255               else if (STRINGP (components))
7256                 {
7257                   len = SCHARS (components);
7258                   i = i_byte = 0;
7259                   while (i < len)
7260                     {
7261                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7262                       buf++;
7263                     }
7264                 }
7265               else if (INTEGERP (components))
7266                 {
7267                   len = 1;
7268                   *buf++ = XINT (components);
7269                 }
7270               else if (CONSP (components))
7271                 {
7272                   for (len = 0; CONSP (components);
7273                        len++, components = XCDR (components))
7274                     *buf++ = XINT (XCAR (components));
7275                 }
7276               else
7277                 abort ();
7278               *head -= len;
7279             }
7280         }
7281
7282       if (find_composition (end, limit, &start, &end, &prop,
7283                             coding->src_object)
7284           && end <= limit)
7285         *stop = start;
7286       else
7287         *stop = limit;
7288     }
7289   return buf;
7290 }
7291
7292
7293 /* Extract an annotation datum from a text property `charset' at POS of
7294    CODING->src_object (buffer of string), store the data in BUF, set
7295    *STOP to the position where the value of `charset' property changes
7296    (limiting by LIMIT), and return the address of the next element of
7297    BUF.
7298
7299    If the property value is nil, set *STOP to the position where the
7300    property value is non-nil (limiting by LIMIT), and return BUF.  */
7301
7302 static inline int *
7303 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7304                            struct coding_system *coding, int *buf,
7305                            ptrdiff_t *stop)
7306 {
7307   Lisp_Object val, next;
7308   int id;
7309
7310   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7311   if (! NILP (val) && CHARSETP (val))
7312     id = XINT (CHARSET_SYMBOL_ID (val));
7313   else
7314     id = -1;
7315   ADD_CHARSET_DATA (buf, 0, id);
7316   next = Fnext_single_property_change (make_number (pos), Qcharset,
7317                                        coding->src_object,
7318                                        make_number (limit));
7319   *stop = XINT (next);
7320   return buf;
7321 }
7322
7323
7324 static void
7325 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7326                int max_lookup)
7327 {
7328   int *buf = coding->charbuf;
7329   int *buf_end = coding->charbuf + coding->charbuf_size;
7330   const unsigned char *src = coding->source + coding->consumed;
7331   const unsigned char *src_end = coding->source + coding->src_bytes;
7332   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7333   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7334   int multibytep = coding->src_multibyte;
7335   Lisp_Object eol_type;
7336   int c;
7337   ptrdiff_t stop, stop_composition, stop_charset;
7338   int *lookup_buf = NULL;
7339
7340   if (! NILP (translation_table))
7341     lookup_buf = alloca (sizeof (int) * max_lookup);
7342
7343   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7344   if (VECTORP (eol_type))
7345     eol_type = Qunix;
7346
7347   /* Note: composition handling is not yet implemented.  */
7348   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7349
7350   if (NILP (coding->src_object))
7351     stop = stop_composition = stop_charset = end_pos;
7352   else
7353     {
7354       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7355         stop = stop_composition = pos;
7356       else
7357         stop = stop_composition = end_pos;
7358       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7359         stop = stop_charset = pos;
7360       else
7361         stop_charset = end_pos;
7362     }
7363
7364   /* Compensate for CRLF and conversion.  */
7365   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7366   while (buf < buf_end)
7367     {
7368       Lisp_Object trans;
7369
7370       if (pos == stop)
7371         {
7372           if (pos == end_pos)
7373             break;
7374           if (pos == stop_composition)
7375             buf = handle_composition_annotation (pos, end_pos, coding,
7376                                                  buf, &stop_composition);
7377           if (pos == stop_charset)
7378             buf = handle_charset_annotation (pos, end_pos, coding,
7379                                              buf, &stop_charset);
7380           stop = (stop_composition < stop_charset
7381                   ? stop_composition : stop_charset);
7382         }
7383
7384       if (! multibytep)
7385         {
7386           int bytes;
7387
7388           if (coding->encoder == encode_coding_raw_text
7389               || coding->encoder == encode_coding_ccl)
7390             c = *src++, pos++;
7391           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7392             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7393           else
7394             c = BYTE8_TO_CHAR (*src), src++, pos++;
7395         }
7396       else
7397         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7398       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7399         c = '\n';
7400       if (! EQ (eol_type, Qunix))
7401         {
7402           if (c == '\n')
7403             {
7404               if (EQ (eol_type, Qdos))
7405                 *buf++ = '\r';
7406               else
7407                 c = '\r';
7408             }
7409         }
7410
7411       trans = Qnil;
7412       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7413       if (NILP (trans))
7414         *buf++ = c;
7415       else
7416         {
7417           ptrdiff_t from_nchars = 1, to_nchars = 1;
7418           int *lookup_buf_end;
7419           const unsigned char *p = src;
7420           int i;
7421
7422           lookup_buf[0] = c;
7423           for (i = 1; i < max_lookup && p < src_end; i++)
7424             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7425           lookup_buf_end = lookup_buf + i;
7426           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7427           if (INTEGERP (trans))
7428             c = XINT (trans);
7429           else if (CONSP (trans))
7430             {
7431               from_nchars = ASIZE (XCAR (trans));
7432               trans = XCDR (trans);
7433               if (INTEGERP (trans))
7434                 c = XINT (trans);
7435               else
7436                 {
7437                   to_nchars = ASIZE (trans);
7438                   if (buf_end - buf < to_nchars)
7439                     break;
7440                   c = XINT (AREF (trans, 0));
7441                 }
7442             }
7443           else
7444             break;
7445           *buf++ = c;
7446           for (i = 1; i < to_nchars; i++)
7447             *buf++ = XINT (AREF (trans, i));
7448           for (i = 1; i < from_nchars; i++, pos++)
7449             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7450         }
7451     }
7452
7453   coding->consumed = src - coding->source;
7454   coding->consumed_char = pos - coding->src_pos;
7455   coding->charbuf_used = buf - coding->charbuf;
7456   coding->chars_at_source = 0;
7457 }
7458
7459
7460 /* Encode the text at CODING->src_object into CODING->dst_object.
7461    CODING->src_object is a buffer or a string.
7462    CODING->dst_object is a buffer or nil.
7463
7464    If CODING->src_object is a buffer, it must be the current buffer.
7465    In this case, if CODING->src_pos is positive, it is a position of
7466    the source text in the buffer, otherwise. the source text is in the
7467    gap area of the buffer, and coding->src_pos specifies the offset of
7468    the text from GPT (which must be the same as PT).  If this is the
7469    same buffer as CODING->dst_object, CODING->src_pos must be
7470    negative and CODING should not have `pre-write-conversion'.
7471
7472    If CODING->src_object is a string, CODING should not have
7473    `pre-write-conversion'.
7474
7475    If CODING->dst_object is a buffer, the encoded data is inserted at
7476    the current point of that buffer.
7477
7478    If CODING->dst_object is nil, the encoded data is placed at the
7479    memory area specified by CODING->destination.  */
7480
7481 static int
7482 encode_coding (struct coding_system *coding)
7483 {
7484   Lisp_Object attrs;
7485   Lisp_Object translation_table;
7486   int max_lookup;
7487   struct ccl_spec cclspec;
7488
7489   attrs = CODING_ID_ATTRS (coding->id);
7490   if (coding->encoder == encode_coding_raw_text)
7491     translation_table = Qnil, max_lookup = 0;
7492   else
7493     translation_table = get_translation_table (attrs, 1, &max_lookup);
7494
7495   if (BUFFERP (coding->dst_object))
7496     {
7497       set_buffer_internal (XBUFFER (coding->dst_object));
7498       coding->dst_multibyte
7499         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7500     }
7501
7502   coding->consumed = coding->consumed_char = 0;
7503   coding->produced = coding->produced_char = 0;
7504   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7505   coding->errors = 0;
7506
7507   ALLOC_CONVERSION_WORK_AREA (coding);
7508
7509   if (coding->encoder == encode_coding_ccl)
7510     {
7511       coding->spec.ccl = &cclspec;
7512       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7513     }
7514   do {
7515     coding_set_source (coding);
7516     consume_chars (coding, translation_table, max_lookup);
7517     coding_set_destination (coding);
7518     (*(coding->encoder)) (coding);
7519   } while (coding->consumed_char < coding->src_chars);
7520
7521   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7522     insert_from_gap (coding->produced_char, coding->produced);
7523
7524   return (coding->result);
7525 }
7526
7527
7528 /* Name (or base name) of work buffer for code conversion.  */
7529 static Lisp_Object Vcode_conversion_workbuf_name;
7530
7531 /* A working buffer used by the top level conversion.  Once it is
7532    created, it is never destroyed.  It has the name
7533    Vcode_conversion_workbuf_name.  The other working buffers are
7534    destroyed after the use is finished, and their names are modified
7535    versions of Vcode_conversion_workbuf_name.  */
7536 static Lisp_Object Vcode_conversion_reused_workbuf;
7537
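/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when the reused buffer is
   released.  */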
7539 static int reused_workbuf_in_use;
7540
7541
7542 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7543    multibyteness of returning buffer.  */
7544
7545 static Lisp_Object
7546 make_conversion_work_buffer (int multibyte)
7547 {
7548   Lisp_Object name, workbuf;
7549   struct buffer *current;
7550
7551   if (reused_workbuf_in_use++)
7552     {
7553       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7554       workbuf = Fget_buffer_create (name);
7555     }
7556   else
7557     {
7558       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7559         Vcode_conversion_reused_workbuf
7560           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7561       workbuf = Vcode_conversion_reused_workbuf;
7562     }
7563   current = current_buffer;
7564   set_buffer_internal (XBUFFER (workbuf));
7565   /* We can't allow modification hooks to run in the work buffer.  For
7566      instance, directory_files_internal assumes that file decoding
7567      doesn't compile new regexps.  */
7568   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7569   Ferase_buffer ();
7570   BVAR (current_buffer, undo_list) = Qt;
7571   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7572   set_buffer_internal (current);
7573   return workbuf;
7574 }
7575
7576
7577 static Lisp_Object
7578 code_conversion_restore (Lisp_Object arg)
7579 {
7580   Lisp_Object current, workbuf;
7581   struct gcpro gcpro1;
7582
7583   GCPRO1 (arg);
7584   current = XCAR (arg);
7585   workbuf = XCDR (arg);
7586   if (! NILP (workbuf))
7587     {
7588       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7589         reused_workbuf_in_use = 0;
7590       else if (! NILP (Fbuffer_live_p (workbuf)))
7591         Fkill_buffer (workbuf);
7592     }
7593   set_buffer_internal (XBUFFER (current));
7594   UNGCPRO;
7595   return Qnil;
7596 }
7597
7598 Lisp_Object
7599 code_conversion_save (int with_work_buf, int multibyte)
7600 {
7601   Lisp_Object workbuf = Qnil;
7602
7603   if (with_work_buf)
7604     workbuf = make_conversion_work_buffer (multibyte);
7605   record_unwind_protect (code_conversion_restore,
7606                          Fcons (Fcurrent_buffer (), workbuf));
7607   return workbuf;
7608 }
7609
7610 int
7611 decode_coding_gap (struct coding_system *coding,
7612                    ptrdiff_t chars, ptrdiff_t bytes)
7613 {
7614   ptrdiff_t count = SPECPDL_INDEX ();
7615   Lisp_Object attrs;
7616
7617   code_conversion_save (0, 0);
7618
7619   coding->src_object = Fcurrent_buffer ();
7620   coding->src_chars = chars;
7621   coding->src_bytes = bytes;
7622   coding->src_pos = -chars;
7623   coding->src_pos_byte = -bytes;
7624   coding->src_multibyte = chars < bytes;
7625   coding->dst_object = coding->src_object;
7626   coding->dst_pos = PT;
7627   coding->dst_pos_byte = PT_BYTE;
7628   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7629
7630   if (CODING_REQUIRE_DETECTION (coding))
7631     detect_coding (coding);
7632
7633   coding->mode |= CODING_MODE_LAST_BLOCK;
7634   current_buffer->text->inhibit_shrinking = 1;
7635   decode_coding (coding);
7636   current_buffer->text->inhibit_shrinking = 0;
7637
7638   attrs = CODING_ID_ATTRS (coding->id);
7639   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7640     {
7641       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7642       Lisp_Object val;
7643
7644       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7645       val = call1 (CODING_ATTR_POST_READ (attrs),
7646                    make_number (coding->produced_char));
7647       CHECK_NATNUM (val);
7648       coding->produced_char += Z - prev_Z;
7649       coding->produced += Z_BYTE - prev_Z_BYTE;
7650     }
7651
7652   unbind_to (count, Qnil);
7653   return coding->result;
7654 }
7655
7656
7657 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7658    SRC_OBJECT into DST_OBJECT by coding context CODING.
7659
7660    SRC_OBJECT is a buffer, a string, or Qnil.
7661
7662    If it is a buffer, the text is at point of the buffer.  FROM and TO
7663    are positions in the buffer.
7664
7665    If it is a string, the text is at the beginning of the string.
7666    FROM and TO are indices to the string.
7667
7668    If it is nil, the text is at coding->source.  FROM and TO are
7669    indices to coding->source.
7670
7671    DST_OBJECT is a buffer, Qt, or Qnil.
7672
7673    If it is a buffer, the decoded text is inserted at point of the
7674    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7675    is deleted.
7676
7677    If it is Qt, a string is made from the decoded text, and
7678    set in CODING->dst_object.
7679
7680    If it is Qnil, the decoded text is stored at CODING->destination.
7681    The caller must allocate CODING->dst_bytes bytes at
7682    CODING->destination by xmalloc.  If the decoded text is longer than
7683    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7684  */
7685
7686 void
7687 decode_coding_object (struct coding_system *coding,
7688                       Lisp_Object src_object,
7689                       ptrdiff_t from, ptrdiff_t from_byte,
7690                       ptrdiff_t to, ptrdiff_t to_byte,
7691                       Lisp_Object dst_object)
7692 {
7693   ptrdiff_t count = SPECPDL_INDEX ();
7694   unsigned char *destination IF_LINT (= NULL);
7695   ptrdiff_t dst_bytes IF_LINT (= 0);
7696   ptrdiff_t chars = to - from;
7697   ptrdiff_t bytes = to_byte - from_byte;
7698   Lisp_Object attrs;
7699   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7700   int need_marker_adjustment = 0;
7701   Lisp_Object old_deactivate_mark;
7702
7703   old_deactivate_mark = Vdeactivate_mark;
7704
7705   if (NILP (dst_object))
7706     {
7707       destination = coding->destination;
7708       dst_bytes = coding->dst_bytes;
7709     }
7710
7711   coding->src_object = src_object;
7712   coding->src_chars = chars;
7713   coding->src_bytes = bytes;
7714   coding->src_multibyte = chars < bytes;
7715
7716   if (STRINGP (src_object))
7717     {
7718       coding->src_pos = from;
7719       coding->src_pos_byte = from_byte;
7720     }
7721   else if (BUFFERP (src_object))
7722     {
7723       set_buffer_internal (XBUFFER (src_object));
7724       if (from != GPT)
7725         move_gap_both (from, from_byte);
7726       if (EQ (src_object, dst_object))
7727         {
7728           struct Lisp_Marker *tail;
7729
7730           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7731             {
7732               tail->need_adjustment
7733                 = tail->charpos == (tail->insertion_type ? from : to);
7734               need_marker_adjustment |= tail->need_adjustment;
7735             }
7736           saved_pt = PT, saved_pt_byte = PT_BYTE;
7737           TEMP_SET_PT_BOTH (from, from_byte);
7738           current_buffer->text->inhibit_shrinking = 1;
7739           del_range_both (from, from_byte, to, to_byte, 1);
7740           coding->src_pos = -chars;
7741           coding->src_pos_byte = -bytes;
7742         }
7743       else
7744         {
7745           coding->src_pos = from;
7746           coding->src_pos_byte = from_byte;
7747         }
7748     }
7749
7750   if (CODING_REQUIRE_DETECTION (coding))
7751     detect_coding (coding);
7752   attrs = CODING_ID_ATTRS (coding->id);
7753
7754   if (EQ (dst_object, Qt)
7755       || (! NILP (CODING_ATTR_POST_READ (attrs))
7756           && NILP (dst_object)))
7757     {
7758       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7759       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7760       coding->dst_pos = BEG;
7761       coding->dst_pos_byte = BEG_BYTE;
7762     }
7763   else if (BUFFERP (dst_object))
7764     {
7765       code_conversion_save (0, 0);
7766       coding->dst_object = dst_object;
7767       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7768       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7769       coding->dst_multibyte
7770         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7771     }
7772   else
7773     {
7774       code_conversion_save (0, 0);
7775       coding->dst_object = Qnil;
7776       /* Most callers presume this will return a multibyte result, and they
7777          won't use `binary' or `raw-text' anyway, so let's not worry about
7778          CODING_FOR_UNIBYTE.  */
7779       coding->dst_multibyte = 1;
7780     }
7781
7782   decode_coding (coding);
7783
7784   if (BUFFERP (coding->dst_object))
7785     set_buffer_internal (XBUFFER (coding->dst_object));
7786
7787   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7788     {
7789       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7790       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7791       Lisp_Object val;
7792
7793       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7794       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7795               old_deactivate_mark);
7796       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7797                         make_number (coding->produced_char));
7798       UNGCPRO;
7799       CHECK_NATNUM (val);
7800       coding->produced_char += Z - prev_Z;
7801       coding->produced += Z_BYTE - prev_Z_BYTE;
7802     }
7803
7804   if (EQ (dst_object, Qt))
7805     {
7806       coding->dst_object = Fbuffer_string ();
7807     }
7808   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7809     {
7810       set_buffer_internal (XBUFFER (coding->dst_object));
7811       if (dst_bytes < coding->produced)
7812         {
7813           destination = xrealloc (destination, coding->produced);
7814           if (! destination)
7815             {
7816               record_conversion_result (coding,
7817                                         CODING_RESULT_INSUFFICIENT_MEM);
7818               unbind_to (count, Qnil);
7819               return;
7820             }
7821           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7822             move_gap_both (BEGV, BEGV_BYTE);
7823           memcpy (destination, BEGV_ADDR, coding->produced);
7824           coding->destination = destination;
7825         }
7826     }
7827
7828   if (saved_pt >= 0)
7829     {
7830       /* This is the case of:
7831          (BUFFERP (src_object) && EQ (src_object, dst_object))
7832          As we have moved PT while replacing the original buffer
7833          contents, we must recover it now.  */
7834       set_buffer_internal (XBUFFER (src_object));
7835       current_buffer->text->inhibit_shrinking = 0;
7836       if (saved_pt < from)
7837         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7838       else if (saved_pt < from + chars)
7839         TEMP_SET_PT_BOTH (from, from_byte);
7840       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7841         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7842                           saved_pt_byte + (coding->produced - bytes));
7843       else
7844         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7845                           saved_pt_byte + (coding->produced - bytes));
7846
7847       if (need_marker_adjustment)
7848         {
7849           struct Lisp_Marker *tail;
7850
7851           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7852             if (tail->need_adjustment)
7853               {
7854                 tail->need_adjustment = 0;
7855                 if (tail->insertion_type)
7856                   {
7857                     tail->bytepos = from_byte;
7858                     tail->charpos = from;
7859                   }
7860                 else
7861                   {
7862                     tail->bytepos = from_byte + coding->produced;
7863                     tail->charpos
7864                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7865                          ? tail->bytepos : from + coding->produced_char);
7866                   }
7867               }
7868         }
7869     }
7870
7871   Vdeactivate_mark = old_deactivate_mark;
7872   unbind_to (count, coding->dst_object);
7873 }
7874
7875
7876 void
7877 encode_coding_object (struct coding_system *coding,
7878                       Lisp_Object src_object,
7879                       ptrdiff_t from, ptrdiff_t from_byte,
7880                       ptrdiff_t to, ptrdiff_t to_byte,
7881                       Lisp_Object dst_object)
7882 {
7883   ptrdiff_t count = SPECPDL_INDEX ();
7884   ptrdiff_t chars = to - from;
7885   ptrdiff_t bytes = to_byte - from_byte;
7886   Lisp_Object attrs;
7887   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7888   int need_marker_adjustment = 0;
7889   int kill_src_buffer = 0;
7890   Lisp_Object old_deactivate_mark;
7891
7892   old_deactivate_mark = Vdeactivate_mark;
7893
7894   coding->src_object = src_object;
7895   coding->src_chars = chars;
7896   coding->src_bytes = bytes;
7897   coding->src_multibyte = chars < bytes;
7898
7899   attrs = CODING_ID_ATTRS (coding->id);
7900
7901   if (EQ (src_object, dst_object))
7902     {
7903       struct Lisp_Marker *tail;
7904
7905       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7906         {
7907           tail->need_adjustment
7908             = tail->charpos == (tail->insertion_type ? from : to);
7909           need_marker_adjustment |= tail->need_adjustment;
7910         }
7911     }
7912
7913   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7914     {
7915       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7916       set_buffer_internal (XBUFFER (coding->src_object));
7917       if (STRINGP (src_object))
7918         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7919       else if (BUFFERP (src_object))
7920         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7921       else
7922         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7923
7924       if (EQ (src_object, dst_object))
7925         {
7926           set_buffer_internal (XBUFFER (src_object));
7927           saved_pt = PT, saved_pt_byte = PT_BYTE;
7928           del_range_both (from, from_byte, to, to_byte, 1);
7929           set_buffer_internal (XBUFFER (coding->src_object));
7930         }
7931
7932       {
7933         Lisp_Object args[3];
7934         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7935
7936         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7937                 old_deactivate_mark);
7938         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7939         args[1] = make_number (BEG);
7940         args[2] = make_number (Z);
7941         safe_call (3, args);
7942         UNGCPRO;
7943       }
7944       if (XBUFFER (coding->src_object) != current_buffer)
7945         kill_src_buffer = 1;
7946       coding->src_object = Fcurrent_buffer ();
7947       if (BEG != GPT)
7948         move_gap_both (BEG, BEG_BYTE);
7949       coding->src_chars = Z - BEG;
7950       coding->src_bytes = Z_BYTE - BEG_BYTE;
7951       coding->src_pos = BEG;
7952       coding->src_pos_byte = BEG_BYTE;
7953       coding->src_multibyte = Z < Z_BYTE;
7954     }
7955   else if (STRINGP (src_object))
7956     {
7957       code_conversion_save (0, 0);
7958       coding->src_pos = from;
7959       coding->src_pos_byte = from_byte;
7960     }
7961   else if (BUFFERP (src_object))
7962     {
7963       code_conversion_save (0, 0);
7964       set_buffer_internal (XBUFFER (src_object));
7965       if (EQ (src_object, dst_object))
7966         {
7967           saved_pt = PT, saved_pt_byte = PT_BYTE;
7968           coding->src_object = del_range_1 (from, to, 1, 1);
7969           coding->src_pos = 0;
7970           coding->src_pos_byte = 0;
7971         }
7972       else
7973         {
7974           if (from < GPT && to >= GPT)
7975             move_gap_both (from, from_byte);
7976           coding->src_pos = from;
7977           coding->src_pos_byte = from_byte;
7978         }
7979     }
7980   else
7981     code_conversion_save (0, 0);
7982
7983   if (BUFFERP (dst_object))
7984     {
7985       coding->dst_object = dst_object;
7986       if (EQ (src_object, dst_object))
7987         {
7988           coding->dst_pos = from;
7989           coding->dst_pos_byte = from_byte;
7990         }
7991       else
7992         {
7993           struct buffer *current = current_buffer;
7994
7995           set_buffer_temp (XBUFFER (dst_object));
7996           coding->dst_pos = PT;
7997           coding->dst_pos_byte = PT_BYTE;
7998           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7999           set_buffer_temp (current);
8000         }
8001       coding->dst_multibyte
8002         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8003     }
8004   else if (EQ (dst_object, Qt))
8005     {
8006       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8007       coding->dst_object = Qnil;
8008       coding->destination = (unsigned char *) xmalloc (dst_bytes);
8009       coding->dst_bytes = dst_bytes;
8010       coding->dst_multibyte = 0;
8011     }
8012   else
8013     {
8014       coding->dst_object = Qnil;
8015       coding->dst_multibyte = 0;
8016     }
8017
8018   encode_coding (coding);
8019
8020   if (EQ (dst_object, Qt))
8021     {
8022       if (BUFFERP (coding->dst_object))
8023         coding->dst_object = Fbuffer_string ();
8024       else
8025         {
8026           coding->dst_object
8027             = make_unibyte_string ((char *) coding->destination,
8028                                    coding->produced);
8029           xfree (coding->destination);
8030         }
8031     }
8032
8033   if (saved_pt >= 0)
8034     {
8035       /* This is the case of:
8036          (BUFFERP (src_object) && EQ (src_object, dst_object))
8037          As we have moved PT while replacing the original buffer
8038          contents, we must recover it now.  */
8039       set_buffer_internal (XBUFFER (src_object));
8040       if (saved_pt < from)
8041         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8042       else if (saved_pt < from + chars)
8043         TEMP_SET_PT_BOTH (from, from_byte);
8044       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8045         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8046                           saved_pt_byte + (coding->produced - bytes));
8047       else
8048         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8049                           saved_pt_byte + (coding->produced - bytes));
8050
8051       if (need_marker_adjustment)
8052         {
8053           struct Lisp_Marker *tail;
8054
8055           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8056             if (tail->need_adjustment)
8057               {
8058                 tail->need_adjustment = 0;
8059                 if (tail->insertion_type)
8060                   {
8061                     tail->bytepos = from_byte;
8062                     tail->charpos = from;
8063                   }
8064                 else
8065                   {
8066                     tail->bytepos = from_byte + coding->produced;
8067                     tail->charpos
8068                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8069                          ? tail->bytepos : from + coding->produced_char);
8070                   }
8071               }
8072         }
8073     }
8074
8075   if (kill_src_buffer)
8076     Fkill_buffer (coding->src_object);
8077
8078   Vdeactivate_mark = old_deactivate_mark;
8079   unbind_to (count, Qnil);
8080 }
8081
8082
8083 Lisp_Object
8084 preferred_coding_system (void)
8085 {
8086   int id = coding_categories[coding_priorities[0]].id;
8087
8088   return CODING_ID_NAME (id);
8089 }
8090
8091 \f
8092 #ifdef emacs
8093 /*** 11. Emacs Lisp library functions ***/
8094
8095 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8096        doc: /* Return t if OBJECT is nil or a coding-system.
8097 See the documentation of `define-coding-system' for information
8098 about coding-system objects.  */)
8099   (Lisp_Object object)
8100 {
8101   if (NILP (object)
8102       || CODING_SYSTEM_ID (object) >= 0)
8103     return Qt;
8104   if (! SYMBOLP (object)
8105       || NILP (Fget (object, Qcoding_system_define_form)))
8106     return Qnil;
8107   return Qt;
8108 }
8109
8110 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8111        Sread_non_nil_coding_system, 1, 1, 0,
8112        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8113   (Lisp_Object prompt)
8114 {
8115   Lisp_Object val;
8116   do
8117     {
8118       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8119                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8120     }
8121   while (SCHARS (val) == 0);
8122   return (Fintern (val, Qnil));
8123 }
8124
8125 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8126        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8127 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8128 Ignores case when completing coding systems (all Emacs coding systems
8129 are lower-case).  */)
8130   (Lisp_Object prompt, Lisp_Object default_coding_system)
8131 {
8132   Lisp_Object val;
8133   ptrdiff_t count = SPECPDL_INDEX ();
8134
8135   if (SYMBOLP (default_coding_system))
8136     default_coding_system = SYMBOL_NAME (default_coding_system);
8137   specbind (Qcompletion_ignore_case, Qt);
8138   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8139                           Qt, Qnil, Qcoding_system_history,
8140                           default_coding_system, Qnil);
8141   unbind_to (count, Qnil);
8142   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8143 }
8144
8145 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8146        1, 1, 0,
8147        doc: /* Check validity of CODING-SYSTEM.
8148 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8149 It is valid if it is nil or a symbol defined as a coding system by the
8150 function `define-coding-system'.  */)
8151   (Lisp_Object coding_system)
8152 {
8153   Lisp_Object define_form;
8154
8155   define_form = Fget (coding_system, Qcoding_system_define_form);
8156   if (! NILP (define_form))
8157     {
8158       Fput (coding_system, Qcoding_system_define_form, Qnil);
8159       safe_eval (define_form);
8160     }
8161   if (!NILP (Fcoding_system_p (coding_system)))
8162     return coding_system;
8163   xsignal1 (Qcoding_system_error, coding_system);
8164 }
8165
8166 \f
8167 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8168    HIGHEST is nonzero, return the coding system of the highest
8169    priority among the detected coding systems.  Otherwise return a
8170    list of detected coding systems sorted by their priorities.  If
8171    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8172    multibyte form but contain only ASCII and eight-bit chars.
8173    Otherwise, the bytes are raw bytes.
8174
8175    CODING-SYSTEM controls the detection as below:
8176
8177    If it is nil, detect both text-format and eol-format.  If the
8178    text-format part of CODING-SYSTEM is already specified
8179    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8180    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8181    detect only text-format.  */
     /* SRC_CHARS is the number of characters in the text.  Internally the
        candidates are collected in VAL as a list of coding system ids;
        the final pass replaces each id by a coding system name, using
        an element of the system's eol-type vector when an end-of-line
        format was determined.  With HIGHEST nonzero the result is nil
        when VAL ends up empty.  */
8182
8183 Lisp_Object
8184 detect_coding_system (const unsigned char *src,
8185                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8186                       int highest, int multibytep,
8187                       Lisp_Object coding_system)
8188 {
8189   const unsigned char *src_end = src + src_bytes;
8190   Lisp_Object attrs, eol_type;
8191   Lisp_Object val = Qnil;
8192   struct coding_system coding;
8193   ptrdiff_t id;
8194   struct coding_detection_info detect_info;
8195   enum coding_category base_category;
8196   int null_byte_found = 0, eight_bit_found = 0;
8197
       /* A nil CODING-SYSTEM is treated as Qundecided.  */
8198   if (NILP (coding_system))
8199     coding_system = Qundecided;
8200   setup_coding_system (coding_system, &coding);
8201   attrs = CODING_ID_ATTRS (coding.id);
8202   eol_type = CODING_ID_EOL_TYPE (coding.id);
8203   coding_system = CODING_ATTR_BASE_NAME (attrs);
8204
       /* Describe the source text in the local coding structure, which is
          what gets handed to the detectors.  */
8205   coding.source = src;
8206   coding.src_chars = src_chars;
8207   coding.src_bytes = src_bytes;
8208   coding.src_multibyte = multibytep;
8209   coding.consumed = 0;
8210   coding.mode |= CODING_MODE_LAST_BLOCK;
8211   coding.head_ascii = 0;
8212
8213   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8214
8215   /* At first, detect text-format if necessary.  */
8216   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8217   if (base_category == coding_category_undecided)
8218     {
8219       enum coding_category category IF_LINT (= 0);
8220       struct coding_system *this IF_LINT (= NULL);
8221       int c, i;
8222
8223       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii counts the bytes seen before the first
              eight-bit byte.  The scan stops early once both a null byte
              and an eight-bit byte have been seen, or when the ISO-2022
              detector reports that it has scanned the whole data.  */
8224       for (; src < src_end; src++)
8225         {
8226           c = *src;
8227           if (c & 0x80)
8228             {
8229               eight_bit_found = 1;
8230               if (null_byte_found)
8231                 break;
8232             }
8233           else if (c < 0x20)
8234             {
8235               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8236                   && ! inhibit_iso_escape_detection
8237                   && ! detect_info.checked)
8238                 {
8239                   if (detect_coding_iso_2022 (&coding, &detect_info))
8240                     {
8241                       /* We have scanned the whole data.  */
8242                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8243                         {
8244                           /* We didn't find an 8-bit code.  We may
8245                              have found a null-byte, but it's very
8246                              rare that a binary file conform to
8247                              ISO-2022.  */
8248                           src = src_end;
8249                           coding.head_ascii = src - coding.source;
8250                         }
8251                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8252                       break;
8253                     }
8254                 }
8255               else if (! c && !inhibit_null_byte_detection)
8256                 {
8257                   null_byte_found = 1;
8258                   if (eight_bit_found)
8259                     break;
8260                 }
8261               if (! eight_bit_found)
8262                 coding.head_ascii++;
8263             }
8264           else if (! eight_bit_found)
8265             coding.head_ascii++;
8266         }
8267
           /* Consult the coding categories only when the scan saw something
              other than plain ASCII or the ISO-2022 detector found a
              candidate.  */
8268       if (null_byte_found || eight_bit_found
8269           || coding.head_ascii < coding.src_bytes
8270           || detect_info.found)
8271         {
8272           if (coding.head_ascii == coding.src_bytes)
8273             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8274             for (i = 0; i < coding_category_raw_text; i++)
8275               {
8276                 category = coding_priorities[i];
8277                 this = coding_categories + category;
8278                 if (detect_info.found & (1 << category))
8279                   break;
8280               }
8281           else
8282             {
                   /* A null byte marks every category except the UTF-16
                      ones as both checked and rejected.  */
8283               if (null_byte_found)
8284                 {
8285                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8286                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8287                 }
                   /* Walk the categories in priority order, running the
                      detector of each one that has not been checked yet.
                      When HIGHEST is nonzero the walk stops at the first
                      category that is found; CATEGORY and THIS then
                      identify it for the code below.  */
8288               for (i = 0; i < coding_category_raw_text; i++)
8289                 {
8290                   category = coding_priorities[i];
8291                   this = coding_categories + category;
8292
8293                   if (this->id < 0)
8294                     {
8295                       /* No coding system of this category is defined.  */
8296                       detect_info.rejected |= (1 << category);
8297                     }
8298                   else if (category >= coding_category_raw_text)
8299                     continue;
8300                   else if (detect_info.checked & (1 << category))
8301                     {
8302                       if (highest
8303                           && (detect_info.found & (1 << category)))
8304                         break;
8305                     }
8306                   else if ((*(this->detector)) (&coding, &detect_info)
8307                            && highest
8308                            && (detect_info.found & (1 << category)))
8309                     {
                           /* For the auto-detecting UTF-16 category, record
                              the concrete byte order that was found.  */
8310                       if (category == coding_category_utf_16_auto)
8311                         {
8312                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8313                             category = coding_category_utf_16_le;
8314                           else
8315                             category = coding_category_utf_16_be;
8316                         }
8317                       break;
8318                     }
8319                 }
8320             }
8321         }
8322
           /* Build VAL, a list of coding system ids, from what was found
              and rejected.  */
8323       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8324           || null_byte_found)
8325         {
               /* Everything was rejected, or a null byte was seen: the
                  only candidate is Qno_conversion.  */
8326           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8327           id = CODING_SYSTEM_ID (Qno_conversion);
8328           val = Fcons (make_number (id), Qnil);
8329         }
8330       else if (! detect_info.rejected && ! detect_info.found)
8331         {
               /* Nothing was found and nothing rejected: stay with the
                  coding system of the undecided category.  */
8332           detect_info.found = CATEGORY_MASK_ANY;
8333           id = coding_categories[coding_category_undecided].id;
8334           val = Fcons (make_number (id), Qnil);
8335         }
8336       else if (highest)
8337         {
8338           if (detect_info.found)
8339             {
                   /* CATEGORY and THIS are the values the loops above
                      left behind.  */
8340               detect_info.found = 1 << category;
8341               val = Fcons (make_number (this->id), Qnil);
8342             }
8343           else
                 /* Nothing was found: take the first category, in priority
                    order, that was not rejected.  */
8344             for (i = 0; i < coding_category_raw_text; i++)
8345               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8346                 {
8347                   detect_info.found = 1 << coding_priorities[i];
8348                   id = coding_categories[coding_priorities[i]].id;
8349                   val = Fcons (make_number (id), Qnil);
8350                   break;
8351                 }
8352         }
8353       else
8354         {
8355           int mask = detect_info.rejected | detect_info.found;
8356           int found = 0;
8357
               /* Both loops run from the lowest priority upwards and push
                  onto the front of VAL, so each group ends up in priority
                  order.  The first loop collects the categories that were
                  neither rejected nor found; the second pushes the found
                  ones in front of them.  */
8358           for (i = coding_category_raw_text - 1; i >= 0; i--)
8359             {
8360               category = coding_priorities[i];
8361               if (! (mask & (1 << category)))
8362                 {
8363                   found |= 1 << category;
8364                   id = coding_categories[category].id;
8365                   if (id >= 0)
8366                     val = Fcons (make_number (id), val);
8367                 }
8368             }
8369           for (i = coding_category_raw_text - 1; i >= 0; i--)
8370             {
8371               category = coding_priorities[i];
8372               if (detect_info.found & (1 << category))
8373                 {
8374                   id = coding_categories[category].id;
8375                   val = Fcons (make_number (id), val);
8376                 }
8377             }
8378           detect_info.found |= found;
8379         }
8380     }
8381   else if (base_category == coding_category_utf_8_auto)
8382     {
           /* Only the presence of a signature remains to be decided.  VAL
              stays nil if the UTF-8 detector returns zero.  */
8383       if (detect_coding_utf_8 (&coding, &detect_info))
8384         {
8385           struct coding_system *this;
8386
8387           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8388             this = coding_categories + coding_category_utf_8_sig;
8389           else
8390             this = coding_categories + coding_category_utf_8_nosig;
8391           val = Fcons (make_number (this->id), Qnil);
8392         }
8393     }
8394   else if (base_category == coding_category_utf_16_auto)
8395     {
           /* Pick the concrete UTF-16 variant.  VAL stays nil if the
              UTF-16 detector returns zero.  */
8396       if (detect_coding_utf_16 (&coding, &detect_info))
8397         {
8398           struct coding_system *this;
8399
8400           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8401             this = coding_categories + coding_category_utf_16_le;
8402           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8403             this = coding_categories + coding_category_utf_16_be;
8404           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8405             this = coding_categories + coding_category_utf_16_be_nosig;
8406           else
8407             this = coding_categories + coding_category_utf_16_le_nosig;
8408           val = Fcons (make_number (this->id), Qnil);
8409         }
8410     }
8411   else
8412     {
           /* The text-format is already fixed by CODING-SYSTEM.  */
8413       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8414       val = Fcons (make_number (coding.id), Qnil);
8415     }
8416
8417   /* Then, detect eol-format if necessary.  */
8418   {
         /* Each variable holds one of the EOL_SEEN_* values once an eol
            format is known for the corresponding group of categories.
            NOTE(review): the initial -1 presumably differs from every
            EOL_SEEN_* value -- confirm against their definitions.  */
8419     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8420     Lisp_Object tail;
8421
         /* A vector EOL_TYPE means the eol format is still to be detected;
            otherwise it is taken from EOL_TYPE directly.  */
8422     if (VECTORP (eol_type))
8423       {
8424         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8425           {
8426             if (null_byte_found)
8427               normal_eol = EOL_SEEN_LF;
8428             else
8429               normal_eol = detect_eol (coding.source, src_bytes,
8430                                        coding_category_raw_text);
8431           }
8432         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8433                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8434           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8435                                       coding_category_utf_16_be);
8436         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8437                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8438           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8439                                       coding_category_utf_16_le);
8440       }
8441     else
8442       {
8443         if (EQ (eol_type, Qunix))
8444           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8445         else if (EQ (eol_type, Qdos))
8446           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8447         else
8448           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8449       }
8450
         /* Replace each id in VAL by a coding system name: the element of
            the system's eol-type vector matching the eol format that was
            seen, or the plain name when no such format applies.  */
8451     for (tail = val; CONSP (tail); tail = XCDR (tail))
8452       {
8453         enum coding_category category;
8454         int this_eol;
8455
8456         id = XINT (XCAR (tail));
8457         attrs = CODING_ID_ATTRS (id);
8458         category = XINT (CODING_ATTR_CATEGORY (attrs));
8459         eol_type = CODING_ID_EOL_TYPE (id);
8460         if (VECTORP (eol_type))
8461           {
8462             if (category == coding_category_utf_16_be
8463                 || category == coding_category_utf_16_be_nosig)
8464               this_eol = utf_16_be_eol;
8465             else if (category == coding_category_utf_16_le
8466                      || category == coding_category_utf_16_le_nosig)
8467               this_eol = utf_16_le_eol;
8468             else
8469               this_eol = normal_eol;
8470
8471             if (this_eol == EOL_SEEN_LF)
8472               XSETCAR (tail, AREF (eol_type, 0));
8473             else if (this_eol == EOL_SEEN_CRLF)
8474               XSETCAR (tail, AREF (eol_type, 1));
8475             else if (this_eol == EOL_SEEN_CR)
8476               XSETCAR (tail, AREF (eol_type, 2));
8477             else
8478               XSETCAR (tail, CODING_ID_NAME (id));
8479           }
8480         else
8481           XSETCAR (tail, CODING_ID_NAME (id));
8482       }
8483   }
8484
8485   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8486 }
8487
8488
8489 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8490        2, 3, 0,
8491        doc: /* Detect coding system of the text in the region between START and END.
8492 Return a list of possible coding systems ordered by priority.
8493 The coding systems to try and their priorities follows what
8494 the function `coding-system-priority-list' (which see) returns.
8495
8496 If only ASCII characters are found (except for such ISO-2022 control
8497 characters as ESC), it returns a list of single element `undecided'
8498 or its subsidiary coding system according to a detected end-of-line
8499 format.
8500
8501 If optional argument HIGHEST is non-nil, return the coding system of
8502 highest priority.  */)
8503   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8504 {
8505   ptrdiff_t from, to;
8506   ptrdiff_t from_byte, to_byte;
8507
8508   CHECK_NUMBER_COERCE_MARKER (start);
8509   CHECK_NUMBER_COERCE_MARKER (end);
8510
8511   validate_region (&start, &end);
8512   from = XINT (start), to = XINT (end);
8513   from_byte = CHAR_TO_BYTE (from);
8514   to_byte = CHAR_TO_BYTE (to);
8515
8516   if (from < GPT && to >= GPT)
8517     move_gap_both (to, to_byte);
8518
8519   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8520                                to - from, to_byte - from_byte,
8521                                !NILP (highest),
8522                                !NILP (BVAR (current_buffer
8523                                       , enable_multibyte_characters)),
8524                                Qnil);
8525 }
8526
8527 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8528        1, 2, 0,
8529        doc: /* Detect coding system of the text in STRING.
8530 Return a list of possible coding systems ordered by priority.
8531 The coding systems to try and their priorities follows what
8532 the function `coding-system-priority-list' (which see) returns.
8533
8534 If only ASCII characters are found (except for such ISO-2022 control
8535 characters as ESC), it returns a list of single element `undecided'
8536 or its subsidiary coding system according to a detected end-of-line
8537 format.
8538
8539 If optional argument HIGHEST is non-nil, return the coding system of
8540 highest priority.  */)
8541   (Lisp_Object string, Lisp_Object highest)
8542 {
8543   CHECK_STRING (string);
8544
8545   return detect_coding_system (SDATA (string),
8546                                SCHARS (string), SBYTES (string),
8547                                !NILP (highest), STRING_MULTIBYTE (string),
8548                                Qnil);
8549 }
8550
8551
8552 static inline int
8553 char_encodable_p (int c, Lisp_Object attrs)
8554 {
8555   Lisp_Object tail;
8556   struct charset *charset;
8557   Lisp_Object translation_table;
8558
8559   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8560   if (! NILP (translation_table))
8561     c = translate_char (translation_table, c);
8562   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8563        CONSP (tail); tail = XCDR (tail))
8564     {
8565       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8566       if (CHAR_CHARSET_P (c, charset))
8567         break;
8568     }
8569   return (! NILP (tail));
8570 }
8571
8572
8573 /* Return a list of coding systems that safely encode the text between
8574    START and END.  If EXCLUDE is non-nil, it is a list of coding
8575    systems not to check.  The returned list doesn't contain any such
8576    coding systems.  In any case, if the text contains only ASCII or is
8577    unibyte, return t.  */
     /* START may also be a string, in which case the whole string is
        examined and END is not consulted.  Otherwise the returned list
        always contains Qraw_text and Qno_conversion, and every other
        element is the base name of a coding system.  */
8578
8579 DEFUN ("find-coding-systems-region-internal",
8580        Ffind_coding_systems_region_internal,
8581        Sfind_coding_systems_region_internal, 2, 3, 0,
8582        doc: /* Internal use only.  */)
8583   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8584 {
8585   Lisp_Object coding_attrs_list, safe_codings;
8586   ptrdiff_t start_byte, end_byte;
8587   const unsigned char *p, *pbeg, *pend;
8588   int c;
8589   Lisp_Object tail, elt, work_table;
8590
8591   if (STRINGP (start))
8592     {
           /* A unibyte string, or one with as many characters as bytes,
              needs no further checking.  */
8593       if (!STRING_MULTIBYTE (start)
8594           || SCHARS (start) == SBYTES (start))
8595         return Qt;
8596       start_byte = 0;
8597       end_byte = SBYTES (start);
8598     }
8599   else
8600     {
8601       CHECK_NUMBER_COERCE_MARKER (start);
8602       CHECK_NUMBER_COERCE_MARKER (end);
8603       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8604         args_out_of_range (start, end);
8605       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8606         return Qt;
8607       start_byte = CHAR_TO_BYTE (XINT (start));
8608       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many characters as bytes: nothing to check.  */
8609       if (XINT (end) - XINT (start) == end_byte - start_byte)
8610         return Qt;
8611
           /* The text is scanned through raw pointers below, so when the
              gap lies inside the region, move it to whichever end of the
              region is nearer.  */
8612       if (XINT (start) < GPT && XINT (end) > GPT)
8613         {
8614           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8615             move_gap_both (XINT (start), start_byte);
8616           else
8617             move_gap_both (XINT (end), end_byte);
8618         }
8619     }
8620
       /* Collect the attribute vectors of the candidate coding systems:
          every element of Vcoding_system_list that is not in EXCLUDE, is
          its own base name, and is not of type Qundecided.  The result
          of get_translation_table is stored in each collected vector's
          coding_attr_trans_tbl slot.  */
8621   coding_attrs_list = Qnil;
8622   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8623     if (NILP (exclude)
8624         || NILP (Fmemq (XCAR (tail), exclude)))
8625       {
8626         Lisp_Object attrs;
8627
8628         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8629         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8630             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8631           {
8632             ASET (attrs, coding_attr_trans_tbl,
8633                   get_translation_table (attrs, 1, NULL));
8634             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8635           }
8636       }
8637
8638   if (STRINGP (start))
8639     p = pbeg = SDATA (start);
8640   else
8641     p = pbeg = BYTE_POS_ADDR (start_byte);
8642   pend = p + (end_byte - start_byte);
8643
       /* Trim the ASCII bytes at both ends of the text.  */
8644   while (p < pend && ASCII_BYTE_P (*p)) p++;
8645   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8646
       /* WORK_TABLE records the characters that have been tested already,
          so that each distinct character is tested only once.  */
8647   work_table = Fmake_char_table (Qnil, Qnil);
8648   while (p < pend)
8649     {
8650       if (ASCII_BYTE_P (*p))
8651         p++;
8652       else
8653         {
8654           c = STRING_CHAR_ADVANCE (p);
8655           if (!NILP (char_table_ref (work_table, c)))
8656             /* This character was already checked.  Ignore it.  */
8657             continue;
8658
8659           charset_map_loaded = 0;
               /* Drop from the candidate list, in place, every coding
                  system that cannot encode C.  A cell that has a successor
                  is overwritten with the successor's contents and then
                  examined again (TAIL is not advanced); the last cell
                  cannot be unlinked this way, so its car is set to nil
                  instead.  */
8660           for (tail = coding_attrs_list; CONSP (tail);)
8661             {
8662               elt = XCAR (tail);
8663               if (NILP (elt))
8664                 tail = XCDR (tail);
8665               else if (char_encodable_p (c, elt))
8666                 tail = XCDR (tail);
8667               else if (CONSP (XCDR (tail)))
8668                 {
8669                   XSETCAR (tail, XCAR (XCDR (tail)));
8670                   XSETCDR (tail, XCDR (XCDR (tail)));
8671                 }
8672               else
8673                 {
8674                   XSETCAR (tail, Qnil);
8675                   tail = XCDR (tail);
8676                 }
8677             }
               /* NOTE(review): charset_map_loaded is presumably set when a
                  charset map had to be loaded during the checks above,
                  which may have relocated the text -- confirm.  The
                  pointers are recomputed from their offsets.  */
8678           if (charset_map_loaded)
8679             {
8680               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8681
8682               if (STRINGP (start))
8683                 pbeg = SDATA (start);
8684               else
8685                 pbeg = BYTE_POS_ADDR (start_byte);
8686               p = pbeg + p_offset;
8687               pend = pbeg + pend_offset;
8688             }
8689           char_table_set (work_table, c, Qt);
8690         }
8691     }
8692
       /* The survivors are the non-nil elements of the candidate list;
          Qraw_text and Qno_conversion are always included.  */
8693   safe_codings = list2 (Qraw_text, Qno_conversion);
8694   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8695     if (! NILP (XCAR (tail)))
8696       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8697
8698   return safe_codings;
8699 }
8700
8701
8702 DEFUN ("unencodable-char-position", Funencodable_char_position,
8703        Sunencodable_char_position, 3, 5, 0,
8704        doc: /*
8705 Return position of first un-encodable character in a region.
8706 START and END specify the region and CODING-SYSTEM specifies the
8707 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8708
8709 If optional 4th argument COUNT is non-nil, it specifies at most how
8710 many un-encodable characters to search.  In this case, the value is a
8711 list of positions.
8712
8713 If optional 5th argument STRING is non-nil, it is a string to search
8714 for un-encodable characters.  In that case, START and END are indexes
8715 to the string.  */)
8716   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8717 {
8718   EMACS_INT n;
8719   struct coding_system coding;
8720   Lisp_Object attrs, charset_list, translation_table;
8721   Lisp_Object positions;
8722   ptrdiff_t from, to;
8723   const unsigned char *p, *stop, *pend;
8724   int ascii_compatible;
8725
8726   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8727   attrs = CODING_ID_ATTRS (coding.id);
8728   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8729     return Qnil;
8730   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8731   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8732   translation_table = get_translation_table (attrs, 1, NULL);
8733
8734   if (NILP (string))
8735     {
8736       validate_region (&start, &end);
8737       from = XINT (start);
8738       to = XINT (end);
8739       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8740           || (ascii_compatible
8741               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8742         return Qnil;
8743       p = CHAR_POS_ADDR (from);
8744       pend = CHAR_POS_ADDR (to);
8745       if (from < GPT && to >= GPT)
8746         stop = GPT_ADDR;
8747       else
8748         stop = pend;
8749     }
8750   else
8751     {
8752       CHECK_STRING (string);
8753       CHECK_NATNUM (start);
8754       CHECK_NATNUM (end);
8755       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8756         args_out_of_range_3 (string, start, end);
8757       from = XINT (start);
8758       to = XINT (end);
8759       if (! STRING_MULTIBYTE (string))
8760         return Qnil;
8761       p = SDATA (string) + string_char_to_byte (string, from);
8762       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8763       if (ascii_compatible && (to - from) == (pend - p))
8764         return Qnil;
8765     }
8766
8767   if (NILP (count))
8768     n = 1;
8769   else
8770     {
8771       CHECK_NATNUM (count);
8772       n = XINT (count);
8773     }
8774
8775   positions = Qnil;
8776   charset_map_loaded = 0;
8777   while (1)
8778     {
8779       int c;
8780
8781       if (ascii_compatible)
8782         while (p < stop && ASCII_BYTE_P (*p))
8783           p++, from++;
8784       if (p >= stop)
8785         {
8786           if (p >= pend)
8787             break;
8788           stop = pend;
8789           p = GAP_END_ADDR;
8790         }
8791
8792       c = STRING_CHAR_ADVANCE (p);
8793       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8794           && ! char_charset (translate_char (translation_table, c),
8795                              charset_list, NULL))
8796         {
8797           positions = Fcons (make_number (from), positions);
8798           n--;
8799           if (n == 0)
8800             break;
8801         }
8802
8803       from++;
8804       if (charset_map_loaded && NILP (string))
8805         {
8806           p = CHAR_POS_ADDR (from);
8807           pend = CHAR_POS_ADDR (to);
8808           if (from < GPT && to >= GPT)
8809             stop = GPT_ADDR;
8810           else
8811             stop = pend;
8812           charset_map_loaded = 0;
8813         }
8814     }
8815
8816   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8817 }
8818
8819
8820 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8821        Scheck_coding_systems_region, 3, 3, 0,
8822        doc: /* Check if the region is encodable by coding systems.
8823
8824 START and END are buffer positions specifying the region.
8825 CODING-SYSTEM-LIST is a list of coding systems to check.
8826
8827 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8828 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8829 whole region, POS0, POS1, ... are buffer positions where non-encodable
8830 characters are found.
8831
8832 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8833 value is nil.
8834
8835 START may be a string.  In that case, check if the string is
8836 encodable, and the value contains indices to the string instead of
8837 buffer positions.  END is ignored.
8838
8839 If the current buffer (or START if it is a string) is unibyte, the value
8840 is nil.  */)
8841   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8842 {
8843   Lisp_Object list;
8844   ptrdiff_t start_byte, end_byte;
8845   ptrdiff_t pos;
8846   const unsigned char *p, *pbeg, *pend;
8847   int c;
8848   Lisp_Object tail, elt, attrs;
8849
8850   if (STRINGP (start))
8851     {
8852       if (!STRING_MULTIBYTE (start)
8853           || SCHARS (start) == SBYTES (start))
8854         return Qnil;
8855       start_byte = 0;
8856       end_byte = SBYTES (start);
8857       pos = 0;
8858     }
8859   else
8860     {
8861       CHECK_NUMBER_COERCE_MARKER (start);
8862       CHECK_NUMBER_COERCE_MARKER (end);
8863       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8864         args_out_of_range (start, end);
8865       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8866         return Qnil;
8867       start_byte = CHAR_TO_BYTE (XINT (start));
8868       end_byte = CHAR_TO_BYTE (XINT (end));
8869       if (XINT (end) - XINT (start) == end_byte - start_byte)
8870         return Qnil;
8871
8872       if (XINT (start) < GPT && XINT (end) > GPT)
8873         {
8874           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8875             move_gap_both (XINT (start), start_byte);
8876           else
8877             move_gap_both (XINT (end), end_byte);
8878         }
8879       pos = XINT (start);
8880     }
8881
8882   list = Qnil;
8883   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8884     {
8885       elt = XCAR (tail);
8886       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8887       ASET (attrs, coding_attr_trans_tbl,
8888             get_translation_table (attrs, 1, NULL));
8889       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8890     }
8891
8892   if (STRINGP (start))
8893     p = pbeg = SDATA (start);
8894   else
8895     p = pbeg = BYTE_POS_ADDR (start_byte);
8896   pend = p + (end_byte - start_byte);
8897
8898   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8899   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8900
8901   while (p < pend)
8902     {
8903       if (ASCII_BYTE_P (*p))
8904         p++;
8905       else
8906         {
8907           c = STRING_CHAR_ADVANCE (p);
8908
8909           charset_map_loaded = 0;
8910           for (tail = list; CONSP (tail); tail = XCDR (tail))
8911             {
8912               elt = XCDR (XCAR (tail));
8913               if (! char_encodable_p (c, XCAR (elt)))
8914                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8915             }
8916           if (charset_map_loaded)
8917             {
8918               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8919
8920               if (STRINGP (start))
8921                 pbeg = SDATA (start);
8922               else
8923                 pbeg = BYTE_POS_ADDR (start_byte);
8924               p = pbeg + p_offset;
8925               pend = pbeg + pend_offset;
8926             }
8927         }
8928       pos++;
8929     }
8930
8931   tail = list;
8932   list = Qnil;
8933   for (; CONSP (tail); tail = XCDR (tail))
8934     {
8935       elt = XCAR (tail);
8936       if (CONSP (XCDR (XCDR (elt))))
8937         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8938                       list);
8939     }
8940
8941   return list;
8942 }
8943
8944
8945 static Lisp_Object
8946 code_convert_region (Lisp_Object start, Lisp_Object end,
8947                      Lisp_Object coding_system, Lisp_Object dst_object,
8948                      int encodep, int norecord)
8949 {
8950   struct coding_system coding;
8951   ptrdiff_t from, from_byte, to, to_byte;
8952   Lisp_Object src_object;
8953
8954   CHECK_NUMBER_COERCE_MARKER (start);
8955   CHECK_NUMBER_COERCE_MARKER (end);
8956   if (NILP (coding_system))
8957     coding_system = Qno_conversion;
8958   else
8959     CHECK_CODING_SYSTEM (coding_system);
8960   src_object = Fcurrent_buffer ();
8961   if (NILP (dst_object))
8962     dst_object = src_object;
8963   else if (! EQ (dst_object, Qt))
8964     CHECK_BUFFER (dst_object);
8965
8966   validate_region (&start, &end);
8967   from = XFASTINT (start);
8968   from_byte = CHAR_TO_BYTE (from);
8969   to = XFASTINT (end);
8970   to_byte = CHAR_TO_BYTE (to);
8971
8972   setup_coding_system (coding_system, &coding);
8973   coding.mode |= CODING_MODE_LAST_BLOCK;
8974
8975   if (encodep)
8976     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8977                           dst_object);
8978   else
8979     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8980                           dst_object);
8981   if (! norecord)
8982     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8983
8984   return (BUFFERP (dst_object)
8985           ? make_number (coding.produced_char)
8986           : coding.dst_object);
8987 }
8988
8989
8990 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8991        3, 4, "r\nzCoding system: ",
8992        doc: /* Decode the current region from the specified coding system.
8993 When called from a program, takes four arguments:
8994         START, END, CODING-SYSTEM, and DESTINATION.
8995 START and END are buffer positions.
8996
8997 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8998 If nil, the region between START and END is replaced by the decoded text.
8999 If buffer, the decoded text is inserted in that buffer after point (point
9000 does not move).
9001 In those cases, the length of the decoded text is returned.
9002 If DESTINATION is t, the decoded text is returned.
9003
9004 This function sets `last-coding-system-used' to the precise coding system
9005 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9006 not fully specified.)  */)
9007   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9008 {
9009   return code_convert_region (start, end, coding_system, destination, 0, 0);
9010 }
9011
9012 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9013        3, 4, "r\nzCoding system: ",
9014        doc: /* Encode the current region by specified coding system.
9015 When called from a program, takes four arguments:
9016         START, END, CODING-SYSTEM and DESTINATION.
9017 START and END are buffer positions.
9018
9019 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9020 If nil, the region between START and END is replace by the encoded text.
9021 If buffer, the encoded text is inserted in that buffer after point (point
9022 does not move).
9023 In those cases, the length of the encoded text is returned.
9024 If DESTINATION is t, the encoded text is returned.
9025
9026 This function sets `last-coding-system-used' to the precise coding system
9027 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9028 not fully specified.)  */)
9029   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9030 {
9031   return code_convert_region (start, end, coding_system, destination, 1, 0);
9032 }
9033
9034 Lisp_Object
9035 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9036                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9037 {
9038   struct coding_system coding;
9039   ptrdiff_t chars, bytes;
9040
9041   CHECK_STRING (string);
9042   if (NILP (coding_system))
9043     {
9044       if (! norecord)
9045         Vlast_coding_system_used = Qno_conversion;
9046       if (NILP (dst_object))
9047         return (nocopy ? Fcopy_sequence (string) : string);
9048     }
9049
9050   if (NILP (coding_system))
9051     coding_system = Qno_conversion;
9052   else
9053     CHECK_CODING_SYSTEM (coding_system);
9054   if (NILP (dst_object))
9055     dst_object = Qt;
9056   else if (! EQ (dst_object, Qt))
9057     CHECK_BUFFER (dst_object);
9058
9059   setup_coding_system (coding_system, &coding);
9060   coding.mode |= CODING_MODE_LAST_BLOCK;
9061   chars = SCHARS (string);
9062   bytes = SBYTES (string);
9063   if (encodep)
9064     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9065   else
9066     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9067   if (! norecord)
9068     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9069
9070   return (BUFFERP (dst_object)
9071           ? make_number (coding.produced_char)
9072           : coding.dst_object);
9073 }
9074
9075
9076 /* Encode or decode STRING according to CODING_SYSTEM.
9077    Do not set Vlast_coding_system_used.
9078
9079    This function is called only from macros DECODE_FILE and
9080    ENCODE_FILE, thus we ignore character composition.  */
9081
9082 Lisp_Object
9083 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9084                               int encodep)
9085 {
9086   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9087 }
9088
9089
9090 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9091        2, 4, 0,
9092        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9093
9094 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9095 if the decoding operation is trivial.
9096
9097 Optional fourth arg BUFFER non-nil means that the decoded text is
9098 inserted in that buffer after point (point does not move).  In this
9099 case, the return value is the length of the decoded text.
9100
9101 This function sets `last-coding-system-used' to the precise coding system
9102 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9103 not fully specified.)  */)
9104   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9105 {
9106   return code_convert_string (string, coding_system, buffer,
9107                               0, ! NILP (nocopy), 0);
9108 }
9109
9110 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9111        2, 4, 0,
9112        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9113
9114 Optional third arg NOCOPY non-nil means it is OK to return STRING
9115 itself if the encoding operation is trivial.
9116
9117 Optional fourth arg BUFFER non-nil means that the encoded text is
9118 inserted in that buffer after point (point does not move).  In this
9119 case, the return value is the length of the encoded text.
9120
9121 This function sets `last-coding-system-used' to the precise coding system
9122 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9123 not fully specified.)  */)
9124   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9125 {
9126   return code_convert_string (string, coding_system, buffer,
9127                               1, ! NILP (nocopy), 0);
9128 }
9129
9130 \f
9131 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9132        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9133 Return the corresponding character.  */)
9134   (Lisp_Object code)
9135 {
9136   Lisp_Object spec, attrs, val;
9137   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9138   EMACS_INT ch;
9139   int c;
9140
9141   CHECK_NATNUM (code);
9142   ch = XFASTINT (code);
9143   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9144   attrs = AREF (spec, 0);
9145
9146   if (ASCII_BYTE_P (ch)
9147       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9148     return code;
9149
9150   val = CODING_ATTR_CHARSET_LIST (attrs);
9151   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9152   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9153   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9154
9155   if (ch <= 0x7F)
9156     {
9157       c = ch;
9158       charset = charset_roman;
9159     }
9160   else if (ch >= 0xA0 && ch < 0xDF)
9161     {
9162       c = ch - 0x80;
9163       charset = charset_kana;
9164     }
9165   else
9166     {
9167       EMACS_INT c1 = ch >> 8;
9168       int c2 = ch & 0xFF;
9169
9170       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9171           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9172         error ("Invalid code: %"pI"d", ch);
9173       c = ch;
9174       SJIS_TO_JIS (c);
9175       charset = charset_kanji;
9176     }
9177   c = DECODE_CHAR (charset, c);
9178   if (c < 0)
9179     error ("Invalid code: %"pI"d", ch);
9180   return make_number (c);
9181 }
9182
9183
9184 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9185        doc: /* Encode a Japanese character CH to shift_jis encoding.
9186 Return the corresponding code in SJIS.  */)
9187   (Lisp_Object ch)
9188 {
9189   Lisp_Object spec, attrs, charset_list;
9190   int c;
9191   struct charset *charset;
9192   unsigned code;
9193
9194   CHECK_CHARACTER (ch);
9195   c = XFASTINT (ch);
9196   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9197   attrs = AREF (spec, 0);
9198
9199   if (ASCII_CHAR_P (c)
9200       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9201     return ch;
9202
9203   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9204   charset = char_charset (c, charset_list, &code);
9205   if (code == CHARSET_INVALID_CODE (charset))
9206     error ("Can't encode by shift_jis encoding: %c", c);
9207   JIS_TO_SJIS (code);
9208
9209   return make_number (code);
9210 }
9211
9212 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9213        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9214 Return the corresponding character.  */)
9215   (Lisp_Object code)
9216 {
9217   Lisp_Object spec, attrs, val;
9218   struct charset *charset_roman, *charset_big5, *charset;
9219   EMACS_INT ch;
9220   int c;
9221
9222   CHECK_NATNUM (code);
9223   ch = XFASTINT (code);
9224   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9225   attrs = AREF (spec, 0);
9226
9227   if (ASCII_BYTE_P (ch)
9228       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9229     return code;
9230
9231   val = CODING_ATTR_CHARSET_LIST (attrs);
9232   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9233   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9234
9235   if (ch <= 0x7F)
9236     {
9237       c = ch;
9238       charset = charset_roman;
9239     }
9240   else
9241     {
9242       EMACS_INT b1 = ch >> 8;
9243       int b2 = ch & 0x7F;
9244       if (b1 < 0xA1 || b1 > 0xFE
9245           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9246         error ("Invalid code: %"pI"d", ch);
9247       c = ch;
9248       charset = charset_big5;
9249     }
9250   c = DECODE_CHAR (charset, c);
9251   if (c < 0)
9252     error ("Invalid code: %"pI"d", ch);
9253   return make_number (c);
9254 }
9255
9256 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9257        doc: /* Encode the Big5 character CH to BIG5 coding system.
9258 Return the corresponding character code in Big5.  */)
9259   (Lisp_Object ch)
9260 {
9261   Lisp_Object spec, attrs, charset_list;
9262   struct charset *charset;
9263   int c;
9264   unsigned code;
9265
9266   CHECK_CHARACTER (ch);
9267   c = XFASTINT (ch);
9268   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9269   attrs = AREF (spec, 0);
9270   if (ASCII_CHAR_P (c)
9271       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9272     return ch;
9273
9274   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9275   charset = char_charset (c, charset_list, &code);
9276   if (code == CHARSET_INVALID_CODE (charset))
9277     error ("Can't encode by Big5 encoding: %c", c);
9278
9279   return make_number (code);
9280 }
9281
9282 \f
9283 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9284        Sset_terminal_coding_system_internal, 1, 2, 0,
9285        doc: /* Internal use only.  */)
9286   (Lisp_Object coding_system, Lisp_Object terminal)
9287 {
9288   struct terminal *term = get_terminal (terminal, 1);
9289   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9290   CHECK_SYMBOL (coding_system);
9291   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9292   /* We had better not send unsafe characters to terminal.  */
9293   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9294   /* Character composition should be disabled.  */
9295   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9296   terminal_coding->src_multibyte = 1;
9297   terminal_coding->dst_multibyte = 0;
9298   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9299     term->charset_list = coding_charset_list (terminal_coding);
9300   else
9301     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9302   return Qnil;
9303 }
9304
9305 DEFUN ("set-safe-terminal-coding-system-internal",
9306        Fset_safe_terminal_coding_system_internal,
9307        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9308        doc: /* Internal use only.  */)
9309   (Lisp_Object coding_system)
9310 {
9311   CHECK_SYMBOL (coding_system);
9312   setup_coding_system (Fcheck_coding_system (coding_system),
9313                        &safe_terminal_coding);
9314   /* Character composition should be disabled.  */
9315   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9316   safe_terminal_coding.src_multibyte = 1;
9317   safe_terminal_coding.dst_multibyte = 0;
9318   return Qnil;
9319 }
9320
9321 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9322        Sterminal_coding_system, 0, 1, 0,
9323        doc: /* Return coding system specified for terminal output on the given terminal.
9324 TERMINAL may be a terminal object, a frame, or nil for the selected
9325 frame's terminal device.  */)
9326   (Lisp_Object terminal)
9327 {
9328   struct coding_system *terminal_coding
9329     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9330   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9331
9332   /* For backward compatibility, return nil if it is `undecided'.  */
9333   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9334 }
9335
9336 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9337        Sset_keyboard_coding_system_internal, 1, 2, 0,
9338        doc: /* Internal use only.  */)
9339   (Lisp_Object coding_system, Lisp_Object terminal)
9340 {
9341   struct terminal *t = get_terminal (terminal, 1);
9342   CHECK_SYMBOL (coding_system);
9343   if (NILP (coding_system))
9344     coding_system = Qno_conversion;
9345   else
9346     Fcheck_coding_system (coding_system);
9347   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9348   /* Character composition should be disabled.  */
9349   TERMINAL_KEYBOARD_CODING (t)->common_flags
9350     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9351   return Qnil;
9352 }
9353
9354 DEFUN ("keyboard-coding-system",
9355        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9356        doc: /* Return coding system specified for decoding keyboard input.  */)
9357   (Lisp_Object terminal)
9358 {
9359   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9360                          (get_terminal (terminal, 1))->id);
9361 }
9362
9363 \f
9364 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9365        Sfind_operation_coding_system,  1, MANY, 0,
9366        doc: /* Choose a coding system for an operation based on the target name.
9367 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9368 DECODING-SYSTEM is the coding system to use for decoding
9369 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9370 for encoding (in case OPERATION does encoding).
9371
9372 The first argument OPERATION specifies an I/O primitive:
9373   For file I/O, `insert-file-contents' or `write-region'.
9374   For process I/O, `call-process', `call-process-region', or `start-process'.
9375   For network I/O, `open-network-stream'.
9376
9377 The remaining arguments should be the same arguments that were passed
9378 to the primitive.  Depending on which primitive, one of those arguments
9379 is selected as the TARGET.  For example, if OPERATION does file I/O,
9380 whichever argument specifies the file name is TARGET.
9381
9382 TARGET has a meaning which depends on OPERATION:
9383   For file I/O, TARGET is a file name (except for the special case below).
9384   For process I/O, TARGET is a process name.
9385   For network I/O, TARGET is a service name or a port number.
9386
9387 This function looks up what is specified for TARGET in
9388 `file-coding-system-alist', `process-coding-system-alist',
9389 or `network-coding-system-alist' depending on OPERATION.
9390 They may specify a coding system, a cons of coding systems,
9391 or a function symbol to call.
9392 In the last case, we call the function with one argument,
9393 which is a list of all the arguments given to this function.
9394 If the function can't decide a coding system, it can return
9395 `undecided' so that the normal code-detection is performed.
9396
9397 If OPERATION is `insert-file-contents', the argument corresponding to
9398 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9399 file name to look up, and BUFFER is a buffer that contains the file's
9400 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9401 function to call for FILENAME, that function should examine the
9402 contents of BUFFER instead of reading the file.
9403
9404 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9405   (ptrdiff_t nargs, Lisp_Object *args)
9406 {
9407   Lisp_Object operation, target_idx, target, val;
9408   register Lisp_Object chain;
9409
9410   if (nargs < 2)
9411     error ("Too few arguments");
9412   operation = args[0];
9413   if (!SYMBOLP (operation)
9414       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9415     error ("Invalid first argument");
9416   if (nargs <= 1 + XFASTINT (target_idx))
9417     error ("Too few arguments for operation `%s'",
9418            SDATA (SYMBOL_NAME (operation)));
9419   target = args[XFASTINT (target_idx) + 1];
9420   if (!(STRINGP (target)
9421         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9422             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9423         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9424     error ("Invalid argument %"pI"d of operation `%s'",
9425            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9426   if (CONSP (target))
9427     target = XCAR (target);
9428
9429   chain = ((EQ (operation, Qinsert_file_contents)
9430             || EQ (operation, Qwrite_region))
9431            ? Vfile_coding_system_alist
9432            : (EQ (operation, Qopen_network_stream)
9433               ? Vnetwork_coding_system_alist
9434               : Vprocess_coding_system_alist));
9435   if (NILP (chain))
9436     return Qnil;
9437
9438   for (; CONSP (chain); chain = XCDR (chain))
9439     {
9440       Lisp_Object elt;
9441
9442       elt = XCAR (chain);
9443       if (CONSP (elt)
9444           && ((STRINGP (target)
9445                && STRINGP (XCAR (elt))
9446                && fast_string_match (XCAR (elt), target) >= 0)
9447               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9448         {
9449           val = XCDR (elt);
9450           /* Here, if VAL is both a valid coding system and a valid
9451              function symbol, we return VAL as a coding system.  */
9452           if (CONSP (val))
9453             return val;
9454           if (! SYMBOLP (val))
9455             return Qnil;
9456           if (! NILP (Fcoding_system_p (val)))
9457             return Fcons (val, val);
9458           if (! NILP (Ffboundp (val)))
9459             {
9460               /* We use call1 rather than safe_call1
9461                  so as to get bug reports about functions called here
9462                  which don't handle the current interface.  */
9463               val = call1 (val, Flist (nargs, args));
9464               if (CONSP (val))
9465                 return val;
9466               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9467                 return Fcons (val, val);
9468             }
9469           return Qnil;
9470         }
9471     }
9472   return Qnil;
9473 }
9474
9475 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9476        Sset_coding_system_priority, 0, MANY, 0,
9477        doc: /* Assign higher priority to the coding systems given as arguments.
9478 If multiple coding systems belong to the same category,
9479 all but the first one are ignored.
9480
9481 usage: (set-coding-system-priority &rest coding-systems)  */)
9482   (ptrdiff_t nargs, Lisp_Object *args)
9483 {
9484   ptrdiff_t i, j;
9485   int changed[coding_category_max];
9486   enum coding_category priorities[coding_category_max];
9487
9488   memset (changed, 0, sizeof changed);
9489
9490   for (i = j = 0; i < nargs; i++)
9491     {
9492       enum coding_category category;
9493       Lisp_Object spec, attrs;
9494
9495       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9496       attrs = AREF (spec, 0);
9497       category = XINT (CODING_ATTR_CATEGORY (attrs));
9498       if (changed[category])
9499         /* Ignore this coding system because a coding system of the
9500            same category already had a higher priority.  */
9501         continue;
9502       changed[category] = 1;
9503       priorities[j++] = category;
9504       if (coding_categories[category].id >= 0
9505           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9506         setup_coding_system (args[i], &coding_categories[category]);
9507       Fset (AREF (Vcoding_category_table, category), args[i]);
9508     }
9509
9510   /* Now we have decided top J priorities.  Reflect the order of the
9511      original priorities to the remaining priorities.  */
9512
9513   for (i = j, j = 0; i < coding_category_max; i++, j++)
9514     {
9515       while (j < coding_category_max
9516              && changed[coding_priorities[j]])
9517         j++;
9518       if (j == coding_category_max)
9519         abort ();
9520       priorities[i] = coding_priorities[j];
9521     }
9522
9523   memcpy (coding_priorities, priorities, sizeof priorities);
9524
9525   /* Update `coding-category-list'.  */
9526   Vcoding_category_list = Qnil;
9527   for (i = coding_category_max; i-- > 0; )
9528     Vcoding_category_list
9529       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9530                Vcoding_category_list);
9531
9532   return Qnil;
9533 }
9534
9535 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9536        Scoding_system_priority_list, 0, 1, 0,
9537        doc: /* Return a list of coding systems ordered by their priorities.
9538 The list contains a subset of coding systems; i.e. coding systems
9539 assigned to each coding category (see `coding-category-list').
9540
9541 HIGHESTP non-nil means just return the highest priority one.  */)
9542   (Lisp_Object highestp)
9543 {
9544   int i;
9545   Lisp_Object val;
9546
9547   for (i = 0, val = Qnil; i < coding_category_max; i++)
9548     {
9549       enum coding_category category = coding_priorities[i];
9550       int id = coding_categories[category].id;
9551       Lisp_Object attrs;
9552
9553       if (id < 0)
9554         continue;
9555       attrs = CODING_ID_ATTRS (id);
9556       if (! NILP (highestp))
9557         return CODING_ATTR_BASE_NAME (attrs);
9558       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9559     }
9560   return Fnreverse (val);
9561 }
9562
9563 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9564
9565 static Lisp_Object
9566 make_subsidiaries (Lisp_Object base)
9567 {
9568   Lisp_Object subsidiaries;
9569   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9570   char *buf = (char *) alloca (base_name_len + 6);
9571   int i;
9572
9573   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9574   subsidiaries = Fmake_vector (make_number (3), Qnil);
9575   for (i = 0; i < 3; i++)
9576     {
9577       strcpy (buf + base_name_len, suffixes[i]);
9578       ASET (subsidiaries, i, intern (buf));
9579     }
9580   return subsidiaries;
9581 }
9582
9583
9584 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9585        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9586        doc: /* For internal use only.
9587 usage: (define-coding-system-internal ...)  */)
9588   (ptrdiff_t nargs, Lisp_Object *args)
9589 {
9590   Lisp_Object name;
9591   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9592   Lisp_Object attrs;            /* Vector of attributes.  */
9593   Lisp_Object eol_type;
9594   Lisp_Object aliases;
9595   Lisp_Object coding_type, charset_list, safe_charsets;
9596   enum coding_category category;
9597   Lisp_Object tail, val;
9598   int max_charset_id = 0;
9599   int i;
9600
9601   if (nargs < coding_arg_max)
9602     goto short_args;
9603
9604   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9605
9606   name = args[coding_arg_name];
9607   CHECK_SYMBOL (name);
9608   CODING_ATTR_BASE_NAME (attrs) = name;
9609
9610   val = args[coding_arg_mnemonic];
9611   if (! STRINGP (val))
9612     CHECK_CHARACTER (val);
9613   CODING_ATTR_MNEMONIC (attrs) = val;
9614
9615   coding_type = args[coding_arg_coding_type];
9616   CHECK_SYMBOL (coding_type);
9617   CODING_ATTR_TYPE (attrs) = coding_type;
9618
9619   charset_list = args[coding_arg_charset_list];
9620   if (SYMBOLP (charset_list))
9621     {
9622       if (EQ (charset_list, Qiso_2022))
9623         {
9624           if (! EQ (coding_type, Qiso_2022))
9625             error ("Invalid charset-list");
9626           charset_list = Viso_2022_charset_list;
9627         }
9628       else if (EQ (charset_list, Qemacs_mule))
9629         {
9630           if (! EQ (coding_type, Qemacs_mule))
9631             error ("Invalid charset-list");
9632           charset_list = Vemacs_mule_charset_list;
9633         }
9634       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9635         {
9636           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9637             error ("Invalid charset-list");
9638           if (max_charset_id < XFASTINT (XCAR (tail)))
9639             max_charset_id = XFASTINT (XCAR (tail));
9640         }
9641     }
9642   else
9643     {
9644       charset_list = Fcopy_sequence (charset_list);
9645       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9646         {
9647           struct charset *charset;
9648
9649           val = XCAR (tail);
9650           CHECK_CHARSET_GET_CHARSET (val, charset);
9651           if (EQ (coding_type, Qiso_2022)
9652               ? CHARSET_ISO_FINAL (charset) < 0
9653               : EQ (coding_type, Qemacs_mule)
9654               ? CHARSET_EMACS_MULE_ID (charset) < 0
9655               : 0)
9656             error ("Can't handle charset `%s'",
9657                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9658
9659           XSETCAR (tail, make_number (charset->id));
9660           if (max_charset_id < charset->id)
9661             max_charset_id = charset->id;
9662         }
9663     }
9664   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9665
9666   safe_charsets = make_uninit_string (max_charset_id + 1);
9667   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9668   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9669     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9670   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9671
9672   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9673
9674   val = args[coding_arg_decode_translation_table];
9675   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9676     CHECK_SYMBOL (val);
9677   CODING_ATTR_DECODE_TBL (attrs) = val;
9678
9679   val = args[coding_arg_encode_translation_table];
9680   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9681     CHECK_SYMBOL (val);
9682   CODING_ATTR_ENCODE_TBL (attrs) = val;
9683
9684   val = args[coding_arg_post_read_conversion];
9685   CHECK_SYMBOL (val);
9686   CODING_ATTR_POST_READ (attrs) = val;
9687
9688   val = args[coding_arg_pre_write_conversion];
9689   CHECK_SYMBOL (val);
9690   CODING_ATTR_PRE_WRITE (attrs) = val;
9691
9692   val = args[coding_arg_default_char];
9693   if (NILP (val))
9694     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9695   else
9696     {
9697       CHECK_CHARACTER (val);
9698       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9699     }
9700
9701   val = args[coding_arg_for_unibyte];
9702   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9703
9704   val = args[coding_arg_plist];
9705   CHECK_LIST (val);
9706   CODING_ATTR_PLIST (attrs) = val;
9707
9708   if (EQ (coding_type, Qcharset))
9709     {
9710       /* Generate a lisp vector of 256 elements.  Each element is nil,
9711          integer, or a list of charset IDs.
9712
9713          If Nth element is nil, the byte code N is invalid in this
9714          coding system.
9715
9716          If Nth element is a number NUM, N is the first byte of a
9717          charset whose ID is NUM.
9718
9719          If Nth element is a list of charset IDs, N is the first byte
9720          of one of them.  The list is sorted by dimensions of the
9721          charsets.  A charset of smaller dimension comes first. */
9722       val = Fmake_vector (make_number (256), Qnil);
9723
9724       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9725         {
9726           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9727           int dim = CHARSET_DIMENSION (charset);
9728           int idx = (dim - 1) * 4;
9729
9730           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9731             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9732
9733           for (i = charset->code_space[idx];
9734                i <= charset->code_space[idx + 1]; i++)
9735             {
9736               Lisp_Object tmp, tmp2;
9737               int dim2;
9738
9739               tmp = AREF (val, i);
9740               if (NILP (tmp))
9741                 tmp = XCAR (tail);
9742               else if (NUMBERP (tmp))
9743                 {
9744                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9745                   if (dim < dim2)
9746                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9747                   else
9748                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9749                 }
9750               else
9751                 {
9752                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9753                     {
9754                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9755                       if (dim < dim2)
9756                         break;
9757                     }
9758                   if (NILP (tmp2))
9759                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9760                   else
9761                     {
9762                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9763                       XSETCAR (tmp2, XCAR (tail));
9764                     }
9765                 }
9766               ASET (val, i, tmp);
9767             }
9768         }
9769       ASET (attrs, coding_attr_charset_valids, val);
9770       category = coding_category_charset;
9771     }
9772   else if (EQ (coding_type, Qccl))
9773     {
9774       Lisp_Object valids;
9775
9776       if (nargs < coding_arg_ccl_max)
9777         goto short_args;
9778
9779       val = args[coding_arg_ccl_decoder];
9780       CHECK_CCL_PROGRAM (val);
9781       if (VECTORP (val))
9782         val = Fcopy_sequence (val);
9783       ASET (attrs, coding_attr_ccl_decoder, val);
9784
9785       val = args[coding_arg_ccl_encoder];
9786       CHECK_CCL_PROGRAM (val);
9787       if (VECTORP (val))
9788         val = Fcopy_sequence (val);
9789       ASET (attrs, coding_attr_ccl_encoder, val);
9790
9791       val = args[coding_arg_ccl_valids];
9792       valids = Fmake_string (make_number (256), make_number (0));
9793       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9794         {
9795           int from, to;
9796
9797           val = Fcar (tail);
9798           if (INTEGERP (val))
9799             {
9800               if (! (0 <= XINT (val) && XINT (val) <= 255))
9801                 args_out_of_range_3 (val, make_number (0), make_number (255));
9802               from = to = XINT (val);
9803             }
9804           else
9805             {
9806               CHECK_CONS (val);
9807               CHECK_NATNUM_CAR (val);
9808               CHECK_NUMBER_CDR (val);
9809               if (XINT (XCAR (val)) > 255)
9810                 args_out_of_range_3 (XCAR (val),
9811                                      make_number (0), make_number (255));
9812               from = XINT (XCAR (val));
9813               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9814                 args_out_of_range_3 (XCDR (val),
9815                                      XCAR (val), make_number (255));
9816               to = XINT (XCDR (val));
9817             }
9818           for (i = from; i <= to; i++)
9819             SSET (valids, i, 1);
9820         }
9821       ASET (attrs, coding_attr_ccl_valids, valids);
9822
9823       category = coding_category_ccl;
9824     }
9825   else if (EQ (coding_type, Qutf_16))
9826     {
9827       Lisp_Object bom, endian;
9828
9829       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9830
9831       if (nargs < coding_arg_utf16_max)
9832         goto short_args;
9833
9834       bom = args[coding_arg_utf16_bom];
9835       if (! NILP (bom) && ! EQ (bom, Qt))
9836         {
9837           CHECK_CONS (bom);
9838           val = XCAR (bom);
9839           CHECK_CODING_SYSTEM (val);
9840           val = XCDR (bom);
9841           CHECK_CODING_SYSTEM (val);
9842         }
9843       ASET (attrs, coding_attr_utf_bom, bom);
9844
9845       endian = args[coding_arg_utf16_endian];
9846       CHECK_SYMBOL (endian);
9847       if (NILP (endian))
9848         endian = Qbig;
9849       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9850         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9851       ASET (attrs, coding_attr_utf_16_endian, endian);
9852
9853       category = (CONSP (bom)
9854                   ? coding_category_utf_16_auto
9855                   : NILP (bom)
9856                   ? (EQ (endian, Qbig)
9857                      ? coding_category_utf_16_be_nosig
9858                      : coding_category_utf_16_le_nosig)
9859                   : (EQ (endian, Qbig)
9860                      ? coding_category_utf_16_be
9861                      : coding_category_utf_16_le));
9862     }
9863   else if (EQ (coding_type, Qiso_2022))
9864     {
9865       Lisp_Object initial, reg_usage, request, flags;
9866
9867       if (nargs < coding_arg_iso2022_max)
9868         goto short_args;
9869
9870       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9871       CHECK_VECTOR (initial);
9872       for (i = 0; i < 4; i++)
9873         {
9874           val = Faref (initial, make_number (i));
9875           if (! NILP (val))
9876             {
9877               struct charset *charset;
9878
9879               CHECK_CHARSET_GET_CHARSET (val, charset);
9880               ASET (initial, i, make_number (CHARSET_ID (charset)));
9881               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9882                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9883             }
9884           else
9885             ASET (initial, i, make_number (-1));
9886         }
9887
9888       reg_usage = args[coding_arg_iso2022_reg_usage];
9889       CHECK_CONS (reg_usage);
9890       CHECK_NUMBER_CAR (reg_usage);
9891       CHECK_NUMBER_CDR (reg_usage);
9892
9893       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9894       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9895         {
9896           int id;
9897           Lisp_Object tmp1;
9898
9899           val = Fcar (tail);
9900           CHECK_CONS (val);
9901           tmp1 = XCAR (val);
9902           CHECK_CHARSET_GET_ID (tmp1, id);
9903           CHECK_NATNUM_CDR (val);
9904           if (XINT (XCDR (val)) >= 4)
9905             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9906           XSETCAR (val, make_number (id));
9907         }
9908
9909       flags = args[coding_arg_iso2022_flags];
9910       CHECK_NATNUM (flags);
9911       i = XINT (flags) & INT_MAX;
9912       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9913         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9914       flags = make_number (i);
9915
9916       ASET (attrs, coding_attr_iso_initial, initial);
9917       ASET (attrs, coding_attr_iso_usage, reg_usage);
9918       ASET (attrs, coding_attr_iso_request, request);
9919       ASET (attrs, coding_attr_iso_flags, flags);
9920       setup_iso_safe_charsets (attrs);
9921
9922       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9923         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9924                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9925                     ? coding_category_iso_7_else
9926                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9927                     ? coding_category_iso_7
9928                     : coding_category_iso_7_tight);
9929       else
9930         {
9931           int id = XINT (AREF (initial, 1));
9932
9933           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9934                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9935                        || id < 0)
9936                       ? coding_category_iso_8_else
9937                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9938                       ? coding_category_iso_8_1
9939                       : coding_category_iso_8_2);
9940         }
9941       if (category != coding_category_iso_8_1
9942           && category != coding_category_iso_8_2)
9943         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9944     }
9945   else if (EQ (coding_type, Qemacs_mule))
9946     {
9947       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9948         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9949       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9950       category = coding_category_emacs_mule;
9951     }
9952   else if (EQ (coding_type, Qshift_jis))
9953     {
9954
9955       struct charset *charset;
9956
9957       if (XINT (Flength (charset_list)) != 3
9958           && XINT (Flength (charset_list)) != 4)
9959         error ("There should be three or four charsets");
9960
9961       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9962       if (CHARSET_DIMENSION (charset) != 1)
9963         error ("Dimension of charset %s is not one",
9964                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9965       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9966         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9967
9968       charset_list = XCDR (charset_list);
9969       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9970       if (CHARSET_DIMENSION (charset) != 1)
9971         error ("Dimension of charset %s is not one",
9972                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9973
9974       charset_list = XCDR (charset_list);
9975       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9976       if (CHARSET_DIMENSION (charset) != 2)
9977         error ("Dimension of charset %s is not two",
9978                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9979
9980       charset_list = XCDR (charset_list);
9981       if (! NILP (charset_list))
9982         {
9983           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9984           if (CHARSET_DIMENSION (charset) != 2)
9985             error ("Dimension of charset %s is not two",
9986                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9987         }
9988
9989       category = coding_category_sjis;
9990       Vsjis_coding_system = name;
9991     }
9992   else if (EQ (coding_type, Qbig5))
9993     {
9994       struct charset *charset;
9995
9996       if (XINT (Flength (charset_list)) != 2)
9997         error ("There should be just two charsets");
9998
9999       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10000       if (CHARSET_DIMENSION (charset) != 1)
10001         error ("Dimension of charset %s is not one",
10002                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10003       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10004         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10005
10006       charset_list = XCDR (charset_list);
10007       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10008       if (CHARSET_DIMENSION (charset) != 2)
10009         error ("Dimension of charset %s is not two",
10010                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10011
10012       category = coding_category_big5;
10013       Vbig5_coding_system = name;
10014     }
10015   else if (EQ (coding_type, Qraw_text))
10016     {
10017       category = coding_category_raw_text;
10018       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10019     }
10020   else if (EQ (coding_type, Qutf_8))
10021     {
10022       Lisp_Object bom;
10023
10024       if (nargs < coding_arg_utf8_max)
10025         goto short_args;
10026
10027       bom = args[coding_arg_utf8_bom];
10028       if (! NILP (bom) && ! EQ (bom, Qt))
10029         {
10030           CHECK_CONS (bom);
10031           val = XCAR (bom);
10032           CHECK_CODING_SYSTEM (val);
10033           val = XCDR (bom);
10034           CHECK_CODING_SYSTEM (val);
10035         }
10036       ASET (attrs, coding_attr_utf_bom, bom);
10037       if (NILP (bom))
10038         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10039
10040       category = (CONSP (bom) ? coding_category_utf_8_auto
10041                   : NILP (bom) ? coding_category_utf_8_nosig
10042                   : coding_category_utf_8_sig);
10043     }
10044   else if (EQ (coding_type, Qundecided))
10045     category = coding_category_undecided;
10046   else
10047     error ("Invalid coding system type: %s",
10048            SDATA (SYMBOL_NAME (coding_type)));
10049
10050   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10051   CODING_ATTR_PLIST (attrs)
10052     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10053                                 CODING_ATTR_PLIST (attrs)));
10054   CODING_ATTR_PLIST (attrs)
10055     = Fcons (QCascii_compatible_p,
10056              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10057                     CODING_ATTR_PLIST (attrs)));
10058
10059   eol_type = args[coding_arg_eol_type];
10060   if (! NILP (eol_type)
10061       && ! EQ (eol_type, Qunix)
10062       && ! EQ (eol_type, Qdos)
10063       && ! EQ (eol_type, Qmac))
10064     error ("Invalid eol-type");
10065
10066   aliases = Fcons (name, Qnil);
10067
10068   if (NILP (eol_type))
10069     {
10070       eol_type = make_subsidiaries (name);
10071       for (i = 0; i < 3; i++)
10072         {
10073           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10074
10075           this_name = AREF (eol_type, i);
10076           this_aliases = Fcons (this_name, Qnil);
10077           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10078           this_spec = Fmake_vector (make_number (3), attrs);
10079           ASET (this_spec, 1, this_aliases);
10080           ASET (this_spec, 2, this_eol_type);
10081           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10082           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10083           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10084           if (NILP (val))
10085             Vcoding_system_alist
10086               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10087                        Vcoding_system_alist);
10088         }
10089     }
10090
10091   spec_vec = Fmake_vector (make_number (3), attrs);
10092   ASET (spec_vec, 1, aliases);
10093   ASET (spec_vec, 2, eol_type);
10094
10095   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10096   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10097   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10098   if (NILP (val))
10099     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10100                                   Vcoding_system_alist);
10101
10102   {
10103     int id = coding_categories[category].id;
10104
10105     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10106       setup_coding_system (name, &coding_categories[category]);
10107   }
10108
10109   return Qnil;
10110
10111  short_args:
10112   return Fsignal (Qwrong_number_of_arguments,
10113                   Fcons (intern ("define-coding-system-internal"),
10114                          make_number (nargs)));
10115 }
10116
10117
10118 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10119        3, 3, 0,
10120        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10121   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10122 {
10123   Lisp_Object spec, attrs;
10124
10125   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10126   attrs = AREF (spec, 0);
10127   if (EQ (prop, QCmnemonic))
10128     {
10129       if (! STRINGP (val))
10130         CHECK_CHARACTER (val);
10131       CODING_ATTR_MNEMONIC (attrs) = val;
10132     }
10133   else if (EQ (prop, QCdefault_char))
10134     {
10135       if (NILP (val))
10136         val = make_number (' ');
10137       else
10138         CHECK_CHARACTER (val);
10139       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10140     }
10141   else if (EQ (prop, QCdecode_translation_table))
10142     {
10143       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10144         CHECK_SYMBOL (val);
10145       CODING_ATTR_DECODE_TBL (attrs) = val;
10146     }
10147   else if (EQ (prop, QCencode_translation_table))
10148     {
10149       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10150         CHECK_SYMBOL (val);
10151       CODING_ATTR_ENCODE_TBL (attrs) = val;
10152     }
10153   else if (EQ (prop, QCpost_read_conversion))
10154     {
10155       CHECK_SYMBOL (val);
10156       CODING_ATTR_POST_READ (attrs) = val;
10157     }
10158   else if (EQ (prop, QCpre_write_conversion))
10159     {
10160       CHECK_SYMBOL (val);
10161       CODING_ATTR_PRE_WRITE (attrs) = val;
10162     }
10163   else if (EQ (prop, QCascii_compatible_p))
10164     {
10165       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10166     }
10167
10168   CODING_ATTR_PLIST (attrs)
10169     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10170   return val;
10171 }
10172
10173
10174 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10175        Sdefine_coding_system_alias, 2, 2, 0,
10176        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10177   (Lisp_Object alias, Lisp_Object coding_system)
10178 {
10179   Lisp_Object spec, aliases, eol_type, val;
10180
10181   CHECK_SYMBOL (alias);
10182   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10183   aliases = AREF (spec, 1);
10184   /* ALIASES should be a list of length more than zero, and the first
10185      element is a base coding system.  Append ALIAS at the tail of the
10186      list.  */
10187   while (!NILP (XCDR (aliases)))
10188     aliases = XCDR (aliases);
10189   XSETCDR (aliases, Fcons (alias, Qnil));
10190
10191   eol_type = AREF (spec, 2);
10192   if (VECTORP (eol_type))
10193     {
10194       Lisp_Object subsidiaries;
10195       int i;
10196
10197       subsidiaries = make_subsidiaries (alias);
10198       for (i = 0; i < 3; i++)
10199         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10200                                      AREF (eol_type, i));
10201     }
10202
10203   Fputhash (alias, spec, Vcoding_system_hash_table);
10204   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10205   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10206   if (NILP (val))
10207     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10208                                   Vcoding_system_alist);
10209
10210   return Qnil;
10211 }
10212
10213 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10214        1, 1, 0,
10215        doc: /* Return the base of CODING-SYSTEM.
10216 Any alias or subsidiary coding system is not a base coding system.  */)
10217   (Lisp_Object coding_system)
10218 {
10219   Lisp_Object spec, attrs;
10220
10221   if (NILP (coding_system))
10222     return (Qno_conversion);
10223   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10224   attrs = AREF (spec, 0);
10225   return CODING_ATTR_BASE_NAME (attrs);
10226 }
10227
10228 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10229        1, 1, 0,
10230        doc: "Return the property list of CODING-SYSTEM.")
10231   (Lisp_Object coding_system)
10232 {
10233   Lisp_Object spec, attrs;
10234
10235   if (NILP (coding_system))
10236     coding_system = Qno_conversion;
10237   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10238   attrs = AREF (spec, 0);
10239   return CODING_ATTR_PLIST (attrs);
10240 }
10241
10242
10243 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10244        1, 1, 0,
10245        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10246   (Lisp_Object coding_system)
10247 {
10248   Lisp_Object spec;
10249
10250   if (NILP (coding_system))
10251     coding_system = Qno_conversion;
10252   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10253   return AREF (spec, 1);
10254 }
10255
10256 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10257        Scoding_system_eol_type, 1, 1, 0,
10258        doc: /* Return eol-type of CODING-SYSTEM.
10259 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10260
10261 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10262 and CR respectively.
10263
10264 A vector value indicates that a format of end-of-line should be
10265 detected automatically.  Nth element of the vector is the subsidiary
10266 coding system whose eol-type is N.  */)
10267   (Lisp_Object coding_system)
10268 {
10269   Lisp_Object spec, eol_type;
10270   int n;
10271
10272   if (NILP (coding_system))
10273     coding_system = Qno_conversion;
10274   if (! CODING_SYSTEM_P (coding_system))
10275     return Qnil;
10276   spec = CODING_SYSTEM_SPEC (coding_system);
10277   eol_type = AREF (spec, 2);
10278   if (VECTORP (eol_type))
10279     return Fcopy_sequence (eol_type);
10280   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10281   return make_number (n);
10282 }
10283
10284 #endif /* emacs */
10285
10286 \f
10287 /*** 12. Post-amble ***/
10288
10289 void
10290 init_coding_once (void)
10291 {
10292   int i;
10293
10294   for (i = 0; i < coding_category_max; i++)
10295     {
10296       coding_categories[i].id = -1;
10297       coding_priorities[i] = i;
10298     }
10299
10300   /* ISO2022 specific initialize routine.  */
10301   for (i = 0; i < 0x20; i++)
10302     iso_code_class[i] = ISO_control_0;
10303   for (i = 0x21; i < 0x7F; i++)
10304     iso_code_class[i] = ISO_graphic_plane_0;
10305   for (i = 0x80; i < 0xA0; i++)
10306     iso_code_class[i] = ISO_control_1;
10307   for (i = 0xA1; i < 0xFF; i++)
10308     iso_code_class[i] = ISO_graphic_plane_1;
10309   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10310   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10311   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10312   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10313   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10314   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10315   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10316   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10317   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10318
10319   for (i = 0; i < 256; i++)
10320     {
10321       emacs_mule_bytes[i] = 1;
10322     }
10323   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10324   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10325   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10326   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10327 }
10328
10329 #ifdef emacs
10330
10331 void
10332 syms_of_coding (void)
10333 {
10334   staticpro (&Vcoding_system_hash_table);
10335   {
10336     Lisp_Object args[2];
10337     args[0] = QCtest;
10338     args[1] = Qeq;
10339     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10340   }
10341
10342   staticpro (&Vsjis_coding_system);
10343   Vsjis_coding_system = Qnil;
10344
10345   staticpro (&Vbig5_coding_system);
10346   Vbig5_coding_system = Qnil;
10347
10348   staticpro (&Vcode_conversion_reused_workbuf);
10349   Vcode_conversion_reused_workbuf = Qnil;
10350
10351   staticpro (&Vcode_conversion_workbuf_name);
10352   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10353
10354   reused_workbuf_in_use = 0;
10355
10356   DEFSYM (Qcharset, "charset");
10357   DEFSYM (Qtarget_idx, "target-idx");
10358   DEFSYM (Qcoding_system_history, "coding-system-history");
10359   Fset (Qcoding_system_history, Qnil);
10360
10361   /* Target FILENAME is the first argument.  */
10362   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10363   /* Target FILENAME is the third argument.  */
10364   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10365
10366   DEFSYM (Qcall_process, "call-process");
10367   /* Target PROGRAM is the first argument.  */
10368   Fput (Qcall_process, Qtarget_idx, make_number (0));
10369
10370   DEFSYM (Qcall_process_region, "call-process-region");
10371   /* Target PROGRAM is the third argument.  */
10372   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10373
10374   DEFSYM (Qstart_process, "start-process");
10375   /* Target PROGRAM is the third argument.  */
10376   Fput (Qstart_process, Qtarget_idx, make_number (2));
10377
10378   DEFSYM (Qopen_network_stream, "open-network-stream");
10379   /* Target SERVICE is the fourth argument.  */
10380   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10381
10382   DEFSYM (Qcoding_system, "coding-system");
10383   DEFSYM (Qcoding_aliases, "coding-aliases");
10384
10385   DEFSYM (Qeol_type, "eol-type");
10386   DEFSYM (Qunix, "unix");
10387   DEFSYM (Qdos, "dos");
10388
10389   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10390   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10391   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10392   DEFSYM (Qdefault_char, "default-char");
10393   DEFSYM (Qundecided, "undecided");
10394   DEFSYM (Qno_conversion, "no-conversion");
10395   DEFSYM (Qraw_text, "raw-text");
10396
10397   DEFSYM (Qiso_2022, "iso-2022");
10398
10399   DEFSYM (Qutf_8, "utf-8");
10400   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10401
10402   DEFSYM (Qutf_16, "utf-16");
10403   DEFSYM (Qbig, "big");
10404   DEFSYM (Qlittle, "little");
10405
10406   DEFSYM (Qshift_jis, "shift-jis");
10407   DEFSYM (Qbig5, "big5");
10408
10409   DEFSYM (Qcoding_system_p, "coding-system-p");
10410
10411   DEFSYM (Qcoding_system_error, "coding-system-error");
10412   Fput (Qcoding_system_error, Qerror_conditions,
10413         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10414   Fput (Qcoding_system_error, Qerror_message,
10415         make_pure_c_string ("Invalid coding system"));
10416
10417   /* Intern this now in case it isn't already done.
10418      Setting this variable twice is harmless.
10419      But don't staticpro it here--that is done in alloc.c.  */
10420   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10421
10422   DEFSYM (Qtranslation_table, "translation-table");
10423   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10424   DEFSYM (Qtranslation_table_id, "translation-table-id");
10425   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10426   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10427
10428   DEFSYM (Qvalid_codes, "valid-codes");
10429
10430   DEFSYM (Qemacs_mule, "emacs-mule");
10431
10432   DEFSYM (QCcategory, ":category");
10433   DEFSYM (QCmnemonic, ":mnemonic");
10434   DEFSYM (QCdefault_char, ":default-char");
10435   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10436   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10437   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10438   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10439   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10440
10441   Vcoding_category_table
10442     = Fmake_vector (make_number (coding_category_max), Qnil);
10443   staticpro (&Vcoding_category_table);
10444   /* Followings are target of code detection.  */
10445   ASET (Vcoding_category_table, coding_category_iso_7,
10446         intern_c_string ("coding-category-iso-7"));
10447   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10448         intern_c_string ("coding-category-iso-7-tight"));
10449   ASET (Vcoding_category_table, coding_category_iso_8_1,
10450         intern_c_string ("coding-category-iso-8-1"));
10451   ASET (Vcoding_category_table, coding_category_iso_8_2,
10452         intern_c_string ("coding-category-iso-8-2"));
10453   ASET (Vcoding_category_table, coding_category_iso_7_else,
10454         intern_c_string ("coding-category-iso-7-else"));
10455   ASET (Vcoding_category_table, coding_category_iso_8_else,
10456         intern_c_string ("coding-category-iso-8-else"));
10457   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10458         intern_c_string ("coding-category-utf-8-auto"));
10459   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10460         intern_c_string ("coding-category-utf-8"));
10461   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10462         intern_c_string ("coding-category-utf-8-sig"));
10463   ASET (Vcoding_category_table, coding_category_utf_16_be,
10464         intern_c_string ("coding-category-utf-16-be"));
10465   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10466         intern_c_string ("coding-category-utf-16-auto"));
10467   ASET (Vcoding_category_table, coding_category_utf_16_le,
10468         intern_c_string ("coding-category-utf-16-le"));
10469   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10470         intern_c_string ("coding-category-utf-16-be-nosig"));
10471   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10472         intern_c_string ("coding-category-utf-16-le-nosig"));
10473   ASET (Vcoding_category_table, coding_category_charset,
10474         intern_c_string ("coding-category-charset"));
10475   ASET (Vcoding_category_table, coding_category_sjis,
10476         intern_c_string ("coding-category-sjis"));
10477   ASET (Vcoding_category_table, coding_category_big5,
10478         intern_c_string ("coding-category-big5"));
10479   ASET (Vcoding_category_table, coding_category_ccl,
10480         intern_c_string ("coding-category-ccl"));
10481   ASET (Vcoding_category_table, coding_category_emacs_mule,
10482         intern_c_string ("coding-category-emacs-mule"));
10483   /* Followings are NOT target of code detection.  */
10484   ASET (Vcoding_category_table, coding_category_raw_text,
10485         intern_c_string ("coding-category-raw-text"));
10486   ASET (Vcoding_category_table, coding_category_undecided,
10487         intern_c_string ("coding-category-undecided"));
10488
10489   DEFSYM (Qinsufficient_source, "insufficient-source");
10490   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10491   DEFSYM (Qinvalid_source, "invalid-source");
10492   DEFSYM (Qinterrupted, "interrupted");
10493   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10494   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10495
10496   defsubr (&Scoding_system_p);
10497   defsubr (&Sread_coding_system);
10498   defsubr (&Sread_non_nil_coding_system);
10499   defsubr (&Scheck_coding_system);
10500   defsubr (&Sdetect_coding_region);
10501   defsubr (&Sdetect_coding_string);
10502   defsubr (&Sfind_coding_systems_region_internal);
10503   defsubr (&Sunencodable_char_position);
10504   defsubr (&Scheck_coding_systems_region);
10505   defsubr (&Sdecode_coding_region);
10506   defsubr (&Sencode_coding_region);
10507   defsubr (&Sdecode_coding_string);
10508   defsubr (&Sencode_coding_string);
10509   defsubr (&Sdecode_sjis_char);
10510   defsubr (&Sencode_sjis_char);
10511   defsubr (&Sdecode_big5_char);
10512   defsubr (&Sencode_big5_char);
10513   defsubr (&Sset_terminal_coding_system_internal);
10514   defsubr (&Sset_safe_terminal_coding_system_internal);
10515   defsubr (&Sterminal_coding_system);
10516   defsubr (&Sset_keyboard_coding_system_internal);
10517   defsubr (&Skeyboard_coding_system);
10518   defsubr (&Sfind_operation_coding_system);
10519   defsubr (&Sset_coding_system_priority);
10520   defsubr (&Sdefine_coding_system_internal);
10521   defsubr (&Sdefine_coding_system_alias);
10522   defsubr (&Scoding_system_put);
10523   defsubr (&Scoding_system_base);
10524   defsubr (&Scoding_system_plist);
10525   defsubr (&Scoding_system_aliases);
10526   defsubr (&Scoding_system_eol_type);
10527   defsubr (&Scoding_system_priority_list);
10528
10529   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10530                doc: /* List of coding systems.
10531
10532 Do not alter the value of this variable manually.  This variable should be
10533 updated by the functions `define-coding-system' and
10534 `define-coding-system-alias'.  */);
10535   Vcoding_system_list = Qnil;
10536
10537   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10538                doc: /* Alist of coding system names.
10539 Each element is one element list of coding system name.
10540 This variable is given to `completing-read' as COLLECTION argument.
10541
10542 Do not alter the value of this variable manually.  This variable should be
10543 updated by the functions `make-coding-system' and
10544 `define-coding-system-alias'.  */);
10545   Vcoding_system_alist = Qnil;
10546
10547   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10548                doc: /* List of coding-categories (symbols) ordered by priority.
10549
10550 On detecting a coding system, Emacs tries code detection algorithms
10551 associated with each coding-category one by one in this order.  When
10552 one algorithm agrees with a byte sequence of source text, the coding
10553 system bound to the corresponding coding-category is selected.
10554
10555 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10556   {
10557     int i;
10558
10559     Vcoding_category_list = Qnil;
10560     for (i = coding_category_max - 1; i >= 0; i--)
10561       Vcoding_category_list
10562         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10563                  Vcoding_category_list);
10564   }
10565
10566   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10567                doc: /* Specify the coding system for read operations.
10568 It is useful to bind this variable with `let', but do not set it globally.
10569 If the value is a coding system, it is used for decoding on read operation.
10570 If not, an appropriate element is used from one of the coding system alists.
10571 There are three such tables: `file-coding-system-alist',
10572 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10573   Vcoding_system_for_read = Qnil;
10574
10575   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10576                doc: /* Specify the coding system for write operations.
10577 Programs bind this variable with `let', but you should not set it globally.
10578 If the value is a coding system, it is used for encoding of output,
10579 when writing it to a file and when sending it to a file or subprocess.
10580
10581 If this does not specify a coding system, an appropriate element
10582 is used from one of the coding system alists.
10583 There are three such tables: `file-coding-system-alist',
10584 `process-coding-system-alist', and `network-coding-system-alist'.
10585 For output to files, if the above procedure does not specify a coding system,
10586 the value of `buffer-file-coding-system' is used.  */);
10587   Vcoding_system_for_write = Qnil;
10588
10589   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10590                doc: /*
10591 Coding system used in the latest file or process I/O.  */);
10592   Vlast_coding_system_used = Qnil;
10593
10594   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10595                doc: /*
10596 Error status of the last code conversion.
10597
10598 When an error was detected in the last code conversion, this variable
10599 is set to one of the following symbols.
10600   `insufficient-source'
10601   `inconsistent-eol'
10602   `invalid-source'
10603   `interrupted'
10604   `insufficient-memory'
10605 When no error was detected, the value doesn't change.  So, to check
10606 the error status of a code conversion by this variable, you must
10607 explicitly set this variable to nil before performing code
10608 conversion.  */);
10609   Vlast_code_conversion_error = Qnil;
10610
10611   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10612                doc: /*
10613 *Non-nil means always inhibit code conversion of end-of-line format.
10614 See info node `Coding Systems' and info node `Text and Binary' concerning
10615 such conversion.  */);
10616   inhibit_eol_conversion = 0;
10617
10618   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10619                doc: /*
10620 Non-nil means process buffer inherits coding system of process output.
10621 Bind it to t if the process output is to be treated as if it were a file
10622 read from some filesystem.  */);
10623   inherit_process_coding_system = 0;
10624
10625   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10626                doc: /*
10627 Alist to decide a coding system to use for a file I/O operation.
10628 The format is ((PATTERN . VAL) ...),
10629 where PATTERN is a regular expression matching a file name,
10630 VAL is a coding system, a cons of coding systems, or a function symbol.
10631 If VAL is a coding system, it is used for both decoding and encoding
10632 the file contents.
10633 If VAL is a cons of coding systems, the car part is used for decoding,
10634 and the cdr part is used for encoding.
10635 If VAL is a function symbol, the function must return a coding system
10636 or a cons of coding systems which are used as above.  The function is
10637 called with an argument that is a list of the arguments with which
10638 `find-operation-coding-system' was called.  If the function can't decide
10639 a coding system, it can return `undecided' so that the normal
10640 code-detection is performed.
10641
10642 See also the function `find-operation-coding-system'
10643 and the variable `auto-coding-alist'.  */);
10644   Vfile_coding_system_alist = Qnil;
10645
10646   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10647                doc: /*
10648 Alist to decide a coding system to use for a process I/O operation.
10649 The format is ((PATTERN . VAL) ...),
10650 where PATTERN is a regular expression matching a program name,
10651 VAL is a coding system, a cons of coding systems, or a function symbol.
10652 If VAL is a coding system, it is used for both decoding what received
10653 from the program and encoding what sent to the program.
10654 If VAL is a cons of coding systems, the car part is used for decoding,
10655 and the cdr part is used for encoding.
10656 If VAL is a function symbol, the function must return a coding system
10657 or a cons of coding systems which are used as above.
10658
10659 See also the function `find-operation-coding-system'.  */);
10660   Vprocess_coding_system_alist = Qnil;
10661
10662   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10663                doc: /*
10664 Alist to decide a coding system to use for a network I/O operation.
10665 The format is ((PATTERN . VAL) ...),
10666 where PATTERN is a regular expression matching a network service name
10667 or is a port number to connect to,
10668 VAL is a coding system, a cons of coding systems, or a function symbol.
10669 If VAL is a coding system, it is used for both decoding what received
10670 from the network stream and encoding what sent to the network stream.
10671 If VAL is a cons of coding systems, the car part is used for decoding,
10672 and the cdr part is used for encoding.
10673 If VAL is a function symbol, the function must return a coding system
10674 or a cons of coding systems which are used as above.
10675
10676 See also the function `find-operation-coding-system'.  */);
10677   Vnetwork_coding_system_alist = Qnil;
10678
10679   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10680                doc: /* Coding system to use with system messages.
10681 Also used for decoding keyboard input on X Window system.  */);
10682   Vlocale_coding_system = Qnil;
10683
10684   /* The eol mnemonics are reset in startup.el system-dependently.  */
10685   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10686                doc: /*
10687 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10688   eol_mnemonic_unix = make_pure_c_string (":");
10689
10690   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10691                doc: /*
10692 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10693   eol_mnemonic_dos = make_pure_c_string ("\\");
10694
10695   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10696                doc: /*
10697 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10698   eol_mnemonic_mac = make_pure_c_string ("/");
10699
10700   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10701                doc: /*
10702 *String displayed in mode line when end-of-line format is not yet determined.  */);
10703   eol_mnemonic_undecided = make_pure_c_string (":");
10704
10705   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10706                doc: /*
10707 *Non-nil enables character translation while encoding and decoding.  */);
10708   Venable_character_translation = Qt;
10709
10710   DEFVAR_LISP ("standard-translation-table-for-decode",
10711                Vstandard_translation_table_for_decode,
10712                doc: /* Table for translating characters while decoding.  */);
10713   Vstandard_translation_table_for_decode = Qnil;
10714
10715   DEFVAR_LISP ("standard-translation-table-for-encode",
10716                Vstandard_translation_table_for_encode,
10717                doc: /* Table for translating characters while encoding.  */);
10718   Vstandard_translation_table_for_encode = Qnil;
10719
10720   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10721                doc: /* Alist of charsets vs revision numbers.
10722 While encoding, if a charset (car part of an element) is found,
10723 designate it with the escape sequence identifying revision (cdr part
10724 of the element).  */);
10725   Vcharset_revision_table = Qnil;
10726
10727   DEFVAR_LISP ("default-process-coding-system",
10728                Vdefault_process_coding_system,
10729                doc: /* Cons of coding systems used for process I/O by default.
10730 The car part is used for decoding a process output,
10731 the cdr part is used for encoding a text to be sent to a process.  */);
10732   Vdefault_process_coding_system = Qnil;
10733
10734   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10735                doc: /*
10736 Table of extra Latin codes in the range 128..159 (inclusive).
10737 This is a vector of length 256.
10738 If Nth element is non-nil, the existence of code N in a file
10739 \(or output of subprocess) doesn't prevent it to be detected as
10740 a coding system of ISO 2022 variant which has a flag
10741 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10742 or reading output of a subprocess.
10743 Only 128th through 159th elements have a meaning.  */);
10744   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10745
10746   DEFVAR_LISP ("select-safe-coding-system-function",
10747                Vselect_safe_coding_system_function,
10748                doc: /*
10749 Function to call to select safe coding system for encoding a text.
10750
10751 If set, this function is called to force a user to select a proper
10752 coding system which can encode the text in the case that a default
10753 coding system used in each operation can't encode the text.  The
10754 function should take care that the buffer is not modified while
10755 the coding system is being selected.
10756
10757 The default value is `select-safe-coding-system' (which see).  */);
10758   Vselect_safe_coding_system_function = Qnil;
10759
10760   DEFVAR_BOOL ("coding-system-require-warning",
10761                coding_system_require_warning,
10762                doc: /* Internal use only.
10763 If non-nil, on writing a file, `select-safe-coding-system-function' is
10764 called even if `coding-system-for-write' is non-nil.  The command
10765 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10766   coding_system_require_warning = 0;
10767
10768
10769   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10770                inhibit_iso_escape_detection,
10771                doc: /*
10772 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10773
10774 When Emacs reads text, it tries to detect how the text is encoded.
10775 This code detection is sensitive to escape sequences.  If Emacs sees
10776 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10777 of the ISO2022 encodings, and decodes text by the corresponding coding
10778 system (e.g. `iso-2022-7bit').
10779
10780 However, there may be a case that you want to read escape sequences in
10781 a file as is.  In such a case, you can set this variable to non-nil.
10782 Then the code detection will ignore any escape sequences, and no text is
10783 detected as encoded in some ISO-2022 encoding.  The result is that all
10784 escape sequences become visible in a buffer.
10785
10786 The default value is nil, and it is strongly recommended not to change
10787 it.  That is because many Emacs Lisp source files that contain
10788 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10789 in Emacs's distribution, and they won't be decoded correctly on
10790 reading if you suppress escape sequence detection.
10791
10792 The other way to read escape sequences in a file without decoding is
10793 to explicitly specify some coding system that doesn't use ISO-2022
10794 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10795   inhibit_iso_escape_detection = 0;
10796
10797   DEFVAR_BOOL ("inhibit-null-byte-detection",
10798                inhibit_null_byte_detection,
10799                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10800 By default, Emacs treats it as binary data, and does not attempt to
10801 decode it.  The effect is as if you specified `no-conversion' for
10802 reading that text.
10803
10804 Set this to non-nil when a regular text happens to include null bytes.
10805 Examples are Index nodes of Info files and null-byte delimited output
10806 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10807 decode text as usual.  */);
10808   inhibit_null_byte_detection = 0;
10809
10810   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10811                doc: /* Char table for translating self-inserting characters.
10812 This is applied to the result of input methods, not their input.
10813 See also `keyboard-translate-table'.
10814
10815 Use of this variable for character code unification was rendered
10816 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10817 internal character representation.  */);
10818     Vtranslation_table_for_input = Qnil;
10819
10820   {
10821     Lisp_Object args[coding_arg_max];
10822     Lisp_Object plist[16];
10823     int i;
10824
10825     for (i = 0; i < coding_arg_max; i++)
10826       args[i] = Qnil;
10827
10828     plist[0] = intern_c_string (":name");
10829     plist[1] = args[coding_arg_name] = Qno_conversion;
10830     plist[2] = intern_c_string (":mnemonic");
10831     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10832     plist[4] = intern_c_string (":coding-type");
10833     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10834     plist[6] = intern_c_string (":ascii-compatible-p");
10835     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10836     plist[8] = intern_c_string (":default-char");
10837     plist[9] = args[coding_arg_default_char] = make_number (0);
10838     plist[10] = intern_c_string (":for-unibyte");
10839     plist[11] = args[coding_arg_for_unibyte] = Qt;
10840     plist[12] = intern_c_string (":docstring");
10841     plist[13] = make_pure_c_string ("Do no conversion.\n\
10842 \n\
10843 When you visit a file with this coding, the file is read into a\n\
10844 unibyte buffer as is, thus each byte of a file is treated as a\n\
10845 character.");
10846     plist[14] = intern_c_string (":eol-type");
10847     plist[15] = args[coding_arg_eol_type] = Qunix;
10848     args[coding_arg_plist] = Flist (16, plist);
10849     Fdefine_coding_system_internal (coding_arg_max, args);
10850
10851     plist[1] = args[coding_arg_name] = Qundecided;
10852     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10853     plist[5] = args[coding_arg_coding_type] = Qundecided;
10854     /* This is already set.
10855        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10856     plist[8] = intern_c_string (":charset-list");
10857     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10858     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10859     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10860     plist[15] = args[coding_arg_eol_type] = Qnil;
10861     args[coding_arg_plist] = Flist (16, plist);
10862     Fdefine_coding_system_internal (coding_arg_max, args);
10863   }
10864
10865   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10866
10867   {
10868     int i;
10869
10870     for (i = 0; i < coding_category_max; i++)
10871       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10872   }
10873 #if defined (DOS_NT)
10874   system_eol_type = Qdos;
10875 #else
10876   system_eol_type = Qunix;
10877 #endif
10878   staticpro (&system_eol_type);
10879 }
10880
10881 char *
10882 emacs_strerror (int error_number)
10883 {
10884   char *str;
10885
10886   synchronize_system_messages_locale ();
10887   str = strerror (error_number);
10888
10889   if (! NILP (Vlocale_coding_system))
10890     {
10891       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10892                                                       Vlocale_coding_system,
10893                                                       0);
10894       str = SSDATA (dec);
10895     }
10896
10897   return str;
10898 }
10899
10900 #endif /* emacs */