src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   EMACS_INT consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   EMACS_INT produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 static Lisp_Object Qcoding_system, Qeol_type;
 304 static Lisp_Object Qcoding_aliases;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324
 325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 /* Format of end-of-line decided by system.  This is Qunix on
 333    Unix and Mac, Qdos on DOS/Windows.
 334    This has an effect only for external encoding (i.e. for output to
 335    file and process), not for in-buffer or Lisp string encoding.  */
 336 static Lisp_Object system_eol_type;
 337
 338 #ifdef emacs
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding system to be used to encode text for terminal display when
 350    terminal coding system is nil.  */
 351 struct coding_system safe_terminal_coding;
 352
 353 #endif /* emacs */
 354
 355 Lisp_Object Qtranslation_table;
 356 Lisp_Object Qtranslation_table_id;
 357 static Lisp_Object Qtranslation_table_for_decode;
 358 static Lisp_Object Qtranslation_table_for_encode;
 359
 360 /* Two special coding systems.  */
 361 static Lisp_Object Vsjis_coding_system;
 362 static Lisp_Object Vbig5_coding_system;
 363
 364 /* ISO2022 section */
 365
 366 #define CODING_ISO_INITIAL(coding, reg)                 \
 367   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 368                      coding_attr_iso_initial),          \
 369                reg)))
 370
 371
 372 #define CODING_ISO_REQUEST(coding, charset_id)          \
 373   (((charset_id) <= (coding)->max_charset_id            \
 374     ? ((coding)->safe_charsets[charset_id] != 255       \
 375        ? (coding)->safe_charsets[charset_id]            \
 376        : -1)                                            \
 377     : -1))
 378
 379
 380 #define CODING_ISO_FLAGS(coding)        \
 381   ((coding)->spec.iso_2022.flags)
 382 #define CODING_ISO_DESIGNATION(coding, reg)     \
 383   ((coding)->spec.iso_2022.current_designation[reg])
 384 #define CODING_ISO_INVOCATION(coding, plane)    \
 385   ((coding)->spec.iso_2022.current_invocation[plane])
 386 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 387   ((coding)->spec.iso_2022.single_shifting)
 388 #define CODING_ISO_BOL(coding)  \
 389   ((coding)->spec.iso_2022.bol)
 390 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 391   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 392 #define CODING_ISO_CMP_STATUS(coding)   \
 393   (&(coding)->spec.iso_2022.cmp_status)
 394 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 395   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 396 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 397   ((coding)->spec.iso_2022.embedded_utf_8)
 398
 399 /* Control characters of ISO2022.  */
 400                         /* code */      /* function */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
  409 /* Every 1-byte code of ISO2022 is classified into one of the
  410    following classes.  */
  411 enum iso_code_class_type
  412   {
  413     ISO_control_0,              /* Control codes in the range
  414                                    0x00..0x1F and 0x7F, except for the
  415                                    following 4 codes.  */
  416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  420     ISO_control_1,              /* Control codes in the range
  421                                    0x80..0x9F, except for the
  422                                    following 3 codes.  */
  423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  430   };
 431
  432 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
  433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overwrite
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories' */
 519
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
  569 /* This value is returned if detect_coding_mask () finds nothing other
  570    than ASCII characters.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
  652 /* Safely get one byte from the source text pointed by SRC which ends
  653    at SRC_END, and set C to that byte.  If there are not enough bytes
  654    in the source, it jumps to `no_more_source'.  If multibytep is
  655    nonzero, a 2-byte sequence led by 0xC0 or 0xC1 is merged into one
  656    value; any other multibyte character sets C to minus its code and
  657    records CODING_RESULT_INVALID_SRC.  The caller should declare and set:
  658         src, src_end, src_base, multibytep, consumed_chars, coding */
  659
  660 #define ONE_MORE_BYTE(c)                                \
  661   do {                                                  \
  662     if (src == src_end)                                 \
  663       {                                                 \
  664         if (src_base < src)                             \
  665           record_conversion_result                      \
  666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  667         goto no_more_source;                            \
  668       }                                                 \
  669     c = *src++;                                         \
  670     if (multibytep && (c & 0x80))                       \
  671       {                                                 \
  672         if ((c & 0xFE) == 0xC0)                         \
  673           c = ((c & 1) << 6) | *src++;                  \
  674         else                                            \
  675           {                                             \
  676             src--;                                      \
  677             c = - string_char (src, &src, NULL);        \
  678             record_conversion_result                    \
  679               (coding, CODING_RESULT_INVALID_SRC);      \
  680           }                                             \
  681       }                                                 \
  682     consumed_chars++;                                   \
  683   } while (0)
 684
  685 /* Safely get two bytes from the source text pointed by SRC which ends
  686    at SRC_END, and set C1 and C2 to those bytes while skipping the
  687    heading multibyte characters.  If there are not enough bytes in the
  688    source, it jumps to `no_more_source'.  If multibytep is nonzero and
  689    a multibyte character is found for C2, set C2 to -1 (the character
  690    code itself is not computed).  The caller should declare and set these
  691    variables appropriately in advance:
  692         src, src_end, multibytep
  693    It is intended that this macro is used in detect_coding_utf_16.  */
  694
  695 #define TWO_MORE_BYTES(c1, c2)                          \
  696   do {                                                  \
  697     do {                                                \
  698       if (src == src_end)                               \
  699         goto no_more_source;                            \
  700       c1 = *src++;                                      \
  701       if (multibytep && (c1 & 0x80))                    \
  702         {                                               \
  703           if ((c1 & 0xFE) == 0xC0)                      \
  704             c1 = ((c1 & 1) << 6) | *src++;              \
  705           else                                          \
  706             {                                           \
  707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  708               c1 = -1;                                  \
  709             }                                           \
  710         }                                               \
  711     } while (c1 < 0);                                   \
  712     if (src == src_end)                                 \
  713       goto no_more_source;                              \
  714     c2 = *src++;                                        \
  715     if (multibytep && (c2 & 0x80))                      \
  716       {                                                 \
  717         if ((c2 & 0xFE) == 0xC0)                        \
  718           c2 = ((c2 & 1) << 6) | *src++;                \
  719         else                                            \
  720           c2 = -1;                                      \
  721       }                                                 \
  722   } while (0)
 723
 724
  725 /* Store a byte C in the place pointed by DST and increment DST to the
  726    next free point, and increment PRODUCED_CHARS.  The caller should
  727    assure that C is 0..127, and declare and set the variables `dst'
  728    and `produced_chars' appropriately in advance.
  729 */
  730
  731
  732 #define EMIT_ONE_ASCII_BYTE(c)  \
  733   do {                          \
  734     produced_chars++;           \
  735     *dst++ = (c);               \
  736   } while (0)
 737
 738
 739 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 740
 741 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 742   do {                                  \
 743     produced_chars += 2;                \
 744     *dst++ = (c1), *dst++ = (c2);       \
 745   } while (0)
 746
 747
  748 /* Store a byte C in the place pointed by DST and increment DST to the
  749    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  750    nonzero, store in an appropriate multibyte form.  The caller should
  751    declare and set the variables `dst', `multibytep', and
  752    `produced_chars' appropriately in advance.  */
  753
  754 #define EMIT_ONE_BYTE(c)                \
  755   do {                                  \
  756     produced_chars++;                   \
  757     if (multibytep)                     \
  758       {                                 \
  759         unsigned ch = (c);              \
  760         if (ch >= 0x80)                 \
  761           ch = BYTE8_TO_CHAR (ch);      \
  762         CHAR_STRING_ADVANCE (ch, dst);  \
  763       }                                 \
  764     else                                \
  765       *dst++ = (c);                     \
  766   } while (0)
 767
 768
 769 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 770
 771 #define EMIT_TWO_BYTES(c1, c2)          \
 772   do {                                  \
 773     produced_chars += 2;                \
 774     if (multibytep)                     \
 775       {                                 \
 776         unsigned ch;                    \
 777                                         \
 778         ch = (c1);                      \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782         ch = (c2);                      \
 783         if (ch >= 0x80)                 \
 784           ch = BYTE8_TO_CHAR (ch);      \
 785         CHAR_STRING_ADVANCE (ch, dst);  \
 786       }                                 \
 787     else                                \
 788       {                                 \
 789         *dst++ = (c1);                  \
 790         *dst++ = (c2);                  \
 791       }                                 \
 792   } while (0)
 793
 794
 795 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 796   do {                                  \
 797     EMIT_ONE_BYTE (c1);                 \
 798     EMIT_TWO_BYTES (c2, c3);            \
 799   } while (0)
 800
 801
 802 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 803   do {                                          \
 804     EMIT_TWO_BYTES (c1, c2);                    \
 805     EMIT_TWO_BYTES (c3, c4);                    \
 806   } while (0)
 807
 808
/* Prototypes for static functions.  */

/* Bookkeeping of the result code of a conversion.  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);

/* Per-format detectors, decoders and encoders.  */
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        EMACS_INT, EMACS_INT);
static unsigned char *alloc_destination (struct coding_system *,
                                         EMACS_INT, unsigned char *);
/* Remaining helpers; their definitions appear later in this file.  */
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, unsigned char *);
static int detect_eol (const unsigned char *,
                       EMACS_INT, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static INLINE void produce_charset (struct coding_system *, int *,
                                    EMACS_INT);
static void produce_annotation (struct coding_system *, EMACS_INT);
static int decode_coding (struct coding_system *);
static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *);
static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
                                              struct coding_system *,
                                              int *, EMACS_INT *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static INLINE int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 883
 884 static void
 885 record_conversion_result (struct coding_system *coding,
 886                           enum coding_result_code result)
 887 {
 888   coding->result = result;
 889   switch (result)
 890     {
 891     case CODING_RESULT_INSUFFICIENT_SRC:
 892       Vlast_code_conversion_error = Qinsufficient_source;
 893       break;
 894     case CODING_RESULT_INCONSISTENT_EOL:
 895       Vlast_code_conversion_error = Qinconsistent_eol;
 896       break;
 897     case CODING_RESULT_INVALID_SRC:
 898       Vlast_code_conversion_error = Qinvalid_source;
 899       break;
 900     case CODING_RESULT_INTERRUPT:
 901       Vlast_code_conversion_error = Qinterrupted;
 902       break;
 903     case CODING_RESULT_INSUFFICIENT_MEM:
 904       Vlast_code_conversion_error = Qinsufficient_memory;
 905       break;
 906     case CODING_RESULT_INSUFFICIENT_DST:
 907       /* Don't record this error in Vlast_code_conversion_error
 908          because it happens just temporarily and is resolved when the
 909          whole conversion is finished.  */
 910       break;
 911     case CODING_RESULT_SUCCESS:
 912       break;
 913     default:
 914       Vlast_code_conversion_error = intern ("Unknown error");
 915     }
 916 }
 917
 918 /* This wrapper macro is used to preserve validity of pointers into
 919    buffer text across calls to decode_char, which could cause
 920    relocation of buffers if it loads a charset map, because loading a
 921    charset map allocates large structures.  */
 922 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 923   do {                                                                       \
 924     charset_map_loaded = 0;                                                  \
 925     c = DECODE_CHAR (charset, code);                                         \
 926     if (charset_map_loaded)                                                  \
 927       {                                                                      \
 928         const unsigned char *orig = coding->source;                          \
 929         EMACS_INT offset;                                                    \
 930                                                                              \
 931         coding_set_source (coding);                                          \
 932         offset = coding->source - orig;                                      \
 933         src += offset;                                                       \
 934         src_base += offset;                                                  \
 935         src_end += offset;                                                   \
 936       }                                                                      \
 937   } while (0)
 938
 939
 940 /* If there are at least BYTES length of room at dst, allocate memory
 941    for coding->destination and update dst and dst_end.  We don't have
 942    to take care of coding->source which will be relocated.  It is
 943    handled by calling coding_set_source in encode_coding.  */
 944
 945 #define ASSURE_DESTINATION(bytes)                               \
 946   do {                                                          \
 947     if (dst + (bytes) >= dst_end)                               \
 948       {                                                         \
 949         EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
 950                                                                 \
 951         dst = alloc_destination (coding, more_bytes, dst);      \
 952         dst_end = coding->destination + coding->dst_bytes;      \
 953       }                                                         \
 954   } while (0)
 955
 956
 957 /* Store multibyte form of the character C in P, and advance P to the
 958    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 959    never calls MAYBE_UNIFY_CHAR.  */
 960
 961 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 962   do {                                          \
 963     if ((c) <= MAX_1_BYTE_CHAR)                 \
 964       *(p)++ = (c);                             \
 965     else if ((c) <= MAX_2_BYTE_CHAR)            \
 966       *(p)++ = (0xC0 | ((c) >> 6)),             \
 967         *(p)++ = (0x80 | ((c) & 0x3F));         \
 968     else if ((c) <= MAX_3_BYTE_CHAR)            \
 969       *(p)++ = (0xE0 | ((c) >> 12)),            \
 970         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 971         *(p)++ = (0x80 | ((c) & 0x3F));         \
 972     else if ((c) <= MAX_4_BYTE_CHAR)            \
 973       *(p)++ = (0xF0 | (c >> 18)),              \
 974         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 975         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 976         *(p)++ = (0x80 | (c & 0x3F));           \
 977     else if ((c) <= MAX_5_BYTE_CHAR)            \
 978       *(p)++ = 0xF8,                            \
 979         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 980         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 981         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 982         *(p)++ = (0x80 | (c & 0x3F));           \
 983     else                                        \
 984       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 985   } while (0)
 986
 987
 988 /* Return the character code of character whose multibyte form is at
 989    P, and advance P to the end of the multibyte form.  This is like
 990    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
 991
 992 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
 993   (!((p)[0] & 0x80)                                             \
 994    ? *(p)++                                                     \
 995    : ! ((p)[0] & 0x20)                                          \
 996    ? ((p) += 2,                                                 \
 997       ((((p)[-2] & 0x1F) << 6)                                  \
 998        | ((p)[-1] & 0x3F)                                       \
 999        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1000    : ! ((p)[0] & 0x10)                                          \
1001    ? ((p) += 3,                                                 \
1002       ((((p)[-3] & 0x0F) << 12)                                 \
1003        | (((p)[-2] & 0x3F) << 6)                                \
1004        | ((p)[-1] & 0x3F)))                                     \
1005    : ! ((p)[0] & 0x08)                                          \
1006    ? ((p) += 4,                                                 \
1007       ((((p)[-4] & 0xF) << 18)                                  \
1008        | (((p)[-3] & 0x3F) << 12)                               \
1009        | (((p)[-2] & 0x3F) << 6)                                \
1010        | ((p)[-1] & 0x3F)))                                     \
1011    : ((p) += 5,                                                 \
1012       ((((p)[-4] & 0x3F) << 18)                                 \
1013        | (((p)[-3] & 0x3F) << 12)                               \
1014        | (((p)[-2] & 0x3F) << 6)                                \
1015        | ((p)[-1] & 0x3F))))
1016
1017
1018 static void
1019 coding_set_source (struct coding_system *coding)
1020 {
1021   if (BUFFERP (coding->src_object))
1022     {
1023       struct buffer *buf = XBUFFER (coding->src_object);
1024
1025       if (coding->src_pos < 0)
1026         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1027       else
1028         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1029     }
1030   else if (STRINGP (coding->src_object))
1031     {
1032       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1033     }
1034   else
1035     {
1036       /* Otherwise, the source is C string and is never relocated
1037          automatically.  Thus we don't have to update anything.  */
1038     }
1039 }
1040
1041 static void
1042 coding_set_destination (struct coding_system *coding)
1043 {
1044   if (BUFFERP (coding->dst_object))
1045     {
1046       if (coding->src_pos < 0)
1047         {
1048           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1049           coding->dst_bytes = (GAP_END_ADDR
1050                                - (coding->src_bytes - coding->consumed)
1051                                - coding->destination);
1052         }
1053       else
1054         {
1055           /* We are sure that coding->dst_pos_byte is before the gap
1056              of the buffer. */
1057           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1058                                  + coding->dst_pos_byte - BEG_BYTE);
1059           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1060                                - coding->destination);
1061         }
1062     }
1063   else
1064     {
1065       /* Otherwise, the destination is C string and is never relocated
1066          automatically.  Thus we don't have to update anything.  */
1067     }
1068 }
1069
1070
1071 static void
1072 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1073 {
1074   coding->destination = (unsigned char *) xrealloc (coding->destination,
1075                                                     coding->dst_bytes + bytes);
1076   coding->dst_bytes += bytes;
1077 }
1078
1079 static void
1080 coding_alloc_by_making_gap (struct coding_system *coding,
1081                             EMACS_INT gap_head_used, EMACS_INT bytes)
1082 {
1083   if (EQ (coding->src_object, coding->dst_object))
1084     {
1085       /* The gap may contain the produced data at the head and not-yet
1086          consumed data at the tail.  To preserve those data, we at
1087          first make the gap size to zero, then increase the gap
1088          size.  */
1089       EMACS_INT add = GAP_SIZE;
1090
1091       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1092       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1093       make_gap (bytes);
1094       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1095       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1096     }
1097   else
1098     {
1099       Lisp_Object this_buffer;
1100
1101       this_buffer = Fcurrent_buffer ();
1102       set_buffer_internal (XBUFFER (coding->dst_object));
1103       make_gap (bytes);
1104       set_buffer_internal (XBUFFER (this_buffer));
1105     }
1106 }
1107
1108
1109 static unsigned char *
1110 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1111                    unsigned char *dst)
1112 {
1113   EMACS_INT offset = dst - coding->destination;
1114
1115   if (BUFFERP (coding->dst_object))
1116     {
1117       struct buffer *buf = XBUFFER (coding->dst_object);
1118
1119       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1120     }
1121   else
1122     coding_alloc_by_realloc (coding, nbytes);
1123   coding_set_destination (coding);
1124   dst = coding->destination + offset;
1125   return dst;
1126 }
1127
1128 /** Macros for annotations.  */
1129
1130 /* An annotation data is stored in the array coding->charbuf in this
1131    format:
1132      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1133    LENGTH is the number of elements in the annotation.
1134    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1135    NCHARS is the number of characters in the text annotated.
1136
1137    The format of the following elements depend on ANNOTATION_MASK.
1138
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1141      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1142
1143    NBYTES is the number of bytes specified in the header part of
1144    old-style emacs-mule encoding, or 0 for the other kind of
1145    composition.
1146
1147    METHOD is one of enum composition_method.
1148
1149    Optional COMPOSITION-COMPONENTS are characters and composition
1150    rules.
1151
1152    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1153    follows.
1154
1155    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1156    recover from an invalid annotation, and should be skipped by
1157    produce_annotation.  */
1158
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated to 1.  A
   variable `coding' must be in scope at the point of use.

   NOTE(review): the expansion ends in `while (0);', i.e. the macro
   carries its own terminating semicolon.  A use followed by `;'
   therefore expands to two statements and cannot be the unbraced
   body of an `if' that has an `else'.  The semicolon is left in place
   here because callers outside this part of the file may rely on it
   -- confirm before removing.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0);
1169
/* Store a composition annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  BUF must be a modifiable pointer expression; it and the
   value arguments are parenthesized so that expression arguments
   (e.g. `*pp') expand correctly.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1176
1177
/* Store a charset annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF must be a modifiable pointer expression; it and ID are
   parenthesized so that expression arguments expand correctly.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1183
1184 \f
1185 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1186
1187
1188
1189 \f
1190 /*** 3. UTF-8 ***/
1191
1192 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1193    Check if a text is encoded in UTF-8.  If it is, return 1, else
1194    return 0.  */
1195
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  */
/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx: a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1206
/* Check whether the source of CODING looks like UTF-8 and record the
   verdict in DETECT_INFO.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   when a malformed sequence is met or when the last block ends in the
   middle of a sequence.  Otherwise return 1 once the source is
   exhausted, after updating DETECT_INFO->found / ->rejected for the
   with-signature and without-signature categories.

   The locals `src', `src_end', `multibytep' and `consumed_chars' are
   not all referenced directly below; presumably they are used by
   ONE_MORE_BYTE, which is defined earlier in this file.  That macro
   also appears to be what jumps to `no_more_source' when the input
   runs out, since nothing else here does.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  EMACS_INT consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, for the BOM test and for
         the truncated-sequence test at no_more_source.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values and one-octet (ASCII) values need no further
         checking.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every multi-octet sequence needs at least one trailing octet;
         each further octet is checked the same way below.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* A signature counts only at the very start of the source.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* Reached only through `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* Bytes were read past src_base, so the source ended inside a
     sequence; that is an error if this is the last block.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading EF BB BF doesn't necessarily mean a signature, so
         the without-signature category stays a candidate too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      /* Pure ASCII input yields no positive evidence for UTF-8.  */
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1289
1290
/* Decode the UTF-8 source of CODING, starting at coding->consumed,
   into character codes appended to coding->charbuf.

   Overlong forms, surrogate code points, values above MAX_CHAR and
   otherwise malformed sequences are not decoded as characters: the
   offending first byte is stored on its own (as ASCII or via
   BYTE8_TO_CHAR) and coding->errors is incremented.

   On exit coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated up to the last completely handled
   sequence.

   Several locals (`src', `src_end', `multibytep', `consumed_chars')
   are presumably referenced by ONE_MORE_BYTE, defined earlier in this
   file; that macro also appears to be what jumps to `no_more_source'
   when the input runs out.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a CR (DOS eol only), or -1 if none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip a leading signature EF BB BF.  Anything else rewinds SRC
         to SRC_BASE so that the bytes are decoded normally below.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, do not look for a signature again on later
     calls with this CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before SRC_BASE has been fully handled; these are
         the values reported as consumed at no_more_source.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been decoded yet,
             so it must not be counted as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored with its sign flipped.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS eol conversion, read the byte following a CR
             right away; it is decoded on the next iteration.
             NOTE(review): presumably this keeps a CR from being the
             last decoded element without its follower -- confirm
             against decode_eol.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Out of range, or overlong.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and emit only its
         first byte; the following bytes are decoded afresh.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1440
1441
/* Encode the character codes in coding->charbuf as UTF-8 and append
   the bytes to the destination of CODING, after what coding->produced
   says is already there.  A signature is emitted first when
   CODING_UTF_8_BOM (coding) is utf_with_bom; that setting is then
   changed to utf_without_bom so it is emitted only once.

   Characters satisfying CHAR_BYTE8_P are written as the single byte
   given by CHAR_TO_BYTE8.  The result is recorded as
   CODING_RESULT_SUCCESS, coding->produced and coding->produced_char
   are updated, and 0 is returned.

   The macros ASSURE_DESTINATION and EMIT_* use the locals `dst',
   `dst_end', `charbuf', `charbuf_end', `produced_chars' and
   `multibytep' by name.  */
static int
encode_coding_utf_8 (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced_chars = 0;
  int c;

  if (CODING_UTF_8_BOM (coding) == utf_with_bom)
    {
      ASSURE_DESTINATION (3);
      EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
      CODING_UTF_8_BOM (coding) = utf_without_bom;
    }

  if (multibytep)
    {
      /* Twice MAX_MULTIBYTE_LENGTH, since each UTF-8 byte is emitted
         through EMIT_ONE_BYTE in multibyte form.  NOTE(review): this
         presumes such a form takes at most two bytes -- confirm.  */
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      while (charbuf < charbuf_end)
        {
          unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;

          ASSURE_DESTINATION (safe_room);
          c = *charbuf++;
          if (CHAR_BYTE8_P (c))
            {
              c = CHAR_TO_BYTE8 (c);
              EMIT_ONE_BYTE (c);
            }
          else
            {
              /* Build the UTF-8 bytes in STR, then emit them one by
                 one; PRODUCED_CHARS thus grows by one per byte.  */
              CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
              for (p = str; p < pend; p++)
                EMIT_ONE_BYTE (*p);
            }
        }
    }
  else
    {
      int safe_room = MAX_MULTIBYTE_LENGTH;

      while (charbuf < charbuf_end)
        {
          ASSURE_DESTINATION (safe_room);
          c = *charbuf++;
          if (CHAR_BYTE8_P (c))
            *dst++ = CHAR_TO_BYTE8 (c);
          else
            CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
          /* Here one character is counted per encoded character, not
             per byte.  */
          produced_chars++;
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
1503
1504
1505 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1506    Check if a text is encoded in one of UTF-16 based coding systems.
1507    If it is, return 1, else return 0.  */
1508
/* Nonzero if the 16-bit value VAL lies in 0xD800..0xDBFF, the range
   of high (leading) surrogates.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL lies in 0xDC00..0xDFFF, the range
   of low (trailing) surrogates.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1514
1515
/* Check whether the source of CODING may be one of the UTF-16 based
   encodings and record the verdict in DETECT_INFO.

   Return 1 when the source starts with FF FE or FE FF (the matching
   byte order is marked as found and the others as rejected), and also
   when the source runs out while bytes are being read.  Return 0 when
   the last block has an odd coding->src_chars, when the second byte
   of a pair comes back negative, and after the byte-dispersion
   heuristic used when no signature is present.

   `src', `src_end' and `multibytep' are presumably referenced by
   TWO_MORE_BYTES, defined earlier in this file; that macro also
   appears to be what jumps to `no_more_source'.  */
static int
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A complete UTF-16 text cannot have an odd length.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big-endian signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* E and O flag the byte values seen so far at even and odd
         offsets; E_NUM and O_NUM count the distinct values.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* No signature, so only the *_NOSIG categories remain.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Scan until every UTF-16 category has been rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct even-offset bytes rules out
                 big-endian without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* Likewise for odd offsets and little-endian.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1598
1599 static void
1600 decode_coding_utf_16 (struct coding_system *coding)
1601 {
1602   const unsigned char *src = coding->source + coding->consumed;
1603   const unsigned char *src_end = coding->source + coding->src_bytes;
1604   const unsigned char *src_base;
1605   int *charbuf = coding->charbuf + coding->charbuf_used;
1606   /* We may produces at most 3 chars in one loop.  */
1607   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1608   EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
1609   int multibytep = coding->src_multibyte;
1610   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1611   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1612   int surrogate = CODING_UTF_16_SURROGATE (coding);
1613   int eol_dos =
1614     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1615   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1616
1617   if (bom == utf_with_bom)
1618     {
1619       int c, c1, c2;
1620
1621       src_base = src;
1622       ONE_MORE_BYTE (c1);
1623       ONE_MORE_BYTE (c2);
1624       c = (c1 << 8) | c2;
1625
1626       if (endian == utf_16_big_endian
1627           ? c != 0xFEFF : c != 0xFFFE)
1628         {
1629           /* The first two bytes are not BOM.  Treat them as bytes
1630              for a normal character.  */
1631           src = src_base;
1632           coding->errors++;
1633         }
1634       CODING_UTF_16_BOM (coding) = utf_without_bom;
1635     }
1636   else if (bom == utf_detect_bom)
1637     {
1638       /* We have already tried to detect BOM and failed in
1639          detect_coding.  */
1640       CODING_UTF_16_BOM (coding) = utf_without_bom;
1641     }
1642
1643   while (1)
1644     {
1645       int c, c1, c2;
1646
1647       src_base = src;
1648       consumed_chars_base = consumed_chars;
1649
1650       if (charbuf >= charbuf_end)
1651         {
1652           if (byte_after_cr1 >= 0)
1653             src_base -= 2;
1654           break;
1655         }
1656
1657       if (byte_after_cr1 >= 0)
1658         c1 = byte_after_cr1, byte_after_cr1 = -1;
1659       else
1660         ONE_MORE_BYTE (c1);
1661       if (c1 < 0)
1662         {
1663           *charbuf++ = -c1;
1664           continue;
1665         }
1666       if (byte_after_cr2 >= 0)
1667         c2 = byte_after_cr2, byte_after_cr2 = -1;
1668       else
1669         ONE_MORE_BYTE (c2);
1670       if (c2 < 0)
1671         {
1672           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1673           *charbuf++ = -c2;
1674           continue;
1675         }
1676       c = (endian == utf_16_big_endian
1677            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1678
1679       if (surrogate)
1680         {
1681           if (! UTF_16_LOW_SURROGATE_P (c))
1682             {
1683               if (endian == utf_16_big_endian)
1684                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1685               else
1686                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1687               *charbuf++ = c1;
1688               *charbuf++ = c2;
1689               coding->errors++;
1690               if (UTF_16_HIGH_SURROGATE_P (c))
1691                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1692               else
1693                 *charbuf++ = c;
1694             }
1695           else
1696             {
1697               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1698               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1699               *charbuf++ = 0x10000 + c;
1700             }
1701         }
1702       else
1703         {
1704           if (UTF_16_HIGH_SURROGATE_P (c))
1705             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706           else
1707             {
1708               if (eol_dos && c == '\r')
1709                 {
1710                   ONE_MORE_BYTE (byte_after_cr1);
1711                   ONE_MORE_BYTE (byte_after_cr2);
1712                 }
1713               *charbuf++ = c;
1714             }
1715         }
1716     }
1717
1718  no_more_source:
1719   coding->consumed_char += consumed_chars_base;
1720   coding->consumed = src_base - coding->source;
1721   coding->charbuf_used = charbuf - coding->charbuf;
1722 }
1723
1724 static int
1725 encode_coding_utf_16 (struct coding_system *coding)
1726 {
1727   int multibytep = coding->dst_multibyte;
1728   int *charbuf = coding->charbuf;
1729   int *charbuf_end = charbuf + coding->charbuf_used;
1730   unsigned char *dst = coding->destination + coding->produced;
1731   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1732   int safe_room = 8;
1733   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1734   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1735   EMACS_INT produced_chars = 0;
1736   int c;
1737
1738   if (bom != utf_without_bom)
1739     {
1740       ASSURE_DESTINATION (safe_room);
1741       if (big_endian)
1742         EMIT_TWO_BYTES (0xFE, 0xFF);
1743       else
1744         EMIT_TWO_BYTES (0xFF, 0xFE);
1745       CODING_UTF_16_BOM (coding) = utf_without_bom;
1746     }
1747
1748   while (charbuf < charbuf_end)
1749     {
1750       ASSURE_DESTINATION (safe_room);
1751       c = *charbuf++;
1752       if (c > MAX_UNICODE_CHAR)
1753         c = coding->default_char;
1754
1755       if (c < 0x10000)
1756         {
1757           if (big_endian)
1758             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1759           else
1760             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1761         }
1762       else
1763         {
1764           int c1, c2;
1765
1766           c -= 0x10000;
1767           c1 = (c >> 10) + 0xD800;
1768           c2 = (c & 0x3FF) + 0xDC00;
1769           if (big_endian)
1770             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1771           else
1772             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1773         }
1774     }
1775   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1776   coding->produced = dst - coding->destination;
1777   coding->produced_char += produced_chars;
1778   return 0;
1779 }
1780
1781 \f
1782 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1783
1784 /* Emacs' internal format for representation of multiple character
1785    sets is a kind of multi-byte encoding, i.e. characters are
1786    represented by variable-length sequences of one-byte codes.
1787
1788    ASCII characters and control characters (e.g. `tab', `newline') are
1789    represented by one-byte sequences which are their ASCII codes, in
1790    the range 0x00 through 0x7F.
1791
1792    8-bit characters of the range 0x80..0x9F are represented by
1793    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1794    code + 0x20).
1795
1796    8-bit characters of the range 0xA0..0xFF are represented by
1797    one-byte sequences which are their 8-bit code.
1798
1799    The other characters are represented by a sequence of `base
1800    leading-code', optional `extended leading-code', and one or two
1801    `position-code's.  The length of the sequence is determined by the
1802    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1803    whereas extended leading-code and position-code take the range 0xA0
1804    through 0xFF.  See `charset.h' for more details about leading-code
1805    and position-code.
1806
1807    --- CODE RANGE of Emacs' internal format ---
1808    character set        range
1809    -------------        -----
1810    ascii                0x00..0x7F
1811    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1813    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1814    ---------------------------------------------
1815
1816    As this is the internal character representation, the format is
1817    usually not used externally (i.e. in a file or in a data sent to a
1818    process).  But, it is possible to have a text externally in this
1819    format (i.e. by encoding by the coding system `emacs-mule').
1820
1821    In that case, a sequence of one-byte codes has a slightly different
1822    form.
1823
1824    At first, all characters in eight-bit-control are represented by
1825    one-byte sequences which are their 8-bit code.
1826
1827    Next, character composition data are represented by the byte
1828    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1829    where,
1830         METHOD is 0xF2 plus one of composition method (enum
1831         composition_method),
1832
1833         BYTES is 0xA0 plus a byte length of this composition data,
1834
1835         CHARS is 0xA0 plus a number of characters composed by this
1836         data,
1837
1838         COMPONENTs are characters of multibyte form or composition
1839         rules encoded by two-byte of ASCII codes.
1840
1841    In addition, for backward compatibility, the following formats are
1842    also recognized as composition data on decoding.
1843
1844    0x80 MSEQ ...
1845    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1846
1847    Here,
        MSEQ is a multibyte form but in this special format:
1849           ASCII: 0xA0 ASCII_CODE+0x80,
1850           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1851         RULE is a one byte code of the range 0xA0..0xF0 that
1852         represents a composition rule.
1853   */
1854
/* Table indexed by the first byte of an emacs-mule sequence.  The
   decoding code in this file treats the entry as the total number of
   bytes (1 to 4) of the sequence that starts with that byte.  The
   table is not filled in here.  */
char emacs_mule_bytes[256];
1856
1857
1858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1860    else return 0.  */
1861
1862 static int
1863 detect_coding_emacs_mule (struct coding_system *coding,
1864                           struct coding_detection_info *detect_info)
1865 {
1866   const unsigned char *src = coding->source, *src_base;
1867   const unsigned char *src_end = coding->source + coding->src_bytes;
1868   int multibytep = coding->src_multibyte;
1869   EMACS_INT consumed_chars = 0;
1870   int c;
1871   int found = 0;
1872
1873   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1874   /* A coding system of this category is always ASCII compatible.  */
1875   src += coding->head_ascii;
1876
1877   while (1)
1878     {
1879       src_base = src;
1880       ONE_MORE_BYTE (c);
1881       if (c < 0)
1882         continue;
1883       if (c == 0x80)
1884         {
1885           /* Perhaps the start of composite character.  We simply skip
1886              it because analyzing it is too heavy for detecting.  But,
1887              at least, we check that the composite character
1888              constitutes of more than 4 bytes.  */
1889           const unsigned char *src_start;
1890
1891         repeat:
1892           src_start = src;
1893           do
1894             {
1895               ONE_MORE_BYTE (c);
1896             }
1897           while (c >= 0xA0);
1898
1899           if (src - src_start <= 4)
1900             break;
1901           found = CATEGORY_MASK_EMACS_MULE;
1902           if (c == 0x80)
1903             goto repeat;
1904         }
1905
1906       if (c < 0x80)
1907         {
1908           if (c < 0x20
1909               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1910             break;
1911         }
1912       else
1913         {
1914           int more_bytes = emacs_mule_bytes[c] - 1;
1915
1916           while (more_bytes > 0)
1917             {
1918               ONE_MORE_BYTE (c);
1919               if (c < 0xA0)
1920                 {
1921                   src--;        /* Unread the last byte.  */
1922                   break;
1923                 }
1924               more_bytes--;
1925             }
1926           if (more_bytes != 0)
1927             break;
1928           found = CATEGORY_MASK_EMACS_MULE;
1929         }
1930     }
1931   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1932   return 0;
1933
1934  no_more_source:
1935   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1936     {
1937       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938       return 0;
1939     }
1940   detect_info->found |= found;
1941   return 1;
1942 }
1943
1944
1945 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1946    character.  If CMP_STATUS indicates that we must expect MSEQ or
1947    RULE described above, decode it and return the negative value of
1948    the decoded character or rule.  If an invalid byte is found, return
1949    -1.  If SRC is too short, return -2.  */
1950
1951 static int
1952 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1953                  int *nbytes, int *nchars, int *id,
1954                  struct composition_status *cmp_status)
1955 {
1956   const unsigned char *src_end = coding->source + coding->src_bytes;
1957   const unsigned char *src_base = src;
1958   int multibytep = coding->src_multibyte;
1959   int charset_ID;
1960   unsigned code;
1961   int c;
1962   int consumed_chars = 0;
1963   int mseq_found = 0;
1964
1965   ONE_MORE_BYTE (c);
1966   if (c < 0)
1967     {
1968       c = -c;
1969       charset_ID = emacs_mule_charset[0];
1970     }
1971   else
1972     {
1973       if (c >= 0xA0)
1974         {
1975           if (cmp_status->state != COMPOSING_NO
1976               && cmp_status->old_form)
1977             {
1978               if (cmp_status->state == COMPOSING_CHAR)
1979                 {
1980                   if (c == 0xA0)
1981                     {
1982                       ONE_MORE_BYTE (c);
1983                       c -= 0x80;
1984                       if (c < 0)
1985                         goto invalid_code;
1986                     }
1987                   else
1988                     c -= 0x20;
1989                   mseq_found = 1;
1990                 }
1991               else
1992                 {
1993                   *nbytes = src - src_base;
1994                   *nchars = consumed_chars;
1995                   return -c;
1996                 }
1997             }
1998           else
1999             goto invalid_code;
2000         }
2001
2002       switch (emacs_mule_bytes[c])
2003         {
2004         case 2:
2005           if ((charset_ID = emacs_mule_charset[c]) < 0)
2006             goto invalid_code;
2007           ONE_MORE_BYTE (c);
2008           if (c < 0xA0)
2009             goto invalid_code;
2010           code = c & 0x7F;
2011           break;
2012
2013         case 3:
2014           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2015               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2016             {
2017               ONE_MORE_BYTE (c);
2018               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2019                 goto invalid_code;
2020               ONE_MORE_BYTE (c);
2021               if (c < 0xA0)
2022                 goto invalid_code;
2023               code = c & 0x7F;
2024             }
2025           else
2026             {
2027               if ((charset_ID = emacs_mule_charset[c]) < 0)
2028                 goto invalid_code;
2029               ONE_MORE_BYTE (c);
2030               if (c < 0xA0)
2031                 goto invalid_code;
2032               code = (c & 0x7F) << 8;
2033               ONE_MORE_BYTE (c);
2034               if (c < 0xA0)
2035                 goto invalid_code;
2036               code |= c & 0x7F;
2037             }
2038           break;
2039
2040         case 4:
2041           ONE_MORE_BYTE (c);
2042           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2043             goto invalid_code;
2044           ONE_MORE_BYTE (c);
2045           if (c < 0xA0)
2046             goto invalid_code;
2047           code = (c & 0x7F) << 8;
2048           ONE_MORE_BYTE (c);
2049           if (c < 0xA0)
2050             goto invalid_code;
2051           code |= c & 0x7F;
2052           break;
2053
2054         case 1:
2055           code = c;
2056           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2057           break;
2058
2059         default:
2060           abort ();
2061         }
2062       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2063                           CHARSET_FROM_ID (charset_ID), code, c);
2064       if (c < 0)
2065         goto invalid_code;
2066     }
2067   *nbytes = src - src_base;
2068   *nchars = consumed_chars;
2069   if (id)
2070     *id = charset_ID;
2071   return (mseq_found ? -c : c);
2072
2073  no_more_source:
2074   return -2;
2075
2076  invalid_code:
2077   return -1;
2078 }
2079
2080
2081 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2082
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2085
2086    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2087    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2088    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2089
   and these old forms:
2091
2092    (4) relative composition: 0x80 | MSEQ ... MSEQ
2093    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2094
2095    When the starter 0x80 and the following header elements are found,
2096    this annotation header is produced.
2097
2098         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2099
2100    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2101    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2102
2103    Then, upon reading the following elements, these codes are produced
2104    until the composition end is found:
2105
2106    (1) CHAR ... CHAR
2107    (2) ALT ... ALT CHAR ... CHAR
2108    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2109    (4) CHAR ... CHAR
2110    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2111
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2114
2115    (1) LENGTH: unchanged, NCHARS: unchanged
2116    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2117    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2118    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2119    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2120
2121    If an error is found while composing, the annotation header is
2122    changed to the original composition header (plus filler -1s) as
2123    below:
2124
2125    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2127
2128    and the sequence [ -2 DECODED-RULE ] is changed to the original
2129    byte sequence as below:
2130         o the original byte sequence is B: [ B -1 ]
2131         o the original byte sequence is B1 B2: [ B1 B2 ]
2132
   Most of the routines are implemented by macros because many
   variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (thus they don't
   increase the size of the compiled object).  */
2137
/* Decode the one-byte composition rule held in C, a component of a
   composition sequence of Emacs 20 style, and set RULE to the decoded
   rule.  C is changed to its offset from 0xA0; if that is not in
   0..80, jump to `invalid_code'.  The offset is split into two
   base-9 digits, and a digit of 4 is mapped to 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    if (gref == 4)                                      \
      gref = 10;                                        \
    if (nref == 4)                                      \
      nref = 10;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2154
2155
/* Decode the two-byte composition rule of Emacs 21 style whose first
   byte is in C and whose second byte is the next one at SRC, and set
   RULE to the decoded rule.  Each byte minus 0x20 must be in 0..80;
   otherwise jump to `invalid_code'.  C is left holding the second
   byte.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (gref >= 0 && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (nref >= 0 && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2173
2174
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES and CHARS: 0xA0 plus the byte length of this composition
   information, and 0xA0 plus the number of characters composed by
   it.  Jump to `invalid_code' if either value is out of range.
   Otherwise set up *CMP_STATUS and add the annotation header to
   CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2206
2207
/* Start of Emacs 20 style format for relative composition.  Set up
   *CMP_STATUS for an old-form composition with no characters or
   components yet, and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2219
2220
/* Start of Emacs 20 style format for rule-base composition.  Set up
   *CMP_STATUS for an old-form composition with rules, with no
   characters or components yet, and add an annotation header with
   zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2232
2233
/* Handle the byte that follows the composition starter 0x80.  Read
   it into C and begin the composition format it selects:
     0xF2 + a composition method: Emacs 21 style;
     0xFF: Emacs 20 style rule-base composition;
     0xA0..0xBF: Emacs 20 style relative composition, in which case
       the byte is itself the first component and is unread.
   Jump to `invalid_code' for any other byte.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (COMPOSITION_RELATIVE <= c - 0xF2                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2257
/* Finish the composition in progress.  The annotation header starts
   CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, store the number of characters in the header's NCHARS
   element.  Otherwise, for any method other than relative
   composition, set the header's first element to NCHARS minus
   CMP_STATUS->length.  Then reset the state to COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2268
2269
2270 static int
2271 emacs_mule_finish_composition (int *charbuf,
2272                                struct composition_status *cmp_status)
2273 {
2274   int idx = - cmp_status->length;
2275   int new_chars;
2276
2277   if (cmp_status->old_form && cmp_status->nchars > 0)
2278     {
2279       charbuf[idx + 2] = cmp_status->nchars;
2280       new_chars = 0;
2281       if (cmp_status->method == COMPOSITION_WITH_RULE
2282           && cmp_status->state == COMPOSING_CHAR)
2283         {
2284           /* The last rule was invalid.  */
2285           int rule = charbuf[-1] + 0xA0;
2286
2287           charbuf[-2] = BYTE8_TO_CHAR (rule);
2288           charbuf[-1] = -1;
2289           new_chars = 1;
2290         }
2291     }
2292   else
2293     {
2294       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2295
2296       if (cmp_status->method == COMPOSITION_WITH_RULE)
2297         {
2298           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2299           charbuf[idx++] = -3;
2300           charbuf[idx++] = 0;
2301           new_chars = 1;
2302         }
2303       else
2304         {
2305           int nchars = charbuf[idx + 1] + 0xA0;
2306           int nbytes = charbuf[idx + 2] + 0xA0;
2307
2308           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2309           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2310           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2311           charbuf[idx++] = -1;
2312           new_chars = 4;
2313         }
2314     }
2315   cmp_status->state = COMPOSING_NO;
2316   return new_chars;
2317 }
2318
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the count
   it returns.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (COMPOSING_NO != cmp_status->state)                              \
      {                                                                 \
        int restored                                                    \
          = emacs_mule_finish_composition (charbuf, cmp_status);        \
                                                                        \
        char_offset += restored;                                        \
      }                                                                 \
  } while (0)
2324
2325
     /* Decode the emacs-mule byte sequence in CODING->source, from byte
        offset CODING->consumed up to CODING->src_bytes, into character
        codes appended to CODING->charbuf starting at index
        CODING->charbuf_used.

        Charset and composition annotations are written into the charbuf
        alongside the decoded characters.  Composition state lives in
        CODING->spec.emacs_mule.cmp_status, so a composition cut off at the
        end of this block can be picked up again by the next call.  Bytes
        that cannot be decoded are emitted one at a time (ASCII as is,
        anything else through BYTE8_TO_CHAR) and counted in CODING->errors.

        Before returning, CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated to describe how far decoding
        got.  */
2326 static void
2327 decode_coding_emacs_mule (struct coding_system *coding)
2328 {
2329   const unsigned char *src = coding->source + coding->consumed;
2330   const unsigned char *src_end = coding->source + coding->src_bytes;
2331   const unsigned char *src_base;
2332   int *charbuf = coding->charbuf + coding->charbuf_used;
2333   /* We may produce two annotations (charset and composition) in one
2334      loop and one more charset annotation at the end.  */
2335   int *charbuf_end
2336     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2337   EMACS_INT consumed_chars = 0, consumed_chars_base;
       /* Not referenced by name below; presumably read by ONE_MORE_BYTE,
          which is defined outside this block -- confirm there.  */
2338   int multibytep = coding->src_multibyte;
2339   EMACS_INT char_offset = coding->produced_char;
2340   EMACS_INT last_offset = char_offset;
2341   int last_id = charset_ascii;
2342   int eol_dos =
2343     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR under DOS EOL conversion, or -1 when
          no such byte is pending.  */
2344   int byte_after_cr = -1;
2345   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2346
       /* A composition was left unfinished by the previous call: put its
          saved elements back at the front of this call's output.  */
2347   if (cmp_status->state != COMPOSING_NO)
2348     {
2349       int i;
2350
2351       for (i = 0; i < cmp_status->length; i++)
2352         *charbuf++ = cmp_status->carryover[i];
2353       coding->annotated = 1;
2354     }
2355
2356   while (1)
2357     {
2358       int c, id IF_LINT (= 0);
2359
           /* Remember where this iteration started so that an invalid or
              incomplete sequence can be rewound to this point.  */
2360       src_base = src;
2361       consumed_chars_base = consumed_chars;
2362
2363       if (charbuf >= charbuf_end)
2364         {
               /* The output buffer has no more room.  If a byte read ahead
                  after a CR is still pending, step SRC_BASE back over it so
                  that it is not recorded as consumed.  */
2365           if (byte_after_cr >= 0)
2366             src_base--;
2367           break;
2368         }
2369
2370       if (byte_after_cr >= 0)
2371         c = byte_after_cr, byte_after_cr = -1;
2372       else
2373         ONE_MORE_BYTE (c);
2374
           /* Either ONE_MORE_BYTE handed back a negative value, which is
              stored negated as a character of its own, or the byte is 0x80,
              which is handed to DECODE_EMACS_MULE_COMPOSITION_START
              (defined outside this block).  Both first close any
              composition in progress.  */
2375       if (c < 0 || c == 0x80)
2376         {
2377           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2378           if (c < 0)
2379             {
2380               *charbuf++ = -c;
2381               char_offset++;
2382             }
2383           else
2384             DECODE_EMACS_MULE_COMPOSITION_START ();
2385           continue;
2386         }
2387
2388       if (c < 0x80)
2389         {
               /* With DOS end-of-line conversion, read the byte after a CR
                  right away; it becomes C on the next iteration instead of
                  a fresh read.  */
2390           if (eol_dos && c == '\r')
2391             ONE_MORE_BYTE (byte_after_cr);
2392           id = charset_ascii;
2393           if (cmp_status->state != COMPOSING_NO)
2394             {
2395               if (cmp_status->old_form)
2396                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2397               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2398                 cmp_status->ncomps--;
2399             }
2400         }
2401       else
2402         {
2403           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2404           /* emacs_mule_char can load a charset map from a file, which
2405              allocates a large structure and might cause buffer text
2406              to be relocated as result.  Thus, we need to remember the
2407              original pointer to buffer text, and fix up all related
2408              pointers after the call.  */
2409           const unsigned char *orig = coding->source;
2410           EMACS_INT offset;
2411
2412           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2413                                cmp_status);
2414           offset = coding->source - orig;
2415           if (offset)
2416             {
2417               src += offset;
2418               src_base += offset;
2419               src_end += offset;
2420             }
               /* -1 sends the bytes at SRC_BASE through the invalid-code
                  path below.  -2 stops decoding with SRC_BASE still at
                  this sequence (presumably emacs_mule_char ran out of
                  source -- confirm there).  Other negative values fall
                  through and are handled by the state machine below.  */
2421           if (c < 0)
2422             {
2423               if (c == -1)
2424                 goto invalid_code;
2425               if (c == -2)
2426                 break;
2427             }
2428           src = src_base + nbytes;
2429           consumed_chars = consumed_chars_base + nchars;
2430           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2431             cmp_status->ncomps -= nchars;
2432         }
2433
2434       /* Now if C >= 0, we found a normally encoded character, if C <
2435          0, we found an old-style composition component character or
2436          rule.  */
2437
2438       if (cmp_status->state == COMPOSING_NO)
2439         {
               /* The charset changed: annotate the run of characters that
                  just ended (unless it was ASCII) and start a new run.  */
2440           if (last_id != id)
2441             {
2442               if (last_id != charset_ascii)
2443                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2444                                   last_id);
2445               last_id = id;
2446               last_offset = char_offset;
2447             }
2448           *charbuf++ = c;
2449           char_offset++;
2450         }
2451       else if (cmp_status->state == COMPOSING_CHAR)
2452         {
2453           if (cmp_status->old_form)
2454             {
                   /* In the old form a normally encoded character ends the
                      composition; a negative C is a component and is
                      stored negated.  */
2455               if (c >= 0)
2456                 {
2457                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2458                   *charbuf++ = c;
2459                   char_offset++;
2460                 }
2461               else
2462                 {
2463                   *charbuf++ = -c;
2464                   cmp_status->nchars++;
2465                   cmp_status->length++;
2466                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2467                     EMACS_MULE_COMPOSITION_END ();
2468                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2469                     cmp_status->state = COMPOSING_RULE;
2470                 }
2471             }
2472           else
2473             {
                   /* Here NCHARS counts down the characters still expected;
                      the composition ends when it reaches zero.  */
2474               *charbuf++ = c;
2475               cmp_status->length++;
2476               cmp_status->nchars--;
2477               if (cmp_status->nchars == 0)
2478                 EMACS_MULE_COMPOSITION_END ();
2479             }
2480         }
2481       else if (cmp_status->state == COMPOSING_RULE)
2482         {
2483           int rule;
2484
2485           if (c >= 0)
2486             {
2487               EMACS_MULE_COMPOSITION_END ();
2488               *charbuf++ = c;
2489               char_offset++;
2490             }
2491           else
2492             {
2493               c = -c;
2494               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2495               if (rule < 0)
2496                 goto invalid_code;
                   /* A rule is stored as the pair (-2, RULE).  */
2497               *charbuf++ = -2;
2498               *charbuf++ = rule;
2499               cmp_status->length += 2;
2500               cmp_status->state = COMPOSING_CHAR;
2501             }
2502         }
2503       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2504         {
2505           *charbuf++ = c;
2506           cmp_status->length++;
2507           if (cmp_status->ncomps == 0)
2508             cmp_status->state = COMPOSING_CHAR;
2509           else if (cmp_status->ncomps > 0)
2510             {
2511               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2512                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2513             }
2514           else
2515             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516         }
2517       else                      /* COMPOSING_COMPONENT_RULE */
2518         {
2519           int rule;
2520
2521           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2522           if (rule < 0)
2523             goto invalid_code;
2524           *charbuf++ = -2;
2525           *charbuf++ = rule;
2526           cmp_status->length += 2;
2527           cmp_status->ncomps--;
2528           if (cmp_status->ncomps > 0)
2529             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2530           else
2531             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2532         }
2533       continue;
2534
           /* Give up on the sequence starting at SRC_BASE: close any
              composition, rewind to SRC_BASE, and emit just its first byte
              (ASCII as is, otherwise through BYTE8_TO_CHAR), counting one
              error.  The loop then resumes at the following byte.  */
2535     invalid_code:
2536       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2537       src = src_base;
2538       consumed_chars = consumed_chars_base;
2539       ONE_MORE_BYTE (c);
2540       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2541       char_offset++;
2542       coding->errors++;
2543     }
2544
       /* Reached when the loop above breaks and, presumably, by a jump from
          ONE_MORE_BYTE when the source is exhausted (no `goto' to this
          label is visible in this function).  */
2545  no_more_source:
2546   if (cmp_status->state != COMPOSING_NO)
2547     {
2548       if (coding->mode & CODING_MODE_LAST_BLOCK)
2549         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550       else
2551         {
2552           int i;
2553
               /* More input will follow: take the unfinished composition
                  back out of CHARBUF and keep it in the carryover for the
                  next call.  */
2554           charbuf -= cmp_status->length;
2555           for (i = 0; i < cmp_status->length; i++)
2556             cmp_status->carryover[i] = charbuf[i];
2557         }
2558     }
       /* Annotate the last run of non-ASCII characters.  */
2559   if (last_id != charset_ascii)
2560     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Only input up to SRC_BASE, the start of the last iteration, is
          reported as consumed.  */
2561   coding->consumed_char += consumed_chars_base;
2562   coding->consumed = src_base - coding->source;
2563   coding->charbuf_used = charbuf - coding->charbuf;
2564 }
2565
2566
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading code(s) for the charset whose emacs-mule id is ID.

   An id below 0xA0 is itself the only leading byte, and CODES[1] is
   set to 0 to say that no second byte follows.  For a larger id,
   CODES[0] receives the leading code of the range the id falls in
   (0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4,
   0x9D for anything above) and CODES[1] receives the id.

   ID is evaluated more than once, so it must have no side effects.
   The expansion deliberately carries no trailing semicolon: the
   caller writes it, which keeps the macro usable as the body of an
   unbraced `if' that is followed by `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2580
2581
     /* Encode the character codes in CODING->charbuf (CODING->charbuf_used
        elements) as emacs-mule bytes appended to CODING->destination from
        byte offset CODING->produced.

        A negative element introduces an annotation.  A charset annotation
        selects the charset preferred for the characters that follow;
        composition annotations are skipped.  ASCII characters go out
        through EMIT_ONE_ASCII_BYTE, characters satisfying CHAR_BYTE8_P go
        out as the byte CHAR_TO_BYTE8 yields, and any other character is
        written as the leading code(s) of its charset followed by one or
        two code bytes with the high bit set.  A character that no charset
        in the list can encode is replaced by CODING->default_char.

        Always returns 0, after recording CODING_RESULT_SUCCESS and
        updating CODING->produced_char and CODING->produced.  */
2582 static int
2583 encode_coding_emacs_mule (struct coding_system *coding)
2584 {
       /* Not referenced by name below; presumably read by the EMIT_*
          macros, which are defined outside this block -- confirm there.  */
2585   int multibytep = coding->dst_multibyte;
2586   int *charbuf = coding->charbuf;
2587   int *charbuf_end = charbuf + coding->charbuf_used;
2588   unsigned char *dst = coding->destination + coding->produced;
2589   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each element; presumably the
          number of output bytes that must be available -- confirm against
          that macro.  */
2590   int safe_room = 8;
2591   EMACS_INT produced_chars = 0;
2592   Lisp_Object attrs, charset_list;
2593   int c;
       /* Charset id named by the latest charset annotation, or -1 when
          there is none or it is not in CHARSET_LIST.  */
2594   int preferred_charset_id = -1;
2595
       /* Whatever charset list the coding system carries, encoding always
          uses Vemacs_mule_charset_list; the attribute is updated to match.  */
2596   CODING_GET_INFO (coding, attrs, charset_list);
2597   if (! EQ (charset_list, Vemacs_mule_charset_list))
2598     {
2599       CODING_ATTR_CHARSET_LIST (attrs)
2600         = charset_list = Vemacs_mule_charset_list;
2601     }
2602
2603   while (charbuf < charbuf_end)
2604     {
2605       ASSURE_DESTINATION (safe_room);
2606       c = *charbuf++;
2607
2608       if (c < 0)
2609         {
2610           /* Handle an annotation.  */
               /* C is the negated length of the annotation, counting the
                  element just read; CHARBUF now points at the element
                  that tells which kind of annotation this is.  */
2611           switch (*charbuf)
2612             {
2613             case CODING_ANNOTATE_COMPOSITION_MASK:
2614               /* Not yet implemented.  */
2615               break;
2616             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF was already advanced past the
                      length element, so if a charset annotation is four
                      elements long, index 3 lies just past it and the id
                      would be at index 2 -- confirm against
                      ADD_CHARSET_DATA, which is defined outside this
                      block.  */
2617               preferred_charset_id = charbuf[3];
2618               if (preferred_charset_id >= 0
2619                   && NILP (Fmemq (make_number (preferred_charset_id),
2620                                   charset_list)))
2621                 preferred_charset_id = -1;
2622               break;
2623             default:
2624               abort ();
2625             }
               /* Skip the rest of the annotation.  */
2626           charbuf += -c - 1;
2627           continue;
2628         }
2629
2630       if (ASCII_CHAR_P (c))
2631         EMIT_ONE_ASCII_BYTE (c);
2632       else if (CHAR_BYTE8_P (c))
2633         {
2634           c = CHAR_TO_BYTE8 (c);
2635           EMIT_ONE_BYTE (c);
2636         }
2637       else
2638         {
2639           struct charset *charset;
2640           unsigned code;
2641           int dimension;
2642           int emacs_mule_id;
2643           unsigned char leading_codes[2];
2644
               /* Try the preferred charset first; fall back to searching
                  the whole list when it cannot represent C.  */
2645           if (preferred_charset_id >= 0)
2646             {
2647               charset = CHARSET_FROM_ID (preferred_charset_id);
2648               if (CHAR_CHARSET_P (c, charset))
2649                 code = ENCODE_CHAR (charset, c);
2650               else
2651                 charset = char_charset (c, charset_list, &code);
2652             }
2653           else
2654             charset = char_charset (c, charset_list, &code);
2655           if (! charset)
2656             {
                   /* No charset can encode C: substitute the default
                      character.  */
2657               c = coding->default_char;
2658               if (ASCII_CHAR_P (c))
2659                 {
2660                   EMIT_ONE_ASCII_BYTE (c);
2661                   continue;
2662                 }
                   /* NOTE(review): the result is used below without a
                      null check -- confirm that a non-ASCII default_char
                      is always encodable by some charset in the list.  */
2663               charset = char_charset (c, charset_list, &code);
2664             }
2665           dimension = CHARSET_DIMENSION (charset);
2666           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2667           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2668           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is none.  */
2669           if (leading_codes[1])
2670             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are written with their high bit set, most
                  significant byte first.  */
2671           if (dimension == 1)
2672             EMIT_ONE_BYTE (code | 0x80);
2673           else
2674             {
2675               code |= 0x8080;
2676               EMIT_ONE_BYTE (code >> 8);
2677               EMIT_ONE_BYTE (code & 0xFF);
2678             }
2679         }
2680     }
2681   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2682   coding->produced_char += produced_chars;
2683   coding->produced = dst - coding->destination;
2684   return 0;
2685 }
2686
2687 \f
2688 /*** 7. ISO2022 handlers ***/
2689
2690 /* The following note describes the coding system ISO2022 briefly.
2691    Since the intention of this note is to help understand the
2692    functions in this file, some parts are NOT ACCURATE or are OVERLY
2693    SIMPLIFIED.  For thorough understanding, please refer to the
2694    original document of ISO2022.  This is equivalent to the standard
2695    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2696
2697    ISO2022 provides many mechanisms to encode several character sets
2698    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2699    is encoded using bytes less than 128.  This may make the encoded
2700    text a little bit longer, but the text passes more easily through
2701    several types of gateway, some of which strip off the MSB (Most
2702    Significant Bit).
2703
2704    There are two kinds of character sets: control character sets and
2705    graphic character sets.  The former contain control characters such
2706    as `newline' and `escape' to provide control functions (control
2707    functions are also provided by escape sequences).  The latter
2708    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2709    two control character sets and many graphic character sets.
2710
2711    Graphic character sets are classified into one of the following
2712    four classes, according to the number of bytes (DIMENSION) and
2713    number of characters in one dimension (CHARS) of the set:
2714    - DIMENSION1_CHARS94
2715    - DIMENSION1_CHARS96
2716    - DIMENSION2_CHARS94
2717    - DIMENSION2_CHARS96
2718
2719    In addition, each character set is assigned an identification tag,
2720    unique for each set, called the "final character" (denoted as <F>
2721    hereafter).  The <F> of each character set is decided by ECMA(*)
2722    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2723    (0x30..0x3F are for private use only).
2724
2725    Note (*): ECMA = European Computer Manufacturers Association
2726
2727    Here are examples of graphic character sets [NAME(<F>)]:
2728         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2729         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2730         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2731         o DIMENSION2_CHARS96 -- none for the moment
2732
2733    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2734         C0 [0x00..0x1F] -- control character plane 0
2735         GL [0x20..0x7F] -- graphic character plane 0
2736         C1 [0x80..0x9F] -- control character plane 1
2737         GR [0xA0..0xFF] -- graphic character plane 1
2738
2739    A control character set is directly designated and invoked to C0 or
2740    C1 by an escape sequence.  The most common case is that:
2741    - ISO646's  control character set is designated/invoked to C0, and
2742    - ISO6429's control character set is designated/invoked to C1,
2743    and usually these designations/invocations are omitted in encoded
2744    text.  In a 7-bit environment, only C0 can be used, and a control
2745    character for C1 is encoded by an appropriate escape sequence to
2746    fit into the environment.  All control characters for C1 are
2747    defined to have corresponding escape sequences.
2748
2749    A graphic character set is at first designated to one of four
2750    graphic registers (G0 through G3), then these graphic registers are
2751    invoked to GL or GR.  These designations and invocations can be
2752    done independently.  The most common case is that G0 is invoked to
2753    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2754    these invocations and designations are omitted in encoded text.
2755    In a 7-bit environment, only GL can be used.
2756
2757    When a graphic character set of CHARS94 is invoked to GL, codes
2758    0x20 and 0x7F of the GL area work as control characters SPACE and
2759    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2760    be used.
2761
2762    There are two ways of invocation: locking-shift and single-shift.
2763    With locking-shift, the invocation lasts until the next different
2764    invocation, whereas with single-shift, the invocation affects the
2765    following character only and doesn't affect the locking-shift
2766    state.  Invocations are done by the following control characters or
2767    escape sequences:
2768
2769    ----------------------------------------------------------------------
2770    abbrev  function                  cntrl escape seq   description
2771    ----------------------------------------------------------------------
2772    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2773    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2774    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2775    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2776    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2777    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2778    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2779    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2780    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2781    ----------------------------------------------------------------------
2782    (*) These are not used by any known coding system.
2783
2784    Control characters for these functions are defined by macros
2785    ISO_CODE_XXX in `coding.h'.
2786
2787    Designations are done by the following escape sequences:
2788    ----------------------------------------------------------------------
2789    escape sequence      description
2790    ----------------------------------------------------------------------
2791    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2792    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2793    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2794    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2795    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2796    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2797    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2798    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2799    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2800    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2801    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2802    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2803    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2804    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2805    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2806    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2807    ----------------------------------------------------------------------
2808
2809    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2810    of dimension 1, chars 94, and final character <F>, etc...
2811
2812    Note (*): Although these designations are not allowed in ISO2022,
2813    Emacs accepts them on decoding, and produces them on encoding
2814    CHARS96 character sets in a coding system which is characterized as
2815    7-bit environment, non-locking-shift, and non-single-shift.
2816
2817    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2818    '(' must be omitted.  We refer to this as "short-form" hereafter.
2819
2820    Now you may notice that there are a lot of ways of encoding the
2821    same multilingual text in ISO2022.  Actually, there exist many
2822    coding systems such as Compound Text (used in X11's inter-client
2823    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2824    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2825    localized platforms), and all of these are variants of ISO2022.
2826
2827    In addition to the above, Emacs handles two more kinds of escape
2828    sequences: ISO6429's direction specification and Emacs' private
2829    sequence for specifying character composition.
2830
2831    ISO6429's direction specification takes the following form:
2832         o CSI ']'      -- end of the current direction
2833         o CSI '0' ']'  -- end of the current direction
2834         o CSI '1' ']'  -- start of left-to-right text
2835         o CSI '2' ']'  -- start of right-to-left text
2836    The control character CSI (0x9B: control sequence introducer) is
2837    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2838
2839    Character composition specification takes the following form:
2840         o ESC '0' -- start relative composition
2841         o ESC '1' -- end composition
2842         o ESC '2' -- start rule-base composition (*)
2843         o ESC '3' -- start relative composition with alternate chars  (**)
2844         o ESC '4' -- start rule-base composition with alternate chars  (**)
2845   Since these are not standard escape sequences of any ISO standard,
2846   the use of them with these meanings is restricted to Emacs only.
2847
2848   (*) This form is used only in Emacs 20.7 and older versions,
2849   but newer versions can safely decode it.
2850   (**) This form is used only in Emacs 21.1 and newer versions,
2851   and older versions can't decode it.
2852
2853   Here's a list of example usages of these composition escape
2854   sequences (categorized by `enum composition_method').
2855
2856   COMPOSITION_RELATIVE:
2857         ESC 0 CHAR [ CHAR ] ESC 1
2858   COMPOSITION_WITH_RULE:
2859         ESC 2 CHAR [ RULE CHAR ] ESC 1
2860   COMPOSITION_WITH_ALTCHARS:
2861         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2862   COMPOSITION_WITH_RULE_ALTCHARS:
2863         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2864
     /* One entry per byte value (0..255).  Presumably the ISO-2022 code
        class of each byte, filled in elsewhere in this file -- confirm
        at the place where the table is initialized.  */
2865 static enum iso_code_class_type iso_code_class[256];
2866
/* Return non-zero if the charset with id ID is usable in CODING:
   ID must lie within CODING's safe_charsets table
   (0 .. max_charset_id) and its entry there must not be 255, the
   value setup_iso_safe_charsets leaves for a charset that got no
   register.

   A negative ID yields zero without touching the table;
   iso_charset_table lookups can produce negative ids.  ID is
   evaluated more than once, so it must have no side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2870
2871 static void
2872 setup_iso_safe_charsets (Lisp_Object attrs)
2873 {
2874   Lisp_Object charset_list, safe_charsets;
2875   Lisp_Object request;
2876   Lisp_Object reg_usage;
2877   Lisp_Object tail;
2878   int reg94, reg96;
2879   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2880   int max_charset_id;
2881
2882   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2883   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2884       && ! EQ (charset_list, Viso_2022_charset_list))
2885     {
2886       CODING_ATTR_CHARSET_LIST (attrs)
2887         = charset_list = Viso_2022_charset_list;
2888       ASET (attrs, coding_attr_safe_charsets, Qnil);
2889     }
2890
2891   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2892     return;
2893
2894   max_charset_id = 0;
2895   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2896     {
2897       int id = XINT (XCAR (tail));
2898       if (max_charset_id < id)
2899         max_charset_id = id;
2900     }
2901
2902   safe_charsets = make_uninit_string (max_charset_id + 1);
2903   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2904   request = AREF (attrs, coding_attr_iso_request);
2905   reg_usage = AREF (attrs, coding_attr_iso_usage);
2906   reg94 = XINT (XCAR (reg_usage));
2907   reg96 = XINT (XCDR (reg_usage));
2908
2909   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2910     {
2911       Lisp_Object id;
2912       Lisp_Object reg;
2913       struct charset *charset;
2914
2915       id = XCAR (tail);
2916       charset = CHARSET_FROM_ID (XINT (id));
2917       reg = Fcdr (Fassq (id, request));
2918       if (! NILP (reg))
2919         SSET (safe_charsets, XINT (id), XINT (reg));
2920       else if (charset->iso_chars_96)
2921         {
2922           if (reg96 < 4)
2923             SSET (safe_charsets, XINT (id), reg96);
2924         }
2925       else
2926         {
2927           if (reg94 < 4)
2928             SSET (safe_charsets, XINT (id), reg94);
2929         }
2930     }
2931   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2932 }
2933
2934
2935 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2936    Check if a text is encoded in one of ISO-2022 based coding systems.
2937    If it is, return 1, else return 0.

        Every ISO category is marked as checked in DETECT_INFO->checked.
        The function returns 0 as soon as all ISO categories have been
        rejected, after adding CATEGORY_MASK_ISO to
        DETECT_INFO->rejected.  Otherwise scanning goes on until the
        `no_more_source' label is reached; the categories rejected along
        the way, and those found but not rejected, are then merged into
        DETECT_INFO and 1 is returned.  */
2938
2939 static int
2940 detect_coding_iso_2022 (struct coding_system *coding,
2941                         struct coding_detection_info *detect_info)
2942 {
2943   const unsigned char *src = coding->source, *src_base = src;
2944   const unsigned char *src_end = coding->source + coding->src_bytes;
2945   int multibytep = coding->src_multibyte;
       /* Non-zero right after a single-shift code or sequence.  */
2946   int single_shifting = 0;
2947   int id;
2948   int c, c1;
2949   EMACS_INT consumed_chars = 0;
2950   int i;
       /* Category masks ruled out and supported so far.  */
2951   int rejected = 0;
2952   int found = 0;
       /* -1 outside a composition; otherwise the number of characters
          counted since the composition was started.  */
2953   int composition_count = -1;
2954
2955   detect_info->checked |= CATEGORY_MASK_ISO;
2956
       /* Refresh the cached safe-charsets table of every ISO category that
          is set up (id >= 0).  */
2957   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2958     {
2959       struct coding_system *this = &(coding_categories[i]);
2960       Lisp_Object attrs, val;
2961
2962       if (this->id < 0)
2963         continue;
2964       attrs = CODING_ID_ATTRS (this->id);
2965       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2966           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2967         setup_iso_safe_charsets (attrs);
2968       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2969       this->max_charset_id = SCHARS (val) - 1;
2970       this->safe_charsets = SDATA (val);
2971     }
2972
2973   /* A coding system of this category is always ASCII compatible.  */
2974   src += coding->head_ascii;
2975
2976   while (rejected != CATEGORY_MASK_ISO)
2977     {
2978       src_base = src;
2979       ONE_MORE_BYTE (c);
2980       switch (c)
2981         {
2982         case ISO_CODE_ESC:
2983           if (inhibit_iso_escape_detection)
2984             break;
2985           single_shifting = 0;
2986           ONE_MORE_BYTE (c);
2987           if (c == 'N' || c == 'O')
2988             {
2989               /* ESC <Fe> for SS2 or SS3.  */
2990               single_shifting = 1;
2991               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2992             }
2993           else if (c == '1')
2994             {
2995               /* End of composition.  */
2996               if (composition_count < 0
2997                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2998                 /* Invalid */
2999                 break;
3000               composition_count = -1;
3001               found |= CATEGORY_MASK_ISO;
3002             }
3003           else if (c >= '0' && c <= '4')
3004             {
3005               /* ESC <Fp> starting a composition ('1', which ends one, was
                      handled above).  */
3006               composition_count = 0;
3007             }
3008           else
3009             {
3010               if (c >= '(' && c <= '/')
3011                 {
3012                   /* Designation sequence for a charset of dimension 1.  */
3013                   ONE_MORE_BYTE (c1);
3014                   if (c1 < ' ' || c1 >= 0x80
3015                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3016                     /* Invalid designation sequence.  Just ignore.  */
3017                     break;
3018                 }
3019               else if (c == '$')
3020                 {
3021                   /* Designation sequence for a charset of dimension 2.  */
3022                   ONE_MORE_BYTE (c);
3023                   if (c >= '@' && c <= 'B')
3024                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the two table lookups guarded
                            by `< 0' nearby, this ID is not checked before it
                            reaches SAFE_CHARSET_P below -- confirm the table
                            never holds a negative entry for these finals.  */
3025                     id = iso_charset_table[1][0][c];
3026                   else if (c >= '(' && c <= '/')
3027                     {
3028                       ONE_MORE_BYTE (c1);
3029                       if (c1 < ' ' || c1 >= 0x80
3030                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3031                         /* Invalid designation sequence.  Just ignore.  */
3032                         break;
3033                     }
3034                   else
3035                     /* Invalid designation sequence.  Just ignore it.  */
3036                     break;
3037                 }
3038               else
3039                 {
3040                   /* Invalid escape sequence.  Just ignore it.  */
3041                   break;
3042                 }
3043
3044               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found when the charset is safe for it and rejected
                      otherwise.  */
3045               rejected |= CATEGORY_MASK_ISO_8BIT;
3046               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3047                                   id))
3048                 found |= CATEGORY_MASK_ISO_7;
3049               else
3050                 rejected |= CATEGORY_MASK_ISO_7;
3051               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3052                                   id))
3053                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3054               else
3055                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3056               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3057                                   id))
3058                 found |= CATEGORY_MASK_ISO_7_ELSE;
3059               else
3060                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3061               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3062                                   id))
3063                 found |= CATEGORY_MASK_ISO_8_ELSE;
3064               else
3065                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3066             }
3067           break;
3068
3069         case ISO_CODE_SO:
3070         case ISO_CODE_SI:
3071           /* Locking shift out/in.  */
3072           if (inhibit_iso_escape_detection)
3073             break;
3074           single_shifting = 0;
3075           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3076           break;
3077
3078         case ISO_CODE_CSI:
3079           /* Control sequence introducer.  */
3080           single_shifting = 0;
3081           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3082           found |= CATEGORY_MASK_ISO_8_ELSE;
3083           goto check_extra_latin;
3084
3085         case ISO_CODE_SS2:
3086         case ISO_CODE_SS3:
3087           /* Single shift.   */
3088           if (inhibit_iso_escape_detection)
3089             break;
3090           single_shifting = 0;
3091           rejected |= CATEGORY_MASK_ISO_7BIT;
3092           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3093               & CODING_ISO_FLAG_SINGLE_SHIFT)
3094             {
3095               found |= CATEGORY_MASK_ISO_8_1;
3096               single_shifting = 1;
3097             }
3098           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3099               & CODING_ISO_FLAG_SINGLE_SHIFT)
3100             {
3101               found |= CATEGORY_MASK_ISO_8_2;
3102               single_shifting = 1;
3103             }
3104           if (single_shifting)
3105             break;
               /* Neither 8-bit category uses single shift, so fall through
                  and treat the byte as a possible extra Latin code.  */
3106         check_extra_latin:
               /* A byte that Vlatin_extra_code_table does not list rejects
                  every ISO category.  */
3107           if (! VECTORP (Vlatin_extra_code_table)
3108               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3109             {
3110               rejected = CATEGORY_MASK_ISO;
3111               break;
3112             }
3113           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3114               & CODING_ISO_FLAG_LATIN_EXTRA)
3115             found |= CATEGORY_MASK_ISO_8_1;
3116           else
3117             rejected |= CATEGORY_MASK_ISO_8_1;
3118           rejected |= CATEGORY_MASK_ISO_8_2;
3119           break;
3120
3121         default:
               /* A negative value from ONE_MORE_BYTE is skipped.  */
3122           if (c < 0)
3123             continue;
3124           if (c < 0x80)
3125             {
3126               if (composition_count >= 0)
3127                 composition_count++;
3128               single_shifting = 0;
3129               break;
3130             }
3131           if (c >= 0xA0)
3132             {
3133               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3134               found |= CATEGORY_MASK_ISO_8_1;
3135               /* Check the length of succeeding codes of the range
3136                  0xA0..0xFF.  If the byte length is even, we include
3137                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3138                  only when we are not single shifting.  */
3139               if (! single_shifting
3140                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3141                 {
3142                   int len = 1;
                       /* Count the run of bytes >= 0xA0; the byte that ends
                          the run is pushed back for the outer loop.  */
3143                   while (src < src_end)
3144                     {
3145                       src_base = src;
3146                       ONE_MORE_BYTE (c);
3147                       if (c < 0xA0)
3148                         {
3149                           src = src_base;
3150                           break;
3151                         }
3152                       len++;
3153                     }
3154
                       /* An odd-length run rejects the category only when
                          it stopped before the end of the source; a run
                          that reaches the end is treated like an even
                          one.  */
3155                   if (len & 1 && src < src_end)
3156                     {
3157                       rejected |= CATEGORY_MASK_ISO_8_2;
3158                       if (composition_count >= 0)
3159                         composition_count += len;
3160                     }
3161                   else
3162                     {
3163                       found |= CATEGORY_MASK_ISO_8_2;
3164                       if (composition_count >= 0)
3165                         composition_count += len / 2;
3166                     }
3167                 }
3168               break;
3169             }
3170         }
3171     }
       /* The loop ended because every ISO category was rejected.  */
3172   detect_info->rejected |= CATEGORY_MASK_ISO;
3173   return 0;
3174
       /* No `goto' to this label is visible in this function; presumably
          ONE_MORE_BYTE jumps here when the source is exhausted.  */
3175  no_more_source:
3176   detect_info->rejected |= rejected;
3177   detect_info->found |= (found & ~rejected);
3178   return 1;
3179 }
3180
3181
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   When FINAL is outside '0'..127, when ISO_CHARSET_TABLE has no
   charset for the triple, or when the charset fails SAFE_CHARSET_P
   for CODING, the register is marked with -2 and CHARS_96 is set to
   -1, which tells the caller that the escape sequence should be
   kept.  CHARS_96 is also set to -1 when ASCII is designated to a
   register whose previous designation was such an invalid one.

   CHARS_96 must be an lvalue.  `coding' is taken from the expansion
   site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id = -1, old_id;                                            \
    int usable = (final >= '0' && final < 128);                         \
                                                                        \
    if (usable)                                                         \
      {                                                                 \
        new_id = ISO_CHARSET_TABLE (dim, chars_96, final);              \
        usable = new_id >= 0 && SAFE_CHARSET_P (coding, new_id);        \
      }                                                                 \
    if (! usable)                                                       \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        old_id = CODING_ISO_DESIGNATION (coding, reg);                  \
        /* Optionally substitute ASCII for JISX0201-Roman and           \
           JISX0208 for JISX0208-1978, as the coding flags request.  */ \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              new_id = charset_jisx0208;                                \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* An ASCII designation that follows an invalid designation     \
           to the same register must be kept in the output too.  */     \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3214
3215
/* Handle these composition sequences (ALT: alternate char):
3217
3218    (1) relative composition: ESC 0 CHAR ... ESC 1
3219    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3220    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3221    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3222
3223    When the start sequence (ESC 0/2/3/4) is found, this annotation
3224    header is produced.
3225
3226         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3227
3228    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3229    produced until the end sequence (ESC 1) is found:
3230
3231    (1) CHAR ... CHAR
3232    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3233    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3234    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3235
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3238
3239    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3240    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3241    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3242    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3243
   If an error is found while composing, the five elements of the
   annotation header are changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3248
3249    and the sequence [ -2 DECODED-RULE ] is changed to the original
3250    byte sequence as below:
3251         o the original byte sequence is B: [ B -1 ]
3252         o the original byte sequence is B1 B2: [ B1 B2 ]
3253    and the sequence [ -1 -1 ] is changed to the original byte
3254    sequence:
3255         [ ESC '0' ]
3256 */
3257
/* Decode the composition rule whose first byte is C1 and store the
   encoded rule in RULE (which must be an int lvalue).

   C1 - 32 in the range 0..80 is a complete rule in the old one-byte
   format.  Any larger value starts the new two-byte format: one more
   byte is read from the source with ONE_MORE_BYTE, and 0x100 is
   added to the encoded rule so that the two formats stay
   distinguishable.  If the rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte;                                                \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81,                 \
                                             second_byte - 32))         \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);   \
        rule += 0x100;   /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int old_gref = (rule) / 9;                                      \
        int old_nref = (rule) % 9;                                      \
                                                                        \
        /* The old format's reference point 4 is 10 in the encoded      \
           rule.  */                                                    \
        if (old_gref == 4)                                              \
          old_gref = 10;                                                \
        if (old_nref == 4)                                              \
          old_nref = 10;                                                \
        rule = COMPOSITION_ENCODE_RULE (old_gref, old_nref);            \
      }                                                                 \
  } while (0)
3286
/* Convert the encoded composition rule RULE back into the one or two
   source bytes it was decoded from, store them in charbuf[idx] and
   charbuf[idx + 1], and add the number of bytes restored to
   new_chars.

   A rule below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx] and charbuf[idx + 1] is filled with -1.  Otherwise the
   rule is in the new two-byte format.

   RULE is evaluated exactly once, before charbuf is written, so it
   may be any expression, including charbuf[idx + 1] itself.
   `charbuf', `idx' and `new_chars' are taken from the expansion
   site.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded_rule = (rule);                                  \
    int gref = (encoded_rule % 0x100) / 12;                     \
    int nref = (encoded_rule % 0x100) % 12;                     \
                                                                \
    if (encoded_rule < 0x100)   /* old format */                \
      {                                                         \
        /* Reference point 10 is written as 4 in the old        \
           format.  */                                          \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3306
3307 /* Finish the current composition as invalid.  */
3308
3309 static int finish_composition (int *, struct composition_status *);
3310
3311 static int
3312 finish_composition (int *charbuf, struct composition_status *cmp_status)
3313 {
3314   int idx = - cmp_status->length;
3315   int new_chars;
3316
3317   /* Recover the original ESC sequence */
3318   charbuf[idx++] = ISO_CODE_ESC;
3319   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3320                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3321                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3322                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3323                     : '4');
3324   charbuf[idx++] = -2;
3325   charbuf[idx++] = 0;
3326   charbuf[idx++] = -1;
3327   new_chars = cmp_status->nchars;
3328   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3329     for (; idx < 0; idx++)
3330       {
3331         int elt = charbuf[idx];
3332
3333         if (elt == -2)
3334           {
3335             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3336             idx++;
3337           }
3338         else if (elt == -1)
3339           {
3340             charbuf[idx++] = ISO_CODE_ESC;
3341             charbuf[idx] = '0';
3342             new_chars += 2;
3343           }
3344       }
3345   cmp_status->state = COMPOSING_NO;
3346   return new_chars;
3347 }
3348
/* Do nothing unless a composition is in progress.  Otherwise finish
   it as invalid with finish_composition and advance char_offset by
   the number of characters that call reports.  `cmp_status',
   `charbuf' and `char_offset' are taken from the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3355
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the components of an ESC 3 or ESC 4
   composition are being read does not start a new composition; it
   is recorded as [ -1 -1 ] and the state changes to COMPOSING_CHAR.

   In every other case any composition in progress is finished as
   invalid, and a new annotation header is produced by
   ADD_COMPOSITION_DATA (called with 0, 0 and the method), after
   which cmp_status->length restarts at MAX_ANNOTATION_LENGTH.
   NOTE(review): this comment used to give the header as four
   elements, while the description of composition sequences earlier
   in this file gives five -- confirm against ADD_COMPOSITION_DATA.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 == '0'                                                          \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the components; the composed chars follow.  */           \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
        break;                                                             \
      }                                                                    \
                                                                           \
    MAYBE_FINISH_COMPOSITION ();                                           \
    switch (c1)                                                            \
      {                                                                    \
      case '0':                                                            \
        cmp_status->method = COMPOSITION_RELATIVE;                         \
        break;                                                             \
      case '2':                                                            \
        cmp_status->method = COMPOSITION_WITH_RULE;                        \
        break;                                                             \
      case '3':                                                            \
        cmp_status->method = COMPOSITION_WITH_ALTCHARS;                    \
        break;                                                             \
      default:                                                             \
        cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;               \
        break;                                                             \
      }                                                                    \
    if (c1 <= '2')                                                         \
      cmp_status->state = COMPOSING_CHAR;                                  \
    else                                                                   \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                        \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);              \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                            \
    cmp_status->ncomps = 0;                                                \
    cmp_status->nchars = 0;                                                \
    coding->annotated = 1;                                                 \
  } while (0)
3396
3397
/* Handle composition end sequence ESC 1.

   The sequence is rejected when no character has been composed yet,
   or when the state does not fit the method: the state must be
   COMPOSING_CHAR for every method except COMPOSITION_WITH_RULE, and
   must not be COMPOSING_CHAR for COMPOSITION_WITH_RULE.  On
   rejection the composition is finished as invalid and control goes
   to invalid_code.

   Otherwise the annotation header that starts cmp_status->length
   elements before charbuf is completed: its first element is
   decreased by the number of components plus 2 (altchar method) or
   by three times the number of components (alt&rule method), and
   its third element receives the number of composed characters.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *cmp_header;                                                    \
    int at_char = (cmp_status->state == COMPOSING_CHAR);                \
    int rule_method = (cmp_status->method == COMPOSITION_WITH_RULE);    \
                                                                        \
    if (cmp_status->nchars == 0 || at_char == rule_method)              \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3417
/* Append the marker -2 and then the composition rule RULE to
   charbuf, count both elements in cmp_status->length, and decrement
   cmp_status->state, undoing the increment STORE_COMPOSITION_CHAR
   applied after the preceding character.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->state--;                \
    cmp_status->length += 2;            \
  } while (0)
3427
/* Append C to charbuf and update cmp_status.  C counts as a composed
   character (nchars) when the state is COMPOSING_CHAR, and as a
   component (ncomps) otherwise.  The state is then incremented when
   a rule is expected next: always for COMPOSITION_WITH_RULE, and
   for COMPOSITION_WITH_RULE_ALTCHARS only while the state is
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3444
3445
3446 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3447
3448 static void
3449 decode_coding_iso_2022 (struct coding_system *coding)
3450 {
3451   const unsigned char *src = coding->source + coding->consumed;
3452   const unsigned char *src_end = coding->source + coding->src_bytes;
3453   const unsigned char *src_base;
3454   int *charbuf = coding->charbuf + coding->charbuf_used;
3455   /* We may produce two annotations (charset and composition) in one
3456      loop and one more charset annotation at the end.  */
3457   int *charbuf_end
3458     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3459   EMACS_INT consumed_chars = 0, consumed_chars_base;
3460   int multibytep = coding->src_multibyte;
3461   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3462   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3463   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3464   int charset_id_2, charset_id_3;
3465   struct charset *charset;
3466   int c;
3467   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3468   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3469   EMACS_INT char_offset = coding->produced_char;
3470   EMACS_INT last_offset = char_offset;
3471   int last_id = charset_ascii;
3472   int eol_dos =
3473     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3474   int byte_after_cr = -1;
3475   int i;
3476
3477   setup_iso_safe_charsets (attrs);
3478   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3479
3480   if (cmp_status->state != COMPOSING_NO)
3481     {
3482       for (i = 0; i < cmp_status->length; i++)
3483         *charbuf++ = cmp_status->carryover[i];
3484       coding->annotated = 1;
3485     }
3486
3487   while (1)
3488     {
3489       int c1, c2, c3;
3490
3491       src_base = src;
3492       consumed_chars_base = consumed_chars;
3493
3494       if (charbuf >= charbuf_end)
3495         {
3496           if (byte_after_cr >= 0)
3497             src_base--;
3498           break;
3499         }
3500
3501       if (byte_after_cr >= 0)
3502         c1 = byte_after_cr, byte_after_cr = -1;
3503       else
3504         ONE_MORE_BYTE (c1);
3505       if (c1 < 0)
3506         goto invalid_code;
3507
3508       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3509         {
3510           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3511           char_offset++;
3512           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3513           continue;
3514         }
3515
3516       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3517         {
3518           if (c1 == ISO_CODE_ESC)
3519             {
3520               if (src + 1 >= src_end)
3521                 goto no_more_source;
3522               *charbuf++ = ISO_CODE_ESC;
3523               char_offset++;
3524               if (src[0] == '%' && src[1] == '@')
3525                 {
3526                   src += 2;
3527                   consumed_chars += 2;
3528                   char_offset += 2;
3529                   /* We are sure charbuf can contain two more chars. */
3530                   *charbuf++ = '%';
3531                   *charbuf++ = '@';
3532                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3533                 }
3534             }
3535           else
3536             {
3537               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3538               char_offset++;
3539             }
3540           continue;
3541         }
3542
3543       if ((cmp_status->state == COMPOSING_RULE
3544            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3545           && c1 != ISO_CODE_ESC)
3546         {
3547           int rule;
3548
3549           DECODE_COMPOSITION_RULE (rule);
3550           STORE_COMPOSITION_RULE (rule);
3551           continue;
3552         }
3553
3554       /* We produce at most one character.  */
3555       switch (iso_code_class [c1])
3556         {
3557         case ISO_0x20_or_0x7F:
3558           if (charset_id_0 < 0
3559               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3560             /* This is SPACE or DEL.  */
3561             charset = CHARSET_FROM_ID (charset_ascii);
3562           else
3563             charset = CHARSET_FROM_ID (charset_id_0);
3564           break;
3565
3566         case ISO_graphic_plane_0:
3567           if (charset_id_0 < 0)
3568             charset = CHARSET_FROM_ID (charset_ascii);
3569           else
3570             charset = CHARSET_FROM_ID (charset_id_0);
3571           break;
3572
3573         case ISO_0xA0_or_0xFF:
3574           if (charset_id_1 < 0
3575               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3576               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3577             goto invalid_code;
3578           /* This is a graphic character, we fall down ... */
3579
3580         case ISO_graphic_plane_1:
3581           if (charset_id_1 < 0)
3582             goto invalid_code;
3583           charset = CHARSET_FROM_ID (charset_id_1);
3584           break;
3585
3586         case ISO_control_0:
3587           if (eol_dos && c1 == '\r')
3588             ONE_MORE_BYTE (byte_after_cr);
3589           MAYBE_FINISH_COMPOSITION ();
3590           charset = CHARSET_FROM_ID (charset_ascii);
3591           break;
3592
3593         case ISO_control_1:
3594           goto invalid_code;
3595
3596         case ISO_shift_out:
3597           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3598               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3599             goto invalid_code;
3600           CODING_ISO_INVOCATION (coding, 0) = 1;
3601           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602           continue;
3603
3604         case ISO_shift_in:
3605           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3606             goto invalid_code;
3607           CODING_ISO_INVOCATION (coding, 0) = 0;
3608           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3609           continue;
3610
3611         case ISO_single_shift_2_7:
3612           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3613             goto invalid_code;
3614         case ISO_single_shift_2:
3615           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3616             goto invalid_code;
3617           /* SS2 is handled as an escape sequence of ESC 'N' */
3618           c1 = 'N';
3619           goto label_escape_sequence;
3620
3621         case ISO_single_shift_3:
3622           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3623             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3625           c1 = 'O';
3626           goto label_escape_sequence;
3627
3628         case ISO_control_sequence_introducer:
3629           /* CSI is handled as an escape sequence of ESC '[' ...  */
3630           c1 = '[';
3631           goto label_escape_sequence;
3632
3633         case ISO_escape:
3634           ONE_MORE_BYTE (c1);
3635         label_escape_sequence:
3636           /* Escape sequences handled here are invocation,
3637              designation, direction specification, and character
3638              composition specification.  */
3639           switch (c1)
3640             {
3641             case '&':           /* revision of following character set */
3642               ONE_MORE_BYTE (c1);
3643               if (!(c1 >= '@' && c1 <= '~'))
3644                 goto invalid_code;
3645               ONE_MORE_BYTE (c1);
3646               if (c1 != ISO_CODE_ESC)
3647                 goto invalid_code;
3648               ONE_MORE_BYTE (c1);
3649               goto label_escape_sequence;
3650
3651             case '$':           /* designation of 2-byte character set */
3652               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3653                 goto invalid_code;
3654               {
3655                 int reg, chars96;
3656
3657                 ONE_MORE_BYTE (c1);
3658                 if (c1 >= '@' && c1 <= 'B')
3659                   {     /* designation of JISX0208.1978, GB2312.1980,
3660                            or JISX0208.1980 */
3661                     reg = 0, chars96 = 0;
3662                   }
3663                 else if (c1 >= 0x28 && c1 <= 0x2B)
3664                   { /* designation of DIMENSION2_CHARS94 character set */
3665                     reg = c1 - 0x28, chars96 = 0;
3666                     ONE_MORE_BYTE (c1);
3667                   }
3668                 else if (c1 >= 0x2C && c1 <= 0x2F)
3669                   { /* designation of DIMENSION2_CHARS96 character set */
3670                     reg = c1 - 0x2C, chars96 = 1;
3671                     ONE_MORE_BYTE (c1);
3672                   }
3673                 else
3674                   goto invalid_code;
3675                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3676                 /* We must update these variables now.  */
3677                 if (reg == 0)
3678                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3679                 else if (reg == 1)
3680                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3681                 if (chars96 < 0)
3682                   goto invalid_code;
3683               }
3684               continue;
3685
3686             case 'n':           /* invocation of locking-shift-2 */
3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3688                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3689                 goto invalid_code;
3690               CODING_ISO_INVOCATION (coding, 0) = 2;
3691               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3692               continue;
3693
3694             case 'o':           /* invocation of locking-shift-3 */
3695               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3696                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3697                 goto invalid_code;
3698               CODING_ISO_INVOCATION (coding, 0) = 3;
3699               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3700               continue;
3701
3702             case 'N':           /* invocation of single-shift-2 */
3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3704                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3705                 goto invalid_code;
3706               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3707               if (charset_id_2 < 0)
3708                 charset = CHARSET_FROM_ID (charset_ascii);
3709               else
3710                 charset = CHARSET_FROM_ID (charset_id_2);
3711               ONE_MORE_BYTE (c1);
3712               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3713                 goto invalid_code;
3714               break;
3715
3716             case 'O':           /* invocation of single-shift-3 */
3717               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3718                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3719                 goto invalid_code;
3720               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3721               if (charset_id_3 < 0)
3722                 charset = CHARSET_FROM_ID (charset_ascii);
3723               else
3724                 charset = CHARSET_FROM_ID (charset_id_3);
3725               ONE_MORE_BYTE (c1);
3726               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3727                 goto invalid_code;
3728               break;
3729
3730             case '0': case '2': case '3': case '4': /* start composition */
3731               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3732                 goto invalid_code;
3733               if (last_id != charset_ascii)
3734                 {
3735                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3736                   last_id = charset_ascii;
3737                   last_offset = char_offset;
3738                 }
3739               DECODE_COMPOSITION_START (c1);
3740               continue;
3741
3742             case '1':           /* end composition */
3743               if (cmp_status->state == COMPOSING_NO)
3744                 goto invalid_code;
3745               DECODE_COMPOSITION_END ();
3746               continue;
3747
3748             case '[':           /* specification of direction */
3749               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3750                 goto invalid_code;
3751               /* For the moment, nested direction is not supported.
3752                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3753                  left-to-right, and nonzero means right-to-left.  */
3754               ONE_MORE_BYTE (c1);
3755               switch (c1)
3756                 {
3757                 case ']':       /* end of the current direction */
3758                   coding->mode &= ~CODING_MODE_DIRECTION;
3759
3760                 case '0':       /* end of the current direction */
3761                 case '1':       /* start of left-to-right direction */
3762                   ONE_MORE_BYTE (c1);
3763                   if (c1 == ']')
3764                     coding->mode &= ~CODING_MODE_DIRECTION;
3765                   else
3766                     goto invalid_code;
3767                   break;
3768
3769                 case '2':       /* start of right-to-left direction */
3770                   ONE_MORE_BYTE (c1);
3771                   if (c1 == ']')
3772                     coding->mode |= CODING_MODE_DIRECTION;
3773                   else
3774                     goto invalid_code;
3775                   break;
3776
3777                 default:
3778                   goto invalid_code;
3779                 }
3780               continue;
3781
3782             case '%':
3783               ONE_MORE_BYTE (c1);
3784               if (c1 == '/')
3785                 {
3786                   /* CTEXT extended segment:
3787                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3788                      We keep these bytes as is for the moment.
3789                      They may be decoded by post-read-conversion.  */
3790                   int dim, M, L;
3791                   int size;
3792
3793                   ONE_MORE_BYTE (dim);
3794                   if (dim < '0' || dim > '4')
3795                     goto invalid_code;
3796                   ONE_MORE_BYTE (M);
3797                   if (M < 128)
3798                     goto invalid_code;
3799                   ONE_MORE_BYTE (L);
3800                   if (L < 128)
3801                     goto invalid_code;
3802                   size = ((M - 128) * 128) + (L - 128);
3803                   if (charbuf + 6 > charbuf_end)
3804                     goto break_loop;
3805                   *charbuf++ = ISO_CODE_ESC;
3806                   *charbuf++ = '%';
3807                   *charbuf++ = '/';
3808                   *charbuf++ = dim;
3809                   *charbuf++ = BYTE8_TO_CHAR (M);
3810                   *charbuf++ = BYTE8_TO_CHAR (L);
3811                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3812                 }
3813               else if (c1 == 'G')
3814                 {
3815                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3816                      ESC % G --UTF-8-BYTES-- ESC % @
3817                      We keep these bytes as is for the moment.
3818                      They may be decoded by post-read-conversion.  */
3819                   if (charbuf + 3 > charbuf_end)
3820                     goto break_loop;
3821                   *charbuf++ = ISO_CODE_ESC;
3822                   *charbuf++ = '%';
3823                   *charbuf++ = 'G';
3824                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3825                 }
3826               else
3827                 goto invalid_code;
3828               continue;
3829               break;
3830
3831             default:
3832               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3833                 goto invalid_code;
3834               {
3835                 int reg, chars96;
3836
3837                 if (c1 >= 0x28 && c1 <= 0x2B)
3838                   { /* designation of DIMENSION1_CHARS94 character set */
3839                     reg = c1 - 0x28, chars96 = 0;
3840                     ONE_MORE_BYTE (c1);
3841                   }
3842                 else if (c1 >= 0x2C && c1 <= 0x2F)
3843                   { /* designation of DIMENSION1_CHARS96 character set */
3844                     reg = c1 - 0x2C, chars96 = 1;
3845                     ONE_MORE_BYTE (c1);
3846                   }
3847                 else
3848                   goto invalid_code;
3849                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3850                 /* We must update these variables now.  */
3851                 if (reg == 0)
3852                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3853                 else if (reg == 1)
3854                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3855                 if (chars96 < 0)
3856                   goto invalid_code;
3857               }
3858               continue;
3859             }
3860           break;
3861
3862         default:
3863           abort ();
3864         }
3865
3866       if (cmp_status->state == COMPOSING_NO
3867           && charset->id != charset_ascii
3868           && last_id != charset->id)
3869         {
3870           if (last_id != charset_ascii)
3871             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3872           last_id = charset->id;
3873           last_offset = char_offset;
3874         }
3875
3876       /* Now we know CHARSET and 1st position code C1 of a character.
3877          Produce a decoded character while getting 2nd and 3rd
3878          position codes C2, C3 if necessary.  */
3879       if (CHARSET_DIMENSION (charset) > 1)
3880         {
3881           ONE_MORE_BYTE (c2);
3882           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3883               || ((c1 & 0x80) != (c2 & 0x80)))
3884             /* C2 is not in a valid range.  */
3885             goto invalid_code;
3886           if (CHARSET_DIMENSION (charset) == 2)
3887             c1 = (c1 << 8) | c2;
3888           else
3889             {
3890               ONE_MORE_BYTE (c3);
3891               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3892                   || ((c1 & 0x80) != (c3 & 0x80)))
3893                 /* C3 is not in a valid range.  */
3894                 goto invalid_code;
3895               c1 = (c1 << 16) | (c2 << 8) | c2;
3896             }
3897         }
3898       c1 &= 0x7F7F7F;
3899       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3900       if (c < 0)
3901         {
3902           MAYBE_FINISH_COMPOSITION ();
3903           for (; src_base < src; src_base++, char_offset++)
3904             {
3905               if (ASCII_BYTE_P (*src_base))
3906                 *charbuf++ = *src_base;
3907               else
3908                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3909             }
3910         }
3911       else if (cmp_status->state == COMPOSING_NO)
3912         {
3913           *charbuf++ = c;
3914           char_offset++;
3915         }
3916       else if ((cmp_status->state == COMPOSING_CHAR
3917                 ? cmp_status->nchars
3918                 : cmp_status->ncomps)
3919                >= MAX_COMPOSITION_COMPONENTS)
3920         {
3921           /* Too long composition.  */
3922           MAYBE_FINISH_COMPOSITION ();
3923           *charbuf++ = c;
3924           char_offset++;
3925         }
3926       else
3927         STORE_COMPOSITION_CHAR (c);
3928       continue;
3929
3930     invalid_code:
3931       MAYBE_FINISH_COMPOSITION ();
3932       src = src_base;
3933       consumed_chars = consumed_chars_base;
3934       ONE_MORE_BYTE (c);
3935       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3936       char_offset++;
3937       coding->errors++;
3938       continue;
3939
3940     break_loop:
3941       break;
3942     }
3943
3944  no_more_source:
3945   if (cmp_status->state != COMPOSING_NO)
3946     {
3947       if (coding->mode & CODING_MODE_LAST_BLOCK)
3948         MAYBE_FINISH_COMPOSITION ();
3949       else
3950         {
3951           charbuf -= cmp_status->length;
3952           for (i = 0; i < cmp_status->length; i++)
3953             cmp_status->carryover[i] = charbuf[i];
3954         }
3955     }
3956   else if (last_id != charset_ascii)
3957     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3958   coding->consumed_char += consumed_chars_base;
3959   coding->consumed = src_base - coding->source;
3960   coding->charbuf_used = charbuf - coding->charbuf;
3961 }
3962
3963
3964 /* ISO2022 encoding stuff.  */
3965
3966 /*
3967    It is not enough to say just "ISO2022" on encoding, we have to
3968    specify more details.  In Emacs, each coding system of ISO2022
3969    variant has the following specifications:
3970         1. Initial designation to G0 thru G3.
3971         2. Allows short-form designation?
3972         3. ASCII should be designated to G0 before control characters?
3973         4. ASCII should be designated to G0 at end of line?
3974         5. 7-bit environment or 8-bit environment?
3975         6. Use locking-shift?
3976         7. Use Single-shift?
3977    And the following two are only for Japanese:
3978         8. Use ASCII in place of JIS0201-1976-Roman?
3979         9. Use JISX0208-1983 in place of JISX0208-1978?
3980    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3981    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3982    details.
3983 */
3984
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, and record the new designation in CODING.

   The sequence is:
     - an optional revision prefix ESC '&' <'@' + revision>, emitted
       only when CODING has CODING_ISO_FLAG_REVISION set and the
       revision of CHARSET is not negative;
     - ESC;
     - '$' when the dimension of CHARSET is not 1;
     - an intermediate byte selecting REG, taken from "()*+" for a
       94-character set and from ",-./" for a 96-character set;
     - the final byte of CHARSET.
   For a multi-dimensional 94-character set designated to register 0
   whose final byte is '@', 'A' or 'B', the intermediate byte is
   omitted (short form) unless CODING has CODING_ISO_FLAG_LONG_FORM
   set.

   CHARSET and REG are evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      {                                                                 \
        int desig_revision = CHARSET_ISO_REVISION (charset);            \
                                                                        \
        if (desig_revision >= 0)                                        \
          {                                                             \
            EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                   \
            EMIT_ONE_BYTE ('@' + desig_revision);                       \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (",-./"[reg]);                            \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || ! (desig_final >= '@' && desig_final <= 'B'))       \
          EMIT_ONE_ASCII_BYTE ("()*+"[reg]);                            \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_inter = (CHARSET_ISO_CHARS_96 (charset)               \
                           ? ",-./"[reg] : "()*+"[reg]);                \
                                                                        \
        EMIT_ONE_ASCII_BYTE (desig_inter);                              \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4032
4033
4034 /* The following two macros produce codes (control character or escape
4035    sequence) for ISO2022 single-shift functions (single-shift-2 and
4036    single-shift-3).  */
4037
/* Emit single-shift-2: the byte ISO_CODE_SS2, or the two-byte form
   ESC 'N' when CODING_ISO_FLAG_SEVEN_BITS is set.  Afterwards mark the
   coding system as being in the single-shifting state.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4046
4047
/* Emit single-shift-3: the byte ISO_CODE_SS3, or the two-byte form
   ESC 'O' when CODING_ISO_FLAG_SEVEN_BITS is set.  Afterwards mark the
   coding system as being in the single-shifting state.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4056
4057
4058 /* The following four macros produce codes (control character or
4059    escape sequence) for ISO2022 locking-shift functions (shift-in,
4060    shift-out, locking-shift-2, and locking-shift-3).  */
4061
/* Emit the shift-in control ISO_CODE_SI and record that graphic
   register 0 is now the one invoked on graphic plane 0.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
4067
4068
/* Emit the shift-out control ISO_CODE_SO and record that graphic
   register 1 is now the one invoked on graphic plane 0.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
4074
4075
/* Emit locking-shift-2 (ESC 'n') and record that graphic register 2 is
   now the one invoked on graphic plane 0.  Refers to the variable
   `coding' of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
4081
4082
/* Emit locking-shift-3 (ESC 'o') and record that graphic register 3 is
   now the one invoked on graphic plane 0.  The final byte has to be
   'o': ESC 'n' is what ENCODE_LOCKING_SHIFT_2 emits, so using it here
   would announce register 2 in the byte stream while the encoder state
   says register 3.  Refers to the variable `coding' of the expansion
   site.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4088
4089
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set and
   CHARSET is ASCII, it is replaced by the JISX0201-Roman charset.
   The macro refers to the variables `coding', `dst' and
   `produced_chars' of the expansion site.

   C1 is parenthesized at every use (as the DIMENSION2 variant does
   with its position-codes), so an argument such as `a | b' is masked
   as a whole.

   The body loops: if CHARSET is not invoked on either graphic plane,
   encode_invocation_designation is called and the tests are repeated
   until one of them emits the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers exactly this      \
           one character.  */                                           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4132
4133
/* Emit a two-byte character of CHARSET whose position-codes are C1 and
   C2, first producing whatever designation and invocation sequences
   are needed to make CHARSET usable.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_OLDJIS is set
   and CHARSET is JISX0208, it is replaced by the JISX0208-1978
   charset.  The macro refers to the variables `coding', `dst' and
   `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        int dim2_id = CHARSET_ID (charset);                             \
                                                                        \
        if (dim2_id == charset_jisx0208                                 \
            && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)) \
          {                                                             \
            dim2_id = charset_jisx0208_1978;                            \
            charset = CHARSET_FROM_ID (dim2_id);                        \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A pending single-shift applies to this character         \
               only; clear the state once it is emitted.  */            \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            else                                                        \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 0))          \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 1))          \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is on neither graphic plane yet.  Invoke it          \
           (designating it to a register first if needed), then go      \
           around again to emit the character itself.  */               \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4176
4177
/* Encode character C in CHARSET.  The code point given by ENCODE_CHAR
   is passed whole to ENCODE_ISO_CHARACTER_DIMENSION1 when the
   dimension of CHARSET is 1; otherwise it is split into the byte above
   the low eight bits and the low eight bits, which are passed to
   ENCODE_ISO_CHARACTER_DIMENSION2.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                  \
  do {                                                                    \
    int iso_code = ENCODE_CHAR ((charset), (c));                          \
                                                                          \
    if (CHARSET_DIMENSION (charset) != 1)                                 \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                         \
                                       iso_code >> 8, iso_code & 0xFF);   \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);              \
  } while (0)
4187
4188
4189 /* Produce designation and invocation codes at a place pointed by DST
4190    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4191    Return new DST.  */
4192
4193 static unsigned char *
4194 encode_invocation_designation (struct charset *charset,
4195                                struct coding_system *coding,
4196                                unsigned char *dst, EMACS_INT *p_nchars)
4197 {
4198   int multibytep = coding->dst_multibyte;
4199   EMACS_INT produced_chars = *p_nchars;
4200   int reg;                      /* graphic register number */
4201   int id = CHARSET_ID (charset);
4202
4203   /* At first, check designations.  */
4204   for (reg = 0; reg < 4; reg++)
4205     if (id == CODING_ISO_DESIGNATION (coding, reg))
4206       break;
4207
4208   if (reg >= 4)
4209     {
4210       /* CHARSET is not yet designated to any graphic registers.  */
4211       /* At first check the requested designation.  */
4212       reg = CODING_ISO_REQUEST (coding, id);
4213       if (reg < 0)
4214         /* Since CHARSET requests no special designation, designate it
4215            to graphic register 0.  */
4216         reg = 0;
4217
4218       ENCODE_DESIGNATION (charset, reg, coding);
4219     }
4220
4221   if (CODING_ISO_INVOCATION (coding, 0) != reg
4222       && CODING_ISO_INVOCATION (coding, 1) != reg)
4223     {
4224       /* Since the graphic register REG is not invoked to any graphic
4225          planes, invoke it to graphic plane 0.  */
4226       switch (reg)
4227         {
4228         case 0:                 /* graphic register 0 */
4229           ENCODE_SHIFT_IN;
4230           break;
4231
4232         case 1:                 /* graphic register 1 */
4233           ENCODE_SHIFT_OUT;
4234           break;
4235
4236         case 2:                 /* graphic register 2 */
4237           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4238             ENCODE_SINGLE_SHIFT_2;
4239           else
4240             ENCODE_LOCKING_SHIFT_2;
4241           break;
4242
4243         case 3:                 /* graphic register 3 */
4244           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4245             ENCODE_SINGLE_SHIFT_3;
4246           else
4247             ENCODE_LOCKING_SHIFT_3;
4248           break;
4249         }
4250     }
4251
4252   *p_nchars = produced_chars;
4253   return dst;
4254 }
4255
4256
/* Bring the graphic plane and registers back to their initial state:
   emit a shift-in if graphic plane 0 does not hold register 0, then,
   for every register that has an initial charset (a non-negative
   CODING_ISO_INITIAL) differing from its current designation, emit the
   designation of that initial charset.  Refers to the variable
   `coding' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg = 0;                                                  \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reset_reg < 4)                                               \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reset_reg) >= 0                 \
            && (CODING_ISO_INITIAL (coding, reset_reg)                  \
                != CODING_ISO_DESIGNATION (coding, reset_reg)))         \
          {                                                             \
            struct charset *reset_charset                               \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reset_reg)); \
                                                                        \
            ENCODE_DESIGNATION (reset_charset, reset_reg, coding);      \
          }                                                             \
        reset_reg++;                                                    \
      }                                                                 \
  } while (0)
4275
4276
4277 /* Produce designation sequences of charsets in the line started from
4278    SRC to a place pointed by DST, and return updated DST.
4279
4280    If the current block ends before any end-of-line, we may fail to
4281    find all the necessary designations.  */
4282
4283 static unsigned char *
4284 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4285                            unsigned char *dst)
4286 {
4287   struct charset *charset;
4288   /* Table of charsets to be designated to each graphic register.  */
4289   int r[4];
4290   int c, found = 0, reg;
4291   EMACS_INT produced_chars = 0;
4292   int multibytep = coding->dst_multibyte;
4293   Lisp_Object attrs;
4294   Lisp_Object charset_list;
4295
4296   attrs = CODING_ID_ATTRS (coding->id);
4297   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4298   if (EQ (charset_list, Qiso_2022))
4299     charset_list = Viso_2022_charset_list;
4300
4301   for (reg = 0; reg < 4; reg++)
4302     r[reg] = -1;
4303
4304   while (found < 4)
4305     {
4306       int id;
4307
4308       c = *charbuf++;
4309       if (c == '\n')
4310         break;
4311       charset = char_charset (c, charset_list, NULL);
4312       id = CHARSET_ID (charset);
4313       reg = CODING_ISO_REQUEST (coding, id);
4314       if (reg >= 0 && r[reg] < 0)
4315         {
4316           found++;
4317           r[reg] = id;
4318         }
4319     }
4320
4321   if (found)
4322     {
4323       for (reg = 0; reg < 4; reg++)
4324         if (r[reg] >= 0
4325             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4326           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4327     }
4328
4329   return dst;
4330 }
4331
4332 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4333
4334 static int
4335 encode_coding_iso_2022 (struct coding_system *coding)
4336 {
4337   int multibytep = coding->dst_multibyte;
4338   int *charbuf = coding->charbuf;
4339   int *charbuf_end = charbuf + coding->charbuf_used;
4340   unsigned char *dst = coding->destination + coding->produced;
4341   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4342   int safe_room = 16;
4343   int bol_designation
4344     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4345        && CODING_ISO_BOL (coding));
4346   EMACS_INT produced_chars = 0;
4347   Lisp_Object attrs, eol_type, charset_list;
4348   int ascii_compatible;
4349   int c;
4350   int preferred_charset_id = -1;
4351
4352   CODING_GET_INFO (coding, attrs, charset_list);
4353   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4354   if (VECTORP (eol_type))
4355     eol_type = Qunix;
4356
4357   setup_iso_safe_charsets (attrs);
4358   /* Charset list may have been changed.  */
4359   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4360   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4361
4362   ascii_compatible
4363     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4364        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4365                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4366
4367   while (charbuf < charbuf_end)
4368     {
4369       ASSURE_DESTINATION (safe_room);
4370
4371       if (bol_designation)
4372         {
4373           unsigned char *dst_prev = dst;
4374
4375           /* We have to produce designation sequences if any now.  */
4376           dst = encode_designation_at_bol (coding, charbuf, dst);
4377           bol_designation = 0;
4378           /* We are sure that designation sequences are all ASCII bytes.  */
4379           produced_chars += dst - dst_prev;
4380         }
4381
4382       c = *charbuf++;
4383
4384       if (c < 0)
4385         {
4386           /* Handle an annotation.  */
4387           switch (*charbuf)
4388             {
4389             case CODING_ANNOTATE_COMPOSITION_MASK:
4390               /* Not yet implemented.  */
4391               break;
4392             case CODING_ANNOTATE_CHARSET_MASK:
4393               preferred_charset_id = charbuf[2];
4394               if (preferred_charset_id >= 0
4395                   && NILP (Fmemq (make_number (preferred_charset_id),
4396                                   charset_list)))
4397                 preferred_charset_id = -1;
4398               break;
4399             default:
4400               abort ();
4401             }
4402           charbuf += -c - 1;
4403           continue;
4404         }
4405
4406       /* Now encode the character C.  */
4407       if (c < 0x20 || c == 0x7F)
4408         {
4409           if (c == '\n'
4410               || (c == '\r' && EQ (eol_type, Qmac)))
4411             {
4412               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4413                 ENCODE_RESET_PLANE_AND_REGISTER ();
4414               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4415                 {
4416                   int i;
4417
4418                   for (i = 0; i < 4; i++)
4419                     CODING_ISO_DESIGNATION (coding, i)
4420                       = CODING_ISO_INITIAL (coding, i);
4421                 }
4422               bol_designation
4423                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4424             }
4425           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4426             ENCODE_RESET_PLANE_AND_REGISTER ();
4427           EMIT_ONE_ASCII_BYTE (c);
4428         }
4429       else if (ASCII_CHAR_P (c))
4430         {
4431           if (ascii_compatible)
4432             EMIT_ONE_ASCII_BYTE (c);
4433           else
4434             {
4435               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4436               ENCODE_ISO_CHARACTER (charset, c);
4437             }
4438         }
4439       else if (CHAR_BYTE8_P (c))
4440         {
4441           c = CHAR_TO_BYTE8 (c);
4442           EMIT_ONE_BYTE (c);
4443         }
4444       else
4445         {
4446           struct charset *charset;
4447
4448           if (preferred_charset_id >= 0)
4449             {
4450               charset = CHARSET_FROM_ID (preferred_charset_id);
4451               if (! CHAR_CHARSET_P (c, charset))
4452                 charset = char_charset (c, charset_list, NULL);
4453             }
4454           else
4455             charset = char_charset (c, charset_list, NULL);
4456           if (!charset)
4457             {
4458               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4459                 {
4460                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4461                   charset = CHARSET_FROM_ID (charset_ascii);
4462                 }
4463               else
4464                 {
4465                   c = coding->default_char;
4466                   charset = char_charset (c, charset_list, NULL);
4467                 }
4468             }
4469           ENCODE_ISO_CHARACTER (charset, c);
4470         }
4471     }
4472
4473   if (coding->mode & CODING_MODE_LAST_BLOCK
4474       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4475     {
4476       ASSURE_DESTINATION (safe_room);
4477       ENCODE_RESET_PLANE_AND_REGISTER ();
4478     }
4479   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4480   CODING_ISO_BOL (coding) = bol_designation;
4481   coding->produced_char += produced_chars;
4482   coding->produced = dst - coding->destination;
4483   return 0;
4484 }
4485
4486 \f
/*** 8. SJIS and BIG5 handlers ***/
4488
4489 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4490    quite widely.  So, for the moment, Emacs supports them in the bare
4491    C code.  But, in the future, they may be supported only by CCL.  */
4492
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   by "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4509
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4522
4523 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4524    Check if a text is encoded in SJIS.  If it is, return
4525    CATEGORY_MASK_SJIS, else return 0.  */
4526
4527 static int
4528 detect_coding_sjis (struct coding_system *coding,
4529                     struct coding_detection_info *detect_info)
4530 {
4531   const unsigned char *src = coding->source, *src_base;
4532   const unsigned char *src_end = coding->source + coding->src_bytes;
4533   int multibytep = coding->src_multibyte;
4534   EMACS_INT consumed_chars = 0;
4535   int found = 0;
4536   int c;
4537   Lisp_Object attrs, charset_list;
4538   int max_first_byte_of_2_byte_code;
4539
4540   CODING_GET_INFO (coding, attrs, charset_list);
4541   max_first_byte_of_2_byte_code
4542     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4543
4544   detect_info->checked |= CATEGORY_MASK_SJIS;
4545   /* A coding system of this category is always ASCII compatible.  */
4546   src += coding->head_ascii;
4547
4548   while (1)
4549     {
4550       src_base = src;
4551       ONE_MORE_BYTE (c);
4552       if (c < 0x80)
4553         continue;
4554       if ((c >= 0x81 && c <= 0x9F)
4555           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4556         {
4557           ONE_MORE_BYTE (c);
4558           if (c < 0x40 || c == 0x7F || c > 0xFC)
4559             break;
4560           found = CATEGORY_MASK_SJIS;
4561         }
4562       else if (c >= 0xA0 && c < 0xE0)
4563         found = CATEGORY_MASK_SJIS;
4564       else
4565         break;
4566     }
4567   detect_info->rejected |= CATEGORY_MASK_SJIS;
4568   return 0;
4569
4570  no_more_source:
4571   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4572     {
4573       detect_info->rejected |= CATEGORY_MASK_SJIS;
4574       return 0;
4575     }
4576   detect_info->found |= found;
4577   return 1;
4578 }
4579
4580 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581    Check if a text is encoded in BIG5.  If it is, return
4582    CATEGORY_MASK_BIG5, else return 0.  */
4583
4584 static int
4585 detect_coding_big5 (struct coding_system *coding,
4586                     struct coding_detection_info *detect_info)
4587 {
4588   const unsigned char *src = coding->source, *src_base;
4589   const unsigned char *src_end = coding->source + coding->src_bytes;
4590   int multibytep = coding->src_multibyte;
4591   EMACS_INT consumed_chars = 0;
4592   int found = 0;
4593   int c;
4594
4595   detect_info->checked |= CATEGORY_MASK_BIG5;
4596   /* A coding system of this category is always ASCII compatible.  */
4597   src += coding->head_ascii;
4598
4599   while (1)
4600     {
4601       src_base = src;
4602       ONE_MORE_BYTE (c);
4603       if (c < 0x80)
4604         continue;
4605       if (c >= 0xA1)
4606         {
4607           ONE_MORE_BYTE (c);
4608           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4609             return 0;
4610           found = CATEGORY_MASK_BIG5;
4611         }
4612       else
4613         break;
4614     }
4615   detect_info->rejected |= CATEGORY_MASK_BIG5;
4616   return 0;
4617
4618  no_more_source:
4619   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4620     {
4621       detect_info->rejected |= CATEGORY_MASK_BIG5;
4622       return 0;
4623     }
4624   detect_info->found |= found;
4625   return 1;
4626 }
4627
4628 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4629    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4630
4631 static void
4632 decode_coding_sjis (struct coding_system *coding)
4633 {
4634   const unsigned char *src = coding->source + coding->consumed;
4635   const unsigned char *src_end = coding->source + coding->src_bytes;
4636   const unsigned char *src_base;
4637   int *charbuf = coding->charbuf + coding->charbuf_used;
4638   /* We may produce one charset annotation in one loop and one more at
4639      the end.  */
4640   int *charbuf_end
4641     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4642   EMACS_INT consumed_chars = 0, consumed_chars_base;
4643   int multibytep = coding->src_multibyte;
4644   struct charset *charset_roman, *charset_kanji, *charset_kana;
4645   struct charset *charset_kanji2;
4646   Lisp_Object attrs, charset_list, val;
4647   EMACS_INT char_offset = coding->produced_char;
4648   EMACS_INT last_offset = char_offset;
4649   int last_id = charset_ascii;
4650   int eol_dos =
4651     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4652   int byte_after_cr = -1;
4653
4654   CODING_GET_INFO (coding, attrs, charset_list);
4655
4656   val = charset_list;
4657   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4658   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4659   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4660   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4661
4662   while (1)
4663     {
4664       int c, c1;
4665       struct charset *charset;
4666
4667       src_base = src;
4668       consumed_chars_base = consumed_chars;
4669
4670       if (charbuf >= charbuf_end)
4671         {
4672           if (byte_after_cr >= 0)
4673             src_base--;
4674           break;
4675         }
4676
4677       if (byte_after_cr >= 0)
4678         c = byte_after_cr, byte_after_cr = -1;
4679       else
4680         ONE_MORE_BYTE (c);
4681       if (c < 0)
4682         goto invalid_code;
4683       if (c < 0x80)
4684         {
4685           if (eol_dos && c == '\r')
4686             ONE_MORE_BYTE (byte_after_cr);
4687           charset = charset_roman;
4688         }
4689       else if (c == 0x80 || c == 0xA0)
4690         goto invalid_code;
4691       else if (c >= 0xA1 && c <= 0xDF)
4692         {
4693           /* SJIS -> JISX0201-Kana */
4694           c &= 0x7F;
4695           charset = charset_kana;
4696         }
4697       else if (c <= 0xEF)
4698         {
4699           /* SJIS -> JISX0208 */
4700           ONE_MORE_BYTE (c1);
4701           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4702             goto invalid_code;
4703           c = (c << 8) | c1;
4704           SJIS_TO_JIS (c);
4705           charset = charset_kanji;
4706         }
4707       else if (c <= 0xFC && charset_kanji2)
4708         {
4709           /* SJIS -> JISX0213-2 */
4710           ONE_MORE_BYTE (c1);
4711           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4712             goto invalid_code;
4713           c = (c << 8) | c1;
4714           SJIS_TO_JIS2 (c);
4715           charset = charset_kanji2;
4716         }
4717       else
4718         goto invalid_code;
4719       if (charset->id != charset_ascii
4720           && last_id != charset->id)
4721         {
4722           if (last_id != charset_ascii)
4723             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4724           last_id = charset->id;
4725           last_offset = char_offset;
4726         }
4727       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4728       *charbuf++ = c;
4729       char_offset++;
4730       continue;
4731
4732     invalid_code:
4733       src = src_base;
4734       consumed_chars = consumed_chars_base;
4735       ONE_MORE_BYTE (c);
4736       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4737       char_offset++;
4738       coding->errors++;
4739     }
4740
4741  no_more_source:
4742   if (last_id != charset_ascii)
4743     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4744   coding->consumed_char += consumed_chars_base;
4745   coding->consumed = src_base - coding->source;
4746   coding->charbuf_used = charbuf - coding->charbuf;
4747 }
4748
4749 static void
4750 decode_coding_big5 (struct coding_system *coding)
4751 {
4752   const unsigned char *src = coding->source + coding->consumed;
4753   const unsigned char *src_end = coding->source + coding->src_bytes;
4754   const unsigned char *src_base;
4755   int *charbuf = coding->charbuf + coding->charbuf_used;
4756   /* We may produce one charset annotation in one loop and one more at
4757      the end.  */
4758   int *charbuf_end
4759     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4760   EMACS_INT consumed_chars = 0, consumed_chars_base;
4761   int multibytep = coding->src_multibyte;
4762   struct charset *charset_roman, *charset_big5;
4763   Lisp_Object attrs, charset_list, val;
4764   EMACS_INT char_offset = coding->produced_char;
4765   EMACS_INT last_offset = char_offset;
4766   int last_id = charset_ascii;
4767   int eol_dos =
4768     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4769   int byte_after_cr = -1;
4770
4771   CODING_GET_INFO (coding, attrs, charset_list);
4772   val = charset_list;
4773   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4774   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4775
4776   while (1)
4777     {
4778       int c, c1;
4779       struct charset *charset;
4780
4781       src_base = src;
4782       consumed_chars_base = consumed_chars;
4783
4784       if (charbuf >= charbuf_end)
4785         {
4786           if (byte_after_cr >= 0)
4787             src_base--;
4788           break;
4789         }
4790
4791       if (byte_after_cr >= 0)
4792         c = byte_after_cr, byte_after_cr = -1;
4793       else
4794         ONE_MORE_BYTE (c);
4795
4796       if (c < 0)
4797         goto invalid_code;
4798       if (c < 0x80)
4799         {
4800           if (eol_dos && c == '\r')
4801             ONE_MORE_BYTE (byte_after_cr);
4802           charset = charset_roman;
4803         }
4804       else
4805         {
4806           /* BIG5 -> Big5 */
4807           if (c < 0xA1 || c > 0xFE)
4808             goto invalid_code;
4809           ONE_MORE_BYTE (c1);
4810           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4811             goto invalid_code;
4812           c = c << 8 | c1;
4813           charset = charset_big5;
4814         }
4815       if (charset->id != charset_ascii
4816           && last_id != charset->id)
4817         {
4818           if (last_id != charset_ascii)
4819             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4820           last_id = charset->id;
4821           last_offset = char_offset;
4822         }
4823       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4824       *charbuf++ = c;
4825       char_offset++;
4826       continue;
4827
4828     invalid_code:
4829       src = src_base;
4830       consumed_chars = consumed_chars_base;
4831       ONE_MORE_BYTE (c);
4832       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4833       char_offset++;
4834       coding->errors++;
4835     }
4836
4837  no_more_source:
4838   if (last_id != charset_ascii)
4839     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4840   coding->consumed_char += consumed_chars_base;
4841   coding->consumed = src_base - coding->source;
4842   coding->charbuf_used = charbuf - coding->charbuf;
4843 }
4844
4845 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4846    This function can encode charsets `ascii', `katakana-jisx0201',
4847    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4848    are sure that all these charsets are registered as official charset
4849    (i.e. do not have extended leading-codes).  Characters of other
4850    charsets are produced without any encoding.  If SJIS_P is 1, encode
4851    SJIS text, else encode BIG5 text.  */
4852
4853 static int
4854 encode_coding_sjis (struct coding_system *coding)
4855 {
4856   int multibytep = coding->dst_multibyte;
4857   int *charbuf = coding->charbuf;
4858   int *charbuf_end = charbuf + coding->charbuf_used;
4859   unsigned char *dst = coding->destination + coding->produced;
4860   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4861   int safe_room = 4;
4862   EMACS_INT produced_chars = 0;
4863   Lisp_Object attrs, charset_list, val;
4864   int ascii_compatible;
4865   struct charset *charset_kanji, *charset_kana;
4866   struct charset *charset_kanji2;
4867   int c;
4868
4869   CODING_GET_INFO (coding, attrs, charset_list);
4870   val = XCDR (charset_list);
4871   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4872   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4873   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4874
4875   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4876
4877   while (charbuf < charbuf_end)
4878     {
4879       ASSURE_DESTINATION (safe_room);
4880       c = *charbuf++;
4881       /* Now encode the character C.  */
4882       if (ASCII_CHAR_P (c) && ascii_compatible)
4883         EMIT_ONE_ASCII_BYTE (c);
4884       else if (CHAR_BYTE8_P (c))
4885         {
4886           c = CHAR_TO_BYTE8 (c);
4887           EMIT_ONE_BYTE (c);
4888         }
4889       else
4890         {
4891           unsigned code;
4892           struct charset *charset = char_charset (c, charset_list, &code);
4893
4894           if (!charset)
4895             {
4896               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4897                 {
4898                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4899                   charset = CHARSET_FROM_ID (charset_ascii);
4900                 }
4901               else
4902                 {
4903                   c = coding->default_char;
4904                   charset = char_charset (c, charset_list, &code);
4905                 }
4906             }
4907           if (code == CHARSET_INVALID_CODE (charset))
4908             abort ();
4909           if (charset == charset_kanji)
4910             {
4911               int c1, c2;
4912               JIS_TO_SJIS (code);
4913               c1 = code >> 8, c2 = code & 0xFF;
4914               EMIT_TWO_BYTES (c1, c2);
4915             }
4916           else if (charset == charset_kana)
4917             EMIT_ONE_BYTE (code | 0x80);
4918           else if (charset_kanji2 && charset == charset_kanji2)
4919             {
4920               int c1, c2;
4921
4922               c1 = code >> 8;
4923               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4924                   || c1 == 0x28
4925                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4926                 {
4927                   JIS_TO_SJIS2 (code);
4928                   c1 = code >> 8, c2 = code & 0xFF;
4929                   EMIT_TWO_BYTES (c1, c2);
4930                 }
4931               else
4932                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4933             }
4934           else
4935             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4936         }
4937     }
4938   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4939   coding->produced_char += produced_chars;
4940   coding->produced = dst - coding->destination;
4941   return 0;
4942 }
4943
4944 static int
4945 encode_coding_big5 (struct coding_system *coding)
4946 {
4947   int multibytep = coding->dst_multibyte;
4948   int *charbuf = coding->charbuf;
4949   int *charbuf_end = charbuf + coding->charbuf_used;
4950   unsigned char *dst = coding->destination + coding->produced;
4951   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4952   int safe_room = 4;
4953   EMACS_INT produced_chars = 0;
4954   Lisp_Object attrs, charset_list, val;
4955   int ascii_compatible;
4956   struct charset *charset_big5;
4957   int c;
4958
4959   CODING_GET_INFO (coding, attrs, charset_list);
4960   val = XCDR (charset_list);
4961   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4962   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4963
4964   while (charbuf < charbuf_end)
4965     {
4966       ASSURE_DESTINATION (safe_room);
4967       c = *charbuf++;
4968       /* Now encode the character C.  */
4969       if (ASCII_CHAR_P (c) && ascii_compatible)
4970         EMIT_ONE_ASCII_BYTE (c);
4971       else if (CHAR_BYTE8_P (c))
4972         {
4973           c = CHAR_TO_BYTE8 (c);
4974           EMIT_ONE_BYTE (c);
4975         }
4976       else
4977         {
4978           unsigned code;
4979           struct charset *charset = char_charset (c, charset_list, &code);
4980
4981           if (! charset)
4982             {
4983               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4984                 {
4985                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4986                   charset = CHARSET_FROM_ID (charset_ascii);
4987                 }
4988               else
4989                 {
4990                   c = coding->default_char;
4991                   charset = char_charset (c, charset_list, &code);
4992                 }
4993             }
4994           if (code == CHARSET_INVALID_CODE (charset))
4995             abort ();
4996           if (charset == charset_big5)
4997             {
4998               int c1, c2;
4999
5000               c1 = code >> 8, c2 = code & 0xFF;
5001               EMIT_TWO_BYTES (c1, c2);
5002             }
5003           else
5004             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5005         }
5006     }
5007   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5008   coding->produced_char += produced_chars;
5009   coding->produced = dst - coding->destination;
5010   return 0;
5011 }
5012
5013 \f
5014 /*** 10. CCL handlers ***/
5015
5016 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5017    Check if a text is encoded in a coding system of which
5018    encoder/decoder are written in CCL program.  If it is, return
5019    CATEGORY_MASK_CCL, else return 0.  */
5020
5021 static int
5022 detect_coding_ccl (struct coding_system *coding,
5023                    struct coding_detection_info *detect_info)
5024 {
5025   const unsigned char *src = coding->source, *src_base;
5026   const unsigned char *src_end = coding->source + coding->src_bytes;
5027   int multibytep = coding->src_multibyte;
5028   EMACS_INT consumed_chars = 0;
5029   int found = 0;
5030   unsigned char *valids;
5031   EMACS_INT head_ascii = coding->head_ascii;
5032   Lisp_Object attrs;
5033
5034   detect_info->checked |= CATEGORY_MASK_CCL;
5035
5036   coding = &coding_categories[coding_category_ccl];
5037   valids = CODING_CCL_VALIDS (coding);
5038   attrs = CODING_ID_ATTRS (coding->id);
5039   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5040     src += head_ascii;
5041
5042   while (1)
5043     {
5044       int c;
5045
5046       src_base = src;
5047       ONE_MORE_BYTE (c);
5048       if (c < 0 || ! valids[c])
5049         break;
5050       if ((valids[c] > 1))
5051         found = CATEGORY_MASK_CCL;
5052     }
5053   detect_info->rejected |= CATEGORY_MASK_CCL;
5054   return 0;
5055
5056  no_more_source:
5057   detect_info->found |= found;
5058   return 1;
5059 }
5060
5061 static void
5062 decode_coding_ccl (struct coding_system *coding)
5063 {
5064   const unsigned char *src = coding->source + coding->consumed;
5065   const unsigned char *src_end = coding->source + coding->src_bytes;
5066   int *charbuf = coding->charbuf + coding->charbuf_used;
5067   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5068   EMACS_INT consumed_chars = 0;
5069   int multibytep = coding->src_multibyte;
5070   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5071   int source_charbuf[1024];
5072   int source_byteidx[1025];
5073   Lisp_Object attrs, charset_list;
5074
5075   CODING_GET_INFO (coding, attrs, charset_list);
5076
5077   while (1)
5078     {
5079       const unsigned char *p = src;
5080       int i = 0;
5081
5082       if (multibytep)
5083         {
5084           while (i < 1024 && p < src_end)
5085             {
5086               source_byteidx[i] = p - src;
5087               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5088             }
5089           source_byteidx[i] = p - src;
5090         }
5091       else
5092         while (i < 1024 && p < src_end)
5093           source_charbuf[i++] = *p++;
5094
5095       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5096         ccl->last_block = 1;
5097       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5098                   charset_list);
5099       charbuf += ccl->produced;
5100       if (multibytep)
5101         src += source_byteidx[ccl->consumed];
5102       else
5103         src += ccl->consumed;
5104       consumed_chars += ccl->consumed;
5105       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5106         break;
5107     }
5108
5109   switch (ccl->status)
5110     {
5111     case CCL_STAT_SUSPEND_BY_SRC:
5112       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5113       break;
5114     case CCL_STAT_SUSPEND_BY_DST:
5115       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5116       break;
5117     case CCL_STAT_QUIT:
5118     case CCL_STAT_INVALID_CMD:
5119       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5120       break;
5121     default:
5122       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5123       break;
5124     }
5125   coding->consumed_char += consumed_chars;
5126   coding->consumed = src - coding->source;
5127   coding->charbuf_used = charbuf - coding->charbuf;
5128 }
5129
5130 static int
5131 encode_coding_ccl (struct coding_system *coding)
5132 {
5133   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5134   int multibytep = coding->dst_multibyte;
5135   int *charbuf = coding->charbuf;
5136   int *charbuf_end = charbuf + coding->charbuf_used;
5137   unsigned char *dst = coding->destination + coding->produced;
5138   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5139   int destination_charbuf[1024];
5140   EMACS_INT produced_chars = 0;
5141   int i;
5142   Lisp_Object attrs, charset_list;
5143
5144   CODING_GET_INFO (coding, attrs, charset_list);
5145   if (coding->consumed_char == coding->src_chars
5146       && coding->mode & CODING_MODE_LAST_BLOCK)
5147     ccl->last_block = 1;
5148
5149   while (charbuf < charbuf_end)
5150     {
5151       ccl_driver (ccl, charbuf, destination_charbuf,
5152                   charbuf_end - charbuf, 1024, charset_list);
5153       if (multibytep)
5154         {
5155           ASSURE_DESTINATION (ccl->produced * 2);
5156           for (i = 0; i < ccl->produced; i++)
5157             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5158         }
5159       else
5160         {
5161           ASSURE_DESTINATION (ccl->produced);
5162           for (i = 0; i < ccl->produced; i++)
5163             *dst++ = destination_charbuf[i] & 0xFF;
5164           produced_chars += ccl->produced;
5165         }
5166       charbuf += ccl->consumed;
5167       if (ccl->status == CCL_STAT_QUIT
5168           || ccl->status == CCL_STAT_INVALID_CMD)
5169         break;
5170     }
5171
5172   switch (ccl->status)
5173     {
5174     case CCL_STAT_SUSPEND_BY_SRC:
5175       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5176       break;
5177     case CCL_STAT_SUSPEND_BY_DST:
5178       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5179       break;
5180     case CCL_STAT_QUIT:
5181     case CCL_STAT_INVALID_CMD:
5182       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5183       break;
5184     default:
5185       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5186       break;
5187     }
5188
5189   coding->produced_char += produced_chars;
5190   coding->produced = dst - coding->destination;
5191   return 0;
5192 }
5193
5194
5195 \f
5196 /*** 10, 11. no-conversion handlers ***/
5197
5198 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5199
5200 static void
5201 decode_coding_raw_text (struct coding_system *coding)
5202 {
5203   int eol_dos =
5204     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5205
5206   coding->chars_at_source = 1;
5207   coding->consumed_char = coding->src_chars;
5208   coding->consumed = coding->src_bytes;
5209   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5210     {
5211       coding->consumed_char--;
5212       coding->consumed--;
5213       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5214     }
5215   else
5216     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5217 }
5218
5219 static int
5220 encode_coding_raw_text (struct coding_system *coding)
5221 {
5222   int multibytep = coding->dst_multibyte;
5223   int *charbuf = coding->charbuf;
5224   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5225   unsigned char *dst = coding->destination + coding->produced;
5226   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5227   EMACS_INT produced_chars = 0;
5228   int c;
5229
5230   if (multibytep)
5231     {
5232       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5233
5234       if (coding->src_multibyte)
5235         while (charbuf < charbuf_end)
5236           {
5237             ASSURE_DESTINATION (safe_room);
5238             c = *charbuf++;
5239             if (ASCII_CHAR_P (c))
5240               EMIT_ONE_ASCII_BYTE (c);
5241             else if (CHAR_BYTE8_P (c))
5242               {
5243                 c = CHAR_TO_BYTE8 (c);
5244                 EMIT_ONE_BYTE (c);
5245               }
5246             else
5247               {
5248                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5249
5250                 CHAR_STRING_ADVANCE (c, p1);
5251                 do
5252                   {
5253                     EMIT_ONE_BYTE (*p0);
5254                     p0++;
5255                   }
5256                 while (p0 < p1);
5257               }
5258           }
5259       else
5260         while (charbuf < charbuf_end)
5261           {
5262             ASSURE_DESTINATION (safe_room);
5263             c = *charbuf++;
5264             EMIT_ONE_BYTE (c);
5265           }
5266     }
5267   else
5268     {
5269       if (coding->src_multibyte)
5270         {
5271           int safe_room = MAX_MULTIBYTE_LENGTH;
5272
5273           while (charbuf < charbuf_end)
5274             {
5275               ASSURE_DESTINATION (safe_room);
5276               c = *charbuf++;
5277               if (ASCII_CHAR_P (c))
5278                 *dst++ = c;
5279               else if (CHAR_BYTE8_P (c))
5280                 *dst++ = CHAR_TO_BYTE8 (c);
5281               else
5282                 CHAR_STRING_ADVANCE (c, dst);
5283             }
5284         }
5285       else
5286         {
5287           ASSURE_DESTINATION (charbuf_end - charbuf);
5288           while (charbuf < charbuf_end && dst < dst_end)
5289             *dst++ = *charbuf++;
5290         }
5291       produced_chars = dst - (coding->destination + coding->produced);
5292     }
5293   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5294   coding->produced_char += produced_chars;
5295   coding->produced = dst - coding->destination;
5296   return 0;
5297 }
5298
5299 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5300    Check if a text is encoded in a charset-based coding system.  If it
5301    is, return 1, else return 0.  */
5302
5303 static int
5304 detect_coding_charset (struct coding_system *coding,
5305                        struct coding_detection_info *detect_info)
5306 {
5307   const unsigned char *src = coding->source, *src_base;
5308   const unsigned char *src_end = coding->source + coding->src_bytes;
5309   int multibytep = coding->src_multibyte;
5310   EMACS_INT consumed_chars = 0;
5311   Lisp_Object attrs, valids, name;
5312   int found = 0;
5313   EMACS_INT head_ascii = coding->head_ascii;
5314   int check_latin_extra = 0;
5315
5316   detect_info->checked |= CATEGORY_MASK_CHARSET;
5317
5318   coding = &coding_categories[coding_category_charset];
5319   attrs = CODING_ID_ATTRS (coding->id);
5320   valids = AREF (attrs, coding_attr_charset_valids);
5321   name = CODING_ID_NAME (coding->id);
5322   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5323                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5324       || strncmp (SSDATA (SYMBOL_NAME (name)),
5325                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5326     check_latin_extra = 1;
5327
5328   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5329     src += head_ascii;
5330
5331   while (1)
5332     {
5333       int c;
5334       Lisp_Object val;
5335       struct charset *charset;
5336       int dim, idx;
5337
5338       src_base = src;
5339       ONE_MORE_BYTE (c);
5340       if (c < 0)
5341         continue;
5342       val = AREF (valids, c);
5343       if (NILP (val))
5344         break;
5345       if (c >= 0x80)
5346         {
5347           if (c < 0xA0
5348               && check_latin_extra
5349               && (!VECTORP (Vlatin_extra_code_table)
5350                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5351             break;
5352           found = CATEGORY_MASK_CHARSET;
5353         }
5354       if (INTEGERP (val))
5355         {
5356           charset = CHARSET_FROM_ID (XFASTINT (val));
5357           dim = CHARSET_DIMENSION (charset);
5358           for (idx = 1; idx < dim; idx++)
5359             {
5360               if (src == src_end)
5361                 goto too_short;
5362               ONE_MORE_BYTE (c);
5363               if (c < charset->code_space[(dim - 1 - idx) * 2]
5364                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5365                 break;
5366             }
5367           if (idx < dim)
5368             break;
5369         }
5370       else
5371         {
5372           idx = 1;
5373           for (; CONSP (val); val = XCDR (val))
5374             {
5375               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5376               dim = CHARSET_DIMENSION (charset);
5377               while (idx < dim)
5378                 {
5379                   if (src == src_end)
5380                     goto too_short;
5381                   ONE_MORE_BYTE (c);
5382                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5383                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5384                     break;
5385                   idx++;
5386                 }
5387               if (idx == dim)
5388                 {
5389                   val = Qnil;
5390                   break;
5391                 }
5392             }
5393           if (CONSP (val))
5394             break;
5395         }
5396     }
5397  too_short:
5398   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5399   return 0;
5400
5401  no_more_source:
5402   detect_info->found |= found;
5403   return 1;
5404 }
5405
/* Decode the not-yet-consumed bytes of CODING's source for a coding
   system of type `charset', appending the resulting characters (and
   charset annotations) to CODING->charbuf.

   The first byte of each code selects, through the
   `coding_attr_charset_valids' table of the coding system, either a
   single charset ID or a list of candidate charset IDs; further
   bytes are read according to the dimension of the charset being
   tried.  A byte sequence that cannot be decoded is passed through
   one byte at a time and counted in CODING->errors.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated to describe how far decoding
   got.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code currently being decoded; everything before it
     has been fully processed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  EMACS_INT consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  /* CHAR_OFFSET counts produced characters; LAST_OFFSET is the value
     it had when the current run of LAST_ID characters started.  */
  EMACS_INT char_offset = coding->produced_char;
  EMACS_INT last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_dos =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When decoding DOS end-of-line, the byte following a CR is read
     ahead and kept here (-1 means no byte is pending).  */
  int byte_after_cr = -1;

  /* Table indexed by the first byte of a code.  */
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      /* Number of bytes accumulated in CODE so far.  */
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  A byte read ahead after CR has not been
             processed yet, so do not report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Process the byte that was read ahead on the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
             presumably jumps to `no_more_source' when the source is
             exhausted -- confirm against the macro definition.  */
          ONE_MORE_BYTE (c);
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VAL is a charset ID, a list of charset IDs, or anything else
         when the byte cannot start a valid code.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* Exactly one charset can start with this byte: read the
             remaining bytes of the code and decode it.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* Bytes already read for a smaller charset are kept in
                 CODE; only the missing ones are read here.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* The first charset that yields a character wins.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* A run of a different non-ASCII charset starts here; emit
             the annotation for the run that just ended, if any.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending code and pass through
         just its first byte: a negative value is negated, an ASCII
         byte is kept as is, and any other byte is converted with
         BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Flush the annotation for the last non-ASCII run, then report
     progress up to the start of the last incomplete code.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5533
/* Encode the characters in CODING->charbuf for a coding system of
   type `charset', appending the resulting bytes to
   CODING->destination.

   Each character is emitted as one to four bytes, most significant
   byte first, using the code point it has in the first applicable
   charset of the coding system's charset list.  A character found in
   none of those charsets is replaced by a single byte (see below).

   Always records CODING_RESULT_SUCCESS, updates CODING->produced and
   CODING->produced_char, and returns 0.  */
static int
encode_coding_charset (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before encoding each
     character.  */
  int safe_room = MAX_MULTIBYTE_LENGTH;
  EMACS_INT produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      if (ascii_compatible && ASCII_CHAR_P (c))
        /* ASCII passes through unchanged in an ASCII compatible
           coding system.  */
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte character is emitted as that byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          /* Look up a charset of CHARSET_LIST for C; CODE receives
             the code point of C in that charset.  */
          charset = char_charset (c, charset_list, &code);
          if (charset)
            {
              /* Emit CODE most significant byte first, using as many
                 bytes as the charset's dimension.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* C cannot be encoded: emit a one-byte substitute,
                 which is CODING_INHIBIT_CHARACTER_SUBSTITUTION in
                 safe-encoding mode and the coding system's default
                 character otherwise.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5596
5597 \f
/*** 10. C library functions ***/
5599
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is used in its place.  If
   CODING_SYSTEM is invalid, signal an error.

   This initializes the mode, flags, safe-charset information and
   default character of CODING, and installs the detector, decoder
   and encoder functions that match the type of the coding system,
   together with any type-specific state.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* When eol conversion is inhibited, behave as if the coding system
     used Unix end-of-line.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* Flags implied by the end-of-line format: a vector means the
     format is not decided yet and must be detected; a format other
     than Unix requires conversion in both directions.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  /* Flags implied by the coding system's attributes.  */
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string with one element per
     charset ID.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;

  /* Install the handlers and the state specific to the type.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* Handled as raw text until a coding system is detected.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Set up the safe-charsets attribute again and reload the
             values cached from it above.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* The BOM attribute is a cons (detect the BOM), t (always with
         BOM), or anything else (without BOM).  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* Same interpretation of the BOM attribute as for UTF-8.  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Any endian attribute other than `big' selects little
         endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      /* NOTE(review): full_support is set to 1 here and again inside
         the conditional below, so it is 1 on every path; confirm
         whether one of the two assignments was meant to be 0.  */
      coding->spec.emacs_mule.full_support = 1;
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          /* Build a fresh safe-charsets string covering every
             charset in Vemacs_mule_charset_list: all elements start
             as 255, and those of the listed charsets are set to 0.  */
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
          coding->spec.emacs_mule.full_support = 1;
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Raw text needs conversion only for end-of-line handling:
         decoding for any non-Unix format, encoding only when the
         format is already decided.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5804
5805 /* Return a list of charsets supported by CODING.  */
5806
5807 Lisp_Object
5808 coding_charset_list (struct coding_system *coding)
5809 {
5810   Lisp_Object attrs, charset_list;
5811
5812   CODING_GET_INFO (coding, attrs, charset_list);
5813   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5814     {
5815       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5816
5817       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5818         charset_list = Viso_2022_charset_list;
5819     }
5820   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5821     {
5822       charset_list = Vemacs_mule_charset_list;
5823     }
5824   return charset_list;
5825 }
5826
5827
5828 /* Return a list of charsets supported by CODING-SYSTEM.  */
5829
5830 Lisp_Object
5831 coding_system_charset_list (Lisp_Object coding_system)
5832 {
5833   int id;
5834   Lisp_Object attrs, charset_list;
5835
5836   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5837   attrs = CODING_ID_ATTRS (id);
5838
5839   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5840     {
5841       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5842
5843       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5844         charset_list = Viso_2022_charset_list;
5845       else
5846         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5847     }
5848   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5849     {
5850       charset_list = Vemacs_mule_charset_list;
5851     }
5852   else
5853     {
5854       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5855     }
5856   return charset_list;
5857 }
5858
5859
5860 /* Return raw-text or one of its subsidiaries that has the same
5861    eol_type as CODING-SYSTEM.  */
5862
5863 Lisp_Object
5864 raw_text_coding_system (Lisp_Object coding_system)
5865 {
5866   Lisp_Object spec, attrs;
5867   Lisp_Object eol_type, raw_text_eol_type;
5868
5869   if (NILP (coding_system))
5870     return Qraw_text;
5871   spec = CODING_SYSTEM_SPEC (coding_system);
5872   attrs = AREF (spec, 0);
5873
5874   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5875     return coding_system;
5876
5877   eol_type = AREF (spec, 2);
5878   if (VECTORP (eol_type))
5879     return Qraw_text;
5880   spec = CODING_SYSTEM_SPEC (Qraw_text);
5881   raw_text_eol_type = AREF (spec, 2);
5882   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5883           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5884           : AREF (raw_text_eol_type, 2));
5885 }
5886
5887
5888 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5889    the subsidiary that has the same eol-spec as PARENT (if it is not
5890    nil and specifies end-of-line format) or the system's setting
5891    (system_eol_type).  */
5892
5893 Lisp_Object
5894 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5895 {
5896   Lisp_Object spec, eol_type;
5897
5898   if (NILP (coding_system))
5899     coding_system = Qraw_text;
5900   spec = CODING_SYSTEM_SPEC (coding_system);
5901   eol_type = AREF (spec, 2);
5902   if (VECTORP (eol_type))
5903     {
5904       Lisp_Object parent_eol_type;
5905
5906       if (! NILP (parent))
5907         {
5908           Lisp_Object parent_spec;
5909
5910           parent_spec = CODING_SYSTEM_SPEC (parent);
5911           parent_eol_type = AREF (parent_spec, 2);
5912           if (VECTORP (parent_eol_type))
5913             parent_eol_type = system_eol_type;
5914         }
5915       else
5916         parent_eol_type = system_eol_type;
5917       if (EQ (parent_eol_type, Qunix))
5918         coding_system = AREF (eol_type, 0);
5919       else if (EQ (parent_eol_type, Qdos))
5920         coding_system = AREF (eol_type, 1);
5921       else if (EQ (parent_eol_type, Qmac))
5922         coding_system = AREF (eol_type, 2);
5923     }
5924   return coding_system;
5925 }
5926
5927
5928 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5929    decided for writing to a process.  If not, complement them, and
5930    return a new coding system.  */
5931
5932 Lisp_Object
5933 complement_process_encoding_system (Lisp_Object coding_system)
5934 {
5935   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5936   Lisp_Object spec, attrs;
5937   int i;
5938
5939   for (i = 0; i < 3; i++)
5940     {
5941       if (i == 1)
5942         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5943       else if (i == 2)
5944         coding_system = preferred_coding_system ();
5945       spec = CODING_SYSTEM_SPEC (coding_system);
5946       if (NILP (spec))
5947         continue;
5948       attrs = AREF (spec, 0);
5949       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5950         coding_base = CODING_ATTR_BASE_NAME (attrs);
5951       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5952         eol_base = coding_system;
5953       if (! NILP (coding_base) && ! NILP (eol_base))
5954         break;
5955     }
5956
5957   if (i > 0)
5958     /* The original CODING_SYSTEM didn't specify text-conversion or
5959        eol-conversion.  Be sure that we return a fully complemented
5960        coding system.  */
5961     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5962   return coding_system;
5963 }
5964
5965
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
5971
5972    o coding-category-emacs-mule
5973
5974         The category for a coding system which has the same code range
5975         as Emacs' internal format.  Assigned the coding-system (Lisp
5976         symbol) `emacs-mule' by default.
5977
5978    o coding-category-sjis
5979
5980         The category for a coding system which has the same code range
5981         as SJIS.  Assigned the coding-system (Lisp
5982         symbol) `japanese-shift-jis' by default.
5983
5984    o coding-category-iso-7
5985
5986         The category for a coding system which has the same code range
5987         as ISO2022 of 7-bit environment.  This doesn't use any locking
5988         shift and single shift functions.  This can encode/decode all
5989         charsets.  Assigned the coding-system (Lisp symbol)
5990         `iso-2022-7bit' by default.
5991
5992    o coding-category-iso-7-tight
5993
5994         Same as coding-category-iso-7 except that this can
5995         encode/decode only the specified charsets.
5996
5997    o coding-category-iso-8-1
5998
5999         The category for a coding system which has the same code range
6000         as ISO2022 of 8-bit environment and graphic plane 1 used only
6001         for DIMENSION1 charset.  This doesn't use any locking shift
6002         and single shift functions.  Assigned the coding-system (Lisp
6003         symbol) `iso-latin-1' by default.
6004
6005    o coding-category-iso-8-2
6006
6007         The category for a coding system which has the same code range
6008         as ISO2022 of 8-bit environment and graphic plane 1 used only
6009         for DIMENSION2 charset.  This doesn't use any locking shift
6010         and single shift functions.  Assigned the coding-system (Lisp
6011         symbol) `japanese-iso-8bit' by default.
6012
6013    o coding-category-iso-7-else
6014
6015         The category for a coding system which has the same code range
6016         as ISO2022 of 7-bit environment but uses locking shift or
6017         single shift functions.  Assigned the coding-system (Lisp
6018         symbol) `iso-2022-7bit-lock' by default.
6019
6020    o coding-category-iso-8-else
6021
6022         The category for a coding system which has the same code range
6023         as ISO2022 of 8-bit environment but uses locking shift or
6024         single shift functions.  Assigned the coding-system (Lisp
6025         symbol) `iso-2022-8bit-ss2' by default.
6026
6027    o coding-category-big5
6028
6029         The category for a coding system which has the same code range
6030         as BIG5.  Assigned the coding-system (Lisp symbol)
6031         `cn-big5' by default.
6032
6033    o coding-category-utf-8
6034
6035         The category for a coding system which has the same code range
6036         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6037         symbol) `utf-8' by default.
6038
6039    o coding-category-utf-16-be
6040
6041         The category for a coding system in which a text has an
6042         Unicode signature (cf. Unicode Standard) in the order of BIG
6043         endian at the head.  Assigned the coding-system (Lisp symbol)
6044         `utf-16-be' by default.
6045
6046    o coding-category-utf-16-le
6047
6048         The category for a coding system in which a text has an
6049         Unicode signature (cf. Unicode Standard) in the order of
6050         LITTLE endian at the head.  Assigned the coding-system (Lisp
6051         symbol) `utf-16-le' by default.
6052
6053    o coding-category-ccl
6054
6055         The category for a coding system of which encoder/decoder is
6056         written in CCL programs.  The default value is nil, i.e., no
6057         coding system is assigned.
6058
6059    o coding-category-binary
6060
6061         The category for a coding system not categorized in any of the
6062         above.  Assigned the coding-system (Lisp symbol)
6063         `no-conversion' by default.
6064
6065    Each of them is a Lisp symbol and the value is an actual
6066    `coding-system's (this is also a Lisp symbol) assigned by a user.
6067    What Emacs does actually is to detect a category of coding system.
6068    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6069    decide only one possible category, it selects a category of the
6070    highest priority.  Priorities of categories are also specified by a
6071    user in a Lisp variable `coding-category-list'.
6072
6073 */
6074
6075 #define EOL_SEEN_NONE   0
6076 #define EOL_SEEN_LF     1
6077 #define EOL_SEEN_CR     2
6078 #define EOL_SEEN_CRLF   4
6079
6080 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6081    SOURCE is encoded.  If CATEGORY is one of
6082    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6083    two-byte, else they are encoded by one-byte.
6084
6085    Return one of EOL_SEEN_XXX.  */
6086
6087 #define MAX_EOL_CHECK_COUNT 3
6088
6089 static int
6090 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6091             enum coding_category category)
6092 {
6093   const unsigned char *src = source, *src_end = src + src_bytes;
6094   unsigned char c;
6095   int total  = 0;
6096   int eol_seen = EOL_SEEN_NONE;
6097
6098   if ((1 << category) & CATEGORY_MASK_UTF_16)
6099     {
6100       int msb, lsb;
6101
6102       msb = category == (coding_category_utf_16_le
6103                          | coding_category_utf_16_le_nosig);
6104       lsb = 1 - msb;
6105
6106       while (src + 1 < src_end)
6107         {
6108           c = src[lsb];
6109           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6110             {
6111               int this_eol;
6112
6113               if (c == '\n')
6114                 this_eol = EOL_SEEN_LF;
6115               else if (src + 3 >= src_end
6116                        || src[msb + 2] != 0
6117                        || src[lsb + 2] != '\n')
6118                 this_eol = EOL_SEEN_CR;
6119               else
6120                 {
6121                   this_eol = EOL_SEEN_CRLF;
6122                   src += 2;
6123                 }
6124
6125               if (eol_seen == EOL_SEEN_NONE)
6126                 /* This is the first end-of-line.  */
6127                 eol_seen = this_eol;
6128               else if (eol_seen != this_eol)
6129                 {
6130                   /* The found type is different from what found before.
6131                      Allow for stray ^M characters in DOS EOL files.  */
6132                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6133                       || (eol_seen == EOL_SEEN_CRLF
6134                           && this_eol == EOL_SEEN_CR))
6135                     eol_seen = EOL_SEEN_CRLF;
6136                   else
6137                     {
6138                       eol_seen = EOL_SEEN_LF;
6139                       break;
6140                     }
6141                 }
6142               if (++total == MAX_EOL_CHECK_COUNT)
6143                 break;
6144             }
6145           src += 2;
6146         }
6147     }
6148   else
6149     while (src < src_end)
6150       {
6151         c = *src++;
6152         if (c == '\n' || c == '\r')
6153           {
6154             int this_eol;
6155
6156             if (c == '\n')
6157               this_eol = EOL_SEEN_LF;
6158             else if (src >= src_end || *src != '\n')
6159               this_eol = EOL_SEEN_CR;
6160             else
6161               this_eol = EOL_SEEN_CRLF, src++;
6162
6163             if (eol_seen == EOL_SEEN_NONE)
6164               /* This is the first end-of-line.  */
6165               eol_seen = this_eol;
6166             else if (eol_seen != this_eol)
6167               {
6168                 /* The found type is different from what found before.
6169                    Allow for stray ^M characters in DOS EOL files.  */
6170                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6171                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6172                   eol_seen = EOL_SEEN_CRLF;
6173                 else
6174                   {
6175                     eol_seen = EOL_SEEN_LF;
6176                     break;
6177                   }
6178               }
6179             if (++total == MAX_EOL_CHECK_COUNT)
6180               break;
6181           }
6182       }
6183   return eol_seen;
6184 }
6185
6186
6187 static Lisp_Object
6188 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6189 {
6190   Lisp_Object eol_type;
6191
6192   eol_type = CODING_ID_EOL_TYPE (coding->id);
6193   if (eol_seen & EOL_SEEN_LF)
6194     {
6195       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6196       eol_type = Qunix;
6197     }
6198   else if (eol_seen & EOL_SEEN_CRLF)
6199     {
6200       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6201       eol_type = Qdos;
6202     }
6203   else if (eol_seen & EOL_SEEN_CR)
6204     {
6205       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6206       eol_type = Qmac;
6207     }
6208   return eol_type;
6209 }
6210
6211 /* Detect how a text specified in CODING is encoded.  If a coding
6212    system is detected, update fields of CODING by the detected coding
6213    system.  */
6214
6215 static void
6216 detect_coding (struct coding_system *coding)
6217 {
6218   const unsigned char *src, *src_end;
6219   int saved_mode = coding->mode;
6220
6221   coding->consumed = coding->consumed_char = 0;
6222   coding->produced = coding->produced_char = 0;
6223   coding_set_source (coding);
6224
6225   src_end = coding->source + coding->src_bytes;
6226   coding->head_ascii = 0;
6227
6228   /* If we have not yet decided the text encoding type, detect it
6229      now.  */
6230   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6231     {
6232       int c, i;
6233       struct coding_detection_info detect_info;
6234       int null_byte_found = 0, eight_bit_found = 0;
6235
6236       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6237       for (src = coding->source; src < src_end; src++)
6238         {
6239           c = *src;
6240           if (c & 0x80)
6241             {
6242               eight_bit_found = 1;
6243               if (null_byte_found)
6244                 break;
6245             }
6246           else if (c < 0x20)
6247             {
6248               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6249                   && ! inhibit_iso_escape_detection
6250                   && ! detect_info.checked)
6251                 {
6252                   if (detect_coding_iso_2022 (coding, &detect_info))
6253                     {
6254                       /* We have scanned the whole data.  */
6255                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6256                         {
6257                           /* We didn't find an 8-bit code.  We may
6258                              have found a null-byte, but it's very
6259                              rare that a binary file conforms to
6260                              ISO-2022.  */
6261                           src = src_end;
6262                           coding->head_ascii = src - coding->source;
6263                         }
6264                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6265                       break;
6266                     }
6267                 }
6268               else if (! c && !inhibit_null_byte_detection)
6269                 {
6270                   null_byte_found = 1;
6271                   if (eight_bit_found)
6272                     break;
6273                 }
6274               if (! eight_bit_found)
6275                 coding->head_ascii++;
6276             }
6277           else if (! eight_bit_found)
6278             coding->head_ascii++;
6279         }
6280
6281       if (null_byte_found || eight_bit_found
6282           || coding->head_ascii < coding->src_bytes
6283           || detect_info.found)
6284         {
6285           enum coding_category category;
6286           struct coding_system *this;
6287
6288           if (coding->head_ascii == coding->src_bytes)
6289             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6290             for (i = 0; i < coding_category_raw_text; i++)
6291               {
6292                 category = coding_priorities[i];
6293                 this = coding_categories + category;
6294                 if (detect_info.found & (1 << category))
6295                   break;
6296               }
6297           else
6298             {
6299               if (null_byte_found)
6300                 {
6301                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6302                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6303                 }
6304               for (i = 0; i < coding_category_raw_text; i++)
6305                 {
6306                   category = coding_priorities[i];
6307                   this = coding_categories + category;
6308                   if (this->id < 0)
6309                     {
6310                       /* No coding system of this category is defined.  */
6311                       detect_info.rejected |= (1 << category);
6312                     }
6313                   else if (category >= coding_category_raw_text)
6314                     continue;
6315                   else if (detect_info.checked & (1 << category))
6316                     {
6317                       if (detect_info.found & (1 << category))
6318                         break;
6319                     }
6320                   else if ((*(this->detector)) (coding, &detect_info)
6321                            && detect_info.found & (1 << category))
6322                     {
6323                       if (category == coding_category_utf_16_auto)
6324                         {
6325                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6326                             category = coding_category_utf_16_le;
6327                           else
6328                             category = coding_category_utf_16_be;
6329                         }
6330                       break;
6331                     }
6332                 }
6333             }
6334
6335           if (i < coding_category_raw_text)
6336             setup_coding_system (CODING_ID_NAME (this->id), coding);
6337           else if (null_byte_found)
6338             setup_coding_system (Qno_conversion, coding);
6339           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6340                    == CATEGORY_MASK_ANY)
6341             setup_coding_system (Qraw_text, coding);
6342           else if (detect_info.rejected)
6343             for (i = 0; i < coding_category_raw_text; i++)
6344               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6345                 {
6346                   this = coding_categories + coding_priorities[i];
6347                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6348                   break;
6349                 }
6350         }
6351     }
6352   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6353            == coding_category_utf_8_auto)
6354     {
6355       Lisp_Object coding_systems;
6356       struct coding_detection_info detect_info;
6357
6358       coding_systems
6359         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6360       detect_info.found = detect_info.rejected = 0;
6361       coding->head_ascii = 0;
6362       if (CONSP (coding_systems)
6363           && detect_coding_utf_8 (coding, &detect_info))
6364         {
6365           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6366             setup_coding_system (XCAR (coding_systems), coding);
6367           else
6368             setup_coding_system (XCDR (coding_systems), coding);
6369         }
6370     }
6371   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6372            == coding_category_utf_16_auto)
6373     {
6374       Lisp_Object coding_systems;
6375       struct coding_detection_info detect_info;
6376
6377       coding_systems
6378         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6379       detect_info.found = detect_info.rejected = 0;
6380       coding->head_ascii = 0;
6381       if (CONSP (coding_systems)
6382           && detect_coding_utf_16 (coding, &detect_info))
6383         {
6384           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6385             setup_coding_system (XCAR (coding_systems), coding);
6386           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6387             setup_coding_system (XCDR (coding_systems), coding);
6388         }
6389     }
6390   coding->mode = saved_mode;
6391 }
6392
6393
6394 static void
6395 decode_eol (struct coding_system *coding)
6396 {
6397   Lisp_Object eol_type;
6398   unsigned char *p, *pbeg, *pend;
6399
6400   eol_type = CODING_ID_EOL_TYPE (coding->id);
6401   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6402     return;
6403
6404   if (NILP (coding->dst_object))
6405     pbeg = coding->destination;
6406   else
6407     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6408   pend = pbeg + coding->produced;
6409
6410   if (VECTORP (eol_type))
6411     {
6412       int eol_seen = EOL_SEEN_NONE;
6413
6414       for (p = pbeg; p < pend; p++)
6415         {
6416           if (*p == '\n')
6417             eol_seen |= EOL_SEEN_LF;
6418           else if (*p == '\r')
6419             {
6420               if (p + 1 < pend && *(p + 1) == '\n')
6421                 {
6422                   eol_seen |= EOL_SEEN_CRLF;
6423                   p++;
6424                 }
6425               else
6426                 eol_seen |= EOL_SEEN_CR;
6427             }
6428         }
6429       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6430       if ((eol_seen & EOL_SEEN_CRLF) != 0
6431           && (eol_seen & EOL_SEEN_CR) != 0
6432           && (eol_seen & EOL_SEEN_LF) == 0)
6433         eol_seen = EOL_SEEN_CRLF;
6434       else if (eol_seen != EOL_SEEN_NONE
6435           && eol_seen != EOL_SEEN_LF
6436           && eol_seen != EOL_SEEN_CRLF
6437           && eol_seen != EOL_SEEN_CR)
6438         eol_seen = EOL_SEEN_LF;
6439       if (eol_seen != EOL_SEEN_NONE)
6440         eol_type = adjust_coding_eol_type (coding, eol_seen);
6441     }
6442
6443   if (EQ (eol_type, Qmac))
6444     {
6445       for (p = pbeg; p < pend; p++)
6446         if (*p == '\r')
6447           *p = '\n';
6448     }
6449   else if (EQ (eol_type, Qdos))
6450     {
6451       EMACS_INT n = 0;
6452
6453       if (NILP (coding->dst_object))
6454         {
6455           /* Start deleting '\r' from the tail to minimize the memory
6456              movement.  */
6457           for (p = pend - 2; p >= pbeg; p--)
6458             if (*p == '\r')
6459               {
6460                 memmove (p, p + 1, pend-- - p - 1);
6461                 n++;
6462               }
6463         }
6464       else
6465         {
6466           EMACS_INT pos_byte = coding->dst_pos_byte;
6467           EMACS_INT pos = coding->dst_pos;
6468           EMACS_INT pos_end = pos + coding->produced_char - 1;
6469
6470           while (pos < pos_end)
6471             {
6472               p = BYTE_POS_ADDR (pos_byte);
6473               if (*p == '\r' && p[1] == '\n')
6474                 {
6475                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6476                   n++;
6477                   pos_end--;
6478                 }
6479               pos++;
6480               if (coding->dst_multibyte)
6481                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6482               else
6483                 pos_byte++;
6484             }
6485         }
6486       coding->produced -= n;
6487       coding->produced_char -= n;
6488     }
6489 }
6490
6491
6492 /* Return a translation table (or list of them) from coding system
6493    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6494    decoding (ENCODEP is zero). */
6495
6496 static Lisp_Object
6497 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6498 {
6499   Lisp_Object standard, translation_table;
6500   Lisp_Object val;
6501
6502   if (NILP (Venable_character_translation))
6503     {
6504       if (max_lookup)
6505         *max_lookup = 0;
6506       return Qnil;
6507     }
6508   if (encodep)
6509     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6510       standard = Vstandard_translation_table_for_encode;
6511   else
6512     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6513       standard = Vstandard_translation_table_for_decode;
6514   if (NILP (translation_table))
6515     translation_table = standard;
6516   else
6517     {
6518       if (SYMBOLP (translation_table))
6519         translation_table = Fget (translation_table, Qtranslation_table);
6520       else if (CONSP (translation_table))
6521         {
6522           translation_table = Fcopy_sequence (translation_table);
6523           for (val = translation_table; CONSP (val); val = XCDR (val))
6524             if (SYMBOLP (XCAR (val)))
6525               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6526         }
6527       if (CHAR_TABLE_P (standard))
6528         {
6529           if (CONSP (translation_table))
6530             translation_table = nconc2 (translation_table,
6531                                         Fcons (standard, Qnil));
6532           else
6533             translation_table = Fcons (translation_table,
6534                                        Fcons (standard, Qnil));
6535         }
6536     }
6537
6538   if (max_lookup)
6539     {
6540       *max_lookup = 1;
6541       if (CHAR_TABLE_P (translation_table)
6542           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6543         {
6544           val = XCHAR_TABLE (translation_table)->extras[1];
6545           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6546             *max_lookup = XFASTINT (val);
6547         }
6548       else if (CONSP (translation_table))
6549         {
6550           Lisp_Object tail;
6551
6552           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6553             if (CHAR_TABLE_P (XCAR (tail))
6554                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6555               {
6556                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6557                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6558                   *max_lookup = XFASTINT (tailval);
6559               }
6560         }
6561     }
6562   return translation_table;
6563 }
6564
/* Look up character C in TABLE, which is a char table or a list of
   char tables (anything else leaves C alone and sets TRANS to Qnil).

   If the entry found is a character, C is replaced by it and TRANS is
   set to Qnil; with a list, the lookup then continues in the next
   table using the new C.  If the entry is some other non-nil object,
   it is left in TRANS (with a list, the search stops there) for the
   caller to resolve, e.g. with get_translation.

   C and TRANS must be lvalues.  TABLE is evaluated exactly once, and
   the temporaries have names unlikely to clash with the caller's
   variables.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_ = (table);                                \
                                                                        \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        trans = CHAR_TABLE_REF (lookup_table_, c);                      \
        if (CHARACTERP (trans))                                         \
          c = XFASTINT (trans), trans = Qnil;                           \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              trans = CHAR_TABLE_REF (XCAR (lookup_tail_), c);          \
              if (CHARACTERP (trans))                                   \
                c = XFASTINT (trans), trans = Qnil;                     \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6589
6590
6591 /* Return a translation of character(s) at BUF according to TRANS.
6592    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6593    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6594    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6595    translation is found, and Qnil if not found..
6596    If BUF is too short to lookup characters in FROM, return Qt.  */
6597
6598 static Lisp_Object
6599 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6600 {
6601
6602   if (INTEGERP (trans))
6603     return trans;
6604   for (; CONSP (trans); trans = XCDR (trans))
6605     {
6606       Lisp_Object val = XCAR (trans);
6607       Lisp_Object from = XCAR (val);
6608       int len = ASIZE (from);
6609       int i;
6610
6611       for (i = 0; i < len; i++)
6612         {
6613           if (buf + i == buf_end)
6614             return Qt;
6615           if (XINT (AREF (from, i)) != buf[i])
6616             break;
6617         }
6618       if (i == len)
6619         return val;
6620     }
6621   return Qnil;
6622 }
6623
6624
6625 static int
6626 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6627                int last_block)
6628 {
6629   unsigned char *dst = coding->destination + coding->produced;
6630   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6631   EMACS_INT produced;
6632   EMACS_INT produced_chars = 0;
6633   int carryover = 0;
6634
6635   if (! coding->chars_at_source)
6636     {
6637       /* Source characters are in coding->charbuf.  */
6638       int *buf = coding->charbuf;
6639       int *buf_end = buf + coding->charbuf_used;
6640
6641       if (EQ (coding->src_object, coding->dst_object))
6642         {
6643           coding_set_source (coding);
6644           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6645         }
6646
6647       while (buf < buf_end)
6648         {
6649           int c = *buf, i;
6650
6651           if (c >= 0)
6652             {
6653               EMACS_INT from_nchars = 1, to_nchars = 1;
6654               Lisp_Object trans = Qnil;
6655
6656               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6657               if (! NILP (trans))
6658                 {
6659                   trans = get_translation (trans, buf, buf_end);
6660                   if (INTEGERP (trans))
6661                     c = XINT (trans);
6662                   else if (CONSP (trans))
6663                     {
6664                       from_nchars = ASIZE (XCAR (trans));
6665                       trans = XCDR (trans);
6666                       if (INTEGERP (trans))
6667                         c = XINT (trans);
6668                       else
6669                         {
6670                           to_nchars = ASIZE (trans);
6671                           c = XINT (AREF (trans, 0));
6672                         }
6673                     }
6674                   else if (EQ (trans, Qt) && ! last_block)
6675                     break;
6676                 }
6677
6678               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6679                 {
6680                   dst = alloc_destination (coding,
6681                                            buf_end - buf
6682                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6683                                            dst);
6684                   if (EQ (coding->src_object, coding->dst_object))
6685                     {
6686                       coding_set_source (coding);
6687                       dst_end = (((unsigned char *) coding->source)
6688                                  + coding->consumed);
6689                     }
6690                   else
6691                     dst_end = coding->destination + coding->dst_bytes;
6692                 }
6693
6694               for (i = 0; i < to_nchars; i++)
6695                 {
6696                   if (i > 0)
6697                     c = XINT (AREF (trans, i));
6698                   if (coding->dst_multibyte
6699                       || ! CHAR_BYTE8_P (c))
6700                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6701                   else
6702                     *dst++ = CHAR_TO_BYTE8 (c);
6703                 }
6704               produced_chars += to_nchars;
6705               buf += from_nchars;
6706             }
6707           else
6708             /* This is an annotation datum.  (-C) is the length.  */
6709             buf += -c;
6710         }
6711       carryover = buf_end - buf;
6712     }
6713   else
6714     {
6715       /* Source characters are at coding->source.  */
6716       const unsigned char *src = coding->source;
6717       const unsigned char *src_end = src + coding->consumed;
6718
6719       if (EQ (coding->dst_object, coding->src_object))
6720         dst_end = (unsigned char *) src;
6721       if (coding->src_multibyte != coding->dst_multibyte)
6722         {
6723           if (coding->src_multibyte)
6724             {
6725               int multibytep = 1;
6726               EMACS_INT consumed_chars = 0;
6727
6728               while (1)
6729                 {
6730                   const unsigned char *src_base = src;
6731                   int c;
6732
6733                   ONE_MORE_BYTE (c);
6734                   if (dst == dst_end)
6735                     {
6736                       if (EQ (coding->src_object, coding->dst_object))
6737                         dst_end = (unsigned char *) src;
6738                       if (dst == dst_end)
6739                         {
6740                           EMACS_INT offset = src - coding->source;
6741
6742                           dst = alloc_destination (coding, src_end - src + 1,
6743                                                    dst);
6744                           dst_end = coding->destination + coding->dst_bytes;
6745                           coding_set_source (coding);
6746                           src = coding->source + offset;
6747                           src_end = coding->source + coding->src_bytes;
6748                           if (EQ (coding->src_object, coding->dst_object))
6749                             dst_end = (unsigned char *) src;
6750                         }
6751                     }
6752                   *dst++ = c;
6753                   produced_chars++;
6754                 }
6755             no_more_source:
6756               ;
6757             }
6758           else
6759             while (src < src_end)
6760               {
6761                 int multibytep = 1;
6762                 int c = *src++;
6763
6764                 if (dst >= dst_end - 1)
6765                   {
6766                     if (EQ (coding->src_object, coding->dst_object))
6767                       dst_end = (unsigned char *) src;
6768                     if (dst >= dst_end - 1)
6769                       {
6770                         EMACS_INT offset = src - coding->source;
6771                         EMACS_INT more_bytes;
6772
6773                         if (EQ (coding->src_object, coding->dst_object))
6774                           more_bytes = ((src_end - src) / 2) + 2;
6775                         else
6776                           more_bytes = src_end - src + 2;
6777                         dst = alloc_destination (coding, more_bytes, dst);
6778                         dst_end = coding->destination + coding->dst_bytes;
6779                         coding_set_source (coding);
6780                         src = coding->source + offset;
6781                         src_end = coding->source + coding->src_bytes;
6782                         if (EQ (coding->src_object, coding->dst_object))
6783                           dst_end = (unsigned char *) src;
6784                       }
6785                   }
6786                 EMIT_ONE_BYTE (c);
6787               }
6788         }
6789       else
6790         {
6791           if (!EQ (coding->src_object, coding->dst_object))
6792             {
6793               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6794
6795               if (require > 0)
6796                 {
6797                   EMACS_INT offset = src - coding->source;
6798
6799                   dst = alloc_destination (coding, require, dst);
6800                   coding_set_source (coding);
6801                   src = coding->source + offset;
6802                   src_end = coding->source + coding->src_bytes;
6803                 }
6804             }
6805           produced_chars = coding->consumed_char;
6806           while (src < src_end)
6807             *dst++ = *src++;
6808         }
6809     }
6810
6811   produced = dst - (coding->destination + coding->produced);
6812   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6813     insert_from_gap (produced_chars, produced);
6814   coding->produced += produced;
6815   coding->produced_char += produced_chars;
6816   return carryover;
6817 }
6818
6819 /* Compose text in CODING->object according to the annotation data at
6820    CHARBUF.  CHARBUF is an array:
6821      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6822  */
6823
6824 static INLINE void
6825 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6826 {
6827   int len;
6828   EMACS_INT to;
6829   enum composition_method method;
6830   Lisp_Object components;
6831
6832   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6833   to = pos + charbuf[2];
6834   method = (enum composition_method) (charbuf[4]);
6835
6836   if (method == COMPOSITION_RELATIVE)
6837     components = Qnil;
6838   else
6839     {
6840       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6841       int i, j;
6842
6843       if (method == COMPOSITION_WITH_RULE)
6844         len = charbuf[2] * 3 - 2;
6845       charbuf += MAX_ANNOTATION_LENGTH;
6846       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6847       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6848         {
6849           if (charbuf[i] >= 0)
6850             args[j] = make_number (charbuf[i]);
6851           else
6852             {
6853               i++;
6854               args[j] = make_number (charbuf[i] % 0x100);
6855             }
6856         }
6857       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6858     }
6859   compose_text (pos, to, components, Qnil, coding->dst_object);
6860 }
6861
6862
6863 /* Put `charset' property on text in CODING->object according to
6864    the annotation data at CHARBUF.  CHARBUF is an array:
6865      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6866  */
6867
6868 static INLINE void
6869 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6870 {
6871   EMACS_INT from = pos - charbuf[2];
6872   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6873
6874   Fput_text_property (make_number (from), make_number (pos),
6875                       Qcharset, CHARSET_NAME (charset),
6876                       coding->dst_object);
6877 }
6878
6879
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its length (in
   elements) in CODING->charbuf_size.  The request starts at
   CHARBUF_SIZE elements and is halved after each null result while
   it is still above 1024 elements.  If no memory was obtained,
   record CODING_RESULT_INSUFFICIENT_MEM and make the calling function
   return CODING->result; so this macro may only be used in a function
   that returns such a value.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int nints;                                                          \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (nints = CHARBUF_SIZE; nints > 1024; nints >>= 1)               \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * nints);        \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = nints;                                       \
  } while (0)
6901
6902
6903 static void
6904 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6905 {
6906   int *charbuf = coding->charbuf;
6907   int *charbuf_end = charbuf + coding->charbuf_used;
6908
6909   if (NILP (coding->dst_object))
6910     return;
6911
6912   while (charbuf < charbuf_end)
6913     {
6914       if (*charbuf >= 0)
6915         pos++, charbuf++;
6916       else
6917         {
6918           int len = -*charbuf;
6919
6920           if (len > 2)
6921             switch (charbuf[1])
6922               {
6923               case CODING_ANNOTATE_COMPOSITION_MASK:
6924                 produce_composition (coding, charbuf, pos);
6925                 break;
6926               case CODING_ANNOTATE_CHARSET_MASK:
6927                 produce_charset (coding, charbuf, pos);
6928                 break;
6929               }
6930           charbuf += len;
6931         }
6932     }
6933 }
6934
6935 /* Decode the data at CODING->src_object into CODING->dst_object.
6936    CODING->src_object is a buffer, a string, or nil.
6937    CODING->dst_object is a buffer.
6938
6939    If CODING->src_object is a buffer, it must be the current buffer.
6940    In this case, if CODING->src_pos is positive, it is a position of
6941    the source text in the buffer, otherwise, the source text is in the
6942    gap area of the buffer, and CODING->src_pos specifies the offset of
6943    the text from GPT (which must be the same as PT).  If this is the
6944    same buffer as CODING->dst_object, CODING->src_pos must be
6945    negative.
6946
6947    If CODING->src_object is a string, CODING->src_pos is an index to
6948    that string.
6949
6950    If CODING->src_object is nil, CODING->source must already point to
6951    the non-relocatable memory area.  In this case, CODING->src_pos is
6952    an offset from CODING->source.
6953
6954    The decoded data is inserted at the current point of the buffer
6955    CODING->dst_object.
6956 */
6957
6958 static int
6959 decode_coding (struct coding_system *coding)
6960 {
6961   Lisp_Object attrs;
6962   Lisp_Object undo_list;
6963   Lisp_Object translation_table;
6964   struct ccl_spec cclspec;
6965   int carryover;
6966   int i;
6967
6968   if (BUFFERP (coding->src_object)
6969       && coding->src_pos > 0
6970       && coding->src_pos < GPT
6971       && coding->src_pos + coding->src_chars > GPT)
6972     move_gap_both (coding->src_pos, coding->src_pos_byte);
6973
6974   undo_list = Qt;
6975   if (BUFFERP (coding->dst_object))
6976     {
6977       if (current_buffer != XBUFFER (coding->dst_object))
6978         set_buffer_internal (XBUFFER (coding->dst_object));
6979       if (GPT != PT)
6980         move_gap_both (PT, PT_BYTE);
6981       undo_list = BVAR (current_buffer, undo_list);
6982       BVAR (current_buffer, undo_list) = Qt;
6983     }
6984
6985   coding->consumed = coding->consumed_char = 0;
6986   coding->produced = coding->produced_char = 0;
6987   coding->chars_at_source = 0;
6988   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6989   coding->errors = 0;
6990
6991   ALLOC_CONVERSION_WORK_AREA (coding);
6992
6993   attrs = CODING_ID_ATTRS (coding->id);
6994   translation_table = get_translation_table (attrs, 0, NULL);
6995
6996   carryover = 0;
6997   if (coding->decoder == decode_coding_ccl)
6998     {
6999       coding->spec.ccl = &cclspec;
7000       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7001     }
7002   do
7003     {
7004       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7005
7006       coding_set_source (coding);
7007       coding->annotated = 0;
7008       coding->charbuf_used = carryover;
7009       (*(coding->decoder)) (coding);
7010       coding_set_destination (coding);
7011       carryover = produce_chars (coding, translation_table, 0);
7012       if (coding->annotated)
7013         produce_annotation (coding, pos);
7014       for (i = 0; i < carryover; i++)
7015         coding->charbuf[i]
7016           = coding->charbuf[coding->charbuf_used - carryover + i];
7017     }
7018   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7019          || (coding->consumed < coding->src_bytes
7020              && (coding->result == CODING_RESULT_SUCCESS
7021                  || coding->result == CODING_RESULT_INVALID_SRC)));
7022
7023   if (carryover > 0)
7024     {
7025       coding_set_destination (coding);
7026       coding->charbuf_used = carryover;
7027       produce_chars (coding, translation_table, 1);
7028     }
7029
7030   coding->carryover_bytes = 0;
7031   if (coding->consumed < coding->src_bytes)
7032     {
7033       int nbytes = coding->src_bytes - coding->consumed;
7034       const unsigned char *src;
7035
7036       coding_set_source (coding);
7037       coding_set_destination (coding);
7038       src = coding->source + coding->consumed;
7039
7040       if (coding->mode & CODING_MODE_LAST_BLOCK)
7041         {
7042           /* Flush out unprocessed data as binary chars.  We are sure
7043              that the number of data is less than the size of
7044              coding->charbuf.  */
7045           coding->charbuf_used = 0;
7046           coding->chars_at_source = 0;
7047
7048           while (nbytes-- > 0)
7049             {
7050               int c = *src++;
7051
7052               if (c & 0x80)
7053                 c = BYTE8_TO_CHAR (c);
7054               coding->charbuf[coding->charbuf_used++] = c;
7055             }
7056           produce_chars (coding, Qnil, 1);
7057         }
7058       else
7059         {
7060           /* Record unprocessed bytes in coding->carryover.  We are
7061              sure that the number of data is less than the size of
7062              coding->carryover.  */
7063           unsigned char *p = coding->carryover;
7064
7065           if (nbytes > sizeof coding->carryover)
7066             nbytes = sizeof coding->carryover;
7067           coding->carryover_bytes = nbytes;
7068           while (nbytes-- > 0)
7069             *p++ = *src++;
7070         }
7071       coding->consumed = coding->src_bytes;
7072     }
7073
7074   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7075       && !inhibit_eol_conversion)
7076     decode_eol (coding);
7077   if (BUFFERP (coding->dst_object))
7078     {
7079       BVAR (current_buffer, undo_list) = undo_list;
7080       record_insert (coding->dst_pos, coding->produced_char);
7081     }
7082   return coding->result;
7083 }
7084
7085
7086 /* Extract an annotation datum from a composition starting at POS and
7087    ending before LIMIT of CODING->src_object (buffer or string), store
7088    the data in BUF, set *STOP to a starting position of the next
7089    composition (if any) or to LIMIT, and return the address of the
7090    next element of BUF.
7091
7092    If such an annotation is not found, set *STOP to a starting
7093    position of a composition after POS (if any) or to LIMIT, and
7094    return BUF.  */
7095
7096 static INLINE int *
7097 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7098                                struct coding_system *coding, int *buf,
7099                                EMACS_INT *stop)
7100 {
7101   EMACS_INT start, end;
7102   Lisp_Object prop;
7103
7104   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7105       || end > limit)
7106     *stop = limit;
7107   else if (start > pos)
7108     *stop = start;
7109   else
7110     {
7111       if (start == pos)
7112         {
7113           /* We found a composition.  Store the corresponding
7114              annotation data in BUF.  */
7115           int *head = buf;
7116           enum composition_method method = COMPOSITION_METHOD (prop);
7117           int nchars = COMPOSITION_LENGTH (prop);
7118
7119           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7120           if (method != COMPOSITION_RELATIVE)
7121             {
7122               Lisp_Object components;
7123               int len, i, i_byte;
7124
7125               components = COMPOSITION_COMPONENTS (prop);
7126               if (VECTORP (components))
7127                 {
7128                   len = XVECTOR (components)->size;
7129                   for (i = 0; i < len; i++)
7130                     *buf++ = XINT (AREF (components, i));
7131                 }
7132               else if (STRINGP (components))
7133                 {
7134                   len = SCHARS (components);
7135                   i = i_byte = 0;
7136                   while (i < len)
7137                     {
7138                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7139                       buf++;
7140                     }
7141                 }
7142               else if (INTEGERP (components))
7143                 {
7144                   len = 1;
7145                   *buf++ = XINT (components);
7146                 }
7147               else if (CONSP (components))
7148                 {
7149                   for (len = 0; CONSP (components);
7150                        len++, components = XCDR (components))
7151                     *buf++ = XINT (XCAR (components));
7152                 }
7153               else
7154                 abort ();
7155               *head -= len;
7156             }
7157         }
7158
7159       if (find_composition (end, limit, &start, &end, &prop,
7160                             coding->src_object)
7161           && end <= limit)
7162         *stop = start;
7163       else
7164         *stop = limit;
7165     }
7166   return buf;
7167 }
7168
7169
7170 /* Extract an annotation datum from a text property `charset' at POS of
7171    CODING->src_object (buffer of string), store the data in BUF, set
7172    *STOP to the position where the value of `charset' property changes
7173    (limiting by LIMIT), and return the address of the next element of
7174    BUF.
7175
7176    If the property value is nil, set *STOP to the position where the
7177    property value is non-nil (limiting by LIMIT), and return BUF.  */
7178
7179 static INLINE int *
7180 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7181                            struct coding_system *coding, int *buf,
7182                            EMACS_INT *stop)
7183 {
7184   Lisp_Object val, next;
7185   int id;
7186
7187   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7188   if (! NILP (val) && CHARSETP (val))
7189     id = XINT (CHARSET_SYMBOL_ID (val));
7190   else
7191     id = -1;
7192   ADD_CHARSET_DATA (buf, 0, id);
7193   next = Fnext_single_property_change (make_number (pos), Qcharset,
7194                                        coding->src_object,
7195                                        make_number (limit));
7196   *stop = XINT (next);
7197   return buf;
7198 }
7199
7200
7201 static void
7202 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7203                int max_lookup)
7204 {
7205   int *buf = coding->charbuf;
7206   int *buf_end = coding->charbuf + coding->charbuf_size;
7207   const unsigned char *src = coding->source + coding->consumed;
7208   const unsigned char *src_end = coding->source + coding->src_bytes;
7209   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7210   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7211   int multibytep = coding->src_multibyte;
7212   Lisp_Object eol_type;
7213   int c;
7214   EMACS_INT stop, stop_composition, stop_charset;
7215   int *lookup_buf = NULL;
7216
7217   if (! NILP (translation_table))
7218     lookup_buf = alloca (sizeof (int) * max_lookup);
7219
7220   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7221   if (VECTORP (eol_type))
7222     eol_type = Qunix;
7223
7224   /* Note: composition handling is not yet implemented.  */
7225   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7226
7227   if (NILP (coding->src_object))
7228     stop = stop_composition = stop_charset = end_pos;
7229   else
7230     {
7231       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7232         stop = stop_composition = pos;
7233       else
7234         stop = stop_composition = end_pos;
7235       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7236         stop = stop_charset = pos;
7237       else
7238         stop_charset = end_pos;
7239     }
7240
7241   /* Compensate for CRLF and conversion.  */
7242   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7243   while (buf < buf_end)
7244     {
7245       Lisp_Object trans;
7246
7247       if (pos == stop)
7248         {
7249           if (pos == end_pos)
7250             break;
7251           if (pos == stop_composition)
7252             buf = handle_composition_annotation (pos, end_pos, coding,
7253                                                  buf, &stop_composition);
7254           if (pos == stop_charset)
7255             buf = handle_charset_annotation (pos, end_pos, coding,
7256                                              buf, &stop_charset);
7257           stop = (stop_composition < stop_charset
7258                   ? stop_composition : stop_charset);
7259         }
7260
7261       if (! multibytep)
7262         {
7263           EMACS_INT bytes;
7264
7265           if (coding->encoder == encode_coding_raw_text
7266               || coding->encoder == encode_coding_ccl)
7267             c = *src++, pos++;
7268           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7269             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7270           else
7271             c = BYTE8_TO_CHAR (*src), src++, pos++;
7272         }
7273       else
7274         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7275       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7276         c = '\n';
7277       if (! EQ (eol_type, Qunix))
7278         {
7279           if (c == '\n')
7280             {
7281               if (EQ (eol_type, Qdos))
7282                 *buf++ = '\r';
7283               else
7284                 c = '\r';
7285             }
7286         }
7287
7288       trans = Qnil;
7289       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7290       if (NILP (trans))
7291         *buf++ = c;
7292       else
7293         {
7294           int from_nchars = 1, to_nchars = 1;
7295           int *lookup_buf_end;
7296           const unsigned char *p = src;
7297           int i;
7298
7299           lookup_buf[0] = c;
7300           for (i = 1; i < max_lookup && p < src_end; i++)
7301             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7302           lookup_buf_end = lookup_buf + i;
7303           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7304           if (INTEGERP (trans))
7305             c = XINT (trans);
7306           else if (CONSP (trans))
7307             {
7308               from_nchars = ASIZE (XCAR (trans));
7309               trans = XCDR (trans);
7310               if (INTEGERP (trans))
7311                 c = XINT (trans);
7312               else
7313                 {
7314                   to_nchars = ASIZE (trans);
7315                   if (buf + to_nchars > buf_end)
7316                     break;
7317                   c = XINT (AREF (trans, 0));
7318                 }
7319             }
7320           else
7321             break;
7322           *buf++ = c;
7323           for (i = 1; i < to_nchars; i++)
7324             *buf++ = XINT (AREF (trans, i));
7325           for (i = 1; i < from_nchars; i++, pos++)
7326             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7327         }
7328     }
7329
7330   coding->consumed = src - coding->source;
7331   coding->consumed_char = pos - coding->src_pos;
7332   coding->charbuf_used = buf - coding->charbuf;
7333   coding->chars_at_source = 0;
7334 }
7335
7336
7337 /* Encode the text at CODING->src_object into CODING->dst_object.
7338    CODING->src_object is a buffer or a string.
7339    CODING->dst_object is a buffer or nil.
7340
7341    If CODING->src_object is a buffer, it must be the current buffer.
7342    In this case, if CODING->src_pos is positive, it is a position of
7343    the source text in the buffer, otherwise. the source text is in the
7344    gap area of the buffer, and coding->src_pos specifies the offset of
7345    the text from GPT (which must be the same as PT).  If this is the
7346    same buffer as CODING->dst_object, CODING->src_pos must be
7347    negative and CODING should not have `pre-write-conversion'.
7348
7349    If CODING->src_object is a string, CODING should not have
7350    `pre-write-conversion'.
7351
7352    If CODING->dst_object is a buffer, the encoded data is inserted at
7353    the current point of that buffer.
7354
7355    If CODING->dst_object is nil, the encoded data is placed at the
7356    memory area specified by CODING->destination.  */
7357
7358 static int
7359 encode_coding (struct coding_system *coding)
7360 {
7361   Lisp_Object attrs;
7362   Lisp_Object translation_table;
7363   int max_lookup;
7364   struct ccl_spec cclspec;
7365
7366   attrs = CODING_ID_ATTRS (coding->id);
7367   if (coding->encoder == encode_coding_raw_text)
7368     translation_table = Qnil, max_lookup = 0;
7369   else
7370     translation_table = get_translation_table (attrs, 1, &max_lookup);
7371
7372   if (BUFFERP (coding->dst_object))
7373     {
7374       set_buffer_internal (XBUFFER (coding->dst_object));
7375       coding->dst_multibyte
7376         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7377     }
7378
7379   coding->consumed = coding->consumed_char = 0;
7380   coding->produced = coding->produced_char = 0;
7381   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7382   coding->errors = 0;
7383
7384   ALLOC_CONVERSION_WORK_AREA (coding);
7385
7386   if (coding->encoder == encode_coding_ccl)
7387     {
7388       coding->spec.ccl = &cclspec;
7389       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7390     }
7391   do {
7392     coding_set_source (coding);
7393     consume_chars (coding, translation_table, max_lookup);
7394     coding_set_destination (coding);
7395     (*(coding->encoder)) (coding);
7396   } while (coding->consumed_char < coding->src_chars);
7397
7398   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7399     insert_from_gap (coding->produced_char, coding->produced);
7400
7401   return (coding->result);
7402 }
7403
7404
7405 /* Name (or base name) of work buffer for code conversion.  */
7406 static Lisp_Object Vcode_conversion_workbuf_name;
7407
7408 /* A working buffer used by the top level conversion.  Once it is
7409    created, it is never destroyed.  It has the name
7410    Vcode_conversion_workbuf_name.  The other working buffers are
7411    destroyed after the use is finished, and their names are modified
7412    versions of Vcode_conversion_workbuf_name.  */
7413 static Lisp_Object Vcode_conversion_reused_workbuf;
7414
7415 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
7416 static int reused_workbuf_in_use;
7417
7418
7419 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7420    multibyteness of returning buffer.  */
7421
7422 static Lisp_Object
7423 make_conversion_work_buffer (int multibyte)
7424 {
7425   Lisp_Object name, workbuf;
7426   struct buffer *current;
7427
7428   if (reused_workbuf_in_use++)
7429     {
7430       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7431       workbuf = Fget_buffer_create (name);
7432     }
7433   else
7434     {
7435       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7436         Vcode_conversion_reused_workbuf
7437           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7438       workbuf = Vcode_conversion_reused_workbuf;
7439     }
7440   current = current_buffer;
7441   set_buffer_internal (XBUFFER (workbuf));
7442   /* We can't allow modification hooks to run in the work buffer.  For
7443      instance, directory_files_internal assumes that file decoding
7444      doesn't compile new regexps.  */
7445   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7446   Ferase_buffer ();
7447   BVAR (current_buffer, undo_list) = Qt;
7448   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7449   set_buffer_internal (current);
7450   return workbuf;
7451 }
7452
7453
7454 static Lisp_Object
7455 code_conversion_restore (Lisp_Object arg)
7456 {
7457   Lisp_Object current, workbuf;
7458   struct gcpro gcpro1;
7459
7460   GCPRO1 (arg);
7461   current = XCAR (arg);
7462   workbuf = XCDR (arg);
7463   if (! NILP (workbuf))
7464     {
7465       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7466         reused_workbuf_in_use = 0;
7467       else if (! NILP (Fbuffer_live_p (workbuf)))
7468         Fkill_buffer (workbuf);
7469     }
7470   set_buffer_internal (XBUFFER (current));
7471   UNGCPRO;
7472   return Qnil;
7473 }
7474
7475 Lisp_Object
7476 code_conversion_save (int with_work_buf, int multibyte)
7477 {
7478   Lisp_Object workbuf = Qnil;
7479
7480   if (with_work_buf)
7481     workbuf = make_conversion_work_buffer (multibyte);
7482   record_unwind_protect (code_conversion_restore,
7483                          Fcons (Fcurrent_buffer (), workbuf));
7484   return workbuf;
7485 }
7486
7487 int
7488 decode_coding_gap (struct coding_system *coding,
7489                    EMACS_INT chars, EMACS_INT bytes)
7490 {
7491   int count = SPECPDL_INDEX ();
7492   Lisp_Object attrs;
7493
7494   code_conversion_save (0, 0);
7495
7496   coding->src_object = Fcurrent_buffer ();
7497   coding->src_chars = chars;
7498   coding->src_bytes = bytes;
7499   coding->src_pos = -chars;
7500   coding->src_pos_byte = -bytes;
7501   coding->src_multibyte = chars < bytes;
7502   coding->dst_object = coding->src_object;
7503   coding->dst_pos = PT;
7504   coding->dst_pos_byte = PT_BYTE;
7505   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7506
7507   if (CODING_REQUIRE_DETECTION (coding))
7508     detect_coding (coding);
7509
7510   coding->mode |= CODING_MODE_LAST_BLOCK;
7511   current_buffer->text->inhibit_shrinking = 1;
7512   decode_coding (coding);
7513   current_buffer->text->inhibit_shrinking = 0;
7514
7515   attrs = CODING_ID_ATTRS (coding->id);
7516   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7517     {
7518       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7519       Lisp_Object val;
7520
7521       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7522       val = call1 (CODING_ATTR_POST_READ (attrs),
7523                    make_number (coding->produced_char));
7524       CHECK_NATNUM (val);
7525       coding->produced_char += Z - prev_Z;
7526       coding->produced += Z_BYTE - prev_Z_BYTE;
7527     }
7528
7529   unbind_to (count, Qnil);
7530   return coding->result;
7531 }
7532
7533
7534 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7535    SRC_OBJECT into DST_OBJECT by coding context CODING.
7536
7537    SRC_OBJECT is a buffer, a string, or Qnil.
7538
7539    If it is a buffer, the text is at point of the buffer.  FROM and TO
7540    are positions in the buffer.
7541
7542    If it is a string, the text is at the beginning of the string.
7543    FROM and TO are indices to the string.
7544
7545    If it is nil, the text is at coding->source.  FROM and TO are
7546    indices to coding->source.
7547
7548    DST_OBJECT is a buffer, Qt, or Qnil.
7549
7550    If it is a buffer, the decoded text is inserted at point of the
7551    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7552    is deleted.
7553
7554    If it is Qt, a string is made from the decoded text, and
7555    set in CODING->dst_object.
7556
7557    If it is Qnil, the decoded text is stored at CODING->destination.
7558    The caller must allocate CODING->dst_bytes bytes at
7559    CODING->destination by xmalloc.  If the decoded text is longer than
7560    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7561  */
7562
7563 void
7564 decode_coding_object (struct coding_system *coding,
7565                       Lisp_Object src_object,
7566                       EMACS_INT from, EMACS_INT from_byte,
7567                       EMACS_INT to, EMACS_INT to_byte,
7568                       Lisp_Object dst_object)
7569 {
7570   int count = SPECPDL_INDEX ();
7571   unsigned char *destination IF_LINT (= NULL);
7572   EMACS_INT dst_bytes IF_LINT (= 0);
7573   EMACS_INT chars = to - from;
7574   EMACS_INT bytes = to_byte - from_byte;
7575   Lisp_Object attrs;
7576   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7577   int need_marker_adjustment = 0;
7578   Lisp_Object old_deactivate_mark;
7579
7580   old_deactivate_mark = Vdeactivate_mark;
7581
7582   if (NILP (dst_object))
7583     {
7584       destination = coding->destination;
7585       dst_bytes = coding->dst_bytes;
7586     }
7587
7588   coding->src_object = src_object;
7589   coding->src_chars = chars;
7590   coding->src_bytes = bytes;
7591   coding->src_multibyte = chars < bytes;
7592
7593   if (STRINGP (src_object))
7594     {
7595       coding->src_pos = from;
7596       coding->src_pos_byte = from_byte;
7597     }
7598   else if (BUFFERP (src_object))
7599     {
7600       set_buffer_internal (XBUFFER (src_object));
7601       if (from != GPT)
7602         move_gap_both (from, from_byte);
7603       if (EQ (src_object, dst_object))
7604         {
7605           struct Lisp_Marker *tail;
7606
7607           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7608             {
7609               tail->need_adjustment
7610                 = tail->charpos == (tail->insertion_type ? from : to);
7611               need_marker_adjustment |= tail->need_adjustment;
7612             }
7613           saved_pt = PT, saved_pt_byte = PT_BYTE;
7614           TEMP_SET_PT_BOTH (from, from_byte);
7615           current_buffer->text->inhibit_shrinking = 1;
7616           del_range_both (from, from_byte, to, to_byte, 1);
7617           coding->src_pos = -chars;
7618           coding->src_pos_byte = -bytes;
7619         }
7620       else
7621         {
7622           coding->src_pos = from;
7623           coding->src_pos_byte = from_byte;
7624         }
7625     }
7626
7627   if (CODING_REQUIRE_DETECTION (coding))
7628     detect_coding (coding);
7629   attrs = CODING_ID_ATTRS (coding->id);
7630
7631   if (EQ (dst_object, Qt)
7632       || (! NILP (CODING_ATTR_POST_READ (attrs))
7633           && NILP (dst_object)))
7634     {
7635       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7636       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7637       coding->dst_pos = BEG;
7638       coding->dst_pos_byte = BEG_BYTE;
7639     }
7640   else if (BUFFERP (dst_object))
7641     {
7642       code_conversion_save (0, 0);
7643       coding->dst_object = dst_object;
7644       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7645       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7646       coding->dst_multibyte
7647         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7648     }
7649   else
7650     {
7651       code_conversion_save (0, 0);
7652       coding->dst_object = Qnil;
7653       /* Most callers presume this will return a multibyte result, and they
7654          won't use `binary' or `raw-text' anyway, so let's not worry about
7655          CODING_FOR_UNIBYTE.  */
7656       coding->dst_multibyte = 1;
7657     }
7658
7659   decode_coding (coding);
7660
7661   if (BUFFERP (coding->dst_object))
7662     set_buffer_internal (XBUFFER (coding->dst_object));
7663
7664   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7665     {
7666       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7667       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7668       Lisp_Object val;
7669
7670       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7671       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7672               old_deactivate_mark);
7673       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7674                         make_number (coding->produced_char));
7675       UNGCPRO;
7676       CHECK_NATNUM (val);
7677       coding->produced_char += Z - prev_Z;
7678       coding->produced += Z_BYTE - prev_Z_BYTE;
7679     }
7680
7681   if (EQ (dst_object, Qt))
7682     {
7683       coding->dst_object = Fbuffer_string ();
7684     }
7685   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7686     {
7687       set_buffer_internal (XBUFFER (coding->dst_object));
7688       if (dst_bytes < coding->produced)
7689         {
7690           destination = xrealloc (destination, coding->produced);
7691           if (! destination)
7692             {
7693               record_conversion_result (coding,
7694                                         CODING_RESULT_INSUFFICIENT_MEM);
7695               unbind_to (count, Qnil);
7696               return;
7697             }
7698           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7699             move_gap_both (BEGV, BEGV_BYTE);
7700           memcpy (destination, BEGV_ADDR, coding->produced);
7701           coding->destination = destination;
7702         }
7703     }
7704
7705   if (saved_pt >= 0)
7706     {
7707       /* This is the case of:
7708          (BUFFERP (src_object) && EQ (src_object, dst_object))
7709          As we have moved PT while replacing the original buffer
7710          contents, we must recover it now.  */
7711       set_buffer_internal (XBUFFER (src_object));
7712       current_buffer->text->inhibit_shrinking = 0;
7713       if (saved_pt < from)
7714         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7715       else if (saved_pt < from + chars)
7716         TEMP_SET_PT_BOTH (from, from_byte);
7717       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7718         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7719                           saved_pt_byte + (coding->produced - bytes));
7720       else
7721         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7722                           saved_pt_byte + (coding->produced - bytes));
7723
7724       if (need_marker_adjustment)
7725         {
7726           struct Lisp_Marker *tail;
7727
7728           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7729             if (tail->need_adjustment)
7730               {
7731                 tail->need_adjustment = 0;
7732                 if (tail->insertion_type)
7733                   {
7734                     tail->bytepos = from_byte;
7735                     tail->charpos = from;
7736                   }
7737                 else
7738                   {
7739                     tail->bytepos = from_byte + coding->produced;
7740                     tail->charpos
7741                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7742                          ? tail->bytepos : from + coding->produced_char);
7743                   }
7744               }
7745         }
7746     }
7747
7748   Vdeactivate_mark = old_deactivate_mark;
7749   unbind_to (count, coding->dst_object);
7750 }
7751
7752
7753 void
7754 encode_coding_object (struct coding_system *coding,
7755                       Lisp_Object src_object,
7756                       EMACS_INT from, EMACS_INT from_byte,
7757                       EMACS_INT to, EMACS_INT to_byte,
7758                       Lisp_Object dst_object)
7759 {
7760   int count = SPECPDL_INDEX ();
7761   EMACS_INT chars = to - from;
7762   EMACS_INT bytes = to_byte - from_byte;
7763   Lisp_Object attrs;
7764   int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7765   int need_marker_adjustment = 0;
7766   int kill_src_buffer = 0;
7767   Lisp_Object old_deactivate_mark;
7768
7769   old_deactivate_mark = Vdeactivate_mark;
7770
7771   coding->src_object = src_object;
7772   coding->src_chars = chars;
7773   coding->src_bytes = bytes;
7774   coding->src_multibyte = chars < bytes;
7775
7776   attrs = CODING_ID_ATTRS (coding->id);
7777
7778   if (EQ (src_object, dst_object))
7779     {
7780       struct Lisp_Marker *tail;
7781
7782       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7783         {
7784           tail->need_adjustment
7785             = tail->charpos == (tail->insertion_type ? from : to);
7786           need_marker_adjustment |= tail->need_adjustment;
7787         }
7788     }
7789
7790   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7791     {
7792       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7793       set_buffer_internal (XBUFFER (coding->src_object));
7794       if (STRINGP (src_object))
7795         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7796       else if (BUFFERP (src_object))
7797         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7798       else
7799         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7800
7801       if (EQ (src_object, dst_object))
7802         {
7803           set_buffer_internal (XBUFFER (src_object));
7804           saved_pt = PT, saved_pt_byte = PT_BYTE;
7805           del_range_both (from, from_byte, to, to_byte, 1);
7806           set_buffer_internal (XBUFFER (coding->src_object));
7807         }
7808
7809       {
7810         Lisp_Object args[3];
7811         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7812
7813         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7814                 old_deactivate_mark);
7815         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7816         args[1] = make_number (BEG);
7817         args[2] = make_number (Z);
7818         safe_call (3, args);
7819         UNGCPRO;
7820       }
7821       if (XBUFFER (coding->src_object) != current_buffer)
7822         kill_src_buffer = 1;
7823       coding->src_object = Fcurrent_buffer ();
7824       if (BEG != GPT)
7825         move_gap_both (BEG, BEG_BYTE);
7826       coding->src_chars = Z - BEG;
7827       coding->src_bytes = Z_BYTE - BEG_BYTE;
7828       coding->src_pos = BEG;
7829       coding->src_pos_byte = BEG_BYTE;
7830       coding->src_multibyte = Z < Z_BYTE;
7831     }
7832   else if (STRINGP (src_object))
7833     {
7834       code_conversion_save (0, 0);
7835       coding->src_pos = from;
7836       coding->src_pos_byte = from_byte;
7837     }
7838   else if (BUFFERP (src_object))
7839     {
7840       code_conversion_save (0, 0);
7841       set_buffer_internal (XBUFFER (src_object));
7842       if (EQ (src_object, dst_object))
7843         {
7844           saved_pt = PT, saved_pt_byte = PT_BYTE;
7845           coding->src_object = del_range_1 (from, to, 1, 1);
7846           coding->src_pos = 0;
7847           coding->src_pos_byte = 0;
7848         }
7849       else
7850         {
7851           if (from < GPT && to >= GPT)
7852             move_gap_both (from, from_byte);
7853           coding->src_pos = from;
7854           coding->src_pos_byte = from_byte;
7855         }
7856     }
7857   else
7858     code_conversion_save (0, 0);
7859
7860   if (BUFFERP (dst_object))
7861     {
7862       coding->dst_object = dst_object;
7863       if (EQ (src_object, dst_object))
7864         {
7865           coding->dst_pos = from;
7866           coding->dst_pos_byte = from_byte;
7867         }
7868       else
7869         {
7870           struct buffer *current = current_buffer;
7871
7872           set_buffer_temp (XBUFFER (dst_object));
7873           coding->dst_pos = PT;
7874           coding->dst_pos_byte = PT_BYTE;
7875           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7876           set_buffer_temp (current);
7877         }
7878       coding->dst_multibyte
7879         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7880     }
7881   else if (EQ (dst_object, Qt))
7882     {
7883       coding->dst_object = Qnil;
7884       coding->dst_bytes = coding->src_chars;
7885       if (coding->dst_bytes == 0)
7886         coding->dst_bytes = 1;
7887       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7888       coding->dst_multibyte = 0;
7889     }
7890   else
7891     {
7892       coding->dst_object = Qnil;
7893       coding->dst_multibyte = 0;
7894     }
7895
7896   encode_coding (coding);
7897
7898   if (EQ (dst_object, Qt))
7899     {
7900       if (BUFFERP (coding->dst_object))
7901         coding->dst_object = Fbuffer_string ();
7902       else
7903         {
7904           coding->dst_object
7905             = make_unibyte_string ((char *) coding->destination,
7906                                    coding->produced);
7907           xfree (coding->destination);
7908         }
7909     }
7910
7911   if (saved_pt >= 0)
7912     {
7913       /* This is the case of:
7914          (BUFFERP (src_object) && EQ (src_object, dst_object))
7915          As we have moved PT while replacing the original buffer
7916          contents, we must recover it now.  */
7917       set_buffer_internal (XBUFFER (src_object));
7918       if (saved_pt < from)
7919         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7920       else if (saved_pt < from + chars)
7921         TEMP_SET_PT_BOTH (from, from_byte);
7922       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7923         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7924                           saved_pt_byte + (coding->produced - bytes));
7925       else
7926         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7927                           saved_pt_byte + (coding->produced - bytes));
7928
7929       if (need_marker_adjustment)
7930         {
7931           struct Lisp_Marker *tail;
7932
7933           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7934             if (tail->need_adjustment)
7935               {
7936                 tail->need_adjustment = 0;
7937                 if (tail->insertion_type)
7938                   {
7939                     tail->bytepos = from_byte;
7940                     tail->charpos = from;
7941                   }
7942                 else
7943                   {
7944                     tail->bytepos = from_byte + coding->produced;
7945                     tail->charpos
7946                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7947                          ? tail->bytepos : from + coding->produced_char);
7948                   }
7949               }
7950         }
7951     }
7952
7953   if (kill_src_buffer)
7954     Fkill_buffer (coding->src_object);
7955
7956   Vdeactivate_mark = old_deactivate_mark;
7957   unbind_to (count, Qnil);
7958 }
7959
7960
7961 Lisp_Object
7962 preferred_coding_system (void)
7963 {
7964   int id = coding_categories[coding_priorities[0]].id;
7965
7966   return CODING_ID_NAME (id);
7967 }
7968
7969 \f
7970 #ifdef emacs
7971 /*** 11. Emacs Lisp library functions ***/
7972
7973 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7974        doc: /* Return t if OBJECT is nil or a coding-system.
7975 See the documentation of `define-coding-system' for information
7976 about coding-system objects.  */)
7977   (Lisp_Object object)
7978 {
7979   if (NILP (object)
7980       || CODING_SYSTEM_ID (object) >= 0)
7981     return Qt;
7982   if (! SYMBOLP (object)
7983       || NILP (Fget (object, Qcoding_system_define_form)))
7984     return Qnil;
7985   return Qt;
7986 }
7987
7988 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7989        Sread_non_nil_coding_system, 1, 1, 0,
7990        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7991   (Lisp_Object prompt)
7992 {
7993   Lisp_Object val;
7994   do
7995     {
7996       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7997                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7998     }
7999   while (SCHARS (val) == 0);
8000   return (Fintern (val, Qnil));
8001 }
8002
8003 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8004        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8005 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8006 Ignores case when completing coding systems (all Emacs coding systems
8007 are lower-case).  */)
8008   (Lisp_Object prompt, Lisp_Object default_coding_system)
8009 {
8010   Lisp_Object val;
8011   int count = SPECPDL_INDEX ();
8012
8013   if (SYMBOLP (default_coding_system))
8014     default_coding_system = SYMBOL_NAME (default_coding_system);
8015   specbind (Qcompletion_ignore_case, Qt);
8016   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8017                           Qt, Qnil, Qcoding_system_history,
8018                           default_coding_system, Qnil);
8019   unbind_to (count, Qnil);
8020   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8021 }
8022
8023 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8024        1, 1, 0,
8025        doc: /* Check validity of CODING-SYSTEM.
8026 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8027 It is valid if it is nil or a symbol defined as a coding system by the
8028 function `define-coding-system'.  */)
8029   (Lisp_Object coding_system)
8030 {
8031   Lisp_Object define_form;
8032
8033   define_form = Fget (coding_system, Qcoding_system_define_form);
8034   if (! NILP (define_form))
8035     {
8036       Fput (coding_system, Qcoding_system_define_form, Qnil);
8037       safe_eval (define_form);
8038     }
8039   if (!NILP (Fcoding_system_p (coding_system)))
8040     return coding_system;
8041   xsignal1 (Qcoding_system_error, coding_system);
8042 }
8043
8044 \f
8045 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8046    HIGHEST is nonzero, return the coding system of the highest
8047    priority among the detected coding systems.  Otherwise return a
8048    list of detected coding systems sorted by their priorities.  If
8049    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8050    multibyte form but contain only ASCII and eight-bit chars.
8051    Otherwise, the bytes are raw bytes.
8052
8053    CODING-SYSTEM controls the detection as below:
8054
8055    If it is nil, detect both text-format and eol-format.  If the
8056    text-format part of CODING-SYSTEM is already specified
8057    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8058    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8059    detect only text-format.

        SRC_CHARS is the length of the same data counted in characters
        (the callers in this file pass a character count).  The elements
        of the returned list are coding system names.  When HIGHEST is
        nonzero and no candidate was produced, the value is nil.  */
8060
8061 Lisp_Object
8062 detect_coding_system (const unsigned char *src,
8063                       EMACS_INT src_chars, EMACS_INT src_bytes,
8064                       int highest, int multibytep,
8065                       Lisp_Object coding_system)
8066 {
8067   const unsigned char *src_end = src + src_bytes;
8068   Lisp_Object attrs, eol_type;
8069   Lisp_Object val = Qnil;
8070   struct coding_system coding;
8071   int id;
8072   struct coding_detection_info detect_info;
8073   enum coding_category base_category;
8074   int null_byte_found = 0, eight_bit_found = 0;
8075
       /* A nil CODING-SYSTEM is treated as `undecided'.  From here on
          CODING_SYSTEM holds the base name taken from the attributes.  */
8076   if (NILP (coding_system))
8077     coding_system = Qundecided;
8078   setup_coding_system (coding_system, &coding);
8079   attrs = CODING_ID_ATTRS (coding.id);
8080   eol_type = CODING_ID_EOL_TYPE (coding.id);
8081   coding_system = CODING_ATTR_BASE_NAME (attrs);
8082
       /* Describe the input in CODING; the detectors called below
          receive it together with DETECT_INFO.  */
8083   coding.source = src;
8084   coding.src_chars = src_chars;
8085   coding.src_bytes = src_bytes;
8086   coding.src_multibyte = multibytep;
8087   coding.consumed = 0;
8088   coding.mode |= CODING_MODE_LAST_BLOCK;
8089   coding.head_ascii = 0;
8090
8091   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8092
8093   /* At first, detect text-format if necessary.  */
8094   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8095   if (base_category == coding_category_undecided)
8096     {
8097       enum coding_category category IF_LINT (= 0);
8098       struct coding_system *this IF_LINT (= NULL);
8099       int c, i;
8100
8101       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii is incremented only for bytes seen before
              the first 8-bit byte.  The scan stops early once both a
              null byte and an 8-bit byte have been seen, or when the
              ISO-2022 detector reports a match.  */
8102       for (; src < src_end; src++)
8103         {
8104           c = *src;
8105           if (c & 0x80)
8106             {
8107               eight_bit_found = 1;
8108               if (null_byte_found)
8109                 break;
8110             }
8111           else if (c < 0x20)
8112             {
                   /* The ISO-2022 detector is tried at most once: it is
                      skipped as soon as detect_info.checked is nonzero.  */
8113               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8114                   && ! inhibit_iso_escape_detection
8115                   && ! detect_info.checked)
8116                 {
8117                   if (detect_coding_iso_2022 (&coding, &detect_info))
8118                     {
8119                       /* We have scanned the whole data.  */
8120                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8121                         {
8122                           /* We didn't find an 8-bit code.  We may
8123                              have found a null-byte, but it's very
8124                              rare that a binary file conforms to
8125                              ISO-2022.  */
8126                           src = src_end;
8127                           coding.head_ascii = src - coding.source;
8128                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
8129                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8130                       break;
8131                     }
8132                 }
8133               else if (! c && !inhibit_null_byte_detection)
8134                 {
8135                   null_byte_found = 1;
8136                   if (eight_bit_found)
8137                     break;
8138                 }
8139               if (! eight_bit_found)
8140                 coding.head_ascii++;
8141             }
8142           else if (! eight_bit_found)
8143             coding.head_ascii++;
8144         }
8145
           /* Consult the per-category results unless the scan counted
              every byte in head_ascii, saw no null or 8-bit byte, and
              found nothing.  */
8146       if (null_byte_found || eight_bit_found
8147           || coding.head_ascii < coding.src_bytes
8148           || detect_info.found)
8149         {
8150           if (coding.head_ascii == coding.src_bytes)
8151             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8152             for (i = 0; i < coding_category_raw_text; i++)
8153               {
8154                 category = coding_priorities[i];
8155                 this = coding_categories + category;
8156                 if (detect_info.found & (1 << category))
8157                   break;
8158               }
8159           else
8160             {
8161               if (null_byte_found)
8162                 {
                       /* Mark every category other than the UTF-16 ones
                          as already checked and rejected.  */
8163                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8164                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8165                 }
                   /* Walk the categories in priority order, running the
                      detector of each one that has not been checked
                      yet.  With HIGHEST, stop at the first category
                      that is found.  */
8166               for (i = 0; i < coding_category_raw_text; i++)
8167                 {
8168                   category = coding_priorities[i];
8169                   this = coding_categories + category;
8170
8171                   if (this->id < 0)
8172                     {
8173                       /* No coding system of this category is defined.  */
8174                       detect_info.rejected |= (1 << category);
8175                     }
8176                   else if (category >= coding_category_raw_text)
8177                     continue;
8178                   else if (detect_info.checked & (1 << category))
8179                     {
8180                       if (highest
8181                           && (detect_info.found & (1 << category)))
8182                         break;
8183                     }
8184                   else if ((*(this->detector)) (&coding, &detect_info)
8185                            && highest
8186                            && (detect_info.found & (1 << category)))
8187                     {
                           /* Replace the auto category by the byte-order
                              specific one.  NOTE(review): THIS is left
                              pointing at the utf-16-auto entry.  */
8188                       if (category == coding_category_utf_16_auto)
8189                         {
8190                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8191                             category = coding_category_utf_16_le;
8192                           else
8193                             category = coding_category_utf_16_be;
8194                         }
8195                       break;
8196                     }
8197                 }
8198             }
8199         }
8200
           /* Build VAL, a list of coding system ids, from the masks.
              Everything rejected, or a null byte seen: no-conversion.  */
8201       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8202           || null_byte_found)
8203         {
8204           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8205           id = CODING_SYSTEM_ID (Qno_conversion);
8206           val = Fcons (make_number (id), Qnil);
8207         }
           /* Nothing rejected and nothing found: use the coding system
              registered for the undecided category.  */
8208       else if (! detect_info.rejected && ! detect_info.found)
8209         {
8210           detect_info.found = CATEGORY_MASK_ANY;
8211           id = coding_categories[coding_category_undecided].id;
8212           val = Fcons (make_number (id), Qnil);
8213         }
8214       else if (highest)
8215         {
               /* Either the category the loops above stopped at, or
                  else the first category in priority order that was not
                  rejected.  */
8216           if (detect_info.found)
8217             {
8218               detect_info.found = 1 << category;
8219               val = Fcons (make_number (this->id), Qnil);
8220             }
8221           else
8222             for (i = 0; i < coding_category_raw_text; i++)
8223               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8224                 {
8225                   detect_info.found = 1 << coding_priorities[i];
8226                   id = coding_categories[coding_priorities[i]].id;
8227                   val = Fcons (make_number (id), Qnil);
8228                   break;
8229                 }
8230         }
8231       else
8232         {
8233           int mask = detect_info.rejected | detect_info.found;
8234           int found = 0;
8235
               /* Both loops run from lowest to highest priority and cons
                  onto the front, so each group ends up in priority
                  order.  First the categories neither rejected nor
                  found...  */
8236           for (i = coding_category_raw_text - 1; i >= 0; i--)
8237             {
8238               category = coding_priorities[i];
8239               if (! (mask & (1 << category)))
8240                 {
8241                   found |= 1 << category;
8242                   id = coding_categories[category].id;
8243                   if (id >= 0)
8244                     val = Fcons (make_number (id), val);
8245                 }
8246             }
               /* ...then the found categories, which therefore precede
                  them in VAL.  */
8247           for (i = coding_category_raw_text - 1; i >= 0; i--)
8248             {
8249               category = coding_priorities[i];
8250               if (detect_info.found & (1 << category))
8251                 {
8252                   id = coding_categories[category].id;
8253                   val = Fcons (make_number (id), val);
8254                 }
8255             }
8256           detect_info.found |= found;
8257         }
8258     }
       /* CODING-SYSTEM is of the utf-8-auto category: choose between the
          utf_8_sig and utf_8_nosig categories.  VAL stays nil if the
          detector returns zero.  */
8259   else if (base_category == coding_category_utf_8_auto)
8260     {
8261       if (detect_coding_utf_8 (&coding, &detect_info))
8262         {
8263           struct coding_system *this;
8264
8265           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8266             this = coding_categories + coding_category_utf_8_sig;
8267           else
8268             this = coding_categories + coding_category_utf_8_nosig;
8269           val = Fcons (make_number (this->id), Qnil);
8270         }
8271     }
       /* Likewise for the utf-16-auto category: choose one of the four
          utf_16 le/be categories (with or without _nosig).  */
8272   else if (base_category == coding_category_utf_16_auto)
8273     {
8274       if (detect_coding_utf_16 (&coding, &detect_info))
8275         {
8276           struct coding_system *this;
8277
8278           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8279             this = coding_categories + coding_category_utf_16_le;
8280           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8281             this = coding_categories + coding_category_utf_16_be;
8282           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8283             this = coding_categories + coding_category_utf_16_be_nosig;
8284           else
8285             this = coding_categories + coding_category_utf_16_le_nosig;
8286           val = Fcons (make_number (this->id), Qnil);
8287         }
8288     }
       /* Any other category: the text-format is already decided, so the
          given coding system is the only candidate.  */
8289   else
8290     {
8291       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8292       val = Fcons (make_number (coding.id), Qnil);
8293     }
8294
8295   /* Then, detect eol-format if necessary.  */
8296   {
         /* Each variable stays -1 when the eol format was not examined
            for that group of categories.  */
8297     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8298     Lisp_Object tail;
8299
         /* EOL_TYPE here still belongs to the coding system given by the
            caller.  When it is a vector, the eol format is detected from
            the data.  */
8300     if (VECTORP (eol_type))
8301       {
8302         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8303           {
                 /* With a null byte present, skip detection and use LF.  */
8304             if (null_byte_found)
8305               normal_eol = EOL_SEEN_LF;
8306             else
8307               normal_eol = detect_eol (coding.source, src_bytes,
8308                                        coding_category_raw_text);
8309           }
8310         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8311                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8312           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8313                                       coding_category_utf_16_be);
8314         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8315                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8316           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8317                                       coding_category_utf_16_le);
8318       }
8319     else
8320       {
             /* Otherwise EOL_TYPE is compared against Qunix and Qdos;
                anything else is taken as CR.  */
8321         if (EQ (eol_type, Qunix))
8322           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8323         else if (EQ (eol_type, Qdos))
8324           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8325         else
8326           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8327       }
8328
         /* Replace each id in VAL by a coding system name.  When the
            candidate's own eol type is a vector, take the element for
            the eol format seen (0: LF, 1: CRLF, 2: CR); otherwise, or
            when no eol format was seen, use the name for the id.  */
8329     for (tail = val; CONSP (tail); tail = XCDR (tail))
8330       {
8331         enum coding_category category;
8332         int this_eol;
8333
8334         id = XINT (XCAR (tail));
8335         attrs = CODING_ID_ATTRS (id);
8336         category = XINT (CODING_ATTR_CATEGORY (attrs));
8337         eol_type = CODING_ID_EOL_TYPE (id);
8338         if (VECTORP (eol_type))
8339           {
8340             if (category == coding_category_utf_16_be
8341                 || category == coding_category_utf_16_be_nosig)
8342               this_eol = utf_16_be_eol;
8343             else if (category == coding_category_utf_16_le
8344                      || category == coding_category_utf_16_le_nosig)
8345               this_eol = utf_16_le_eol;
8346             else
8347               this_eol = normal_eol;
8348
8349             if (this_eol == EOL_SEEN_LF)
8350               XSETCAR (tail, AREF (eol_type, 0));
8351             else if (this_eol == EOL_SEEN_CRLF)
8352               XSETCAR (tail, AREF (eol_type, 1));
8353             else if (this_eol == EOL_SEEN_CR)
8354               XSETCAR (tail, AREF (eol_type, 2));
8355             else
8356               XSETCAR (tail, CODING_ID_NAME (id));
8357           }
8358         else
8359           XSETCAR (tail, CODING_ID_NAME (id));
8360       }
8361   }
8362
       /* With HIGHEST, return only the first element (nil for an empty
          list); otherwise the whole list.  */
8363   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8364 }
8365
8366
8367 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8368        2, 3, 0,
8369        doc: /* Detect coding system of the text in the region between START and END.
8370 Return a list of possible coding systems ordered by priority.
8371 The coding systems to try and their priorities follows what
8372 the function `coding-system-priority-list' (which see) returns.
8373
8374 If only ASCII characters are found (except for such ISO-2022 control
8375 characters as ESC), it returns a list of single element `undecided'
8376 or its subsidiary coding system according to a detected end-of-line
8377 format.
8378
8379 If optional argument HIGHEST is non-nil, return the coding system of
8380 highest priority.  */)
8381   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8382 {
8383   int from, to;
8384   int from_byte, to_byte;
8385
8386   CHECK_NUMBER_COERCE_MARKER (start);
8387   CHECK_NUMBER_COERCE_MARKER (end);
8388
8389   validate_region (&start, &end);
8390   from = XINT (start), to = XINT (end);
8391   from_byte = CHAR_TO_BYTE (from);
8392   to_byte = CHAR_TO_BYTE (to);
8393
8394   if (from < GPT && to >= GPT)
8395     move_gap_both (to, to_byte);
8396
8397   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8398                                to - from, to_byte - from_byte,
8399                                !NILP (highest),
8400                                !NILP (BVAR (current_buffer
8401                                       , enable_multibyte_characters)),
8402                                Qnil);
8403 }
8404
8405 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8406        1, 2, 0,
8407        doc: /* Detect coding system of the text in STRING.
8408 Return a list of possible coding systems ordered by priority.
8409 The coding systems to try and their priorities follows what
8410 the function `coding-system-priority-list' (which see) returns.
8411
8412 If only ASCII characters are found (except for such ISO-2022 control
8413 characters as ESC), it returns a list of single element `undecided'
8414 or its subsidiary coding system according to a detected end-of-line
8415 format.
8416
8417 If optional argument HIGHEST is non-nil, return the coding system of
8418 highest priority.  */)
8419   (Lisp_Object string, Lisp_Object highest)
8420 {
8421   CHECK_STRING (string);
8422
8423   return detect_coding_system (SDATA (string),
8424                                SCHARS (string), SBYTES (string),
8425                                !NILP (highest), STRING_MULTIBYTE (string),
8426                                Qnil);
8427 }
8428
8429
8430 static INLINE int
8431 char_encodable_p (int c, Lisp_Object attrs)
8432 {
8433   Lisp_Object tail;
8434   struct charset *charset;
8435   Lisp_Object translation_table;
8436
8437   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8438   if (! NILP (translation_table))
8439     c = translate_char (translation_table, c);
8440   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8441        CONSP (tail); tail = XCDR (tail))
8442     {
8443       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8444       if (CHAR_CHARSET_P (c, charset))
8445         break;
8446     }
8447   return (! NILP (tail));
8448 }
8449
8450
8451 /* Return a list of coding systems that safely encode the text between
8452    START and END.  If EXCLUDE is non-nil, it is a list of coding
8453    systems not to check.  The returned list doesn't contain any such
8454    coding systems.  In any case, if the text contains only ASCII or is
8455    unibyte, return t.  */
8456
8457 DEFUN ("find-coding-systems-region-internal",
8458        Ffind_coding_systems_region_internal,
8459        Sfind_coding_systems_region_internal, 2, 3, 0,
8460        doc: /* Internal use only.  */)
8461   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8462 {
8463   Lisp_Object coding_attrs_list, safe_codings;
8464   EMACS_INT start_byte, end_byte;
8465   const unsigned char *p, *pbeg, *pend;
8466   int c;
8467   Lisp_Object tail, elt, work_table;
8468
8469   if (STRINGP (start))
8470     {
8471       if (!STRING_MULTIBYTE (start)
8472           || SCHARS (start) == SBYTES (start))
8473         return Qt;
8474       start_byte = 0;
8475       end_byte = SBYTES (start);
8476     }
8477   else
8478     {
8479       CHECK_NUMBER_COERCE_MARKER (start);
8480       CHECK_NUMBER_COERCE_MARKER (end);
8481       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8482         args_out_of_range (start, end);
8483       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8484         return Qt;
8485       start_byte = CHAR_TO_BYTE (XINT (start));
8486       end_byte = CHAR_TO_BYTE (XINT (end));
8487       if (XINT (end) - XINT (start) == end_byte - start_byte)
8488         return Qt;
8489
8490       if (XINT (start) < GPT && XINT (end) > GPT)
8491         {
8492           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8493             move_gap_both (XINT (start), start_byte);
8494           else
8495             move_gap_both (XINT (end), end_byte);
8496         }
8497     }
8498
8499   coding_attrs_list = Qnil;
8500   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8501     if (NILP (exclude)
8502         || NILP (Fmemq (XCAR (tail), exclude)))
8503       {
8504         Lisp_Object attrs;
8505
8506         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8507         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8508             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8509           {
8510             ASET (attrs, coding_attr_trans_tbl,
8511                   get_translation_table (attrs, 1, NULL));
8512             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8513           }
8514       }
8515
8516   if (STRINGP (start))
8517     p = pbeg = SDATA (start);
8518   else
8519     p = pbeg = BYTE_POS_ADDR (start_byte);
8520   pend = p + (end_byte - start_byte);
8521
8522   while (p < pend && ASCII_BYTE_P (*p)) p++;
8523   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8524
8525   work_table = Fmake_char_table (Qnil, Qnil);
8526   while (p < pend)
8527     {
8528       if (ASCII_BYTE_P (*p))
8529         p++;
8530       else
8531         {
8532           c = STRING_CHAR_ADVANCE (p);
8533           if (!NILP (char_table_ref (work_table, c)))
8534             /* This character was already checked.  Ignore it.  */
8535             continue;
8536
8537           charset_map_loaded = 0;
8538           for (tail = coding_attrs_list; CONSP (tail);)
8539             {
8540               elt = XCAR (tail);
8541               if (NILP (elt))
8542                 tail = XCDR (tail);
8543               else if (char_encodable_p (c, elt))
8544                 tail = XCDR (tail);
8545               else if (CONSP (XCDR (tail)))
8546                 {
8547                   XSETCAR (tail, XCAR (XCDR (tail)));
8548                   XSETCDR (tail, XCDR (XCDR (tail)));
8549                 }
8550               else
8551                 {
8552                   XSETCAR (tail, Qnil);
8553                   tail = XCDR (tail);
8554                 }
8555             }
8556           if (charset_map_loaded)
8557             {
8558               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8559
8560               if (STRINGP (start))
8561                 pbeg = SDATA (start);
8562               else
8563                 pbeg = BYTE_POS_ADDR (start_byte);
8564               p = pbeg + p_offset;
8565               pend = pbeg + pend_offset;
8566             }
8567           char_table_set (work_table, c, Qt);
8568         }
8569     }
8570
8571   safe_codings = list2 (Qraw_text, Qno_conversion);
8572   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8573     if (! NILP (XCAR (tail)))
8574       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8575
8576   return safe_codings;
8577 }
8578
8579
8580 DEFUN ("unencodable-char-position", Funencodable_char_position,
8581        Sunencodable_char_position, 3, 5, 0,
8582        doc: /*
8583 Return position of first un-encodable character in a region.
8584 START and END specify the region and CODING-SYSTEM specifies the
8585 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8586
8587 If optional 4th argument COUNT is non-nil, it specifies at most how
8588 many un-encodable characters to search.  In this case, the value is a
8589 list of positions.
8590
8591 If optional 5th argument STRING is non-nil, it is a string to search
8592 for un-encodable characters.  In that case, START and END are indexes
8593 to the string.  */)
8594   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8595 {
8596   int n;
8597   struct coding_system coding;
8598   Lisp_Object attrs, charset_list, translation_table;
8599   Lisp_Object positions;
8600   int from, to;
8601   const unsigned char *p, *stop, *pend;
8602   int ascii_compatible;
8603
8604   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8605   attrs = CODING_ID_ATTRS (coding.id);
8606   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8607     return Qnil;
8608   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8609   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8610   translation_table = get_translation_table (attrs, 1, NULL);
8611
8612   if (NILP (string))
8613     {
8614       validate_region (&start, &end);
8615       from = XINT (start);
8616       to = XINT (end);
8617       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8618           || (ascii_compatible
8619               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8620         return Qnil;
8621       p = CHAR_POS_ADDR (from);
8622       pend = CHAR_POS_ADDR (to);
8623       if (from < GPT && to >= GPT)
8624         stop = GPT_ADDR;
8625       else
8626         stop = pend;
8627     }
8628   else
8629     {
8630       CHECK_STRING (string);
8631       CHECK_NATNUM (start);
8632       CHECK_NATNUM (end);
8633       from = XINT (start);
8634       to = XINT (end);
8635       if (from > to
8636           || to > SCHARS (string))
8637         args_out_of_range_3 (string, start, end);
8638       if (! STRING_MULTIBYTE (string))
8639         return Qnil;
8640       p = SDATA (string) + string_char_to_byte (string, from);
8641       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8642       if (ascii_compatible && (to - from) == (pend - p))
8643         return Qnil;
8644     }
8645
8646   if (NILP (count))
8647     n = 1;
8648   else
8649     {
8650       CHECK_NATNUM (count);
8651       n = XINT (count);
8652     }
8653
8654   positions = Qnil;
8655   while (1)
8656     {
8657       int c;
8658
8659       if (ascii_compatible)
8660         while (p < stop && ASCII_BYTE_P (*p))
8661           p++, from++;
8662       if (p >= stop)
8663         {
8664           if (p >= pend)
8665             break;
8666           stop = pend;
8667           p = GAP_END_ADDR;
8668         }
8669
8670       c = STRING_CHAR_ADVANCE (p);
8671       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8672           && ! char_charset (translate_char (translation_table, c),
8673                              charset_list, NULL))
8674         {
8675           positions = Fcons (make_number (from), positions);
8676           n--;
8677           if (n == 0)
8678             break;
8679         }
8680
8681       from++;
8682     }
8683
8684   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8685 }
8686
8687
8688 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8689        Scheck_coding_systems_region, 3, 3, 0,
8690        doc: /* Check if the region is encodable by coding systems.
8691
8692 START and END are buffer positions specifying the region.
8693 CODING-SYSTEM-LIST is a list of coding systems to check.
8694
8695 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8696 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8697 whole region, POS0, POS1, ... are buffer positions where non-encodable
8698 characters are found.
8699
8700 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8701 value is nil.
8702
8703 START may be a string.  In that case, check if the string is
8704 encodable, and the value contains indices to the string instead of
8705 buffer positions.  END is ignored.
8706
8707 If the current buffer (or START if it is a string) is unibyte, the value
8708 is nil.  */)
8709   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8710 {
8711   Lisp_Object list;
8712   EMACS_INT start_byte, end_byte;
8713   int pos;
8714   const unsigned char *p, *pbeg, *pend;
8715   int c;
8716   Lisp_Object tail, elt, attrs;
8717
8718   if (STRINGP (start))
8719     {
8720       if (!STRING_MULTIBYTE (start)
8721           || SCHARS (start) == SBYTES (start))
8722         return Qnil;
8723       start_byte = 0;
8724       end_byte = SBYTES (start);
8725       pos = 0;
8726     }
8727   else
8728     {
8729       CHECK_NUMBER_COERCE_MARKER (start);
8730       CHECK_NUMBER_COERCE_MARKER (end);
8731       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8732         args_out_of_range (start, end);
8733       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8734         return Qnil;
8735       start_byte = CHAR_TO_BYTE (XINT (start));
8736       end_byte = CHAR_TO_BYTE (XINT (end));
8737       if (XINT (end) - XINT (start) == end_byte - start_byte)
8738         return Qnil;
8739
8740       if (XINT (start) < GPT && XINT (end) > GPT)
8741         {
8742           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8743             move_gap_both (XINT (start), start_byte);
8744           else
8745             move_gap_both (XINT (end), end_byte);
8746         }
8747       pos = XINT (start);
8748     }
8749
8750   list = Qnil;
8751   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8752     {
8753       elt = XCAR (tail);
8754       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8755       ASET (attrs, coding_attr_trans_tbl,
8756             get_translation_table (attrs, 1, NULL));
8757       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8758     }
8759
8760   if (STRINGP (start))
8761     p = pbeg = SDATA (start);
8762   else
8763     p = pbeg = BYTE_POS_ADDR (start_byte);
8764   pend = p + (end_byte - start_byte);
8765
8766   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8767   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8768
8769   while (p < pend)
8770     {
8771       if (ASCII_BYTE_P (*p))
8772         p++;
8773       else
8774         {
8775           c = STRING_CHAR_ADVANCE (p);
8776
8777           charset_map_loaded = 0;
8778           for (tail = list; CONSP (tail); tail = XCDR (tail))
8779             {
8780               elt = XCDR (XCAR (tail));
8781               if (! char_encodable_p (c, XCAR (elt)))
8782                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8783             }
8784           if (charset_map_loaded)
8785             {
8786               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8787
8788               if (STRINGP (start))
8789                 pbeg = SDATA (start);
8790               else
8791                 pbeg = BYTE_POS_ADDR (start_byte);
8792               p = pbeg + p_offset;
8793               pend = pbeg + pend_offset;
8794             }
8795         }
8796       pos++;
8797     }
8798
8799   tail = list;
8800   list = Qnil;
8801   for (; CONSP (tail); tail = XCDR (tail))
8802     {
8803       elt = XCAR (tail);
8804       if (CONSP (XCDR (XCDR (elt))))
8805         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8806                       list);
8807     }
8808
8809   return list;
8810 }
8811
8812
8813 static Lisp_Object
8814 code_convert_region (Lisp_Object start, Lisp_Object end,
8815                      Lisp_Object coding_system, Lisp_Object dst_object,
8816                      int encodep, int norecord)
8817 {
8818   struct coding_system coding;
8819   EMACS_INT from, from_byte, to, to_byte;
8820   Lisp_Object src_object;
8821
8822   CHECK_NUMBER_COERCE_MARKER (start);
8823   CHECK_NUMBER_COERCE_MARKER (end);
8824   if (NILP (coding_system))
8825     coding_system = Qno_conversion;
8826   else
8827     CHECK_CODING_SYSTEM (coding_system);
8828   src_object = Fcurrent_buffer ();
8829   if (NILP (dst_object))
8830     dst_object = src_object;
8831   else if (! EQ (dst_object, Qt))
8832     CHECK_BUFFER (dst_object);
8833
8834   validate_region (&start, &end);
8835   from = XFASTINT (start);
8836   from_byte = CHAR_TO_BYTE (from);
8837   to = XFASTINT (end);
8838   to_byte = CHAR_TO_BYTE (to);
8839
8840   setup_coding_system (coding_system, &coding);
8841   coding.mode |= CODING_MODE_LAST_BLOCK;
8842
8843   if (encodep)
8844     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8845                           dst_object);
8846   else
8847     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8848                           dst_object);
8849   if (! norecord)
8850     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8851
8852   return (BUFFERP (dst_object)
8853           ? make_number (coding.produced_char)
8854           : coding.dst_object);
8855 }
8856
8857
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 4, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes four arguments:
        START, END, CODING-SYSTEM, and DESTINATION.
START and END are buffer positions.

Optional 4th argument DESTINATION specifies where the decoded text goes.
If nil, the region between START and END is replaced by the decoded text.
If buffer, the decoded text is inserted in that buffer after point (point
does not move).
In those cases, the length of the decoded text is returned.
If DESTINATION is t, the decoded text is returned.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
{
  /* ENCODEP is 0 (decode); NORECORD is 0, so the coding system used
     is recorded.  */
  return code_convert_region (start, end, coding_system, destination, 0, 0);
}
8879
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 4, "r\nzCoding system: ",
       doc: /* Encode the current region by specified coding system.
When called from a program, takes four arguments:
        START, END, CODING-SYSTEM and DESTINATION.
START and END are buffer positions.

Optional 4th argument DESTINATION specifies where the encoded text goes.
If nil, the region between START and END is replaced by the encoded text.
If buffer, the encoded text is inserted in that buffer after point (point
does not move).
In those cases, the length of the encoded text is returned.
If DESTINATION is t, the encoded text is returned.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
  (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
{
  /* ENCODEP is 1 (encode); NORECORD is 0, so the coding system used
     is recorded.  */
  return code_convert_region (start, end, coding_system, destination, 1, 0);
}
8901
8902 Lisp_Object
8903 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8904                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8905 {
8906   struct coding_system coding;
8907   EMACS_INT chars, bytes;
8908
8909   CHECK_STRING (string);
8910   if (NILP (coding_system))
8911     {
8912       if (! norecord)
8913         Vlast_coding_system_used = Qno_conversion;
8914       if (NILP (dst_object))
8915         return (nocopy ? Fcopy_sequence (string) : string);
8916     }
8917
8918   if (NILP (coding_system))
8919     coding_system = Qno_conversion;
8920   else
8921     CHECK_CODING_SYSTEM (coding_system);
8922   if (NILP (dst_object))
8923     dst_object = Qt;
8924   else if (! EQ (dst_object, Qt))
8925     CHECK_BUFFER (dst_object);
8926
8927   setup_coding_system (coding_system, &coding);
8928   coding.mode |= CODING_MODE_LAST_BLOCK;
8929   chars = SCHARS (string);
8930   bytes = SBYTES (string);
8931   if (encodep)
8932     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8933   else
8934     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8935   if (! norecord)
8936     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8937
8938   return (BUFFERP (dst_object)
8939           ? make_number (coding.produced_char)
8940           : coding.dst_object);
8941 }
8942
8943
/* Encode (ENCODEP nonzero) or decode (ENCODEP zero) STRING according
   to CODING_SYSTEM and return the value of code_convert_string.
   Do not set Vlast_coding_system_used.

   The destination is passed as Qt (no buffer), NOCOPY as 0 and
   NORECORD as 1.

   This function is called only from macros DECODE_FILE and
   ENCODE_FILE, thus we ignore character composition.  */

Lisp_Object
code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
                              int encodep)
{
  return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
}
8956
8957
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 4, 0,
       doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.

Optional third arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.

Optional fourth arg BUFFER non-nil means that the decoded text is
inserted in that buffer after point (point does not move).  In this
case, the return value is the length of the decoded text.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
  (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
{
  /* ENCODEP is 0 (decode), NOCOPY is the truth value of the Lisp
     argument, and NORECORD is 0 so the coding system used is
     recorded.  */
  return code_convert_string (string, coding_system, buffer,
                              0, ! NILP (nocopy), 0);
}
8977
8978 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8979        2, 4, 0,
8980        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8981
8982 Optional third arg NOCOPY non-nil means it is OK to return STRING
8983 itself if the encoding operation is trivial.
8984
8985 Optional fourth arg BUFFER non-nil means that the encoded text is
8986 inserted in that buffer after point (point does not move).  In this
8987 case, the return value is the length of the encoded text.
8988
8989 This function sets `last-coding-system-used' to the precise coding system
8990 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8991 not fully specified.)  */)
8992   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8993 {
8994   return code_convert_string (string, coding_system, buffer,
8995                               1, ! NILP (nocopy), 1);
8996 }
8997
8998 \f
8999 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9000        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9001 Return the corresponding character.  */)
9002   (Lisp_Object code)
9003 {
9004   Lisp_Object spec, attrs, val;
9005   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9006   EMACS_INT ch;
9007   int c;
9008
9009   CHECK_NATNUM (code);
9010   ch = XFASTINT (code);
9011   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9012   attrs = AREF (spec, 0);
9013
9014   if (ASCII_BYTE_P (ch)
9015       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9016     return code;
9017
9018   val = CODING_ATTR_CHARSET_LIST (attrs);
9019   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9020   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9021   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9022
9023   if (ch <= 0x7F)
9024     {
9025       c = ch;
9026       charset = charset_roman;
9027     }
9028   else if (ch >= 0xA0 && ch < 0xDF)
9029     {
9030       c = ch - 0x80;
9031       charset = charset_kana;
9032     }
9033   else
9034     {
9035       EMACS_INT c1 = ch >> 8;
9036       int c2 = ch & 0xFF;
9037
9038       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9039           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9040         error ("Invalid code: %"pI"d", ch);
9041       c = ch;
9042       SJIS_TO_JIS (c);
9043       charset = charset_kanji;
9044     }
9045   c = DECODE_CHAR (charset, c);
9046   if (c < 0)
9047     error ("Invalid code: %"pI"d", ch);
9048   return make_number (c);
9049 }
9050
9051
9052 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9053        doc: /* Encode a Japanese character CH to shift_jis encoding.
9054 Return the corresponding code in SJIS.  */)
9055   (Lisp_Object ch)
9056 {
9057   Lisp_Object spec, attrs, charset_list;
9058   int c;
9059   struct charset *charset;
9060   unsigned code;
9061
9062   CHECK_CHARACTER (ch);
9063   c = XFASTINT (ch);
9064   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9065   attrs = AREF (spec, 0);
9066
9067   if (ASCII_CHAR_P (c)
9068       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9069     return ch;
9070
9071   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9072   charset = char_charset (c, charset_list, &code);
9073   if (code == CHARSET_INVALID_CODE (charset))
9074     error ("Can't encode by shift_jis encoding: %d", c);
9075   JIS_TO_SJIS (code);
9076
9077   return make_number (code);
9078 }
9079
9080 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9081        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9082 Return the corresponding character.  */)
9083   (Lisp_Object code)
9084 {
9085   Lisp_Object spec, attrs, val;
9086   struct charset *charset_roman, *charset_big5, *charset;
9087   EMACS_INT ch;
9088   int c;
9089
9090   CHECK_NATNUM (code);
9091   ch = XFASTINT (code);
9092   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9093   attrs = AREF (spec, 0);
9094
9095   if (ASCII_BYTE_P (ch)
9096       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9097     return code;
9098
9099   val = CODING_ATTR_CHARSET_LIST (attrs);
9100   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9101   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9102
9103   if (ch <= 0x7F)
9104     {
9105       c = ch;
9106       charset = charset_roman;
9107     }
9108   else
9109     {
9110       EMACS_INT b1 = ch >> 8;
9111       int b2 = ch & 0x7F;
9112       if (b1 < 0xA1 || b1 > 0xFE
9113           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9114         error ("Invalid code: %"pI"d", ch);
9115       c = ch;
9116       charset = charset_big5;
9117     }
9118   c = DECODE_CHAR (charset, c);
9119   if (c < 0)
9120     error ("Invalid code: %"pI"d", ch);
9121   return make_number (c);
9122 }
9123
9124 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9125        doc: /* Encode the Big5 character CH to BIG5 coding system.
9126 Return the corresponding character code in Big5.  */)
9127   (Lisp_Object ch)
9128 {
9129   Lisp_Object spec, attrs, charset_list;
9130   struct charset *charset;
9131   int c;
9132   unsigned code;
9133
9134   CHECK_CHARACTER (ch);
9135   c = XFASTINT (ch);
9136   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9137   attrs = AREF (spec, 0);
9138   if (ASCII_CHAR_P (c)
9139       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140     return ch;
9141
9142   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9143   charset = char_charset (c, charset_list, &code);
9144   if (code == CHARSET_INVALID_CODE (charset))
9145     error ("Can't encode by Big5 encoding: %d", c);
9146
9147   return make_number (code);
9148 }
9149
9150 \f
9151 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9152        Sset_terminal_coding_system_internal, 1, 2, 0,
9153        doc: /* Internal use only.  */)
9154   (Lisp_Object coding_system, Lisp_Object terminal)
9155 {
9156   struct terminal *term = get_terminal (terminal, 1);
9157   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9158   CHECK_SYMBOL (coding_system);
9159   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9160   /* We had better not send unsafe characters to terminal.  */
9161   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9162   /* Character composition should be disabled.  */
9163   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9164   terminal_coding->src_multibyte = 1;
9165   terminal_coding->dst_multibyte = 0;
9166   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9167     term->charset_list = coding_charset_list (terminal_coding);
9168   else
9169     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9170   return Qnil;
9171 }
9172
9173 DEFUN ("set-safe-terminal-coding-system-internal",
9174        Fset_safe_terminal_coding_system_internal,
9175        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9176        doc: /* Internal use only.  */)
9177   (Lisp_Object coding_system)
9178 {
9179   CHECK_SYMBOL (coding_system);
9180   setup_coding_system (Fcheck_coding_system (coding_system),
9181                        &safe_terminal_coding);
9182   /* Character composition should be disabled.  */
9183   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9184   safe_terminal_coding.src_multibyte = 1;
9185   safe_terminal_coding.dst_multibyte = 0;
9186   return Qnil;
9187 }
9188
9189 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9190        Sterminal_coding_system, 0, 1, 0,
9191        doc: /* Return coding system specified for terminal output on the given terminal.
9192 TERMINAL may be a terminal object, a frame, or nil for the selected
9193 frame's terminal device.  */)
9194   (Lisp_Object terminal)
9195 {
9196   struct coding_system *terminal_coding
9197     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9198   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9199
9200   /* For backward compatibility, return nil if it is `undecided'. */
9201   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9202 }
9203
9204 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9205        Sset_keyboard_coding_system_internal, 1, 2, 0,
9206        doc: /* Internal use only.  */)
9207   (Lisp_Object coding_system, Lisp_Object terminal)
9208 {
9209   struct terminal *t = get_terminal (terminal, 1);
9210   CHECK_SYMBOL (coding_system);
9211   if (NILP (coding_system))
9212     coding_system = Qno_conversion;
9213   else
9214     Fcheck_coding_system (coding_system);
9215   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9216   /* Character composition should be disabled.  */
9217   TERMINAL_KEYBOARD_CODING (t)->common_flags
9218     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9219   return Qnil;
9220 }
9221
9222 DEFUN ("keyboard-coding-system",
9223        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9224        doc: /* Return coding system specified for decoding keyboard input.  */)
9225   (Lisp_Object terminal)
9226 {
9227   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9228                          (get_terminal (terminal, 1))->id);
9229 }
9230
9231 \f
9232 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9233        Sfind_operation_coding_system,  1, MANY, 0,
9234        doc: /* Choose a coding system for an operation based on the target name.
9235 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9236 DECODING-SYSTEM is the coding system to use for decoding
9237 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9238 for encoding (in case OPERATION does encoding).
9239
9240 The first argument OPERATION specifies an I/O primitive:
9241   For file I/O, `insert-file-contents' or `write-region'.
9242   For process I/O, `call-process', `call-process-region', or `start-process'.
9243   For network I/O, `open-network-stream'.
9244
9245 The remaining arguments should be the same arguments that were passed
9246 to the primitive.  Depending on which primitive, one of those arguments
9247 is selected as the TARGET.  For example, if OPERATION does file I/O,
9248 whichever argument specifies the file name is TARGET.
9249
9250 TARGET has a meaning which depends on OPERATION:
9251   For file I/O, TARGET is a file name (except for the special case below).
9252   For process I/O, TARGET is a process name.
9253   For network I/O, TARGET is a service name or a port number.
9254
9255 This function looks up what is specified for TARGET in
9256 `file-coding-system-alist', `process-coding-system-alist',
9257 or `network-coding-system-alist' depending on OPERATION.
9258 They may specify a coding system, a cons of coding systems,
9259 or a function symbol to call.
9260 In the last case, we call the function with one argument,
9261 which is a list of all the arguments given to this function.
9262 If the function can't decide a coding system, it can return
9263 `undecided' so that the normal code-detection is performed.
9264
9265 If OPERATION is `insert-file-contents', the argument corresponding to
9266 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9267 file name to look up, and BUFFER is a buffer that contains the file's
9268 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9269 function to call for FILENAME, that function should examine the
9270 contents of BUFFER instead of reading the file.
9271
9272 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9273   (size_t nargs, Lisp_Object *args)
9274 {
9275   Lisp_Object operation, target_idx, target, val;
9276   register Lisp_Object chain;
9277
9278   if (nargs < 2)
9279     error ("Too few arguments");
9280   operation = args[0];
9281   if (!SYMBOLP (operation)
9282       || !NATNUMP (target_idx = Fget (operation, Qtarget_idx)))
9283     error ("Invalid first argument");
9284   if (nargs < 1 + XFASTINT (target_idx))
9285     error ("Too few arguments for operation: %s",
9286            SDATA (SYMBOL_NAME (operation)));
9287   target = args[XFASTINT (target_idx) + 1];
9288   if (!(STRINGP (target)
9289         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9290             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9291         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9292     error ("Invalid %"pI"dth argument", XFASTINT (target_idx) + 1);
9293   if (CONSP (target))
9294     target = XCAR (target);
9295
9296   chain = ((EQ (operation, Qinsert_file_contents)
9297             || EQ (operation, Qwrite_region))
9298            ? Vfile_coding_system_alist
9299            : (EQ (operation, Qopen_network_stream)
9300               ? Vnetwork_coding_system_alist
9301               : Vprocess_coding_system_alist));
9302   if (NILP (chain))
9303     return Qnil;
9304
9305   for (; CONSP (chain); chain = XCDR (chain))
9306     {
9307       Lisp_Object elt;
9308
9309       elt = XCAR (chain);
9310       if (CONSP (elt)
9311           && ((STRINGP (target)
9312                && STRINGP (XCAR (elt))
9313                && fast_string_match (XCAR (elt), target) >= 0)
9314               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9315         {
9316           val = XCDR (elt);
9317           /* Here, if VAL is both a valid coding system and a valid
9318              function symbol, we return VAL as a coding system.  */
9319           if (CONSP (val))
9320             return val;
9321           if (! SYMBOLP (val))
9322             return Qnil;
9323           if (! NILP (Fcoding_system_p (val)))
9324             return Fcons (val, val);
9325           if (! NILP (Ffboundp (val)))
9326             {
9327               /* We use call1 rather than safe_call1
9328                  so as to get bug reports about functions called here
9329                  which don't handle the current interface.  */
9330               val = call1 (val, Flist (nargs, args));
9331               if (CONSP (val))
9332                 return val;
9333               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9334                 return Fcons (val, val);
9335             }
9336           return Qnil;
9337         }
9338     }
9339   return Qnil;
9340 }
9341
9342 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9343        Sset_coding_system_priority, 0, MANY, 0,
9344        doc: /* Assign higher priority to the coding systems given as arguments.
9345 If multiple coding systems belong to the same category,
9346 all but the first one are ignored.
9347
9348 usage: (set-coding-system-priority &rest coding-systems)  */)
9349   (size_t nargs, Lisp_Object *args)
9350 {
9351   size_t i, j;
9352   int changed[coding_category_max];
9353   enum coding_category priorities[coding_category_max];
9354
9355   memset (changed, 0, sizeof changed);
9356
9357   for (i = j = 0; i < nargs; i++)
9358     {
9359       enum coding_category category;
9360       Lisp_Object spec, attrs;
9361
9362       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9363       attrs = AREF (spec, 0);
9364       category = XINT (CODING_ATTR_CATEGORY (attrs));
9365       if (changed[category])
9366         /* Ignore this coding system because a coding system of the
9367            same category already had a higher priority.  */
9368         continue;
9369       changed[category] = 1;
9370       priorities[j++] = category;
9371       if (coding_categories[category].id >= 0
9372           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9373         setup_coding_system (args[i], &coding_categories[category]);
9374       Fset (AREF (Vcoding_category_table, category), args[i]);
9375     }
9376
9377   /* Now we have decided top J priorities.  Reflect the order of the
9378      original priorities to the remaining priorities.  */
9379
9380   for (i = j, j = 0; i < coding_category_max; i++, j++)
9381     {
9382       while (j < coding_category_max
9383              && changed[coding_priorities[j]])
9384         j++;
9385       if (j == coding_category_max)
9386         abort ();
9387       priorities[i] = coding_priorities[j];
9388     }
9389
9390   memcpy (coding_priorities, priorities, sizeof priorities);
9391
9392   /* Update `coding-category-list'.  */
9393   Vcoding_category_list = Qnil;
9394   for (i = coding_category_max; i-- > 0; )
9395     Vcoding_category_list
9396       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9397                Vcoding_category_list);
9398
9399   return Qnil;
9400 }
9401
9402 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9403        Scoding_system_priority_list, 0, 1, 0,
9404        doc: /* Return a list of coding systems ordered by their priorities.
9405 The list contains a subset of coding systems; i.e. coding systems
9406 assigned to each coding category (see `coding-category-list').
9407
9408 HIGHESTP non-nil means just return the highest priority one.  */)
9409   (Lisp_Object highestp)
9410 {
9411   int i;
9412   Lisp_Object val;
9413
9414   for (i = 0, val = Qnil; i < coding_category_max; i++)
9415     {
9416       enum coding_category category = coding_priorities[i];
9417       int id = coding_categories[category].id;
9418       Lisp_Object attrs;
9419
9420       if (id < 0)
9421         continue;
9422       attrs = CODING_ID_ATTRS (id);
9423       if (! NILP (highestp))
9424         return CODING_ATTR_BASE_NAME (attrs);
9425       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9426     }
9427   return Fnreverse (val);
9428 }
9429
9430 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9431
9432 static Lisp_Object
9433 make_subsidiaries (Lisp_Object base)
9434 {
9435   Lisp_Object subsidiaries;
9436   int base_name_len = SBYTES (SYMBOL_NAME (base));
9437   char *buf = (char *) alloca (base_name_len + 6);
9438   int i;
9439
9440   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9441   subsidiaries = Fmake_vector (make_number (3), Qnil);
9442   for (i = 0; i < 3; i++)
9443     {
9444       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9445       ASET (subsidiaries, i, intern (buf));
9446     }
9447   return subsidiaries;
9448 }
9449
9450
9451 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9452        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9453        doc: /* For internal use only.
9454 usage: (define-coding-system-internal ...)  */)
9455   (size_t nargs, Lisp_Object *args)
9456 {
9457   Lisp_Object name;
9458   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9459   Lisp_Object attrs;            /* Vector of attributes.  */
9460   Lisp_Object eol_type;
9461   Lisp_Object aliases;
9462   Lisp_Object coding_type, charset_list, safe_charsets;
9463   enum coding_category category;
9464   Lisp_Object tail, val;
9465   int max_charset_id = 0;
9466   int i;
9467
9468   if (nargs < coding_arg_max)
9469     goto short_args;
9470
9471   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9472
9473   name = args[coding_arg_name];
9474   CHECK_SYMBOL (name);
9475   CODING_ATTR_BASE_NAME (attrs) = name;
9476
9477   val = args[coding_arg_mnemonic];
9478   if (! STRINGP (val))
9479     CHECK_CHARACTER (val);
9480   CODING_ATTR_MNEMONIC (attrs) = val;
9481
9482   coding_type = args[coding_arg_coding_type];
9483   CHECK_SYMBOL (coding_type);
9484   CODING_ATTR_TYPE (attrs) = coding_type;
9485
9486   charset_list = args[coding_arg_charset_list];
9487   if (SYMBOLP (charset_list))
9488     {
9489       if (EQ (charset_list, Qiso_2022))
9490         {
9491           if (! EQ (coding_type, Qiso_2022))
9492             error ("Invalid charset-list");
9493           charset_list = Viso_2022_charset_list;
9494         }
9495       else if (EQ (charset_list, Qemacs_mule))
9496         {
9497           if (! EQ (coding_type, Qemacs_mule))
9498             error ("Invalid charset-list");
9499           charset_list = Vemacs_mule_charset_list;
9500         }
9501       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9502         if (max_charset_id < XFASTINT (XCAR (tail)))
9503           max_charset_id = XFASTINT (XCAR (tail));
9504     }
9505   else
9506     {
9507       charset_list = Fcopy_sequence (charset_list);
9508       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9509         {
9510           struct charset *charset;
9511
9512           val = XCAR (tail);
9513           CHECK_CHARSET_GET_CHARSET (val, charset);
9514           if (EQ (coding_type, Qiso_2022)
9515               ? CHARSET_ISO_FINAL (charset) < 0
9516               : EQ (coding_type, Qemacs_mule)
9517               ? CHARSET_EMACS_MULE_ID (charset) < 0
9518               : 0)
9519             error ("Can't handle charset `%s'",
9520                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9521
9522           XSETCAR (tail, make_number (charset->id));
9523           if (max_charset_id < charset->id)
9524             max_charset_id = charset->id;
9525         }
9526     }
9527   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9528
9529   safe_charsets = make_uninit_string (max_charset_id + 1);
9530   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9531   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9532     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9533   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9534
9535   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9536
9537   val = args[coding_arg_decode_translation_table];
9538   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9539     CHECK_SYMBOL (val);
9540   CODING_ATTR_DECODE_TBL (attrs) = val;
9541
9542   val = args[coding_arg_encode_translation_table];
9543   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9544     CHECK_SYMBOL (val);
9545   CODING_ATTR_ENCODE_TBL (attrs) = val;
9546
9547   val = args[coding_arg_post_read_conversion];
9548   CHECK_SYMBOL (val);
9549   CODING_ATTR_POST_READ (attrs) = val;
9550
9551   val = args[coding_arg_pre_write_conversion];
9552   CHECK_SYMBOL (val);
9553   CODING_ATTR_PRE_WRITE (attrs) = val;
9554
9555   val = args[coding_arg_default_char];
9556   if (NILP (val))
9557     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9558   else
9559     {
9560       CHECK_CHARACTER (val);
9561       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9562     }
9563
9564   val = args[coding_arg_for_unibyte];
9565   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9566
9567   val = args[coding_arg_plist];
9568   CHECK_LIST (val);
9569   CODING_ATTR_PLIST (attrs) = val;
9570
9571   if (EQ (coding_type, Qcharset))
9572     {
9573       /* Generate a lisp vector of 256 elements.  Each element is nil,
9574          integer, or a list of charset IDs.
9575
9576          If Nth element is nil, the byte code N is invalid in this
9577          coding system.
9578
9579          If Nth element is a number NUM, N is the first byte of a
9580          charset whose ID is NUM.
9581
9582          If Nth element is a list of charset IDs, N is the first byte
9583          of one of them.  The list is sorted by dimensions of the
9584          charsets.  A charset of smaller dimension comes first. */
9585       val = Fmake_vector (make_number (256), Qnil);
9586
9587       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9588         {
9589           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9590           int dim = CHARSET_DIMENSION (charset);
9591           int idx = (dim - 1) * 4;
9592
9593           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9594             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9595
9596           for (i = charset->code_space[idx];
9597                i <= charset->code_space[idx + 1]; i++)
9598             {
9599               Lisp_Object tmp, tmp2;
9600               int dim2;
9601
9602               tmp = AREF (val, i);
9603               if (NILP (tmp))
9604                 tmp = XCAR (tail);
9605               else if (NUMBERP (tmp))
9606                 {
9607                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9608                   if (dim < dim2)
9609                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9610                   else
9611                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9612                 }
9613               else
9614                 {
9615                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9616                     {
9617                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9618                       if (dim < dim2)
9619                         break;
9620                     }
9621                   if (NILP (tmp2))
9622                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9623                   else
9624                     {
9625                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9626                       XSETCAR (tmp2, XCAR (tail));
9627                     }
9628                 }
9629               ASET (val, i, tmp);
9630             }
9631         }
9632       ASET (attrs, coding_attr_charset_valids, val);
9633       category = coding_category_charset;
9634     }
9635   else if (EQ (coding_type, Qccl))
9636     {
9637       Lisp_Object valids;
9638
9639       if (nargs < coding_arg_ccl_max)
9640         goto short_args;
9641
9642       val = args[coding_arg_ccl_decoder];
9643       CHECK_CCL_PROGRAM (val);
9644       if (VECTORP (val))
9645         val = Fcopy_sequence (val);
9646       ASET (attrs, coding_attr_ccl_decoder, val);
9647
9648       val = args[coding_arg_ccl_encoder];
9649       CHECK_CCL_PROGRAM (val);
9650       if (VECTORP (val))
9651         val = Fcopy_sequence (val);
9652       ASET (attrs, coding_attr_ccl_encoder, val);
9653
9654       val = args[coding_arg_ccl_valids];
9655       valids = Fmake_string (make_number (256), make_number (0));
9656       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9657         {
9658           int from, to;
9659
9660           val = Fcar (tail);
9661           if (INTEGERP (val))
9662             {
9663               from = to = XINT (val);
9664               if (from < 0 || from > 255)
9665                 args_out_of_range_3 (val, make_number (0), make_number (255));
9666             }
9667           else
9668             {
9669               CHECK_CONS (val);
9670               CHECK_NATNUM_CAR (val);
9671               CHECK_NATNUM_CDR (val);
9672               from = XINT (XCAR (val));
9673               if (from > 255)
9674                 args_out_of_range_3 (XCAR (val),
9675                                      make_number (0), make_number (255));
9676               to = XINT (XCDR (val));
9677               if (to < from || to > 255)
9678                 args_out_of_range_3 (XCDR (val),
9679                                      XCAR (val), make_number (255));
9680             }
9681           for (i = from; i <= to; i++)
9682             SSET (valids, i, 1);
9683         }
9684       ASET (attrs, coding_attr_ccl_valids, valids);
9685
9686       category = coding_category_ccl;
9687     }
9688   else if (EQ (coding_type, Qutf_16))
9689     {
9690       Lisp_Object bom, endian;
9691
9692       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9693
9694       if (nargs < coding_arg_utf16_max)
9695         goto short_args;
9696
9697       bom = args[coding_arg_utf16_bom];
9698       if (! NILP (bom) && ! EQ (bom, Qt))
9699         {
9700           CHECK_CONS (bom);
9701           val = XCAR (bom);
9702           CHECK_CODING_SYSTEM (val);
9703           val = XCDR (bom);
9704           CHECK_CODING_SYSTEM (val);
9705         }
9706       ASET (attrs, coding_attr_utf_bom, bom);
9707
9708       endian = args[coding_arg_utf16_endian];
9709       CHECK_SYMBOL (endian);
9710       if (NILP (endian))
9711         endian = Qbig;
9712       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9713         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9714       ASET (attrs, coding_attr_utf_16_endian, endian);
9715
9716       category = (CONSP (bom)
9717                   ? coding_category_utf_16_auto
9718                   : NILP (bom)
9719                   ? (EQ (endian, Qbig)
9720                      ? coding_category_utf_16_be_nosig
9721                      : coding_category_utf_16_le_nosig)
9722                   : (EQ (endian, Qbig)
9723                      ? coding_category_utf_16_be
9724                      : coding_category_utf_16_le));
9725     }
9726   else if (EQ (coding_type, Qiso_2022))
9727     {
9728       Lisp_Object initial, reg_usage, request, flags;
9729
9730       if (nargs < coding_arg_iso2022_max)
9731         goto short_args;
9732
9733       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9734       CHECK_VECTOR (initial);
9735       for (i = 0; i < 4; i++)
9736         {
9737           val = Faref (initial, make_number (i));
9738           if (! NILP (val))
9739             {
9740               struct charset *charset;
9741
9742               CHECK_CHARSET_GET_CHARSET (val, charset);
9743               ASET (initial, i, make_number (CHARSET_ID (charset)));
9744               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9745                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9746             }
9747           else
9748             ASET (initial, i, make_number (-1));
9749         }
9750
9751       reg_usage = args[coding_arg_iso2022_reg_usage];
9752       CHECK_CONS (reg_usage);
9753       CHECK_NUMBER_CAR (reg_usage);
9754       CHECK_NUMBER_CDR (reg_usage);
9755
9756       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9757       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9758         {
9759           int id;
9760           Lisp_Object tmp1;
9761
9762           val = Fcar (tail);
9763           CHECK_CONS (val);
9764           tmp1 = XCAR (val);
9765           CHECK_CHARSET_GET_ID (tmp1, id);
9766           CHECK_NATNUM_CDR (val);
9767           if (XINT (XCDR (val)) >= 4)
9768             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9769           XSETCAR (val, make_number (id));
9770         }
9771
9772       flags = args[coding_arg_iso2022_flags];
9773       CHECK_NATNUM (flags);
9774       i = XINT (flags);
9775       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9776         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9777
9778       ASET (attrs, coding_attr_iso_initial, initial);
9779       ASET (attrs, coding_attr_iso_usage, reg_usage);
9780       ASET (attrs, coding_attr_iso_request, request);
9781       ASET (attrs, coding_attr_iso_flags, flags);
9782       setup_iso_safe_charsets (attrs);
9783
9784       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9785         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9786                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9787                     ? coding_category_iso_7_else
9788                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9789                     ? coding_category_iso_7
9790                     : coding_category_iso_7_tight);
9791       else
9792         {
9793           int id = XINT (AREF (initial, 1));
9794
9795           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9796                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9797                        || id < 0)
9798                       ? coding_category_iso_8_else
9799                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9800                       ? coding_category_iso_8_1
9801                       : coding_category_iso_8_2);
9802         }
9803       if (category != coding_category_iso_8_1
9804           && category != coding_category_iso_8_2)
9805         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9806     }
9807   else if (EQ (coding_type, Qemacs_mule))
9808     {
9809       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9810         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9811       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9812       category = coding_category_emacs_mule;
9813     }
9814   else if (EQ (coding_type, Qshift_jis))
9815     {
9816
9817       struct charset *charset;
9818
9819       if (XINT (Flength (charset_list)) != 3
9820           && XINT (Flength (charset_list)) != 4)
9821         error ("There should be three or four charsets");
9822
9823       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9824       if (CHARSET_DIMENSION (charset) != 1)
9825         error ("Dimension of charset %s is not one",
9826                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9827       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9828         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9829
9830       charset_list = XCDR (charset_list);
9831       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9832       if (CHARSET_DIMENSION (charset) != 1)
9833         error ("Dimension of charset %s is not one",
9834                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9835
9836       charset_list = XCDR (charset_list);
9837       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9838       if (CHARSET_DIMENSION (charset) != 2)
9839         error ("Dimension of charset %s is not two",
9840                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9841
9842       charset_list = XCDR (charset_list);
9843       if (! NILP (charset_list))
9844         {
9845           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9846           if (CHARSET_DIMENSION (charset) != 2)
9847             error ("Dimension of charset %s is not two",
9848                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9849         }
9850
9851       category = coding_category_sjis;
9852       Vsjis_coding_system = name;
9853     }
9854   else if (EQ (coding_type, Qbig5))
9855     {
9856       struct charset *charset;
9857
9858       if (XINT (Flength (charset_list)) != 2)
9859         error ("There should be just two charsets");
9860
9861       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9862       if (CHARSET_DIMENSION (charset) != 1)
9863         error ("Dimension of charset %s is not one",
9864                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9865       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9866         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9867
9868       charset_list = XCDR (charset_list);
9869       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9870       if (CHARSET_DIMENSION (charset) != 2)
9871         error ("Dimension of charset %s is not two",
9872                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9873
9874       category = coding_category_big5;
9875       Vbig5_coding_system = name;
9876     }
9877   else if (EQ (coding_type, Qraw_text))
9878     {
9879       category = coding_category_raw_text;
9880       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9881     }
9882   else if (EQ (coding_type, Qutf_8))
9883     {
9884       Lisp_Object bom;
9885
9886       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9887
9888       if (nargs < coding_arg_utf8_max)
9889         goto short_args;
9890
9891       bom = args[coding_arg_utf8_bom];
9892       if (! NILP (bom) && ! EQ (bom, Qt))
9893         {
9894           CHECK_CONS (bom);
9895           val = XCAR (bom);
9896           CHECK_CODING_SYSTEM (val);
9897           val = XCDR (bom);
9898           CHECK_CODING_SYSTEM (val);
9899         }
9900       ASET (attrs, coding_attr_utf_bom, bom);
9901
9902       category = (CONSP (bom) ? coding_category_utf_8_auto
9903                   : NILP (bom) ? coding_category_utf_8_nosig
9904                   : coding_category_utf_8_sig);
9905     }
9906   else if (EQ (coding_type, Qundecided))
9907     category = coding_category_undecided;
9908   else
9909     error ("Invalid coding system type: %s",
9910            SDATA (SYMBOL_NAME (coding_type)));
9911
9912   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9913   CODING_ATTR_PLIST (attrs)
9914     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9915                                 CODING_ATTR_PLIST (attrs)));
9916   CODING_ATTR_PLIST (attrs)
9917     = Fcons (QCascii_compatible_p,
9918              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9919                     CODING_ATTR_PLIST (attrs)));
9920
9921   eol_type = args[coding_arg_eol_type];
9922   if (! NILP (eol_type)
9923       && ! EQ (eol_type, Qunix)
9924       && ! EQ (eol_type, Qdos)
9925       && ! EQ (eol_type, Qmac))
9926     error ("Invalid eol-type");
9927
9928   aliases = Fcons (name, Qnil);
9929
9930   if (NILP (eol_type))
9931     {
9932       eol_type = make_subsidiaries (name);
9933       for (i = 0; i < 3; i++)
9934         {
9935           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9936
9937           this_name = AREF (eol_type, i);
9938           this_aliases = Fcons (this_name, Qnil);
9939           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9940           this_spec = Fmake_vector (make_number (3), attrs);
9941           ASET (this_spec, 1, this_aliases);
9942           ASET (this_spec, 2, this_eol_type);
9943           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9944           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9945           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9946           if (NILP (val))
9947             Vcoding_system_alist
9948               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9949                        Vcoding_system_alist);
9950         }
9951     }
9952
9953   spec_vec = Fmake_vector (make_number (3), attrs);
9954   ASET (spec_vec, 1, aliases);
9955   ASET (spec_vec, 2, eol_type);
9956
9957   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9958   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9959   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9960   if (NILP (val))
9961     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9962                                   Vcoding_system_alist);
9963
9964   {
9965     int id = coding_categories[category].id;
9966
9967     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9968       setup_coding_system (name, &coding_categories[category]);
9969   }
9970
9971   return Qnil;
9972
9973  short_args:
9974   return Fsignal (Qwrong_number_of_arguments,
9975                   Fcons (intern ("define-coding-system-internal"),
9976                          make_number (nargs)));
9977 }
9978
9979
9980 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9981        3, 3, 0,
9982        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9983   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
9984 {
9985   Lisp_Object spec, attrs;
9986
9987   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9988   attrs = AREF (spec, 0);
9989   if (EQ (prop, QCmnemonic))
9990     {
9991       if (! STRINGP (val))
9992         CHECK_CHARACTER (val);
9993       CODING_ATTR_MNEMONIC (attrs) = val;
9994     }
9995   else if (EQ (prop, QCdefault_char))
9996     {
9997       if (NILP (val))
9998         val = make_number (' ');
9999       else
10000         CHECK_CHARACTER (val);
10001       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10002     }
10003   else if (EQ (prop, QCdecode_translation_table))
10004     {
10005       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10006         CHECK_SYMBOL (val);
10007       CODING_ATTR_DECODE_TBL (attrs) = val;
10008     }
10009   else if (EQ (prop, QCencode_translation_table))
10010     {
10011       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10012         CHECK_SYMBOL (val);
10013       CODING_ATTR_ENCODE_TBL (attrs) = val;
10014     }
10015   else if (EQ (prop, QCpost_read_conversion))
10016     {
10017       CHECK_SYMBOL (val);
10018       CODING_ATTR_POST_READ (attrs) = val;
10019     }
10020   else if (EQ (prop, QCpre_write_conversion))
10021     {
10022       CHECK_SYMBOL (val);
10023       CODING_ATTR_PRE_WRITE (attrs) = val;
10024     }
10025   else if (EQ (prop, QCascii_compatible_p))
10026     {
10027       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10028     }
10029
10030   CODING_ATTR_PLIST (attrs)
10031     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10032   return val;
10033 }
10034
10035
10036 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10037        Sdefine_coding_system_alias, 2, 2, 0,
10038        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10039   (Lisp_Object alias, Lisp_Object coding_system)
10040 {
10041   Lisp_Object spec, aliases, eol_type, val;
10042
10043   CHECK_SYMBOL (alias);
10044   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10045   aliases = AREF (spec, 1);
10046   /* ALIASES should be a list of length more than zero, and the first
10047      element is a base coding system.  Append ALIAS at the tail of the
10048      list.  */
10049   while (!NILP (XCDR (aliases)))
10050     aliases = XCDR (aliases);
10051   XSETCDR (aliases, Fcons (alias, Qnil));
10052
10053   eol_type = AREF (spec, 2);
10054   if (VECTORP (eol_type))
10055     {
10056       Lisp_Object subsidiaries;
10057       int i;
10058
10059       subsidiaries = make_subsidiaries (alias);
10060       for (i = 0; i < 3; i++)
10061         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10062                                      AREF (eol_type, i));
10063     }
10064
10065   Fputhash (alias, spec, Vcoding_system_hash_table);
10066   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10067   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10068   if (NILP (val))
10069     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10070                                   Vcoding_system_alist);
10071
10072   return Qnil;
10073 }
10074
10075 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10076        1, 1, 0,
10077        doc: /* Return the base of CODING-SYSTEM.
10078 Any alias or subsidiary coding system is not a base coding system.  */)
10079   (Lisp_Object coding_system)
10080 {
10081   Lisp_Object spec, attrs;
10082
10083   if (NILP (coding_system))
10084     return (Qno_conversion);
10085   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10086   attrs = AREF (spec, 0);
10087   return CODING_ATTR_BASE_NAME (attrs);
10088 }
10089
10090 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10091        1, 1, 0,
10092        doc: "Return the property list of CODING-SYSTEM.")
10093   (Lisp_Object coding_system)
10094 {
10095   Lisp_Object spec, attrs;
10096
10097   if (NILP (coding_system))
10098     coding_system = Qno_conversion;
10099   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10100   attrs = AREF (spec, 0);
10101   return CODING_ATTR_PLIST (attrs);
10102 }
10103
10104
10105 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10106        1, 1, 0,
10107        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10108   (Lisp_Object coding_system)
10109 {
10110   Lisp_Object spec;
10111
10112   if (NILP (coding_system))
10113     coding_system = Qno_conversion;
10114   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10115   return AREF (spec, 1);
10116 }
10117
10118 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10119        Scoding_system_eol_type, 1, 1, 0,
10120        doc: /* Return eol-type of CODING-SYSTEM.
10121 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10122
10123 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10124 and CR respectively.
10125
10126 A vector value indicates that a format of end-of-line should be
10127 detected automatically.  Nth element of the vector is the subsidiary
10128 coding system whose eol-type is N.  */)
10129   (Lisp_Object coding_system)
10130 {
10131   Lisp_Object spec, eol_type;
10132   int n;
10133
10134   if (NILP (coding_system))
10135     coding_system = Qno_conversion;
10136   if (! CODING_SYSTEM_P (coding_system))
10137     return Qnil;
10138   spec = CODING_SYSTEM_SPEC (coding_system);
10139   eol_type = AREF (spec, 2);
10140   if (VECTORP (eol_type))
10141     return Fcopy_sequence (eol_type);
10142   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10143   return make_number (n);
10144 }
10145
10146 #endif /* emacs */
10147
10148 \f
/*** 12. Postamble ***/
10150
10151 void
10152 init_coding_once (void)
10153 {
10154   int i;
10155
10156   for (i = 0; i < coding_category_max; i++)
10157     {
10158       coding_categories[i].id = -1;
10159       coding_priorities[i] = i;
10160     }
10161
10162   /* ISO2022 specific initialize routine.  */
10163   for (i = 0; i < 0x20; i++)
10164     iso_code_class[i] = ISO_control_0;
10165   for (i = 0x21; i < 0x7F; i++)
10166     iso_code_class[i] = ISO_graphic_plane_0;
10167   for (i = 0x80; i < 0xA0; i++)
10168     iso_code_class[i] = ISO_control_1;
10169   for (i = 0xA1; i < 0xFF; i++)
10170     iso_code_class[i] = ISO_graphic_plane_1;
10171   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10172   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10173   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10174   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10175   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10176   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10177   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10178   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10179   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10180
10181   for (i = 0; i < 256; i++)
10182     {
10183       emacs_mule_bytes[i] = 1;
10184     }
10185   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10186   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10187   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10188   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10189 }
10190
10191 #ifdef emacs
10192
10193 void
10194 syms_of_coding (void)
10195 {
10196   staticpro (&Vcoding_system_hash_table);
10197   {
10198     Lisp_Object args[2];
10199     args[0] = QCtest;
10200     args[1] = Qeq;
10201     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10202   }
10203
10204   staticpro (&Vsjis_coding_system);
10205   Vsjis_coding_system = Qnil;
10206
10207   staticpro (&Vbig5_coding_system);
10208   Vbig5_coding_system = Qnil;
10209
10210   staticpro (&Vcode_conversion_reused_workbuf);
10211   Vcode_conversion_reused_workbuf = Qnil;
10212
10213   staticpro (&Vcode_conversion_workbuf_name);
10214   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10215
10216   reused_workbuf_in_use = 0;
10217
10218   DEFSYM (Qcharset, "charset");
10219   DEFSYM (Qtarget_idx, "target-idx");
10220   DEFSYM (Qcoding_system_history, "coding-system-history");
10221   Fset (Qcoding_system_history, Qnil);
10222
10223   /* Target FILENAME is the first argument.  */
10224   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10225   /* Target FILENAME is the third argument.  */
10226   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10227
10228   DEFSYM (Qcall_process, "call-process");
10229   /* Target PROGRAM is the first argument.  */
10230   Fput (Qcall_process, Qtarget_idx, make_number (0));
10231
10232   DEFSYM (Qcall_process_region, "call-process-region");
10233   /* Target PROGRAM is the third argument.  */
10234   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10235
10236   DEFSYM (Qstart_process, "start-process");
10237   /* Target PROGRAM is the third argument.  */
10238   Fput (Qstart_process, Qtarget_idx, make_number (2));
10239
10240   DEFSYM (Qopen_network_stream, "open-network-stream");
10241   /* Target SERVICE is the fourth argument.  */
10242   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10243
10244   DEFSYM (Qcoding_system, "coding-system");
10245   DEFSYM (Qcoding_aliases, "coding-aliases");
10246
10247   DEFSYM (Qeol_type, "eol-type");
10248   DEFSYM (Qunix, "unix");
10249   DEFSYM (Qdos, "dos");
10250
10251   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10252   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10253   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10254   DEFSYM (Qdefault_char, "default-char");
10255   DEFSYM (Qundecided, "undecided");
10256   DEFSYM (Qno_conversion, "no-conversion");
10257   DEFSYM (Qraw_text, "raw-text");
10258
10259   DEFSYM (Qiso_2022, "iso-2022");
10260
10261   DEFSYM (Qutf_8, "utf-8");
10262   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10263
10264   DEFSYM (Qutf_16, "utf-16");
10265   DEFSYM (Qbig, "big");
10266   DEFSYM (Qlittle, "little");
10267
10268   DEFSYM (Qshift_jis, "shift-jis");
10269   DEFSYM (Qbig5, "big5");
10270
10271   DEFSYM (Qcoding_system_p, "coding-system-p");
10272
10273   DEFSYM (Qcoding_system_error, "coding-system-error");
10274   Fput (Qcoding_system_error, Qerror_conditions,
10275         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10276   Fput (Qcoding_system_error, Qerror_message,
10277         make_pure_c_string ("Invalid coding system"));
10278
10279   /* Intern this now in case it isn't already done.
10280      Setting this variable twice is harmless.
10281      But don't staticpro it here--that is done in alloc.c.  */
10282   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10283
10284   DEFSYM (Qtranslation_table, "translation-table");
10285   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10286   DEFSYM (Qtranslation_table_id, "translation-table-id");
10287   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10288   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10289
10290   DEFSYM (Qvalid_codes, "valid-codes");
10291
10292   DEFSYM (Qemacs_mule, "emacs-mule");
10293
10294   DEFSYM (QCcategory, ":category");
10295   DEFSYM (QCmnemonic, ":mnemonic");
10296   DEFSYM (QCdefault_char, ":default-char");
10297   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10298   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10299   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10300   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10301   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10302
10303   Vcoding_category_table
10304     = Fmake_vector (make_number (coding_category_max), Qnil);
10305   staticpro (&Vcoding_category_table);
10306   /* Followings are target of code detection.  */
10307   ASET (Vcoding_category_table, coding_category_iso_7,
10308         intern_c_string ("coding-category-iso-7"));
10309   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10310         intern_c_string ("coding-category-iso-7-tight"));
10311   ASET (Vcoding_category_table, coding_category_iso_8_1,
10312         intern_c_string ("coding-category-iso-8-1"));
10313   ASET (Vcoding_category_table, coding_category_iso_8_2,
10314         intern_c_string ("coding-category-iso-8-2"));
10315   ASET (Vcoding_category_table, coding_category_iso_7_else,
10316         intern_c_string ("coding-category-iso-7-else"));
10317   ASET (Vcoding_category_table, coding_category_iso_8_else,
10318         intern_c_string ("coding-category-iso-8-else"));
10319   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10320         intern_c_string ("coding-category-utf-8-auto"));
10321   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10322         intern_c_string ("coding-category-utf-8"));
10323   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10324         intern_c_string ("coding-category-utf-8-sig"));
10325   ASET (Vcoding_category_table, coding_category_utf_16_be,
10326         intern_c_string ("coding-category-utf-16-be"));
10327   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10328         intern_c_string ("coding-category-utf-16-auto"));
10329   ASET (Vcoding_category_table, coding_category_utf_16_le,
10330         intern_c_string ("coding-category-utf-16-le"));
10331   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10332         intern_c_string ("coding-category-utf-16-be-nosig"));
10333   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10334         intern_c_string ("coding-category-utf-16-le-nosig"));
10335   ASET (Vcoding_category_table, coding_category_charset,
10336         intern_c_string ("coding-category-charset"));
10337   ASET (Vcoding_category_table, coding_category_sjis,
10338         intern_c_string ("coding-category-sjis"));
10339   ASET (Vcoding_category_table, coding_category_big5,
10340         intern_c_string ("coding-category-big5"));
10341   ASET (Vcoding_category_table, coding_category_ccl,
10342         intern_c_string ("coding-category-ccl"));
10343   ASET (Vcoding_category_table, coding_category_emacs_mule,
10344         intern_c_string ("coding-category-emacs-mule"));
10345   /* Followings are NOT target of code detection.  */
10346   ASET (Vcoding_category_table, coding_category_raw_text,
10347         intern_c_string ("coding-category-raw-text"));
10348   ASET (Vcoding_category_table, coding_category_undecided,
10349         intern_c_string ("coding-category-undecided"));
10350
10351   DEFSYM (Qinsufficient_source, "insufficient-source");
10352   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10353   DEFSYM (Qinvalid_source, "invalid-source");
10354   DEFSYM (Qinterrupted, "interrupted");
10355   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10356   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10357
10358   defsubr (&Scoding_system_p);
10359   defsubr (&Sread_coding_system);
10360   defsubr (&Sread_non_nil_coding_system);
10361   defsubr (&Scheck_coding_system);
10362   defsubr (&Sdetect_coding_region);
10363   defsubr (&Sdetect_coding_string);
10364   defsubr (&Sfind_coding_systems_region_internal);
10365   defsubr (&Sunencodable_char_position);
10366   defsubr (&Scheck_coding_systems_region);
10367   defsubr (&Sdecode_coding_region);
10368   defsubr (&Sencode_coding_region);
10369   defsubr (&Sdecode_coding_string);
10370   defsubr (&Sencode_coding_string);
10371   defsubr (&Sdecode_sjis_char);
10372   defsubr (&Sencode_sjis_char);
10373   defsubr (&Sdecode_big5_char);
10374   defsubr (&Sencode_big5_char);
10375   defsubr (&Sset_terminal_coding_system_internal);
10376   defsubr (&Sset_safe_terminal_coding_system_internal);
10377   defsubr (&Sterminal_coding_system);
10378   defsubr (&Sset_keyboard_coding_system_internal);
10379   defsubr (&Skeyboard_coding_system);
10380   defsubr (&Sfind_operation_coding_system);
10381   defsubr (&Sset_coding_system_priority);
10382   defsubr (&Sdefine_coding_system_internal);
10383   defsubr (&Sdefine_coding_system_alias);
10384   defsubr (&Scoding_system_put);
10385   defsubr (&Scoding_system_base);
10386   defsubr (&Scoding_system_plist);
10387   defsubr (&Scoding_system_aliases);
10388   defsubr (&Scoding_system_eol_type);
10389   defsubr (&Scoding_system_priority_list);
10390
10391   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10392                doc: /* List of coding systems.
10393
10394 Do not alter the value of this variable manually.  This variable should be
10395 updated by the functions `define-coding-system' and
10396 `define-coding-system-alias'.  */);
10397   Vcoding_system_list = Qnil;
10398
10399   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10400                doc: /* Alist of coding system names.
10401 Each element is one element list of coding system name.
10402 This variable is given to `completing-read' as COLLECTION argument.
10403
10404 Do not alter the value of this variable manually.  This variable should be
10405 updated by the functions `make-coding-system' and
10406 `define-coding-system-alias'.  */);
10407   Vcoding_system_alist = Qnil;
10408
10409   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10410                doc: /* List of coding-categories (symbols) ordered by priority.
10411
10412 On detecting a coding system, Emacs tries code detection algorithms
10413 associated with each coding-category one by one in this order.  When
10414 one algorithm agrees with a byte sequence of source text, the coding
10415 system bound to the corresponding coding-category is selected.
10416
10417 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10418   {
10419     int i;
10420
10421     Vcoding_category_list = Qnil;
10422     for (i = coding_category_max - 1; i >= 0; i--)
10423       Vcoding_category_list
10424         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10425                  Vcoding_category_list);
10426   }
10427
10428   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10429                doc: /* Specify the coding system for read operations.
10430 It is useful to bind this variable with `let', but do not set it globally.
10431 If the value is a coding system, it is used for decoding on read operation.
10432 If not, an appropriate element is used from one of the coding system alists.
10433 There are three such tables: `file-coding-system-alist',
10434 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10435   Vcoding_system_for_read = Qnil;
10436
10437   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10438                doc: /* Specify the coding system for write operations.
10439 Programs bind this variable with `let', but you should not set it globally.
10440 If the value is a coding system, it is used for encoding of output,
10441 when writing it to a file and when sending it to a file or subprocess.
10442
10443 If this does not specify a coding system, an appropriate element
10444 is used from one of the coding system alists.
10445 There are three such tables: `file-coding-system-alist',
10446 `process-coding-system-alist', and `network-coding-system-alist'.
10447 For output to files, if the above procedure does not specify a coding system,
10448 the value of `buffer-file-coding-system' is used.  */);
10449   Vcoding_system_for_write = Qnil;
10450
10451   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10452                doc: /*
10453 Coding system used in the latest file or process I/O.  */);
10454   Vlast_coding_system_used = Qnil;
10455
10456   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10457                doc: /*
10458 Error status of the last code conversion.
10459
10460 When an error was detected in the last code conversion, this variable
10461 is set to one of the following symbols.
10462   `insufficient-source'
10463   `inconsistent-eol'
10464   `invalid-source'
10465   `interrupted'
10466   `insufficient-memory'
10467 When no error was detected, the value doesn't change.  So, to check
10468 the error status of a code conversion by this variable, you must
10469 explicitly set this variable to nil before performing code
10470 conversion.  */);
10471   Vlast_code_conversion_error = Qnil;
10472
10473   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10474                doc: /*
10475 *Non-nil means always inhibit code conversion of end-of-line format.
10476 See info node `Coding Systems' and info node `Text and Binary' concerning
10477 such conversion.  */);
10478   inhibit_eol_conversion = 0;
10479
10480   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10481                doc: /*
10482 Non-nil means process buffer inherits coding system of process output.
10483 Bind it to t if the process output is to be treated as if it were a file
10484 read from some filesystem.  */);
10485   inherit_process_coding_system = 0;
10486
10487   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10488                doc: /*
10489 Alist to decide a coding system to use for a file I/O operation.
10490 The format is ((PATTERN . VAL) ...),
10491 where PATTERN is a regular expression matching a file name,
10492 VAL is a coding system, a cons of coding systems, or a function symbol.
10493 If VAL is a coding system, it is used for both decoding and encoding
10494 the file contents.
10495 If VAL is a cons of coding systems, the car part is used for decoding,
10496 and the cdr part is used for encoding.
10497 If VAL is a function symbol, the function must return a coding system
10498 or a cons of coding systems which are used as above.  The function is
10499 called with an argument that is a list of the arguments with which
10500 `find-operation-coding-system' was called.  If the function can't decide
10501 a coding system, it can return `undecided' so that the normal
10502 code-detection is performed.
10503
10504 See also the function `find-operation-coding-system'
10505 and the variable `auto-coding-alist'.  */);
10506   Vfile_coding_system_alist = Qnil;
10507
10508   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10509                doc: /*
10510 Alist to decide a coding system to use for a process I/O operation.
10511 The format is ((PATTERN . VAL) ...),
10512 where PATTERN is a regular expression matching a program name,
10513 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the program and encoding what is sent to the program.
10516 If VAL is a cons of coding systems, the car part is used for decoding,
10517 and the cdr part is used for encoding.
10518 If VAL is a function symbol, the function must return a coding system
10519 or a cons of coding systems which are used as above.
10520
10521 See also the function `find-operation-coding-system'.  */);
10522   Vprocess_coding_system_alist = Qnil;
10523
10524   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10525                doc: /*
10526 Alist to decide a coding system to use for a network I/O operation.
10527 The format is ((PATTERN . VAL) ...),
10528 where PATTERN is a regular expression matching a network service name
10529 or is a port number to connect to,
10530 VAL is a coding system, a cons of coding systems, or a function symbol.
If VAL is a coding system, it is used for both decoding what is received
from the network stream and encoding what is sent to the network stream.
10533 If VAL is a cons of coding systems, the car part is used for decoding,
10534 and the cdr part is used for encoding.
10535 If VAL is a function symbol, the function must return a coding system
10536 or a cons of coding systems which are used as above.
10537
10538 See also the function `find-operation-coding-system'.  */);
10539   Vnetwork_coding_system_alist = Qnil;
10540
10541   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10542                doc: /* Coding system to use with system messages.
10543 Also used for decoding keyboard input on X Window system.  */);
10544   Vlocale_coding_system = Qnil;
10545
10546   /* The eol mnemonics are reset in startup.el system-dependently.  */
10547   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10548                doc: /*
10549 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10550   eol_mnemonic_unix = make_pure_c_string (":");
10551
10552   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10553                doc: /*
10554 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10555   eol_mnemonic_dos = make_pure_c_string ("\\");
10556
10557   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10558                doc: /*
10559 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10560   eol_mnemonic_mac = make_pure_c_string ("/");
10561
10562   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10563                doc: /*
10564 *String displayed in mode line when end-of-line format is not yet determined.  */);
10565   eol_mnemonic_undecided = make_pure_c_string (":");
10566
10567   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10568                doc: /*
10569 *Non-nil enables character translation while encoding and decoding.  */);
10570   Venable_character_translation = Qt;
10571
10572   DEFVAR_LISP ("standard-translation-table-for-decode",
10573                Vstandard_translation_table_for_decode,
10574                doc: /* Table for translating characters while decoding.  */);
10575   Vstandard_translation_table_for_decode = Qnil;
10576
10577   DEFVAR_LISP ("standard-translation-table-for-encode",
10578                Vstandard_translation_table_for_encode,
10579                doc: /* Table for translating characters while encoding.  */);
10580   Vstandard_translation_table_for_encode = Qnil;
10581
10582   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10583                doc: /* Alist of charsets vs revision numbers.
10584 While encoding, if a charset (car part of an element) is found,
10585 designate it with the escape sequence identifying revision (cdr part
10586 of the element).  */);
10587   Vcharset_revision_table = Qnil;
10588
10589   DEFVAR_LISP ("default-process-coding-system",
10590                Vdefault_process_coding_system,
10591                doc: /* Cons of coding systems used for process I/O by default.
10592 The car part is used for decoding a process output,
10593 the cdr part is used for encoding a text to be sent to a process.  */);
10594   Vdefault_process_coding_system = Qnil;
10595
10596   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10597                doc: /*
10598 Table of extra Latin codes in the range 128..159 (inclusive).
10599 This is a vector of length 256.
10600 If Nth element is non-nil, the existence of code N in a file
\(or output of subprocess) doesn't prevent it from being detected as
10602 a coding system of ISO 2022 variant which has a flag
10603 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10604 or reading output of a subprocess.
10605 Only 128th through 159th elements have a meaning.  */);
10606   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10607
10608   DEFVAR_LISP ("select-safe-coding-system-function",
10609                Vselect_safe_coding_system_function,
10610                doc: /*
10611 Function to call to select safe coding system for encoding a text.
10612
10613 If set, this function is called to force a user to select a proper
10614 coding system which can encode the text in the case that a default
10615 coding system used in each operation can't encode the text.  The
10616 function should take care that the buffer is not modified while
10617 the coding system is being selected.
10618
10619 The default value is `select-safe-coding-system' (which see).  */);
10620   Vselect_safe_coding_system_function = Qnil;
10621
10622   DEFVAR_BOOL ("coding-system-require-warning",
10623                coding_system_require_warning,
10624                doc: /* Internal use only.
10625 If non-nil, on writing a file, `select-safe-coding-system-function' is
10626 called even if `coding-system-for-write' is non-nil.  The command
10627 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10628   coding_system_require_warning = 0;
10629
10630
10631   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10632                inhibit_iso_escape_detection,
10633                doc: /*
10634 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10635
10636 When Emacs reads text, it tries to detect how the text is encoded.
10637 This code detection is sensitive to escape sequences.  If Emacs sees
10638 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10639 of the ISO2022 encodings, and decodes text by the corresponding coding
10640 system (e.g. `iso-2022-7bit').
10641
10642 However, there may be a case that you want to read escape sequences in
10643 a file as is.  In such a case, you can set this variable to non-nil.
10644 Then the code detection will ignore any escape sequences, and no text is
10645 detected as encoded in some ISO-2022 encoding.  The result is that all
10646 escape sequences become visible in a buffer.
10647
10648 The default value is nil, and it is strongly recommended not to change
10649 it.  That is because many Emacs Lisp source files that contain
10650 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10651 in Emacs's distribution, and they won't be decoded correctly on
10652 reading if you suppress escape sequence detection.
10653
10654 The other way to read escape sequences in a file without decoding is
10655 to explicitly specify some coding system that doesn't use ISO-2022
escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
10657   inhibit_iso_escape_detection = 0;
10658
10659   DEFVAR_BOOL ("inhibit-null-byte-detection",
10660                inhibit_null_byte_detection,
10661                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10662 By default, Emacs treats it as binary data, and does not attempt to
10663 decode it.  The effect is as if you specified `no-conversion' for
10664 reading that text.
10665
10666 Set this to non-nil when a regular text happens to include null bytes.
10667 Examples are Index nodes of Info files and null-byte delimited output
10668 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10669 decode text as usual.  */);
10670   inhibit_null_byte_detection = 0;
10671
10672   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10673                doc: /* Char table for translating self-inserting characters.
10674 This is applied to the result of input methods, not their input.
10675 See also `keyboard-translate-table'.
10676
10677 Use of this variable for character code unification was rendered
10678 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10679 internal character representation.  */);
10680     Vtranslation_table_for_input = Qnil;
10681
10682   {
10683     Lisp_Object args[coding_arg_max];
10684     Lisp_Object plist[16];
10685     int i;
10686
10687     for (i = 0; i < coding_arg_max; i++)
10688       args[i] = Qnil;
10689
10690     plist[0] = intern_c_string (":name");
10691     plist[1] = args[coding_arg_name] = Qno_conversion;
10692     plist[2] = intern_c_string (":mnemonic");
10693     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10694     plist[4] = intern_c_string (":coding-type");
10695     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10696     plist[6] = intern_c_string (":ascii-compatible-p");
10697     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10698     plist[8] = intern_c_string (":default-char");
10699     plist[9] = args[coding_arg_default_char] = make_number (0);
10700     plist[10] = intern_c_string (":for-unibyte");
10701     plist[11] = args[coding_arg_for_unibyte] = Qt;
10702     plist[12] = intern_c_string (":docstring");
10703     plist[13] = make_pure_c_string ("Do no conversion.\n\
10704 \n\
10705 When you visit a file with this coding, the file is read into a\n\
10706 unibyte buffer as is, thus each byte of a file is treated as a\n\
10707 character.");
10708     plist[14] = intern_c_string (":eol-type");
10709     plist[15] = args[coding_arg_eol_type] = Qunix;
10710     args[coding_arg_plist] = Flist (16, plist);
10711     Fdefine_coding_system_internal (coding_arg_max, args);
10712
10713     plist[1] = args[coding_arg_name] = Qundecided;
10714     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10715     plist[5] = args[coding_arg_coding_type] = Qundecided;
10716     /* This is already set.
10717        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10718     plist[8] = intern_c_string (":charset-list");
10719     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10720     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10721     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10722     plist[15] = args[coding_arg_eol_type] = Qnil;
10723     args[coding_arg_plist] = Flist (16, plist);
10724     Fdefine_coding_system_internal (coding_arg_max, args);
10725   }
10726
10727   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10728
10729   {
10730     int i;
10731
10732     for (i = 0; i < coding_category_max; i++)
10733       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10734   }
10735 #if defined (DOS_NT)
10736   system_eol_type = Qdos;
10737 #else
10738   system_eol_type = Qunix;
10739 #endif
10740   staticpro (&system_eol_type);
10741 }
10742
10743 char *
10744 emacs_strerror (int error_number)
10745 {
10746   char *str;
10747
10748   synchronize_system_messages_locale ();
10749   str = strerror (error_number);
10750
10751   if (! NILP (Vlocale_coding_system))
10752     {
10753       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10754                                                       Vlocale_coding_system,
10755                                                       0);
10756       str = SSDATA (dec);
10757     }
10758
10759   return str;
10760 }
10761
10762 #endif /* emacs */